def loadReviews():
    global documentbase, functionCollection
    reviews = []
    # imdb62.txt is tab-separated; column 1 holds the author id, column 5 the review text
    with open("imdb62.txt") as f:
        for line in f:
            line = line.split('\t')
            reviews.append(features.document(line[5], line[1]))
    documentbase = features.documentbase(reviews).strippedDuplicates()
    functionCollection = features.documentFunctionCollection()
    documentbase.functionCollection = functionCollection
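
# Illustrative sketch (an assumption, not part of the original script): given the
# indexing in loadReviews above, a single imdb62.txt line is expected to carry
# the author id in tab-separated column 1 and the review text in column 5.
def _demo_parse_imdb_line(line):
    cols = line.rstrip('\n').split('\t')
    return features.document(cols[5], cols[1])  # (text, author)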
def loadCorpus(self):
    # folder must be a readable directory containing a file called 'meta-file.json'.
    # Returns a tuple (trainingDocumentbase, unknownDocumentbase) of the training and unlabelled documents.
    folder = config.tira_base_directory + '/' + self.input_dataset + '/'
    with open(folder + 'meta-file.json', 'rt') as f:
        metadata = json.load(f)
    encoding = metadata['encoding']

    def readfile(filename):
        with open(filename, 'rt', encoding=encoding) as f:
            return f.read()

    author_names = [auth['author-name'] for auth in metadata['candidate-authors']]
    docs = []
    for author in author_names:
        documents = os.listdir(folder + author)
        docs += [
            features.document(readfile(folder + author + '/' + doc), author)
            for doc in documents
        ]
    training_docbase = features.documentbase(docs)
    unknown_paths = [
        folder + metadata['folder'] + '/' + doc['unknown-text']
        for doc in metadata['unknown-texts']
    ]
    unknown_docbase = features.documentbase(
        [features.document(readfile(p)) for p in unknown_paths])
    if self.functionCollection is not None:
        training_docbase.functionCollection = self.functionCollection
        unknown_docbase.functionCollection = self.functionCollection
    self._unknown_paths = {
        d.identifier: path
        for (d, path) in zip(unknown_docbase.documents, unknown_paths)
    }
    return training_docbase, unknown_docbase
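
# Hedged sketch of the meta-file.json layout loadCorpus assumes, reconstructed
# only from the keys accessed above; all values are made-up placeholders.
_META_FILE_EXAMPLE = {
    'encoding': 'utf-8',
    'folder': 'unknown',  # subfolder holding the unlabelled texts
    'candidate-authors': [{'author-name': 'candidate00001'}],
    'unknown-texts': [{'unknown-text': 'unknown00001.txt'}],
}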
def extractDocuments(content):
    # escape bare ampersands so the concatenated news fragments parse as XML
    for white1 in [' ', '\n']:
        for white2 in [' ', '\n']:
            content = content.replace(white1 + '&' + white2, white1 + '&amp;' + white2)
    for node in ET.XML(DTD + '<root>' + content + '</root>'):
        auth = None
        text = ''
        for subnode in node:
            if subnode.tag == 'BYLINE' and subnode.text[:3] == 'By ' \
                    and subnode.text.lower() != 'by the associated press':
                if auth is not None:
                    print('Multiple authors of one document: %s & %s (discarding document)'
                          % (auth, subnode.text[3:]))
                    auth = None
                    break
                auth = subnode.text[3:]
            elif subnode.tag == 'TEXT' and subnode.text:
                text += subnode.text
        text = text.strip()
        if auth and text:
            yield features.document(text, auth)
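
# Minimal input sketch (assumed shape, derived from the tags read above): each
# top-level node carries a BYLINE child of the form "By <author>" and one or
# more TEXT children; DTD is the module-level entity declaration prepended above.
_EXAMPLE_FRAGMENT = '''
<DOC>
<BYLINE>By JANE DOE</BYLINE>
<TEXT>Body of the article ...</TEXT>
</DOC>
'''
# list(extractDocuments(_EXAMPLE_FRAGMENT)) should yield one document
# authored by "JANE DOE".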
def threeTrain(view1, view2, view3, trainingBase, unlabelledBase, testBase,
               num_iterations, num_unlabelled, results_stream=None,
               initial_classifier1=None, initial_classifier2=None, initial_classifier3=None):
    # if no initial classifiers are given, they are learned from the trainingBase
    if None in trainingBase.authors:
        raise Exception("Training base should be labelled.")
    print("unlabelled authors: ", unlabelledBase.authors, "; test authors: ", testBase.authors)
    labelled1 = trainingBase
    labelled2 = trainingBase
    labelled3 = trainingBase
    balanced1 = labelled1
    balanced2 = labelled2
    balanced3 = labelled3
    extra_true1 = 0
    extra_true2 = 0
    extra_true3 = 0
    extra_false1 = 0
    extra_false2 = 0
    extra_false3 = 0
    parallelGroup = easyparallel.ParallelismGroup()
    functionCollection = trainingBase.functionCollection \
        if hasattr(trainingBase, 'functionCollection') else None

    #@profile
    def prepareDocuments(docs):
        import pickle
        print("preparing %d documents" % len(docs))
        chunksize = 15000
        if functionCollection is not None:
            for i in range(0, len(docs), chunksize):
                chunk = docs[i:i + chunksize]
                functionCollection.moveToMemory(chunk, neededDocumentFunctions)
        #print("forget unnecessary document functions...")
        gc.collect()
        if config.debug_memory:
            print("garbage: ", len(gc.garbage))
            print("15 most common types:")
            objgraph.show_most_common_types(limit=15)
            c_syntax_tree.showCMemoryStatistics()
            showMemoryStatistics()
            functionCollection.showMemoryStatistics()
            functionCollection.getFunction(
                features.stanfordTreeDocumentFunction).cachedValues.showMemoryStatistics()
            print("leaking: ", len(objgraph.get_leaking_objects()))

    prepareDocuments(trainingBase.documents)
    prepareDocuments(testBase.documents)
    verificationBase = testBase.subbase([
        i for i, doc in enumerate(testBase.documents) if doc.author is not None
    ])
    for iteration in range(num_iterations):
        gc.collect()
        remaining_unlabelled = len(unlabelledBase.documents)
        if remaining_unlabelled == 0:
            break
        if remaining_unlabelled > num_unlabelled:
            choiceIndices = random.sample(range(remaining_unlabelled), num_unlabelled)
        else:
            choiceIndices = list(range(remaining_unlabelled))
        if len(choiceIndices) > num_unlabelled:
            raise Exception("Sampled more unlabelled documents than num_unlabelled.")
        choice = [unlabelledBase.documents[i] for i in choiceIndices]
        prepareDocuments(choice)
        cached_keys = [
            sorted(list(functionCollection.getFunction(f).cachedValues.memory_cache))
            for f in neededDocumentFunctions
        ]
        '''
        classifier1 = view1.createClassifier(balanced1)
        classified1 = classifier1.predict(choice)
        classifier2 = view2.createClassifier(balanced2)
        classified2 = classifier2.predict(choice)
        classifier3 = view3.createClassifier(balanced3)
        classified3 = classifier3.predict(choice)
        '''
        if iteration == 0 and initial_classifier1 is not None:
            classifier1 = initial_classifier1
            parallelGroup.add_branch(classifier1.predict, choice)
        else:
            parallelGroup.add_branch(trainAndPredict, view1, balanced1, choice)
        if iteration == 0 and initial_classifier2 is not None:
            classifier2 = initial_classifier2
            parallelGroup.add_branch(classifier2.predict, choice)
        else:
            parallelGroup.add_branch(trainAndPredict, view2, balanced2, choice)
        if iteration == 0 and initial_classifier3 is not None:
            classifier3 = initial_classifier3
            parallelGroup.add_branch(classifier3.predict, choice)
        else:
            parallelGroup.add_branch(trainAndPredict, view3, balanced3, choice)
        print("waiting for classification and prediction...")
        parallelGroup_results = parallelGroup.get_results()
        print("got results!")
        if iteration == 0 and initial_classifier1 is not None:
            classified1 = parallelGroup_results[0]
        else:
            classifier1, classified1 = parallelGroup_results[0]
        if iteration == 0 and initial_classifier2 is not None:
            classified2 = parallelGroup_results[1]
        else:
            classifier2, classified2 = parallelGroup_results[1]
        if iteration == 0 and initial_classifier3 is not None:
            classified3 = parallelGroup_results[2]
        else:
            classifier3, classified3 = parallelGroup_results[2]
        if verificationBase.documents:
            print("predicting on verificationBase, for the record...")
            parallelGroup.add_branch(classifier1.getValuev, verificationBase.documents)
            parallelGroup.add_branch(classifier2.getValuev, verificationBase.documents)
            parallelGroup.add_branch(classifier3.getValuev, verificationBase.documents)
            parallelGroup.get_results()
            print("got results for verificationBase!")
            resline = "%d,%d,%d,%d,%d,%d" % (
                iteration, len(verificationBase.documents),
                getSuccessRate(verificationBase, classifier1),
                getSuccessRate(verificationBase, classifier2),
                getSuccessRate(verificationBase, classifier3),
                getAccumulatedSuccessRate(verificationBase, classifier1, classifier2, classifier3))
            print("RESULT:", resline)
            if results_stream is not None:
                results_stream.write(resline + "\n")
                results_stream.flush()
        extraLabelled1 = []
        extraLabelled2 = []
        extraLabelled3 = []
        for l1, l2, l3, doc in zip(classified1, classified2, classified3, choice):
            print("classified: %s, %s, %s. true: %s" % (l1, l2, l3, doc.author))
            #print(p1,p2,p3)
            if l1 is None or l2 is None or l3 is None:
                raise Exception("Classifier should assign proper labels (i.e. distinct from None)")
            # whenever two views agree on a label, the document is added to the
            # third view's labelled set; otherwise it is discarded
            discard = True
            if l1 == l2:
                discard = False
                if config.do_fake:
                    extraLabelled3.append(doc)
                else:
                    extraLabelled3.append(features.document(doc.text, l1))
                if doc.author == l1:
                    extra_true3 += 1
                else:
                    extra_false3 += 1
            if l1 == l3:
                discard = False
                if config.do_fake:
                    extraLabelled2.append(doc)
                else:
                    extraLabelled2.append(features.document(doc.text, l1))
                if doc.author == l1:
                    extra_true2 += 1
                else:
                    extra_false2 += 1
            if l2 == l3:
                discard = False
                if config.do_fake:
                    extraLabelled1.append(doc)
                else:
                    extraLabelled1.append(features.document(doc.text, l2))
                if doc.author == l2:
                    extra_true1 += 1
                else:
                    extra_false1 += 1
            if discard and functionCollection is not None:
                # forget doc if the same text (= identifier) does not occur anywhere it is needed
                idf = doc.identifier
                if idf not in labelled1.byIdentifier and idf not in labelled2.byIdentifier \
                        and idf not in labelled3.byIdentifier and idf not in testBase.byIdentifier:
                    functionCollection.forgetDocument(doc)
        labelled1 = labelled1.extend(extraLabelled1)
        labelled2 = labelled2.extend(extraLabelled2)
        labelled3 = labelled3.extend(extraLabelled3)
        print("labelled 1: ", Counter([d.author for d in labelled1.documents]))
        print("labelled 2: ", Counter([d.author for d in labelled2.documents]))
        print("labelled 3: ", Counter([d.author for d in labelled3.documents]))
        if None in labelled1.authors or None in labelled2.authors or None in labelled3.authors:
            raise Exception("Did not expect unlabelled (None) authors here.")
        unlabelledBase = unlabelledBase.subbase(
            list(set(range(len(unlabelledBase.documents))) - set(choiceIndices)))
        if config.undersample:
            '''
            balanced1 = getBalancedSubbase(labelled1,classifier1)
            balanced2 = getBalancedSubbase(labelled2,classifier2)
            balanced3 = getBalancedSubbase(labelled3,classifier3)
            '''
            parallelGroup.add_branch(getBalancedSubbase, labelled1, classifier1)
            parallelGroup.add_branch(getBalancedSubbase, labelled2, classifier2)
            parallelGroup.add_branch(getBalancedSubbase, labelled3, classifier3)
            balanced1, balanced2, balanced3 = parallelGroup.get_results()
        else:
            balanced1, balanced2, balanced3 = labelled1, labelled2, labelled3
        classifier1.free()
        classifier2.free()
        classifier3.free()
        classifier1 = None
        classifier2 = None
        classifier3 = None
    print("added documents (true/false): %d/%d %d/%d %d/%d" %
          (extra_true1, extra_false1, extra_true2, extra_false2, extra_true3, extra_false3))
    '''
    classifier1 = view1.createClassifier(balanced1,regression.multiclassLogit)
    classifier2 = view2.createClassifier(balanced2,regression.multiclassLogit)
    classifier3 = view3.createClassifier(balanced3,regression.multiclassLogit)
    '''
    parallelGroup.add_branch(trainAndPredict, view1, balanced1, testBase.documents)
    parallelGroup.add_branch(trainAndPredict, view2, balanced2, testBase.documents)
    parallelGroup.add_branch(trainAndPredict, view3, balanced3, testBase.documents)
    parallelGroup_results = parallelGroup.get_results()
    classifier1, classifier2, classifier3 = (r[0] for r in parallelGroup_results)
    pred = getAccumulatedPrediction(testBase, classifier1, classifier2, classifier3)
    if verificationBase.documents:
        correct = len([None for (p, doc) in zip(pred, testBase.documents) if p == doc.author])
        resline = "%d,%d,%d,%d,%d,%d" % (
            num_iterations, len(verificationBase.documents),
            getSuccessRate(verificationBase, classifier1),
            getSuccessRate(verificationBase, classifier2),
            getSuccessRate(verificationBase, classifier3),
            correct)
        print("RESULTS: ", resline)
        if results_stream is not None:
            results_stream.write(resline + "\n")
            results_stream.flush()
    classifier1.clearCache()
    classifier2.clearCache()
    classifier3.clearCache()
    return pred
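
# Hedged usage sketch (the view and documentbase objects are assumed to be set
# up elsewhere; the argument values are illustrative, not from the original code):
def _demo_threeTrain(view1, view2, view3, trainingBase, unlabelledBase, testBase):
    # one tri-training run: pairs of agreeing views label extra documents for
    # the third view, over 10 rounds of 300 sampled unlabelled documents each
    return threeTrain(view1, view2, view3, trainingBase, unlabelledBase,
                      testBase, num_iterations=10, num_unlabelled=300)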
        docs, [features.stanfordTreeDocumentFunction])
    functionCollection.getValues(docs, features.tokensDocumentFunction)
    functionCollection.getValues(docs, features.posDocumentFunction)
    functionCollection.getValues(docs, features.stDocumentDocumentFunction)
    for doc in docs:
        functionCollection.forgetDocument(doc)
    print("prepared %d documents" % len(documentbase.documents))


if __name__ == '__main__':
    import sys
    if len(sys.argv) < 6:
        print("usage: see ", sys.argv[0])
        sys.exit(1)
    stanford_db = sys.argv[1]
    tokens_db = sys.argv[2]
    pos_db = sys.argv[3]
    c_syntax_tree_db = sys.argv[4]
    documents = sys.argv[5:]
    functionCollection = features.documentFunctionCollection()

    def readfile(filename):
        with open(filename, 'rt') as f:
            return f.read()

    documentbase = features.documentbase(
        [features.document(readfile(d)) for d in documents])
    documentbase.functionCollection = functionCollection
    prepareDocuments(stanford_db, tokens_db, pos_db, c_syntax_tree_db, documentbase)
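
# Example invocation (file names and the script name are placeholders; the
# positional arguments are the four cache databases followed by any number of
# document files):
#
#   python prepare_documents.py stanford.db tokens.db pos.db c_syntax_tree.db doc1.txt doc2.txt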
    'Rita Beamish': ['RITA BEAMISH']
}

# !!! reducing number of authors to three !!!
for auth in list(selected_authors):
    if auth[0] != 'D':
        del selected_authors[auth]

selected_author_names = list(selected_authors.keys())
selected_authors_reversed = {}
for name, pseudos in selected_authors.items():
    for p in pseudos:
        selected_authors_reversed[p] = name
selected_docs = []
for doc in docs:
    if doc.author in selected_authors_reversed:
        selected_docs.append(features.document(doc.text, selected_authors_reversed[doc.author]))
selected_base = features.documentbase(selected_docs)
selected_base.functionCollection = functionCollection
selected_author_documents = [selected_base.byAuthor[auth] for auth in selected_author_names]

def prepareSelected():
    global selected_base
    prepare_documents.prepareDocumentsChunked('ap-selected-stanford.db', 'ap-selected-tokens.db',
                                              'ap-selected-pos.db', 'ap-selected-c_syntax_tree.db',
                                              selected_base)

def genCrossvalIndices(N, k):
    # generate index folds for k-fold cross-validation over range(N)
    sampleSize = N // k
    indices = list(range(N))
    result = [indices]
    for _ in range(k - 1):
        sample = random.sample(indices, sampleSize)
        result.append(sample)
        for i in sample: