def loadReviews():
    global documentbase, functionCollection
    reviews = []
    # imdb62.txt is tab-separated; column 1 holds the author id, column 5 the review text
    with open("imdb62.txt") as f:
        for line in f:
            line = line.split('\t')
            reviews.append(features.document(line[5], line[1]))
    documentbase = features.documentbase(reviews).strippedDuplicates()
    functionCollection = features.documentFunctionCollection()
    documentbase.functionCollection = functionCollection
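
# Illustrative sketch (an assumption, not part of the original script): given the
# indexing in loadReviews above, a single imdb62.txt line is expected to carry
# the author id in tab-separated column 1 and the review text in column 5.
def _demo_parse_imdb_line(line):
    cols = line.rstrip('\n').split('\t')
    return features.document(cols[5], cols[1])  # (text, author)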
def loadCorpus(self):
    # folder must be a readable directory containing a file called 'meta-file.json'.
    # Returns a tuple (trainingDocumentbase, unknownDocumentbase) of the training and unlabelled documents.
    folder = config.tira_base_directory + '/' + self.input_dataset + '/'
    with open(folder + 'meta-file.json', 'rt') as f:
        metadata = json.load(f)
    encoding = metadata['encoding']

    def readfile(filename):
        with open(filename, 'rt', encoding=encoding) as f:
            return f.read()

    author_names = [auth['author-name'] for auth in metadata['candidate-authors']]
    docs = []
    for author in author_names:
        documents = os.listdir(folder + author)
        docs += [
            features.document(readfile(folder + author + '/' + doc), author)
            for doc in documents
        ]
    training_docbase = features.documentbase(docs)
    unknown_paths = [
        folder + metadata['folder'] + '/' + doc['unknown-text']
        for doc in metadata['unknown-texts']
    ]
    unknown_docbase = features.documentbase(
        [features.document(readfile(p)) for p in unknown_paths])
    if self.functionCollection is not None:
        training_docbase.functionCollection = self.functionCollection
        unknown_docbase.functionCollection = self.functionCollection
    self._unknown_paths = {
        d.identifier: path
        for (d, path) in zip(unknown_docbase.documents, unknown_paths)
    }
    return training_docbase, unknown_docbase
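
# Hedged sketch of the meta-file.json layout loadCorpus assumes, reconstructed
# only from the keys accessed above; all values are made-up placeholders.
_META_FILE_EXAMPLE = {
    'encoding': 'utf-8',
    'folder': 'unknown',  # subfolder holding the unlabelled texts
    'candidate-authors': [{'author-name': 'candidate00001'}],
    'unknown-texts': [{'unknown-text': 'unknown00001.txt'}],
}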
def extractDocuments(content):
    # escape bare ampersands so the concatenated news fragments parse as XML
    for white1 in [' ', '\n']:
        for white2 in [' ', '\n']:
            content = content.replace(white1 + '&' + white2, white1 + '&amp;' + white2)
    for node in ET.XML(DTD + '<root>' + content + '</root>'):
        auth = None
        text = ''
        for subnode in node:
            if subnode.tag == 'BYLINE' and subnode.text[:3] == 'By ' \
                    and subnode.text.lower() != 'by the associated press':
                if auth is not None:
                    print('Multiple authors of one document: %s & %s (discarding document)'
                          % (auth, subnode.text[3:]))
                    auth = None
                    break
                auth = subnode.text[3:]
            elif subnode.tag == 'TEXT' and subnode.text:
                text += subnode.text
        text = text.strip()
        if auth and text:
            yield features.document(text, auth)
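
# Minimal input sketch (assumed shape, derived from the tags read above): each
# top-level node carries a BYLINE child of the form "By <author>" and one or
# more TEXT children; DTD is the module-level entity declaration prepended above.
_EXAMPLE_FRAGMENT = '''
<DOC>
<BYLINE>By JANE DOE</BYLINE>
<TEXT>Body of the article ...</TEXT>
</DOC>
'''
# list(extractDocuments(_EXAMPLE_FRAGMENT)) should yield one document
# authored by "JANE DOE".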
def threeTrain(view1, view2, view3, trainingBase, unlabelledBase, testBase,
               num_iterations, num_unlabelled, results_stream=None,
               initial_classifier1=None, initial_classifier2=None, initial_classifier3=None):
    # if no initial classifiers are given, they are learned from the trainingBase
    if None in trainingBase.authors:
        raise Exception("Training base should be labelled.")
    print("unlabelled authors: ", unlabelledBase.authors, "; test authors: ", testBase.authors)
    labelled1 = trainingBase
    labelled2 = trainingBase
    labelled3 = trainingBase
    balanced1 = labelled1
    balanced2 = labelled2
    balanced3 = labelled3
    extra_true1 = 0
    extra_true2 = 0
    extra_true3 = 0
    extra_false1 = 0
    extra_false2 = 0
    extra_false3 = 0
    parallelGroup = easyparallel.ParallelismGroup()
    functionCollection = trainingBase.functionCollection \
        if hasattr(trainingBase, 'functionCollection') else None

    #@profile
    def prepareDocuments(docs):
        import pickle
        print("preparing %d documents" % len(docs))
        chunksize = 15000
        if functionCollection is not None:
            for i in range(0, len(docs), chunksize):
                chunk = docs[i:i + chunksize]
                functionCollection.moveToMemory(chunk, neededDocumentFunctions)
        #print("forget unnecessary document functions...")
        gc.collect()
        if config.debug_memory:
            print("garbage: ", len(gc.garbage))
            print("15 most common types:")
            objgraph.show_most_common_types(limit=15)
            c_syntax_tree.showCMemoryStatistics()
            showMemoryStatistics()
            functionCollection.showMemoryStatistics()
            functionCollection.getFunction(
                features.stanfordTreeDocumentFunction).cachedValues.showMemoryStatistics()
            print("leaking: ", len(objgraph.get_leaking_objects()))

    prepareDocuments(trainingBase.documents)
    prepareDocuments(testBase.documents)
    verificationBase = testBase.subbase([
        i for i, doc in enumerate(testBase.documents) if doc.author is not None
    ])
    for iteration in range(num_iterations):
        gc.collect()
        remaining_unlabelled = len(unlabelledBase.documents)
        if remaining_unlabelled == 0:
            break
        if remaining_unlabelled > num_unlabelled:
            choiceIndices = random.sample(range(remaining_unlabelled), num_unlabelled)
        else:
            choiceIndices = list(range(remaining_unlabelled))
        if len(choiceIndices) > num_unlabelled:
            raise Exception("Sampled more unlabelled documents than num_unlabelled.")
        choice = [unlabelledBase.documents[i] for i in choiceIndices]
        prepareDocuments(choice)
        cached_keys = [
            sorted(list(functionCollection.getFunction(f).cachedValues.memory_cache))
            for f in neededDocumentFunctions
        ]
        '''
        classifier1 = view1.createClassifier(balanced1)
        classified1 = classifier1.predict(choice)
        classifier2 = view2.createClassifier(balanced2)
        classified2 = classifier2.predict(choice)
        classifier3 = view3.createClassifier(balanced3)
        classified3 = classifier3.predict(choice)
        '''
        if iteration == 0 and initial_classifier1 is not None:
            classifier1 = initial_classifier1
            parallelGroup.add_branch(classifier1.predict, choice)
        else:
            parallelGroup.add_branch(trainAndPredict, view1, balanced1, choice)
        if iteration == 0 and initial_classifier2 is not None:
            classifier2 = initial_classifier2
            parallelGroup.add_branch(classifier2.predict, choice)
        else:
            parallelGroup.add_branch(trainAndPredict, view2, balanced2, choice)
        if iteration == 0 and initial_classifier3 is not None:
            classifier3 = initial_classifier3
            parallelGroup.add_branch(classifier3.predict, choice)
        else:
            parallelGroup.add_branch(trainAndPredict, view3, balanced3, choice)
        print("waiting for classification and prediction...")
        parallelGroup_results = parallelGroup.get_results()
        print("got results!")
        if iteration == 0 and initial_classifier1 is not None:
            classified1 = parallelGroup_results[0]
        else:
            classifier1, classified1 = parallelGroup_results[0]
        if iteration == 0 and initial_classifier2 is not None:
            classified2 = parallelGroup_results[1]
        else:
            classifier2, classified2 = parallelGroup_results[1]
        if iteration == 0 and initial_classifier3 is not None:
            classified3 = parallelGroup_results[2]
        else:
            classifier3, classified3 = parallelGroup_results[2]
        if verificationBase.documents:
            print("predicting on verificationBase, for the record...")
            parallelGroup.add_branch(classifier1.getValuev, verificationBase.documents)
            parallelGroup.add_branch(classifier2.getValuev, verificationBase.documents)
            parallelGroup.add_branch(classifier3.getValuev, verificationBase.documents)
            parallelGroup.get_results()
            print("got results for verificationBase!")
            resline = "%d,%d,%d,%d,%d,%d" % (
                iteration, len(verificationBase.documents),
                getSuccessRate(verificationBase, classifier1),
                getSuccessRate(verificationBase, classifier2),
                getSuccessRate(verificationBase, classifier3),
                getAccumulatedSuccessRate(verificationBase, classifier1, classifier2, classifier3))
            print("RESULT:", resline)
            if results_stream is not None:
                results_stream.write(resline + "\n")
                results_stream.flush()
        extraLabelled1 = []
        extraLabelled2 = []
        extraLabelled3 = []
        for l1, l2, l3, doc in zip(classified1, classified2, classified3, choice):
            print("classified: %s, %s, %s. true: %s" % (l1, l2, l3, doc.author))
            #print(p1,p2,p3)
            if l1 is None or l2 is None or l3 is None:
                raise Exception("Classifier should assign proper labels (i.e. distinct from None)")
            # whenever two views agree on a label, the document is added to the
            # third view's labelled set; otherwise it is discarded
            discard = True
            if l1 == l2:
                discard = False
                if config.do_fake:
                    extraLabelled3.append(doc)
                else:
                    extraLabelled3.append(features.document(doc.text, l1))
                if doc.author == l1:
                    extra_true3 += 1
                else:
                    extra_false3 += 1
            if l1 == l3:
                discard = False
                if config.do_fake:
                    extraLabelled2.append(doc)
                else:
                    extraLabelled2.append(features.document(doc.text, l1))
                if doc.author == l1:
                    extra_true2 += 1
                else:
                    extra_false2 += 1
            if l2 == l3:
                discard = False
                if config.do_fake:
                    extraLabelled1.append(doc)
                else:
                    extraLabelled1.append(features.document(doc.text, l2))
                if doc.author == l2:
                    extra_true1 += 1
                else:
                    extra_false1 += 1
            if discard and functionCollection is not None:
                # forget doc if the same text (= identifier) does not occur anywhere it is needed
                idf = doc.identifier
                if idf not in labelled1.byIdentifier and idf not in labelled2.byIdentifier \
                        and idf not in labelled3.byIdentifier and idf not in testBase.byIdentifier:
                    functionCollection.forgetDocument(doc)
        labelled1 = labelled1.extend(extraLabelled1)
        labelled2 = labelled2.extend(extraLabelled2)
        labelled3 = labelled3.extend(extraLabelled3)
        print("labelled 1: ", Counter([d.author for d in labelled1.documents]))
        print("labelled 2: ", Counter([d.author for d in labelled2.documents]))
        print("labelled 3: ", Counter([d.author for d in labelled3.documents]))
        if None in labelled1.authors or None in labelled2.authors or None in labelled3.authors:
            raise Exception("Did not expect unlabelled (None) authors here.")
        unlabelledBase = unlabelledBase.subbase(
            list(set(range(len(unlabelledBase.documents))) - set(choiceIndices)))
        if config.undersample:
            '''
            balanced1 = getBalancedSubbase(labelled1,classifier1)
            balanced2 = getBalancedSubbase(labelled2,classifier2)
            balanced3 = getBalancedSubbase(labelled3,classifier3)
            '''
            parallelGroup.add_branch(getBalancedSubbase, labelled1, classifier1)
            parallelGroup.add_branch(getBalancedSubbase, labelled2, classifier2)
            parallelGroup.add_branch(getBalancedSubbase, labelled3, classifier3)
            balanced1, balanced2, balanced3 = parallelGroup.get_results()
        else:
            balanced1, balanced2, balanced3 = labelled1, labelled2, labelled3
        classifier1.free()
        classifier2.free()
        classifier3.free()
        classifier1 = None
        classifier2 = None
        classifier3 = None
    print("added documents (true/false): %d/%d %d/%d %d/%d" %
          (extra_true1, extra_false1, extra_true2, extra_false2, extra_true3, extra_false3))
    '''
    classifier1 = view1.createClassifier(balanced1,regression.multiclassLogit)
    classifier2 = view2.createClassifier(balanced2,regression.multiclassLogit)
    classifier3 = view3.createClassifier(balanced3,regression.multiclassLogit)
    '''
    parallelGroup.add_branch(trainAndPredict, view1, balanced1, testBase.documents)
    parallelGroup.add_branch(trainAndPredict, view2, balanced2, testBase.documents)
    parallelGroup.add_branch(trainAndPredict, view3, balanced3, testBase.documents)
    parallelGroup_results = parallelGroup.get_results()
    classifier1, classifier2, classifier3 = (r[0] for r in parallelGroup_results)
    pred = getAccumulatedPrediction(testBase, classifier1, classifier2, classifier3)
    if verificationBase.documents:
        correct = len([None for (p, doc) in zip(pred, testBase.documents) if p == doc.author])
        resline = "%d,%d,%d,%d,%d,%d" % (
            num_iterations, len(verificationBase.documents),
            getSuccessRate(verificationBase, classifier1),
            getSuccessRate(verificationBase, classifier2),
            getSuccessRate(verificationBase, classifier3),
            correct)
        print("RESULTS: ", resline)
        if results_stream is not None:
            results_stream.write(resline + "\n")
            results_stream.flush()
    classifier1.clearCache()
    classifier2.clearCache()
    classifier3.clearCache()
    return pred
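
# Hedged usage sketch (the view and documentbase objects are assumed to be set
# up elsewhere; the argument values are illustrative, not from the original code):
def _demo_threeTrain(view1, view2, view3, trainingBase, unlabelledBase, testBase):
    # one tri-training run: pairs of agreeing views label extra documents for
    # the third view, over 10 rounds of 300 sampled unlabelled documents each
    return threeTrain(view1, view2, view3, trainingBase, unlabelledBase,
                      testBase, num_iterations=10, num_unlabelled=300)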
        docs, [features.stanfordTreeDocumentFunction])
    functionCollection.getValues(docs, features.tokensDocumentFunction)
    functionCollection.getValues(docs, features.posDocumentFunction)
    functionCollection.getValues(docs, features.stDocumentDocumentFunction)
    for doc in docs:
        functionCollection.forgetDocument(doc)
    print("prepared %d documents" % len(documentbase.documents))


if __name__ == '__main__':
    import sys
    if len(sys.argv) < 6:
        print("usage: see ", sys.argv[0])
        sys.exit(1)
    stanford_db = sys.argv[1]
    tokens_db = sys.argv[2]
    pos_db = sys.argv[3]
    c_syntax_tree_db = sys.argv[4]
    documents = sys.argv[5:]
    functionCollection = features.documentFunctionCollection()

    def readfile(filename):
        with open(filename, 'rt') as f:
            return f.read()

    documentbase = features.documentbase(
        [features.document(readfile(d)) for d in documents])
    documentbase.functionCollection = functionCollection
    prepareDocuments(stanford_db, tokens_db, pos_db, c_syntax_tree_db, documentbase)
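
# Example invocation (file names and the script name are placeholders; the
# positional arguments are the four cache databases followed by any number of
# document files):
#
#   python prepare_documents.py stanford.db tokens.db pos.db c_syntax_tree.db doc1.txt doc2.txt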
    'Rita Beamish': ['RITA BEAMISH']
}

# !!! reducing number of authors to three !!!
for auth in list(selected_authors):
    if auth[0] != 'D':
        del selected_authors[auth]

selected_author_names = list(selected_authors.keys())
selected_authors_reversed = {}
for name, pseudos in selected_authors.items():
    for p in pseudos:
        selected_authors_reversed[p] = name
selected_docs = []
for doc in docs:
    if doc.author in selected_authors_reversed:
        selected_docs.append(features.document(doc.text, selected_authors_reversed[doc.author]))
selected_base = features.documentbase(selected_docs)
selected_base.functionCollection = functionCollection
selected_author_documents = [selected_base.byAuthor[auth] for auth in selected_author_names]

def prepareSelected():
    global selected_base
    prepare_documents.prepareDocumentsChunked('ap-selected-stanford.db', 'ap-selected-tokens.db',
                                              'ap-selected-pos.db', 'ap-selected-c_syntax_tree.db',
                                              selected_base)

def genCrossvalIndices(N, k):
    # generate index folds for k-fold cross-validation over range(N)
    sampleSize = N // k
    indices = list(range(N))
    result = [indices]
    for _ in range(k - 1):
        sample = random.sample(indices, sampleSize)
        result.append(sample)
        for i in sample: