def test_loadBiocFile_dir():
    """Load the ``data`` directory next to this file as BioC XML and check its single document."""
    dataPath = os.path.join(os.path.dirname(__file__), 'data')
    corpus = kindred.load(dataFormat='biocxml', path=dataPath)

    assert isinstance(corpus, kindred.Corpus)
    assert len(corpus.documents) == 1

    doc = corpus.documents[0]
    assert isinstance(doc, kindred.Document)

    entities = doc.entities
    relations = doc.relations

    # Index the loaded entities by their source ID so the expected relation
    # can be built from the very same entity objects.
    entityByID = {}
    for entity in entities:
        entityByID[entity.sourceEntityID] = entity

    assertEntity(entities[0],
                 expectedType='disease',
                 expectedText='colorectal cancer',
                 expectedPos=[(4, 21)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='APC',
                 expectedPos=[(49, 52)],
                 expectedSourceEntityID="T2")

    expectedRelation = kindred.Relation('causes',
                                        [entityByID["T1"], entityByID["T2"]],
                                        ['obj', 'subj'])
    assert relations == [expectedRelation], "(%s) not as expected" % relations
def test_loadEmptyDirectory():
    """Loading an empty directory must raise RuntimeError with the expected message for each format."""
    messageTemplate = 'No documents loaded from directory (%s). Are you sure this directory contains the corpus (format: %s)'
    with TempDir() as tempDir:
        for dataformat in ('standoff', 'simpletag', 'json', 'biocxml'):
            with pytest.raises(RuntimeError) as excinfo:
                kindred.load(dataformat, tempDir)

            # The message check has to sit outside the raises-context: nothing
            # after the raising call inside that context would ever run.
            expectedError = messageTemplate % (tempDir.rstrip('/'), dataformat)
            assert excinfo.value.args == (expectedError, )
def test_saveStandoffFile():
    """Round-trip a hand-built one-document corpus through the standoff format."""
    sentence = "The colorectal cancer was caused by mutations in APC"
    disease = kindred.Entity(entityType="disease",
                             text="colorectal cancer",
                             position=[(4, 21)],
                             sourceEntityID="T1")
    gene = kindred.Entity(entityType="gene",
                          text="APC",
                          position=[(49, 52)],
                          sourceEntityID="T2")
    causes = kindred.Relation(relationType="causes",
                              entities=[disease, gene],
                              argNames=['obj', 'subj'])
    doc = kindred.Document(sentence, [disease, gene], [causes])

    corpus = kindred.Corpus()
    corpus.addDocument(doc)

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        # Every relation annotation file written by the save gets checked.
        a2Names = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for name in a2Names:
            checkRelationAnnotations(os.path.join(tempDir, name))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.entities
    relations = loadedDoc.relations
    entityByID = dict((entity.sourceEntityID, entity) for entity in entities)

    assertEntity(entities[0],
                 expectedType='disease',
                 expectedText='colorectal cancer',
                 expectedPos=[(4, 21)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='APC',
                 expectedPos=[(49, 52)],
                 expectedSourceEntityID="T2")

    expectedRelation = kindred.Relation('causes',
                                        [entityByID["T1"], entityByID["T2"]],
                                        ['obj', 'subj'],
                                        sourceRelationID='R1')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
def test_saveBB3Data():
    """Save the '2016-BB3-event-train' corpus as standoff and check the document count after reloading."""
    corpus = kindred.bionlpst.load('2016-BB3-event-train')
    assert isinstance(corpus, kindred.Corpus)

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        a2Names = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for name in a2Names:
            checkRelationAnnotations(os.path.join(tempDir, name))

        loadedCorpus = kindred.load('standoff', tempDir)

    originalCount = len(corpus.documents)
    reloadedCount = len(loadedCorpus.documents)
    assert originalCount == reloadedCount
def test_saveStandoffFile_fromSimpleTag_triple():
    """Round-trip a SimpleTag document holding one three-argument relation through standoff."""
    text = '<drug id="T1">Erlotinib</drug>, a <gene id="T2">EGFR</gene> inhibitor is commonly used for <disease id="T3">NSCLC</disease> patients. <relation type="druginfo" drug="T1" gene="T2" disease="T3" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        a2Names = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for name in a2Names:
            checkRelationAnnotations(os.path.join(tempDir, name))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.entities
    relations = loadedDoc.relations
    entityByID = {}
    for entity in entities:
        entityByID[entity.sourceEntityID] = entity

    assertEntity(entities[0],
                 expectedType='drug',
                 expectedText='Erlotinib',
                 expectedPos=[(0, 9)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='EGFR',
                 expectedPos=[(13, 17)],
                 expectedSourceEntityID="T2")
    assertEntity(entities[2],
                 expectedType='disease',
                 expectedText='NSCLC',
                 expectedPos=[(49, 54)],
                 expectedSourceEntityID="T3")

    # Entities are listed in the order that pairs with the argument names
    # 'disease', 'drug', 'gene'.
    expectedEntities = [entityByID["T3"], entityByID["T1"], entityByID["T2"]]
    expectedRelation = kindred.Relation('druginfo',
                                        expectedEntities,
                                        ['disease', 'drug', 'gene'],
                                        sourceRelationID='R1')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
def test_saveStandoffFile_fromSimpleTag_binary():
    """Round-trip a SimpleTag document holding one two-argument relation through standoff."""
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        a2Names = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for name in a2Names:
            checkRelationAnnotations(os.path.join(tempDir, name))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.entities
    relations = loadedDoc.relations
    entityByID = dict((entity.sourceEntityID, entity) for entity in entities)

    assertEntity(entities[0],
                 expectedType='disease',
                 expectedText='colorectal cancer',
                 expectedPos=[(4, 21)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='APC',
                 expectedPos=[(49, 52)],
                 expectedSourceEntityID="T2")

    # Emit the loaded relation IDs (visible in the captured output on failure).
    print([r.sourceRelationID for r in relations])

    expectedRelation = kindred.Relation('causes',
                                        [entityByID["T1"], entityByID["T2"]],
                                        ['obj', 'subj'],
                                        sourceRelationID='R1')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
def test_savePubAnnotationFile_fromSimpleTag():
    """Round-trip a SimpleTag document through a single PubAnnotation file named ``corpus.json``."""
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    with TempDir() as tempDir:
        # Unlike the standoff tests, this format is saved to and loaded from
        # one file path rather than a directory.
        tempFile = os.path.join(tempDir, 'corpus.json')
        kindred.save(corpus, 'pubannotation', tempFile)
        loadedCorpus = kindred.load('pubannotation', tempFile)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.entities
    relations = loadedDoc.relations
    entityByID = {}
    for entity in entities:
        entityByID[entity.sourceEntityID] = entity

    assertEntity(entities[0],
                 expectedType='disease',
                 expectedText='colorectal cancer',
                 expectedPos=[(4, 21)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='APC',
                 expectedPos=[(49, 52)],
                 expectedSourceEntityID="T2")

    # Emit the loaded relation IDs (visible in the captured output on failure).
    print([r.sourceRelationID for r in relations])

    expectedRelation = kindred.Relation('causes',
                                        [entityByID["T1"], entityByID["T2"]],
                                        ['subj', 'obj'],
                                        sourceRelationID='R1')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
def test_loadStandoffFile_triple():
    """Load ``data_triple/example.txt`` as standoff and check its three entities and one relation."""
    txtPath = os.path.join(os.path.dirname(__file__), 'data_triple', 'example.txt')
    corpus = kindred.load('standoff', txtPath)

    assert isinstance(corpus, kindred.Corpus)
    assert len(corpus.documents) == 1

    doc = corpus.documents[0]
    assert isinstance(doc, kindred.Document)

    entities = doc.entities
    relations = doc.relations
    entityByID = dict((entity.sourceEntityID, entity) for entity in entities)

    assertEntity(entities[0],
                 expectedType='drug',
                 expectedText='Erlotinib',
                 expectedPos=[(0, 9)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='EGFR',
                 expectedPos=[(13, 17)],
                 expectedSourceEntityID="T2")
    assertEntity(entities[2],
                 expectedType='disease',
                 expectedText='NSCLC',
                 expectedPos=[(49, 54)],
                 expectedSourceEntityID="T3")

    # Entities are listed in the order that pairs with the argument names
    # 'disease', 'drug', 'gene'.
    expectedEntities = [entityByID["T3"], entityByID["T1"], entityByID["T2"]]
    expectedRelation = kindred.Relation('druginfo',
                                        expectedEntities,
                                        ['disease', 'drug', 'gene'],
                                        sourceRelationID='R0')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
def load(taskName, ignoreEntities=None):
    """
    Download and load the corresponding corpus from the BioNLP Shared Task

    The archive is downloaded into a freshly created temporary directory, which
    is removed again before this function returns or raises.

    :param taskName: The name of the shared task to download (e.g. 'BioNLP-ST-2016_BB-event_train'). Use kindred.bionlpst.listTasks() to get a list of valid options
    :param ignoreEntities: A list of any entities that should be ignored during loading (None, the default, means ignore nothing)
    :type taskName: str
    :type ignoreEntities: list of str
    :return: The loaded corpus
    :rtype: kindred.Corpus
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if ignoreEntities is None:
        ignoreEntities = []

    # Validate before touching the filesystem so that an unknown task name
    # does not leave an orphaned temporary directory behind.
    assert taskName in taskOptions.keys(), "%s not a valid option in %s" % (
        taskName, taskOptions.keys())

    url, expectedFile, expectedSHA256 = taskOptions[taskName]
    filesToDownload = [(url, expectedFile, expectedSHA256)]
    expectedDir = expectedFile.replace('.zip', '')

    tempDir = tempfile.mkdtemp()
    try:
        kindred.utils._downloadFiles(filesToDownload, tempDir)
        mainDir = kindred.utils._findDir(expectedDir, tempDir)
        corpus = kindred.load(dataFormat='standoff',
                              path=mainDir,
                              ignoreEntities=ignoreEntities)
    finally:
        # Runs on success and on any failure (download, directory lookup or
        # parsing); the original exception, if any, propagates unchanged.
        shutil.rmtree(tempDir)

    return corpus
def test_saveStandoffFile_SeparateSentences():
    """Round-trip two separate SimpleTag documents through standoff and check each one."""
    texts = [
        'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />',
        '<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />'
    ]

    corpus = kindred.Corpus()
    for simpleTagText in texts:
        corpus.addDocument(kindred.Document(simpleTagText, loadFromSimpleTag=True))

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        a2Names = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for name in a2Names:
            checkRelationAnnotations(os.path.join(tempDir, name))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 2

    # Per document: (disease text, disease position), (gene text, gene position).
    # Both documents use T1 for the disease and T2 for the gene.
    expectedPerDocument = [
        (('colorectal cancer', [(4, 21)]), ('APC', [(49, 52)])),
        (('Li-Fraumeni', [(0, 11)]), ('P53', [(39, 42)])),
    ]

    for docIndex, (diseaseSpec, geneSpec) in enumerate(expectedPerDocument):
        data = loadedCorpus.documents[docIndex]
        assert isinstance(data, kindred.Document)

        entities = data.entities
        relations = data.relations
        entityByID = dict((entity.sourceEntityID, entity) for entity in entities)

        diseaseText, diseasePos = diseaseSpec
        geneText, genePos = geneSpec
        assertEntity(entities[0],
                     expectedType='disease',
                     expectedText=diseaseText,
                     expectedPos=diseasePos,
                     expectedSourceEntityID="T1")
        assertEntity(entities[1],
                     expectedType='gene',
                     expectedText=geneText,
                     expectedPos=genePos,
                     expectedSourceEntityID="T2")

        expectedRelation = kindred.Relation('causes',
                                            [entityByID["T1"], entityByID["T2"]],
                                            ['obj', 'subj'])
        assert relations == [expectedRelation], "(%s) not as expected" % relations
def get_rels():
    """Run ``SELECT *`` against the ``relationships`` table and return every fetched row.

    Uses the module-level ``c`` object (presumably a DB-API cursor created
    elsewhere in this file -- TODO confirm).
    """
    c.execute("SELECT * FROM relationships")
    return c.fetchall()


def get_rels_clean():
    """Return only the ``gene``, ``disease`` and ``relation`` columns of every row in ``relationships``.

    Uses the same module-level ``c`` object as ``get_rels``.
    """
    c.execute("SELECT gene, disease, relation FROM relationships")
    return c.fetchall()


# Kindred
# 5 classes
# Train a relation classifier on one JSON corpus and apply it to a clone of a
# second JSON corpus.
trainCorpus = kindred.load(dataFormat='json', path='relation/db/1')
devCorpus = kindred.load(dataFormat='json', path='ner-dump')
# NOTE(review): no removeRelations() call is made on the clone before
# predict(), so it still carries whatever relations devCorpus had -- confirm
# this is intended.
predictionCorpus = devCorpus.clone()
classifier = kindred.RelationClassifier()
classifier.train(trainCorpus)
classifier.predict(predictionCorpus)
# F1 score of the predictions measured against devCorpus; the value is not
# used within the visible part of the script.
f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
print("5 CLASSES ---------------------")
# Build a Relationships object for every relation in the prediction corpus
# from the text of its first two entities, its relation type and the text of
# the containing document.
for i in predictionCorpus.documents:
    for j in (i.relations):
        rel = Relationships(j.entities[0].text, j.entities[1].text, j.relationType, i.text)
# Command-line interface: two required directories of stand-off data, one for
# training and one for testing.
parser = argparse.ArgumentParser(
    description='Create PR curves using a train and test set')
parser.add_argument('--train',
                    required=True,
                    type=str,
                    help='Directory containing stand-off training test')
parser.add_argument('--test',
                    required=True,
                    type=str,
                    help='Directory containing stand-off testing test')
args = parser.parse_args()

# Header of the tab-separated output.
print("threshold\tprecision\trecall")
# Sweep the classifier threshold from 0 upwards in steps of 0.01 (the stop
# value 1.01 is exclusive).
for threshold in np.arange(0, 1.01, 0.01):
    # NOTE(review): both corpora are reloaded and re-parsed on every
    # iteration although only the threshold changes -- confirm whether this
    # is required before hoisting it out of the loop.
    trainCorpus = kindred.load('standoff', args.train)
    testCorpus = kindred.load('standoff', args.test)
    # The prediction corpus is a copy of the test corpus with its relations
    # stripped.
    predCorpus = testCorpus.clone()
    predCorpus.removeRelations()
    # From here on `parser` is the kindred parser; the argparse parser bound
    # to the same name above is no longer reachable.
    parser = kindred.Parser(model='en_core_sci_sm')
    parser.parse(trainCorpus)
    parser.parse(testCorpus)
    parser.parse(predCorpus)
    # Logistic-regression relation classifier restricted to the
    # ('Chemical', 'Mutation') entity-type pair, at the current threshold.
    classifier = kindred.RelationClassifier(
        classifierType='LogisticRegression',
        threshold=threshold,
        acceptedEntityTypes=[('Chemical', 'Mutation')])
    classifier.train(trainCorpus)
import kindred
import argparse
import os
import csv

# Write a CSV with one column for the iteration number and one per classifier
# named in `fieldnames`.
with open('5_types_lp.csv', mode='w') as csv_file:
    fieldnames = ['iteration_num', 'svm', 'decision_trees', 'neural_net']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    # Whole annotated corpus; it is re-split into train/dev on every iteration.
    Corpus = kindred.load(dataFormat='json', path='db/5types')
    # avg_svm = 0
    # avg_dct = 0
    # avg_nn = 0
    count = 0
    iter_num = 200
    print("-------------5 CLASSES------------")
    # NOTE(review): `count` is not incremented in the visible part of this
    # loop -- presumably that happens further down; confirm.
    while count < iter_num:
        print("------", count, "------")
        # 90% of the corpus for training, the remainder held out.
        trainCorpus, devCorpus = Corpus.split(trainFraction=0.9)
        # Predictions are made on a copy of the held-out corpus with its
        # relations removed.
        predictionCorpus = devCorpus.clone()
        predictionCorpus.removeRelations()
        classifier = kindred.RelationClassifier()
headers = [ 'pmid', 'title', 'journal', 'journal_short', 'year', 'month', 'day', 'section', 'subsection', 'chemical_mesh_id', 'chemical_pharmgkb_id', 'chemical_drugbank_id', 'chemical_text', 'chemical_normalized', 'chemical_position', 'variant_id', 'variant_type', 'variant_text', 'variant_normalized', 'variant_position', 'gene_ids', 'gene_names', 'score', 'sentence', 'formatted_sentence' ] with open(args.outKB, 'w') as outF: outF.write("\t".join(headers) + "\n") for mode, trainingData in zip(modes, trainingFiles): print("Creating classifier for %s" % mode) predictedCount = 0 trainCorpus = kindred.load('biocxml', trainingData) corpus = kindred.load('biocxml', args.inBioC) for doc in trainCorpus.documents: for relation in doc.relations: relation.relationType = 'ChemicalMutation' for doc in corpus.documents: if mode == 'star_allele': doc.entities = [ e for e in doc.entities if not (e.entityType == 'Mutation' and not e.text.strip().startswith('*')) ] elif mode == 'rs': doc.entities = [
import kindred
import argparse
import os

if __name__ == '__main__':
    # Command-line interface: three required string options.
    argParser = argparse.ArgumentParser(description='Use annotated sentences to build a Kindred classifer and apply to unannotated sentences')
    requiredOptions = [
        ('--dataToBuildModel', 'Sentences with relations'),
        ('--dataToApplyModel', 'Sentences without annotated relations to make predictions on'),
        ('--outDir', 'Directory to store output'),
    ]
    for optionName, optionHelp in requiredOptions:
        argParser.add_argument(optionName, required=True, type=str, help=optionHelp)
    args = argParser.parse_args()

    print("Loading corpora...")
    trainCorpus = kindred.load('standoff', args.dataToBuildModel)
    predictionCorpus = kindred.load('standoff', args.dataToApplyModel)

    print("Building classifier...")
    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)

    print("Applying classifier...")
    classifier.predict(predictionCorpus)

    # Create the output directory only when it is not there yet.
    outputDirExists = os.path.isdir(args.outDir)
    if not outputDirExists:
        os.makedirs(args.outDir)

    print("Saving results to directory...")
    kindred.save(predictionCorpus, 'standoff', args.outDir)

    # One tab-separated line per relation: the text of its first two entities.
    print("\nPredicted relations:")
    for predicted in predictionCorpus.getRelations():
        firstEntity, secondEntity = predicted.entities[0], predicted.entities[1]
        print("%s\t%s" % (firstEntity.text, secondEntity.text))
'Driver': 0.80, 'Oncogene': 0.76, 'Tumor_Suppressor': 0.92 } else: thresholds = {'Driver': 0.5, 'Oncogene': 0.5, 'Tumor_Suppressor': 0.5} for relationType, outModel in zip( ['Driver', 'Oncogene', 'Tumor_Suppressor'], [ args.outModel_Driver, args.outModel_Oncogene, args.outModel_TumorSuppressor ]): print("Building %s model" % relationType) print(" Loading training") goldDir = 'gold' trainCorpus = kindred.load('standoff', args.inTrain) for doc in trainCorpus.documents: doc.relations = [ r for r in doc.relations if r.relationType == relationType ] print(" Doing training") features = "entityTypes,unigramsBetweenEntities,bigrams,dependencyPathEdges,dependencyPathEdgesNearEntities".split( ',') threshold = thresholds[relationType] classifier = kindred.RelationClassifier( classifierType='LogisticRegression', threshold=threshold, features=features, acceptedEntityTypes=[('cancer', 'gene')])
with open(args.variantStopwords) as f: variantStopwords = set(line.strip().lower() for line in f) stopwords = {'dopamine', 'insulin', 'caffeine', 'nicotine', 'choline'} print("Loading sentences...") corpus = kindred.Corpus() filenames = sorted(os.listdir(args.sentenceData)) if args.fileCount: filenames = random.sample(filenames, args.fileCount) for i, filename in enumerate(filenames): print(i, filename) tmpCorpus = kindred.load('biocxml', os.path.join(args.sentenceData, filename)) if args.filterTerms: tmpCorpus.documents = [ doc for doc in tmpCorpus.documents if any(filterTerm in doc.text.lower() for filterTerm in filterTerms) ] tmpCorpus.documents = [ doc for doc in tmpCorpus.documents if not any(stopword in doc.text.lower() for stopword in stopwords) ] filtered = [] for doc in tmpCorpus.documents: if args.mode == 'star_allele': doc.entities = [