def test_saveStandoffFile_noArgNames():
    """Round-trip a relation built without argument names through standoff.

    A one-document corpus is saved in 'standoff' format to a temporary
    directory and loaded back. The reloaded relation is expected to carry
    the argument names 'arg1' and 'arg2'.
    """
    text = "The colorectal cancer was caused by mutations in APC"
    e1 = kindred.Entity(entityType="disease",text="colorectal cancer",position=[(4, 21)],sourceEntityID="T1")
    e2 = kindred.Entity(entityType="gene",text="APC",position=[(49, 52)],sourceEntityID="T2")
    rel = kindred.Relation(relationType="causes",entityIDs=[e1.entityID,e2.entityID])
    doc = kindred.Document(text,[e1,e2],[rel],relationsUseSourceIDs=False)
    corpus = kindred.Corpus()
    corpus.addDocument(doc)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus,'standoff',tempDir)
        loadedCorpus = kindred.loadDir('standoff',tempDir)

        assert isinstance(loadedCorpus,kindred.Corpus)
        assert len(loadedCorpus.documents) == 1
        loadedDoc = loadedCorpus.documents[0]
        assert isinstance(loadedDoc,kindred.Document)

        entities = loadedDoc.getEntities()
        relations = loadedDoc.getRelations()
        sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

        assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
        assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
        assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['arg1','arg2'])], "(%s) not as expected" % relations
    finally:
        # Remove the temporary directory even when a check above fails,
        # so failing runs do not leave files behind.
        shutil.rmtree(tempDir)
def load(taskName,ignoreEntities=[]):
    """
    Download and load the corresponding corpus from the BioNLP Shared Task

    The files are downloaded into a temporary directory, which is removed
    before this function returns or raises.

    :param taskName: The name of the shared task to download (e.g. 'BioNLP-ST-2016_BB-event_train'). Use kindred.bionlpst.listTasks() to get a list of valid options
    :param ignoreEntities: A list of any entities that should be ignored during loading
    :type taskName: str
    :type ignoreEntities: list of str
    :return: The loaded corpus
    :rtype: kindred.Corpus
    :raises AssertionError: if taskName is not a key of taskOptions
    """
    # Validate before touching the filesystem so that an invalid task name
    # does not leave an empty temporary directory behind.
    assert taskName in taskOptions.keys(), "%s not a valid option in %s" % (taskName, taskOptions.keys())

    url,expectedFile,expectedSHA256 = taskOptions[taskName]
    filesToDownload = [(url,expectedFile,expectedSHA256)]
    # Name of the directory to look for: the zip filename without '.zip'
    expectedDir = expectedFile.replace('.zip','')

    # NOTE: ignoreEntities is only forwarded to kindred.loadDir below; this
    # function itself never mutates the (shared) default list.
    tempDir = tempfile.mkdtemp()
    try:
        kindred.utils._downloadFiles(filesToDownload,tempDir)
        mainDir = kindred.utils._findDir(expectedDir,tempDir)
        corpus = kindred.loadDir(dataFormat='standoff',directory=mainDir,ignoreEntities=ignoreEntities)
    finally:
        # One cleanup path for download, directory lookup and parsing
        # failures as well as for success; any exception still propagates.
        shutil.rmtree(tempDir)

    return corpus
def test_saveStandoffFile_fromSimpleTag():
    """Round-trip a document built from tagged text through standoff.

    The document is created from text annotated with entity and relation
    tags, saved in 'standoff' format to a temporary directory and loaded
    back. The reloaded entities and the single 'causes' relation (argument
    names 'obj' and 'subj') are then checked.
    """
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus()
    doc = kindred.Document(text)
    corpus.addDocument(doc)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus,'standoff',tempDir)
        loadedCorpus = kindred.loadDir('standoff',tempDir)

        assert isinstance(loadedCorpus,kindred.Corpus)
        assert len(loadedCorpus.documents) == 1
        loadedDoc = loadedCorpus.documents[0]
        assert isinstance(loadedDoc,kindred.Document)

        entities = loadedDoc.getEntities()
        relations = loadedDoc.getRelations()
        sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

        assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
        assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
        assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations
    finally:
        # Remove the temporary directory even when a check above fails,
        # so failing runs do not leave files behind.
        shutil.rmtree(tempDir)
def test_saveStandoffFile_SeparateSentences():
    """Round-trip a two-document corpus through standoff.

    Each tagged text is added as its own document. After saving to and
    loading from a temporary directory, both documents are checked, in
    order, for their two entities and their single 'causes' relation.
    """
    texts = ['The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />','<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />']

    corpus = kindred.Corpus()
    for t in texts:
        doc = kindred.Document(t)
        corpus.addDocument(doc)

    # Expected (disease text, disease position, gene text, gene position)
    # for each loaded document, in document order.
    expected = [
        ('colorectal cancer',[(4,21)],'APC',[(49,52)]),
        ('Li-Fraumeni',[(0,11)],'P53',[(39,42)]),
    ]

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus,'standoff',tempDir)
        loadedCorpus = kindred.loadDir('standoff',tempDir)

        assert isinstance(loadedCorpus,kindred.Corpus)
        assert len(loadedCorpus.documents) == 2

        for data,(diseaseText,diseasePos,geneText,genePos) in zip(loadedCorpus.documents,expected):
            assert isinstance(data,kindred.Document)

            entities = data.getEntities()
            relations = data.getRelations()
            sourceEntityIDsToEntityIDs = data.getSourceEntityIDsToEntityIDs()

            assertEntity(entities[0],expectedType='disease',expectedText=diseaseText,expectedPos=diseasePos,expectedSourceEntityID="T1")
            assertEntity(entities[1],expectedType='gene',expectedText=geneText,expectedPos=genePos,expectedSourceEntityID="T2")
            assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations
    finally:
        # Remove the temporary directory even when a check above fails,
        # so failing runs do not leave files behind.
        shutil.rmtree(tempDir)
def test_loadBiocFile_dir():
    """Load the 'data' directory next to this file as a BioC corpus.

    Expects exactly one document holding a disease entity, a gene entity
    and one 'causes' relation with argument names 'obj' and 'subj'.
    """
    dataPath = os.path.join(os.path.dirname(__file__), 'data')

    loaded = kindred.loadDir(dataFormat='bioc', directory=dataPath)
    assert isinstance(loaded, kindred.Corpus)
    assert len(loaded.documents) == 1

    onlyDoc = loaded.documents[0]
    assert isinstance(onlyDoc, kindred.Document)

    foundEntities = onlyDoc.getEntities()
    foundRelations = onlyDoc.getRelations()
    idLookup = onlyDoc.getSourceEntityIDsToEntityIDs()

    assertEntity(
        foundEntities[0],
        expectedType='disease',
        expectedText='colorectal cancer',
        expectedPos=[(4, 21)],
        expectedSourceEntityID="T1",
    )
    assertEntity(
        foundEntities[1],
        expectedType='gene',
        expectedText='APC',
        expectedPos=[(49, 52)],
        expectedSourceEntityID="T2",
    )

    # Built only after the entity checks so failures surface in the same
    # order as before.
    wantedRelation = kindred.Relation('causes', [idLookup["T1"], idLookup["T2"]], ['obj', 'subj'])
    assert foundRelations == [wantedRelation], "(%s) not as expected" % foundRelations
def test_loadEmptyDirectory():
    """Check that loading an empty directory raises a RuntimeError.

    For each of the 'standoff', 'simpletag', 'json' and 'bioc' formats,
    kindred.loadDir on an empty temporary directory must raise a
    RuntimeError whose args hold exactly the expected message.
    """
    tempDir = tempfile.mkdtemp()
    try:
        for dataformat in ['standoff','simpletag','json','bioc']:
            with pytest.raises(RuntimeError) as excinfo:
                kindred.loadDir(dataformat,tempDir)

            # The expected message shows the directory with exactly one trailing slash
            expectedError = 'No documents loaded from directory (%s/). Are you sure this directory contains the corpus (format: %s)' % (tempDir.rstrip('/'),dataformat)
            assert excinfo.value.args == (expectedError ,)
    finally:
        # Remove the temporary directory even if one format fails the check
        shutil.rmtree(tempDir)
def test_saveBB3Data():
    """Round-trip the '2016-BB3-event-train' corpus through standoff.

    The corpus is obtained via kindred.bionlpst.load, saved to a temporary
    directory and loaded back; the document count must be unchanged.
    """
    corpus = kindred.bionlpst.load('2016-BB3-event-train')
    assert isinstance(corpus,kindred.Corpus)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus,'standoff',tempDir)
        loadedCorpus = kindred.loadDir('standoff',tempDir)

        assert len(corpus.documents) == len(loadedCorpus.documents)
    finally:
        # Remove the temporary directory even when saving, loading or the
        # comparison fails.
        shutil.rmtree(tempDir)
def test_saveStandoffFile_fromSimpleTag_triple():
    """Round-trip a three-argument relation through standoff.

    A corpus is built from tagged text containing a drug, a gene and a
    disease joined by one 'druginfo' relation. It is saved to a temporary
    directory and loaded back; the reloaded entities and the relation
    (argument names 'disease', 'drug', 'gene') are then checked.
    """
    text = '<drug id="T1">Erlotinib</drug>, a <gene id="T2">EGFR</gene> inhibitor is commonly used for <disease id="T3">NSCLC</disease> patients. <relation type="druginfo" drug="T1" gene="T2" disease="T3" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus, 'standoff', tempDir)
        loadedCorpus = kindred.loadDir('standoff', tempDir)
    finally:
        # The directory is only needed for the save/load pair; remove it
        # even if either call raises.
        shutil.rmtree(tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1
    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.getEntities()
    relations = loadedDoc.getRelations()
    sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

    assertEntity(entities[0], expectedType='drug', expectedText='Erlotinib', expectedPos=[(0, 9)], expectedSourceEntityID="T1")
    assertEntity(entities[1], expectedType='gene', expectedText='EGFR', expectedPos=[(13, 17)], expectedSourceEntityID="T2")
    assertEntity(entities[2], expectedType='disease', expectedText='NSCLC', expectedPos=[(49, 54)], expectedSourceEntityID="T3")

    assert relations == [
        kindred.Relation('druginfo', [
            sourceEntityIDsToEntityIDs["T3"],
            sourceEntityIDsToEntityIDs["T1"],
            sourceEntityIDsToEntityIDs["T2"]
        ], ['disease', 'drug', 'gene'])
    ], "(%s) not as expected" % relations
) parser.add_argument('--reltype', type=str, required=True, help='Relation type to analyze. Must be one of %s' % reltypes) parser.add_argument('--outCurve', type=str, required=True, help='File to output curve data to') args = parser.parse_args() with open(args.outCurve, 'w') as outF: outF.write("%s\t%s\t%s\n" % ('threshold', 'precision', 'recall')) for threshold in [-0.1] + list(np.arange(0, 1, 0.01)) + [1.0]: train = kindred.loadDir('standoff', args.trainDir) gold = kindred.loadDir('standoff', args.testDir) # Trim back to relation type of choice for doc in train.documents: doc.relations = [ r for r in doc.relations if r.relationType == args.reltype ] for doc in gold.documents: doc.relations = [ r for r in doc.relations if r.relationType == args.reltype ] entityType = entityTypes[args.reltype] entityCount = len(entityType) classifier = kindred.RelationClassifier(