예제 #1
0
def test_saveStandoffFile_fromSimpleTag():
	"""Round-trip a SimpleTag-annotated document through the standoff format.

	A one-document corpus is saved to a temporary directory, loaded back with
	``kindred.loadDir`` and the loaded document is checked for its two
	entities and its single 'causes' relation.
	"""
	text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
	corpus = kindred.Corpus()
	doc = kindred.Document(text)
	corpus.addDocument(doc)

	tempDir = tempfile.mkdtemp()
	try:
		kindred.save(corpus,'standoff',tempDir)
		loadedCorpus = kindred.loadDir('standoff',tempDir)
	finally:
		# mkdtemp leaves deletion to the caller: clean up even when saving or loading raises
		shutil.rmtree(tempDir)

	assert isinstance(loadedCorpus,kindred.Corpus)
	assert len(loadedCorpus.documents) == 1
	loadedDoc = loadedCorpus.documents[0]

	assert isinstance(loadedDoc,kindred.Document)
	entities = loadedDoc.getEntities()
	relations = loadedDoc.getRelations()

	# Relations refer to internal entity IDs, so map the source IDs (T1, T2) onto them
	sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

	assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
	assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
	assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations
예제 #2
0
def test_saveStandoffFile_SeparateSentences():
	"""Round-trip a two-document corpus through the standoff format.

	Each SimpleTag text becomes its own document. After saving to a temporary
	directory and loading back with ``kindred.loadDir``, both documents are
	checked for their two entities and their single 'causes' relation.
	"""
	texts = ['The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />','<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />']
	corpus = kindred.Corpus()
	for t in texts:
		doc = kindred.Document(t)
		corpus.addDocument(doc)

	tempDir = tempfile.mkdtemp()
	try:
		kindred.save(corpus,'standoff',tempDir)
		loadedCorpus = kindred.loadDir('standoff',tempDir)
	finally:
		# mkdtemp leaves deletion to the caller: clean up even when saving or loading raises
		shutil.rmtree(tempDir)

	assert isinstance(loadedCorpus,kindred.Corpus)
	assert len(loadedCorpus.documents) == 2

	# Expected (type, text, position, sourceEntityID) of the entities, one list per document
	expectedEntitiesPerDoc = [
		[('disease','colorectal cancer',[(4,21)],"T1"),('gene','APC',[(49,52)],"T2")],
		[('disease','Li-Fraumeni',[(0,11)],"T1"),('gene','P53',[(39,42)],"T2")],
	]

	for docIndex,expectedEntities in enumerate(expectedEntitiesPerDoc):
		data = loadedCorpus.documents[docIndex]
		assert isinstance(data,kindred.Document)
		entities = data.getEntities()
		relations = data.getRelations()
		sourceEntityIDsToEntityIDs = data.getSourceEntityIDsToEntityIDs()

		for entityIndex,(expectedType,expectedText,expectedPos,expectedSourceEntityID) in enumerate(expectedEntities):
			assertEntity(entities[entityIndex],expectedType=expectedType,expectedText=expectedText,expectedPos=expectedPos,expectedSourceEntityID=expectedSourceEntityID)

		assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations
예제 #3
0
def test_saveStandoffFile_noArgNames():
	"""Round-trip a relation created without argument names through standoff.

	The relation is built without explicit argNames; after saving and loading
	the test expects it to come back with the names 'arg1' and 'arg2'.
	"""
	text = "The colorectal cancer was caused by mutations in APC"
	e1 = kindred.Entity(entityType="disease",text="colorectal cancer",position=[(4, 21)],sourceEntityID="T1")
	e2 = kindred.Entity(entityType="gene",text="APC",position=[(49, 52)],sourceEntityID="T2")
	rel = kindred.Relation(relationType="causes",entityIDs=[e1.entityID,e2.entityID])
	doc = kindred.Document(text,[e1,e2],[rel],relationsUseSourceIDs=False)
	corpus = kindred.Corpus()
	corpus.addDocument(doc)

	tempDir = tempfile.mkdtemp()
	try:
		kindred.save(corpus,'standoff',tempDir)
		loadedCorpus = kindred.loadDir('standoff',tempDir)
	finally:
		# mkdtemp leaves deletion to the caller: clean up even when saving or loading raises
		shutil.rmtree(tempDir)

	assert isinstance(loadedCorpus,kindred.Corpus)
	assert len(loadedCorpus.documents) == 1
	loadedDoc = loadedCorpus.documents[0]

	assert isinstance(loadedDoc,kindred.Document)
	entities = loadedDoc.getEntities()
	relations = loadedDoc.getRelations()

	# Relations refer to internal entity IDs, so map the source IDs (T1, T2) onto them
	sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

	assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
	assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
	assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['arg1','arg2'])], "(%s) not as expected" % relations
예제 #4
0
def test_iterLoadBiocFile():
	"""Save 100 copies of one document as BioC and load them back in chunks.

	The corpus is written to a temporary directory, then read back through
	``kindred.iterLoadDataFromBioc``. Every yielded chunk must be a Corpus of
	at most 25 documents, every document must carry the expected entities and
	relation, and the chunks together must contain all saved documents.
	"""
	text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
	corpus = kindred.Corpus(text,loadFromSimpleTag=True)
	docsToCreate = 100

	# The same Document object is repeated to make up the corpus
	singleDoc = corpus.documents[0]
	corpus.documents = [ singleDoc for _ in range(docsToCreate) ]

	tempDir = tempfile.mkdtemp()
	try:
		kindred.save(corpus,'bioc',tempDir)

		biocPath = os.path.join(tempDir,'collection.bioc.xml')
		totalDocCount = 0
		# The whole loop stays inside the try block so the directory outlives the iteration
		for loadedCorpus in kindred.iterLoadDataFromBioc(biocPath,corpusSizeCutoff=3):
			assert isinstance(loadedCorpus,kindred.Corpus)

			assert len(loadedCorpus.documents) <= 25
			totalDocCount += len(loadedCorpus.documents)

			for doc in loadedCorpus.documents:
				assert isinstance(doc,kindred.Document)
				entities = doc.getEntities()
				relations = doc.getRelations()

				sourceEntityIDsToEntityIDs = doc.getSourceEntityIDsToEntityIDs()

				assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
				assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
				assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations

		assert totalDocCount == docsToCreate
	finally:
		# mkdtemp leaves deletion to the caller: clean up even when an assertion fails
		shutil.rmtree(tempDir)
예제 #5
0
def test_saveStandoffFile_noSourceEntityID():
    """Check that saving to standoff fails for an entity without a sourceEntityID.

    The SimpleTag text has no ``id`` attribute on its entity, and the save is
    expected to raise an AssertionError carrying a specific message.
    """
    expectedMessage = 'Entities must have a sourceEntityID (e.g. T1) to be saved in the standoff format'
    taggedText = 'The <disease>colorectal cancer</disease> is bad.'
    corpus = kindred.Corpus(taggedText, loadFromSimpleTag=True)

    with TempDir() as outputDir:
        with pytest.raises(AssertionError) as raised:
            kindred.save(corpus, 'standoff', outputDir)

        # The first exception argument holds the assertion message
        actualMessage = raised.value.args[0]
        assert actualMessage == expectedMessage
예제 #6
0
파일: test_save.py 프로젝트: vj1494/kindred
def test_saveStandoffFile():
    """Round-trip a hand-built document through the standoff format.

    Two entities and one 'causes' relation are built directly, saved, checked
    on disk via ``checkRelationAnnotations`` for every ``.a2`` file, and then
    loaded back and compared against the expected entities and relation.
    """
    sentence = "The colorectal cancer was caused by mutations in APC"
    disease = kindred.Entity(entityType="disease", text="colorectal cancer",
                             position=[(4, 21)], sourceEntityID="T1")
    gene = kindred.Entity(entityType="gene", text="APC",
                          position=[(49, 52)], sourceEntityID="T2")
    causes = kindred.Relation(relationType="causes", entities=[disease, gene],
                              argNames=['obj', 'subj'])
    document = kindred.Document(sentence, [disease, gene], [causes])

    corpus = kindred.Corpus()
    corpus.addDocument(document)

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        # Validate each relation annotation file that was written
        a2Names = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for a2Name in a2Names:
            checkRelationAnnotations(os.path.join(tempDir, a2Name))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities, relations = loadedDoc.entities, loadedDoc.relations

    # Index the loaded entities by their source ID (T1, T2)
    sourceEntityIDToEntity = {}
    for loadedEntity in entities:
        sourceEntityIDToEntity[loadedEntity.sourceEntityID] = loadedEntity

    expectedEntities = [
        ('disease', 'colorectal cancer', [(4, 21)], "T1"),
        ('gene', 'APC', [(49, 52)], "T2"),
    ]
    for index, (eType, eText, ePos, eSourceID) in enumerate(expectedEntities):
        assertEntity(entities[index],
                     expectedType=eType,
                     expectedText=eText,
                     expectedPos=ePos,
                     expectedSourceEntityID=eSourceID)

    expectedRelation = kindred.Relation(
        'causes',
        [sourceEntityIDToEntity["T1"], sourceEntityIDToEntity["T2"]],
        ['obj', 'subj'],
        sourceRelationID='R1')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
예제 #7
0
def test_saveBB3Data():
	"""Save the BB3 event training corpus as standoff and reload it.

	The reloaded corpus must contain as many documents as the corpus that
	was saved.
	"""
	corpus = kindred.bionlpst.load('2016-BB3-event-train')
	assert isinstance(corpus,kindred.Corpus)

	tempDir = tempfile.mkdtemp()
	try:
		kindred.save(corpus,'standoff',tempDir)

		loadedCorpus = kindred.loadDir('standoff',tempDir)
		assert len(corpus.documents) == len(loadedCorpus.documents)
	finally:
		# mkdtemp leaves deletion to the caller: clean up even when the test fails
		shutil.rmtree(tempDir)
예제 #8
0
def test_saveBB3Data():
    """Save the BB3 event training corpus as standoff and reload it.

    Every ``.a2`` file written is passed to ``checkRelationAnnotations``, and
    the reloaded corpus must have the same number of documents as the original.
    """
    originalCorpus = kindred.bionlpst.load('2016-BB3-event-train')
    assert isinstance(originalCorpus, kindred.Corpus)

    with TempDir() as outputDir:
        kindred.save(originalCorpus, 'standoff', outputDir)

        # Validate each relation annotation file that was written
        a2Names = [name for name in os.listdir(outputDir) if name.endswith('.a2')]
        for a2Name in a2Names:
            checkRelationAnnotations(os.path.join(outputDir, a2Name))

        reloadedCorpus = kindred.load('standoff', outputDir)
        originalCount = len(originalCorpus.documents)
        assert originalCount == len(reloadedCorpus.documents)
예제 #9
0
def test_iterLoadBiocFile():
    """Save 100 copies of one document as BioC XML and load them back in chunks.

    Every chunk yielded by ``kindred.iterLoad`` must be a Corpus of at most 25
    documents, every document must carry the expected entities and relation,
    and the chunks together must contain all saved documents.
    """
    taggedText = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus(taggedText, loadFromSimpleTag=True)
    docsToCreate = 100

    expectedEntities = [
        ('disease', 'colorectal cancer', [(4, 21)], "T1"),
        ('gene', 'APC', [(49, 52)], "T2"),
    ]

    with TempDir() as tempDir:
        # The same Document object is repeated to make up the corpus
        corpus.documents = [corpus.documents[0]] * docsToCreate

        biocPath = os.path.join(tempDir, 'corpus.bioc.xml')
        kindred.save(corpus, 'biocxml', biocPath)

        totalDocCount = 0
        chunks = kindred.iterLoad('biocxml', biocPath, corpusSizeCutoff=3)
        for chunk in chunks:
            assert isinstance(chunk, kindred.Corpus)

            chunkSize = len(chunk.documents)
            assert chunkSize <= 25
            totalDocCount += chunkSize

            for loadedDoc in chunk.documents:
                assert isinstance(loadedDoc, kindred.Document)
                entities, relations = loadedDoc.entities, loadedDoc.relations

                # Index the loaded entities by their source ID (T1, T2)
                bySourceID = dict((e.sourceEntityID, e) for e in entities)

                for index, (eType, eText, ePos, eSourceID) in enumerate(expectedEntities):
                    assertEntity(entities[index],
                                 expectedType=eType,
                                 expectedText=eText,
                                 expectedPos=ePos,
                                 expectedSourceEntityID=eSourceID)

                expectedRelation = kindred.Relation(
                    'causes', [bySourceID["T1"], bySourceID["T2"]],
                    ['obj', 'subj'])
                assert relations == [expectedRelation
                                     ], "(%s) not as expected" % relations

        assert totalDocCount == docsToCreate
예제 #10
0
파일: test_save.py 프로젝트: vj1494/kindred
def test_saveStandoffFile_fromSimpleTag_triple():
    """Round-trip a three-argument relation through the standoff format.

    The SimpleTag text defines a drug, a gene and a disease joined by one
    'druginfo' relation. After saving (with every ``.a2`` file checked by
    ``checkRelationAnnotations``) and loading, the entities and the relation
    are compared against the expected values.
    """
    taggedText = '<drug id="T1">Erlotinib</drug>, a <gene id="T2">EGFR</gene> inhibitor is commonly used for <disease id="T3">NSCLC</disease> patients. <relation type="druginfo" drug="T1" gene="T2" disease="T3" />'
    corpus = kindred.Corpus(taggedText, loadFromSimpleTag=True)

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        # Validate each relation annotation file that was written
        a2Names = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for a2Name in a2Names:
            checkRelationAnnotations(os.path.join(tempDir, a2Name))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities, relations = loadedDoc.entities, loadedDoc.relations

    # Index the loaded entities by their source ID (T1, T2, T3)
    bySourceID = dict((e.sourceEntityID, e) for e in entities)

    expectedEntities = [
        ('drug', 'Erlotinib', [(0, 9)], "T1"),
        ('gene', 'EGFR', [(13, 17)], "T2"),
        ('disease', 'NSCLC', [(49, 54)], "T3"),
    ]
    for index, (eType, eText, ePos, eSourceID) in enumerate(expectedEntities):
        assertEntity(entities[index],
                     expectedType=eType,
                     expectedText=eText,
                     expectedPos=ePos,
                     expectedSourceEntityID=eSourceID)

    # The expected entity order follows the argument names: disease, drug, gene
    expectedRelation = kindred.Relation(
        'druginfo',
        [bySourceID["T3"], bySourceID["T1"], bySourceID["T2"]],
        ['disease', 'drug', 'gene'],
        sourceRelationID='R1')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
예제 #11
0
파일: test_save.py 프로젝트: vj1494/kindred
def test_saveStandoffFile_fromSimpleTag_binary():
    """Round-trip a two-argument relation through the standoff format.

    The SimpleTag text defines a disease and a gene joined by one 'causes'
    relation. After saving (with every ``.a2`` file checked by
    ``checkRelationAnnotations``) and loading, the entities and the relation
    are compared against the expected values.
    """
    taggedText = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus(taggedText, loadFromSimpleTag=True)

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        # Validate each relation annotation file that was written
        a2Names = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for a2Name in a2Names:
            checkRelationAnnotations(os.path.join(tempDir, a2Name))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities, relations = loadedDoc.entities, loadedDoc.relations

    # Index the loaded entities by their source ID (T1, T2)
    bySourceID = dict((e.sourceEntityID, e) for e in entities)

    expectedEntities = [
        ('disease', 'colorectal cancer', [(4, 21)], "T1"),
        ('gene', 'APC', [(49, 52)], "T2"),
    ]
    for index, (eType, eText, ePos, eSourceID) in enumerate(expectedEntities):
        assertEntity(entities[index],
                     expectedType=eType,
                     expectedText=eText,
                     expectedPos=ePos,
                     expectedSourceEntityID=eSourceID)

    # Prints the loaded relation IDs (output kept from the original test)
    loadedRelationIDs = [r.sourceRelationID for r in relations]
    print(loadedRelationIDs)

    expectedRelation = kindred.Relation(
        'causes',
        [bySourceID["T1"], bySourceID["T2"]],
        ['obj', 'subj'],
        sourceRelationID='R1')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
예제 #12
0
def test_saveStandoffFile_fromSimpleTag_triple():
    """Round-trip a three-argument relation through the standoff format.

    The SimpleTag text defines a drug, a gene and a disease joined by one
    'druginfo' relation. The corpus is saved to a temporary directory, loaded
    back with ``kindred.loadDir`` and compared against the expected values.
    """
    text = '<drug id="T1">Erlotinib</drug>, a <gene id="T2">EGFR</gene> inhibitor is commonly used for <disease id="T3">NSCLC</disease> patients. <relation type="druginfo" drug="T1" gene="T2" disease="T3" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus, 'standoff', tempDir)
        loadedCorpus = kindred.loadDir('standoff', tempDir)
    finally:
        # mkdtemp leaves deletion to the caller: clean up even when saving or loading raises
        shutil.rmtree(tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1
    loadedDoc = loadedCorpus.documents[0]

    assert isinstance(loadedDoc, kindred.Document)
    entities = loadedDoc.getEntities()
    relations = loadedDoc.getRelations()

    # Relations refer to internal entity IDs, so map the source IDs (T1..T3) onto them
    sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

    assertEntity(entities[0],
                 expectedType='drug',
                 expectedText='Erlotinib',
                 expectedPos=[(0, 9)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='EGFR',
                 expectedPos=[(13, 17)],
                 expectedSourceEntityID="T2")
    assertEntity(entities[2],
                 expectedType='disease',
                 expectedText='NSCLC',
                 expectedPos=[(49, 54)],
                 expectedSourceEntityID="T3")
    # The expected entity order follows the argument names: disease, drug, gene
    assert relations == [
        kindred.Relation('druginfo', [
            sourceEntityIDsToEntityIDs["T3"], sourceEntityIDsToEntityIDs["T1"],
            sourceEntityIDsToEntityIDs["T2"]
        ], ['disease', 'drug', 'gene'])
    ], "(%s) not as expected" % relations
예제 #13
0
파일: test_save.py 프로젝트: vj1494/kindred
def test_savePubAnnotationFile_fromSimpleTag():
    """Round-trip a SimpleTag corpus through the PubAnnotation format.

    The corpus is saved to a single JSON file, loaded back, and the loaded
    document is checked for its two entities and its 'causes' relation.
    """
    taggedText = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus(taggedText, loadFromSimpleTag=True)

    with TempDir() as tempDir:
        jsonPath = os.path.join(tempDir, 'corpus.json')
        kindred.save(corpus, 'pubannotation', jsonPath)
        loadedCorpus = kindred.load('pubannotation', jsonPath)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities, relations = loadedDoc.entities, loadedDoc.relations

    # Index the loaded entities by their source ID (T1, T2)
    bySourceID = dict((e.sourceEntityID, e) for e in entities)

    expectedEntities = [
        ('disease', 'colorectal cancer', [(4, 21)], "T1"),
        ('gene', 'APC', [(49, 52)], "T2"),
    ]
    for index, (eType, eText, ePos, eSourceID) in enumerate(expectedEntities):
        assertEntity(entities[index],
                     expectedType=eType,
                     expectedText=eText,
                     expectedPos=ePos,
                     expectedSourceEntityID=eSourceID)

    # Prints the loaded relation IDs (output kept from the original test)
    loadedRelationIDs = [r.sourceRelationID for r in relations]
    print(loadedRelationIDs)

    expectedRelation = kindred.Relation(
        'causes',
        [bySourceID["T1"], bySourceID["T2"]],
        ['subj', 'obj'],
        sourceRelationID='R1')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
예제 #14
0
def _bionlpst_seedev_testSet():
    """Train on SeeDev train+dev, predict on the test set and save as standoff.

    The predictions are written to the ``out.SeeDev`` directory together with
    the test corpus.
    """
    trainData, devData, testData = [
        kindred.bionlpst.load(datasetName)
        for datasetName in ('2016-SeeDev-binary-train',
                            '2016-SeeDev-binary-dev',
                            '2016-SeeDev-binary-test')
    ]

    # Training uses the training and development sets combined
    combinedData = trainData + devData

    print("Starting training...")
    classifier = kindred.RelationClassifier()
    classifier.train(combinedData)

    print("Predicting training...")
    predictedRelations = classifier.predict(testData)

    print("Saving...")
    kindred.save(testData,
                 'standoff',
                 'out.SeeDev',
                 predictedRelations=predictedRelations)
예제 #15
0
def _bionlpst_bb3_testSet():
    """Train on BB3 event train+dev, predict on the test set and save as standoff.

    The classifier is created with ``useBuilder=True``. The predictions are
    written to the ``out.BB3`` directory together with the test corpus.
    """
    trainData, devData, testData = [
        kindred.bionlpst.load(datasetName)
        for datasetName in ('2016-BB3-event-train',
                            '2016-BB3-event-dev',
                            '2016-BB3-event-test')
    ]

    # Training uses the training and development sets combined
    combinedData = trainData + devData

    print("Starting training...")
    classifier = kindred.RelationClassifier(useBuilder=True)
    classifier.train(combinedData)

    print("Predicting training...")
    predictedRelations = classifier.predict(testData)

    print("Saving...")
    kindred.save(testData,
                 'standoff',
                 'out.BB3',
                 predictedRelations=predictedRelations)
예제 #16
0
                # Distinct entity types annotated in this sentence
                entityTypes = set([
                    entity.entityType
                    for entity, tokenIndices in sentence.entityAnnotations
                ])
                # NOTE(review): entityInfo is not used in the visible part of this snippet
                entityInfo = [(e.entityType, e.text)
                              for e, tokenIndices in sentence.entityAnnotations
                              ]

                hasMutation = "Mutation" in entityTypes
                hasChemical = "Chemical" in entityTypes

                # Keep only sentences that mention both a Mutation and a Chemical
                if hasMutation and hasChemical:
                    # Start offset of the first token, used to rebase entity positions
                    sentenceStart = sentence.tokens[0].startPos

                    # Copy each entity with its first span (position[0]) shifted by
                    # sentenceStart; any further spans are not carried over
                    sentenceEntities = [
                        kindred.Entity(e.entityType,
                                       e.text,
                                       [(e.position[0][0] - sentenceStart,
                                         e.position[0][1] - sentenceStart)],
                                       e.sourceEntityID,
                                       e.externalID,
                                       metadata=e.metadata)
                        for e, _ in sentence.entityAnnotations
                    ]
                    # The new single-sentence document reuses the parent document's metadata
                    newDoc = kindred.Document(sentence.text,
                                              sentenceEntities,
                                              metadata=doc.metadata)
                    sentenceCorpus.addDocument(newDoc)

    # Write the collected sentence documents as BioC XML to the path given by args.outBioc
    kindred.save(sentenceCorpus, 'biocxml', args.outBioc)
예제 #17
0
import os

if __name__ == '__main__':
	# Command-line entry point: train on one standoff corpus, predict on another, save the result
	argParser = argparse.ArgumentParser(description='Use annotated sentences to build a Kindred classifer and apply to unannotated sentences')
	for flag,helpText in [
		('--dataToBuildModel','Sentences with relations'),
		('--dataToApplyModel','Sentences without annotated relations to make predictions on'),
		('--outDir','Directory to store output'),
	]:
		argParser.add_argument(flag,required=True,type=str,help=helpText)
	args = argParser.parse_args()

	print("Loading corpora...")
	trainCorpus = kindred.load('standoff',args.dataToBuildModel)
	predictionCorpus = kindred.load('standoff',args.dataToApplyModel)

	print("Building classifier...")
	classifier = kindred.RelationClassifier()
	classifier.train(trainCorpus)

	print("Applying classifier...")
	# The return value is ignored; the relations are read back from predictionCorpus below
	classifier.predict(predictionCorpus)

	outputDirExists = os.path.isdir(args.outDir)
	if not outputDirExists:
		os.makedirs(args.outDir)

	print("Saving results to directory...")
	kindred.save(predictionCorpus,'standoff',args.outDir)

	print("\nPredicted relations:")
	for predicted in predictionCorpus.getRelations():
		# Only the first two entities of each relation are printed
		firstText = predicted.entities[0].text
		secondText = predicted.entities[1].text
		print("%s\t%s" % (firstText,secondText))

예제 #18
0
    # Work on a sentence-level corpus derived from the loaded corpus
    sentenceCorpus = corpus.splitIntoSentences()

    print("Looking for measurement words, e.g. voltage")
    # Wordlist passed to the EntityRecognizer: a tuple of words maps to a set of
    # 2-tuples. NOTE(review): the 2-tuples look like (entity type, ID) — confirm
    # against the kindred documentation
    wordlist = {
        ('voltage', ): {('measurement', 'voltage')},
        ('current', ): {('measurement', 'current')}
    }

    entityRecognizer = kindred.EntityRecognizer(wordlist)
    entityRecognizer.annotate(sentenceCorpus)

    print("Looking for numeric values")
    # QuantityRecognizer is defined outside this snippet
    quantityRecognizer = QuantityRecognizer()
    quantityRecognizer.annotate(sentenceCorpus)

    print("Find every pair of a measurement word and a value")
    # Candidates are restricted to ('measurement', 'quantity') entity-type pairs
    candidateBuilder = kindred.CandidateBuilder(
        acceptedEntityTypes=[('measurement', 'quantity')])
    candidateRelations = candidateBuilder.build(sentenceCorpus)

    print("Let's annotate a few")
    # manuallyAnnotate returns two values; only withRelations is saved below
    withRelations, noRelations = kindred.manuallyAnnotate(
        sentenceCorpus, candidateRelations)

    # Create the output directory if it does not exist yet
    outDir = 'numericalAnnotations'
    if not os.path.isdir(outDir):
        os.makedirs(outDir)

    print("Saving results to directory...")
    kindred.save(withRelations, 'standoff', outDir)
예제 #19
0
    # Load the wordlists; column 0 is used for both the ID and the terms
    wordlistLookup = kindred.EntityRecognizer.loadWordlists(wordlistDict,
                                                            idColumn=0,
                                                            termsColumn=0)

    print("Annotating entities in corpus with wordlists")
    entityRecognizer = kindred.EntityRecognizer(wordlistLookup)
    entityRecognizer.annotate(sentenceCorpus)

    print("Finding all candidate relations")
    # NOTE(review): acceptedEntityTypes is assigned here but not used in the
    # visible part of this snippet
    acceptedEntityTypes = wordlistDict
    # One candidate argument per wordlist; the only accepted combination is the
    # sorted tuple of all wordlist keys
    candidateBuilder = kindred.CandidateBuilder(
        entityCount=len(wordlistDict),
        acceptedEntityTypes=[tuple(sorted(wordlistDict.keys()))])
    candidateRelations = candidateBuilder.build(sentenceCorpus)

    print(
        "Time to through some of the candidate relations and annotate some...")
    # manuallyAnnotate returns two corpora, which are saved separately below
    annotatedCorpus, unannotatedCorpus = kindred.manuallyAnnotate(
        sentenceCorpus, candidateRelations)

    print(
        "\nSaving annotated corpus of %d sentences (with relations that you have just annotated)"
        % len(annotatedCorpus.documents))
    kindred.save(annotatedCorpus, 'standoff', annotatedDir)

    print(
        "Saving unannotated corpus of %d sentences (which you did not review)"
        % len(unannotatedCorpus.documents))
    kindred.save(unannotatedCorpus, 'standoff', unannotatedDir)
예제 #20
0
def test_saveStandoffFile_SeparateSentences():
    """Round-trip a two-document corpus through the standoff format.

    Each SimpleTag text becomes its own document. After saving (with every
    ``.a2`` file checked by ``checkRelationAnnotations``) and loading, both
    documents are checked for their two entities and their 'causes' relation.
    """
    taggedTexts = [
        'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />',
        '<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />'
    ]
    corpus = kindred.Corpus()
    for taggedText in taggedTexts:
        corpus.addDocument(kindred.Document(taggedText, loadFromSimpleTag=True))

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        # Validate each relation annotation file that was written
        a2Names = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for a2Name in a2Names:
            checkRelationAnnotations(os.path.join(tempDir, a2Name))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 2

    # Expected (type, text, position, sourceEntityID) of the entities, one list per document
    expectedEntitiesPerDoc = [
        [('disease', 'colorectal cancer', [(4, 21)], "T1"),
         ('gene', 'APC', [(49, 52)], "T2")],
        [('disease', 'Li-Fraumeni', [(0, 11)], "T1"),
         ('gene', 'P53', [(39, 42)], "T2")],
    ]

    for docIndex, expectedEntities in enumerate(expectedEntitiesPerDoc):
        data = loadedCorpus.documents[docIndex]
        assert isinstance(data, kindred.Document)

        entities, relations = data.entities, data.relations

        # Index the loaded entities by their source ID (T1, T2)
        bySourceID = dict((e.sourceEntityID, e) for e in entities)

        for index, (eType, eText, ePos, eSourceID) in enumerate(expectedEntities):
            assertEntity(entities[index],
                         expectedType=eType,
                         expectedText=eText,
                         expectedPos=ePos,
                         expectedSourceEntityID=eSourceID)

        expectedRelation = kindred.Relation(
            'causes',
            [bySourceID["T1"], bySourceID["T2"]],
            ['obj', 'subj'])
        assert relations == [expectedRelation
                             ], "(%s) not as expected" % relations
예제 #21
0
                if not (e.entityType == 'Chemical'
                        and not e.metadata['conceptid'] in drugMeshIDs)
            ]
            # Drop Chemical entities whose text is four characters or shorter
            doc.entities = [
                e for e in doc.entities
                if not (e.entityType == 'Chemical' and len(e.text) <= 4)
            ]
            # Drop Mutation entities for which pgmine.normalizeMutation returns None
            doc.entities = [
                e for e in doc.entities
                if not (e.entityType == 'Mutation'
                        and pgmine.normalizeMutation(e.text) is None)
            ]
            # Drop Mutation entities whose lower-cased text is in variantStopwords
            doc.entities = [
                e for e in doc.entities
                if not (e.entityType == 'Mutation'
                        and e.text.lower() in variantStopwords)
            ]

            # Keep the document only if both a Chemical and a Mutation survived the filters
            entityTypes = set(e.entityType for e in doc.entities)
            if 'Chemical' in entityTypes and 'Mutation' in entityTypes:
                filtered.append(doc)

        corpus.documents += filtered

    print("Found: ", len(corpus.documents))
    # Keep a random subset of 500 documents.
    # NOTE(review): random.sample raises ValueError when fewer than 500 documents
    # are available, and no seed is set in the visible code
    corpus.documents = random.sample(corpus.documents, 500)

    # Write the sample twice: as standoff files and as one BioC XML file inside args.outDir
    kindred.save(corpus, 'standoff', args.outDir)
    kindred.save(corpus, 'biocxml', os.path.join(args.outDir,
                                                 'corpus.bioc.xml'))