Exemplo n.º 1
0
def test_loadBiocFile_dir():
    """Load the 'data' directory beside this file as BioC XML and check its one document."""
    dataDir = os.path.join(os.path.dirname(__file__), 'data')
    loaded = kindred.load(dataFormat='biocxml', path=dataDir)

    assert isinstance(loaded, kindred.Corpus)
    assert len(loaded.documents) == 1

    document = loaded.documents[0]
    assert isinstance(document, kindred.Document)

    foundEntities = document.entities
    foundRelations = document.relations

    # Look-up from the annotation's own ID to the loaded entity object
    byID = dict((ent.sourceEntityID, ent) for ent in foundEntities)

    # (type, text, position, source ID) expected at each index of the entity list
    expectedEntities = [
        ('disease', 'colorectal cancer', [(4, 21)], "T1"),
        ('gene', 'APC', [(49, 52)], "T2"),
    ]
    for index, (eType, eText, ePos, eID) in enumerate(expectedEntities):
        assertEntity(foundEntities[index],
                     expectedType=eType,
                     expectedText=eText,
                     expectedPos=ePos,
                     expectedSourceEntityID=eID)

    expectedRelations = [
        kindred.Relation('causes', [byID["T1"], byID["T2"]], ['obj', 'subj'])
    ]
    assert foundRelations == expectedRelations, (
        "(%s) not as expected" % foundRelations)
Exemplo n.º 2
0
def test_loadEmptyDirectory():
    """Loading an empty directory must raise RuntimeError with a fixed message, for each format tried."""
    messageTemplate = 'No documents loaded from directory (%s). Are you sure this directory contains the corpus (format: %s)'

    with TempDir() as emptyDir:
        for fmt in ('standoff', 'simpletag', 'json', 'biocxml'):
            with pytest.raises(RuntimeError) as raised:
                kindred.load(fmt, emptyDir)

            # The message reports the directory without any trailing slash
            expectedArgs = (messageTemplate % (emptyDir.rstrip('/'), fmt), )
            assert raised.value.args == expectedArgs
Exemplo n.º 3
0
def test_saveStandoffFile():
    """Save a hand-built one-document corpus as standoff, reload it and compare annotations."""
    sentence = "The colorectal cancer was caused by mutations in APC"
    disease = kindred.Entity(entityType="disease",
                             text="colorectal cancer",
                             position=[(4, 21)],
                             sourceEntityID="T1")
    gene = kindred.Entity(entityType="gene",
                          text="APC",
                          position=[(49, 52)],
                          sourceEntityID="T2")
    causes = kindred.Relation(relationType="causes",
                              entities=[disease, gene],
                              argNames=['obj', 'subj'])
    document = kindred.Document(sentence, [disease, gene], [causes])
    original = kindred.Corpus()
    original.addDocument(document)

    with TempDir() as outDir:
        kindred.save(original, 'standoff', outDir)

        # Every written .a2 file goes through the relation-annotation check
        a2Names = [name for name in os.listdir(outDir) if name.endswith('.a2')]
        for name in a2Names:
            checkRelationAnnotations(os.path.join(outDir, name))

        reloaded = kindred.load('standoff', outDir)

    assert isinstance(reloaded, kindred.Corpus)
    assert len(reloaded.documents) == 1

    reloadedDoc = reloaded.documents[0]
    assert isinstance(reloadedDoc, kindred.Document)

    foundEntities = reloadedDoc.entities
    foundRelations = reloadedDoc.relations
    byID = dict((ent.sourceEntityID, ent) for ent in foundEntities)

    expectedEntities = [
        ('disease', 'colorectal cancer', [(4, 21)], "T1"),
        ('gene', 'APC', [(49, 52)], "T2"),
    ]
    for index, (eType, eText, ePos, eID) in enumerate(expectedEntities):
        assertEntity(foundEntities[index],
                     expectedType=eType,
                     expectedText=eText,
                     expectedPos=ePos,
                     expectedSourceEntityID=eID)

    expectedRelations = [
        kindred.Relation('causes', [byID["T1"], byID["T2"]], ['obj', 'subj'],
                         sourceRelationID='R1')
    ]
    assert foundRelations == expectedRelations, (
        "(%s) not as expected" % foundRelations)
Exemplo n.º 4
0
def test_saveBB3Data():
    """Save the '2016-BB3-event-train' corpus as standoff and check the reloaded document count."""
    original = kindred.bionlpst.load('2016-BB3-event-train')
    assert isinstance(original, kindred.Corpus)

    with TempDir() as outDir:
        kindred.save(original, 'standoff', outDir)

        # Every written .a2 file goes through the relation-annotation check
        a2Names = [name for name in os.listdir(outDir) if name.endswith('.a2')]
        for name in a2Names:
            checkRelationAnnotations(os.path.join(outDir, name))

        reloaded = kindred.load('standoff', outDir)
        originalCount = len(original.documents)
        reloadedCount = len(reloaded.documents)
        assert originalCount == reloadedCount
Exemplo n.º 5
0
def test_saveStandoffFile_fromSimpleTag_triple():
    """Round-trip a SimpleTag sentence holding a three-argument relation through standoff."""
    taggedText = '<drug id="T1">Erlotinib</drug>, a <gene id="T2">EGFR</gene> inhibitor is commonly used for <disease id="T3">NSCLC</disease> patients. <relation type="druginfo" drug="T1" gene="T2" disease="T3" />'
    original = kindred.Corpus(taggedText, loadFromSimpleTag=True)

    with TempDir() as outDir:
        kindred.save(original, 'standoff', outDir)

        # Every written .a2 file goes through the relation-annotation check
        a2Names = [name for name in os.listdir(outDir) if name.endswith('.a2')]
        for name in a2Names:
            checkRelationAnnotations(os.path.join(outDir, name))

        reloaded = kindred.load('standoff', outDir)

    assert isinstance(reloaded, kindred.Corpus)
    assert len(reloaded.documents) == 1

    reloadedDoc = reloaded.documents[0]
    assert isinstance(reloadedDoc, kindred.Document)

    foundEntities = reloadedDoc.entities
    foundRelations = reloadedDoc.relations
    byID = dict((ent.sourceEntityID, ent) for ent in foundEntities)

    expectedEntities = [
        ('drug', 'Erlotinib', [(0, 9)], "T1"),
        ('gene', 'EGFR', [(13, 17)], "T2"),
        ('disease', 'NSCLC', [(49, 54)], "T3"),
    ]
    for index, (eType, eText, ePos, eID) in enumerate(expectedEntities):
        assertEntity(foundEntities[index],
                     expectedType=eType,
                     expectedText=eText,
                     expectedPos=ePos,
                     expectedSourceEntityID=eID)

    # Entities are listed in the same order as the argument names beside them
    expectedRelations = [
        kindred.Relation('druginfo',
                         [byID["T3"], byID["T1"], byID["T2"]],
                         ['disease', 'drug', 'gene'],
                         sourceRelationID='R1')
    ]
    assert foundRelations == expectedRelations, (
        "(%s) not as expected" % foundRelations)
Exemplo n.º 6
0
def test_saveStandoffFile_fromSimpleTag_binary():
    """Round-trip a SimpleTag sentence holding a two-argument relation through standoff."""
    taggedText = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    original = kindred.Corpus(taggedText, loadFromSimpleTag=True)

    with TempDir() as outDir:
        kindred.save(original, 'standoff', outDir)

        # Every written .a2 file goes through the relation-annotation check
        a2Names = [name for name in os.listdir(outDir) if name.endswith('.a2')]
        for name in a2Names:
            checkRelationAnnotations(os.path.join(outDir, name))

        reloaded = kindred.load('standoff', outDir)

    assert isinstance(reloaded, kindred.Corpus)
    assert len(reloaded.documents) == 1

    reloadedDoc = reloaded.documents[0]
    assert isinstance(reloadedDoc, kindred.Document)

    foundEntities = reloadedDoc.entities
    foundRelations = reloadedDoc.relations
    byID = dict((ent.sourceEntityID, ent) for ent in foundEntities)

    expectedEntities = [
        ('disease', 'colorectal cancer', [(4, 21)], "T1"),
        ('gene', 'APC', [(49, 52)], "T2"),
    ]
    for index, (eType, eText, ePos, eID) in enumerate(expectedEntities):
        assertEntity(foundEntities[index],
                     expectedType=eType,
                     expectedText=eText,
                     expectedPos=ePos,
                     expectedSourceEntityID=eID)

    # Shown in the captured output to help diagnose a failing comparison below
    relationIDs = [found.sourceRelationID for found in foundRelations]
    print(relationIDs)

    expectedRelations = [
        kindred.Relation('causes', [byID["T1"], byID["T2"]], ['obj', 'subj'],
                         sourceRelationID='R1')
    ]
    assert foundRelations == expectedRelations, (
        "(%s) not as expected" % foundRelations)
Exemplo n.º 7
0
def test_savePubAnnotationFile_fromSimpleTag():
    """Round-trip a SimpleTag sentence through a single PubAnnotation JSON file."""
    taggedText = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    original = kindred.Corpus(taggedText, loadFromSimpleTag=True)

    with TempDir() as outDir:
        # This format is saved to and loaded from one file path, not a directory
        jsonPath = os.path.join(outDir, 'corpus.json')
        kindred.save(original, 'pubannotation', jsonPath)
        reloaded = kindred.load('pubannotation', jsonPath)

    assert isinstance(reloaded, kindred.Corpus)
    assert len(reloaded.documents) == 1

    reloadedDoc = reloaded.documents[0]
    assert isinstance(reloadedDoc, kindred.Document)

    foundEntities = reloadedDoc.entities
    foundRelations = reloadedDoc.relations
    byID = dict((ent.sourceEntityID, ent) for ent in foundEntities)

    expectedEntities = [
        ('disease', 'colorectal cancer', [(4, 21)], "T1"),
        ('gene', 'APC', [(49, 52)], "T2"),
    ]
    for index, (eType, eText, ePos, eID) in enumerate(expectedEntities):
        assertEntity(foundEntities[index],
                     expectedType=eType,
                     expectedText=eText,
                     expectedPos=ePos,
                     expectedSourceEntityID=eID)

    # Shown in the captured output to help diagnose a failing comparison below
    relationIDs = [found.sourceRelationID for found in foundRelations]
    print(relationIDs)

    expectedRelations = [
        kindred.Relation('causes', [byID["T1"], byID["T2"]], ['subj', 'obj'],
                         sourceRelationID='R1')
    ]
    assert foundRelations == expectedRelations, (
        "(%s) not as expected" % foundRelations)
Exemplo n.º 8
0
def test_loadStandoffFile_triple():
    """Load data_triple/example.txt as standoff and check its three-argument relation."""
    examplePath = os.path.join(os.path.dirname(__file__), 'data_triple',
                               'example.txt')
    loaded = kindred.load('standoff', examplePath)

    assert isinstance(loaded, kindred.Corpus)
    assert len(loaded.documents) == 1

    document = loaded.documents[0]
    assert isinstance(document, kindred.Document)

    foundEntities = document.entities
    foundRelations = document.relations
    byID = dict((ent.sourceEntityID, ent) for ent in foundEntities)

    expectedEntities = [
        ('drug', 'Erlotinib', [(0, 9)], "T1"),
        ('gene', 'EGFR', [(13, 17)], "T2"),
        ('disease', 'NSCLC', [(49, 54)], "T3"),
    ]
    for index, (eType, eText, ePos, eID) in enumerate(expectedEntities):
        assertEntity(foundEntities[index],
                     expectedType=eType,
                     expectedText=eText,
                     expectedPos=ePos,
                     expectedSourceEntityID=eID)

    # Entities are listed in the same order as the argument names beside them
    expectedRelations = [
        kindred.Relation('druginfo',
                         [byID["T3"], byID["T1"], byID["T2"]],
                         ['disease', 'drug', 'gene'],
                         sourceRelationID='R0')
    ]
    assert foundRelations == expectedRelations, (
        "(%s) not as expected" % foundRelations)
Exemplo n.º 9
0
def load(taskName, ignoreEntities=[]):
    """
    Download and load the corresponding corpus from the BioNLP Shared Task

    The archive is downloaded into a temporary directory which is removed
    again before returning, whether loading succeeds or fails.

    :param taskName: The name of the shared task to download (e.g. '2016-BB3-event-train'). Use kindred.bionlpst.listTasks() to get a list of valid options
    :param ignoreEntities: A list of any entities that should be ignored during loading
    :type taskName: str
    :type ignoreEntities: list of str
    :return: The loaded corpus
    :rtype: kindred.Corpus
    :raises AssertionError: if taskName is not one of the known task options
    """
    # NOTE: the default list for ignoreEntities is shared between calls; it is
    # only passed through here, never modified.

    # Validate the name before touching the filesystem so that an invalid
    # task leaves no temporary directory behind.
    assert taskName in taskOptions, "%s not a valid option in %s" % (
        taskName, taskOptions.keys())
    url, expectedFile, expectedSHA256 = taskOptions[taskName]
    filesToDownload = [(url, expectedFile, expectedSHA256)]
    expectedDir = expectedFile.replace('.zip', '')

    tempDir = tempfile.mkdtemp()
    try:
        kindred.utils._downloadFiles(filesToDownload, tempDir)

        mainDir = kindred.utils._findDir(expectedDir, tempDir)

        corpus = kindred.load(dataFormat='standoff',
                              path=mainDir,
                              ignoreEntities=ignoreEntities)
    finally:
        # Runs on success and on any failure above (download, directory
        # lookup or parsing); the original exception keeps propagating.
        shutil.rmtree(tempDir)

    return corpus
Exemplo n.º 10
0
def test_saveStandoffFile_SeparateSentences():
    """Round-trip a two-document corpus through standoff and check each document in turn."""
    taggedTexts = [
        'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />',
        '<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />'
    ]
    original = kindred.Corpus()
    for tagged in taggedTexts:
        original.addDocument(kindred.Document(tagged, loadFromSimpleTag=True))

    with TempDir() as outDir:
        kindred.save(original, 'standoff', outDir)

        # Every written .a2 file goes through the relation-annotation check
        a2Names = [name for name in os.listdir(outDir) if name.endswith('.a2')]
        for name in a2Names:
            checkRelationAnnotations(os.path.join(outDir, name))

        reloaded = kindred.load('standoff', outDir)

    assert isinstance(reloaded, kindred.Corpus)
    assert len(reloaded.documents) == 2

    # Per reloaded document: (type, text, position, source ID) for each entity index
    expectedPerDocument = [
        [
            ('disease', 'colorectal cancer', [(4, 21)], "T1"),
            ('gene', 'APC', [(49, 52)], "T2"),
        ],
        [
            ('disease', 'Li-Fraumeni', [(0, 11)], "T1"),
            ('gene', 'P53', [(39, 42)], "T2"),
        ],
    ]

    for docIndex, expectedEntities in enumerate(expectedPerDocument):
        reloadedDoc = reloaded.documents[docIndex]
        assert isinstance(reloadedDoc, kindred.Document)

        foundEntities = reloadedDoc.entities
        foundRelations = reloadedDoc.relations
        byID = dict((ent.sourceEntityID, ent) for ent in foundEntities)

        for index, (eType, eText, ePos, eID) in enumerate(expectedEntities):
            assertEntity(foundEntities[index],
                         expectedType=eType,
                         expectedText=eText,
                         expectedPos=ePos,
                         expectedSourceEntityID=eID)

        expectedRelations = [
            kindred.Relation('causes', [byID["T1"], byID["T2"]],
                             ['obj', 'subj'])
        ]
        assert foundRelations == expectedRelations, (
            "(%s) not as expected" % foundRelations)
Exemplo n.º 11
0
def get_rels():
    """Run a select of all columns on ``relationships`` through the outer cursor ``c`` and return every row."""
    query = "SELECT * FROM relationships"
    c.execute(query)
    rows = c.fetchall()
    return rows


def get_rels_clean():
    """Return every row of ``relationships`` restricted to the gene, disease and relation columns, using the outer cursor ``c``."""
    query = "SELECT gene, disease, relation FROM relationships"
    c.execute(query)
    rows = c.fetchall()
    return rows


# Kindred

# 5 classes

# Both corpora are read from JSON-format data at fixed relative paths.
trainCorpus = kindred.load(dataFormat='json', path='relation/db/1')
devCorpus = kindred.load(dataFormat='json', path='ner-dump')

# Predictions are made on a copy so devCorpus itself is left as loaded.
# NOTE(review): relations are not removed from the clone before predicting;
# presumably the 'ner-dump' data carries none - confirm.
predictionCorpus = devCorpus.clone()

# Train a classifier with default settings, then run it over the copy.
# The loop below reads the relations back from predictionCorpus.
classifier = kindred.RelationClassifier()
classifier.train(trainCorpus)
classifier.predict(predictionCorpus)

# NOTE(review): f1score is not used anywhere in the code visible here.
f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')

print("5 CLASSES ---------------------")
# i is a document, j is one of its relations.
for i in predictionCorpus.documents:
    for j in (i.relations):
        # Built from the text of the relation's first two entities, the
        # relation type and the full document text.
        # NOTE(review): only entities[0] and entities[1] are read, so any
        # further arguments of a relation would be ignored.
        rel = Relationships(j.entities[0].text, j.entities[1].text,
                            j.relationType, i.text)
Exemplo n.º 12
0
    # Command-line interface: --train and --test are both required.
    parser = argparse.ArgumentParser(
        description='Create PR curves using a train and test set')
    parser.add_argument('--train',
                        required=True,
                        type=str,
                        help='Directory containing stand-off training test')
    parser.add_argument('--test',
                        required=True,
                        type=str,
                        help='Directory containing stand-off testing test')
    args = parser.parse_args()

    # Header row of the tab-separated output.
    print("threshold\tprecision\trecall")
    # Sweep the classifier threshold from 0 upwards in steps of 0.01; the stop
    # value 1.01 is there so that the sweep reaches 1.0.
    for threshold in np.arange(0, 1.01, 0.01):

        # NOTE(review): both corpora are reloaded and reparsed on every
        # iteration although only `threshold` changes between iterations.
        trainCorpus = kindred.load('standoff', args.train)
        testCorpus = kindred.load('standoff', args.test)

        # Copy of the test corpus with its relations removed.
        predCorpus = testCorpus.clone()
        predCorpus.removeRelations()

        # NOTE(review): this rebinds `parser`, replacing the argparse parser
        # created above with a kindred.Parser.
        parser = kindred.Parser(model='en_core_sci_sm')
        parser.parse(trainCorpus)
        parser.parse(testCorpus)
        parser.parse(predCorpus)

        # Logistic regression classifier at the current threshold, limited to
        # the ('Chemical', 'Mutation') entity-type pairing.
        classifier = kindred.RelationClassifier(
            classifierType='LogisticRegression',
            threshold=threshold,
            acceptedEntityTypes=[('Chemical', 'Mutation')])
        classifier.train(trainCorpus)
Exemplo n.º 13
0
import kindred
import argparse
import os
import csv

# The csv module expects files handed to a writer to be opened with
# newline=''; without it, platforms that translate '\n' on write end up with
# '\r\r\n' row endings, which show up as blank rows between records.
with open('5_types_lp.csv', mode='w', newline='') as csv_file:
    # One output row per iteration, one column per classifier type.
    fieldnames = ['iteration_num', 'svm', 'decision_trees', 'neural_net']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()

    Corpus = kindred.load(dataFormat='json', path='db/5types')

    # avg_svm = 0
    # avg_dct = 0
    # avg_nn = 0

    # Loop counter and the number of iterations to run.
    count = 0
    iter_num = 200

    print("-------------5 CLASSES------------")

    while count < iter_num:
        print("------", count, "------")

        # Re-split the corpus on every iteration into a training part
        # (trainFraction=0.9) and a development part.
        trainCorpus, devCorpus = Corpus.split(trainFraction=0.9)

        # Copy of the development part with its relations removed.
        predictionCorpus = devCorpus.clone()
        predictionCorpus.removeRelations()

        classifier = kindred.RelationClassifier()
Exemplo n.º 14
0
    headers = [
        'pmid', 'title', 'journal', 'journal_short', 'year', 'month', 'day',
        'section', 'subsection', 'chemical_mesh_id', 'chemical_pharmgkb_id',
        'chemical_drugbank_id', 'chemical_text', 'chemical_normalized',
        'chemical_position', 'variant_id', 'variant_type', 'variant_text',
        'variant_normalized', 'variant_position', 'gene_ids', 'gene_names',
        'score', 'sentence', 'formatted_sentence'
    ]
    with open(args.outKB, 'w') as outF:
        outF.write("\t".join(headers) + "\n")

        for mode, trainingData in zip(modes, trainingFiles):
            print("Creating classifier for %s" % mode)
            predictedCount = 0

            trainCorpus = kindred.load('biocxml', trainingData)
            corpus = kindred.load('biocxml', args.inBioC)

            for doc in trainCorpus.documents:
                for relation in doc.relations:
                    relation.relationType = 'ChemicalMutation'

            for doc in corpus.documents:
                if mode == 'star_allele':
                    doc.entities = [
                        e for e in doc.entities
                        if not (e.entityType == 'Mutation'
                                and not e.text.strip().startswith('*'))
                    ]
                elif mode == 'rs':
                    doc.entities = [
Exemplo n.º 15
0
import kindred
import argparse
import os

if __name__ == '__main__':
	# Command-line options: two stand-off inputs and one output directory, all required
	argParser = argparse.ArgumentParser(description='Use annotated sentences to build a Kindred classifer and apply to unannotated sentences')
	argParser.add_argument('--dataToBuildModel',required=True,type=str,help='Sentences with relations')
	argParser.add_argument('--dataToApplyModel',required=True,type=str,help='Sentences without annotated relations to make predictions on')
	argParser.add_argument('--outDir',required=True,type=str,help='Directory to store output')
	options = argParser.parse_args()

	print("Loading corpora...")
	annotatedCorpus = kindred.load('standoff',options.dataToBuildModel)
	targetCorpus = kindred.load('standoff',options.dataToApplyModel)

	print("Building classifier...")
	model = kindred.RelationClassifier()
	model.train(annotatedCorpus)

	print("Applying classifier...")
	model.predict(targetCorpus)

	# Create the output directory (including parents) when it is not there yet
	outputDir = options.outDir
	outputDirExists = os.path.isdir(outputDir)
	if not outputDirExists:
		os.makedirs(outputDir)

	print("Saving results to directory...")
	kindred.save(targetCorpus,'standoff',outputDir)

	# One tab-separated line per relation: text of its first and second entity
	print("\nPredicted relations:")
	for predicted in targetCorpus.getRelations():
		firstEntity, secondEntity = predicted.entities[0], predicted.entities[1]
		print("%s\t%s" % (firstEntity.text,secondEntity.text))
Exemplo n.º 16
0
            'Driver': 0.80,
            'Oncogene': 0.76,
            'Tumor_Suppressor': 0.92
        }
    else:
        thresholds = {'Driver': 0.5, 'Oncogene': 0.5, 'Tumor_Suppressor': 0.5}

    for relationType, outModel in zip(
        ['Driver', 'Oncogene', 'Tumor_Suppressor'], [
            args.outModel_Driver, args.outModel_Oncogene,
            args.outModel_TumorSuppressor
        ]):
        print("Building %s model" % relationType)
        print("  Loading training")
        goldDir = 'gold'
        trainCorpus = kindred.load('standoff', args.inTrain)

        for doc in trainCorpus.documents:
            doc.relations = [
                r for r in doc.relations if r.relationType == relationType
            ]

        print("  Doing training")
        features = "entityTypes,unigramsBetweenEntities,bigrams,dependencyPathEdges,dependencyPathEdgesNearEntities".split(
            ',')
        threshold = thresholds[relationType]
        classifier = kindred.RelationClassifier(
            classifierType='LogisticRegression',
            threshold=threshold,
            features=features,
            acceptedEntityTypes=[('cancer', 'gene')])
Exemplo n.º 17
0
    with open(args.variantStopwords) as f:
        variantStopwords = set(line.strip().lower() for line in f)

    stopwords = {'dopamine', 'insulin', 'caffeine', 'nicotine', 'choline'}
    print("Loading sentences...")
    corpus = kindred.Corpus()
    filenames = sorted(os.listdir(args.sentenceData))

    if args.fileCount:
        filenames = random.sample(filenames, args.fileCount)

    for i, filename in enumerate(filenames):
        print(i, filename)

        tmpCorpus = kindred.load('biocxml',
                                 os.path.join(args.sentenceData, filename))
        if args.filterTerms:
            tmpCorpus.documents = [
                doc for doc in tmpCorpus.documents
                if any(filterTerm in doc.text.lower()
                       for filterTerm in filterTerms)
            ]
        tmpCorpus.documents = [
            doc for doc in tmpCorpus.documents
            if not any(stopword in doc.text.lower() for stopword in stopwords)
        ]

        filtered = []
        for doc in tmpCorpus.documents:
            if args.mode == 'star_allele':
                doc.entities = [