예제 #1
0
def test_entityrecognizer_fusion_OFF():
    """With fusion detection left off, a hyphenated gene pair yields two separate gene entities."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('EGFR-ERBB2 is not a real fusion gene')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 2
    first, second = document.entities

    assert first.entityType == 'gene'
    assert first.externalID == 'HGNC:3236'
    assert first.text == 'EGFR'
    assert first.position == [(0, 4)]
    assert first.sourceEntityID == 'T1'

    assert second.entityType == 'gene'
    assert second.externalID == 'HGNC:2064'
    assert second.text == 'ERBB2'
    assert second.position == [(5, 10)]
    assert second.sourceEntityID == 'T2'
예제 #2
0
def test_entityrecognizer_twoSentences():
    """Entities are recognised in both sentences of a two-sentence document."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('EGFR is one gene. ERBB2 is another gene.')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 2

    observed = [(e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
                for e in document.entities]
    assert observed == [
        ('gene', 'HGNC:3236', 'EGFR', [(0, 4)], 'T1'),
        ('gene', 'HGNC:2064', 'ERBB2', [(18, 23)], 'T2'),
    ]
예제 #3
0
def test_entityrecognizer_fusion_3():
    """A gene hyphenated with a cancer term is not a fusion; both halves are tagged separately."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('EGFR-lymphoma is not anything.')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, detectFusionGenes=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 2
    gene_entity, cancer_entity = document.entities

    assert gene_entity.entityType == 'gene'
    assert gene_entity.externalID == 'HGNC:3236'
    assert gene_entity.text == 'EGFR'
    assert gene_entity.position == [(0, 4)]
    assert gene_entity.sourceEntityID == 'T1'

    assert cancer_entity.entityType == 'cancer'
    assert cancer_entity.externalID == 'DOID:0060058'
    assert cancer_entity.text == 'lymphoma'
    assert cancer_entity.position == [(5, 13)]
    assert cancer_entity.sourceEntityID == 'T2'
예제 #4
0
def test_entityrecognizer_fusion_1():
    """With detectFusionGenes, hyphenated gene pairs become single combo entities."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('EGFR-ERBB2 is not a real fusion gene, but FGFR3-TACC3 is.')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, detectFusionGenes=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 2

    observed = [(e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
                for e in document.entities]
    assert observed == [
        ('gene', 'combo|HGNC:3236|HGNC:2064', 'EGFR-ERBB2', [(0, 10)], 'T1'),
        ('gene', 'combo|HGNC:3690|HGNC:11524', 'FGFR3-TACC3', [(42, 53)], 'T2'),
    ]
예제 #5
0
def test_entityrecognizer_basic():
    """A single known gene term is annotated with its ID, span, and sentence annotation."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('EGFR is a gene associated with lung cancer')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 1
    gene = document.entities[0]

    assert gene.entityType == 'gene'
    assert gene.externalID == 'HGNC:3236'
    assert gene.text == 'EGFR'
    assert gene.position == [(0, 4)]
    assert gene.sourceEntityID == 'T1'

    # The entity should be linked to token 0 of the single sentence.
    assert len(document.sentences) == 1
    assert document.sentences[0].entityAnnotations == [(gene, [0])]
예제 #6
0
def test_entityrecognizer_merge_brackets_OFF():
    """Without mergeTerms, a long form and its bracketed acronym stay separate entities."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('This paper studies non-small cell lung carcinoma (NSCLC).')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 2
    longform, acronym = document.entities

    assert longform.entityType == 'cancer'
    assert longform.externalID == 'DOID:3908'
    assert longform.text == 'non-small cell lung carcinoma'
    assert longform.position == [(19, 48)]
    assert longform.sourceEntityID == 'T1'

    assert acronym.entityType == 'cancer'
    assert acronym.externalID == 'DOID:3908'
    assert acronym.text == 'NSCLC'
    assert acronym.position == [(50, 55)]
    assert acronym.sourceEntityID == 'T2'
예제 #7
0
def test_entityrecognizer_acronyms_OFF():
    """Without acronym disambiguation, the acronym keeps its own lookup ID (a gene here)."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('The Never Ending Umbrella (NEU) is a true classic.')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 2
    longform, acronym = document.entities

    assert longform.entityType == 'movie'
    assert longform.externalID == 'IMDB:9999'
    assert longform.text == 'Never Ending Umbrella'
    assert longform.position == [(4, 25)]
    assert longform.sourceEntityID == 'T1'

    assert acronym.entityType == 'gene'
    assert acronym.externalID == 'HGNC:2064'
    assert acronym.text == 'NEU'
    assert acronym.position == [(27, 30)]
    assert acronym.sourceEntityID == 'T2'
예제 #8
0
def test_entityrecognizer_merge_negativecase():
    """Adjacent terms with different IDs are not merged even with mergeTerms=True."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('EGFR ERBB2 is not anything.')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, mergeTerms=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 2

    observed = [(e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
                for e in document.entities]
    assert observed == [
        ('gene', 'HGNC:3236', 'EGFR', [(0, 4)], 'T1'),
        ('gene', 'HGNC:2064', 'ERBB2', [(5, 10)], 'T2'),
    ]
예제 #9
0
def test_entityrecognizer_removepathways_off():
    """With removePathways=False, 'EGFR signalling' still yields the gene entity."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('EGFR signalling is involved in lung cancer')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, removePathways=False).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 1
    gene = document.entities[0]

    assert gene.entityType == 'gene'
    assert gene.externalID == 'HGNC:3236'
    assert gene.text == 'EGFR'
    assert gene.position == [(0, 4)]
    assert gene.sourceEntityID == 'T1'

    # The entity should be linked to token 0 of the single sentence.
    assert len(document.sentences) == 1
    assert document.sentences[0].entityAnnotations == [(gene, [0])]
예제 #10
0
def test_entityrecognizer_merge_idintersections():
    """Adjacent terms are merged into groups whose lookup IDs intersect."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('We studied the genes known as GLP-1R GLP1R GLP1 GLP-1.')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, mergeTerms=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]

    assert len(document.sentences) == 1
    assert len(document.entities) == 2

    observed = [(e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
                for e in document.entities]
    assert observed == [
        ('gene', 'HGNC:4324', 'GLP-1R GLP1R', [(30, 42)], 'T1'),
        ('gene', 'HGNC:4191', 'GLP1 GLP-1', [(43, 53)], 'T2'),
    ]
예제 #11
0
def test_entityrecognizer_merge_triple_brackets():
    """Three adjacent synonyms, one bracketed, merge into a single entity."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('HER2 neu (ERBB2) is a gene.')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, mergeTerms=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]

    assert len(document.sentences) == 1
    assert len(document.entities) == 1
    merged = document.entities[0]

    assert merged.entityType == 'gene'
    assert merged.externalID == 'HGNC:2064'
    assert merged.text == 'HER2 neu (ERBB2)'
    assert merged.position == [(0, 16)]
    assert merged.sourceEntityID == 'T1'
예제 #12
0
def findFusions(biocFile, genesFile, wordlistPickle, outFile):
    """Scan a BioC XML corpus for two-gene fusion mentions and write them as TSV rows.

    Each output row is: pmid, mention text, the two HUGO gene IDs, the two gene names.

    :param biocFile: BioC XML file, streamed corpus-by-corpus via kindred.iterLoad
    :param genesFile: tab-delimited gene list (HUGO ID, name, synonyms, Entrez ID)
    :param wordlistPickle: pickled term-lookup dict for kindred.EntityRecognizer
    :param outFile: path of the tab-delimited output file to write
    """
    print("%s : start" % now())

    # NOTE(review): pickle.load is unsafe on untrusted files -- assumed trusted here.
    with open(wordlistPickle, 'rb') as f:
        termLookup = pickle.load(f)

    # Map HUGO gene ID -> canonical gene name.
    hugo2Name = {}
    with open(genesFile) as f:
        for line in f:
            hugo_gene_id, gene_name, synoyms, entrez_gene_id = line.strip(
                '\n').split('\t')
            hugo2Name[hugo_gene_id] = gene_name

    print("%s : processing..." % now())
    parser = kindred.Parser(model='en_core_sci_sm')
    ner = kindred.EntityRecognizer(lookup=termLookup,
                                   detectFusionGenes=True,
                                   detectMicroRNA=True,
                                   acronymDetectionForAmbiguity=True,
                                   mergeTerms=True,
                                   detectVariants=True)
    with open(outFile, 'w') as outF:
        for corpusno, corpus in enumerate(kindred.iterLoad(
                'biocxml', biocFile)):
            parser.parse(corpus)
            ner.annotate(corpus)

            for doc in corpus.documents:
                pmid = ''
                if 'pmid' in doc.metadata:
                    pmid = doc.metadata['pmid']
                #fusions = [ e for  e in doc.entities if e.entityType == 'Gene' ]
                for e in doc.entities:
                    # Fusion mentions carry external IDs of the form 'combo|<ID>|<ID>...'.
                    if e.entityType == 'gene' and e.externalID.startswith(
                            'combo|'):
                        gene_ids = e.externalID.split('|')[1:]

                        # Keep only two-partner fusions.
                        if len(gene_ids) != 2:
                            continue

                        # Skip '&'-joined IDs -- presumably ambiguous gene candidates; verify.
                        if any('&' in gene_id for gene_id in gene_ids):
                            continue

                        for gene_id in gene_ids:
                            assert gene_id in hugo2Name, 'Unable to find HUGO gene name for ID: %s (text=%s)' % (
                                gene_id, e.text)

                        gene_names = [
                            hugo2Name[gene_id] for gene_id in gene_ids
                        ]

                        assert len(gene_names) == 2

                        outData = [pmid, e.text] + gene_ids + gene_names
                        outF.write("\t".join(outData) + "\n")
예제 #13
0
def test_entityrecognizer_removepathways_4():
    """With removePathways=True, 'EGFR cascade' produces no entities at all."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('EGFR cascade is involved in lung cancer')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, removePathways=True).annotate(doc_corpus)

    assert len(doc_corpus.documents[0].entities) == 0
예제 #14
0
def test_entityrecognizer_variant_stopwords():
	"""Variants listed in variantStopwords are skipped by variant detection."""
	doc_corpus = kindred.Corpus('The V600E variant is well studied.')

	kindred.Parser().parse(doc_corpus)

	recognizer = kindred.EntityRecognizer({}, detectVariants=True, variantStopwords=['V600E'])
	recognizer.annotate(doc_corpus)

	assert len(doc_corpus.documents[0].entities) == 0
예제 #15
0
def test_entityrecognizer_microRNA_mirOFF():
    """Without detectMicroRNA, a 'mir-' mention is not annotated."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('mir-83 is a gene associated with lung cancer')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup).annotate(doc_corpus)

    assert len(doc_corpus.documents[0].entities) == 0
예제 #16
0
def test_entityrecognizer_fusion_4():
    """A gene hyphenated with an unknown word is not a fusion; only the gene half is kept."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('EGFR-banana is not anything.')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, detectFusionGenes=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 1
    gene = document.entities[0]

    assert gene.entityType == 'gene'
    assert gene.externalID == 'HGNC:3236'
    assert gene.text == 'EGFR'
    assert gene.position == [(0, 4)]
예제 #17
0
def test_entityrecognizer_merge_nobrackets():
    """Adjacent synonyms of the same gene merge into one entity without any brackets."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('HER2 neu is a gene.')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, mergeTerms=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 1
    merged = document.entities[0]

    assert merged.entityType == 'gene'
    assert merged.externalID == 'HGNC:2064'
    assert merged.text == 'HER2 neu'
    assert merged.position == [(0, 8)]
예제 #18
0
def test_entityrecognizer_merge_brackets_left():
    """A leading bracketed acronym merges with the long form that follows it."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('This paper studies (NSCLC) non-small cell lung carcinoma.')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, mergeTerms=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 1
    merged = document.entities[0]

    assert merged.entityType == 'cancer'
    assert merged.externalID == 'DOID:3908'
    assert merged.text == '(NSCLC) non-small cell lung carcinoma'
    assert merged.position == [(19, 56)]
예제 #19
0
def test_entityrecognizer_acronyms_acronymHasCorrectID_hyphen():
    """Acronym disambiguation keeps only the acronym entity for a hyphenated long form."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('Diffuse large b-cell lymphoma (DLBCL) is a challenging research topic.')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, acronymDetectionForAmbiguity=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 1
    acronym = document.entities[0]

    assert acronym.entityType == 'cancer'
    assert acronym.externalID == 'DOID:0050745'
    assert acronym.text == 'DLBCL'
    assert acronym.position == [(31, 36)]
예제 #20
0
def test_entityrecognizer_acronyms_bothHaveIDs_plural():
    """Acronym disambiguation handles a pluralised acronym; only the long form survives."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('The Never Ending Umbrellas (NEUs) are true classics.')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, acronymDetectionForAmbiguity=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 1
    longform = document.entities[0]

    assert longform.entityType == 'movie'
    assert longform.externalID == 'IMDB:9999'
    assert longform.text == 'Never Ending Umbrellas'
    assert longform.position == [(4, 26)]
예제 #21
0
def test_entityrecognizer_microRNA_mir1():
    """With detectMicroRNA, a 'mir-' mention becomes a gene entity with a mirna ID."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('mir-83 is a gene associated with lung cancer')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, detectMicroRNA=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 1
    mirna = document.entities[0]

    assert mirna.entityType == 'gene'
    assert mirna.externalID == 'mirna|mir-83'
    assert mirna.text == 'mir-83'
    assert mirna.position == [(0, 6)]
예제 #22
0
def test_entityrecognizer_polymorphism():
    """With detectPolymorphisms, an rs-number is annotated as a dbSNP variant."""
    doc_corpus = kindred.Corpus('The rs12345 variant is well studied.')

    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer({}, detectPolymorphisms=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 1
    snp = document.entities[0]

    assert snp.entityType == 'variant'
    assert snp.externalID == 'dbsnp|rs12345'
    assert snp.text == 'rs12345'
    assert snp.position == [(4, 11)]
    assert snp.sourceEntityID == 'T1'
예제 #23
0
def test_entityrecognizer_variant_2():
    """A 'p.' protein-level substitution is normalised to its single-letter ID form."""
    doc_corpus = kindred.Corpus('The BRAF p.Val600Glu variant is well studied.')

    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer({}, detectVariants=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 1
    variant = document.entities[0]

    assert variant.entityType == 'variant'
    assert variant.externalID == 'substitution|V600E'
    assert variant.text == 'Val600Glu'
    assert variant.position == [(11, 20)]
    assert variant.sourceEntityID == 'T1'
예제 #24
0
def test_entityrecognizer_variant_1():
    """A bare substitution like 'V600E' is detected when detectVariants is enabled."""
    lookup = {}

    text = 'The V600E variant is well studied.'

    corpus = kindred.Corpus(text)

    parser = kindred.Parser()
    parser.parse(corpus)

    ner = kindred.EntityRecognizer(lookup, detectVariants=True)
    ner.annotate(corpus)

    # Removed leftover debug print of doc.entities that cluttered test output.
    doc = corpus.documents[0]
    assert len(doc.entities) == 1
    entity = doc.entities[0]

    assert entity.entityType == 'omicevent'
    assert entity.externalID == 'substitution|V600E'
    assert entity.text == 'V600E'
    assert entity.position == [(4, 9)]
예제 #25
0
def test_entityrecognizer_fusion_2():
    """A hyphenated pair of synonyms for the same gene collapses to one entity."""
    term_lookup = makeTestLookup()

    doc_corpus = kindred.Corpus('HER2-neu is a gene.')
    kindred.Parser().parse(doc_corpus)
    kindred.EntityRecognizer(term_lookup, detectFusionGenes=True).annotate(doc_corpus)

    document = doc_corpus.documents[0]
    assert len(document.entities) == 1
    gene = document.entities[0]

    assert gene.entityType == 'gene'
    assert gene.externalID == 'HGNC:2064'
    assert gene.text == 'HER2-neu'
    assert gene.position == [(0, 8)]
    assert gene.sourceEntityID == 'T1'
예제 #26
0
        documents = [d for d in documents if d['doi'] == testMode]
        assert len(documents) == 1
        text = documents[0]['title'] + "\n" + documents[0]['abstract']

        print("Title: %s" % documents[0]['title'])
        print("Abstract: %s" % documents[0]['abstract'])

        corpus = kindred.Corpus(text=text)
        corpus.documents[0].metadata = {"title": documents[0]['title']}
        parser = kindred.Parser(model='en_core_sci_sm')
        parser.parse(corpus)

    print("Annotating corpus...")
    sys.stdout.flush()
    corpus.removeEntities()
    ner = kindred.EntityRecognizer(termLookup, mergeTerms=True)
    ner.annotate(corpus)

    if testMode:
        doc = corpus.documents[0]
        for e in doc.entities:
            print("  %s" % str(e))
        #print(doc.entities)

    #assert False

    corpusMap = {}
    for kindred_doc in corpus.documents:
        corpusMap[kindred_doc.text] = kindred_doc

    for d in documents:
def cancermine(sentenceFile, modelFilenames, filterTerms, wordlistPickle,
               genes, cancerTypes, outData):
    """Annotate pre-split sentences with entities, predict relations, and write TSV rows.

    Pipeline: load pickled relation models -> build ID->name maps for genes and
    cancer types -> parse and NER-annotate the sentences -> run each model's
    predict() over the corpus -> write one tab-separated row per kept relation.

    :param sentenceFile: JSON list of records, each with a 'sentence' key plus metadata
    :param modelFilenames: list of pickled kindred relation-classifier files
    :param filterTerms: file of lowercase keywords; relations whose sentence has none are skipped
    :param wordlistPickle: pickled term-lookup dict for kindred.EntityRecognizer
    :param genes: tab-delimited gene list (HUGO ID, name, synonyms, Entrez ID)
    :param cancerTypes: tab-delimited cancer-type list (ID, name, ...)
    :param outData: output TSV path (truncated first, then appended to)
    """
    print("%s : start" % now())

    # NOTE(review): pickle.load is unsafe on untrusted files -- assumed trusted here.
    models = {}
    assert isinstance(modelFilenames, list)
    for modelFilename in modelFilenames:
        with open(modelFilename, 'rb') as f:
            models[modelFilename] = pickle.load(f)

    # ID -> human-readable term; HUGO -> Entrez mapping defaults to 'NA' when unknown.
    IDToTerm = {}
    Hugo2Entrez = defaultdict(lambda: 'NA')
    with codecs.open(genes, 'r', 'utf-8') as f:
        for line in f:
            gene_hugo_id, singleterm, _, gene_entrez_id = line.strip().split(
                '\t')
            IDToTerm[gene_hugo_id] = singleterm
            Hugo2Entrez[gene_hugo_id] = gene_entrez_id

    with codecs.open(cancerTypes, 'r', 'utf-8') as f:
        for line in f:
            cancerid, singleterm, _ = line.strip().split('\t')
            IDToTerm[cancerid] = singleterm

    # Rebinds the 'filterTerms' parameter from a file path to a list of terms.
    with codecs.open(filterTerms, 'r', 'utf-8') as f:
        filterTerms = [line.strip().lower() for line in f]

    with open(wordlistPickle, 'rb') as f:
        termLookup = pickle.load(f)

    # Truncate the output file
    with codecs.open(outData, 'w', 'utf-8') as outF:
        pass

    timers = Counter()  # wall-clock seconds accumulated per pipeline stage

    print("%s : loading..." % now())
    with open(sentenceFile) as f:
        sentenceData = json.load(f)

    # One kindred Document per sentence; everything but the text becomes metadata.
    corpus = kindred.Corpus()
    for sentence in sentenceData:
        metadata = dict(sentence)
        del metadata["sentence"]
        doc = kindred.Document(sentence["sentence"], metadata=metadata)
        corpus.addDocument(doc)

    print("%s : loaded..." % now())
    startTime = time.time()
    parser = kindred.Parser()
    parser.parse(corpus)
    timers['parser'] += time.time() - startTime
    print("%s : parsed" % now())

    startTime = time.time()
    ner = kindred.EntityRecognizer(lookup=termLookup,
                                   detectFusionGenes=False,
                                   detectMicroRNA=False,
                                   acronymDetectionForAmbiguity=True,
                                   mergeTerms=True,
                                   removePathways=True)
    ner.annotate(corpus)
    timers['ner'] += time.time() - startTime
    print("%s : ner" % now())

    with codecs.open(outData, 'a', 'utf-8') as outF:
        startTime = time.time()
        for modelname, model in models.items():
            model.predict(corpus)
        timers['predicted'] += time.time() - startTime

        print("%s : predicted" % now())

        startTime = time.time()

        for doc in corpus.documents:
            if len(doc.relations) == 0:
                continue

            # Each entity is expected to belong to exactly one sentence.
            entity_to_sentence = {}
            for sentence in doc.sentences:
                for entity, tokenIndices in sentence.entityAnnotations:
                    assert not entity in entity_to_sentence
                    entity_to_sentence[entity] = sentence

            for relation in doc.relations:
                # Locate the sentence via the relation's first entity.
                sentence = entity_to_sentence[relation.entities[0]]
                sentenceTextLower = sentence.text.lower()

                # Keep only relations whose sentence contains a filter keyword.
                hasFilterTerm = any(filterTerm in sentenceTextLower
                                    for filterTerm in filterTerms)
                if not hasFilterTerm:
                    continue
                #words = [ t.word for t in sentence.tokens ]
                #text = " ".join(words)

                sentenceStart = sentence.tokens[0].startPos

                relType = relation.relationType
                entityData = []
                for entity in relation.entities:
                    entityData.append(entity.externalID)
                    if entity.entityType == 'gene':
                        entityData.append(Hugo2Entrez[entity.externalID])

                    entityData.append(entity.text)

                    # 'combo|' IDs hold multiple sub-IDs; normalize each and re-join with '|'.
                    if entity.externalID.startswith('combo'):
                        externalIDsplit = entity.externalID.split('|')
                        normalizedTerms = [
                            getNormalizedTerm("", st.replace('&', ';'),
                                              IDToTerm)
                            for st in externalIDsplit[1:]
                        ]
                        normalizedTerm = "|".join(normalizedTerms)
                    elif entity.externalID.startswith('mirna|'):
                        normalizedTerm = normalizeMIRName(entity.externalID)
                    else:
                        normalizedTerm = getNormalizedTerm(
                            entity.text, entity.externalID, IDToTerm)

                    entityData.append(normalizedTerm)

                    assert len(
                        entity.position
                    ) == 1, "Expecting entities that are contigious and have only one start and end position within the text"
                    # Convert document offsets to sentence-relative offsets.
                    startPos, endPos = entity.position[0]
                    entityData.append(startPos - sentenceStart)
                    entityData.append(endPos - sentenceStart)

                if doc.metadata["pmid"]:
                    m = doc.metadata
                    if not 'subsection' in m:
                        m['subsection'] = None

                    prob = relation.probability
                    outData = [
                        m['pmid'], m['title'], m["journal"], m["year"],
                        m["month"], m["day"], m['section'], m['subsection'],
                        relType, prob
                    ] + entityData + [sentence.text]
                    if applyFinalFilter(outData):
                        outLine = "\t".join(map(str, outData))
                        outF.write(outLine + "\n")

        timers['output'] += time.time() - startTime

        print("%s : output" % now())

    sys.stdout.flush()

    print("%s : done" % now())

    for section, sectiontime in timers.items():
        print("%s\t%f" % (section, sectiontime))
예제 #28
0
    for wordlist in args.wordlists.split(','):
        assert os.path.isfile(wordlist), 'Unable to access file: %s' % wordlist
        entityType = os.path.splitext(os.path.basename(wordlist))[0]
        wordlistDict[entityType] = wordlist
        print("  %s - %s" % (entityType, wordlist))

    assert len(
        wordlistDict
    ) == 2, "This annotation tool currently only handles two entity relations of different types"

    wordlistLookup = kindred.EntityRecognizer.loadWordlists(wordlistDict,
                                                            idColumn=0,
                                                            termsColumn=0)

    print("Annotating entities in corpus with wordlists")
    entityRecognizer = kindred.EntityRecognizer(wordlistLookup)
    entityRecognizer.annotate(sentenceCorpus)

    print("Finding all candidate relations")
    acceptedEntityTypes = wordlistDict
    candidateBuilder = kindred.CandidateBuilder(
        entityCount=len(wordlistDict),
        acceptedEntityTypes=[tuple(sorted(wordlistDict.keys()))])
    candidateRelations = candidateBuilder.build(sentenceCorpus)

    print(
        "Time to through some of the candidate relations and annotate some...")
    annotatedCorpus, unannotatedCorpus = kindred.manuallyAnnotate(
        sentenceCorpus, candidateRelations)

    print(
예제 #29
0
def parseAndFindEntities(biocFile, filterTermsFile, wordlistPickle,
                         variantStopwordsFile, outSentencesFilename):
    """Collect sentences that mention a cancer, a gene, AND a variant into a JSON file.

    Streams a BioC XML corpus, filters it by keyword, runs the kindred NER, and
    writes the metadata+text of each qualifying, de-duplicated sentence.

    :param biocFile: BioC XML file, streamed corpus-by-corpus via kindred.iterLoad
    :param filterTermsFile: file of keywords; sentences without any are skipped
    :param wordlistPickle: pickled term-lookup dict for kindred.EntityRecognizer
    :param variantStopwordsFile: file of variant strings to ignore during detection
    :param outSentencesFilename: path for the JSON list of sentence records
    """
    print("%s : start" % now())

    # NOTE(review): pickle.load is unsafe on untrusted files -- assumed trusted here.
    with open(wordlistPickle, 'rb') as f:
        termLookup = pickle.load(f)

    with open(filterTermsFile, 'r') as f:
        filterTerms = [line.strip().lower() for line in f]

    with open(variantStopwordsFile) as f:
        variantStopwords = [line.strip() for line in f]

    timers = Counter()  # wall-clock seconds accumulated per pipeline stage

    outSentences = []

    # Used to reset sentence de-duplication whenever the document ID changes.
    currentID = None
    duplicateCheck = set()

    print("%s : processing..." % now())
    parser = kindred.Parser(model='en_core_sci_sm')
    ner = kindred.EntityRecognizer(lookup=termLookup,
                                   detectFusionGenes=True,
                                   detectMicroRNA=True,
                                   acronymDetectionForAmbiguity=True,
                                   mergeTerms=True,
                                   detectVariants=True,
                                   variantStopwords=variantStopwords)
    for corpusno, corpus in enumerate(kindred.iterLoad('biocxml', biocFile)):
        startTime = time.time()
        corpus = filterCorpus(corpus, filterTerms)
        timers['filter'] += time.time() - startTime

        startTime = time.time()
        parser.parse(corpus)
        timers['parser'] += time.time() - startTime
        print("%s : parsed" % now())

        startTime = time.time()
        ner.annotate(corpus)
        timers['ner'] += time.time() - startTime
        print("%s : ner" % now())

        startTime = time.time()

        for doc in corpus.documents:

            # Reset the duplicate check set for each new PMID
            if doc.metadata['id'] != currentID:
                currentID = doc.metadata['id']
                duplicateCheck = set()

            for sentence in doc.sentences:
                # Sentence must contain at least one of the filter keywords.
                sentenceTextLower = sentence.text.lower()
                containsFilterTerm = any(ft in sentenceTextLower
                                         for ft in filterTerms)
                if not containsFilterTerm:
                    continue

                entityTypesInSentence = set([
                    entity.entityType
                    for entity, tokenIndices in sentence.entityAnnotations
                ])
                foundCancer = 'cancer' in entityTypesInSentence
                foundGene = 'gene' in entityTypesInSentence
                foundVariant = 'variant' in entityTypesInSentence

                # Keep only sentences mentioning all three entity types.
                if foundCancer and foundGene and foundVariant:
                    sentenceText = sentence.text.strip(string.whitespace + ',')

                    if not sentenceText in duplicateCheck:
                        tmpData = dict(doc.metadata)
                        tmpData['sentence'] = sentenceText
                        outSentences.append(tmpData)
                        duplicateCheck.add(sentenceText)

        timers['entitiesAdded'] += time.time() - startTime

        print("%s : entities added" % now())
        sys.stdout.flush()

    with open(outSentencesFilename, 'w') as f:
        json.dump(outSentences, f, indent=2)

    print("%s : done" % now())

    for section, sectiontime in timers.items():
        print("%s\t%f" % (section, sectiontime))
    print("%s\t%f" % ("parseAndFindEntities total", sum(timers.values())))
예제 #30
0
def parseAndFindEntities(biocFile,wordlistPickle,outSentencesFilename):
	"""Collect tumour-antigen-related sentences mentioning keyword + gene/protein into JSON.

	Streams a BioC corpus, pre-filters by weak keyword, runs kindred NER, and keeps
	sentences that match the topic (strict phrase, or weak phrase plus a cancer
	entity) and contain both a 'keyword' entity and a gene or protein entity.

	:param biocFile: BioC file, streamed corpus-by-corpus via kindred.iterLoadDataFromBioc
	:param wordlistPickle: pickled term-lookup dict for kindred.EntityRecognizer
	:param outSentencesFilename: path for the JSON list of sentence records
	"""
	print("%s : start" % now())

	# NOTE(review): pickle.load is unsafe on untrusted files -- assumed trusted here.
	with open(wordlistPickle,'rb') as f:
		termLookup = pickle.load(f)

	#with open(filterTermsFile,'r') as f:
	#	filterTerms = [ line.strip().lower() for line in f ]
	# Strict phrases match the topic alone; the weak term needs a cancer entity too.
	strictFilterTerms = ['tumor antigen','tumour antigen','tumor-antigen','tumour-antigen']
	weakFilterTerms = ['antigen']

	timers = Counter()  # wall-clock seconds accumulated per pipeline stage

	outSentences = []

	# Used to reset sentence de-duplication whenever the document ID changes.
	currentID = None
	duplicateCheck = set()

	print("%s : processing..." % now())
	parser = kindred.Parser()
	ner = kindred.EntityRecognizer(lookup=termLookup,detectFusionGenes=True,detectMicroRNA=False,acronymDetectionForAmbiguity=True,mergeTerms=True)
	for corpusno,corpus in enumerate(kindred.iterLoadDataFromBioc(biocFile)):
		startTime = time.time()
		corpus = filterCorpus(corpus,weakFilterTerms)
		timers['filter'] += time.time() - startTime

		startTime = time.time()
		parser.parse(corpus)
		timers['parser'] += time.time() - startTime
		print("%s : parsed" % now())

		startTime = time.time()
		ner.annotate(corpus)
		timers['ner'] += time.time() - startTime
		print("%s : ner" % now())

		startTime = time.time()

		for doc in corpus.documents:

			# Reset the duplicate check set for each new PMID
			if doc.metadata['id'] != currentID:
				currentID = doc.metadata['id']
				duplicateCheck = set()

			for sentence in doc.sentences:
				sentenceTextLower = sentence.text.lower()



				#print(sentence.text)
				#print(sentence.entitiesWithLocations)
				entityTypesInSentence = set([ entity.entityType for entity,tokenIndices in sentence.entityAnnotations ])
				gotKeyword = 'keyword' in entityTypesInSentence
				gotGene = 'gene' in entityTypesInSentence
				gotProtein = 'protein' in entityTypesInSentence
				gotCancer = 'cancer' in entityTypesInSentence

				containsStrictTerm = any( ft in sentenceTextLower for ft in strictFilterTerms )
				containsWeakTerm = any( ft in sentenceTextLower for ft in weakFilterTerms )

				# Topic matches on a strict phrase, or on the weak term backed by a cancer entity.
				topicMatch = containsStrictTerm or (containsWeakTerm and gotCancer)

				if topicMatch and gotKeyword and (gotGene or gotProtein):
					sentenceText = sentence.text.strip(string.whitespace + ',')

					if not sentenceText in duplicateCheck:
						tmpData = dict(doc.metadata)
						tmpData['sentence'] = sentenceText
						outSentences.append(tmpData)
						duplicateCheck.add(sentenceText)

		timers['entitiesAdded'] += time.time() - startTime

		print("%s : entities added" % now())
		sys.stdout.flush()

	with open(outSentencesFilename,'w') as f:
		json.dump(outSentences,f,indent=2)

	print("%s : done" % now())
	
	for section,sectiontime in timers.items():
		print("%s\t%f" % (section,sectiontime))