Example #1
import pickle

import kindred


def findFusions(biocFile, genesFile, wordlistPickle, outFile):
    print("%s : start" % now())

    with open(wordlistPickle, 'rb') as f:
        termLookup = pickle.load(f)

    hugo2Name = {}
    with open(genesFile) as f:
        for line in f:
            hugo_gene_id, gene_name, synonyms, entrez_gene_id = line.strip(
                '\n').split('\t')
            hugo2Name[hugo_gene_id] = gene_name

    print("%s : processing..." % now())
    parser = kindred.Parser(model='en_core_sci_sm')
    ner = kindred.EntityRecognizer(lookup=termLookup,
                                   detectFusionGenes=True,
                                   detectMicroRNA=True,
                                   acronymDetectionForAmbiguity=True,
                                   mergeTerms=True,
                                   detectVariants=True)
    with open(outFile, 'w') as outF:
        for corpusno, corpus in enumerate(kindred.iterLoad(
                'biocxml', biocFile)):
            parser.parse(corpus)
            ner.annotate(corpus)

            for doc in corpus.documents:
                pmid = ''
                if 'pmid' in doc.metadata:
                    pmid = doc.metadata['pmid']
                # Keep gene entities whose external ID marks a fusion ('combo|ID1|ID2')
                for e in doc.entities:
                    if e.entityType == 'gene' and e.externalID.startswith(
                            'combo|'):
                        gene_ids = e.externalID.split('|')[1:]

                        if len(gene_ids) != 2:
                            continue

                        if any('&' in gene_id for gene_id in gene_ids):
                            continue

                        for gene_id in gene_ids:
                            assert gene_id in hugo2Name, 'Unable to find HUGO gene name for ID: %s (text=%s)' % (
                                gene_id, e.text)

                        gene_names = [
                            hugo2Name[gene_id] for gene_id in gene_ids
                        ]

                        assert len(gene_names) == 2

                        outData = [pmid, e.text] + gene_ids + gene_names
                        outF.write("\t".join(outData) + "\n")
Example #2
import os

import kindred


def test_iterLoadBiocFile():
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)
    docsToCreate = 100

    with TempDir() as tempDir:

        singleDoc = corpus.documents[0]
        corpus.documents = [singleDoc for _ in range(docsToCreate)]

        tempFile = os.path.join(tempDir, 'corpus.bioc.xml')
        kindred.save(corpus, 'biocxml', tempFile)

        totalDocCount = 0
        for corpus in kindred.iterLoad('biocxml', tempFile,
                                       corpusSizeCutoff=3):
            assert isinstance(corpus, kindred.Corpus)

            assert len(corpus.documents) <= 25
            totalDocCount += len(corpus.documents)

            for doc in corpus.documents:
                assert isinstance(doc, kindred.Document)
                entities = doc.entities
                relations = doc.relations

                sourceEntityIDsToEntity = {
                    entity.sourceEntityID: entity
                    for entity in entities
                }

                assertEntity(entities[0],
                             expectedType='disease',
                             expectedText='colorectal cancer',
                             expectedPos=[(4, 21)],
                             expectedSourceEntityID="T1")
                assertEntity(entities[1],
                             expectedType='gene',
                             expectedText='APC',
                             expectedPos=[(49, 52)],
                             expectedSourceEntityID="T2")
                assert relations == [
                    kindred.Relation('causes', [
                        sourceEntityIDsToEntity["T1"],
                        sourceEntityIDsToEntity["T2"]
                    ], ['obj', 'subj'])
                ], "(%s) not as expected" % relations

        assert totalDocCount == docsToCreate
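TempDir and assertEntity are helpers from the surrounding test utilities and are not shown in this listing. A minimal sketch of what assertEntity presumably checks, assuming kindred.Entity exposes entityType, text, position, and sourceEntityID attributes:

def assertEntity(entity, expectedType, expectedText, expectedPos,
                 expectedSourceEntityID):
    # Hypothetical reimplementation of the test-suite helper
    assert isinstance(entity, kindred.Entity)
    assert entity.entityType == expectedType
    assert entity.text == expectedText
    assert entity.position == expectedPos
    assert entity.sourceEntityID == expectedSourceEntityID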
Example #3
        type=str,
        help=
        'Terms used to filter sentences to enrich for pharmacogenomics relations'
    )
    parser.add_argument('--outBioc',
                        required=True,
                        type=str,
                        help='Output BioC file with identified sentences')
    args = parser.parse_args()

    sentenceCorpus = kindred.Corpus()

    with open(args.filterTermsFile, 'r') as f:
        filterTerms = [line.strip().lower() for line in f]

    # Create the parser once so the spaCy model is loaded only a single time
    parser = kindred.Parser(model='en_core_sci_sm')

    for corpus in kindred.iterLoad('biocxml', args.inBioc):
        corpus = filterCorpus(corpus, filterTerms)

        annotateStarAlleles(corpus)

        corpusEntityTypes = [
            set(e.entityType for e in doc.entities) for doc in corpus.documents
        ]

        corpus.documents = [
            doc for doc, types in zip(corpus.documents, corpusEntityTypes)
            if "Mutation" in types and "Chemical" in types
        ]

        parser.parse(corpus)
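filterCorpus and annotateStarAlleles are defined elsewhere in the original script. A plausible sketch of filterCorpus, assuming it keeps only the documents whose text mentions at least one filter term (the real implementation may differ):

def filterCorpus(corpus, filterTerms):
    # Hypothetical sketch: retain documents containing any filter term
    filtered = kindred.Corpus()
    for doc in corpus.documents:
        textLower = doc.text.lower()
        if any(ft in textLower for ft in filterTerms):
            filtered.addDocument(doc)
    return filtered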
Example #4
import json
import pickle
import string
import sys
import time
from collections import Counter

import kindred


def parseAndFindEntities(biocFile, filterTermsFile, wordlistPickle,
                         variantStopwordsFile, outSentencesFilename):
    print("%s : start" % now())

    with open(wordlistPickle, 'rb') as f:
        termLookup = pickle.load(f)

    with open(filterTermsFile, 'r') as f:
        filterTerms = [line.strip().lower() for line in f]

    with open(variantStopwordsFile) as f:
        variantStopwords = [line.strip() for line in f]

    timers = Counter()

    outSentences = []

    currentID = None
    duplicateCheck = set()

    print("%s : processing..." % now())
    parser = kindred.Parser(model='en_core_sci_sm')
    ner = kindred.EntityRecognizer(lookup=termLookup,
                                   detectFusionGenes=True,
                                   detectMicroRNA=True,
                                   acronymDetectionForAmbiguity=True,
                                   mergeTerms=True,
                                   detectVariants=True,
                                   variantStopwords=variantStopwords)
    for corpusno, corpus in enumerate(kindred.iterLoad('biocxml', biocFile)):
        startTime = time.time()
        corpus = filterCorpus(corpus, filterTerms)
        timers['filter'] += time.time() - startTime

        startTime = time.time()
        parser.parse(corpus)
        timers['parser'] += time.time() - startTime
        print("%s : parsed" % now())

        startTime = time.time()
        ner.annotate(corpus)
        timers['ner'] += time.time() - startTime
        print("%s : ner" % now())

        startTime = time.time()

        for doc in corpus.documents:

            # Reset the duplicate check set for each new PMID
            if doc.metadata['id'] != currentID:
                currentID = doc.metadata['id']
                duplicateCheck = set()

            for sentence in doc.sentences:
                sentenceTextLower = sentence.text.lower()
                containsFilterTerm = any(ft in sentenceTextLower
                                         for ft in filterTerms)
                if not containsFilterTerm:
                    continue

                entityTypesInSentence = {
                    entity.entityType
                    for entity, tokenIndices in sentence.entityAnnotations
                }
                foundCancer = 'cancer' in entityTypesInSentence
                foundGene = 'gene' in entityTypesInSentence
                foundVariant = 'variant' in entityTypesInSentence

                if foundCancer and foundGene and foundVariant:
                    sentenceText = sentence.text.strip(string.whitespace + ',')

                    if sentenceText not in duplicateCheck:
                        tmpData = dict(doc.metadata)
                        tmpData['sentence'] = sentenceText
                        outSentences.append(tmpData)
                        duplicateCheck.add(sentenceText)

        timers['entitiesAdded'] += time.time() - startTime

        print("%s : entities added" % now())
        sys.stdout.flush()

    with open(outSentencesFilename, 'w') as f:
        json.dump(outSentences, f, indent=2)

    print("%s : done" % now())

    for section, sectiontime in timers.items():
        print("%s\t%f" % (section, sectiontime))
    print("%s\t%f" % ("parseAndFindEntities total", sum(timers.values())))
Example #5
                        help='Input directory of BioC xml files')
    parser.add_argument('--genes', required=True, type=str, help='Gene file')
    parser.add_argument('--outFile',
                        required=True,
                        type=str,
                        help='Output file')
    args = parser.parse_args()

    geneList = set()
    with open(args.genes) as f:
        for line in f:
            hugo_id, single, synonyms, entrez_id = line.strip('\n').split('\t')
            geneList.add(single.lower())

    with open(args.outFile, 'w') as outF:
        for corpus in kindred.iterLoad('biocxml', args.inFile):

            corpus.documents = [
                doc for doc in corpus.documents
                if 'rs' in doc.text and '*' in doc.text
            ]

            for doc in corpus.documents:
                # Crude sentence split on periods (no full parsing needed here)
                for sentence in doc.text.split('.'):
                    genes = []
                    matches = re.finditer(
                        r'(?P<gene>\w+)\s*\*\s*(?P<star>\w+)', sentence)
                    for match in matches:
                        startPos, endPos = match.span('gene')

                        if match.group('gene').lower() in geneList:
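The listing is truncated at this point in the source. To illustrate what the star-allele pattern above matches, here is a standalone sketch with an invented example sentence:

import re

# Invented example sentence for illustration only
sentence = 'Poor metabolizers carrying CYP2D6*4 showed reduced drug clearance.'
for match in re.finditer(r'(?P<gene>\w+)\s*\*\s*(?P<star>\w+)', sentence):
    print(match.group('gene'), match.group('star'))  # prints: CYP2D6 4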