def findFusions(biocFile, genesFile, wordlistPickle, outFile):
    """Scan a BioC XML corpus for two-gene fusion mentions and write them to a TSV file.

    Each output row holds: pmid, the mention text, both gene IDs and both
    mapped gene names. A fusion gene ID that cannot be mapped to a name via
    genesFile aborts the run (assertion).
    """
    print("%s : start" % now())

    # Term lookup table consumed by the entity recognizer
    with open(wordlistPickle, 'rb') as f:
        termLookup = pickle.load(f)

    # Map HUGO gene IDs to their primary gene names
    hugoIDToName = {}
    with open(genesFile) as f:
        for geneLine in f:
            hugoID, name, _synonyms, _entrezID = geneLine.strip('\n').split('\t')
            hugoIDToName[hugoID] = name

    print("%s : processing..." % now())

    parser = kindred.Parser(model='en_core_sci_sm')
    ner = kindred.EntityRecognizer(lookup=termLookup,
                                   detectFusionGenes=True,
                                   detectMicroRNA=True,
                                   acronymDetectionForAmbiguity=True,
                                   mergeTerms=True,
                                   detectVariants=True)

    with open(outFile, 'w') as outF:
        for _corpusno, corpus in enumerate(kindred.iterLoad('biocxml', biocFile)):
            parser.parse(corpus)
            ner.annotate(corpus)

            for doc in corpus.documents:
                pmid = doc.metadata['pmid'] if 'pmid' in doc.metadata else ''

                for entity in doc.entities:
                    # Fusion genes carry a combined external ID: combo|ID1|ID2
                    isFusion = entity.entityType == 'gene' and entity.externalID.startswith('combo|')
                    if not isFusion:
                        continue

                    geneIDs = entity.externalID.split('|')[1:]
                    if len(geneIDs) != 2:  # only keep simple two-gene fusions
                        continue
                    if any('&' in geneID for geneID in geneIDs):  # skip ambiguous IDs
                        continue

                    for geneID in geneIDs:
                        assert geneID in hugoIDToName, 'Unable to find HUGO gene name for ID: %s (text=%s)' % (geneID, entity.text)
                    geneNames = [hugoIDToName[geneID] for geneID in geneIDs]
                    assert len(geneNames) == 2

                    row = [pmid, entity.text] + geneIDs + geneNames
                    outF.write("\t".join(row) + "\n")
def test_iterLoadBiocFile():
    """Check that iterLoad splits a saved BioC XML file into several small
    corpora that together contain every document, with entities and relations
    round-tripped intact."""
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    docsToCreate = 100
    with TempDir() as tempDir:
        # Duplicate the single parsed document to build a larger corpus
        singleDoc = corpus.documents[0]
        corpus.documents = [singleDoc] * docsToCreate

        savedPath = os.path.join(tempDir, 'corpus.bioc.xml')
        kindred.save(corpus, 'biocxml', savedPath)

        totalDocCount = 0
        for corpus in kindred.iterLoad('biocxml', savedPath, corpusSizeCutoff=3):
            assert isinstance(corpus, kindred.Corpus)
            assert len(corpus.documents) <= 25
            totalDocCount += len(corpus.documents)

            for doc in corpus.documents:
                assert isinstance(doc, kindred.Document)
                entities = doc.entities
                relations = doc.relations

                bySourceID = {entity.sourceEntityID: entity for entity in entities}

                assertEntity(entities[0],
                             expectedType='disease',
                             expectedText='colorectal cancer',
                             expectedPos=[(4, 21)],
                             expectedSourceEntityID="T1")
                assertEntity(entities[1],
                             expectedType='gene',
                             expectedText='APC',
                             expectedPos=[(49, 52)],
                             expectedSourceEntityID="T2")

                expectedRelation = kindred.Relation('causes', [bySourceID["T1"], bySourceID["T2"]], ['obj', 'subj'])
                assert relations == [expectedRelation], "(%s) not as expected" % relations

        assert totalDocCount == docsToCreate
type=str, help= 'Terms used to filter sentences to enrich for pharmacogenomics relations' ) parser.add_argument('--outBioc', required=True, type=str, help='Output BioC file with identified sentences') args = parser.parse_args() sentenceCorpus = kindred.Corpus() with open(args.filterTermsFile, 'r') as f: filterTerms = [line.strip().lower() for line in f] for corpus in kindred.iterLoad('biocxml', args.inBioc): corpus = filterCorpus(corpus, filterTerms) annotateStarAlleles(corpus) corpusEntityTypes = [ set(e.entityType for e in doc.entities) for doc in corpus.documents ] corpus.documents = [ doc for doc, types in zip(corpus.documents, corpusEntityTypes) if "Mutation" in types and "Chemical" in types ] parser = kindred.Parser(model='en_core_sci_sm') parser.parse(corpus)
def parseAndFindEntities(biocFile, filterTermsFile, wordlistPickle, variantStopwordsFile, outSentencesFilename):
    """Parse a BioC XML corpus, run entity recognition, and save (as JSON)
    every sentence that contains a filter term plus a cancer, gene and
    variant entity.

    Args:
        biocFile: Input BioC XML file (iterated corpus-by-corpus).
        filterTermsFile: Text file with one (case-insensitive) filter term per line.
        wordlistPickle: Pickled term-lookup table for the entity recognizer.
        variantStopwordsFile: Text file with variant stopwords, one per line.
        outSentencesFilename: Output JSON file of sentence records (document
            metadata plus a 'sentence' key).
    """
    print("%s : start" % now())

    with open(wordlistPickle, 'rb') as f:
        termLookup = pickle.load(f)
    with open(filterTermsFile, 'r') as f:
        filterTerms = [line.strip().lower() for line in f]
    with open(variantStopwordsFile) as f:
        variantStopwords = [line.strip() for line in f]

    timers = Counter()  # accumulated wall-clock seconds per processing stage
    outSentences = []
    currentID = None
    duplicateCheck = set()

    print("%s : processing..." % now())

    parser = kindred.Parser(model='en_core_sci_sm')
    ner = kindred.EntityRecognizer(lookup=termLookup,
                                   detectFusionGenes=True,
                                   detectMicroRNA=True,
                                   acronymDetectionForAmbiguity=True,
                                   mergeTerms=True,
                                   detectVariants=True,
                                   variantStopwords=variantStopwords)

    # Unused enumerate index removed: iterate the corpora directly.
    for corpus in kindred.iterLoad('biocxml', biocFile):
        startTime = time.time()
        corpus = filterCorpus(corpus, filterTerms)
        timers['filter'] += time.time() - startTime

        startTime = time.time()
        parser.parse(corpus)
        timers['parser'] += time.time() - startTime
        print("%s : parsed" % now())

        startTime = time.time()
        ner.annotate(corpus)
        timers['ner'] += time.time() - startTime
        print("%s : ner" % now())

        startTime = time.time()
        for doc in corpus.documents:
            # Reset the duplicate check set for each new document ID.
            # NOTE(review): assumes documents sharing an ID are adjacent in
            # the corpus and that 'id' is always present — confirm upstream.
            if doc.metadata['id'] != currentID:
                currentID = doc.metadata['id']
                duplicateCheck = set()

            for sentence in doc.sentences:
                sentenceTextLower = sentence.text.lower()
                if not any(ft in sentenceTextLower for ft in filterTerms):
                    continue

                entityTypesInSentence = {entity.entityType for entity, tokenIndices in sentence.entityAnnotations}

                # Keep only sentences mentioning a cancer, a gene and a variant
                if {'cancer', 'gene', 'variant'} <= entityTypesInSentence:
                    # Trim surrounding whitespace/commas before de-duplicating
                    sentenceText = sentence.text.strip(string.whitespace + ',')
                    if sentenceText not in duplicateCheck:
                        tmpData = dict(doc.metadata)
                        tmpData['sentence'] = sentenceText
                        outSentences.append(tmpData)
                        duplicateCheck.add(sentenceText)

        timers['entitiesAdded'] += time.time() - startTime
        print("%s : entities added" % now())
        sys.stdout.flush()

    with open(outSentencesFilename, 'w') as f:
        json.dump(outSentences, f, indent=2)

    print("%s : done" % now())
    for section, sectiontime in timers.items():
        print("%s\t%f" % (section, sectiontime))
    print("%s\t%f" % ("parseAndFindEntities total", sum(timers.values())))
help='Input directory of BioC xml files') parser.add_argument('--genes', required=True, type=str, help='Gene file') parser.add_argument('--outFile', required=True, type=str, help='Output file') args = parser.parse_args() geneList = set() with open(args.genes) as f: for line in f: hugo_id, single, synonyms, entrez_id = line.strip('\n').split('\t') geneList.add(single.lower()) with open(args.outFile, 'w') as outF: for corpus in kindred.iterLoad('biocxml', args.inFile): corpus.documents = [ doc for doc in corpus.documents if 'rs' in doc.text and '*' in doc.text ] for doc in corpus.documents: for sentence in doc.text.split('.'): genes = [] matches = re.finditer('(?P<gene>\w+)\s*\*\s*(?P<star>\w+)', sentence) for match in matches: startPos, endPos = match.span('gene') if match.group('gene').lower() in geneList: