def findVarDisGeneDrug(pmid, text):
    """Yield rows for sentences that mention a gene, a disease, a drug and a
    groundable protein variant.

    The document is skipped entirely unless its lower-cased text contains one
    of the keywords " variant ", " mutation", " substitution" or " mutant ".
    Each sentence returned by pubNlp.sectionSentences() is then searched for
    genes, diseases and drugs; disease and drug ranges that overlap a gene
    range are discarded. Sentences lacking a gene, a remaining disease or a
    remaining drug are skipped, as are sentences without a protein-level
    ("prot") variant description.

    Arguments:
        pmid: document identifier, passed through to varFinder.groundVariant().
        text: the full document text.

    Yields:
        One list per grounded mutation of each protein variant:
        [section, genes, diseases, drugs, variantMentions, sentenceText],
        where the four middle fields are "|"-joined text snippets taken from
        the sentence.

    >>> startup({})
    >>> list(findVarDisGeneDrug(0, "The R71G BRCA1 is a breast cancer founder mutation not treatable with Herceptin"))
    """
    textLow = text.lower()

    # very basic filter, remove documents without some basic keywords
    keywords = (" variant ", " mutation", " substitution", " mutant ")
    if not any(keyword in textLow for keyword in keywords):
        return

    for section, _sentStart, _sentEnd, sentText in pubNlp.sectionSentences(text):
        genes = list(geneFinder.findGeneNames(sentText))
        if not genes:
            continue

        conds = list(pubNlp.findDiseases(sentText))
        drugs = list(pubNlp.findDrugs(sentText))

        # remove diseases and drugs that are also genes
        drugs = pubNlp.rangeRemoveOverlaps(drugs, genes)
        conds = pubNlp.rangeRemoveOverlaps(conds, genes)

        # check if we still have a disease and drug left
        if len(conds) == 0 or len(drugs) == 0:
            continue

        # debug output goes through logging (as in findDisGeneVariant), not
        # to stdout, so that callers consuming the rows are not disturbed
        logging.debug("drugs %s", drugs)
        logging.debug("diseases %s", conds)

        geneSnips = pubNlp.rangeTexts(sentText, genes)
        condSnips = pubNlp.rangeTexts(sentText, conds)
        drugSnips = pubNlp.rangeTexts(sentText, drugs)

        # gene positions are excluded from the variant search
        genePosSet = pubNlp.rangeToPosSet(genes)
        variants = varFinder.findVariantDescriptions(sentText, exclPos=genePosSet)

        # the last field of the genes rows is the entrez ID
        entrezIds = [geneRow[-1] for geneRow in genes]

        # we need a protein variant, not DNA
        if "prot" not in variants:
            continue

        # these fields are the same for every row of this sentence
        geneField = "|".join(geneSnips)
        condField = "|".join(condSnips)
        drugField = "|".join(drugSnips)

        for variant, mentions in variants["prot"]:
            logging.debug("grounding variant %s %s", variant, mentions)
            groundedMuts, _ungroundVar, _beds = varFinder.groundVariant(
                pmid, sentText, variant, mentions, [], entrezIds)

            # the variant snippets depend only on the mentions, not on the
            # individual grounded mutation
            coords = [(mention.start, mention.end) for mention in mentions]
            varField = "|".join(pubNlp.rangeTexts(sentText, coords))

            # one row per grounded mutation; a fresh list is built for each
            # yield so that callers may modify a row without affecting others
            for _mutInfo in groundedMuts:
                yield [section, geneField, condField, drugField, varField, sentText]
def findVarDisGeneDrug(pmid, text):
    """Yield rows for sentences that mention a gene, a disease, a drug and a
    groundable protein variant.

    The document is skipped entirely unless its lower-cased text contains one
    of the keywords " variant ", " mutation", " substitution" or " mutant ".
    Each sentence returned by pubNlp.sectionSentences() is then searched for
    genes, diseases and drugs; disease and drug ranges that overlap a gene
    range are discarded. Sentences lacking a gene, a remaining disease or a
    remaining drug are skipped, as are sentences without a protein-level
    ("prot") variant description.

    Arguments:
        pmid: document identifier, passed through to varFinder.groundVariant().
        text: the full document text.

    Yields:
        One list per grounded mutation of each protein variant:
        [section, genes, diseases, drugs, variantMentions, sentenceText],
        where the four middle fields are "|"-joined text snippets taken from
        the sentence.

    >>> startup({})
    >>> list(findVarDisGeneDrug(0, "The R71G BRCA1 is a breast cancer founder mutation not treatable with Herceptin"))
    """
    textLow = text.lower()

    # very basic filter, remove documents without some basic keywords
    keywords = (" variant ", " mutation", " substitution", " mutant ")
    if not any(keyword in textLow for keyword in keywords):
        return

    for section, _sentStart, _sentEnd, sentText in pubNlp.sectionSentences(text):
        genes = list(geneFinder.findGeneNames(sentText))
        if not genes:
            continue

        conds = list(pubNlp.findDiseases(sentText))
        drugs = list(pubNlp.findDrugs(sentText))

        # remove diseases and drugs that are also genes
        drugs = pubNlp.rangeRemoveOverlaps(drugs, genes)
        conds = pubNlp.rangeRemoveOverlaps(conds, genes)

        # check if we still have a disease and drug left
        if len(conds) == 0 or len(drugs) == 0:
            continue

        # debug output goes through logging (as in findDisGeneVariant), not
        # to stdout, so that callers consuming the rows are not disturbed
        logging.debug("drugs %s", drugs)
        logging.debug("diseases %s", conds)

        geneSnips = pubNlp.rangeTexts(sentText, genes)
        condSnips = pubNlp.rangeTexts(sentText, conds)
        drugSnips = pubNlp.rangeTexts(sentText, drugs)

        # gene positions are excluded from the variant search
        genePosSet = pubNlp.rangeToPosSet(genes)
        variants = varFinder.findVariantDescriptions(sentText, exclPos=genePosSet)

        # the last field of the genes rows is the entrez ID
        entrezIds = [geneRow[-1] for geneRow in genes]

        # we need a protein variant, not DNA
        if "prot" not in variants:
            continue

        # these fields are the same for every row of this sentence
        geneField = "|".join(geneSnips)
        condField = "|".join(condSnips)
        drugField = "|".join(drugSnips)

        for variant, mentions in variants["prot"]:
            logging.debug("grounding variant %s %s", variant, mentions)
            groundedMuts, _ungroundVar, _beds = varFinder.groundVariant(
                pmid, sentText, variant, mentions, [], entrezIds)

            # the variant snippets depend only on the mentions, not on the
            # individual grounded mutation
            coords = [(mention.start, mention.end) for mention in mentions]
            varField = "|".join(pubNlp.rangeTexts(sentText, coords))

            # one row per grounded mutation; a fresh list is built for each
            # yield so that callers may modify a row without affecting others
            for _mutInfo in groundedMuts:
                yield [section, geneField, condField, drugField, varField, sentText]
def findDisGeneVariant(text):
    """Yield one annotated tuple per sentence of text, describing the drugs,
    diseases, genes and grounded protein substitutions found in it.

    Genes are first collected over the whole document; the last field of each
    gene row is used as the candidate ID set when grounding variants. For each
    sentence from pubNlp.sectionSentences(), drug and disease ranges that
    overlap a gene range are removed. Protein ("prot") variant descriptions
    with mutType "sub" are grounded, and every grounded mutation contributes
    an entry "<orig><1-based pos><mut>=<geneSymbol>:<hgvsProt>".

    No filtering is applied: a tuple is yielded for every sentence, even when
    no drug, disease, gene or variant was found in it.

    Yields:
        (start, end, section, drugDesc, condDesc, geneDesc, mutDesc, sentence)
        where mutDesc is the "|"-joined list of variant entries and the other
        *Desc fields come from rangeDescs().

    NOTE(review): the expected output of the first example below shows a bare
    'V233T' in the variant field, which does not look like the
    "<desc>=<symbol>:<hgvs>" entries the code builds -- confirm whether the
    doctest is still current.

    >>> geneFinder.initData(exclMarkerTypes=["dnaSeq", "band"])
    >>> varFinder.loadDb(loadSequences=False)
    >>> list(findDisGeneVariant("Diabetes is caused by a PITX2 mutation, V234T and influenced by Herceptin."))
    [(0, 74, 'probablyAbstract', '64-73:Herceptin=Trastuzumab', '0-8:Diabetes=Diabetes Mellitus', '24-29:PITX2=symbol', 'V233T', 'Diabetes is caused by a PITX2 mutation, V234T and influenced by Herceptin.')]
    >>> #list(findDisGeneVariant("We undertook a quantitative review of the literature to estimate the effectiveness of desferrioxamine and deferiprone in decreasing hepatic iron concentrations (HIC) in thalassemia major."))
    >>> list(findDisGeneVariant("his mutation, we cotransfected C3H10T cells with expression vectors encoding SMO-WT or SMO-D473H "))
    """
    # the last field of the gene rows is taken as the ID, over the whole document
    docEntrezIds = {geneRow[-1] for geneRow in geneFinder.findGeneNames(text)}

    for section, start, end, sentence in pubNlp.sectionSentences(text):
        conds = list(pubNlp.findDiseases(sentence))
        drugs = list(pubNlp.findDrugs(sentence))
        genes = list(geneFinder.findGeneNames(sentence))

        # remove drugs and conds that are also genes
        drugs = rangeRemoveOverlaps(drugs, genes)
        conds = rangeRemoveOverlaps(conds, genes)

        mutDescs = []
        mutDict = varFinder.findVariantDescriptions(sentence)
        # only protein-level descriptions are grounded; a missing "prot" key
        # simply leaves mutDescs empty
        for varDesc, mentions in mutDict.get("prot", ()):
            if varDesc.mutType != "sub":
                continue
            # lazy %-args: the message is only formatted if debug is enabled
            logging.debug("grounding variant: %s %s", varDesc, mentions)
            groundedMuts, _ungroundVar, _beds = varFinder.groundVariant(
                None, sentence, varDesc, mentions, [], docEntrezIds)
            for mutInfo in groundedMuts:
                # varDesc.start is 0-based, the description is 1-based
                shortDesc = varDesc.origSeq + str(varDesc.start + 1) + varDesc.mutSeq
                mutDescs.append(
                    shortDesc + "=%s:%s" % (mutInfo.geneSymbol, mutInfo.hgvsProt))

        mutDesc = "|".join(mutDescs)
        drugDesc = rangeDescs(sentence, drugs)
        condDesc = rangeDescs(sentence, conds)
        geneDesc = rangeDescs(sentence, genes, useSym=True)
        yield (start, end, section, drugDesc, condDesc, geneDesc, mutDesc, sentence)
def findDisGeneVariant(text):
    """Yield one annotated tuple per sentence of text, describing the drugs,
    diseases, genes and grounded protein substitutions found in it.

    Genes are first collected over the whole document; the last field of each
    gene row is used as the candidate ID set when grounding variants. For each
    sentence from pubNlp.sectionSentences(), drug and disease ranges that
    overlap a gene range are removed. Protein ("prot") variant descriptions
    with mutType "sub" are grounded, and every grounded mutation contributes
    an entry "<orig><1-based pos><mut>=<geneSymbol>:<hgvsProt>".

    No filtering is applied: a tuple is yielded for every sentence, even when
    no drug, disease, gene or variant was found in it.

    Yields:
        (start, end, section, drugDesc, condDesc, geneDesc, mutDesc, sentence)
        where mutDesc is the "|"-joined list of variant entries and the other
        *Desc fields come from rangeDescs().

    NOTE(review): the expected output of the first example below shows a bare
    'V233T' in the variant field, which does not look like the
    "<desc>=<symbol>:<hgvs>" entries the code builds -- confirm whether the
    doctest is still current.

    >>> geneFinder.initData(exclMarkerTypes=["dnaSeq", "band"])
    >>> varFinder.loadDb(loadSequences=False)
    >>> list(findDisGeneVariant("Diabetes is caused by a PITX2 mutation, V234T and influenced by Herceptin."))
    [(0, 74, 'probablyAbstract', '64-73:Herceptin=Trastuzumab', '0-8:Diabetes=Diabetes Mellitus', '24-29:PITX2=symbol', 'V233T', 'Diabetes is caused by a PITX2 mutation, V234T and influenced by Herceptin.')]
    >>> #list(findDisGeneVariant("We undertook a quantitative review of the literature to estimate the effectiveness of desferrioxamine and deferiprone in decreasing hepatic iron concentrations (HIC) in thalassemia major."))
    >>> list(findDisGeneVariant("his mutation, we cotransfected C3H10T cells with expression vectors encoding SMO-WT or SMO-D473H "))
    """
    # the last field of the gene rows is taken as the ID, over the whole document
    docEntrezIds = {geneRow[-1] for geneRow in geneFinder.findGeneNames(text)}

    for section, start, end, sentence in pubNlp.sectionSentences(text):
        conds = list(pubNlp.findDiseases(sentence))
        drugs = list(pubNlp.findDrugs(sentence))
        genes = list(geneFinder.findGeneNames(sentence))

        # remove drugs and conds that are also genes
        drugs = rangeRemoveOverlaps(drugs, genes)
        conds = rangeRemoveOverlaps(conds, genes)

        mutDescs = []
        mutDict = varFinder.findVariantDescriptions(sentence)
        # only protein-level descriptions are grounded; a missing "prot" key
        # simply leaves mutDescs empty
        for varDesc, mentions in mutDict.get("prot", ()):
            if varDesc.mutType != "sub":
                continue
            # lazy %-args: the message is only formatted if debug is enabled
            logging.debug("grounding variant: %s %s", varDesc, mentions)
            groundedMuts, _ungroundVar, _beds = varFinder.groundVariant(
                None, sentence, varDesc, mentions, [], docEntrezIds)
            for mutInfo in groundedMuts:
                # varDesc.start is 0-based, the description is 1-based
                shortDesc = varDesc.origSeq + str(varDesc.start + 1) + varDesc.mutSeq
                mutDescs.append(
                    shortDesc + "=%s:%s" % (mutInfo.geneSymbol, mutInfo.hgvsProt))

        mutDesc = "|".join(mutDescs)
        drugDesc = rangeDescs(sentence, drugs)
        condDesc = rangeDescs(sentence, conds)
        geneDesc = rangeDescs(sentence, genes, useSym=True)
        yield (start, end, section, drugDesc, condDesc, geneDesc, mutDesc, sentence)