def map(article, file, text, result):
    """
    Go over the words of the text and check if they are in the dict.

    For every accepted match, result is updated in place:
        result[word][0] -- number of matches of the cleaned word
        result[word][1] -- number of those matches that have "protein",
                           "gene" or "locus" among their flanking words

    article -- not used by this function
    file    -- object whose .content attribute holds the text to search
    text    -- ignored, it is immediately replaced by file.content
    result  -- dict: cleaned word -> two-element list of counters

    Returns nothing, all output goes into result.
    """
    text = file.content
    # each match is a (start, end, id, leftWords, rightWords) tuple
    matches = list(fastFind.fastFindFlankWords(text, lex, wordDist=1))

    for start, end, _matchId, leftWords, rightWords in matches:
        rawWord = text[start:end]
        # skip if there is garbage between the words, like ( or # etc
        if not wordRe.match(rawWord):
            continue

        # collapse runs of whitespace into single spaces
        word = " ".join(rawWord.split())

        # skip words that contain more than one dash
        if word.count("-") > 1:
            continue

        # no space allowed on either side of a dash
        word = word.replace(" -", "-").replace("- ", "-")

        if word in blackList:
            continue

        # the counters have to be returned as a two-element list
        result.setdefault(word, [0, 0])
        result[word][0] += 1

        # second counter: is one of the flanking words a trigger word?
        flankWords = [w.lower() for w in list(leftWords) + list(rightWords)]
        if any(trigger in flankWords for trigger in ("protein", "gene", "locus")):
            result[word][1] += 1
def map(article, file, text, result):
    """
    Go over the words of the text and check if they are in the dict.

    For every accepted match, result is updated in place:
        result[word][0] -- number of matches of the cleaned word
        result[word][1] -- number of those matches that have "protein",
                           "gene" or "locus" among their flanking words

    article -- not used by this function
    file    -- object whose .content attribute holds the text to search
    text    -- ignored, it is immediately replaced by file.content
    result  -- dict: cleaned word -> two-element list of counters

    Returns nothing, all output goes into result.
    """
    text = file.content
    # each match is a (start, end, id, leftWords, rightWords) tuple
    matches = list(fastFind.fastFindFlankWords(text, lex, wordDist=1))

    for start, end, _matchId, leftWords, rightWords in matches:
        rawWord = text[start:end]
        # skip if there is garbage between the words, like ( or # etc
        if not wordRe.match(rawWord):
            continue

        # collapse runs of whitespace into single spaces
        word = " ".join(rawWord.split())

        # skip words that contain more than one dash
        if word.count("-") > 1:
            continue

        # no space allowed on either side of a dash
        word = word.replace(" -", "-").replace("- ", "-")

        if word in blackList:
            continue

        # the counters have to be returned as a two-element list
        result.setdefault(word, [0, 0])
        result[word][0] += 1

        # second counter: is one of the flanking words a trigger word?
        flankWords = [w.lower() for w in list(leftWords) + list(rightWords)]
        if any(trigger in flankWords for trigger in ("protein", "gene", "locus")):
            result[word][1] += 1
def map(article, file, text, result):
    """
    Go over the words of the text and check if they are in the dict.

    Every accepted match gets an entry result[word] = {"l": {}, "r": {}}
    (created only if missing) and its left and right flanking words are
    handed to addToResults() under the keys "l" and "r".

    article -- not used by this function
    file    -- object whose .content attribute holds the text to search
    text    -- ignored, it is immediately replaced by file.content
    result  -- dict, updated in place: cleaned word -> {"l": dict, "r": dict}

    Returns nothing, all output goes into result.
    """
    text = file.content
    # each match is a (start, end, id, leftWords, rightWords) tuple
    matches = list(fastFind.fastFindFlankWords(text, lex, wordDist=2))

    for start, end, _matchId, leftWords, rightWords in matches:
        rawWord = text[start:end]
        # skip if there is garbage between the words, like ( or # etc
        if not wordRe.match(rawWord):
            continue

        # collapse runs of whitespace into single spaces
        word = " ".join(rawWord.split())

        # skip words that contain more than one dash
        if word.count("-") > 1:
            continue

        # no space allowed on either side of a dash
        word = word.replace(" -", "-").replace("- ", "-")

        if word in blackList:
            continue

        # one dict per side of the word: "l"eft and "r"ight
        result.setdefault(word, {"l": {}, "r": {}})
        # addToResults is defined elsewhere; presumably it records the flank
        # words in result[word][side] -- TODO confirm
        addToResults(word, leftWords, "l", result)
        addToResults(word, rightWords, "r", result)
def findGeneNames(text):
    """
    Look for gene names and symbols in text.

    Generator, yields tuples (start, end, matchType, geneId) where matchType
    is one of 'geneName', 'symbol' or 'symbolMaybe'.

    Some symbols need flanking trigger words. If these are not present, they
    are returned as "symbolMaybe".

    Will always return the gene name matches before the symbol matches.
    Gene names are searched in the lowercased text, symbols in the text as
    it was passed in.

    geneSymLex must not be None when this is called (the examples below call
    initData first).

    >>> initData(addOptional=True)
    >>> list(findGeneNames("thyroid hormone receptor, beta"))
    [(0, 30, 'geneName', '7068')]
    >>> list(findGeneNames("FATE1"))
    [(0, 5, 'symbolMaybe', '89885')]
    >>> list(findGeneNames("FATE1 is overexpressed"))
    [(0, 5, 'symbol', '89885')]
    >>> list(findGeneNames("fate1 is overexpressed"))
    []
    >>> list(findGeneNames("PITX2 overexpression"))
    [(0, 5, 'symbol', '5308')]

    # ignore genes that are immediately flanked by "pathway"
    >>> list(findGeneNames("PITX2 pathway"))
    []

    # XX need to correct this
    >>> list(findGeneNames(" BLAST "))
    [(1, 6, 'symbolMaybe', '962')]
    """
    assert geneSymLex is not None

    # first pass: gene names, matched against the lowercased text
    textLower = text.lower()
    for start, end, geneId in fastFind.fastFind(textLower, geneNameLex, toLower=False):
        yield (start, end, 'geneName', geneId)

    # second pass: symbols, matched against the original text, with up to
    # wordDist flanking words reported for each match
    flankFindIter = fastFind.fastFindFlankWords(text, geneSymLex, wordDist=2,
                                                wordRe=fastFind.SYMRE, toLower=False)
    for start, end, geneId, leftWords, rightWords in flankFindIter:
        # if the symbol is marked as potentially ambiguous (id starts with "?"),
        # check the flanking words
        if geneId.startswith("?"):
            leftWords = [w.lower() for w in leftWords]
            rightWords = [w.lower() for w in rightWords]
            geneId = geneId.strip("?")
            # a required word on either side upgrades the match to 'symbol'
            if symLeftReqWords.intersection(leftWords) or \
                    symRightReqWords.intersection(rightWords):
                yield (start, end, 'symbol', geneId)
            else:
                yield (start, end, 'symbolMaybe', geneId)
        # otherwise pass them through
        else:
            # ignore genes that are immediately flanked by "pathway"
            flankWords = getFlankWords(start, end, textLower)
            if "pathway" in flankWords:
                # arguments are passed separately so the message is only
                # formatted when debug logging is switched on
                logging.debug("ignored %s, flank words are %s", text[start:end], flankWords)
                continue
            yield (start, end, 'symbol', geneId)
def findGeneNames(text):
    """
    Look for gene names and symbols in text.

    Generator, yields tuples (start, end, matchType, geneId) where matchType
    is one of 'geneName', 'symbol' or 'symbolMaybe'.

    Some symbols need flanking trigger words. If these are not present, they
    are returned as "symbolMaybe".

    Will always return the gene name matches before the symbol matches.
    Gene names are searched in the lowercased text, symbols in the text as
    it was passed in.

    geneSymLex must not be None when this is called (the examples below call
    initData first).

    >>> initData(addOptional=True)
    >>> list(findGeneNames("thyroid hormone receptor, beta"))
    [(0, 30, 'geneName', '7068')]
    >>> list(findGeneNames("FATE1"))
    [(0, 5, 'symbolMaybe', '89885')]
    >>> list(findGeneNames("FATE1 is overexpressed"))
    [(0, 5, 'symbol', '89885')]
    >>> list(findGeneNames("fate1 is overexpressed"))
    []
    >>> list(findGeneNames("PITX2 overexpression"))
    [(0, 5, 'symbol', '5308')]

    # ignore genes that are immediately flanked by "pathway"
    >>> list(findGeneNames("PITX2 pathway"))
    []

    # XX need to correct this
    >>> list(findGeneNames(" BLAST "))
    [(1, 6, 'symbolMaybe', '962')]
    """
    assert geneSymLex is not None

    # first pass: gene names, matched against the lowercased text
    textLower = text.lower()
    for start, end, geneId in fastFind.fastFind(textLower, geneNameLex):
        yield (start, end, 'geneName', geneId)

    # second pass: symbols, matched against the original text, with up to
    # wordDist flanking words reported for each match
    flankFindIter = fastFind.fastFindFlankWords(text, geneSymLex, wordDist=2,
                                                wordRe=fastFind.SYMRE)
    for start, end, geneId, leftWords, rightWords in flankFindIter:
        # if the symbol is marked as potentially ambiguous (id starts with "?"),
        # check the flanking words
        if geneId.startswith("?"):
            leftWords = [w.lower() for w in leftWords]
            rightWords = [w.lower() for w in rightWords]
            geneId = geneId.strip("?")
            # a required word on either side upgrades the match to 'symbol'
            if symLeftReqWords.intersection(leftWords) or \
                    symRightReqWords.intersection(rightWords):
                yield (start, end, 'symbol', geneId)
            else:
                yield (start, end, 'symbolMaybe', geneId)
        # otherwise pass them through
        else:
            # ignore genes that are immediately flanked by "pathway"
            flankWords = getFlankWords(start, end, textLower)
            if "pathway" in flankWords:
                # arguments are passed separately so the message is only
                # formatted when debug logging is switched on
                logging.debug("ignored %s, flank words are %s", text[start:end], flankWords)
                continue
            yield (start, end, 'symbol', geneId)
def map(article, file, text, result):
    """
    Go over the words of the text and check if they are in the dict.

    Every accepted match gets an entry result[word] = {"l": {}, "r": {}}
    (created only if missing) and its left and right flanking words are
    handed to addToResults() under the keys "l" and "r".

    article -- not used by this function
    file    -- object whose .content attribute holds the text to search
    text    -- ignored, it is immediately replaced by file.content
    result  -- dict, updated in place: cleaned word -> {"l": dict, "r": dict}

    Returns nothing, all output goes into result.
    """
    text = file.content
    # each match is a (start, end, id, leftWords, rightWords) tuple
    matches = list(fastFind.fastFindFlankWords(text, lex, wordDist=2))

    for start, end, _matchId, leftWords, rightWords in matches:
        rawWord = text[start:end]
        # skip if there is garbage between the words, like ( or # etc
        if not wordRe.match(rawWord):
            continue

        # collapse runs of whitespace into single spaces
        word = " ".join(rawWord.split())

        # skip words that contain more than one dash
        if word.count("-") > 1:
            continue

        # no space allowed on either side of a dash
        word = word.replace(" -", "-").replace("- ", "-")

        if word in blackList:
            continue

        # one dict per side of the word: "l"eft and "r"ight
        result.setdefault(word, {"l": {}, "r": {}})
        # addToResults is defined elsewhere; presumably it records the flank
        # words in result[word][side] -- TODO confirm
        addToResults(word, leftWords, "l", result)
        addToResults(word, rightWords, "r", result)