def map(article, file, text, result):
    """Count lexicon matches in a file and how often a trigger word flanks them.

    Scans ``file.content`` with ``fastFind.fastFindFlankWords`` against the
    module-level ``lex`` (``wordDist=1``). For every match that survives the
    cleaning steps below, ``result[word]`` is updated in place. It is a
    two-element list ``[total, flanked]``:

    * ``total``   -- number of accepted occurrences of ``word``
    * ``flanked`` -- occurrences where one of the flanking words is
      "protein", "gene" or "locus" (compared case-insensitively)

    Args:
        article: not used by this function.
        file: object whose ``content`` attribute holds the text to scan.
        text: ignored; the scanned text is always ``file.content``.
        result: dict mapping word -> ``[total, flanked]``, modified in place.

    Returns:
        None. All output goes into ``result``.
    """
    text = file.content
    triggerWords = ("protein", "gene", "locus")

    for start, end, _, leftWords, rightWords in fastFind.fastFindFlankWords(
            text, lex, wordDist=1):
        word = text[start:end]
        # skip matches with garbage between the words, like "(" or "#"
        if not wordRe.match(word):
            continue
        # collapse runs of whitespace into single spaces
        word = " ".join(word.split())
        # skip words that contain more than one dash
        if word.count("-") > 1:
            continue
        # glue a dash to its neighbours: "a - b" -> "a-b"
        word = word.replace(" -", "-").replace("- ", "-")
        if word in blackList:
            continue

        # the counts are kept in a list so they can be incremented in place
        counts = result.setdefault(word, [0, 0])
        counts[0] += 1

        # second counter: at least one flanking word is a trigger word
        flanks = {w.lower() for w in leftWords}
        flanks.update(w.lower() for w in rightWords)
        if not flanks.isdisjoint(triggerWords):
            counts[1] += 1
示例#2
0
def map(article, file, text, result):
    """Tally lexicon hits in ``file.content`` and their trigger-word context.

    Every match reported by ``fastFind.fastFindFlankWords(text, lex,
    wordDist=1)`` is cleaned up (whitespace collapsed, spaces around a dash
    removed). It is skipped if it fails ``wordRe``, contains more than one
    dash, or is in ``blackList``. For each remaining word,
    ``result[word]`` is a list ``[total, flanked]``: ``total`` is incremented
    for every accepted occurrence, and ``flanked`` only when "protein", "gene"
    or "locus" is among the lowercased flanking words.

    Args:
        article: not used by this function.
        file: object whose ``content`` attribute holds the text to scan.
        text: ignored; it is immediately replaced by ``file.content``.
        result: dict mapping word -> ``[total, flanked]``, modified in place.

    Returns:
        None. All output goes into ``result``.
    """
    text = file.content
    triggerWords = ("protein", "gene", "locus")

    matches = fastFind.fastFindFlankWords(text, lex, wordDist=1)
    for start, end, _, leftWords, rightWords in matches:
        # get and clean word
        word = text[start:end]
        if not wordRe.match(word):
            # garbage between words, like "(" or "#"
            continue
        word = " ".join(word.split())  # remove multi whitespace
        if word.count("-") > 1:
            # more than one dash: not treated as a usable word
            continue
        word = word.replace(" -", "-").replace("- ", "-")
        if word in blackList:
            continue

        # a mutable two-element list lets both counters be bumped in place
        counts = result.setdefault(word, [0, 0])
        counts[0] += 1

        # check and increment the second count if a flanking word is a
        # trigger word
        allFlanks = [w.lower() for w in leftWords]
        allFlanks.extend(w.lower() for w in rightWords)
        if any(trigger in allFlanks for trigger in triggerWords):
            counts[1] += 1
示例#3
0
def map(article, file, text, result):
    """Record the words flanking each accepted lexicon match in a file.

    Scans ``file.content`` with ``fastFind.fastFindFlankWords`` against the
    module-level ``lex`` (``wordDist=2``). Every match is cleaned up
    (whitespace collapsed, spaces around a dash removed). It is skipped if it
    fails ``wordRe``, contains more than one dash, or is in ``blackList``.
    For each remaining word, ``result[word]`` is created as
    ``{"l": {}, "r": {}}`` if it is missing. The left and right flanking
    words are then handed to ``addToResults`` under the keys "l" and "r".

    Args:
        article: not used by this function.
        file: object whose ``content`` attribute holds the text to scan.
        text: ignored; the scanned text is always ``file.content``.
        result: dict mapping word -> ``{"l": ..., "r": ...}``, modified in
            place.

    Returns:
        None. All output goes into ``result``.
    """
    text = file.content

    for start, end, _, leftWords, rightWords in fastFind.fastFindFlankWords(
            text, lex, wordDist=2):
        word = text[start:end]
        # skip matches with garbage between the words, like "(" or "#"
        if not wordRe.match(word):
            continue
        # collapse runs of whitespace into single spaces
        word = " ".join(word.split())
        # skip words that contain more than one dash
        if word.count("-") > 1:
            continue
        # glue a dash to its neighbours: "a - b" -> "a-b"
        word = word.replace(" -", "-").replace("- ", "-")
        if word in blackList:
            continue

        # one sub-dict per side; the entry is created before addToResults
        # runs (presumably addToResults fills result[word][side] -- confirm
        # against its definition)
        result.setdefault(word, {"l": {}, "r": {}})
        addToResults(word, leftWords, "l", result)
        addToResults(word, rightWords, "r", result)
示例#4
0
def findGeneNames(text):
    """
    Yield gene name and gene symbol matches found in text.

    Each match is a tuple (start, end, matchType, geneId). matchType is one of:

    - 'geneName': a match of the lowercased text against geneNameLex
    - 'symbol': a match against geneSymLex that is either not marked as
      ambiguous, or is ambiguous but has a required trigger word among its
      flanking words
    - 'symbolMaybe': a match of an ambiguous symbol (its geneId in geneSymLex
      starts with "?") without such a trigger word

    Will always return the gene name matches before the symbol matches.

    Non-ambiguous symbols are dropped when "pathway" is among the words
    returned by getFlankWords(). Ambiguous symbols are not checked for
    "pathway".

    >>> initData(addOptional=True)
    >>> list(findGeneNames("thyroid hormone receptor, beta"))
    [(0, 30, 'geneName', '7068')]
    >>> list(findGeneNames("FATE1"))
    [(0, 5, 'symbolMaybe', '89885')]
    >>> list(findGeneNames("FATE1 is overexpressed"))
    [(0, 5, 'symbol', '89885')]
    >>> list(findGeneNames("fate1 is overexpressed"))
    []
    >>> list(findGeneNames("PITX2 overexpression"))
    [(0, 5, 'symbol', '5308')]

    # ignore genes that are immediately flanked by "pathway"
    >>> list(findGeneNames("PITX2 pathway"))
    []

    # XX need to correct this
    >>> list(findGeneNames(" BLAST "))
    [(1, 6, 'symbolMaybe', '962')]
    """
    assert geneSymLex is not None, "geneSymLex is not initialized"
    textLower = text.lower()

    # pass 1: gene names, matched against the lowercased text
    for start, end, geneId in fastFind.fastFind(textLower,
                                                geneNameLex,
                                                toLower=False):
        yield (start, end, 'geneName', geneId)

    # pass 2: gene symbols, matched against the text in its original case
    flankFindIter = fastFind.fastFindFlankWords(text,
                                                geneSymLex,
                                                wordDist=2,
                                                wordRe=fastFind.SYMRE,
                                                toLower=False)
    for start, end, geneId, leftWords, rightWords in flankFindIter:
        # if the symbol is marked as potentially ambiguous, check the
        # flanking words for one of the required trigger words
        if geneId.startswith("?"):
            geneId = geneId.strip("?")
            hasTrigger = (
                not symLeftReqWords.isdisjoint(w.lower() for w in leftWords)
                or not symRightReqWords.isdisjoint(
                    w.lower() for w in rightWords))
            matchType = 'symbol' if hasTrigger else 'symbolMaybe'
            yield (start, end, matchType, geneId)
            continue

        # otherwise pass them through, but ignore genes that are immediately
        # flanked by "pathway"
        flankWords = getFlankWords(start, end, textLower)
        if "pathway" in flankWords:
            logging.debug("ignored %s, flank words are %s",
                          text[start:end], flankWords)
            continue
        yield (start, end, 'symbol', geneId)
示例#5
0
def findGeneNames(text):
    """
    Look for gene names and gene symbols in text and yield them as tuples.

    Yields (start, end, matchType, geneId) tuples. Gene names are matched
    against the lowercased text using geneNameLex and get the matchType
    'geneName'. Symbols are matched against the original text using
    geneSymLex. Some symbols need flanking trigger words: their geneId in
    geneSymLex starts with "?". If a trigger word from symLeftReqWords or
    symRightReqWords is present among the lowercased flanking words, they are
    returned as 'symbol'; if not, they are returned as 'symbolMaybe'.
    All other symbols are returned as 'symbol', unless "pathway" is among the
    words returned by getFlankWords(), in which case they are skipped.

    Will always return the gene name matches before the symbol matches.

    >>> initData(addOptional=True)
    >>> list(findGeneNames("thyroid hormone receptor, beta"))
    [(0, 30, 'geneName', '7068')]
    >>> list(findGeneNames("FATE1"))
    [(0, 5, 'symbolMaybe', '89885')]
    >>> list(findGeneNames("FATE1 is overexpressed"))
    [(0, 5, 'symbol', '89885')]
    >>> list(findGeneNames("fate1 is overexpressed"))
    []
    >>> list(findGeneNames("PITX2 overexpression"))
    [(0, 5, 'symbol', '5308')]

    # ignore genes that are immediately flanked by "pathway"
    >>> list(findGeneNames("PITX2 pathway"))
    []

    # XX need to correct this
    >>> list(findGeneNames(" BLAST "))
    [(1, 6, 'symbolMaybe', '962')]
    """
    assert geneSymLex is not None, "geneSymLex is not initialized"
    textLower = text.lower()

    # gene names first, so they always precede the symbol matches
    for start, end, geneId in fastFind.fastFind(textLower, geneNameLex):
        yield (start, end, 'geneName', geneId)

    flankFindIter = fastFind.fastFindFlankWords(
        text, geneSymLex, wordDist=2, wordRe=fastFind.SYMRE)
    for start, end, geneId, leftWords, rightWords in flankFindIter:
        if not geneId.startswith("?"):
            # unambiguous symbol: pass it through, but ignore genes that are
            # immediately flanked by "pathway"
            flankWords = getFlankWords(start, end, textLower)
            if "pathway" in flankWords:
                logging.debug("ignored %s, flank words are %s",
                              text[start:end], flankWords)
                continue
            yield (start, end, 'symbol', geneId)
            continue

        # the symbol is marked as potentially ambiguous: it only counts as a
        # full 'symbol' if a required trigger word flanks it
        geneId = geneId.strip("?")
        leftHit = not symLeftReqWords.isdisjoint(w.lower() for w in leftWords)
        rightHit = not symRightReqWords.isdisjoint(
            w.lower() for w in rightWords)
        if leftHit or rightHit:
            yield (start, end, 'symbol', geneId)
        else:
            yield (start, end, 'symbolMaybe', geneId)
示例#6
0
def map(article, file, text, result):
    """Collect the left and right flanking words of lexicon matches in a file.

    Every match reported by ``fastFind.fastFindFlankWords(text, lex,
    wordDist=2)`` on ``file.content`` is cleaned up (whitespace collapsed,
    spaces around a dash removed). It is skipped if it fails ``wordRe``,
    contains more than one dash, or is in ``blackList``. For each accepted
    word, ``result[word]`` is initialised to ``{"l": {}, "r": {}}`` when it
    is missing. ``addToResults`` is then called once with the left flanking
    words (key "l") and once with the right ones (key "r").

    Args:
        article: not used by this function.
        file: object whose ``content`` attribute holds the text to scan.
        text: ignored; it is immediately replaced by ``file.content``.
        result: dict mapping word -> ``{"l": ..., "r": ...}``, modified in
            place.

    Returns:
        None. All output goes into ``result``.
    """
    text = file.content

    matches = fastFind.fastFindFlankWords(text, lex, wordDist=2)
    for start, end, _, leftWords, rightWords in matches:
        # get and clean word
        word = text[start:end]
        if not wordRe.match(word):
            # garbage between words, like "(" or "#"
            continue
        word = " ".join(word.split())  # remove multi whitespace
        if word.count("-") > 1:
            # more than one dash: not treated as a usable word
            continue
        word = word.replace(" -", "-").replace("- ", "-")
        if word in blackList:
            continue

        # a dict with one sub-dict per side ("l" = left, "r" = right); it is
        # created before the addToResults calls below
        result.setdefault(word, {"l": {}, "r": {}})
        for side, flankWords in (("l", leftWords), ("r", rightWords)):
            addToResults(word, flankWords, side, result)