예제 #1
0
def extract(sentence):
    """Build the list of candidate mentions found in ``sentence``.

    The sentence is processed in four steps:

    1. Every word gets a ``stem`` attribute computed with the module-level
       ``stemmer``. If no word longer than two characters is found in
       ``english_dict`` (as-is or casefolded), an empty list is returned.
    2. Every phrase that is in ``genes_with_hpoterm`` becomes an
       ``"HPOTERM_SUP_GENEL"`` mention with ``is_correct = False``.
    3. Every remaining phrase whose set of stems is a key of
       ``hpoterms_dict`` becomes an ``"HPOTERM"`` mention.
    4. If no mention was produced and ``random.random() <= NEG_PROB``, one
       randomly picked word whose POS tag starts with ``"NN"`` is emitted
       as an ``"HPOTERM_SUP_rand"`` mention with ``is_correct = False``.

    Args:
        sentence: object exposing ``words``, a sequence of word objects
            with ``word``, ``lemma``, ``pos`` and ``in_sent_idx``
            attributes. Each word's ``stem`` attribute is (over)written as
            a side effect.

    Returns:
        The list of ``Mention`` objects created (possibly empty).
        ``add_features`` has been called on each of them.
    """
    mentions = []
    seen_mention_ids = set()
    words = sentence.words

    # Stem every word (the later steps read word.stem), and remember whether
    # at least one word looks English. No early exit: all words need a stem.
    has_english_word = False
    for word in words:
        word.stem = stemmer.stem(word.word)
        if len(word.word) > 2 and (
                word.word in english_dict or
                word.word.casefold() in english_dict):
            has_english_word = True
    # If there are no English words in the sentence, we skip it.
    if not has_english_word:
        return mentions

    # Indexes of the words already consumed by a mention.
    # NOTE(review): the set stores word.in_sent_idx but is compared against
    # positions in sentence.words -- assumes both coincide; confirm.
    used_indexes = set()

    # First pass: phrases that are gene long names containing a phenotype
    # name become candidates supervised as negative.
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        # Only the first and last word of the phrase are checked here.
        if start in used_indexes or end - 1 in used_indexes:
            continue
        phrase_words = words[start:end]
        phrase = " ".join(word.word for word in phrase_words)
        if len(phrase) > 1 and phrase in genes_with_hpoterm:
            mention = Mention("HPOTERM_SUP_GENEL", phrase, phrase_words)
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
            used_indexes.update(word.in_sent_idx for word in phrase_words)

    # Matches words made only of underscores / non-word characters.
    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    symbols_only = re.compile(r"^(_|\W)+$")

    # Second pass: phrases whose stems match an entry of hpoterms_dict.
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        # Skip phrases overlapping any word already used for a mention.
        if any(i in used_indexes for i in range(start, end)):
            continue
        phrase_words = words[start:end]
        # Stems of the phrase, ignoring symbols and stopwords (one-character
        # words are never treated as stopwords).
        phrase_stems_set = frozenset(
            word.stem for word in phrase_words
            if not symbols_only.match(word.word) and
            (len(word.word) == 1 or
             word.lemma.casefold() not in stopwords_dict))
        if phrase_stems_set not in hpoterms_dict:
            continue

        # Find the word objects that match: the first word for each stem,
        # never reusing a lemma or a stem.
        mention_words = []
        seen_lemmas = set()
        seen_stems = set()
        for word in phrase_words:
            if word.stem not in phrase_stems_set:
                continue
            lemma = word.lemma.casefold()
            if lemma in seen_lemmas or word.stem in seen_stems:
                continue
            seen_lemmas.add(lemma)
            seen_stems.add(word.stem)
            mention_words.append(word)
            if len(mention_words) == len(phrase_stems_set):
                break

        # NOTE(review): takes the first element of the dict value; if that
        # value is an unordered collection the choice is arbitrary.
        entity = list(hpoterms_dict[phrase_stems_set])[0]
        mention = Mention("HPOTERM",
                          hponames_to_ids[entity] + "|" + entity,
                          mention_words)
        # The following is a way to avoid duplicates.
        # It's ugly and not perfect
        mention_id = mention.id()
        if mention_id in seen_mention_ids:
            continue
        seen_mention_ids.add(mention_id)
        add_features(mention, sentence)
        mentions.append(mention)
        used_indexes.update(word.in_sent_idx for word in mention_words)

    # Generate a negative candidate at random if this sentence didn't
    # contain any other candidate. We want the candidate to be a noun.
    if not mentions and random.random() <= NEG_PROB:
        last_index = len(words) - 1
        index = random.randint(0, last_index)
        # We may not get a noun at random, so we try again (at most 10
        # more times) if we don't.
        tries = 10
        while not words[index].pos.startswith("NN") and tries > 0:
            index = random.randint(0, last_index)
            tries -= 1
        if words[index].pos.startswith("NN"):
            mention = Mention("HPOTERM_SUP_rand",
                              words[index].lemma.casefold(),
                              words[index:index + 1])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
    return mentions
def extract(sentence):
    """Return the candidate mentions extracted from ``sentence``.

    Steps, in order:

    * Set ``word.stem`` on every word using the module-level ``stemmer``;
      return an empty list when no word longer than two characters is in
      ``english_dict`` (checked as-is and casefolded).
    * Create an ``"HPOTERM_SUP_GENEL"`` mention, marked
      ``is_correct = False``, for each phrase found in
      ``genes_with_hpoterm``.
    * Create an ``"HPOTERM"`` mention for each phrase, not overlapping a
      previous mention, whose frozenset of stems is a key of
      ``hpoterms_dict``. Mentions with an already-seen ``id()`` are dropped.
    * When nothing was created and ``random.random() <= NEG_PROB``, pick a
      random word; if a word whose POS tag starts with ``"NN"`` is found
      within the allowed retries, emit it as an ``"HPOTERM_SUP_rand"``
      mention marked ``is_correct = False``.

    Args:
        sentence: object with a ``words`` sequence; each word provides
            ``word``, ``lemma``, ``pos`` and ``in_sent_idx``. The ``stem``
            attribute of every word is assigned by this function.

    Returns:
        A list of ``Mention`` objects, each already passed through
        ``add_features``. May be empty.
    """
    mentions = []
    known_ids = set()
    tokens = sentence.words

    # Every token must receive a stem, so the loop never stops early even
    # once an English word has been found.
    found_english = False
    for token in tokens:
        token.stem = stemmer.stem(token.word)
        text = token.word
        if len(text) > 2 and (
                text in english_dict or text.casefold() in english_dict):
            found_english = True
    # If there are no English words in the sentence, we skip it.
    if not found_english:
        return mentions

    # Word indexes already covered by a mention.
    # NOTE(review): filled with in_sent_idx values but tested against slice
    # positions -- presumably the two are the same; verify.
    history = set()

    # Iterate over each phrase of length at most max_mention_length.
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        if start in history or end - 1 in history:
            continue
        span = tokens[start:end]
        phrase = " ".join(token.word for token in span)
        # If the phrase is a gene long name containing a phenotype name,
        # create a candidate that we supervise as negative.
        if len(phrase) > 1 and phrase in genes_with_hpoterm:
            mention = Mention("HPOTERM_SUP_GENEL", phrase, span)
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
            for token in span:
                history.add(token.in_sent_idx)

    # Pattern for tokens consisting solely of "_" and non-word characters.
    # Written as a raw string so that "\W" is not an invalid string escape.
    punctuation_only = re.compile(r"^(_|\W)+$")

    # Iterate over each phrase of length at most max_mention_length.
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        # A phrase touching any already-used word is skipped entirely.
        if any(position in history for position in range(start, end)):
            continue
        span = tokens[start:end]

        # The stems in the phrase (not from stopwords or symbols). A
        # single-character token is kept without looking at its lemma.
        stems = []
        for token in span:
            if punctuation_only.match(token.word):
                continue
            if len(token.word) == 1 or \
                    token.lemma.casefold() not in stopwords_dict:
                stems.append(token.stem)
        stems_key = frozenset(stems)
        if stems_key not in hpoterms_dict:
            continue

        # Collect one word object per matching stem, skipping any token
        # whose lemma or stem has already been taken.
        matched = []
        taken_lemmas = set()
        taken_stems = set()
        for token in span:
            if token.stem not in stems_key:
                continue
            folded_lemma = token.lemma.casefold()
            if folded_lemma in taken_lemmas or token.stem in taken_stems:
                continue
            taken_lemmas.add(folded_lemma)
            taken_stems.add(token.stem)
            matched.append(token)
            if len(matched) == len(stems_key):
                break

        # NOTE(review): only the first element of the dict value is used;
        # for an unordered value the selected entity is arbitrary.
        entity = list(hpoterms_dict[stems_key])[0]
        mention = Mention(
            "HPOTERM", hponames_to_ids[entity] + "|" + entity, matched)
        # The following is a way to avoid duplicates.
        # It's ugly and not perfect
        identifier = mention.id()
        if identifier in known_ids:
            continue
        known_ids.add(identifier)
        # Features
        add_features(mention, sentence)
        mentions.append(mention)
        for token in matched:
            history.add(token.in_sent_idx)

    # Generate a negative candidate at random, if this sentence didn't
    # contain any other candidate. We want the candidate to be a noun.
    if not mentions and random.random() <= NEG_PROB:
        upper = len(tokens) - 1
        index = random.randint(0, upper)
        # We may not get a noun at random, so we try again if we don't
        # (up to 10 extra draws).
        tries = 10
        while not tokens[index].pos.startswith("NN") and tries > 0:
            index = random.randint(0, upper)
            tries -= 1
        picked = tokens[index]
        if picked.pos.startswith("NN"):
            mention = Mention(
                "HPOTERM_SUP_rand", picked.lemma.casefold(),
                tokens[index:index + 1])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
    return mentions