def extract(sentence):
    """Extract gene mention candidates from a sentence.

    Every phrase yielded by ``get_all_phrases_in_sentence`` (bounded by
    ``max_mention_length``) is joined with single spaces and looked up in
    two module-level dictionaries:

    * ``hpoterms_with_gene``: the phrase becomes a ``GENE_SUP_HPO``
      candidate with ``is_correct = False`` (a negative example).
    * ``merged_genes_dict``: the phrase becomes a ``GENE`` candidate whose
      entity is the "|"-joined list stored for that phrase.

    Args:
        sentence: object exposing ``words``, a sequence whose items have a
            ``word`` string attribute.

    Returns:
        The list of created mentions; empty when the sentence contains no
        word longer than two characters that is found in ``english_dict``.
    """
    words = sentence.words

    # Skip the sentence if there are no English words in it.
    has_english_word = any(
        len(word.word) > 2 and
        (word.word in english_dict or word.word.casefold() in english_dict)
        for word in words)
    if not has_english_word:
        return []

    # When the whole sentence is upper case the phrases are casefolded
    # before the dictionary lookups. This may not be a great idea...
    sentence_is_upper = " ".join([x.word for x in words]).isupper()

    mentions = []
    # Word indexes already covered by a mention.
    history = set()
    # Scan all subsequences of the sentence of length up to
    # max_mention_length.
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        span = range(start, end)
        # ``end`` is exclusive, so the overlap test has to look at the
        # indexes start..end-1: skip the phrase if any of them is already
        # part of a mention.
        if not history.isdisjoint(span):
            continue
        phrase = " ".join([word.word for word in words[start:end]])
        if sentence_is_upper:
            phrase = phrase.casefold()
        # If the phrase is a hpoterm name containing a gene, then it is a
        # mention candidate to supervise as negative.
        if phrase in hpoterms_with_gene:
            mention = Mention("GENE_SUP_HPO", phrase, words[start:end])
            add_features(mention, sentence)
            mention.is_correct = False
            mentions.append(mention)
            history.update(span)
        # NOTE(review): there is deliberately no `continue` above, so a
        # phrase found in both dictionaries yields two candidates, as
        # before -- confirm this is intended.
        # If the phrase is in the gene dictionary, then it is a mention
        # candidate.
        if len(phrase) > 1 and phrase in merged_genes_dict:
            # The entity is a list of all the main symbols that could have
            # the phrase as symbol. They're separated by "|".
            mention = Mention("GENE", "|".join(merged_genes_dict[phrase]),
                              words[start:end])
            add_features(mention, sentence)
            mentions.append(mention)
            # Record the indexes so that they are not used for another
            # mention.
            history.update(span)
    return mentions
no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, lambda x: TSVstring2list(x, int), TSVstring2list, no_op, lambda x: TSVstring2list(x, int), TSVstring2bool, no_op, no_op, lambda x: TSVstring2list(x, int), TSVstring2bool, no_op ]) # Create the sentence object where the two mentions appear sentence = Sentence(line_dict["doc_id"], line_dict["sent_id"], line_dict["wordidxs"], line_dict["words"], line_dict["poses"], line_dict["ners"], line_dict["lemmas"], line_dict["dep_paths"], line_dict["dep_parents"], line_dict["bounding_boxes"]) # Create the mentions gene_1_mention = Mention( "GENE", line_dict["gene_1_entity"], [sentence.words[j] for j in line_dict["gene_1_wordidxs"]]) gene_1_mention.is_correct = line_dict["gene_1_is_correct"] gene_1_mention.type = line_dict["gene_1_type"] gene_2_mention = Mention( "GENE", line_dict["gene_2_entity"], [sentence.words[j] for j in line_dict["gene_2_wordidxs"]]) gene_2_mention.is_correct = line_dict["gene_2_is_correct"] gene_2_mention.type = line_dict["gene_2_type"] # If the word indexes do not overlap, create the relation candidate # TODO there may be other cases. Check with Emily. if not set(line_dict["gene_1_wordidxs"]) & \ set(line_dict["gene_2_wordidxs"]): relation = Relation("GENEGENE", gene_1_mention, gene_2_mention) # Add features add_features(relation, gene_1_mention, gene_2_mention,
def supervise(mentions, sentence):
    """Label gene mention candidates using heuristic supervision rules.

    Candidates whose ``is_correct`` is already set are passed through
    untouched. For every other candidate the rules below are tried in
    order; the first one that applies sets ``is_correct`` and ``type`` on
    the candidate (in place) and ends processing of that candidate.

    Args:
        mentions: iterable of candidates exposing ``words``, ``wordidxs``,
            ``entity``, ``features``, ``is_correct`` and ``type``.
        sentence: object exposing ``words``; each word exposes ``word``,
            ``lemma``, ``ner`` and ``in_sent_idx``.

    Returns:
        A new list holding every input candidate in order. When the
        sentence starts with a contribution/address phrase, each
        unlabelled candidate is followed by two extra mentions
        (``GENE_dontsup`` and ``GENE_SUP_contr_2``).
    """
    sent_words = sentence.words
    n_words = len(sent_words)
    phrase = " ".join([x.word for x in sent_words])
    # The sentence starts with words that are indicative of the candidates
    # not being gene mentions. This only depends on the sentence, so it is
    # computed once.
    in_contrib_phrase = (
        phrase.startswith((
            "Performed the experiments :",
            "Wrote the paper :",
            "W'rote the paper :",
            "Wlrote the paper",
            "Contributed reagents",
            "Analyzed the data :")) or
        phrase.casefold().startswith("address"))
    dna_letters = set("ACGT")
    ner_classes = ("ORGANIZATION", "LOCATION", "PERSON")

    new_mentions = []
    for mention in mentions:
        new_mentions.append(mention)
        if mention.is_correct is not None:
            continue

        # The candidate is a long name.
        if " ".join([word.word for word in mention.words]) in \
                inverted_long_names:
            mention.is_correct = True
            mention.type = "GENE_SUP_long"
            continue

        first_word = mention.words[0]

        # The candidate is a MIM entry: "MIM", optionally followed by one
        # of "no", "no.", "#", ":", and then an integer.
        if first_word.word == "MIM":
            mim_idx = first_word.in_sent_idx
            if mim_idx < n_words - 1:
                next_word = sent_words[mim_idx + 1].word
                if next_word.casefold() in ("no", "no.", "#", ":") and \
                        mim_idx + 2 < n_words:
                    next_word = sent_words[mim_idx + 2].word
                try:
                    int(next_word)
                except ValueError:
                    pass
                else:
                    mention.is_correct = False
                    mention.type = "GENE_SUP_MIM"
                    continue

        # We add a feature for the contribution phrase, as it is a context
        # property.
        if in_contrib_phrase:
            # An unsupervised copy with the special feature.
            unsuper_enriched = Mention(
                "GENE_dontsup", mention.entity, mention.words)
            unsuper_enriched.features = mention.features.copy()
            unsuper_enriched.add_feature("IN_CONTRIB_PHRASE")
            new_mentions.append(unsuper_enriched)
            # This candidate contains only the 'special' feature.
            super_spec = Mention(
                "GENE_SUP_contr_2", mention.entity, mention.words)
            super_spec.is_correct = False
            super_spec.add_feature("IN_CONTRIB_PHRASE")
            new_mentions.append(super_spec)
            mention.is_correct = False
            mention.type = "GENE_SUP_contr_1"
            continue

        # The candidate is an entry in Gene Ontology ("GO" followed by
        # ":"). The following token's text is compared, with an explicit
        # bounds check instead of a catch-all exception handler.
        # NOTE(review): a lone "GO" that is not followed by ":" is left
        # unlabelled and skips all remaining rules -- confirm intended.
        if len(mention.words) == 1 and first_word.word == "GO":
            next_idx = first_word.in_sent_idx + 1
            if next_idx < n_words and sent_words[next_idx].word == ":":
                mention.is_correct = False
                mention.type = "GENE_SUP_go"
            continue

        # Word on the left of the candidate.
        left_idx = mention.wordidxs[0] - 1
        if left_idx >= 0:
            left = sent_words[left_idx]
            # Preceded by a "%" (it's probably a quantity).
            if left.word == "%":
                mention.is_correct = False
                mention.type = "GENE_SUP_%"
                continue
            # Comes after a "document element" (e.g., table, or figure).
            if left.word.casefold() in DOC_ELEMENTS:
                mention.is_correct = False
                mention.type = "GENE_SUP_doc"
                continue
            # Comes after an "individual" word and is a short token that
            # is not purely alphabetic.
            if left.word.casefold() in INDIVIDUALS and \
                    not first_word.word.isalpha() and \
                    len(first_word.word) <= 4:
                mention.is_correct = False
                mention.type = "GENE_SUP_indiv"
                continue
            # Comes after a "type" word, and it is made only of the
            # letters "I" and "V".
            if left.lemma.casefold() in TYPES and \
                    set(first_word.word) <= {"I", "V"}:
                mention.is_correct = False
                mention.type = "GENE_SUP_type"
                continue

        # Word on the right of the candidate.
        right_idx = mention.wordidxs[-1] + 1
        if right_idx < n_words:
            right_word = sent_words[right_idx].word
            # Followed by a "=" (it's probably a quantity).
            if right_word == "=":
                mention.is_correct = False
                mention.type = "GENE_SUP_="
                continue
            # Followed by a ":" and then a number (it's probably a
            # quantity). A ":" that is not followed by a number still ends
            # processing of this candidate, leaving it unlabelled.
            if right_word == ":":
                if right_idx + 1 < n_words:
                    try:
                        float(sent_words[right_idx + 1].word)
                    except ValueError:
                        pass
                    else:
                        mention.is_correct = False
                        mention.type = "GENE_SUP_:"
                continue
            # The candidate comes before "et".
            if right_word == "et":
                mention.is_correct = False
                mention.type = "GENE_SUP_et"
                continue

        # The candidate is a DNA triplet. We check this by looking at
        # whether the word before or after is also made only of the
        # letters A, C, G, T. The left neighbour at index 0 is included.
        if len(mention.words) == 1 and len(first_word.word) == 3 and \
                set(first_word.word) <= dna_letters:
            if left_idx >= 0 and \
                    set(sent_words[left_idx].word) <= dna_letters:
                mention.is_correct = False
                mention.type = "GENE_SUP_dna"
                continue
            if right_idx < n_words and \
                    set(sent_words[right_idx].word) <= dna_letters:
                mention.is_correct = False
                mention.type = "GENE_SUP_dna"
                continue

        # If it's "II", it's most probably wrong.
        if first_word.word == "II":
            mention.is_correct = False
            mention.type = "GENE_SUP_ii"
            continue

        # The candidate comes after an organization, or a location, or a
        # person. We skip commas as they may trick us.
        comes_after = None
        loc_idx = mention.wordidxs[0] - 1
        while loc_idx >= 0 and sent_words[loc_idx].lemma == ",":
            loc_idx -= 1
        if loc_idx >= 0 and \
                sent_words[loc_idx].ner in ner_classes and \
                sent_words[loc_idx].word not in merged_genes_dict:
            comes_after = sent_words[loc_idx].ner

        # The candidate comes before an organization, or a location, or a
        # person. We skip commas, as they may trick us.
        comes_before = None
        loc_idx = mention.wordidxs[-1] + 1
        while loc_idx < n_words and sent_words[loc_idx].lemma == ",":
            loc_idx += 1
        if loc_idx < n_words and \
                sent_words[loc_idx].ner in ner_classes and \
                sent_words[loc_idx].word not in merged_genes_dict:
            comes_before = sent_words[loc_idx].ner

        after_last_idx = mention.words[-1].in_sent_idx + 1
        if comes_before and comes_after:
            # Surrounded by named entities: most probably a person name.
            mention.is_correct = False
            mention.type = "GENE_SUP_name"
        elif comes_after == "PERSON" and \
                after_last_idx < n_words and \
                sent_words[after_last_idx].word in (",", ":"):
            # Comes after a person and before "," or ":", so it's probably
            # a person name.
            mention.is_correct = False
            mention.type = "GENE_SUP_name2"
        elif comes_after == "PERSON" and first_word.ner == "PERSON":
            mention.is_correct = False
            mention.type = "GENE_SUP_name3"
        elif comes_before == "LOCATION" and first_word.ner == "LOCATION":
            # Is a location and comes before a location, so it's probably
            # wrong.
            mention.is_correct = False
            mention.type = "GENE_SUP_loc"
    return new_mentions
line_dict["hpoterm_types"] = new_hpoterm_types # Create the sentence object where the two mentions appear sentence = Sentence( line_dict["doc_id"], line_dict["sent_id"], line_dict["wordidxs"], line_dict["words"], line_dict["poses"], line_dict["ners"], line_dict["lemmas"], line_dict["dep_paths"], line_dict["dep_parents"], line_dict["bounding_boxes"]) # Skip weird sentences if sentence.is_weird(): continue # Iterate over each pair of (gene,phenotype) mention for g_idx in range(len(line_dict["gene_is_corrects"])): g_wordidxs = TSVstring2list( line_dict["gene_wordidxss"][g_idx], int) gene_mention = Mention( "GENE", line_dict["gene_entities"][g_idx], [sentence.words[j] for j in g_wordidxs]) if line_dict["gene_is_corrects"][g_idx] == "n": gene_mention.is_correct = None elif line_dict["gene_is_corrects"][g_idx] == "f": gene_mention.is_correct = False elif line_dict["gene_is_corrects"][g_idx] == "t": gene_mention.is_correct = True else: assert False gene_mention.type = line_dict["gene_types"][g_idx] assert not gene_mention.type.endswith("_UNSUP") for h_idx in range(len(line_dict["hpoterm_is_corrects"])): h_wordidxs = TSVstring2list( line_dict["hpoterm_wordidxss"][h_idx], int) hpoterm_mention = Mention(
"gene_2_is_correct", "gene_2_type"], [no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, lambda x: TSVstring2list(x, int), TSVstring2list, no_op, lambda x: TSVstring2list(x, int), TSVstring2bool, no_op, no_op, lambda x: TSVstring2list(x, int), TSVstring2bool, no_op]) # Create the sentence object where the two mentions appear sentence = Sentence( line_dict["doc_id"], line_dict["sent_id"], line_dict["wordidxs"], line_dict["words"], line_dict["poses"], line_dict["ners"], line_dict["lemmas"], line_dict["dep_paths"], line_dict["dep_parents"], line_dict["bounding_boxes"]) # Create the mentions gene_1_mention = Mention( "GENE", line_dict["gene_1_entity"], [sentence.words[j] for j in line_dict["gene_1_wordidxs"]]) gene_1_mention.is_correct = line_dict["gene_1_is_correct"] gene_1_mention.type = line_dict["gene_1_type"] gene_2_mention = Mention( "GENE", line_dict["gene_2_entity"], [sentence.words[j] for j in line_dict["gene_2_wordidxs"]]) gene_2_mention.is_correct = line_dict["gene_2_is_correct"] gene_2_mention.type = line_dict["gene_2_type"] # If the word indexes do not overlap, create the relation candidate # TODO there may be other cases. Check with Emily. if not set(line_dict["gene_1_wordidxs"]) & \ set(line_dict["gene_2_wordidxs"]): relation = Relation( "GENEGENE", gene_1_mention, gene_2_mention) # Add features
def extract(sentence):
    """Extract phenotype (HPO term) mention candidates from a sentence.

    The function works in three passes:

    1. Phrases found in ``genes_with_hpoterm`` become ``HPOTERM_SUP_GENEL``
       candidates labelled ``is_correct = False``.
    2. For the remaining phrases, the set of stems of their non-symbol,
       non-stopword words is looked up in ``hpoterms_dict``; a hit yields
       an ``HPOTERM`` candidate built from the matching words.
    3. If nothing was found, with probability ``NEG_PROB`` a randomly
       chosen word whose POS tag starts with "NN" becomes a
       ``HPOTERM_SUP_rand`` candidate labelled ``is_correct = False``.

    As a side effect every word of the sentence gets a ``stem`` attribute.

    Args:
        sentence: object exposing ``words``; each word exposes ``word``,
            ``lemma``, ``pos`` and ``in_sent_idx``.

    Returns:
        The list of created mentions; empty when the sentence contains no
        word longer than two characters that is found in ``english_dict``.
    """
    mentions = []
    mention_ids = set()
    words = sentence.words

    # If there are no English words in the sentence, we skip it. The loop
    # never exits early because every word must receive its stem.
    no_english_words = True
    for word in words:
        word.stem = stemmer.stem(word.word)
        if len(word.word) > 2 and \
                (word.word in english_dict or
                 word.word.casefold() in english_dict):
            no_english_words = False
    if no_english_words:
        return mentions

    # Sentence indexes of words already used by a mention.
    history = set()

    # First pass: gene long names containing a phenotype name are
    # candidates that we supervise as negative.
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        if start in history or end - 1 in history:
            continue
        phrase = " ".join([word.word for word in words[start:end]])
        if len(phrase) > 1 and phrase in genes_with_hpoterm:
            mention = Mention("HPOTERM_SUP_GENEL", phrase, words[start:end])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
            for word in words[start:end]:
                history.add(word.in_sent_idx)

    # Matches tokens made only of underscores and non-word characters.
    # Raw string: "\W" is not a valid escape in a plain string literal.
    symbols_only = re.compile(r"^(_|\W)+$")

    # Second pass: match the stems of each phrase against the HPO terms.
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        if any(i in history for i in range(start, end)):
            continue
        phrase_words = words[start:end]
        # The stems in the phrase (not from stopwords or symbols).
        # Single-character words are kept even when they are stopwords.
        phrase_stems_set = frozenset(
            word.stem for word in phrase_words
            if not symbols_only.match(word.word) and
            (len(word.word) == 1 or
             word.lemma.casefold() not in stopwords_dict))
        if phrase_stems_set not in hpoterms_dict:
            continue

        # Find the word objects that match: one word per stem, skipping
        # words whose lemma or stem has already been taken.
        mention_words = []
        seen_lemmas = set()
        seen_stems = set()
        for word in phrase_words:
            lemma = word.lemma.casefold()
            if word.stem in phrase_stems_set and \
                    lemma not in seen_lemmas and \
                    word.stem not in seen_stems:
                seen_lemmas.add(lemma)
                seen_stems.add(word.stem)
                mention_words.append(word)
                if len(mention_words) == len(phrase_stems_set):
                    break

        entity = list(hpoterms_dict[phrase_stems_set])[0]
        mention = Mention("HPOTERM", hponames_to_ids[entity] + "|" + entity,
                          mention_words)
        # The following is a way to avoid duplicates.
        # It's ugly and not perfect.
        if mention.id() in mention_ids:
            continue
        mention_ids.add(mention.id())
        add_features(mention, sentence)
        mentions.append(mention)
        for word in mention_words:
            history.add(word.in_sent_idx)

    # Generate some negative candidates at random, if this sentence didn't
    # contain any other candidate. We want the candidates to be nouns.
    if not mentions and random.random() <= NEG_PROB:
        last_index = len(words) - 1
        index = random.randint(0, last_index)
        # We may not get a noun at random, so we try again if we don't.
        tries = 10
        while not words[index].pos.startswith("NN") and tries > 0:
            index = random.randint(0, last_index)
            tries -= 1
        if words[index].pos.startswith("NN"):
            mention = Mention("HPOTERM_SUP_rand",
                              words[index].lemma.casefold(),
                              words[index:index + 1])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
    return mentions
# Create the sentence object where the two mentions appear sentence = Sentence(line_dict["doc_id"], line_dict["sent_id"], line_dict["wordidxs"], line_dict["words"], line_dict["poses"], line_dict["ners"], line_dict["lemmas"], line_dict["dep_paths"], line_dict["dep_parents"], line_dict["bounding_boxes"]) # Skip weird sentences if sentence.is_weird(): continue # Iterate over each pair of (gene,phenotype) mention for g_idx in range(len(line_dict["gene_is_corrects"])): g_wordidxs = TSVstring2list(line_dict["gene_wordidxss"][g_idx], int) gene_mention = Mention("GENE", line_dict["gene_entities"][g_idx], [sentence.words[j] for j in g_wordidxs]) if line_dict["gene_is_corrects"][g_idx] == "n": gene_mention.is_correct = None elif line_dict["gene_is_corrects"][g_idx] == "f": gene_mention.is_correct = False elif line_dict["gene_is_corrects"][g_idx] == "t": gene_mention.is_correct = True else: assert False gene_mention.type = line_dict["gene_types"][g_idx] assert not gene_mention.type.endswith("_UNSUP") for h_idx in range(len(line_dict["hpoterm_is_corrects"])): h_wordidxs = TSVstring2list( line_dict["hpoterm_wordidxss"][h_idx], int) hpoterm_mention = Mention(
def extract(sentence):
    """Extract phenotype (HPO term) mention candidates from a sentence.

    The function works in three passes:

    1. Phrases found in ``genes_with_hpoterm`` become ``HPOTERM_SUP_GENEL``
       candidates labelled ``is_correct = False``.
    2. For the remaining phrases, the set of stems of their non-symbol,
       non-stopword words is looked up in ``hpoterms_dict``; a hit yields
       an ``HPOTERM`` candidate built from the matching words.
    3. If nothing was found, with probability ``NEG_PROB`` a randomly
       chosen word whose POS tag starts with "NN" becomes a
       ``HPOTERM_SUP_rand`` candidate labelled ``is_correct = False``.

    As a side effect every word of the sentence gets a ``stem`` attribute.

    Args:
        sentence: object exposing ``words``; each word exposes ``word``,
            ``lemma``, ``pos`` and ``in_sent_idx``.

    Returns:
        The list of created mentions; empty when the sentence contains no
        word longer than two characters that is found in ``english_dict``.
    """
    mentions = []
    mention_ids = set()
    words = sentence.words

    # If there are no English words in the sentence, we skip it. The loop
    # never exits early because every word must receive its stem.
    no_english_words = True
    for word in words:
        word.stem = stemmer.stem(word.word)
        if len(word.word) > 2 and \
                (word.word in english_dict or
                 word.word.casefold() in english_dict):
            no_english_words = False
    if no_english_words:
        return mentions

    # Sentence indexes of words already used by a mention.
    history = set()

    # First pass: gene long names containing a phenotype name are
    # candidates that we supervise as negative.
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        if start in history or end - 1 in history:
            continue
        phrase = " ".join([word.word for word in words[start:end]])
        if len(phrase) > 1 and phrase in genes_with_hpoterm:
            mention = Mention("HPOTERM_SUP_GENEL", phrase, words[start:end])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
            for word in words[start:end]:
                history.add(word.in_sent_idx)

    # Matches tokens made only of underscores and non-word characters.
    # Raw string: "\W" is not a valid escape in a plain string literal.
    symbols_only = re.compile(r"^(_|\W)+$")

    # Second pass: match the stems of each phrase against the HPO terms.
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        if any(i in history for i in range(start, end)):
            continue
        phrase_words = words[start:end]
        # The stems in the phrase (not from stopwords or symbols).
        # Single-character words are kept even when they are stopwords.
        phrase_stems_set = frozenset(
            word.stem for word in phrase_words
            if not symbols_only.match(word.word) and
            (len(word.word) == 1 or
             word.lemma.casefold() not in stopwords_dict))
        if phrase_stems_set not in hpoterms_dict:
            continue

        # Find the word objects that match: one word per stem, skipping
        # words whose lemma or stem has already been taken.
        mention_words = []
        seen_lemmas = set()
        seen_stems = set()
        for word in phrase_words:
            lemma = word.lemma.casefold()
            if word.stem in phrase_stems_set and \
                    lemma not in seen_lemmas and \
                    word.stem not in seen_stems:
                seen_lemmas.add(lemma)
                seen_stems.add(word.stem)
                mention_words.append(word)
                if len(mention_words) == len(phrase_stems_set):
                    break

        entity = list(hpoterms_dict[phrase_stems_set])[0]
        mention = Mention("HPOTERM", hponames_to_ids[entity] + "|" + entity,
                          mention_words)
        # The following is a way to avoid duplicates.
        # It's ugly and not perfect.
        if mention.id() in mention_ids:
            continue
        mention_ids.add(mention.id())
        add_features(mention, sentence)
        mentions.append(mention)
        for word in mention_words:
            history.add(word.in_sent_idx)

    # Generate some negative candidates at random, if this sentence didn't
    # contain any other candidate. We want the candidates to be nouns.
    if not mentions and random.random() <= NEG_PROB:
        last_index = len(words) - 1
        index = random.randint(0, last_index)
        # We may not get a noun at random, so we try again if we don't.
        tries = 10
        while not words[index].pos.startswith("NN") and tries > 0:
            index = random.randint(0, last_index)
            tries -= 1
        if words[index].pos.startswith("NN"):
            mention = Mention("HPOTERM_SUP_rand",
                              words[index].lemma.casefold(),
                              words[index:index + 1])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
    return mentions
def supervise(mentions, sentence):
    """Label gene mention candidates using heuristic supervision rules.

    Candidates whose ``is_correct`` is already set are passed through
    untouched. For every other candidate the rules below are tried in
    order; the first one that applies sets ``is_correct`` and ``type`` on
    the candidate (in place) and ends processing of that candidate.

    Args:
        mentions: iterable of candidates exposing ``words``, ``wordidxs``,
            ``entity``, ``features``, ``is_correct`` and ``type``.
        sentence: object exposing ``words``; each word exposes ``word``,
            ``lemma``, ``ner`` and ``in_sent_idx``.

    Returns:
        A new list holding every input candidate in order. When the
        sentence starts with a contribution/address phrase, each
        unlabelled candidate is followed by two extra mentions
        (``GENE_dontsup`` and ``GENE_SUP_contr_2``).
    """
    sent_words = sentence.words
    n_words = len(sent_words)
    phrase = " ".join([x.word for x in sent_words])
    # The sentence starts with words that are indicative of the candidates
    # not being gene mentions. This only depends on the sentence, so it is
    # computed once.
    in_contrib_phrase = (
        phrase.startswith((
            "Performed the experiments :",
            "Wrote the paper :",
            "W'rote the paper :",
            "Wlrote the paper",
            "Contributed reagents",
            "Analyzed the data :")) or
        phrase.casefold().startswith("address"))
    dna_letters = set("ACGT")
    ner_classes = ("ORGANIZATION", "LOCATION", "PERSON")

    new_mentions = []
    for mention in mentions:
        new_mentions.append(mention)
        if mention.is_correct is not None:
            continue

        # The candidate is a long name.
        if " ".join([word.word for word in mention.words]) in \
                inverted_long_names:
            mention.is_correct = True
            mention.type = "GENE_SUP_long"
            continue

        first_word = mention.words[0]

        # The candidate is a MIM entry: "MIM", optionally followed by one
        # of "no", "no.", "#", ":", and then an integer.
        if first_word.word == "MIM":
            mim_idx = first_word.in_sent_idx
            if mim_idx < n_words - 1:
                next_word = sent_words[mim_idx + 1].word
                if next_word.casefold() in ("no", "no.", "#", ":") and \
                        mim_idx + 2 < n_words:
                    next_word = sent_words[mim_idx + 2].word
                try:
                    int(next_word)
                except ValueError:
                    pass
                else:
                    mention.is_correct = False
                    mention.type = "GENE_SUP_MIM"
                    continue

        # We add a feature for the contribution phrase, as it is a context
        # property.
        if in_contrib_phrase:
            # An unsupervised copy with the special feature.
            unsuper_enriched = Mention(
                "GENE_dontsup", mention.entity, mention.words)
            unsuper_enriched.features = mention.features.copy()
            unsuper_enriched.add_feature("IN_CONTRIB_PHRASE")
            new_mentions.append(unsuper_enriched)
            # This candidate contains only the 'special' feature.
            super_spec = Mention(
                "GENE_SUP_contr_2", mention.entity, mention.words)
            super_spec.is_correct = False
            super_spec.add_feature("IN_CONTRIB_PHRASE")
            new_mentions.append(super_spec)
            mention.is_correct = False
            mention.type = "GENE_SUP_contr_1"
            continue

        # The candidate is an entry in Gene Ontology ("GO" followed by
        # ":"). The following token's text is compared, with an explicit
        # bounds check instead of a catch-all exception handler.
        # NOTE(review): a lone "GO" that is not followed by ":" is left
        # unlabelled and skips all remaining rules -- confirm intended.
        if len(mention.words) == 1 and first_word.word == "GO":
            next_idx = first_word.in_sent_idx + 1
            if next_idx < n_words and sent_words[next_idx].word == ":":
                mention.is_correct = False
                mention.type = "GENE_SUP_go"
            continue

        # Word on the left of the candidate.
        left_idx = mention.wordidxs[0] - 1
        if left_idx >= 0:
            left = sent_words[left_idx]
            # Preceded by a "%" (it's probably a quantity).
            if left.word == "%":
                mention.is_correct = False
                mention.type = "GENE_SUP_%"
                continue
            # Comes after a "document element" (e.g., table, or figure).
            if left.word.casefold() in DOC_ELEMENTS:
                mention.is_correct = False
                mention.type = "GENE_SUP_doc"
                continue
            # Comes after an "individual" word and is a short token that
            # is not purely alphabetic.
            if left.word.casefold() in INDIVIDUALS and \
                    not first_word.word.isalpha() and \
                    len(first_word.word) <= 4:
                mention.is_correct = False
                mention.type = "GENE_SUP_indiv"
                continue
            # Comes after a "type" word, and it is made only of the
            # letters "I" and "V".
            if left.lemma.casefold() in TYPES and \
                    set(first_word.word) <= {"I", "V"}:
                mention.is_correct = False
                mention.type = "GENE_SUP_type"
                continue

        # Word on the right of the candidate.
        right_idx = mention.wordidxs[-1] + 1
        if right_idx < n_words:
            right_word = sent_words[right_idx].word
            # Followed by a "=" (it's probably a quantity).
            if right_word == "=":
                mention.is_correct = False
                mention.type = "GENE_SUP_="
                continue
            # Followed by a ":" and then a number (it's probably a
            # quantity). A ":" that is not followed by a number still ends
            # processing of this candidate, leaving it unlabelled.
            if right_word == ":":
                if right_idx + 1 < n_words:
                    try:
                        float(sent_words[right_idx + 1].word)
                    except ValueError:
                        pass
                    else:
                        mention.is_correct = False
                        mention.type = "GENE_SUP_:"
                continue
            # The candidate comes before "et".
            if right_word == "et":
                mention.is_correct = False
                mention.type = "GENE_SUP_et"
                continue

        # The candidate is a DNA triplet. We check this by looking at
        # whether the word before or after is also made only of the
        # letters A, C, G, T. The left neighbour at index 0 is included.
        if len(mention.words) == 1 and len(first_word.word) == 3 and \
                set(first_word.word) <= dna_letters:
            if left_idx >= 0 and \
                    set(sent_words[left_idx].word) <= dna_letters:
                mention.is_correct = False
                mention.type = "GENE_SUP_dna"
                continue
            if right_idx < n_words and \
                    set(sent_words[right_idx].word) <= dna_letters:
                mention.is_correct = False
                mention.type = "GENE_SUP_dna"
                continue

        # If it's "II", it's most probably wrong.
        if first_word.word == "II":
            mention.is_correct = False
            mention.type = "GENE_SUP_ii"
            continue

        # The candidate comes after an organization, or a location, or a
        # person. We skip commas as they may trick us.
        comes_after = None
        loc_idx = mention.wordidxs[0] - 1
        while loc_idx >= 0 and sent_words[loc_idx].lemma == ",":
            loc_idx -= 1
        if loc_idx >= 0 and \
                sent_words[loc_idx].ner in ner_classes and \
                sent_words[loc_idx].word not in merged_genes_dict:
            comes_after = sent_words[loc_idx].ner

        # The candidate comes before an organization, or a location, or a
        # person. We skip commas, as they may trick us.
        comes_before = None
        loc_idx = mention.wordidxs[-1] + 1
        while loc_idx < n_words and sent_words[loc_idx].lemma == ",":
            loc_idx += 1
        if loc_idx < n_words and \
                sent_words[loc_idx].ner in ner_classes and \
                sent_words[loc_idx].word not in merged_genes_dict:
            comes_before = sent_words[loc_idx].ner

        after_last_idx = mention.words[-1].in_sent_idx + 1
        if comes_before and comes_after:
            # Surrounded by named entities: most probably a person name.
            mention.is_correct = False
            mention.type = "GENE_SUP_name"
        elif comes_after == "PERSON" and \
                after_last_idx < n_words and \
                sent_words[after_last_idx].word in (",", ":"):
            # Comes after a person and before "," or ":", so it's probably
            # a person name.
            mention.is_correct = False
            mention.type = "GENE_SUP_name2"
        elif comes_after == "PERSON" and first_word.ner == "PERSON":
            mention.is_correct = False
            mention.type = "GENE_SUP_name3"
        elif comes_before == "LOCATION" and first_word.ner == "LOCATION":
            # Is a location and comes before a location, so it's probably
            # wrong.
            mention.is_correct = False
            mention.type = "GENE_SUP_loc"
    return new_mentions
# Skip weird sentences if sentence.is_weird(): continue gene_mentions = [] hpoterm_mentions = [] positive_relations = [] gene_wordidxs = set() hpoterm_wordidxs = set() # Iterate over each pair of (gene,phenotype) mentions for g_idx in range(len(line_dict["gene_is_corrects"])): g_wordidxs = TSVstring2list( line_dict["gene_wordidxss"][g_idx], int) for idx in g_wordidxs: gene_wordidxs.add(idx) gene_mention = Mention( "GENE", line_dict["gene_entities"][g_idx], [sentence.words[j] for j in g_wordidxs]) if line_dict["gene_is_corrects"][g_idx] == "n": gene_mention.is_correct = None elif line_dict["gene_is_corrects"][g_idx] == "f": gene_mention.is_correct = False elif line_dict["gene_is_corrects"][g_idx] == "t": gene_mention.is_correct = True else: assert False gene_mention.type = line_dict["gene_types"][g_idx] assert not gene_mention.type.endswith("_UNSUP") gene_mentions.append(gene_mention) for h_idx in range(len(line_dict["hpoterm_is_corrects"])): h_wordidxs = TSVstring2list( line_dict["hpoterm_wordidxss"][h_idx], int)