hpoterm_mention.is_correct = True else: assert False hpoterm_mention.type = line_dict["hpoterm_types"][h_idx] assert not hpoterm_mention.type.endswith("_UNSUP") # Skip if the word indexes overlap if set(g_wordidxs) & set(h_wordidxs): continue # Skip if the mentions are too far away gene_start = gene_mention.wordidxs[0] hpoterm_start = hpoterm_mention.wordidxs[0] gene_end = gene_mention.wordidxs[-1] hpoterm_end = hpoterm_mention.wordidxs[-1] limits = sorted( (gene_start, hpoterm_start, gene_end, hpoterm_end)) start = limits[0] betw_start = limits[1] betw_end = limits[2] if betw_end - betw_start > 50: continue relation = Relation( "GENEHPOTERM", gene_mention, hpoterm_mention) # Add features add_features(relation, gene_mention, hpoterm_mention, sentence) # Supervise supervise(relation, gene_mention, hpoterm_mention, sentence) # Print! print(relation.tsv_dump())
# Rebuild both gene mentions from the parsed input row. Each mention gets the
# entity string and the sentence words selected by its word indexes; the
# correctness label and the type are copied over verbatim from the row.
gene_1_entity = line_dict["gene_1_entity"]
gene_1_words = [sentence.words[j] for j in line_dict["gene_1_wordidxs"]]
gene_1_mention = Mention("GENE", gene_1_entity, gene_1_words)
gene_1_mention.is_correct = line_dict["gene_1_is_correct"]
gene_1_mention.type = line_dict["gene_1_type"]

gene_2_entity = line_dict["gene_2_entity"]
gene_2_words = [sentence.words[j] for j in line_dict["gene_2_wordidxs"]]
gene_2_mention = Mention("GENE", gene_2_entity, gene_2_words)
gene_2_mention.is_correct = line_dict["gene_2_is_correct"]
gene_2_mention.type = line_dict["gene_2_type"]

# A relation candidate is only emitted when the two mentions share no word
# index.
# TODO there may be other cases. Check with Emily.
shared_wordidxs = (
    set(line_dict["gene_1_wordidxs"]) & set(line_dict["gene_2_wordidxs"]))
if not shared_wordidxs:
    relation = Relation("GENEGENE", gene_1_mention, gene_2_mention)

    # Attach the features for this candidate.
    add_features(relation, gene_1_mention, gene_2_mention, sentence)

    # Supervision: when at least one mention is explicitly labelled False
    # (identity check, so None / unset labels do not count), the relation
    # is labelled False too. No copy of the relation is made here because an
    # unsupervised copy will already be built from the unsupervised copies
    # of the mentions.
    either_mention_false = (
        gene_1_mention.is_correct is False
        or gene_2_mention.is_correct is False)
    if either_mention_false:
        relation.is_correct = False
        relation.type = "GENEGENE_SUP_F"
    # TODO Check in Emily's code how to supervise as True

    # Emit the candidate as a TSV row on stdout.
    print(relation.tsv_dump())
(gene_start, hpoterm_start, gene_end, hpoterm_end)) start = limits[0] betw_start = limits[1] betw_end = limits[2] if betw_end - betw_start > 50: continue relation = Relation( "GENEPHENO", gene_mention, hpoterm_mention) # Supervise supervise(relation, gene_mention, hpoterm_mention, sentence) if relation.is_correct: positive_relations.append( (gene_mention, hpoterm_mention)) # Print! print(relation.tsv_dump()) # Create some artificial negative examples: # for each (gene, phenotype) pair that is labelled as positive # example, select one word w in the same sentence that (1) is not a # gene mention candidate and (2) is not a phenotype mention # candidate, add (gene, w) and (w, phenotype) as negative example avail_wordidxs = ( set(line_dict["wordidxs"]) - set(hpoterm_wordidxs)) - \ set(gene_wordidxs) avail_wordidxs = list(avail_wordidxs) if len(avail_wordidxs) > 0: fake_rels = [] for (gene_mention, hpoterm_mention) in positive_relations: other_word = sentence.words[random.choice(avail_wordidxs)] fake_gene_mention = Mention( "FAKE_GENE", other_word.lemma, [other_word, ])