hpoterm_mention.is_correct = True
                    else:
                        assert False
                    hpoterm_mention.type = line_dict["hpoterm_types"][h_idx]
                    assert not hpoterm_mention.type.endswith("_UNSUP")
                    # Skip if the word indexes overlab
                    if set(g_wordidxs) & set(h_wordidxs):
                        continue
                    # Skip if the mentions are too far away
                    gene_start = gene_mention.wordidxs[0]
                    hpoterm_start = hpoterm_mention.wordidxs[0]
                    gene_end = gene_mention.wordidxs[-1]
                    hpoterm_end = hpoterm_mention.wordidxs[-1]
                    limits = sorted(
                        (gene_start, hpoterm_start, gene_end, hpoterm_end))
                    start = limits[0]
                    betw_start = limits[1]
                    betw_end = limits[2]
                    if betw_end - betw_start > 50:
                        continue
                    relation = Relation(
                        "GENEHPOTERM", gene_mention, hpoterm_mention)
                    # Add features
                    add_features(relation, gene_mention, hpoterm_mention,
                                 sentence)
                    # Supervise
                    supervise(relation, gene_mention, hpoterm_mention,
                              sentence)
                    # Print!
                    print(relation.tsv_dump())
Exemplo n.º 2
0
            # Wrap the two gene candidates of this input row into Mention
            # objects, copying their label and type from the row.
            gene_1_mention = Mention(
                "GENE",
                line_dict["gene_1_entity"],
                [sentence.words[j] for j in line_dict["gene_1_wordidxs"]])
            gene_1_mention.is_correct = line_dict["gene_1_is_correct"]
            gene_1_mention.type = line_dict["gene_1_type"]
            gene_2_mention = Mention(
                "GENE",
                line_dict["gene_2_entity"],
                [sentence.words[j] for j in line_dict["gene_2_wordidxs"]])
            gene_2_mention.is_correct = line_dict["gene_2_is_correct"]
            gene_2_mention.type = line_dict["gene_2_type"]
            # A relation candidate is created only when the two mentions
            # share no word index.
            # TODO there may be other cases. Check with Emily.
            if set(line_dict["gene_1_wordidxs"]).isdisjoint(
                    line_dict["gene_2_wordidxs"]):
                relation = Relation("GENEGENE", gene_1_mention, gene_2_mention)
                add_features(
                    relation, gene_1_mention, gene_2_mention, sentence)
                # Supervision: if at least one of the two mentions is
                # labelled as False, the relation is labelled as False too.
                # No copy is created in this case because there will
                # already be an unsupervised copy built on the unsupervised
                # copies of the mentions.
                if any(mention.is_correct is False
                       for mention in (gene_1_mention, gene_2_mention)):
                    relation.is_correct = False
                    relation.type = "GENEGENE_SUP_F"
                # TODO Check in Emily's code how to supervise as True
                # Emit the candidate as a TSV row on standard output
                print(relation.tsv_dump())
Exemplo n.º 3
0
             (gene_start, hpoterm_start, gene_end, hpoterm_end))
         start = limits[0]
         betw_start = limits[1]
         betw_end = limits[2]
         if betw_end - betw_start > 50:
             continue
         relation = Relation(
             "GENEPHENO", gene_mention, hpoterm_mention)
         # Supervise
         supervise(relation, gene_mention, hpoterm_mention,
                   sentence)
         if relation.is_correct:
             positive_relations.append(
                 (gene_mention, hpoterm_mention))
         # Print!
         print(relation.tsv_dump())
 # Create some artificial negative examples:
 # for each (gene, phenotype) pair that is labelled as a positive
 # example, select one word w in the same sentence that (1) is not a
 # gene mention candidate and (2) is not a phenotype mention
 # candidate, and add (gene, w) and (w, phenotype) as negative examples.
 #
 # The word indexes available for picking w are the indexes listed in
 # line_dict["wordidxs"] minus those in hpoterm_wordidxs and gene_wordidxs
 # (presumably the indexes covered by the phenotype and gene mention
 # candidates of this sentence -- confirm where those are built).
 avail_wordidxs = (
     set(line_dict["wordidxs"]) - set(hpoterm_wordidxs)) - \
     set(gene_wordidxs)
 # Turn the set into a list: random.choice below needs an indexable
 # sequence.
 avail_wordidxs = list(avail_wordidxs)
 if len(avail_wordidxs) > 0:
     fake_rels = []
     for (gene_mention, hpoterm_mention) in positive_relations:
         other_word = sentence.words[random.choice(avail_wordidxs)]
         fake_gene_mention = Mention(
             "FAKE_GENE", other_word.lemma, [other_word, ])