示例#1
0
def get_input_sentences(input_files=None):
    """Yield a Sentence for every JSON line read from the input files.

    Each input line is parsed with ``json.loads`` and must be a JSON object
    providing the keys "doc_id", "sent_id", "wordidxs", "words", "poses",
    "ners", "lemmas", "dep_paths", "dep_parents" and "bounding_boxes".
    Their values are passed, in that order, to ``Sentence``.

    Args:
        input_files: File names handed to ``fileinput.input``. When omitted
            (``None``), ``sys.argv[1:]`` is used, read at call time.

    Yields:
        One ``Sentence`` per input line.

    Raises:
        ValueError: If a line is not valid JSON (raised by ``json.loads``).
        KeyError: If a parsed line lacks one of the expected keys.
    """
    if input_files is None:
        # Resolve the default when the generator actually runs. A default of
        # ``sys.argv[1:]`` in the signature would be frozen at definition
        # time and miss any later change to sys.argv.
        input_files = sys.argv[1:]
    # Order matters: these are the positional arguments of Sentence.
    sentence_keys = (
        "doc_id", "sent_id", "wordidxs", "words", "poses", "ners",
        "lemmas", "dep_paths", "dep_parents", "bounding_boxes",
    )
    with fileinput.input(files=input_files) as f:
        for line in f:
            sent_dict = json.loads(line)
            yield Sentence(*[sent_dict[key] for key in sentence_keys])
示例#2
0
     "doc_id", "sent_id", "wordidxs", "words", "poses", "ners",
     "lemmas", "dep_paths", "dep_parents", "bounding_boxes",
     "gene_1_entity", "gene_1_wordidxs", "gene_1_is_correct",
     "gene_1_type", "gene_2_entity", "gene_2_wordidxs",
     "gene_2_is_correct", "gene_2_type"
 ], [
     no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list,
     TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list,
     lambda x: TSVstring2list(x, int), TSVstring2list, no_op,
     lambda x: TSVstring2list(x, int), TSVstring2bool, no_op, no_op,
     lambda x: TSVstring2list(x, int), TSVstring2bool, no_op
 ])
 # Create the sentence object where the two mentions appear.
 # All ten arguments come straight from the parsed TSV columns in line_dict.
 sentence = Sentence(line_dict["doc_id"], line_dict["sent_id"],
                     line_dict["wordidxs"], line_dict["words"],
                     line_dict["poses"], line_dict["ners"],
                     line_dict["lemmas"], line_dict["dep_paths"],
                     line_dict["dep_parents"],
                     line_dict["bounding_boxes"])
 # Create the mentions.
 # Each mention is built from the entity string and the objects found in
 # sentence.words at the positions listed in the "*_wordidxs" column, so
 # those values are used directly as indexes into sentence.words.
 gene_1_mention = Mention(
     "GENE", line_dict["gene_1_entity"],
     [sentence.words[j] for j in line_dict["gene_1_wordidxs"]])
 # is_correct and type are not constructor arguments here; they are set
 # afterwards from the parsed columns.
 gene_1_mention.is_correct = line_dict["gene_1_is_correct"]
 gene_1_mention.type = line_dict["gene_1_type"]
 # Second mention: same construction, using the gene_2_* columns.
 gene_2_mention = Mention(
     "GENE", line_dict["gene_2_entity"],
     [sentence.words[j] for j in line_dict["gene_2_wordidxs"]])
 gene_2_mention.is_correct = line_dict["gene_2_is_correct"]
 gene_2_mention.type = line_dict["gene_2_type"]
 # If the word indexes do not overlap, create the relation candidate
 # TODO there may be other cases. Check with Emily.
示例#3
0

if __name__ == "__main__":
    # Read TSV lines from the files named on the command line (or stdin, per
    # fileinput's defaults) and call add_features once for every mention
    # whose sentence is not flagged by Sentence.is_weird().
    #
    # The input is processed exactly once. An earlier revision contained
    # this whole loop twice in a row, which re-read file arguments and
    # called add_features a second time for every line.
    with fileinput.input() as input_files:
        for line in input_files:
            # Parse the TSV line: column names first, then one converter per
            # column in the same order.
            line_dict = get_dict_from_TSVline(line, [
                "doc_id", "sent_id", "wordidxs", "words", "poses", "ners",
                "lemmas", "dep_paths", "dep_parents", "mention_id",
                "mention_wordidxs"
            ], [
                no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list,
                TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list,
                lambda x: TSVstring2list(x, int), no_op,
                lambda x: TSVstring2list(x, int)
            ])
            # Create the sentence object. This input has no column for the
            # final Sentence argument, so a list of None with one entry per
            # word index is passed in its place.
            null_list = [None] * len(line_dict["wordidxs"])
            sentence = Sentence(line_dict["doc_id"], line_dict["sent_id"],
                                line_dict["wordidxs"], line_dict["words"],
                                line_dict["poses"], line_dict["ners"],
                                line_dict["lemmas"], line_dict["dep_paths"],
                                line_dict["dep_parents"], null_list)
            # Skip sentences that the Sentence class itself reports as weird
            if sentence.is_weird():
                continue
            # Look up the mention's word objects by their position in the
            # sentence.
            mention_words = [
                sentence.words[mention_wordidx]
                for mention_wordidx in line_dict["mention_wordidxs"]
            ]
            add_features(line_dict["mention_id"], mention_words, sentence)
            # add_features_generic( line_dict["mention_id"], mention_words,
            # sentence)
示例#5
0
 # Load the dictionary registered under the name "merged_genes"; below it is
 # used to look up the labelled gene and replace it with the stored value.
 merged_genes_dict = load_dict("merged_genes")
 # Process the input
 with fileinput.input() as input_files:
     for line in input_files:
         # Parse the TSV line: five columns, each with its own converter
         # (doc_id and gene kept as-is, sent_id as int, wordidxs as a list
         # of ints, words as a list).
         line_dict = get_dict_from_TSVline(
             line, ["doc_id", "sent_id", "wordidxs", "words", "gene"], [
                 no_op, int, lambda x: TSVstring2list(x, int),
                 TSVstring2list, no_op
             ])
         # Create the Sentence object.
         # Only word indexes and words are available in this input; the six
         # remaining Sentence arguments receive a list of None with one
         # entry per word index.
         # NOTE(review): the *same* list object is passed six times; this is
         # only safe if Sentence does not mutate these lists in place --
         # confirm.
         null_list = [
             None,
         ] * len(line_dict["wordidxs"])
         sentence = Sentence(line_dict["doc_id"], line_dict["sent_id"],
                             line_dict["wordidxs"], line_dict["words"],
                             null_list, null_list, null_list, null_list,
                             null_list, null_list)
         # This is the 'labelled' gene that we know is in the sentence
         gene = line_dict["gene"]
         # Get the main symbol (or list of symbols) for the labelled gene.
         # When the gene is not in the dictionary it is wrapped in a
         # one-element list, so `gene` is a list in the fallback case
         # (presumably matching the shape of the dictionary values -- verify
         # against load_dict).
         if gene in merged_genes_dict:
             gene = merged_genes_dict[gene]
         else:
             gene = [
                 gene,
             ]
         # Skip sentences that are "( GENE )", as they give no info about
         # anything.
         if (sentence.words[0].word == "-LRB-" and
                 sentence.words[-1].word == "-RRB-") or \
            (sentence.words[0].word == "-LSB-" and
示例#6
0
 # Acronyms defined in the document.
 # Maps each acronym string to the set of definitions found for it across
 # all sentences processed by the loop below.
 acronyms = dict()
 # line_dict holds one entry per sentence in each of the per-sentence
 # columns ("sent_ids", "wordidxss", "wordss", ...); idx selects the same
 # sentence in every column.
 for idx in range(len(line_dict["sent_ids"])):
     # Each per-sentence entry is still a TSV-encoded string and is parsed
     # here; word indexes and dependency parents are converted to int.
     wordidxs = TSVstring2list(line_dict["wordidxss"][idx], int)
     words = TSVstring2list(line_dict["wordss"][idx])
     poses = TSVstring2list(line_dict["posess"][idx])
     ners = TSVstring2list(line_dict["nerss"][idx])
     lemmas = TSVstring2list(line_dict["lemmass"][idx])
     dep_paths = TSVstring2list(line_dict["dep_pathss"][idx])
     dep_parents = TSVstring2list(line_dict["dep_parentss"][idx],
                                  int)
     bounding_boxes = TSVstring2list(
         line_dict["bounding_boxess"][idx])
     # Create the Sentence object
     sentence = Sentence(line_dict["doc_id"],
                         line_dict["sent_ids"][idx], wordidxs,
                         words, poses, ners, lemmas, dep_paths,
                         dep_parents, bounding_boxes)
     # Extract the acronyms from the sentence.
     # Each extracted item is indexed by the keys "acronym" and
     # "definition".
     sen_acronyms = extract(sentence)
     for acronym in sen_acronyms:
         # Group definitions by acronym; using a set means a definition
         # repeated in several sentences is stored only once.
         if acronym["acronym"] not in acronyms:
             acronyms[acronym["acronym"]] = set()
         acronyms[acronym["acronym"]].add(acronym["definition"])
 # Classify the acronyms
 for acronym in acronyms:
     contains_kw = False
     is_correct = None
     for definition in acronyms[acronym]:
         # If the definition is in the gene dictionary, supervise as
         # correct
         if definition in merged_genes_dict: