def get_input_sentences(input_files=sys.argv[1:]):
    """Yield one Sentence for every JSON-encoded line of the input files.

    Each line read through ``fileinput`` is decoded with ``json.loads`` and
    the values stored under the keys listed in ``sentence_fields`` are passed
    positionally, in that order, to the ``Sentence`` constructor.

    Args:
        input_files: files handed to ``fileinput.input``. The default is the
            slice of ``sys.argv`` taken when this function is defined.

    Yields:
        A ``Sentence`` built from each decoded line.

    Raises:
        KeyError: if a decoded line lacks one of the expected keys.
    """
    # Positional order expected by the Sentence constructor call.
    sentence_fields = (
        "doc_id",
        "sent_id",
        "wordidxs",
        "words",
        "poses",
        "ners",
        "lemmas",
        "dep_paths",
        "dep_parents",
        "bounding_boxes",
    )
    with fileinput.input(files=input_files) as stream:
        for raw_line in stream:
            record = json.loads(raw_line)
            yield Sentence(*[record[field] for field in sentence_fields])
"doc_id", "sent_id", "wordidxs", "words", "poses", "ners", "lemmas", "dep_paths", "dep_parents", "bounding_boxes", "gene_1_entity", "gene_1_wordidxs", "gene_1_is_correct", "gene_1_type", "gene_2_entity", "gene_2_wordidxs", "gene_2_is_correct", "gene_2_type" ], [ no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, lambda x: TSVstring2list(x, int), TSVstring2list, no_op, lambda x: TSVstring2list(x, int), TSVstring2bool, no_op, no_op, lambda x: TSVstring2list(x, int), TSVstring2bool, no_op ]) # Create the sentence object where the two mentions appear sentence = Sentence(line_dict["doc_id"], line_dict["sent_id"], line_dict["wordidxs"], line_dict["words"], line_dict["poses"], line_dict["ners"], line_dict["lemmas"], line_dict["dep_paths"], line_dict["dep_parents"], line_dict["bounding_boxes"]) # Create the mentions gene_1_mention = Mention( "GENE", line_dict["gene_1_entity"], [sentence.words[j] for j in line_dict["gene_1_wordidxs"]]) gene_1_mention.is_correct = line_dict["gene_1_is_correct"] gene_1_mention.type = line_dict["gene_1_type"] gene_2_mention = Mention( "GENE", line_dict["gene_2_entity"], [sentence.words[j] for j in line_dict["gene_2_wordidxs"]]) gene_2_mention.is_correct = line_dict["gene_2_is_correct"] gene_2_mention.type = line_dict["gene_2_type"] # If the word indexes do not overlap, create the relation candidate # TODO there may be other cases. Check with Emily.
if __name__ == "__main__":

    def _int_list(field):
        """Call TSVstring2list on ``field`` with ``int`` as second argument."""
        return TSVstring2list(field, int)

    # Each input column paired with the converter given to
    # get_dict_from_TSVline, in the order the columns are listed.
    COLUMNS = (
        ("doc_id", no_op),
        ("sent_id", int),
        ("wordidxs", _int_list),
        ("words", TSVstring2list),
        ("poses", TSVstring2list),
        ("ners", TSVstring2list),
        ("lemmas", TSVstring2list),
        ("dep_paths", TSVstring2list),
        ("dep_parents", _int_list),
        ("mention_id", no_op),
        ("mention_wordidxs", _int_list),
    )

    # Read every line from the files named on the command line (or stdin).
    with fileinput.input() as input_files:
        for line in input_files:
            # Fresh key/converter lists are built for every parsed line.
            line_dict = get_dict_from_TSVline(
                line,
                [name for name, _ in COLUMNS],
                [convert for _, convert in COLUMNS])

            # The last Sentence argument is not part of this input, so a
            # list of None with one entry per word index stands in for it.
            placeholders = [None] * len(line_dict["wordidxs"])
            sentence = Sentence(
                line_dict["doc_id"],
                line_dict["sent_id"],
                line_dict["wordidxs"],
                line_dict["words"],
                line_dict["poses"],
                line_dict["ners"],
                line_dict["lemmas"],
                line_dict["dep_paths"],
                line_dict["dep_parents"],
                placeholders)

            # Sentences flagged by is_weird() produce no features.
            if sentence.is_weird():
                continue

            # Collect the sentence's word objects that make up the mention.
            mention_words = [
                sentence.words[idx] for idx in line_dict["mention_wordidxs"]
            ]
            add_features(line_dict["mention_id"], mention_words, sentence)
            # Alternative call, currently disabled:
            # add_features_generic(line_dict["mention_id"], mention_words,
            #                      sentence)
def _parse_int_list(field):
    """Call TSVstring2list on ``field`` with ``int`` as second argument."""
    return TSVstring2list(field, int)


# (column name, converter) pairs in the order the TSV columns are parsed.
_COLUMNS = (
    ("doc_id", no_op),
    ("sent_id", int),
    ("wordidxs", _parse_int_list),
    ("words", TSVstring2list),
    ("poses", TSVstring2list),
    ("ners", TSVstring2list),
    ("lemmas", TSVstring2list),
    ("dep_paths", TSVstring2list),
    ("dep_parents", _parse_int_list),
    ("mention_id", no_op),
    ("mention_wordidxs", _parse_int_list),
)

# Process every line of the files named on the command line (or stdin).
with fileinput.input() as input_files:
    for line in input_files:
        # Split the pairs into the two parallel lists the parser expects;
        # new list objects are created for each line.
        field_names, field_parsers = (
            list(column) for column in zip(*_COLUMNS))
        line_dict = get_dict_from_TSVline(line, field_names, field_parsers)

        # One None per word index fills the final Sentence argument, which
        # this input does not provide.
        null_list = [None for _ in line_dict["wordidxs"]]
        sentence = Sentence(
            line_dict["doc_id"],
            line_dict["sent_id"],
            line_dict["wordidxs"],
            line_dict["words"],
            line_dict["poses"],
            line_dict["ners"],
            line_dict["lemmas"],
            line_dict["dep_paths"],
            line_dict["dep_parents"],
            null_list)

        # Only sentences that is_weird() does not flag get features.
        if not sentence.is_weird():
            mention_words = [
                sentence.words[position]
                for position in line_dict["mention_wordidxs"]
            ]
            add_features(line_dict["mention_id"], mention_words, sentence)
            # Alternative call, currently disabled:
            # add_features_generic(line_dict["mention_id"], mention_words,
            #                      sentence)
merged_genes_dict = load_dict("merged_genes") # Process the input with fileinput.input() as input_files: for line in input_files: # Parse the TSV line line_dict = get_dict_from_TSVline( line, ["doc_id", "sent_id", "wordidxs", "words", "gene"], [ no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list, no_op ]) # Create the Sentence object null_list = [ None, ] * len(line_dict["wordidxs"]) sentence = Sentence(line_dict["doc_id"], line_dict["sent_id"], line_dict["wordidxs"], line_dict["words"], null_list, null_list, null_list, null_list, null_list, null_list) # This is the 'labelled' gene that we know is in the sentence gene = line_dict["gene"] # Get the main symbol (or list of symbols) for the labelled gene if gene in merged_genes_dict: gene = merged_genes_dict[gene] else: gene = [ gene, ] # Skip sentences that are "( GENE )", as they give no info about # anything. if (sentence.words[0].word == "-LRB-" and sentence.words[-1].word == "-RRB-") or \ (sentence.words[0].word == "-LSB-" and
# Acronyms defined in the document acronyms = dict() for idx in range(len(line_dict["sent_ids"])): wordidxs = TSVstring2list(line_dict["wordidxss"][idx], int) words = TSVstring2list(line_dict["wordss"][idx]) poses = TSVstring2list(line_dict["posess"][idx]) ners = TSVstring2list(line_dict["nerss"][idx]) lemmas = TSVstring2list(line_dict["lemmass"][idx]) dep_paths = TSVstring2list(line_dict["dep_pathss"][idx]) dep_parents = TSVstring2list(line_dict["dep_parentss"][idx], int) bounding_boxes = TSVstring2list( line_dict["bounding_boxess"][idx]) # Create the Sentence object sentence = Sentence(line_dict["doc_id"], line_dict["sent_ids"][idx], wordidxs, words, poses, ners, lemmas, dep_paths, dep_parents, bounding_boxes) # Extract the acronyms from the sentence sen_acronyms = extract(sentence) for acronym in sen_acronyms: if acronym["acronym"] not in acronyms: acronyms[acronym["acronym"]] = set() acronyms[acronym["acronym"]].add(acronym["definition"]) # Classify the acronyms for acronym in acronyms: contains_kw = False is_correct = None for definition in acronyms[acronym]: # If the definition is in the gene dictionary, supervise as # correct if definition in merged_genes_dict: