relation.add_feature("GENE_2_NGRAM_RIGHT_1_[" + sentence.words[gene_2_end + 1].lemma + "]") if __name__ == "__main__": # Process input with fileinput.input() as input_files: for line in input_files: # Parse the TSV line line_dict = get_dict_from_TSVline(line, [ "doc_id", "sent_id", "wordidxs", "words", "poses", "ners", "lemmas", "dep_paths", "dep_parents", "bounding_boxes", "gene_1_entity", "gene_1_wordidxs", "gene_1_is_correct", "gene_1_type", "gene_2_entity", "gene_2_wordidxs", "gene_2_is_correct", "gene_2_type" ], [ no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, lambda x: TSVstring2list(x, int), TSVstring2list, no_op, lambda x: TSVstring2list(x, int), TSVstring2bool, no_op, no_op, lambda x: TSVstring2list(x, int), TSVstring2bool, no_op ]) # Create the sentence object where the two mentions appear sentence = Sentence(line_dict["doc_id"], line_dict["sent_id"], line_dict["wordidxs"], line_dict["words"], line_dict["poses"], line_dict["ners"], line_dict["lemmas"], line_dict["dep_paths"], line_dict["dep_parents"], line_dict["bounding_boxes"]) # Create the mentions gene_1_mention = Mention(
# There are many PERSONs/ORGANIZATIONs/LOCATIONs in the sentence
# for ner in ["PERSON", "ORGANIZATION", "LOCATION"]:
#     if [x.ner for x in sentence.words].count(ner) > 4:
#         print_feature(
#             sentence.doc_id, mention_id, "MANY_{}_IN_SENTENCE".format(ner))


if __name__ == "__main__":
    # Read TSV rows from the files named on the command line (stdin when
    # there are none); each row carries one sentence and one mention.
    with fileinput.input() as input_files:
        for line in input_files:
            # Column names and their converters, in TSV column order
            column_names = [
                "doc_id", "sent_id", "wordidxs", "words", "poses", "ners",
                "lemmas", "dep_paths", "dep_parents", "mention_id",
                "mention_wordidxs",
            ]
            column_parsers = [
                no_op,                             # doc_id
                int,                               # sent_id
                lambda x: TSVstring2list(x, int),  # wordidxs
                TSVstring2list,                    # words
                TSVstring2list,                    # poses
                TSVstring2list,                    # ners
                TSVstring2list,                    # lemmas
                TSVstring2list,                    # dep_paths
                lambda x: TSVstring2list(x, int),  # dep_parents
                no_op,                             # mention_id
                lambda x: TSVstring2list(x, int),  # mention_wordidxs
            ]
            line_dict = get_dict_from_TSVline(
                line, column_names, column_parsers)

            # Sentence is given ten positional arguments; the last one has
            # no input column here, so it receives one None per word index.
            null_list = [None] * len(line_dict["wordidxs"])
            sentence_keys = (
                "doc_id", "sent_id", "wordidxs", "words", "poses", "ners",
                "lemmas", "dep_paths", "dep_parents")
            sentence_args = [line_dict[key] for key in sentence_keys]
            sentence_args.append(null_list)
            sentence = Sentence(*sentence_args)
            if sentence.is_weird():
                continue

            # Entries of sentence.words selected by the mention's indexes
            mention_words = [
                sentence.words[mention_wordidx]
                for mention_wordidx in line_dict["mention_wordidxs"]]
import fileinput

from dstruct.Sentence import Sentence
from extract_gene_mentions import extract
from helper.easierlife import get_dict_from_TSVline, TSVstring2list, no_op
from helper.dictionaries import load_dict


if __name__ == "__main__":
    # Dictionary consulted below to replace the labelled gene with its
    # main symbol(s)
    merged_genes_dict = load_dict("merged_genes")
    # Read TSV rows from the files named on the command line (stdin when
    # there are none)
    with fileinput.input() as input_files:
        for line in input_files:
            # Column names and their converters, in TSV column order
            column_names = ["doc_id", "sent_id", "wordidxs", "words", "gene"]
            column_parsers = [
                no_op,                             # doc_id
                int,                               # sent_id
                lambda x: TSVstring2list(x, int),  # wordidxs
                TSVstring2list,                    # words
                no_op,                             # gene
            ]
            line_dict = get_dict_from_TSVline(
                line, column_names, column_parsers)

            # Only the first four Sentence arguments have input columns; the
            # remaining six all receive the same list of None (one entry per
            # word index).
            null_list = [None] * len(line_dict["wordidxs"])
            sentence_args = [
                line_dict[key]
                for key in ("doc_id", "sent_id", "wordidxs", "words")]
            sentence_args.extend([null_list] * 6)
            sentence = Sentence(*sentence_args)

            # This is the 'labelled' gene that we know is in the sentence
            gene = line_dict["gene"]
            # Use the dictionary entry for the labelled gene when there is
            # one; otherwise wrap the raw label in a one-element list
            gene = (
                merged_genes_dict[gene] if gene in merged_genes_dict
                else [gene])
            # Skip sentences that are "( GENE )", as they give no info about
# for ner in ["PERSON", "ORGANIZATION", "LOCATION"]:
#     if [x.ner for x in sentence.words].count(ner) > 4:
#         print_feature(
#             sentence.doc_id, mention_id, "MANY_{}_IN_SENTENCE".format(ner))


if __name__ == "__main__":
    # Read TSV rows from the files named on the command line (stdin when
    # there are none); each row carries one sentence and one mention.
    with fileinput.input() as input_files:
        for line in input_files:
            # Column names and their converters, in TSV column order
            column_names = [
                "doc_id", "sent_id", "wordidxs", "words", "poses", "ners",
                "lemmas", "dep_paths", "dep_parents", "mention_id",
                "mention_wordidxs",
            ]
            column_parsers = [
                no_op,                             # doc_id
                int,                               # sent_id
                lambda x: TSVstring2list(x, int),  # wordidxs
                TSVstring2list,                    # words
                TSVstring2list,                    # poses
                TSVstring2list,                    # ners
                TSVstring2list,                    # lemmas
                TSVstring2list,                    # dep_paths
                lambda x: TSVstring2list(x, int),  # dep_parents
                no_op,                             # mention_id
                lambda x: TSVstring2list(x, int),  # mention_wordidxs
            ]
            line_dict = get_dict_from_TSVline(
                line, column_names, column_parsers)

            # Sentence is given ten positional arguments; the last one has
            # no input column here, so it receives one None per word index.
            null_list = [None] * len(line_dict["wordidxs"])
            sentence_keys = (
                "doc_id", "sent_id", "wordidxs", "words", "poses", "ners",
                "lemmas", "dep_paths", "dep_parents")
            sentence_args = [line_dict[key] for key in sentence_keys]
            sentence_args.append(null_list)
            sentence = Sentence(*sentence_args)
            if sentence.is_weird():
                continue
from dstruct.Sentence import Sentence from extract_gene_mentions import extract, add_features from helper.easierlife import get_dict_from_TSVline, TSVstring2list, no_op from helper.dictionaries import load_dict if __name__ == "__main__": # Load the merged genes dictionary merged_genes_dict = load_dict("merged_genes") # Process the input with fileinput.input() as input_files: for line in input_files: # Parse the TSV line line_dict = get_dict_from_TSVline( line, ["doc_id", "sent_id", "wordidxs", "words", "poses", "ners", "lemmas", "dep_paths", "dep_parents", "bounding_boxes", "gene"], [no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, lambda x: TSVstring2list(x, int), TSVstring2list, no_op]) # Create the Sentence object sentence = Sentence( line_dict["doc_id"], line_dict["sent_id"], line_dict["wordidxs"], line_dict["words"], line_dict["poses"], line_dict["ners"], line_dict["lemmas"], line_dict["dep_paths"], line_dict["dep_parents"], line_dict["bounding_boxes"]) # This is the 'labelled' gene that we know is in the sentence gene = line_dict["gene"] # Get the main symbol (or list of symbols) for the labelled gene if gene in merged_genes_dict: gene = merged_genes_dict[gene] else:
relation.add_feature("GENE_2_NGRAM_RIGHT_1_[" + sentence.words[gene_2_end+1].lemma + "]") if __name__ == "__main__": # Process input with fileinput.input() as input_files: for line in input_files: # Parse the TSV line line_dict = get_dict_from_TSVline( line, ["doc_id", "sent_id", "wordidxs", "words", "poses", "ners", "lemmas", "dep_paths", "dep_parents", "bounding_boxes", "gene_1_entity", "gene_1_wordidxs", "gene_1_is_correct", "gene_1_type", "gene_2_entity", "gene_2_wordidxs", "gene_2_is_correct", "gene_2_type"], [no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, lambda x: TSVstring2list(x, int), TSVstring2list, no_op, lambda x: TSVstring2list(x, int), TSVstring2bool, no_op, no_op, lambda x: TSVstring2list(x, int), TSVstring2bool, no_op]) # Create the sentence object where the two mentions appear sentence = Sentence( line_dict["doc_id"], line_dict["sent_id"], line_dict["wordidxs"], line_dict["words"], line_dict["poses"], line_dict["ners"], line_dict["lemmas"], line_dict["dep_paths"], line_dict["dep_parents"], line_dict["bounding_boxes"]) # Create the mentions gene_1_mention = Mention( "GENE", line_dict["gene_1_entity"],
# Load the gene<->hpoterm dictionary genehpoterms_dict = load_dict("genehpoterms") if __name__ == "__main__": # Process input with fileinput.input() as input_files: for line in input_files: # Parse the TSV line line_dict = get_dict_from_TSVline( line, ["doc_id", "sent_id", "wordidxs", "words", "poses", "ners", "lemmas", "dep_paths", "dep_parents", "bounding_boxes", "gene_entities", "gene_wordidxss", "gene_is_corrects", "gene_types", "hpoterm_entities", "hpoterm_wordidxss", "hpoterm_is_corrects", "hpoterm_types"], [no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list, lambda x: TSVstring2list(x, int), TSVstring2list, # these are for the sentence TSVstring2list, lambda x: TSVstring2list(x, sep="!~!"), TSVstring2list, TSVstring2list, # these are for the genes TSVstring2list, lambda x: TSVstring2list(x, sep="!~!"), TSVstring2list, TSVstring2list, # these are for the HPO ]) # Remove the genes that are unsupervised copies or duplicates supervised_idxs = set() unsupervised_idxs = set() for i in range(len(line_dict["gene_is_corrects"])): if line_dict["gene_is_corrects"][i] == "n": unsupervised_idxs.add(i) else: if line_dict["gene_types"][i] != "GENE_SUP_contr_2":
# Load the genes dictionary
merged_genes_dict = load_dict("merged_genes")
inverted_long_names = load_dict("inverted_long_names")


if __name__ == "__main__":
    # Read TSV rows from the files named on the command line (stdin when
    # there are none); each row packs all sentences of one document.
    with fileinput.input() as input_files:
        for line in input_files:
            # Column names, in TSV column order
            column_names = [
                "doc_id", "sent_ids", "wordidxss", "wordss", "posess",
                "nerss", "lemmass", "dep_pathss", "dep_parentss",
                "bounding_boxess",
            ]
            # doc_id is kept as-is and sent_ids becomes a list of ints;
            # every remaining column is first split on the '!~!' separator
            # and its pieces are parsed further in the loop below.
            column_parsers = [no_op, lambda x: TSVstring2list(x, int)]
            column_parsers += [
                lambda x: TSVstring2list(x, sep='!~!')
                for _ in column_names[2:]]
            line_dict = get_dict_from_TSVline(
                line, column_names, column_parsers)

            # Acronyms defined in the document
            acronyms = {}
            for idx, _ in enumerate(line_dict["sent_ids"]):
                # Word indexes are ints; the other per-sentence fields are
                # parsed with TSVstring2list's default conversion.
                wordidxs = TSVstring2list(line_dict["wordidxss"][idx], int)
                words, poses, ners, lemmas, dep_paths = (
                    TSVstring2list(line_dict[key][idx])
                    for key in ("wordss", "posess", "nerss", "lemmass",
                                "dep_pathss"))
# Parse the TSV line: ten sentence columns, then four gene columns, then
# four HPO term columns.
sentence_columns = [
    "doc_id", "sent_id", "wordidxs", "words", "poses", "ners", "lemmas",
    "dep_paths", "dep_parents", "bounding_boxes",
]
gene_columns = [
    "gene_entities", "gene_wordidxss", "gene_is_corrects", "gene_types",
]
hpoterm_columns = [
    "hpoterm_entities", "hpoterm_wordidxss", "hpoterm_is_corrects",
    "hpoterm_types",
]
# these are for the sentence
sentence_parsers = [
    no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list,
    TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list,
    lambda x: TSVstring2list(x, int), TSVstring2list,
]
# these are shared by the gene and the HPO columns: the word-index column
# is split on the "!~!" separator, the others with the default settings
mention_parsers = [
    TSVstring2list, lambda x: TSVstring2list(x, sep="!~!"),
    TSVstring2list, TSVstring2list,
]
line_dict = get_dict_from_TSVline(
    line,
    sentence_columns + gene_columns + hpoterm_columns,
    sentence_parsers + mention_parsers + mention_parsers)
# Remove the genes that are unsupervised copies or duplicates
from dstruct.Sentence import Sentence
from extract_gene_mentions import extract
from helper.easierlife import get_dict_from_TSVline, TSVstring2list, no_op
from helper.dictionaries import load_dict


if __name__ == "__main__":
    # Dictionary consulted below to replace the labelled gene with its
    # main symbol(s)
    merged_genes_dict = load_dict("merged_genes")
    # Read TSV rows from the files named on the command line (stdin when
    # there are none)
    with fileinput.input() as input_files:
        for line in input_files:
            # Column names and their converters, in TSV column order
            column_names = ["doc_id", "sent_id", "wordidxs", "words", "gene"]
            column_parsers = [
                no_op,                             # doc_id
                int,                               # sent_id
                lambda x: TSVstring2list(x, int),  # wordidxs
                TSVstring2list,                    # words
                no_op,                             # gene
            ]
            line_dict = get_dict_from_TSVline(
                line, column_names, column_parsers)

            # Only the first four Sentence arguments have input columns; the
            # remaining six all receive the same list of None (one entry per
            # word index).
            null_list = [None] * len(line_dict["wordidxs"])
            sentence_args = [
                line_dict[key]
                for key in ("doc_id", "sent_id", "wordidxs", "words")]
            sentence_args.extend([null_list] * 6)
            sentence = Sentence(*sentence_args)

            # This is the 'labelled' gene that we know is in the sentence
            gene = line_dict["gene"]
            # Get the main symbol (or list of symbols) for the labelled gene
            if gene in merged_genes_dict:
                gene = merged_genes_dict[gene]
mentions.append(mention) # Add indexes to history so that they are not used for another # mention for i in range(start, end): history.add(i) return mentions if __name__ == "__main__": # Process the input with fileinput.input() as input_files: for line in input_files: # Parse the TSV line line_dict = get_dict_from_TSVline( line, ["doc_id", "sent_id", "wordidxs", "words", "poses", "ners", "lemmas"], [no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list]) # Create the sentence object null_list = [None, ] * len(line_dict["wordidxs"]) sentence = Sentence( line_dict["doc_id"], line_dict["sent_id"], line_dict["wordidxs"], line_dict["words"], line_dict["poses"], line_dict["ners"], line_dict["lemmas"], null_list, null_list, null_list) # Skip weird sentences if sentence.is_weird(): continue # Get list of mentions candidates in this sentence mentions = extract(sentence) # Supervise them
# Load the genes dictionary
merged_genes_dict = load_dict("merged_genes")
inverted_long_names = load_dict("inverted_long_names")


if __name__ == "__main__":
    # Read TSV rows from the files named on the command line (stdin when
    # there are none); each row packs all sentences of one document.
    with fileinput.input() as input_files:
        for line in input_files:
            # Column names, in TSV column order
            column_names = [
                "doc_id", "sent_ids", "wordidxss", "wordss", "posess",
                "nerss", "lemmass", "dep_pathss", "dep_parentss",
                "bounding_boxess",
            ]
            # doc_id is kept as-is and sent_ids becomes a list of ints;
            # every remaining column is first split on the '!~!' separator
            # and its pieces are parsed further in the loop below.
            column_parsers = [no_op, lambda x: TSVstring2list(x, int)]
            column_parsers += [
                lambda x: TSVstring2list(x, sep='!~!')
                for _ in column_names[2:]]
            line_dict = get_dict_from_TSVline(
                line, column_names, column_parsers)

            # Acronyms defined in the document
            acronyms = {}
            for idx, _ in enumerate(line_dict["sent_ids"]):
                # Word indexes are ints; the other per-sentence fields are
                # parsed with TSVstring2list's default conversion.
                wordidxs = TSVstring2list(line_dict["wordidxss"][idx], int)
                words, poses, ners, lemmas, dep_paths = (
                    TSVstring2list(line_dict[key][idx])
                    for key in ("wordss", "posess", "nerss", "lemmass",
                                "dep_pathss"))