def main():
    # Load the dictionaries we need
    stopwords_dict = load_dict("stopwords")
    hpoterm_phenotype_abnormalities = load_dict(
        "hpoterm_phenotype_abnormalities")
    # Load the stemmer from NLTK
    stemmer = SnowballStemmer("english")

    if len(sys.argv) != 2:
        sys.stderr.write("USAGE: {} DICT\n".format(sys.argv[0]))
        sys.exit(1)

    with open(sys.argv[1], 'rt') as dict_file:
        for line in dict_file:
            # Skip empty lines
            if line.strip() == "":
                continue
            hpo_id, name, definition = line.strip().split("\t")
            # Skip if this is not a phenotypic abnormality
            if hpo_id not in hpoterm_phenotype_abnormalities:
                continue
            tokens = name.split()
            if len(tokens) == 1:
                name_stems = [tokens[0].casefold(), ]
            else:
                # Compute the stems of the name
                name_stems = set()
                for word in tokens:
                    # Remove parentheses, commas, and colons
                    if word[0] == "(":
                        word = word[1:]
                    if word[-1] == ")":
                        word = word[:-1]
                    if word[-1] == ",":
                        word = word[:-1]
                    if word[-1] == ":":
                        word = word[:-1]
                    # Only process words that are neither stop words nor
                    # ordinals, or that are single letters
                    if (word.casefold() not in stopwords_dict and
                            word not in ORDINALS) or len(word) == 1:
                        # Split words that contain a "/"
                        if word.find("/") != -1:
                            for part in word.split("/"):
                                name_stems.add(stemmer.stem(part))
                        else:
                            name_stems.add(stemmer.stem(word))
            print("\t".join([hpo_id, name, "|".join(name_stems)]))
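# A minimal, self-contained sketch of the stemming performed by main() above,
# kept here for reference. It assumes only NLTK; the example name and the
# stop-word set are illustrative stand-ins, not the project's dictionaries.
from nltk.stem.snowball import SnowballStemmer

_stemmer = SnowballStemmer("english")
_example_name = "Abnormality of the skeletal system"   # made-up input
_example_stopwords = {"of", "the"}                      # stand-in stop words
_stems = {_stemmer.stem(w.casefold()) for w in _example_name.split()
          if w.casefold() not in _example_stopwords}
print("|".join(sorted(_stems)))  # abnorm|skelet|system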
#! /usr/bin/env python3

from helper.dictionaries import load_dict

if __name__ == "__main__":
    merged_genes_dict = load_dict("merged_genes")
    inverted_long_names = load_dict("inverted_long_names")
    hpoterms_orig = load_dict("hpoterms_orig")

    for long_name in inverted_long_names:
        for hpoterm_name in hpoterms_orig:
            if long_name in hpoterm_name.split() and \
                    long_name.casefold() != hpoterm_name.casefold():
                print("\t".join((long_name, hpoterm_name)))
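# A toy illustration of the matching rule above: a long name matches an HPO
# term name only when it appears as a whole token (not a substring) and the
# two strings are not the same term. Both strings below are made-up examples,
# not entries from the real dictionaries.
_long_name = "insulin"
_hpoterm_name = "Abnormality of insulin secretion"
if _long_name in _hpoterm_name.split() and \
        _long_name.casefold() != _hpoterm_name.casefold():
    print("\t".join((_long_name, _hpoterm_name)))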
GENE_KWS = frozenset([
    "gene", "oncogene", "protooncogene", "proto-oncogene", "pseudogene",
    "transgene"])

COEXPRESSION_KWS = frozenset([
    "expression", "overexpression", "over-expression", "co-expression",
    "coexpression"])

KEYWORDS = VAR_KWS | KNOCK_KWS | AMINO_ACID_KWS | ANTIGENE_KWS | DNA_KWS | \
    DOWNREGULATION_KWS | DOWNREGULATION_KWS | TUMOR_KWS | GENE_KWS | \
    COEXPRESSION_KWS

# Load the dictionaries that we need
merged_genes_dict = load_dict("merged_genes")
long_names_dict = load_dict("long_names")
inverted_long_names = load_dict("inverted_long_names")
hpoterms_with_gene = load_dict("hpoterms_with_gene")
stopwords_dict = load_dict("stopwords")


# Add features to a gene mention candidate
def add_features(mention_id, mention_words, sentence):
    # The verb closest to the candidate, with the path to it.
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma.isalpha() and re.search('^VB[A-Z]*$', word2.pos) \
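# A minimal sketch of the "closest verb" feature that add_features() starts
# computing above, using linear token distance as a stand-in for the
# dependency-path length the real feature presumably uses. The Word tuple and
# the example sentence are illustrative stand-ins for the project's dstruct
# classes.
import re
from collections import namedtuple

Word = namedtuple("Word", ["in_sent_idx", "word", "lemma", "pos"])

_sent = [Word(0, "BRCA1", "BRCA1", "NN"),
         Word(1, "regulates", "regulate", "VBZ"),
         Word(2, "DNA", "DNA", "NN"),
         Word(3, "repair", "repair", "NN")]
_mention = [_sent[0]]

_minl, _minw = 100, None
for _w in _mention:
    for _w2 in _sent:
        if _w2.lemma.isalpha() and re.search("^VB[A-Z]*$", _w2.pos):
            _dist = abs(_w2.in_sent_idx - _w.in_sent_idx)
            if _dist < _minl:
                _minl, _minw = _dist, _w2.lemma
print(_minw)  # regulate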
"gene", "oncogene", "protooncogene", "proto-oncogene", "pseudogene", "transgene" ]) COEXPRESSION_KWS = frozenset([ "expression", "overexpression", "over-expression", "co-expression", "coexpression" ]) KEYWORDS = VAR_KWS | KNOCK_KWS | AMINO_ACID_KWS | ANTIGENE_KWS | DNA_KWS | \ DOWNREGULATION_KWS | DOWNREGULATION_KWS | TUMOR_KWS | GENE_KWS | \ COEXPRESSION_KWS # Load the dictionaries that we need merged_genes_dict = load_dict("merged_genes") long_names_dict = load_dict("long_names") inverted_long_names = load_dict("inverted_long_names") hpoterms_with_gene = load_dict("hpoterms_with_gene") stopwords_dict = load_dict("stopwords") # Add features to a gene mention candidate def add_features(mention_id, mention_words, sentence): # The verb closest to the candidate, with the path to it. minl = 100 minp = None minw = None for word in mention_words: for word2 in sentence.words: if word2.lemma.isalpha() and re.search('^VB[A-Z]*$', word2.pos) \
# "EXT_KEYWORD_MIN_[activation]nn@", # "EXT_KEYWORD_MIN_[oligomerization]nn@", # "EXT_KEYWORD_MIN_[methylation]prep_of@", # "EXT_KEYWORD_MIN_[antibody]nn@", # "EXT_KEYWORD_MIN_[polymorphism]prep_of@", # "EXT_KEYWORD_MIN_[gene]appos@", # "EXT_KEYWORD_MIN_[enzyme]@nn", # "EXT_KEYWORD_MIN_[phosphorylation]prep_of@", # "EXT_KEYWORD_MIN_[receptor]@nn", # "EXT_KEYWORD_MIN_[histone]@nn", # "EXT_KEYWORD_MIN_[receptor]nn", # "IS_LONG_ALPHANUMERIC_MAIN_SYMBOL", "IS_HYPHENATED_SYMBOL", "IS_LONG_NAME" # ]) # Load the dictionaries that we need merged_genes_dict = load_dict("merged_genes") english_dict = load_dict("english") stopwords_dict = load_dict("stopwords") pos_mentions_dict = load_dict("pos_gene_mentions") neg_mentions_dict = load_dict("neg_gene_mentions") med_acrons_dict = load_dict("med_acrons") long_names_dict = load_dict("long_names") inverted_long_names = load_dict("inverted_long_names") hpoterms_with_gene = load_dict("hpoterms_with_gene") # Max mention length. We won't look at subsentences longer than this. max_mention_length = 0 for key in merged_genes_dict: length = len(key.split()) if length > max_mention_length: max_mention_length = length
    if frozenset([gene_mention.words[0].word, hpo_entity_id]) in \
            genehpoterms_dict:
        in_mapping = True
    else:
        for gene in gene_mention.entity.split("|"):
            if frozenset([gene, hpo_entity_id]) in \
                    genehpoterms_dict:
                in_mapping = True
                break
    if in_mapping:
        relation.is_correct = True
        relation.type = "GENEHPOTERM_SUP_MAP"


# Load the gene<->hpoterm dictionary
genehpoterms_dict = load_dict("genehpoterms")

if __name__ == "__main__":
    # Process input
    with fileinput.input() as input_files:
        for line in input_files:
            # Parse the TSV line
            line_dict = get_dict_from_TSVline(
                line,
                ["doc_id", "sent_id", "wordidxs", "words", "poses", "ners",
                 "lemmas", "dep_paths", "dep_parents", "bounding_boxes",
                 "gene_entities", "gene_wordidxss", "gene_is_corrects",
                 "gene_types", "hpoterm_entities", "hpoterm_wordidxss",
                 "hpoterm_is_corrects", "hpoterm_types"],
                [no_op, int, lambda x: TSVstring2list(x, int),
                 TSVstring2list, TSVstring2list, TSVstring2list,
                 TSVstring2list,
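# Why the supervision rule above keys genehpoterms_dict by
# frozenset([gene, hpo_entity_id]): a frozenset is hashable and unordered, so
# the pair is found no matter which element is listed first. The toy mapping
# below is illustrative, not the real load_dict("genehpoterms") contents.
_toy_genehpoterms = {frozenset(["BRCA1", "HP:0003002"])}
print(frozenset(["BRCA1", "HP:0003002"]) in _toy_genehpoterms)  # True
print(frozenset(["HP:0003002", "BRCA1"]) in _toy_genehpoterms)  # True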
                        is_definition = False
                        break
                definition = " ".join([w.word for w in window_words])
                # Only consider this acronym if the definition is valid
                if is_definition:
                    acronym = dict()
                    acronym["acronym"] = word.word
                    acronym["definition"] = definition
                    acronyms.append(acronym)
                break
            start_idx += 1
    return acronyms


# Load the genes dictionary
merged_genes_dict = load_dict("merged_genes")
inverted_long_names = load_dict("inverted_long_names")

if __name__ == "__main__":
    # Process the input
    with fileinput.input() as input_files:
        for line in input_files:
            # Parse the TSV line
            line_dict = get_dict_from_TSVline(
                line,
                ["doc_id", "sent_ids", "wordidxss", "wordss", "posess",
                 "nerss", "lemmass", "dep_pathss", "dep_parentss",
                 "bounding_boxess"],
                [no_op, lambda x: TSVstring2list(x, int),
                 lambda x: TSVstring2list(x, sep='!~!'),
                 lambda x: TSVstring2list(x, sep='!~!'),
                 lambda x: TSVstring2list(x, sep='!~!'),
"cronic", "deletion", "detection", "diagnose", "diagnosis", "disease", "drug", "family", "gene", "genome", "genomic", "genotype", "give", "grade", "group", "history", "infection", "inflammatory", "injury", "mutation", "pathway", "phenotype", "polymorphism", "prevalence", "protein", "risk", "severe", "stage", "symptom", "syndrome", "therapy", "therapeutic", "treat", "treatment", "variant" "viruses", "virus" ]) PATIENT_KWS = frozenset( ["boy", "girl", "man", "woman", "men", "women", "patient", "patients"]) KEYWORDS = VAR_KWS | PATIENT_KWS # Load the dictionaries that we need english_dict = load_dict("english") stopwords_dict = load_dict("stopwords") inverted_hpoterms = load_dict("hpoterms_inverted") hponames_to_ids = load_dict("hponames_to_ids") genes_with_hpoterm = load_dict("genes_with_hpoterm") # hpodag = load_dict("hpoparents") stems = set() for hpo_name in inverted_hpoterms: stem_set = inverted_hpoterms[hpo_name] stems |= stem_set stems = frozenset(stems) # The keys of the following dictionary are sets of stems, and the values are # sets of hpoterms whose name, without stopwords, gives origin to the # corresponding set of stems (as key)
from dstruct.Mention import Mention
from dstruct.Sentence import Sentence
from helper.dictionaries import load_dict
from helper.easierlife import get_all_phrases_in_sentence, \
    get_dict_from_TSVline, TSVstring2list, no_op

DOC_ELEMENTS = frozenset(
    ["figure", "table", "figures", "tables", "fig", "fig.", "figs", "figs.",
     "file", "movie"])

INDIVIDUALS = frozenset(["individual", "individuals", "patient"])

TYPES = frozenset(["group", "type", "class", "method"])

# Load the dictionaries that we need
merged_genes_dict = load_dict("merged_genes")
inverted_long_names = load_dict("inverted_long_names")
hpoterms_with_gene = load_dict("hpoterms_with_gene")
english_dict = load_dict("english")

# Max mention length. We won't look at subsentences longer than this.
max_mention_length = 0
for key in merged_genes_dict:
    length = len(key.split())
    if length > max_mention_length:
        max_mention_length = length
# doubling to take into account commas and who knows what
max_mention_length *= 2


# Supervise the candidates.
"abnormality", "affect", "apoptosis", "association", "cancer", "carcinoma", "case", "cell", "chemotherapy", "clinic", "clinical", "chromosome", "cronic", "deletion", "detection", "diagnose", "diagnosis", "disease", "drug", "family", "gene", "genome", "genomic", "genotype", "give", "grade", "group", "history", "infection", "inflammatory", "injury", "mutation", "pathway", "phenotype", "polymorphism", "prevalence", "protein", "risk", "severe", "stage", "symptom", "syndrome", "therapy", "therapeutic", "treat", "treatment", "variant" "viruses", "virus"]) PATIENT_KWS = frozenset( ["boy", "girl", "man", "woman", "men", "women", "patient", "patients"]) KEYWORDS = VAR_KWS | PATIENT_KWS # Load the dictionaries that we need english_dict = load_dict("english") stopwords_dict = load_dict("stopwords") inverted_hpoterms = load_dict("hpoterms_inverted") hponames_to_ids = load_dict("hponames_to_ids") genes_with_hpoterm = load_dict("genes_with_hpoterm") # hpodag = load_dict("hpoparents") stems = set() for hpo_name in inverted_hpoterms: stem_set = inverted_hpoterms[hpo_name] stems |= stem_set stems = frozenset(stems) # The keys of the following dictionary are sets of stems, and the values are # sets of hpoterms whose name, without stopwords, gives origin to the
#! /usr/bin/env python3
#
# Extract, add features to, and supervise mentions extracted from geneRifs.
#

import fileinput

from dstruct.Sentence import Sentence
from extract_gene_mentions import extract
from helper.easierlife import get_dict_from_TSVline, TSVstring2list, no_op
from helper.dictionaries import load_dict

if __name__ == "__main__":
    # Load the merged genes dictionary
    merged_genes_dict = load_dict("merged_genes")

    # Process the input
    with fileinput.input() as input_files:
        for line in input_files:
            # Parse the TSV line
            line_dict = get_dict_from_TSVline(
                line, ["doc_id", "sent_id", "wordidxs", "words", "gene"],
                [no_op, int, lambda x: TSVstring2list(x, int),
                 TSVstring2list, no_op])
            # Create the Sentence object
            null_list = [None, ] * len(line_dict["wordidxs"])
            sentence = Sentence(
                line_dict["doc_id"], line_dict["sent_id"],
                line_dict["wordidxs"], line_dict["words"], null_list,
                null_list, null_list, null_list, null_list, null_list)
            # This is the 'labelled' gene that we know is in the sentence
            gene = line_dict["gene"]
#! /usr/bin/env python3
#
# Canonicalize a dump using the HPO dag
#
# Use the output of filter_out_uncertain_genes.py

import sys

from helper.dictionaries import load_dict

if len(sys.argv) != 2:
    sys.stderr.write("USAGE: {} dump.tsv\n".format(sys.argv[0]))
    sys.exit(1)

hpoancestors = load_dict("hpoancestors")

with open(sys.argv[1], 'rt') as dump:
    for line in dump:
        tokens = line.strip().split("\t")
        relation_id = tokens[0]
        gene_entity = tokens[1]
        hpo_entity = tokens[3]
        if "|" not in hpo_entity:
            continue
        hpo_id = hpo_entity.split("|")[0]
        if hpo_id not in hpoancestors:
            continue
        print("{}\t{}\t{}".format(relation_id, gene_entity, hpo_entity))
        for ancestor in hpoancestors[hpo_id]:
            print("{}\t{}\t{}".format(relation_id, gene_entity, ancestor))
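# A toy illustration of the expansion performed above: each (relation, gene,
# HPO entity) row is printed once as-is and once per ancestor of its HPO id.
# The ancestor mapping below is a made-up stand-in for
# load_dict("hpoancestors"), and the row values are invented examples.
_toy_ancestors = {"HP:0003002": ["HP:0100013", "HP:0000118"]}
_relation_id, _gene_entity = "relation_1", "BRCA1"
_hpo_entity = "HP:0003002|Breast carcinoma"
_hpo_id = _hpo_entity.split("|")[0]
print("{}\t{}\t{}".format(_relation_id, _gene_entity, _hpo_entity))
for _ancestor in _toy_ancestors[_hpo_id]:
    print("{}\t{}\t{}".format(_relation_id, _gene_entity, _ancestor))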
threeh_suffixes = threeh_singles

full_singles = frozenset(("outcalls", "outcall", "overnite"))
full_suffixes = full_singles

two_grams_prefixes = frozenset((
    "1/2", "1/2", "half", "1.5", "hlf", "full", "an hour", "a hr", "whole",
    "multiple", "additional", "first", "multi", "add", "complete"))

two_grams_durations = frozenset((
    "quick visits", "quick visit", "quick fix", "short stay", "short visit",
    "quick stay", "short fix"))

singles = quick_singles | half_singles | hour_singles | fourtyfive_singles | \
    twoh_singles | threeh_singles

suffixes = quick_suffixes | half_suffixes | hour_suffixes | \
    fourtyfive_suffixes | twoh_suffixes | threeh_suffixes | min_suffixes

STOP_WORDS = load_dict("stopwords")
ALL_ENGLISH_WORDS = load_dict("english") - \
    (singles | suffixes | two_grams_prefixes | {"quick", })

sregex_1w_num_min = re.compile("^(15|20|30|45|60|90|120)/?-?min")
sregex_1w_num_hour = re.compile("^(1|2|3|one|two|three)/?-?h")
regex_min = re.compile("min$|mins|minut")
regex_half = re.compile("hlf|half")
regex_hh = re.compile("hh")
regex_slash = re.compile("/")
regex_hour = re.compile("hrs?$|hours?")
regex_hyphen = re.compile("-")
regex_common_minute_number = re.compile(
    r"15$|15\D+|30$|30\D+|45$|45\D+|60$|60\D+|90$|90\D+")
regex_common_hour_spelled = re.compile("one|two|three")
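# A few illustrative matches for three of the duration regexes above,
# repeated here so the snippet runs on its own; the token strings are
# made-up examples, not taken from any dataset.
import re

_sregex_1w_num_min = re.compile("^(15|20|30|45|60|90|120)/?-?min")
_regex_hour = re.compile("hrs?$|hours?")
_regex_common_minute_number = re.compile(
    r"15$|15\D+|30$|30\D+|45$|45\D+|60$|60\D+|90$|90\D+")

print(bool(_sregex_1w_num_min.search("30-min")))        # True
print(bool(_regex_hour.search("2hrs")))                 # True
print(bool(_regex_common_minute_number.search("45")))   # True
print(bool(_regex_common_minute_number.search("450")))  # False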