def load_hpo(adapter, disease_lines, hpo_disease_lines=None, hpo_lines=None, hpo_gene_lines=None):
    """Load the hpo terms and hpo diseases into database

    Every optional line source that is missing (or empty) is replaced by the
    result of the matching ``fetch_*`` helper before anything is loaded.

    Args:
        adapter(MongoAdapter)
        disease_lines(iterable(str)): These are the omim genemap2 information
        hpo_disease_lines(iterable(str)): forwarded to ``load_disease_terms``
        hpo_lines(iterable(str)): forwarded to ``load_hpo_terms``
        hpo_gene_lines(iterable(str)): forwarded to ``load_hpo_terms``
    """
    # Build the alias lookup once; both loaders below receive the same map
    alias_genes = adapter.genes_by_alias()

    # Fall back to freshly fetched data for each source that was not supplied
    hpo_lines = hpo_lines or fetch_hpo_terms()
    hpo_gene_lines = hpo_gene_lines or fetch_hpo_to_genes()
    hpo_disease_lines = hpo_disease_lines or fetch_hpo_phenotype_to_terms()

    load_hpo_terms(adapter, hpo_lines, hpo_gene_lines, alias_genes)
    load_disease_terms(adapter, disease_lines, alias_genes, hpo_disease_lines)
def load_hpo(adapter, disease_lines, hpo_disease_lines=None, hpo_lines=None, hpo_gene_lines=None):
    """Load the hpo terms and hpo diseases into database

    Sources that are not given (or are empty) are fetched with the
    corresponding ``fetch_*`` helper before loading starts.

    Args:
        adapter(MongoAdapter)
        disease_lines(iterable(str)): forwarded to ``load_disease_terms``
        hpo_disease_lines(iterable(str)): forwarded to ``load_disease_terms``
        hpo_lines(iterable(str)): forwarded to ``load_hpo_terms``
        hpo_gene_lines(iterable(str)): forwarded to ``load_hpo_terms``
    """
    # The alias map is shared by the term loader and the disease loader
    alias_genes = adapter.genes_by_alias()

    # Use the supplied lines when present, otherwise fetch them
    term_lines = hpo_lines if hpo_lines else fetch_hpo_terms()
    gene_lines = hpo_gene_lines if hpo_gene_lines else fetch_hpo_to_genes()
    phenotype_lines = (
        hpo_disease_lines if hpo_disease_lines else fetch_hpo_phenotype_to_terms()
    )

    load_hpo_terms(adapter, term_lines, gene_lines, alias_genes)
    load_disease_terms(adapter, disease_lines, alias_genes, phenotype_lines)
def get_reduced_hpo_terms(hpo_terms):
    """Return a reduced version of the hpo terms

    The lines from ``fetch_hpo_terms()`` are buffered block by block, where a
    new block starts at every line beginning with ``[Term]``. Everything
    before the first ``[Term]`` line is treated as header and always yielded.
    A term block is yielded only if its ``id`` line names one of the chosen
    terms.

    Args:
        hpo_terms(set(str)): Set of chosen terms that should be included

    Yields:
        hpo_line: A line with hpo information
    """
    hpo_lines = fetch_hpo_terms()

    block_lines = []
    # The header block (everything before the first term) is always kept
    keep = True
    # False while buffering the header, so it is never counted as a term
    in_term = False

    nr_terms = 0
    nr_kept = 0

    for line in hpo_lines:
        if line.startswith('[Term]'):
            nr_terms += 1
            # A new term begins: flush the previous block if it was selected
            if keep:
                if in_term:
                    nr_kept += 1
                for hpo_line in block_lines:
                    yield hpo_line
            keep = False
            in_term = True
            block_lines = []
        elif line.startswith('id'):
            # Drops the first four characters, presumably the 'id: ' prefix;
            # assumes lines carry no trailing newline -- TODO confirm
            hpo_id = line[4:]
            if hpo_id in hpo_terms:
                keep = True

        block_lines.append(line)

    # Flush whatever block was being buffered when the input ended
    if keep:
        if in_term:
            nr_kept += 1
        for hpo_line in block_lines:
            yield hpo_line

    LOG.info("Nr of terms in file %s", nr_terms)
    LOG.info("Nr of terms kept: %s", nr_kept)
def hpo(context):
    """
    Update the hpo terms in the database. Fetch the latest release and update terms.
    """
    LOG.info("Running scout update hpo")
    adapter = context.obj['adapter']

    # Fetch everything BEFORE dropping the collection: if a fetch raises,
    # the existing hpo terms are still in place instead of being lost.
    # Fetch the latest version of the hpo terms
    hpo_lines = fetch_hpo_terms()
    # Fetch the connection to genes from hpo source
    hpo_gene_lines = fetch_hpo_to_genes()

    LOG.info("Dropping HPO terms")
    adapter.hpo_term_collection.drop()
    LOG.debug("HPO terms dropped")

    # alias_genes is left out, so load_hpo_terms receives its default for it
    load_hpo_terms(adapter, hpo_lines, hpo_gene_lines)
def load_hpo_terms(adapter, hpo_lines=None, hpo_gene_lines=None, alias_genes=None):
    """Load the hpo terms into the database

    Parse the hpo lines, attach the hgnc ids of linked genes to each term,
    build the term objects and write them to the database in bulk batches.

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): fetched with ``fetch_hpo_terms`` if missing
        hpo_gene_lines(iterable(str)): fetched with ``fetch_hpo_to_genes`` if
            missing
        alias_genes(dict): lookup keyed by gene symbol; taken from
            ``adapter.genes_by_alias()`` if missing
    """
    # Fall back to fetched data when no lines were handed in
    hpo_lines = hpo_lines or fetch_hpo_terms()
    hpo_gene_lines = hpo_gene_lines or fetch_hpo_to_genes()

    # Index the parsed term dictionaries by their hpo id
    LOG.info("Parsing hpo terms")
    hpo_terms = {term['hpo_id']: term for term in parse_hpo_obo(hpo_lines)}

    # Map used to translate hgnc symbols to hgnc ids
    alias_genes = alias_genes or adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for link in parse_hpo_to_genes(hpo_gene_lines):
        symbol = link['hgnc_symbol']
        term_id = link['hpo_id']

        # Symbols without a known gene are skipped
        gene_info = alias_genes.get(symbol)
        if not gene_info:
            continue
        hgnc_id = gene_info['true']

        # Only terms that were parsed above get gene ids attached
        if term_id in hpo_terms:
            hpo_terms[term_id].setdefault('genes', set()).add(hgnc_id)

    start_time = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(hpo_terms)
    pending = []
    with progressbar(hpo_terms.values(), label="Loading hpo terms", length=nr_terms) as bar:
        for hpo_info in bar:
            pending.append(build_hpo_term(hpo_info))
            # Keep collecting until the batch has grown past 10000 objects
            if len(pending) <= 10000:
                continue
            adapter.load_hpo_bulk(pending)
            pending = []

    # Write the last, partially filled batch
    if pending:
        adapter.load_hpo_bulk(pending)

    LOG.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    LOG.info("Time to load terms: {0}".format(datetime.now() - start_time))
all_ancestors = get_all_ancestors(hpo_tree, term, set()) term['all_ancestors'] = all_ancestors return hpo_tree if __name__ == "__main__": import sys from pprint import pprint as pp from scout.utils.handle import get_file_handle from scout.utils.requests import fetch_hpo_terms if not len(sys.argv) > 1: file_handle = fetch_hpo_terms() else: file_handle = get_file_handle(sys.argv[1]) hpo_tree = build_hpo_tree(file_handle) my_term = hpo_tree['HP:0200024'] pp(my_term) # print(get_all_ancestors(hpo_tree, my_term)) for term in hpo_tree: pp(hpo_tree[term]) # phenotypes = parse_hpo_phenotypes(file_handle) # for hpo_id in phenotypes: # hpo_term = phenotypes[hpo_id]