def test_build_hpo_term_missing_key(adapter, test_hpo_info, key):
    """Expect build_hpo_term() to raise KeyError when ``key`` is absent from the hpo info."""
    ## GIVEN a dictionary with hpo information

    ## WHEN deleting the key from the dictionary
    del test_hpo_info[key]

    ## THEN calling build_hpo_term() will raise KeyError
    with pytest.raises(KeyError):
        build_hpo_term(test_hpo_info)
def test_build_hpo_term_with_genes(adapter):
    """Expect the symbols under 'hgnc_symbols' to end up as the 'true' hgnc ids of the alias map."""
    ## GIVEN a hpo term and an alias map that knows both gene symbols
    hpo_info = {
        'hpo_id': "HP:0000878",
        'description': "11 pairs of ribs",
        'hgnc_symbols': ['B3GALT6', 'RBBP8'],
    }
    alias_genes = {
        'B3GALT6': {'true': 17978, 'ids': [17978]},
        'RBBP8': {'true': 9891, 'ids': [9891]},
    }

    ## WHEN building the hpo term
    hpo_obj = build_hpo_term(hpo_info, alias_genes)

    ## THEN assert that the term has the correct information
    assert hpo_obj['_id'] == hpo_obj['hpo_id'] == hpo_info['hpo_id']
    ## Both symbols are present in the alias map so we expect both hgnc ids
    assert len(hpo_obj['genes']) == 2
    assert set(hpo_obj['genes']) == {17978, 9891}
def test_build_hpo_term(adapter, test_hpo_info):
    """Build a hpo term from the fixture info and check id, description and gene count."""
    ## GIVEN hpo term information from the test_hpo_info fixture

    ## WHEN building the hpo term
    built_term = build_hpo_term(test_hpo_info)

    ## THEN the id is stored under both '_id' and 'hpo_id'
    assert built_term["_id"] == built_term["hpo_id"]
    assert built_term["hpo_id"] == test_hpo_info["hpo_id"]
    ## THEN the description is carried over
    assert built_term["description"] == test_hpo_info["description"]
    ## THEN two genes are expected on the term
    assert len(built_term["genes"]) == 2
def test_build_hpo_term(adapter):
    """Build a hpo term from a literal info dictionary and check its fields."""
    ## GIVEN a hpo term
    hpo_info = dict(
        hpo_id="HP:0000878",
        description="11 pairs of ribs",
        genes=[1, 2],
    )

    ## WHEN building the hpo term
    built_term = build_hpo_term(hpo_info)

    ## THEN assert that the term has the correct information
    assert built_term["_id"] == built_term["hpo_id"]
    assert built_term["hpo_id"] == hpo_info["hpo_id"]
    assert built_term["description"] == hpo_info["description"]
    assert len(built_term["genes"]) == 2
def test_build_hpo_term_non_existing_genes(adapter):
    """Build a hpo term whose info lists two gene ids and check they are kept."""
    ## GIVEN a hpo term
    term_id = "HP:0000878"
    term_description = "11 pairs of ribs"
    hpo_info = {'hpo_id': term_id, 'description': term_description, 'genes': [1, 2]}

    ## WHEN building the hpo term
    hpo_term = build_hpo_term(hpo_info)

    ## THEN assert that the term has the correct information
    assert hpo_term['_id'] == hpo_term['hpo_id']
    assert hpo_term['hpo_id'] == hpo_info['hpo_id']
    assert hpo_term['description'] == hpo_info['description']
    assert len(hpo_term['genes']) == 2
def test_build_hpo_term(adapter):
    """Check id, description and number of genes of a freshly built hpo term."""
    ## GIVEN a hpo term
    hpo_info = {}
    hpo_info['hpo_id'] = "HP:0000878"
    hpo_info['description'] = "11 pairs of ribs"
    hpo_info['genes'] = [1, 2]

    ## WHEN building the hpo term
    result = build_hpo_term(hpo_info)

    ## THEN assert that the term has the correct information
    assert result['_id'] == result['hpo_id']
    assert result['hpo_id'] == hpo_info['hpo_id']
    assert result['description'] == hpo_info['description']
    assert len(result['genes']) == 2
def test_build_hpo_term_non_existing_genes(adapter):
    """Expect no genes on the term when the alias map passed to build_hpo_term() is empty."""
    ## GIVEN a hpo term with gene symbols and an empty alias map
    hpo_info = dict(
        hpo_id="HP:0000878",
        description="11 pairs of ribs",
        hgnc_symbols=['B3GALT6', 'RBBP8'],
    )
    empty_alias_genes = {}

    ## WHEN building the hpo term
    built_term = build_hpo_term(hpo_info, empty_alias_genes)

    ## THEN assert that the term has the correct information
    assert built_term['_id'] == built_term['hpo_id']
    assert built_term['hpo_id'] == hpo_info['hpo_id']
    assert built_term['description'] == hpo_info['description']
    ## No symbol can be resolved through the empty alias map so we expect 0 genes
    assert len(built_term['genes']) == 0
def load_hpo_terms(adapter, hpo_lines, genes):
    """Load the hpo terms into the database

    Parse the hpo lines, build the objects and add them to the database

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str))
        genes: passed on unchanged as second argument to build_hpo_term()
    """
    hpo_terms = parse_hpo_phenotypes(hpo_lines)

    start_time = datetime.now()

    logger.info("Loading the hpo terms...")
    # Keep the counter defined even when no terms were parsed, otherwise the
    # summary log below would fail on an unbound name
    nr_terms = 0
    # Count from 1 so that nr_terms is the number of loaded terms, not the
    # index of the last one
    for nr_terms, hpo_id in enumerate(hpo_terms, 1):
        hpo_info = hpo_terms[hpo_id]
        hpo_obj = build_hpo_term(hpo_info, genes)

        adapter.load_hpo_term(hpo_obj)

    logger.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    logger.info("Time to load terms: {0}".format(datetime.now() - start_time))
def load_hpo_terms(adapter, hpo_lines=None, hpo_gene_lines=None, alias_genes=None):
    """Parse hpo terms, attach hgnc ids to them and bulk load them with the adapter

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): fetched with fetch_hpo_terms() when not given
        hpo_gene_lines(iterable(str)): fetched with fetch_hpo_to_genes() when not given
        alias_genes: map from hgnc symbol to gene info holding the hgnc id under
            'true'; adapter.genes_by_alias() is used when not given
    """
    # Fall back to fetching the resources when no lines were handed in
    hpo_lines = hpo_lines or fetch_hpo_terms()
    hpo_gene_lines = hpo_gene_lines or fetch_hpo_to_genes()

    # The parser yields one dictionary per term, index them on hpo id
    LOG.info("Parsing hpo terms")
    hpo_terms = {term['hpo_id']: term for term in parse_hpo_obo(hpo_lines)}

    # Map from hgnc symbols to hgnc ids, taken from scout unless provided
    alias_genes = alias_genes or adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for link in parse_hpo_to_genes(hpo_gene_lines):
        symbol = link['hgnc_symbol']
        term_id = link['hpo_id']

        # Symbols that are unknown to the alias map are skipped
        gene_info = alias_genes.get(symbol)
        if not gene_info:
            continue
        hgnc_id = gene_info['true']

        # Only terms that were parsed above can receive genes
        if term_id in hpo_terms:
            hpo_terms[term_id].setdefault('genes', set()).add(hgnc_id)

    start_time = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(hpo_terms)
    pending = []

    with progressbar(hpo_terms.values(), label="Loading hpo terms", length=nr_terms) as bar:
        for hpo_info in bar:
            pending.append(build_hpo_term(hpo_info))
            if len(pending) <= 10000:
                continue
            # Flush the batch and start a fresh list for the next one
            adapter.load_hpo_bulk(pending)
            pending = []

    # Load whatever is left after the last full batch
    if pending:
        adapter.load_hpo_bulk(pending)

    LOG.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    LOG.info("Time to load terms: {0}".format(datetime.now() - start_time))
def load_hpo_terms(adapter, hpo_lines=None, hpo_gene_lines=None, alias_genes=None):
    """Build the hpo terms, attach hgnc ids to them and bulk load them with the adapter

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): lines from file http://purl.obolibrary.org/obo/hp.obo,
            fetched with fetch_hpo_terms() when not given
        hpo_gene_lines(iterable(str)): lines from file
            https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt,
            fetched with fetch_hpo_to_genes_to_disease() when not given
        alias_genes: map from hgnc symbol to gene info holding the hgnc id under
            "true"; adapter.genes_by_alias() is used when not given
    """
    # Fetch the hpo terms if no file
    hpo_lines = hpo_lines or fetch_hpo_terms()

    LOG.info("Parsing hpo terms")
    hpo_terms = build_hpo_tree(hpo_lines)

    # Fetch the hpo gene information if no file
    hpo_gene_lines = hpo_gene_lines or fetch_hpo_to_genes_to_disease()

    # Map from hgnc symbols to hgnc ids, taken from scout unless provided
    alias_genes = alias_genes or adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for record in parse_hpo_to_genes(hpo_gene_lines):
        symbol = record["hgnc_symbol"]
        term_id = record["hpo_id"]

        # Look up the gene to get the correct hgnc id, skip unknown symbols
        gene_info = alias_genes.get(symbol)
        if not gene_info:
            continue
        hgnc_id = gene_info["true"]

        # Gene links to terms that were not built are ignored
        if term_id not in hpo_terms:
            continue

        term = hpo_terms[term_id]
        if "genes" not in term:
            term["genes"] = set()
        term["genes"].add(hgnc_id)

    start_time = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(hpo_terms)
    batch = []

    with progressbar(hpo_terms.values(), label="Loading hpo terms", length=nr_terms) as bar:
        for hpo_info in bar:
            batch.append(build_hpo_term(hpo_info))
            if len(batch) <= 10000:
                continue
            # Flush the batch and start a fresh list for the next one
            adapter.load_hpo_bulk(batch)
            batch = []

    # Load whatever is left after the last full batch
    if batch:
        adapter.load_hpo_bulk(batch)

    LOG.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    LOG.info("Time to load terms: {0}".format(datetime.now() - start_time))
def load_hpo_terms(adapter, hpo_lines=None, hpo_gene_lines=None, alias_genes=None):
    """Collect hpo terms with their hgnc ids and bulk load them with the adapter

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): fetched with fetch_hpo_terms() when not given
        hpo_gene_lines(iterable(str)): fetched with fetch_hpo_to_genes() when not given
        alias_genes: map from hgnc symbol to gene info holding the hgnc id under
            'true'; adapter.genes_by_alias() is used when not given
    """
    # Fetch the term lines and the gene lines if no files were given
    if not hpo_lines:
        hpo_lines = fetch_hpo_terms()
    if not hpo_gene_lines:
        hpo_gene_lines = fetch_hpo_to_genes()

    # The parser yields one dictionary per term, keep them keyed on hpo id
    LOG.info("Parsing hpo terms")
    hpo_terms = {}
    for parsed_term in parse_hpo_obo(hpo_lines):
        term_key = parsed_term['hpo_id']
        hpo_terms[term_key] = parsed_term

    # Map from hgnc symbols to hgnc ids, taken from scout unless provided
    if not alias_genes:
        alias_genes = adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for entry in parse_hpo_to_genes(hpo_gene_lines):
        hgnc_symbol, hpo_id = entry['hgnc_symbol'], entry['hpo_id']

        gene_info = alias_genes.get(hgnc_symbol)
        if gene_info:
            # The 'true' entry holds the correct hgnc id for the symbol
            hgnc_id = gene_info['true']
            if hpo_id in hpo_terms:
                term = hpo_terms[hpo_id]
                if 'genes' not in term:
                    term['genes'] = set()
                term['genes'].add(hgnc_id)

    start_time = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(hpo_terms)
    bulk = []

    with progressbar(hpo_terms.values(), label="Loading hpo terms", length=nr_terms) as bar:
        for hpo_info in bar:
            bulk.append(build_hpo_term(hpo_info))
            batch_is_full = len(bulk) > 10000
            if batch_is_full:
                adapter.load_hpo_bulk(bulk)
                # Rebind instead of clearing, the adapter was handed the old list
                bulk = []

    # Load whatever is left after the last full batch
    if bulk:
        adapter.load_hpo_bulk(bulk)

    LOG.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    LOG.info("Time to load terms: {0}".format(datetime.now() - start_time))