def test_parse_hpo_obo():
    """Parse one OBO ``[Term]`` stanza and check the extracted id and name.

    The stanza is given as a list of lines. Every term produced by
    ``parse_hpo_obo`` must carry the stanza's ``id`` under 'hpo_id' and
    its ``name`` under 'description'.
    """
    hpo_info = [
        "[Term]",
        "id: HP:0000003",
        "name: Multicystic kidney dysplasia",
        "alt_id: HP:0004715",
        "def: Multicystic dysplasia of the kidney is characterized by multiple cysts of varying size in the kidney and the absence of a normal pelvicaliceal system. The condition is associated with ureteral or ureteropelvic atresia, and the affected kidney is nonfunctional. [HPO:curators]",
        "comment: Multicystic kidney dysplasia is the result of abnormal fetal renal development in which the affected kidney is replaced by multiple cysts and has little or no residual function. The vast majority of multicystic kidneys are unilateral. Multicystic kidney can be diagnosed on prenatal ultrasound.",
        'synonym: "Multicystic dysplastic kidney" EXACT []',
        'synonym: "Multicystic kidneys" EXACT []',
        'synonym: "Multicystic renal dysplasia" EXACT []',
        "xref: MSH:D021782",
        "xref: SNOMEDCT_US:204962002",
        "xref: SNOMEDCT_US:82525005",
        "xref: UMLS:C3714581",
        "is_a: HP:0000107 ! Renal cyst",
    ]

    # Materialise the result so an empty parse cannot slip through: with the
    # assertions only inside the loop, zero terms would pass the test silently.
    hpo_terms = list(parse_hpo_obo(hpo_info))
    assert hpo_terms, "parse_hpo_obo produced no terms for a complete [Term] stanza"

    for hpo_term in hpo_terms:
        assert hpo_term['hpo_id'] == "HP:0000003"
        assert hpo_term['description'] == "Multicystic kidney dysplasia"
def load_hpo_terms(adapter, hpo_lines=None, hpo_gene_lines=None, alias_genes=None):
    """Parse hpo terms, attach gene ids to them and load them with the adapter

    Any input that is missing (or empty) is fetched: the term lines with
    ``fetch_hpo_terms()``, the term-to-gene lines with ``fetch_hpo_to_genes()``
    and the alias map with ``adapter.genes_by_alias()``.

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): lines passed to ``parse_hpo_obo``
        hpo_gene_lines(iterable(str)): lines passed to ``parse_hpo_to_genes``
        alias_genes(dict): maps a hgnc symbol to a dict whose 'true' entry
            is used as the hgnc id
    """
    # Fall back to fetching whichever input was not provided
    hpo_lines = hpo_lines or fetch_hpo_terms()
    hpo_gene_lines = hpo_gene_lines or fetch_hpo_to_genes()

    # Index the parsed term dictionaries on their hpo id
    LOG.info("Parsing hpo terms")
    terms_by_id = {parsed['hpo_id']: parsed for parsed in parse_hpo_obo(hpo_lines)}

    # Map with hgnc symbols to gene information, taken from the adapter if not given
    alias_genes = alias_genes or adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for link in parse_hpo_to_genes(hpo_gene_lines):
        symbol = link['hgnc_symbol']
        term_id = link['hpo_id']

        # Symbols without gene information are skipped
        gene_entry = alias_genes.get(symbol)
        if not gene_entry:
            continue
        hgnc_id = gene_entry['true']

        # Links pointing at a term that was not parsed are skipped
        try:
            term_obj = terms_by_id[term_id]
        except KeyError:
            continue

        if 'genes' not in term_obj:
            term_obj['genes'] = set()
        term_obj['genes'].add(hgnc_id)

    started = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(terms_by_id)
    pending = []
    with progressbar(terms_by_id.values(), label="Loading hpo terms", length=nr_terms) as bar:
        for term_obj in bar:
            pending.append(build_hpo_term(term_obj))
            if len(pending) <= 10000:
                continue
            # Batch has grown past the threshold: hand it over and start a new one
            adapter.load_hpo_bulk(pending)
            pending = []

    # Load whatever is left in the last, partially filled batch
    if pending:
        adapter.load_hpo_bulk(pending)

    LOG.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    LOG.info("Time to load terms: {0}".format(datetime.now() - started))
def test_parse_hpo_terms(hpo_terms_handle):
    """Check that each term parsed from ``hpo_terms_handle`` has a truthy 'hpo_id'.

    ``hpo_terms_handle`` is presumably a pytest fixture providing hpo term
    lines -- confirm against the test configuration.
    """
    for parsed_term in parse_hpo_obo(hpo_terms_handle):
        assert parsed_term['hpo_id']