예제 #1
0
def test_parse_hpo_obo():
    """Parse a single obo [Term] stanza and check its id and description.

    The parsed terms are collected in a list before asserting so that the
    test fails when the parser yields nothing, instead of silently skipping
    the assertions in the loop.
    """
    hpo_info = [
        "[Term]",
        "id: HP:0000003",
        "name: Multicystic kidney dysplasia",
        "alt_id: HP:0004715",
        "def: Multicystic dysplasia of the kidney is characterized by multiple cysts of varying size in the kidney and the absence of a normal pelvicaliceal system. The condition is associated with ureteral or ureteropelvic atresia, and the affected kidney is nonfunctional. [HPO:curators]",
        "comment: Multicystic kidney dysplasia is the result of abnormal fetal renal development in which the affected kidney is replaced by multiple cysts and has little or no residual function. The vast majority of multicystic kidneys are unilateral. Multicystic kidney can be diagnosed on prenatal ultrasound.",
        'synonym: "Multicystic dysplastic kidney" EXACT []',
        'synonym: "Multicystic kidneys" EXACT []',
        'synonym: "Multicystic renal dysplasia" EXACT []',
        "xref: MSH:D021782",
        "xref: SNOMEDCT_US:204962002",
        "xref: SNOMEDCT_US:82525005",
        "xref: UMLS:C3714581",
        "is_a: HP:0000107 ! Renal cyst",
    ]
    hpo_terms = list(parse_hpo_obo(hpo_info))

    # The input holds one [Term] stanza, so an empty result means the parser
    # dropped it; without this check the loop below would pass vacuously.
    assert hpo_terms, "no hpo terms were parsed from the obo lines"

    for hpo_term in hpo_terms:
        assert hpo_term['hpo_id'] == "HP:0000003"
        assert hpo_term['description'] == "Multicystic kidney dysplasia"
def test_parse_hpo_obo():
    """Parse a single obo [Term] stanza and check its id and description.

    The parsed terms are collected in a list before asserting so that the
    test fails when the parser yields nothing, instead of silently skipping
    the assertions in the loop.
    """
    hpo_info = [
        "[Term]",
        "id: HP:0000003",
        "name: Multicystic kidney dysplasia",
        "alt_id: HP:0004715",
        "def: Multicystic dysplasia of the kidney is characterized by multiple cysts of varying size in the kidney and the absence of a normal pelvicaliceal system. The condition is associated with ureteral or ureteropelvic atresia, and the affected kidney is nonfunctional. [HPO:curators]",
        "comment: Multicystic kidney dysplasia is the result of abnormal fetal renal development in which the affected kidney is replaced by multiple cysts and has little or no residual function. The vast majority of multicystic kidneys are unilateral. Multicystic kidney can be diagnosed on prenatal ultrasound.",
        'synonym: "Multicystic dysplastic kidney" EXACT []',
        'synonym: "Multicystic kidneys" EXACT []',
        'synonym: "Multicystic renal dysplasia" EXACT []',
        "xref: MSH:D021782",
        "xref: SNOMEDCT_US:204962002",
        "xref: SNOMEDCT_US:82525005",
        "xref: UMLS:C3714581",
        "is_a: HP:0000107 ! Renal cyst",
    ]
    hpo_terms = list(parse_hpo_obo(hpo_info))

    # The input holds one [Term] stanza, so an empty result means the parser
    # dropped it; without this check the loop below would pass vacuously.
    assert hpo_terms, "no hpo terms were parsed from the obo lines"

    for hpo_term in hpo_terms:
        assert hpo_term['hpo_id'] == "HP:0000003"
        assert hpo_term['description'] == "Multicystic kidney dysplasia"
예제 #3
0
파일: hpo.py 프로젝트: hassanfa/scout
def load_hpo_terms(adapter,
                   hpo_lines=None,
                   hpo_gene_lines=None,
                   alias_genes=None):
    """Load the hpo terms into the database

    Parse the hpo lines, attach the hgnc ids of the associated genes to each
    term, build the term objects and add them to the database in batches.

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): Lines with hpo terms in obo format. Fetched
            with fetch_hpo_terms() when empty or None
        hpo_gene_lines(iterable(str)): Lines that map hpo terms to genes.
            Fetched with fetch_hpo_to_genes() when empty or None
        alias_genes(dict): Maps a hgnc symbol to gene info that holds the
            hgnc id under the key 'true'. Fetched with
            adapter.genes_by_alias() when empty or None
    """
    # A batch is written to the database as soon as it grows past this size
    bulk_limit = 10000

    # Store the hpo terms, keyed on hpo id
    hpo_terms = {}

    # Fetch the hpo terms if no file
    if not hpo_lines:
        hpo_lines = fetch_hpo_terms()

    # Fetch the hpo gene information if no file
    if not hpo_gene_lines:
        hpo_gene_lines = fetch_hpo_to_genes()

    # Parse the terms
    # This will yield dictionaries with information about the terms
    LOG.info("Parsing hpo terms")
    for term in parse_hpo_obo(hpo_lines):
        hpo_terms[term['hpo_id']] = term

    # Get a map with hgnc symbols to hgnc ids from scout
    if not alias_genes:
        alias_genes = adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for hpo_to_symbol in parse_hpo_to_genes(hpo_gene_lines):
        hgnc_symbol = hpo_to_symbol['hgnc_symbol']
        hpo_id = hpo_to_symbol['hpo_id']

        # Fetch gene info to get correct hgnc id, skip symbols that are unknown
        gene_info = alias_genes.get(hgnc_symbol)
        if not gene_info:
            continue

        hgnc_id = gene_info['true']

        # Skip gene links that point to a term that was not parsed above
        if hpo_id not in hpo_terms:
            continue

        # Create the gene set the first time a term gets a gene
        hpo_terms[hpo_id].setdefault('genes', set()).add(hgnc_id)

    start_time = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(hpo_terms)
    hpo_bulk = []
    with progressbar(hpo_terms.values(),
                     label="Loading hpo terms",
                     length=nr_terms) as bar:

        for hpo_info in bar:
            hpo_bulk.append(build_hpo_term(hpo_info))

            # Flush inside the loop so the batch never grows far past the
            # limit; checking after the loop would load everything at once
            if len(hpo_bulk) > bulk_limit:
                adapter.load_hpo_bulk(hpo_bulk)
                # Rebind instead of clearing, the adapter was handed the old list
                hpo_bulk = []

    # Load whatever is left in the last, partially filled batch
    if hpo_bulk:
        adapter.load_hpo_bulk(hpo_bulk)

    LOG.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    LOG.info("Time to load terms: {0}".format(datetime.now() - start_time))
def test_parse_hpo_terms(hpo_terms_handle):
    """Check that every term parsed from the handle has a truthy 'hpo_id'."""
    for parsed_term in parse_hpo_obo(hpo_terms_handle):
        assert parsed_term['hpo_id']
예제 #5
0
def test_parse_hpo_terms(hpo_terms_handle):
    """Check that every term parsed from the handle has a truthy 'hpo_id'."""
    for parsed_term in parse_hpo_obo(hpo_terms_handle):
        assert parsed_term['hpo_id']
예제 #6
0
def load_hpo_terms(adapter, hpo_lines=None, hpo_gene_lines=None, alias_genes=None):
    """Load the hpo terms into the database

    Parse the hpo lines, attach the hgnc ids of the associated genes to each
    term, build the term objects and add them to the database in batches.

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): Lines with hpo terms in obo format. Fetched
            with fetch_hpo_terms() when empty or None
        hpo_gene_lines(iterable(str)): Lines that map hpo terms to genes.
            Fetched with fetch_hpo_to_genes() when empty or None
        alias_genes(dict): Maps a hgnc symbol to gene info that holds the
            hgnc id under the key 'true'. Fetched with
            adapter.genes_by_alias() when empty or None
    """
    # A batch is written to the database as soon as it grows past this size
    bulk_limit = 10000

    # Store the hpo terms, keyed on hpo id
    hpo_terms = {}

    # Fetch the hpo terms if no file
    if not hpo_lines:
        hpo_lines = fetch_hpo_terms()

    # Fetch the hpo gene information if no file
    if not hpo_gene_lines:
        hpo_gene_lines = fetch_hpo_to_genes()

    # Parse the terms
    # This will yield dictionaries with information about the terms
    LOG.info("Parsing hpo terms")
    for term in parse_hpo_obo(hpo_lines):
        hpo_terms[term['hpo_id']] = term

    # Get a map with hgnc symbols to hgnc ids from scout
    if not alias_genes:
        alias_genes = adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for hpo_to_symbol in parse_hpo_to_genes(hpo_gene_lines):
        hgnc_symbol = hpo_to_symbol['hgnc_symbol']
        hpo_id = hpo_to_symbol['hpo_id']

        # Fetch gene info to get correct hgnc id, skip symbols that are unknown
        gene_info = alias_genes.get(hgnc_symbol)
        if not gene_info:
            continue

        hgnc_id = gene_info['true']

        # Skip gene links that point to a term that was not parsed above
        if hpo_id not in hpo_terms:
            continue

        # Create the gene set the first time a term gets a gene
        hpo_terms[hpo_id].setdefault('genes', set()).add(hgnc_id)

    start_time = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(hpo_terms)
    hpo_bulk = []
    with progressbar(hpo_terms.values(), label="Loading hpo terms", length=nr_terms) as bar:

        for hpo_info in bar:
            hpo_bulk.append(build_hpo_term(hpo_info))

            # Flush inside the loop so the batch never grows far past the
            # limit; checking after the loop would load everything at once
            if len(hpo_bulk) > bulk_limit:
                adapter.load_hpo_bulk(hpo_bulk)
                # Rebind instead of clearing, the adapter was handed the old list
                hpo_bulk = []

    # Load whatever is left in the last, partially filled batch
    if hpo_bulk:
        adapter.load_hpo_bulk(hpo_bulk)

    LOG.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    LOG.info("Time to load terms: {0}".format(datetime.now() - start_time))