示例#1
0
def test_build_hpo_term_missing_key(adapter, test_hpo_info, key):
    """Expect build_hpo_term() to raise KeyError when ``key`` has been removed."""
    ## GIVEN a dictionary with hpo information where one key has been removed
    del test_hpo_info[key]

    ## WHEN building the hpo term from the incomplete dictionary
    ## THEN a KeyError is raised
    with pytest.raises(KeyError):
        build_hpo_term(test_hpo_info)
示例#2
0
def test_build_hpo_term_with_genes(adapter):
    """Expect both gene symbols of the hpo term to end up as hgnc ids."""
    ## GIVEN hpo information with gene symbols and an alias map that
    ## contains both of the symbols
    hpo_info = {
        'hpo_id': "HP:0000878",
        'description': "11 pairs of ribs",
        'hgnc_symbols': ['B3GALT6', 'RBBP8'],
    }
    alias_genes = {
        'B3GALT6': {
            'true': 17978,
            'ids': [17978],
        },
        'RBBP8': {
            'true': 9891,
            'ids': [9891],
        },
    }

    ## WHEN building the hpo term
    hpo_obj = build_hpo_term(hpo_info, alias_genes)

    ## THEN assert that the term has the correct information
    assert hpo_obj['_id'] == hpo_obj['hpo_id'] == hpo_info['hpo_id']
    ## Both symbols exist in the alias map so we expect two genes
    assert len(hpo_obj['genes']) == 2
    assert set(hpo_obj['genes']) == {17978, 9891}
示例#3
0
def test_build_hpo_term(adapter, test_hpo_info):
    """Check id, description and number of genes of a built hpo term."""
    ## GIVEN hpo information provided by the fixture

    ## WHEN the hpo term is built from that information
    built_term = build_hpo_term(test_hpo_info)

    ## THEN the id is stored under both '_id' and 'hpo_id'
    assert built_term["_id"] == built_term["hpo_id"]
    assert built_term["hpo_id"] == test_hpo_info["hpo_id"]
    ## AND the description is carried over
    assert built_term["description"] == test_hpo_info["description"]
    ## AND the term holds two genes
    assert len(built_term["genes"]) == 2
示例#4
0
def test_build_hpo_term(adapter):
    """Check id, description and number of genes of a built hpo term."""
    ## GIVEN information about a hpo term with two genes
    term_info = dict(
        hpo_id="HP:0000878",
        description="11 pairs of ribs",
        genes=[1, 2],
    )

    ## WHEN the hpo term is built from that information
    built_term = build_hpo_term(term_info)

    ## THEN the id is stored under both "_id" and "hpo_id"
    assert built_term["_id"] == built_term["hpo_id"]
    assert built_term["hpo_id"] == term_info["hpo_id"]
    ## AND the description is carried over
    assert built_term["description"] == term_info["description"]
    ## AND the term holds two genes
    assert len(built_term["genes"]) == 2
示例#5
0
def test_build_hpo_term_non_existing_genes(adapter):
    """Build a hpo term from gene ids and check the resulting object."""
    ## GIVEN information about a hpo term that lists two gene ids
    gene_ids = [1, 2]
    term_info = {
        'hpo_id': "HP:0000878",
        'description': "11 pairs of ribs",
        'genes': gene_ids,
    }

    ## WHEN the hpo term is built from that information
    result = build_hpo_term(term_info)

    ## THEN the id is stored under both '_id' and 'hpo_id'
    assert result['_id'] == result['hpo_id']
    assert result['hpo_id'] == term_info['hpo_id']
    ## AND the description is carried over
    assert result['description'] == term_info['description']
    ## AND the term holds two genes
    assert len(result['genes']) == 2
示例#6
0
def test_build_hpo_term(adapter):
    """Check id, description and number of genes of a built hpo term."""
    ## GIVEN information about a hpo term with two genes
    info = {}
    info['hpo_id'] = "HP:0000878"
    info['description'] = "11 pairs of ribs"
    info['genes'] = [1, 2]

    ## WHEN the hpo term is built from that information
    term = build_hpo_term(info)

    ## THEN the id is stored under both '_id' and 'hpo_id'
    assert term['_id'] == term['hpo_id']
    assert term['hpo_id'] == info['hpo_id']
    ## AND the description is carried over
    assert term['description'] == info['description']
    ## AND the term holds two genes
    assert len(term['genes']) == 2
示例#7
0
def test_build_hpo_term_non_existing_genes(adapter):
    """Expect no genes on the term when the alias map is empty."""
    ## GIVEN hpo information with gene symbols and an empty alias map
    symbols = ['B3GALT6', 'RBBP8']
    term_info = {
        'hpo_id': "HP:0000878",
        'description': "11 pairs of ribs",
        'hgnc_symbols': symbols,
    }
    no_aliases = {}

    ## WHEN the hpo term is built
    built_term = build_hpo_term(term_info, no_aliases)

    ## THEN the id is stored under both '_id' and 'hpo_id'
    assert built_term['_id'] == built_term['hpo_id']
    assert built_term['hpo_id'] == term_info['hpo_id']
    ## AND the description is carried over
    assert built_term['description'] == term_info['description']
    ## AND no genes are added since the alias map knows none of the symbols
    assert len(built_term['genes']) == 0
示例#8
0
def load_hpo_terms(adapter, hpo_lines, genes):
    """Load the hpo terms into the database

    Parse the hpo lines, build the objects and add them to the database

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str))
        genes: passed on unchanged to build_hpo_term() for every term
    """
    hpo_terms = parse_hpo_phenotypes(hpo_lines)

    start_time = datetime.now()

    logger.info("Loading the hpo terms...")
    # Defined up front so the summary below also works when no terms were parsed
    nr_terms = 0
    # Count from 1 so nr_terms ends up as the number of loaded terms
    # instead of the index of the last one
    for nr_terms, hpo_id in enumerate(hpo_terms, 1):
        hpo_info = hpo_terms[hpo_id]
        hpo_obj = build_hpo_term(hpo_info, genes)

        adapter.load_hpo_term(hpo_obj)

    logger.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    logger.info("Time to load terms: {0}".format(datetime.now() - start_time))
示例#9
0
文件: hpo.py 项目: hassanfa/scout
def load_hpo_terms(adapter,
                   hpo_lines=None,
                   hpo_gene_lines=None,
                   alias_genes=None):
    """Load the hpo terms into the database

    Parse the hpo lines, add gene information to the parsed terms, build the
    objects and add them to the database in bulks

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): fetched with fetch_hpo_terms() if not given
        hpo_gene_lines(iterable(str)): fetched with fetch_hpo_to_genes() if
            not given
        alias_genes: mapping from hgnc symbol to gene information where the
            hgnc id is stored under the key 'true'. Fetched with
            adapter.genes_by_alias() if not given
    """
    # Fetch the hpo terms if no file
    if not hpo_lines:
        hpo_lines = fetch_hpo_terms()

    # Fetch the hpo gene information if no file
    if not hpo_gene_lines:
        hpo_gene_lines = fetch_hpo_to_genes()

    # Parse the terms and store them by hpo id
    LOG.info("Parsing hpo terms")
    hpo_terms = {}
    for term in parse_hpo_obo(hpo_lines):
        hpo_terms[term['hpo_id']] = term

    # Get a map with hgnc symbols to hgnc ids from scout
    if not alias_genes:
        alias_genes = adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for hpo_to_symbol in parse_hpo_to_genes(hpo_gene_lines):
        hgnc_symbol = hpo_to_symbol['hgnc_symbol']
        hpo_id = hpo_to_symbol['hpo_id']

        # Skip symbols that are missing from the alias map
        gene_info = alias_genes.get(hgnc_symbol)
        if not gene_info:
            continue

        hgnc_id = gene_info['true']

        # Skip gene links that point to a term that was not parsed
        if hpo_id not in hpo_terms:
            continue

        hpo_term = hpo_terms[hpo_id]

        if 'genes' not in hpo_term:
            hpo_term['genes'] = set()

        hpo_term['genes'].add(hgnc_id)

    start_time = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(hpo_terms)
    hpo_bulk = []
    with progressbar(hpo_terms.values(),
                     label="Loading hpo terms",
                     length=nr_terms) as bar:

        for hpo_info in bar:
            hpo_bulk.append(build_hpo_term(hpo_info))

            # The size check has to run for every term; when it is placed
            # after the loop all terms end up in one single bulk
            if len(hpo_bulk) > 10000:
                adapter.load_hpo_bulk(hpo_bulk)
                hpo_bulk = []

    # Load the terms that are left after the last full bulk
    if hpo_bulk:
        adapter.load_hpo_bulk(hpo_bulk)

    LOG.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    LOG.info("Time to load terms: {0}".format(datetime.now() - start_time))
示例#10
0
def load_hpo_terms(adapter,
                   hpo_lines=None,
                   hpo_gene_lines=None,
                   alias_genes=None):
    """Load the hpo terms into the database

    Parse the hpo lines, add gene information to the parsed terms, build the
    objects and add them to the database in bulks

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): lines from file http://purl.obolibrary.org/obo/hp.obo
            Fetched with fetch_hpo_terms() if not given
        hpo_gene_lines(iterable(str)): lines from file
            https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt
            Fetched with fetch_hpo_to_genes_to_disease() if not given
        alias_genes: mapping from hgnc symbol to gene information where the
            hgnc id is stored under the key "true". Fetched with
            adapter.genes_by_alias() if not given
    """
    # Fetch the hpo terms if no file
    if not hpo_lines:
        hpo_lines = fetch_hpo_terms()

    # Parse the terms
    LOG.info("Parsing hpo terms")
    hpo_terms = build_hpo_tree(hpo_lines)

    # Fetch the hpo gene information if no file
    if not hpo_gene_lines:
        hpo_gene_lines = fetch_hpo_to_genes_to_disease()

    # Get a map with hgnc symbols to hgnc ids from scout
    if not alias_genes:
        alias_genes = adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for hpo_to_symbol in parse_hpo_to_genes(hpo_gene_lines):
        hgnc_symbol = hpo_to_symbol["hgnc_symbol"]
        hpo_id = hpo_to_symbol["hpo_id"]

        # Skip symbols that are missing from the alias map
        gene_info = alias_genes.get(hgnc_symbol)
        if not gene_info:
            continue

        hgnc_id = gene_info["true"]

        # Skip gene links that point to a term that was not parsed
        if hpo_id not in hpo_terms:
            continue

        hpo_term = hpo_terms[hpo_id]

        if "genes" not in hpo_term:
            hpo_term["genes"] = set()

        hpo_term["genes"].add(hgnc_id)

    start_time = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(hpo_terms)
    hpo_bulk = []
    with progressbar(hpo_terms.values(),
                     label="Loading hpo terms",
                     length=nr_terms) as bar:

        for hpo_info in bar:
            hpo_bulk.append(build_hpo_term(hpo_info))

            # The size check has to run for every term; when it is placed
            # after the loop all terms end up in one single bulk
            if len(hpo_bulk) > 10000:
                adapter.load_hpo_bulk(hpo_bulk)
                hpo_bulk = []

    # Load the terms that are left after the last full bulk
    if hpo_bulk:
        adapter.load_hpo_bulk(hpo_bulk)

    LOG.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    LOG.info("Time to load terms: {0}".format(datetime.now() - start_time))
示例#11
0
def load_hpo_terms(adapter, hpo_lines=None, hpo_gene_lines=None, alias_genes=None):
    """Load the hpo terms into the database

    Parse the hpo lines, add gene information to the parsed terms, build the
    objects and add them to the database in bulks

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): fetched with fetch_hpo_terms() if not given
        hpo_gene_lines(iterable(str)): fetched with fetch_hpo_to_genes() if
            not given
        alias_genes: mapping from hgnc symbol to gene information where the
            hgnc id is stored under the key 'true'. Fetched with
            adapter.genes_by_alias() if not given
    """
    # Fetch the hpo terms if no file
    if not hpo_lines:
        hpo_lines = fetch_hpo_terms()

    # Fetch the hpo gene information if no file
    if not hpo_gene_lines:
        hpo_gene_lines = fetch_hpo_to_genes()

    # Parse the terms and store them by hpo id
    LOG.info("Parsing hpo terms")
    hpo_terms = {}
    for term in parse_hpo_obo(hpo_lines):
        hpo_terms[term['hpo_id']] = term

    # Get a map with hgnc symbols to hgnc ids from scout
    if not alias_genes:
        alias_genes = adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for hpo_to_symbol in parse_hpo_to_genes(hpo_gene_lines):
        hgnc_symbol = hpo_to_symbol['hgnc_symbol']
        hpo_id = hpo_to_symbol['hpo_id']

        # Skip symbols that are missing from the alias map
        gene_info = alias_genes.get(hgnc_symbol)
        if not gene_info:
            continue

        hgnc_id = gene_info['true']

        # Skip gene links that point to a term that was not parsed
        if hpo_id not in hpo_terms:
            continue

        hpo_term = hpo_terms[hpo_id]

        if 'genes' not in hpo_term:
            hpo_term['genes'] = set()

        hpo_term['genes'].add(hgnc_id)

    start_time = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(hpo_terms)
    hpo_bulk = []
    with progressbar(hpo_terms.values(), label="Loading hpo terms", length=nr_terms) as bar:

        for hpo_info in bar:
            hpo_bulk.append(build_hpo_term(hpo_info))

            # The size check has to run for every term; when it is placed
            # after the loop all terms end up in one single bulk
            if len(hpo_bulk) > 10000:
                adapter.load_hpo_bulk(hpo_bulk)
                hpo_bulk = []

    # Load the terms that are left after the last full bulk
    if hpo_bulk:
        adapter.load_hpo_bulk(hpo_bulk)

    LOG.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    LOG.info("Time to load terms: {0}".format(datetime.now() - start_time))