예제 #1
0
파일: hpo.py 프로젝트: hassanfa/scout
def load_hpo(adapter,
             disease_lines,
             hpo_disease_lines=None,
             hpo_lines=None,
             hpo_gene_lines=None):
    """Populate the database with hpo terms and disease terms

    Every optional line source that is missing (or empty) is replaced by the
    result of the matching ``fetch_*`` helper before anything is loaded.

    Args:
        adapter(MongoAdapter)
        disease_lines(iterable(str)): These are the omim genemap2 information
        hpo_disease_lines(iterable(str)): Defaults to
            fetch_hpo_phenotype_to_terms()
        hpo_lines(iterable(str)): Defaults to fetch_hpo_terms()
        hpo_gene_lines(iterable(str)): Defaults to fetch_hpo_to_genes()
    """
    # The alias -> gene map is built once and handed to both loaders
    alias_genes = adapter.genes_by_alias()

    # Use the supplied lines when given, otherwise fall back to the fetchers
    hpo_lines = hpo_lines or fetch_hpo_terms()
    hpo_gene_lines = hpo_gene_lines or fetch_hpo_to_genes()
    hpo_disease_lines = hpo_disease_lines or fetch_hpo_phenotype_to_terms()

    # Terms go in first, then the diseases
    load_hpo_terms(adapter, hpo_lines, hpo_gene_lines, alias_genes)
    load_disease_terms(adapter, disease_lines, alias_genes, hpo_disease_lines)
예제 #2
0
def load_hpo(adapter, disease_lines, hpo_disease_lines=None, hpo_lines=None, hpo_gene_lines=None):
    """Populate the database with hpo terms and disease terms

    Every optional line source that is missing (or empty) is replaced by the
    result of the matching ``fetch_*`` helper before anything is loaded.

    Args:
        adapter(MongoAdapter)
        disease_lines(iterable(str)): These are the omim genemap2 information
        hpo_disease_lines(iterable(str)): Defaults to
            fetch_hpo_phenotype_to_terms()
        hpo_lines(iterable(str)): Defaults to fetch_hpo_terms()
        hpo_gene_lines(iterable(str)): Defaults to fetch_hpo_to_genes()
    """
    # The alias -> gene map is built once and handed to both loaders
    alias_genes = adapter.genes_by_alias()

    # Use the supplied lines when given, otherwise fall back to the fetchers
    hpo_lines = hpo_lines or fetch_hpo_terms()
    hpo_gene_lines = hpo_gene_lines or fetch_hpo_to_genes()
    hpo_disease_lines = hpo_disease_lines or fetch_hpo_phenotype_to_terms()

    # Terms go in first, then the diseases
    load_hpo_terms(adapter, hpo_lines, hpo_gene_lines, alias_genes)
    load_disease_terms(adapter, disease_lines, alias_genes, hpo_disease_lines)
예제 #3
0
파일: hpo.py 프로젝트: CHRUdeLille/scout
def load_hpo(adapter,
             disease_lines,
             hpo_disease_lines=None,
             hpo_lines=None,
             hpo_gene_lines=None):
    """Populate the database with hpo terms and disease terms

    Every optional line source that is missing (or empty) is replaced by the
    result of the matching ``fetch_*`` helper before anything is loaded.

    Args:
        adapter(MongoAdapter)
        disease_lines(iterable(str))
        hpo_disease_lines(iterable(str)): Defaults to
            fetch_hpo_phenotype_to_terms()
        hpo_lines(iterable(str)): Defaults to fetch_hpo_terms()
        hpo_gene_lines(iterable(str)): Defaults to fetch_hpo_to_genes()
    """
    # The alias -> gene map is built once and handed to both loaders
    alias_genes = adapter.genes_by_alias()

    # Use the supplied lines when given, otherwise fall back to the fetchers
    hpo_lines = hpo_lines or fetch_hpo_terms()
    hpo_gene_lines = hpo_gene_lines or fetch_hpo_to_genes()
    hpo_disease_lines = hpo_disease_lines or fetch_hpo_phenotype_to_terms()

    # Terms go in first, then the diseases
    load_hpo_terms(adapter, hpo_lines, hpo_gene_lines, alias_genes)
    load_disease_terms(adapter, disease_lines, alias_genes, hpo_disease_lines)
예제 #4
0
def get_reduced_hpo_terms(hpo_terms):
    """Return a reduced version of the hpo terms

    The lines from fetch_hpo_terms() are grouped per '[Term]' stanza. The
    lines before the first '[Term]' (the header) are always yielded; a term
    stanza is yielded only if its id is found in hpo_terms.

    Args:
        hpo_terms(set(str)): Set of chosen terms that should be included

    Yields:
        hpo_line: A line with hpo information
    """
    hpo_lines = fetch_hpo_terms()

    # Lines collected for the stanza that is currently being read
    term_lines = []
    # We want to keep the header lines
    keep = True
    # True until the first '[Term]' is seen, so the header is not counted
    # as a kept term
    in_header = True

    nr_terms = 0
    nr_kept = 0

    for line in hpo_lines:

        # When we encounter a new term we yield all lines of the previous term
        if line.startswith('[Term]'):
            nr_terms += 1
            if keep:
                if not in_header:
                    nr_kept += 1
                for hpo_line in term_lines:
                    yield hpo_line

            in_header = False
            keep = False
            term_lines = []

        elif line.startswith('id'):
            # Drop the first four characters; assumes the tag is written
            # as 'id: ' -- TODO confirm against the fetched file
            hpo_id = line[4:]
            if hpo_id in hpo_terms:
                keep = True

        term_lines.append(line)

    # The last stanza is never followed by a '[Term]' line, handle it here
    if keep:
        if not in_header:
            nr_kept += 1
        for hpo_line in term_lines:
            yield hpo_line

    LOG.info("Nr of terms in file %s", nr_terms)
    LOG.info("Nr of terms kept: %s", nr_kept)
예제 #5
0
def get_reduced_hpo_terms(hpo_terms):
    """Return a reduced version of the hpo terms

    The lines from fetch_hpo_terms() are grouped per '[Term]' stanza. The
    lines before the first '[Term]' (the header) are always yielded; a term
    stanza is yielded only if its id is found in hpo_terms.

    Args:
        hpo_terms(set(str)): Set of chosen terms that should be included

    Yields:
        hpo_line: A line with hpo information
    """
    hpo_lines = fetch_hpo_terms()

    # Lines collected for the stanza that is currently being read
    term_lines = []
    # We want to keep the header lines
    keep = True
    # True until the first '[Term]' is seen, so the header is not counted
    # as a kept term
    in_header = True

    nr_terms = 0
    nr_kept = 0

    for line in hpo_lines:

        # When we encounter a new term we yield all lines of the previous term
        if line.startswith('[Term]'):
            nr_terms += 1
            if keep:
                if not in_header:
                    nr_kept += 1
                for hpo_line in term_lines:
                    yield hpo_line

            in_header = False
            keep = False
            term_lines = []

        elif line.startswith('id'):
            # Drop the first four characters; assumes the tag is written
            # as 'id: ' -- TODO confirm against the fetched file
            hpo_id = line[4:]
            if hpo_id in hpo_terms:
                keep = True

        term_lines.append(line)

    # The last stanza is never followed by a '[Term]' line, handle it here
    if keep:
        if not in_header:
            nr_kept += 1
        for hpo_line in term_lines:
            yield hpo_line

    LOG.info("Nr of terms in file %s", nr_terms)
    LOG.info("Nr of terms kept: %s", nr_kept)
예제 #6
0
def hpo(context):
    """
    Refresh the hpo terms stored in the database.

    The existing hpo term collection is dropped, then the terms and their
    gene connections are fetched again and loaded.
    """
    LOG.info("Running scout update hpo")
    adapter = context.obj['adapter']

    # Start from an empty collection
    LOG.info("Dropping HPO terms")
    adapter.hpo_term_collection.drop()
    LOG.debug("HPO terms dropped")

    # First argument: the hpo term lines, second: the hpo to gene lines
    load_hpo_terms(adapter, fetch_hpo_terms(), fetch_hpo_to_genes())
예제 #7
0
파일: hpo.py 프로젝트: hassanfa/scout
def load_hpo_terms(adapter,
                   hpo_lines=None,
                   hpo_gene_lines=None,
                   alias_genes=None):
    """Load the hpo terms into the database

    Parse the hpo lines, attach the genes connected to each term, build the
    objects and add them to the database in bulks.

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): Defaults to fetch_hpo_terms()
        hpo_gene_lines(iterable(str)): Defaults to fetch_hpo_to_genes()
        alias_genes(dict): Map from gene symbol to gene information,
            defaults to adapter.genes_by_alias()
    """

    # Store the hpo terms
    hpo_terms = {}

    # Fetch the hpo terms if no file
    if not hpo_lines:
        hpo_lines = fetch_hpo_terms()

    # Fetch the hpo gene information if no file
    if not hpo_gene_lines:
        hpo_gene_lines = fetch_hpo_to_genes()

    # Parse the terms
    # This will yield dictionaries with information about the terms
    LOG.info("Parsing hpo terms")
    for term in parse_hpo_obo(hpo_lines):
        hpo_terms[term['hpo_id']] = term

    # Get a map with hgnc symbols to hgnc ids from scout
    if not alias_genes:
        alias_genes = adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for hpo_to_symbol in parse_hpo_to_genes(hpo_gene_lines):
        hgnc_symbol = hpo_to_symbol['hgnc_symbol']
        hpo_id = hpo_to_symbol['hpo_id']

        # Fetch gene info to get correct hgnc id
        gene_info = alias_genes.get(hgnc_symbol)
        if not gene_info:
            continue

        hgnc_id = gene_info['true']

        # Skip connections to terms that were not among the parsed terms
        if hpo_id not in hpo_terms:
            continue

        # Create the gene set the first time a term gets a gene
        hpo_terms[hpo_id].setdefault('genes', set()).add(hgnc_id)

    start_time = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(hpo_terms)
    hpo_bulk = []
    with progressbar(hpo_terms.values(),
                     label="Loading hpo terms",
                     length=nr_terms) as bar:

        for hpo_info in bar:
            hpo_bulk.append(build_hpo_term(hpo_info))

            # Flush inside the loop so the buffer is written in bulks
            # instead of growing until every term has been built
            if len(hpo_bulk) > 10000:
                adapter.load_hpo_bulk(hpo_bulk)
                hpo_bulk = []

    # Load whatever is left after the last full bulk
    if hpo_bulk:
        adapter.load_hpo_bulk(hpo_bulk)

    LOG.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    LOG.info("Time to load terms: {0}".format(datetime.now() - start_time))
예제 #8
0
def load_hpo_terms(adapter, hpo_lines=None, hpo_gene_lines=None, alias_genes=None):
    """Load the hpo terms into the database

    Parse the hpo lines, attach the genes connected to each term, build the
    objects and add them to the database in bulks.

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): Defaults to fetch_hpo_terms()
        hpo_gene_lines(iterable(str)): Defaults to fetch_hpo_to_genes()
        alias_genes(dict): Map from gene symbol to gene information,
            defaults to adapter.genes_by_alias()
    """

    # Store the hpo terms
    hpo_terms = {}

    # Fetch the hpo terms if no file
    if not hpo_lines:
        hpo_lines = fetch_hpo_terms()

    # Fetch the hpo gene information if no file
    if not hpo_gene_lines:
        hpo_gene_lines = fetch_hpo_to_genes()

    # Parse the terms
    # This will yield dictionaries with information about the terms
    LOG.info("Parsing hpo terms")
    for term in parse_hpo_obo(hpo_lines):
        hpo_terms[term['hpo_id']] = term

    # Get a map with hgnc symbols to hgnc ids from scout
    if not alias_genes:
        alias_genes = adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for hpo_to_symbol in parse_hpo_to_genes(hpo_gene_lines):
        hgnc_symbol = hpo_to_symbol['hgnc_symbol']
        hpo_id = hpo_to_symbol['hpo_id']

        # Fetch gene info to get correct hgnc id
        gene_info = alias_genes.get(hgnc_symbol)
        if not gene_info:
            continue

        hgnc_id = gene_info['true']

        # Skip connections to terms that were not among the parsed terms
        if hpo_id not in hpo_terms:
            continue

        # Create the gene set the first time a term gets a gene
        hpo_terms[hpo_id].setdefault('genes', set()).add(hgnc_id)

    start_time = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(hpo_terms)
    hpo_bulk = []
    with progressbar(hpo_terms.values(), label="Loading hpo terms", length=nr_terms) as bar:

        for hpo_info in bar:
            hpo_bulk.append(build_hpo_term(hpo_info))

            # Flush inside the loop so the buffer is written in bulks
            # instead of growing until every term has been built
            if len(hpo_bulk) > 10000:
                adapter.load_hpo_bulk(hpo_bulk)
                hpo_bulk = []

    # Load whatever is left after the last full bulk
    if hpo_bulk:
        adapter.load_hpo_bulk(hpo_bulk)

    LOG.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    LOG.info("Time to load terms: {0}".format(datetime.now() - start_time))
예제 #9
0
        all_ancestors = get_all_ancestors(hpo_tree, term, set())

        term['all_ancestors'] = all_ancestors

    return hpo_tree


if __name__ == "__main__":
    # Manual check: build the hpo tree and pretty print its content
    import sys
    from pprint import pprint as pp
    from scout.utils.handle import get_file_handle
    from scout.utils.requests import fetch_hpo_terms

    # Read from the file given as first argument, otherwise fetch the terms
    if len(sys.argv) > 1:
        file_handle = get_file_handle(sys.argv[1])
    else:
        file_handle = fetch_hpo_terms()

    hpo_tree = build_hpo_tree(file_handle)

    # Show one specific term first
    my_term = hpo_tree['HP:0200024']
    pp(my_term)

    # print(get_all_ancestors(hpo_tree, my_term))
    # Then dump every entry of the tree
    for term in hpo_tree:
        pp(hpo_tree[term])

    # phenotypes = parse_hpo_phenotypes(file_handle)
    # for hpo_id in phenotypes:
    #     hpo_term = phenotypes[hpo_id]