예제 #1
0
def generate_exac_genes(genes):
    """Yield a reduced set of ExAC constraint lines.

    The lines are obtained from ``fetch_exac_constraint()``. The first line
    is always passed through unchanged (presumably the header line -- confirm
    against the fetcher). After that, only entries whose parsed
    ``hgnc_symbol`` is present in ``genes`` are emitted.

    Args:
        genes(dict): A dictionary with hgnc_symbol as key and hgnc_id as
            value. Only membership of the symbol is checked here.

    Yields:
        print_line(str): Lines from the reduced file
    """
    lines = fetch_exac_constraint()

    # Always emit the first line before any filtering happens
    header = lines[0]
    yield header

    for parsed in parse_exac_genes(lines):
        symbol = parsed.get("hgnc_symbol")
        # Entries without a symbol can never match a requested gene
        if symbol and symbol in genes:
            yield parsed["raw"]
예제 #2
0
def generate_exac_genes(genes):
    """Filter the ExAC constraint lines down to the requested genes.

    Fetches the lines with ``fetch_exac_constraint()``, yields the first
    line as is, and then yields the ``raw`` value of every parsed entry
    whose ``hgnc_symbol`` is found in ``genes``.

    Args:
        genes(dict): A dictionary with hgnc_symbol as key and hgnc_id as
            value. Used for membership tests only.

    Yields:
        print_line(str): Lines from the reduced file
    """
    constraint_lines = fetch_exac_constraint()

    # The first line is emitted unconditionally
    yield constraint_lines[0]

    for record in parse_exac_genes(constraint_lines):
        symbol = record.get('hgnc_symbol')
        # Skip entries that lack a symbol or are not among the wanted genes
        if not symbol or symbol not in genes:
            continue
        yield record['raw']
예제 #3
0
def add_exac_info(genes, alias_genes, exac_lines):
    """Add the ExAC pLI score to the genes.

    Only the gene level pLI score is taken from ExAC. ExAC identifies genes
    by HGNC symbol only, so every symbol is upper-cased and resolved through
    ``get_correct_ids`` using the alias mapping. Each id returned gets the
    score stored under ``pli_score``; ``genes`` is updated in place and
    nothing is returned.

    Args:
        genes(dict): Dictionary with all genes, indexed by the ids that
            ``get_correct_ids`` returns
        alias_genes(dict): Genes mapped to all aliases
        exac_lines(iterable): Iterable with raw exac info

    """
    LOG.info("Add exac pli scores")
    for entry in parse_exac_genes(exac_lines):
        # Normalise the symbol before the alias lookup
        symbol = entry["hgnc_symbol"].upper()
        score = entry["pli_score"]

        matching_ids = get_correct_ids(symbol, alias_genes)
        for gene_id in matching_ids:
            genes[gene_id]["pli_score"] = score
예제 #4
0
def add_exac_info(genes, alias_genes, exac_lines):
    """Annotate genes with the pLI score found in the ExAC lines.

    Currently the pLI score is the only ExAC value that is added. The ExAC
    resource refers to genes by HGNC symbol, so the upper-cased symbol is
    translated to ids with ``get_correct_ids`` and the alias mapping. The
    score is written to ``pli_score`` on every resolved gene; the ``genes``
    dictionary is modified in place.

    Args:
        genes(dict): Dictionary with all genes, indexed by the ids that
            ``get_correct_ids`` returns
        alias_genes(dict): Genes mapped to all aliases
        exac_lines(iterable): Iterable with raw exac info

    """
    LOG.info("Add exac pli scores")
    for exac_entry in parse_exac_genes(exac_lines):
        upper_symbol = exac_entry['hgnc_symbol'].upper()
        pli = exac_entry['pli_score']

        # One symbol may resolve to several ids; all of them get the score
        for resolved_id in get_correct_ids(upper_symbol, alias_genes):
            gene = genes[resolved_id]
            gene['pli_score'] = pli
예제 #5
0
def link_genes(ensembl_lines, hgnc_lines, exac_lines, mim2gene_lines,
               genemap_lines, hpo_lines):
    """Gather information from different sources and return a gene dict

    HGNC is the primary source: every gene produced by ``parse_hgnc_genes``
    becomes one entry in the result, keyed on its ``hgnc_id``. The other
    sources only add information to those entries:

    - Ensembl: transcripts are grouped both by ensembl gene id and by hgnc
      symbol. Each hgnc gene is matched on its ensembl gene id first and on
      its symbol second; ``add_ensembl_info`` is called for the first match.
    - ExAC: the pLI score (``pli_score``), linked via hgnc symbol. Symbols
      are unstable since they often change, so the link goes through the
      alias mapping built by ``genes_by_alias``.
    - OMIM: ``omim_id``, ``inheritance_models`` and ``phenotypes``.
    - HPO: the ``incomplete_penetrance`` flag.

    For the symbol linked sources: if the alias mapping knows the true id
    of a symbol, the values are written to that gene unconditionally.
    Otherwise every candidate id is visited and a value is only written when
    the gene does not have one yet.

    Args:
        ensembl_lines(iterable(str))
        hgnc_lines(iterable(str))
        exac_lines(iterable(str))
        mim2gene_lines: Raw mim2gene input, handed to ``get_mim_genes``
        genemap_lines: Raw genemap input, handed to ``get_mim_genes``
        hpo_lines: Raw hpo input, handed to
            ``get_incomplete_penetrance_genes``

    Returns:
        genes(dict): Gene information dictionaries with hgnc_id as key
    """
    genes = {}
    log.info("Linking genes and transcripts")
    # HGNC genes are the main source, these define the gene dataset to use
    # Try to use as much information as possible from hgnc
    for hgnc_gene in parse_hgnc_genes(hgnc_lines):
        hgnc_gene['transcripts'] = []
        genes[hgnc_gene['hgnc_id']] = hgnc_gene

    symbol_to_id = genes_by_alias(genes)

    # Group the ensembl transcripts both by symbol and by ensembl gene id
    all_genes = {'ensembl': {}, 'symbol': {}}
    for transcript in parse_ensembl_transcripts(ensembl_lines):
        ensg_symbol = transcript['hgnc_symbol']
        ensgid = transcript['ensembl_gene_id']
        for id_type, gene_id in [('symbol', ensg_symbol), ('ensembl', ensgid)]:
            all_genes[id_type].setdefault(gene_id, []).append(transcript)

    log.info("Add ensembl info")
    # Add gene coordinates and transcript info for hgnc genes:
    for gene_info in genes.values():
        ensgid = gene_info['ensembl_gene_id']
        ensg_symbol = gene_info['hgnc_symbol']

        # Prefer the ensembl gene id, fall back to the symbol. Only the
        # first identifier that matches is used.
        for id_type, gene_id in [('ensembl', ensgid), ('symbol', ensg_symbol)]:
            if gene_id and gene_id in all_genes[id_type]:
                add_ensembl_info(gene_info, all_genes[id_type][gene_id])
                break

    log.info("Add exac pli scores")
    for exac_gene in parse_exac_genes(exac_lines):
        hgnc_symbol = exac_gene['hgnc_symbol'].upper()
        pli_score = exac_gene['pli_score']

        for gene_info, is_true_id in _genes_for_symbol(
                hgnc_symbol, symbol_to_id, genes):
            # Any falsy existing score (missing, None, 0) is overwritten
            if is_true_id or not gene_info.get('pli_score'):
                gene_info['pli_score'] = pli_score

    log.info("Add omim info")
    omim_genes = get_mim_genes(genemap_lines, mim2gene_lines)
    for hgnc_symbol in omim_genes:
        omim_info = omim_genes[hgnc_symbol]
        inheritance = omim_info.get('inheritance', set())

        for gene_info, is_true_id in _genes_for_symbol(
                hgnc_symbol, symbol_to_id, genes):
            # With a true id the omim values always replace what hgnc gave
            # us, otherwise they only fill in what is still empty
            if is_true_id or not gene_info.get('omim_id'):
                gene_info['omim_id'] = omim_info['mim_number']
            if is_true_id or not gene_info.get('inheritance_models'):
                gene_info['inheritance_models'] = list(inheritance)
            if is_true_id or not gene_info.get('phenotypes'):
                gene_info['phenotypes'] = omim_info.get('phenotypes', [])

    log.info("Add incomplete penetrance info")
    for hgnc_symbol in get_incomplete_penetrance_genes(hpo_lines):
        for gene_info, is_true_id in _genes_for_symbol(
                hgnc_symbol, symbol_to_id, genes):
            if is_true_id or 'incomplete_penetrance' not in gene_info:
                gene_info['incomplete_penetrance'] = True

    return genes


def _genes_for_symbol(hgnc_symbol, symbol_to_id, genes):
    """Yield the genes that a hgnc symbol may refer to

    Args:
        hgnc_symbol(str): The symbol to look up
        symbol_to_id(dict): Alias mapping; each value is indexed with
            'true_id' and 'ids'
        genes(dict): Gene information dictionaries with hgnc_id as key

    Yields:
        tuple: (gene_info, is_true_id). If the mapping has a truthy
            'true_id' only that gene is yielded, with is_true_id True.
            Otherwise one pair per id in 'ids' is yielded, with is_true_id
            False. Nothing is yielded for an unknown symbol.
    """
    if hgnc_symbol not in symbol_to_id:
        return

    hgnc_id_info = symbol_to_id[hgnc_symbol]
    true_id = hgnc_id_info['true_id']

    # If we have the true id we know it is correct
    if true_id:
        yield genes[true_id], True
        return

    # Otherwise every candidate id has to be considered
    for hgnc_id in hgnc_id_info['ids']:
        yield genes[hgnc_id], False
예제 #6
0
def exac_genes(request, exac_handle):
    """Return the result of parsing ``exac_handle`` with ``parse_exac_genes``.

    An empty line is printed before parsing. ``request`` is accepted but not
    used here (presumably the pytest fixture request object -- confirm
    against the decorator in the original module).
    """
    print('')
    parsed_genes = parse_exac_genes(exac_handle)
    return parsed_genes
예제 #7
0
def test_parse_exac_genes(exac_handle):
    """Check that every parsed exac gene has a truthy ``hgnc_symbol``."""
    for parsed_gene in parse_exac_genes(exac_handle):
        assert parsed_gene["hgnc_symbol"]
def test_parse_exac_genes(exac_handle):
    """Each gene from ``parse_exac_genes`` must carry a truthy hgnc symbol."""
    parsed = parse_exac_genes(exac_handle)

    # all() stops at the first gene with a falsy symbol, like the plain loop
    assert all(entry['hgnc_symbol'] for entry in parsed)