Пример #1
0
    def get_canonical_ensembl_names(taxon_id):
        """
        Collects the identifiers listed under one database prefix in the
        dbXrefs column of meta.gene_info.

        Input:
            taxon_id    int; taxon passed on to meta.gene_info. Taxa
                        listed in the prefix table below use their own
                        database prefix, every other taxon uses
                        'Ensembl:'

        Output:
            canonical_ensembl_names     set of dbXrefs entries that
                                        start with the prefix, with the
                                        prefix removed
        """
        prefixes = {
            7227: 'FLYBASE:',
            6239: 'WormBase:',
            3702: 'Araport:',
            # NOTE(review): unlike the entries above, the next two
            # prefixes have no trailing ':'. Whatever follows the bare
            # database name (presumably a ':') therefore stays at the
            # start of the returned identifiers -- confirm this is
            # intended before changing it.
            511145: 'EcoGene',
            559292: 'SGD',
        }
        beg = prefixes.get(taxon_id, 'Ensembl:')

        gene_info = meta.gene_info(taxon_id, usecols=['gene_ncbi', 'dbXrefs'])
        # Raw string: the separator is the two characters backslash + pipe,
        # the same value the non-raw literal '\|' evaluates to.
        gene_info = utils.split_text_to_multiple_rows(gene_info, 'dbXrefs',
                                                      r'\|')
        f = gene_info['dbXrefs'].str.startswith(beg)
        gene_info = gene_info.loc[f, :].copy()

        # Every remaining entry starts with `beg`, so dropping the first
        # len(beg) characters strips the prefix. This does not depend on
        # whether Series.str.replace treats its pattern as a regex, which
        # differs between pandas versions.
        gene_info['dbXrefs'] = gene_info['dbXrefs'].str.slice(len(beg))

        gene_info = gene_info.drop_duplicates()
        canonical_ensembl_names = set(gene_info['dbXrefs'].values)
        return canonical_ensembl_names
def _get_gene_ncbi_2_ensembl():
    """
    Builds a table pairing gene_ncbi with gene_ensembl for taxon 9606.

    The Ensembl identifier is the 'ENSG' + digits part of an
    'Ensembl:ENSG...' entry found in the dbXrefs column of
    meta.gene_info. Any gene_ensembl value that occurs in more than one
    row is removed entirely, and only genes returned by get_ref_genes()
    are kept.

    Output:
        DataFrame with the columns gene_ncbi and gene_ensembl
    """

    human_genes = meta.gene_info(taxon_id=9606)

    # rows whose cross references mention an Ensembl gene
    has_ensembl = human_genes['dbXrefs'].str.contains('Ensembl:ENSG[0-9]*')
    ensembl_ids = human_genes.loc[has_ensembl, 'dbXrefs'].str.extract(
        'Ensembl:(ENSG[0-9]*)', expand=False)
    human_genes.loc[has_ensembl, 'gene_ensembl'] = ensembl_ids

    mapping = human_genes[['gene_ncbi', 'gene_ensembl']].drop_duplicates()
    # keep=False discards every row of an ambiguous gene_ensembl value
    mapping = mapping.drop_duplicates('gene_ensembl', keep=False)

    in_reference = mapping['gene_ncbi'].isin(get_ref_genes())
    return mapping[in_reference]
def reference_genes(taxon_id, ref_code):
    """
    Returns the genes of a taxon that pass the filters named in ref_code.

    Input:
        taxon_id    int
        ref_code    str;  each of these letters, if present, adds a
                          filter (applied in the order listed)
                        l   -> at least one medline paper
                        o   -> official nomenclature required
                        p   -> protein-coding only
                        r   -> at least one research paper

    Output:
        ref_genes   sorted list of gene identifiers

    Raises:
        EnvironmentError    if meta.gene_info has no rows for the taxon,
                            or if a filter leaves no gene
    """

    def fail_if_empty(frame, message):
        # Shared check run after every filter step.
        if frame.shape[0] == 0:
            raise EnvironmentError(message)

    genes = meta.gene_info(taxon_id)

    # Kept inline so that int(taxon_id) is only evaluated when the
    # message is actually needed.
    if genes.shape[0] == 0:
        raise EnvironmentError(
            "\n            Did not find gene info for taxon {}".format(
                int(taxon_id)))

    if 'l' in ref_code:
        with_paper = medline.gene2pubmed(taxon_id, ['gene_ncbi'])
        keep = genes.loc[:, 'gene_ncbi'].isin(with_paper['gene_ncbi'])
        genes = genes.loc[keep, :]
        fail_if_empty(
            genes,
            "\n"
            "                After filtering for genes with at least one paper,\n"
            "                no gene is left.")

    if 'o' in ref_code:  # official nomenclature
        keep = genes.loc[:, 'Nomenclature_status'] == 'O'
        genes = genes.loc[keep, :]
        fail_if_empty(
            genes,
            "\n"
            "                After filtering for genes with official nomeclature,\n"
            "                no gene is left.")

    if 'p' in ref_code:  # protein-coding
        keep = genes.loc[:, 'type_of_gene'] == 'protein-coding'
        genes = genes.loc[keep, :]
        fail_if_empty(
            genes,
            "\n"
            "                After filtering for protein-coding, no gene is\n"
            "                left.")

    if 'r' in ref_code:  # research papers only
        with_research = medline.gene2pubmed(
            taxon_id,
            ['pubmed_id', 'gene_ncbi'],
            paper_kind='research')
        keep = genes.loc[:, 'gene_ncbi'].isin(with_research['gene_ncbi'])
        genes = genes.loc[keep, :]
        fail_if_empty(
            genes,
            "\n"
            "                After filtering for genes with at least one research paper,\n"
            "                no gene is left.")

    return sorted(genes.loc[:, 'gene_ncbi'].values)
def any_gwas():
    """
    Summarizes, per gene and trait, how often a gene is reported in the
    records returned by gwas_studies.ebi_gwas().

    Only records whose MAPPED_GENE contains none of ';', ',' or '-' are
    used; they are matched to genes through symbol_ncbi of
    meta.gene_info (taxon 9606) and restricted to get_ref_genes().

    Output:
        cl      boolean DataFrame, dd > 0, with an extra column
                'any_gwas' that is True if any trait column is True
        dd      DataFrame indexed by ge with one column
                'gwas_any_<trait>' per trait; each value is the number
                of (study, trait, gene) records divided by the number of
                studies of that trait, 0 where there is no record
        ge      sorted list of the gene_ncbi values left after
                restricting to get_ref_genes()
    """

    catalog = gwas_studies.ebi_gwas()

    # `== True` turns a NaN result of contains() into False, so
    # records with such a result are not flagged here.
    multiple_genes = catalog['MAPPED_GENE'].str.contains('[;,-]') == True

    wanted_columns = [
        'MAPPED_GENE', 'DISEASE/TRAIT', 'PVALUE_MLOG', 'pubmed_id']
    new_names = {
        'MAPPED_GENE': 'symbol_ambiguous',
        'DISEASE/TRAIT': 'trait',
        'PVALUE_MLOG': 'log_pvalue'
    }
    records = catalog.loc[~multiple_genes, wanted_columns].rename(
        columns=new_names)

    # translate gene symbols to gene_ncbi; unmatched symbols are dropped
    symbols = meta.gene_info(
        taxon_id=9606, usecols=['symbol_ncbi', 'gene_ncbi'])
    records = pd.merge(
        records,
        symbols,
        left_on='symbol_ambiguous',
        right_on='symbol_ncbi',
        how='inner')
    records = records.drop('symbol_ambiguous', axis=1)
    records = records.drop('symbol_ncbi', axis=1)

    records = records[records['gene_ncbi'].isin(get_ref_genes())]

    ge = sorted(records['gene_ncbi'].unique())

    # per (trait, study, gene) keep the record with the largest log_pvalue
    records = records.sort_values('log_pvalue', ascending=False)
    records = records.drop_duplicates(
        ['trait', 'pubmed_id', 'gene_ncbi'],
        keep='first')

    # number of distinct studies per trait
    study_trait_pairs = records[['pubmed_id', 'trait']].drop_duplicates()
    studies_per_phenotype = study_trait_pairs['trait'].value_counts()

    required_studies = 1
    enough_studies = studies_per_phenotype >= required_studies
    accepted_traits = studies_per_phenotype[enough_studies].index

    important_gwas = records.loc[records['trait'].isin(accepted_traits), :]
    important_gwas = important_gwas[
        ['pubmed_id', 'trait', 'gene_ncbi']].drop_duplicates()

    records_per_pair = important_gwas.groupby(['trait', 'gene_ncbi']).size()
    records_per_pair = records_per_pair.reset_index().rename(
        columns={0: 'records'})

    studies_table = studies_per_phenotype.to_frame('studies').reset_index()
    studies_table = studies_table.rename(columns={'index': 'trait'})

    he = pd.merge(records_per_pair, studies_table)
    he.loc[:, 'fraction_of_any_gwas_studies'] = he['records'] / he['studies']

    dd = he.pivot(
        index='gene_ncbi',
        columns='trait',
        values='fraction_of_any_gwas_studies')
    dd.columns = ['gwas_any_{}'.format(trait) for trait in dd.columns]

    # one row per gene in ge; genes without any record get 0
    dd = dd.reindex(ge).fillna(0)

    cl = dd > 0.0
    cl.loc[:, 'any_gwas'] = cl.any(axis=1)

    return cl, dd, ge