Пример #1
0
def make_base(project_base, sub_name, ref_genes, u_features, df_target):
    """
    Make prediction bases and export them as 'features' and 'target';
    Note that it will remove rows with NaN as a feature, or as
    prediction target. The passed dataframes are left unmodified.

    Input:
        project_base    str, name of series of analysis
        sub_name        str, will be used as folder
                        (should be specific to dataset)
        ref_genes       list-like; reference genes that shall be filtered for
                        (reference genes absent from the features are skipped)
        u_features      either a dataframe with features or a dictionary
                        of many dataframes (note: always: index is gene_ncbi)
        df_target       dataframe with target to predict as sole column

    Raises:
        EnvironmentError    if df_target does not have exactly one column,
                            or if the merged table has a duplicated index
    """

    p = inout.get_internal_path(os.path.join(
        project_base,
        sub_name,
        'input'
    ))

    if df_target.shape[1] != 1:
        raise EnvironmentError(
            'df_target must have exactly one column')

    # Relabel a copy, so that the caller's dataframe keeps its column name
    target_frame = df_target.copy()
    target_frame.columns = ['prediction_target']

    if isinstance(u_features, dict):
        df = pd.concat(u_features, join='outer', axis=1)
    else:
        df = u_features

    if isinstance(df.columns, pd.MultiIndex):
        # A shallow copy is sufficient to relabel the columns without
        # altering the columns of a dataframe owned by the caller
        df = df.copy(deep=False)
        df.columns = df.columns.droplevel(level=0)

    # Only request reference genes that are present: recent pandas raises
    # a KeyError if .loc receives a list containing missing labels. The
    # order (and any repetition) given by ref_genes is maintained.
    wanted = pd.Index(list(ref_genes))
    wanted = wanted[wanted.isin(df.index)]
    df = df.loc[wanted, :].dropna()

    master = pd.merge(
        df,
        target_frame,
        left_index=True,
        right_index=True,
        how='inner'
    )

    master = master.dropna()

    if master.index.duplicated().any():
        raise EnvironmentError(
            'At least one index is duplicated')

    features = master.drop('prediction_target', axis=1)
    target = master.loc[:, ['prediction_target']]
    _export(features, p, 'features')
    _export(target, p, 'target')
def load_layout(rotation_degrees=0):
    """
    Load the stored gene coordinates (genes_coordinates.csv) and,
    optionally, rotate them around the origin (0, 0).

    Input:
        rotation_degrees    angle that is handed to ret.rotate for every
                            (x, y) pair; with 0 (default) the coordinates
                            are returned as stored

    Output:
        tsne_frame          dataframe indexed by gene_ncbi; if a rotation
                            was requested, columns x and y carry the
                            rotated values
    """

    coordinates_path = rinout.get_internal_path(
        os.path.join(
            '171014f_visualize_metrics_on_characteristics_tsne_and_features/',
            'genes_coordinates.csv'))

    tsne_frame = pd.read_csv(coordinates_path).set_index('gene_ncbi')

    # Nothing to rotate: hand back coordinates as they were stored
    if rotation_degrees == 0:
        return tsne_frame

    origin = (0, 0)
    rotated = [
        ret.rotate(origin, point, rotation_degrees)
        for point in zip(tsne_frame.loc[:, 'x'], tsne_frame.loc[:, 'y'])
    ]

    # Both columns are only overwritten after all points were rotated
    tsne_frame.loc[:, 'x'] = [point[0] for point in rotated]
    tsne_frame.loc[:, 'y'] = [point[1] for point in rotated]

    return tsne_frame
def gtx():
    """
    Summarize, per gene, how often it is differential in the gtx
    records (df_genes.csv.gz).

    The records are merged with the table from _get_gene_ncbi_2_ensembl()
    (the gene_ensembl column is dropped afterwards) and restricted to
    get_ref_genes(). A record counts as differential if its p-value
    is below 0.0001.

    Output:
        cl      copy of dd, with gtx_fold limited to the range -2 to 2
        dd      dataframe indexed by gene_ncbi with columns
                gtx_fraction (fraction of records of the gene that are
                differential), gtx_fold (log2 of gtx_fraction relative to
                the fraction among all records; -inf if gtx_fraction is 0)
                and extreme_gtx (percentile rank of gtx_fraction >= 0.99)
        ge      gene_ncbi of dd in natural sort order
    """

    records_path = rinout.get_internal_path(
        '170830f_differential_gene_expression_from_gtx/'
        'taxon9606/gtx/df_genes.csv.gz'
    )

    records = pd.read_csv(records_path)

    gene_mapping = _get_gene_ncbi_2_ensembl()
    records = pd.merge(records, gene_mapping)
    records = records.drop('gene_ensembl', axis=1)

    in_reference = records['gene_ncbi'].isin(get_ref_genes())
    records = records[in_reference]

    records.loc[:, 'differential'] = records.loc[:, 'p-value'] < 0.0001

    per_gene = records[['gene_ncbi', 'differential']].groupby(
        'gene_ncbi').agg(np.mean)
    per_gene = per_gene.rename(columns={'differential': 'gtx_fraction'})

    ge = natsorted(per_gene.index)

    overall_fraction = np.mean(records['differential'])

    # Genes that are never differential are excluded from the log2,
    # and obtain -inf explicitly
    never_differential = per_gene['gtx_fraction'] == 0
    per_gene.loc[~never_differential, 'gtx_fold'] = np.log2(
        per_gene.loc[~never_differential, 'gtx_fraction'] / overall_fraction)
    per_gene.loc[never_differential, 'gtx_fold'] = -np.inf

    dd = per_gene.copy()
    dd.loc[:, 'extreme_gtx'] = dd['gtx_fraction'].rank(pct=True) >= 0.99

    # Limit fold values (including -inf) to the range -2 to 2
    cl = dd.copy()
    cl['gtx_fold'] = cl['gtx_fold'].clip(lower=-2, upper=2)

    return cl, dd, ge
def pi_transition():
    """
    Compute, per gene, the log2 ratio between the mean of will_be_pi
    among its publications and the average of these per-gene means.

    Only publications with at most 10 genes, listed by
    medline.gene2pubmed (taxon 9606, research papers, get_ref_genes())
    and with pubdate_year from 2010 up to and including 2015 are used;
    genes need at least 10 such gene-publication records.

    Output:
        cl      boolean dataframe, True where the value of dd exceeds 1
        dd      dataframe indexed by gene_ncbi with column will_be_pi,
                holding the log2 ratio (no clipping is applied)
        ge      index of cl (gene_ncbi)
    """

    cache_path = rinout.get_internal_path(
        '180311_cache_pi_transition_for_genes/'
        '180311_cache_pi_transition_for_genes.csv'
    )

    pool = pd.read_csv(cache_path, low_memory=False)

    # Ignore publications that cover more genes than tolerated
    tolerated_genes_per_publication = 10
    has_few_genes = pool['genes'] <= tolerated_genes_per_publication
    pubmed_year_pi = pool.loc[
        has_few_genes,
        ['pubmed_id', 'pubdate_year', 'will_be_pi', 'genes']]

    human_gene2pubmed = medline.gene2pubmed(
        taxon_id=9606,
        paper_kind='research',
        ref_genes=get_ref_genes())
    human_gene2pubmed = human_gene2pubmed[['gene_ncbi', 'pubmed_id']]

    linked = pd.merge(human_gene2pubmed, pubmed_year_pi)

    # Years 2010 to 2015 (the end of the range is exclusive)
    recent = linked[linked['pubdate_year'].isin(range(2010, 2016))]

    # Require at least 10 records per gene within these years
    records_per_gene = recent['gene_ncbi'].value_counts()
    well_covered = records_per_gene[records_per_gene >= 10].index
    recent = recent[recent['gene_ncbi'].isin(well_covered)]

    mean_per_gene = recent[['gene_ncbi', 'will_be_pi']].groupby(
        'gene_ncbi').agg(np.mean)

    dd = np.log2(mean_per_gene / mean_per_gene['will_be_pi'].mean())

    cl = dd > 1

    ge = cl.index

    return cl, dd, ge
def load_group_annotation(annotation_code='180318'):
    """
    Load manually selected groups of genes (to add to data to plot).

    Input:
        annotation_code     str; '180318' (default) reads mark_180318.csv
                            and renames its column 'group' to 'list_code';
                            'pre_180318' collects all gene_list* files,
                            removes a fixed set of genes, removes every
                            gene that occurs more than once, and derives
                            'list_code' from the digits in the file name

    Output:
        df_manual_label     dataframe with column gene_ncbi, restricted to
                            genes within get_ref_genes()

    Raises:
        ValueError          if annotation_code is not supported, or if
                            'pre_180318' finds no gene_list* file
    """

    if annotation_code not in ('pre_180318', '180318'):
        raise ValueError('Does not support provided annotation_code.')

    if annotation_code == 'pre_180318':
        pattern = rinout.get_internal_path(
            (
                '171014f_visualize_metrics_on_characteristics_tsne'
                '_and_features/gene_list*'))

        # glob returns matches in arbitrary order; sorting makes the
        # order of rows in the output reproducible
        list_paths = sorted(glob.glob(pattern))
        if not list_paths:
            raise ValueError(
                'No gene list found for pattern: {}'.format(pattern))

        frames = []
        for list_path in list_paths:
            frame = pd.read_table(list_path, names=['gene_ncbi'])
            frame.loc[:, 'list'] = os.path.basename(list_path)
            frames.append(frame)
        df_manual_label = pd.concat(frames, axis=0)

        # Genes that are excluded from the manual lists
        # (presumably wrongly labelled there, as the name suggests)
        mislabelled_genes = [
            192670,
            154810,
            140710,
            170690,
            157570,
            374860,
            133690,
            353140,
            387700,
            319100,
            404550,
            283870,
            129450,
            121260,
            131870,
            123720,
            132720,
            203430,
            284110,
            219790,
            130540,
            158830,
            347730,
            124540,
            163590,
            283600,
            118460
        ]
        is_mislabelled = df_manual_label['gene_ncbi'].isin(mislabelled_genes)
        df_manual_label = df_manual_label.loc[~is_mislabelled, :]

        # keep=False: genes occurring more than once are removed entirely
        df_manual_label = df_manual_label.drop_duplicates(
            subset='gene_ncbi', keep=False).copy()

        # Numeric code of a list: digits following 'gene_list' in file name
        df_manual_label.loc[:, 'list_code'] = df_manual_label.loc[
            :, 'list'].copy(
        ).str.extract('gene_list([0-9]*).*', expand=False).astype(float)

    else:
        p = rinout.get_internal_path(
            '180317_complete_cluster_maker/mark_180318.csv')
        df_manual_label = pd.read_csv(p).rename(columns={
            'group': 'list_code'
        })

    df_manual_label = df_manual_label[df_manual_label['gene_ncbi'].isin(
        get_ref_genes())]

    return df_manual_label