Пример #1
0
def main():
    """Run GSEA on the imported gene scores and publish the enrichment table.

    Reads the ``gene_scores`` import and the ``species``, ``gset_group_id``,
    ``n_repeats`` and ``alterChoice`` arguments from Granatum.  Depending on
    ``alterChoice`` only the non-negative scores (``"pos"``) or only the
    negative scores, turned into absolute values (``"neg"``), are kept; any
    other value keeps all scores untouched.  Gene ids are converted to
    symbols when they are not symbols already, and mouse genes are mapped
    through ``zgsea.to_human_homolog`` before ``zgsea.gsea`` is called.

    Raises:
        ValueError: If ``species`` is neither ``"human"`` nor ``"mouse"``.
    """
    gn = Granatum()

    gene_scores = gn.get_import("gene_scores")
    species = gn.get_arg("species")
    gset_group_id = gn.get_arg("gset_group_id")
    n_repeats = gn.get_arg("n_repeats")
    alter_choice = gn.get_arg("alterChoice")

    if alter_choice == "pos":
        gene_scores = {
            gene: score for gene, score in gene_scores.items() if score >= 0.0
        }
    elif alter_choice == "neg":
        # Keep only the negative scores and rank them by magnitude.
        gene_scores = {
            gene: abs(score)
            for gene, score in gene_scores.items()
            if score < 0.0
        }

    # Materialise the dict views: downstream helpers receive real, indexable
    # sequences (a dict view cannot be sliced and does not convert cleanly
    # into an array).
    gene_ids = list(gene_scores.keys())
    gene_scores = list(gene_scores.values())

    gene_id_type = guess_gene_id_type(gene_ids[:5])
    if gene_id_type != 'symbol':
        gene_ids = convert_gene_ids(gene_ids, gene_id_type, 'symbol', species)

    if species == "mouse":
        gene_ids = zgsea.to_human_homolog(gene_ids, "mouse")
    elif species != "human":
        raise ValueError(
            "Unsupported species {!r}: expected 'human' or 'mouse'.".format(
                species))

    result_df = zgsea.gsea(gene_ids, gene_scores, gset_group_id, n_repeats=n_repeats)
    if result_df is None:
        gn.add_markdown('No gene set is enriched with your given genes.')
    else:
        result_df = result_df[["gset_name", "gset_size", "nes", "p_val", "fdr"]]
        gn.add_pandas_df(result_df)
        gn.export(result_df.to_csv(index=False), 'gsea_results.csv', kind='raw', meta=None, raw=True)
        # NOTE(review): the exported mapping is keyed by the frame's index
        # labels, not by the "gset_name" column -- confirm this is intended.
        nes_by_index = dict(zip(result_df.index.tolist(), result_df['nes'].tolist()))
        print(nes_by_index, flush=True)
        gn.export(nes_by_index, 'nes', 'geneMeta')

    gn.commit()
Пример #2
0
def main():
    """Run DESeq2 on the cells of the selected groups and publish the result.

    Reads the ``assay`` and ``groupVec`` imports and the ``groups`` argument
    from Granatum.  The assay is restricted to the cells whose group is one
    of the selected groups, ordered group by group.  The dense matrix and
    the per-cell group labels are handed to ``run_DESeq`` after sourcing
    ``./drive_DESeq2.R``.  The result table is sorted by ``padj``, shown,
    and exported as CSV; the ``log2FoldChange`` of the genes with
    ``padj < 0.05`` is exported as gene metadata.
    """
    gn = Granatum()
    assay_df = gn.pandas_from_assay(gn.get_import('assay'))
    # Fetch the group assignment once and reuse it for both the cell
    # selection and the phenotype vector.
    group_by_cell = gn.get_import('groupVec')
    selected_groups = set(parse(gn.get_arg('groups')))

    # Bucket the cells by group (in first-seen group order) so that the
    # selected columns end up grouped together.  Appending in place avoids
    # copying the bucket on every cell.
    cells_by_group = {}
    for cell, group in group_by_cell.items():
        if group in selected_groups:
            cells_by_group.setdefault(group, []).append(cell)
    cells = [cell for members in cells_by_group.values() for cell in members]

    assay_df = assay_df.loc[:, cells]
    # NOTE(review): the ``.sparse`` accessor presumably requires the assay
    # frame to hold pandas sparse columns -- confirm against
    # ``pandas_from_assay``.
    assay_df = assay_df.sparse.to_dense().fillna(0)
    phe_vec = pd.Series(group_by_cell)[assay_df.columns]

    r.source('./drive_DESeq2.R')
    ret_r = r['run_DESeq'](assay_df, phe_vec)
    ret_r_as_df = r['as.data.frame'](ret_r)

    # TODO: maybe rename the columns to be more self-explanatory?
    result_df = ret_r_as_df
    result_df = result_df.sort_values('padj')
    result_df.index.name = 'gene'
    gn.add_pandas_df(result_df.reset_index(),
                     description='The result table as returned by DESeq2.')
    gn.export(result_df.to_csv(), 'DESeq2_results.csv', raw=True)
    significant_genes = result_df.loc[
        result_df['padj'] < 0.05]['log2FoldChange'].to_dict()
    gn.export(significant_genes, 'Significant genes', kind='geneMeta')
    gn.commit()
def main():
    """Draw gene-expression scatter plots over the imported sample coordinates.

    Reads the ``viz_data`` and ``assay`` imports plus the plotting arguments
    from Granatum.  ``gene_ids`` is a comma-separated list of genes.  With
    ``overlay_genes`` all genes are drawn into one figure, each with its own
    colour bar, and the figure is added to the results once at the end;
    otherwise every gene gets its own figure.  A gene that is not in the
    assay index produces a table of the available gene ids instead of a plot.

    Marker sizes are scaled linearly between ``min_marker_area`` and
    ``max_marker_area`` from the clipped expression values and then squared.
    """
    gn = Granatum()

    sample_coords = gn.get_import("viz_data")
    df = gn.pandas_from_assay(gn.get_import("assay"))
    gene_ids = gn.get_arg("gene_ids")
    overlay_genes = gn.get_arg("overlay_genes")
    max_colors = gn.get_arg("max_colors")
    min_level = gn.get_arg("min_level")
    max_level = gn.get_arg("max_level")
    convert_to_zscore = gn.get_arg("convert_to_zscore")
    min_marker_area = gn.get_arg("min_marker_area")
    max_marker_area = gn.get_arg("max_marker_area")
    min_alpha = gn.get_arg("min_alpha")
    max_alpha = gn.get_arg("max_alpha")
    grey_level = gn.get_arg("grey_level")

    coords = sample_coords.get("coords")
    dim_names = sample_coords.get("dimNames")

    gene_id_list = [gene_id.strip() for gene_id in gene_ids.split(',')]

    def make_cmap(color):
        """Build the colormap for one colour spec via ``produce_cdict``."""
        return LinearSegmentedColormap("fire",
                                       produce_cdict(color,
                                                     grey=grey_level,
                                                     min_alpha=min_alpha,
                                                     max_alpha=max_alpha),
                                       N=256)

    if max_colors != "":
        # Explicit colours always win, with or without overlay.
        cmaps = [make_cmap(color.strip()) for color in max_colors.split(',')]
    elif overlay_genes:
        # One default colour per gene, cycling through the basic palette.
        cmaps = [
            make_cmap(color)
            for _, color in zip(gene_id_list, cycle('bgrcmk'))
        ]
    else:
        cmaps = [LinearSegmentedColormap("fire", cdict, N=256)]

    colorbar_height = 10
    plot_height = 650
    num_cbars = len(gene_id_list) if overlay_genes else 1
    cbar_height_ratio = plot_height / (num_cbars * colorbar_height)
    gridspec = {'height_ratios': [cbar_height_ratio] + [1] * num_cbars}
    fig, ax = plt.subplots(1 + num_cbars, 1, gridspec_kw=gridspec)

    for gene_index, gene_id in enumerate(gene_id_list):
        if gene_id not in df.index:
            # if the gene ID entered is not present in the assay
            # Communicate it to the user and output a table of available gene ID's
            description = 'The selected gene is not present in the assay. See the step that generated the assay'
            genes_in_assay = pd.DataFrame(
                df.index.tolist(),
                columns=['Gene unavailable in assay: choose from below'])
            gn.add_pandas_df(genes_in_assay, description)
            continue

        if not overlay_genes:
            # Close the previous figure instead of merely clearing it;
            # otherwise every gene leaves one more open figure behind.
            plt.close(fig)
            fig, ax = plt.subplots(1 + num_cbars, 1, gridspec_kw=gridspec)

        # Read the gene's row directly rather than transposing the whole
        # assay for every gene.
        expression = df.loc[gene_id, :]
        if convert_to_zscore:
            mean = expression.mean()
            stdev = expression.std(ddof=0)
            centered = expression - mean
            # A gene with zero variance would otherwise become all-NaN (0/0).
            expression = centered / stdev if stdev > 0 else centered

        scatter_df = pd.DataFrame(
            {
                "x": [a[0] for a in coords.values()],
                "y": [a[1] for a in coords.values()],
                "value": expression
            },
            index=coords.keys())

        values_df = np.clip(scatter_df["value"],
                            min_level,
                            max_level,
                            out=None)
        min_value = np.nanmin(values_df)
        max_value = np.nanmax(values_df)
        value_range = max_value - min_value
        if value_range > 0:
            scaled_marker_size = (max_marker_area - min_marker_area) * (
                values_df - min_value) / value_range + min_marker_area
        else:
            # All clipped values are identical: use the smallest marker
            # everywhere instead of dividing by zero.
            scaled_marker_size = pd.Series(min_marker_area,
                                           index=values_df.index,
                                           dtype=float)
        scaled_marker_size = scaled_marker_size * scaled_marker_size

        scatter = ax[0].scatter(
            x=scatter_df["x"],
            y=scatter_df["y"],
            s=scaled_marker_size,
            c=values_df,
            cmap=cmaps[gene_index % len(cmaps)])
        cbar = fig.colorbar(scatter,
                            cax=ax[1 + (gene_index % num_cbars)],
                            orientation='horizontal',
                            aspect=40)
        cbar.set_label(gene_id, rotation=0)

        ax[0].set_xlabel(dim_names[0])
        ax[0].set_ylabel(dim_names[1])

        if not overlay_genes:
            gn.add_current_figure_to_results(
                "Scatter-plot of {} expression".format(gene_id), dpi=75)

    if overlay_genes:
        gn.add_current_figure_to_results(
            "Scatter-plot of {} expression".format(gene_ids),
            height=plot_height + 100 * len(gene_id_list),
            dpi=75)

    gn.commit()
Пример #4
0
def main():
    """Run a Fisher-test enrichment on the genes that pass a score threshold.

    Reads the ``gene_scores`` import and the ``species``, ``gset_group_id``,
    ``threshold``, ``use_abs`` and ``background`` arguments from Granatum.
    Genes whose score (or absolute score when ``use_abs`` is set) is at
    least ``threshold`` are passed to ``zgsea.simple_fisher``.  The selected
    genes and the relabelled result table, sorted by ``fdr``, are both
    exported as CSV.

    Raises:
        ValueError: If ``species`` is neither ``"human"`` nor ``"mouse"``,
            or if ``background`` is not one of ``"all"``,
            ``"from_gene_sets"`` or ``"from_input"``.
    """
    gn = Granatum()

    scores_by_gene = gn.get_import("gene_scores")
    species = gn.get_arg("species")
    gset_group_id = gn.get_arg("gset_group_id")
    threshold = gn.get_arg("threshold")
    use_abs = gn.get_arg("use_abs")
    background = gn.get_arg("background")

    gene_ids = [*scores_by_gene.keys()]
    gene_scores = [*scores_by_gene.values()]

    # Guess the id type from the first few ids only.
    id_type = guess_gene_id_type(gene_ids[:5])
    if id_type != 'symbol':
        gene_ids = convert_gene_ids(gene_ids, id_type, 'symbol', species)

    if species not in ("human", "mouse"):
        raise ValueError()
    if species == "mouse":
        gene_ids = zgsea.to_human_homolog(gene_ids, "mouse")
        # problem is that gene_ids is NAN after this

    # Select the genes whose (optionally absolute) score reaches the threshold.
    id_array = np.array(gene_ids)
    score_array = np.array(gene_scores)
    if use_abs:
        score_array = np.abs(score_array)
    input_list = id_array[score_array >= threshold]

    print(input_list)

    summary = f"""\
Number of genes after thresholding: {len(input_list)} (out of original {len(gene_ids)}).

Please see the attachment `list_of_genes.csv` for the list of genes considered in this enrichment analysis."""
    gn.add_result(summary, 'markdown')

    genes_csv = pd.Series(input_list).to_csv(index=False)
    gn.export(genes_csv, 'list_of_genes.csv', kind='raw', meta=None, raw=True)

    if background not in ('all', 'from_gene_sets', 'from_input'):
        raise ValueError()
    background_list = None
    if background == 'all':
        background_list = get_all_genes('human')
    elif background == 'from_input':
        background_list = gene_ids

    # (result column, label shown to the user), in display order.
    column_labels = [
        ('gene_set_name', 'Gene set'),
        ('size', 'Gene set size'),
        ('p_val', 'p-value'),
        ('fdr', 'FDR'),
        ('odds_ratio', 'Odds ratio'),
        ('n_overlaps', 'Number of overlapping genes'),
        ('overlapping_genes', 'Overlapping genes'),
    ]

    result_df = zgsea.simple_fisher(input_list,
                                    gset_group_id,
                                    background_list=background_list)
    result_df = result_df.sort_values('fdr')
    result_df = result_df[[source for source, _ in column_labels]]
    result_df.columns = [label for _, label in column_labels]

    gn.add_pandas_df(result_df)
    results_csv = result_df.to_csv(index=False)
    gn.export(results_csv,
              'enrichment_results.csv',
              kind='raw',
              meta=None,
              raw=True)

    gn.commit()