Пример #1
0
def main():
    """Filter cells by detected-gene count and share of 'MT'-prefixed counts.

    Reads the ``assay`` import plus the ``min_genes_per_cell``,
    ``max_genes_per_cell`` and ``mt_genes_percent`` arguments, keeps the
    columns that satisfy all three thresholds, reports how many columns each
    criterion rejected, plots both per-column distributions and exports the
    reduced assay as "Filtered Cells Assay".
    """
    started = time.perf_counter()

    gn = Granatum()

    df = gn.pandas_from_assay(gn.get_import('assay'))
    mingenes = gn.get_arg('min_genes_per_cell')
    maxgenes = gn.get_arg('max_genes_per_cell')
    # The argument is a percentage; the comparison below uses a fraction.
    mt_limit = gn.get_arg('mt_genes_percent') / 100.0

    # One value per column of the assay.
    genes_per_cell = df.astype(bool).sum(axis=0)
    counts_per_cell = df.sum(axis=0)
    # NOTE(review): the bare 'MT' prefix also matches row names such as
    # "MTOR"; confirm whether "MT-" was intended.
    mt_rows = df[df.index.str.startswith('MT')]
    mt_fraction = mt_rows.sum(axis=0).div(counts_per_cell)

    keep = (
        (genes_per_cell >= mingenes)
        & (genes_per_cell <= maxgenes)
        & (mt_fraction <= mt_limit)
    )
    kept_cells = genes_per_cell[keep].index.values
    adata = df.loc[:, kept_cells]

    num_orig_cells = genes_per_cell.index.size
    num_filtered_cells = len(kept_cells)

    # Per-criterion rejection counts (a column may fail more than one).
    num_lt_min = genes_per_cell[genes_per_cell < mingenes].index.size
    num_gt_max = genes_per_cell[genes_per_cell > maxgenes].index.size
    num_gt_mt = genes_per_cell[mt_fraction > mt_limit].index.size

    summary = "Number of cells is now {} out of {} original cells with {} below min genes, {} above max genes, and {} above mt percentage threshold.".format(
        num_filtered_cells, num_orig_cells, num_lt_min, num_gt_max, num_gt_mt)
    gn.add_result(summary, "markdown")

    plt.figure()
    panels = (
        ('Unique gene count distribution', genes_per_cell, 'Gene count'),
        ('MT Percent Distribution', mt_fraction * 100.0, 'MT Percent'),
    )
    for position, (title, series, x_label) in enumerate(panels, start=1):
        plt.subplot(2, 1, position)
        plt.title(title)
        sns.distplot(series, bins=200, color='darkblue',
                     kde_kws={'linewidth': 2})
        plt.ylabel('Frequency')
        plt.xlabel(x_label)

    plt.tight_layout()

    caption = 'The distribution of expression levels for each cell with various metrics.'
    gn.add_current_figure_to_results(caption, zoom=1, dpi=75)

    gn.export(gn.assay_from_pandas(adata), "Filtered Cells Assay", dynamic=False)

    elapsed = round(time.perf_counter() - started, 2)
    gn.add_result(
        "* Finished cell filtering step in {} seconds*".format(elapsed),
        "markdown")

    gn.commit()
Пример #2
0
def main():
    """Rename each group to the reference label it overlaps with most.

    Reads the ``assay``, ``groups`` ({cell: group}) and ``reflabels``
    ({cell: reference label}) imports. For every group the reference label
    sharing the most cells is chosen; cells of the group that do not carry
    that reference label are considered mislabelled. When the
    ``remove_cells`` argument is truthy those cells are dropped from both the
    assay and the group mapping. The relabelled mapping and the assay are
    exported statically.

    Raises:
        ValueError: if there are groups to relabel but no reference labels.
    """
    tic = time.perf_counter()

    gn = Granatum()
    assay = gn.pandas_from_assay(gn.get_import('assay'))
    groups = gn.get_import('groups')
    reflabels = gn.get_import('reflabels')
    remove_cells = gn.get_arg('remove_cells')

    # Invert {cell: label} into {label: [cells]}. Appending in place avoids
    # rebuilding the whole list for every cell.
    inv_map = {}
    for cell, label in groups.items():
        inv_map.setdefault(label, []).append(cell)

    # Reference cells are kept as sets so the overlap computation below does
    # not have to rebuild them for every group.
    inv_map_ref = {}
    for cell, label in reflabels.items():
        inv_map_ref.setdefault(label, set()).add(cell)

    if inv_map and not inv_map_ref:
        raise ValueError(
            "Cannot correspond labels: the reference labels are empty.")

    group_relabel = {}
    mislabelled_cells = set()
    for label, cells in inv_map.items():
        cell_set = set(cells)
        label_scores = {
            ref_label: len(ref_cells & cell_set)
            for ref_label, ref_cells in inv_map_ref.items()
        }
        # Ties resolve to the reference label encountered first.
        best_label = max(label_scores, key=label_scores.get)
        group_relabel[label] = best_label
        mislabelled_cells |= cell_set - inv_map_ref[best_label]

    if remove_cells:
        gn.add_result(
            "Dropping {} mislabelled cells".format(len(mislabelled_cells)),
            "markdown")
        assay = assay.drop(list(mislabelled_cells), axis=1)
        groups = {
            cell: label
            for cell, label in groups.items()
            if cell not in mislabelled_cells
        }

    groups = {cell: group_relabel[label] for cell, label in groups.items()}

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    gn.export_statically(gn.assay_from_pandas(assay), "Corresponded assay")
    gn.export_statically(groups, "Corresponded labels")

    timing = "* Finished sample coloring step in {} seconds*".format(
        time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()
Пример #3
0
def main():
    """Run GranatumDeepClustering on the assay and publish the assignment.

    Reads the ``assay`` import and the ``selectedEmbedding``,
    ``selectedClustering``, ``nComponents``, ``nClusters`` and
    ``findBestNumberOfCluster`` arguments, fits the clustering on the
    transposed matrix, then exports the ``clusters`` mapping (statically and
    as a raw CSV attachment) and adds a table of sample ID versus cluster.
    """
    gn = Granatum()

    assay = gn.get_import('assay')
    sample_ids = assay.get('sampleIds')

    args_for_init = {
        'selected_embedding': gn.get_arg('selectedEmbedding'),
        'selected_clustering': gn.get_arg('selectedClustering'),
        'n_components': gn.get_arg('nComponents'),
        'n_clusters': gn.get_arg('nClusters'),
        'find_best_number_of_cluster': gn.get_arg('findBestNumberOfCluster'),
    }

    args_for_fit = {
        # The assay matrix is transposed before being handed to fit().
        'matrix': np.transpose(np.array(assay.get('matrix'))),
        'sample_ids': sample_ids,
    }

    granatum_clustering = GranatumDeepClustering(**args_for_init)
    fit_results = granatum_clustering.fit(**args_for_fit)

    fit_exp = fit_results.get('clusters')
    gn.export_statically(fit_exp, 'Cluster assignment')

    # One '"<key>", <value>' line per entry of the clusters mapping.
    csv_lines = ['"'+str(k)+'"'+", "+str(v) for k, v in fit_exp.items()]
    gn.export("\n".join(csv_lines), 'Cluster assignment.csv', kind='raw', meta=None, raw=True)

    # NOTE(review): other 'split'-oriented tables in this code base pass
    # rows as lists, whereas this one passes dicts; confirm the front end
    # accepts both before changing it.
    table_rows = [
        {'Sample ID': sample_id, 'Cluster Assignment': cluster}
        for sample_id, cluster in zip(sample_ids, fit_results.get('clusters_array'))
    ]
    gn.add_result(
        {
            'orient': 'split',
            'columns': ['Sample ID', 'Cluster Assignment'],
            'data': table_rows,
        },
        'table',
    )

    gn.commit()
Пример #4
0
def main():
    """Scan JAMMIT over a grid of theta values and report FDR against alpha.

    Reads the ``assay`` import and the ``n_steps``, ``min_theta`` and
    ``max_theta`` arguments, runs ``JAMMIT.scan`` on ``n_steps`` evenly
    spaced thetas, plots FDR against alpha and adds the formatted scan
    result as a table.
    """
    gn = Granatum()

    df = gn.pandas_from_assay(gn.get_import('assay'))
    n_steps = gn.get_arg('n_steps')
    min_theta = gn.get_arg('min_theta')
    max_theta = gn.get_arg('max_theta')

    jammit = JAMMIT.from_dfs([df])
    theta_grid = np.linspace(min_theta, max_theta, n_steps)
    jammit.scan(
        thetas=theta_grid,
        calculate_fdr=True,
        n_perms=10,
        verbose=1,
        convergence_threshold=0.000000001,
    )

    scan_table = jammit.format(columns=['theta', 'alpha', 'n_sigs', 'fdr'])
    for rounded_column in ('theta', 'alpha'):
        scan_table[rounded_column] = scan_table[rounded_column].round(3)

    plt.plot(scan_table['alpha'], scan_table['fdr'])
    plt.xlabel('alpha')
    plt.ylabel('FDR')
    gn.add_current_figure_to_results('FDR plotted against alpha', height=400)

    # Every column is declared numeric and shown with three decimals.
    column_specs = []
    for header in scan_table.columns:
        column_specs.append({'name': header, 'type': 'number', 'round': 3})

    table_payload = {
        'pageSize': n_steps,
        'orient': 'split',
        'columns': column_specs,
        'data': scan_table.values.tolist(),
    }
    gn.add_result(table_payload, data_type='table')

    gn.commit()
def main():
    """Randomly down-sample the rows of the AnnData built from the assay.

    ``num_cells_to_sample`` is treated as a fraction of the row count when it
    lies strictly between 0 and 1, otherwise as an absolute number. The
    resulting count is clamped to the range [1, number of rows]. Rows are
    drawn without replacement after seeding NumPy with ``random_seed``, and
    the reduced assay is exported as "Down-sampled Assay".
    """
    gn = Granatum()

    adata = gn.ann_data_from_assay(gn.get_import("assay"))
    requested = gn.get_arg("num_cells_to_sample")
    random_seed = gn.get_arg("random_seed")

    np.random.seed(random_seed)

    num_cells_before, num_genes_before = adata.shape[0], adata.shape[1]

    if 0 < requested < 1:
        target = round(num_cells_before * requested)
    else:
        target = round(requested)

    # Never ask for more rows than exist, and never for fewer than one.
    target = max(1, min(target, num_cells_before))

    picked = np.random.choice(num_cells_before, target, replace=False)
    sampled_cells_idxs = np.sort(picked)

    adata = adata[sampled_cells_idxs, :]

    before_line = "The assay before down-sampling has **{}** cells and {} genes.".format(
        num_cells_before, num_genes_before)
    after_line = "The assay after down-sampling has **{}** cells and {} genes.".format(
        adata.shape[0], adata.shape[1])
    gn.add_result("\n".join([before_line, "", after_line]), type="markdown")

    gn.export(gn.assay_from_ann_data(adata), "Down-sampled Assay", dynamic=False)

    gn.commit()
Пример #6
0
def main():
    """Embed the assay columns with UMAP, plot them and export the coordinates.

    Reads the ``assay`` import and the ``n_neighbors``, ``min_dist``,
    ``metric`` and ``random_seed`` arguments, fits UMAP on the transposed
    value matrix, adds a scatter plot of the first two embedding columns and
    exports a ``{'dimNames', 'coords'}`` structure keyed by column name.
    """
    started = time.perf_counter()

    gn = Granatum()

    df = gn.pandas_from_assay(gn.get_import('assay'))
    n_neighbors = gn.get_arg('n_neighbors')
    min_dist = gn.get_arg('min_dist')
    metric = gn.get_arg('metric')
    random_seed = gn.get_arg('random_seed')

    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        random_state=random_seed,
    )
    embedding = reducer.fit_transform(df.values.T)

    plt.figure()
    # Marker size shrinks as the number of columns grows, capped at 36.
    marker_size = min(5000 / df.shape[0], 36.0)
    plt.scatter(embedding[:, 0], embedding[:, 1], s=marker_size)
    plt.xlabel('UMAP dim. 1')
    plt.ylabel('UMAP dim. 2')
    plt.tight_layout()

    gn.add_current_figure_to_results('UMAP plot: each dot represents a cell',
                                     dpi=75)

    coords_by_sample = {}
    for row, sample_id in enumerate(df.columns):
        coords_by_sample[sample_id] = embedding[row, :].tolist()

    umap_export = {
        'dimNames': ['UMAP dim. 1', 'UMAP dim. 2'],
        'coords': coords_by_sample,
    }
    gn.export_statically(umap_export, 'UMAP coordinates')

    elapsed = round(time.perf_counter() - started, 2)
    gn.add_result("* Finished UMAP step in {} seconds*".format(elapsed),
                  "markdown")

    gn.commit()
Пример #7
0
def main():
    """Report adjusted Rand and adjusted mutual information scores.

    Reads the ``sample_meta_true`` and ``sample_meta_predicted`` imports,
    restricts both to the sample IDs they have in common (in the same
    order), and adds the two scores as a markdown result.
    """
    gn = Granatum()

    true_labels = pd.Series(gn.get_import("sample_meta_true"))
    predicted_labels = pd.Series(gn.get_import("sample_meta_predicted"))

    # The scoring functions compare the two label sequences position by
    # position, so wrapping the metas in separate Series does not align
    # them. Select the shared sample IDs explicitly so that entry i of both
    # sequences refers to the same sample even when the metas differ in
    # ordering or in the set of IDs.
    shared_ids = true_labels.index.intersection(predicted_labels.index)
    true_values = true_labels.loc[shared_ids].values
    predicted_values = predicted_labels.loc[shared_ids].values

    rand_score = adjusted_rand_score(true_values, predicted_values)
    # Named so that it cannot shadow sklearn's ``mutual_info_score``.
    adjusted_mi_score = adjusted_mutual_info_score(true_values,
                                                   predicted_values)

    results_markdown = "\n".join([
        "Adjusted Rand score: **{}**".format(rand_score),
        "",
        "Adjusted mutual information score: **{}**".format(adjusted_mi_score),
    ])

    gn.add_result(results_markdown, "markdown")
    gn.commit()
Пример #8
0
def main():
    """Filter genes by number of expressing cells and by dispersion.

    Reads the ``assay`` import and the ``min_cells_expressed``,
    ``min_mean``, ``max_mean``, ``min_disp`` and ``max_disp`` arguments.
    Genes are first filtered with ``sc.pp.filter_genes`` and then with
    ``sc.pp.filter_genes_dispersion`` (flavor ``'seurat'``); the mean bounds
    are passed through ``math.log``. The dispersion plot, the gene counts
    before and after, and the filtered assay are published.
    """
    gn = Granatum()

    adata = gn.ann_data_from_assay(gn.get_import("assay"))
    min_cells_expressed = gn.get_arg("min_cells_expressed")
    min_mean = gn.get_arg("min_mean")
    max_mean = gn.get_arg("max_mean")
    min_disp = gn.get_arg("min_disp")
    max_disp = gn.get_arg("max_disp")

    num_genes_before = adata.shape[1]

    sc.pp.filter_genes(adata, min_cells=min_cells_expressed)

    dispersion_options = {
        'flavor': 'seurat',
        'min_mean': math.log(min_mean),
        'max_mean': math.log(max_mean),
        'min_disp': min_disp,
        'max_disp': max_disp,
    }
    filter_result = sc.pp.filter_genes_dispersion(adata.X, **dispersion_options)
    adata = adata[:, filter_result.gene_subset]

    sc.pl.filter_genes_dispersion(filter_result)
    gn.add_current_figure_to_results(
        "Each dot represent a gene. The gray dots are the removed genes. The x-axis is log-transformed.",
        zoom=3,
        dpi=50,
        height=400,
    )

    before_line = "Number of genes before filtering: **{}**".format(num_genes_before)
    after_line = "Number of genes after filtering: **{}**".format(adata.shape[1])
    gn.add_result("\n".join([before_line, "", after_line]), type="markdown")

    gn.export(gn.assay_from_ann_data(adata), "Filtered Assay", dynamic=False)

    gn.commit()
Пример #9
0
def main():
    """Scale the assay so that its Frobenius norm becomes 1.

    Reads the ``assay`` import, divides every entry by the Frobenius norm of
    the whole matrix, reports the norm before and after, and exports the
    result as "Frobenius normalized assay". An assay whose norm is exactly
    zero is exported unchanged, because dividing by zero would replace every
    entry with NaN.
    """
    gn = Granatum()

    df = gn.pandas_from_assay(gn.get_import("assay"))

    frob_norm = np.linalg.norm(df.values)

    if frob_norm == 0:
        # Nothing to scale: the matrix contains only zeros.
        gn.add_result(
            "The original assay has a Frobenius norm of 0 (all entries are "
            "zero), so it is exported without normalization.",
            'markdown',
        )
    else:
        df = df / frob_norm

        gn.add_result(
            f"""\
The original assay had Frobenius norm of {frob_norm}, after normalization its
Frobenius norm is now {np.linalg.norm(df.values)}""",
            'markdown',
        )

    gn.export(gn.assay_from_pandas(df),
              "Frobenius normalized assay",
              dynamic=False)

    gn.commit()
def main():
  """Remove the rows listed in the ``outliers`` argument from the assay.

  Reads the ``assay`` import as AnnData, drops every observation name that
  appears in ``outliers`` (names that are not present are ignored), exports
  the reduced assay and reports how many rows were actually removed.
  """
  gn = Granatum()

  adata = gn.ann_data_from_assay(gn.get_import('assay'))
  outliers = gn.get_arg('outliers')

  num_cells_before = adata.shape[0]

  kept_cell_ids = adata.obs_names.drop(outliers, errors='ignore').values

  adata = adata[kept_cell_ids, :]

  # Unknown IDs are skipped by errors='ignore', so len(outliers) can
  # overstate the removal; report the real difference instead.
  num_removed = num_cells_before - adata.shape[0]

  gn.export_statically(gn.assay_from_ann_data(adata), 'Outlier removed assay')
  gn.add_result(
    'You removed {} outliers from {} cells, the result assay has {} cells (and {} genes).'.format(
      num_removed, num_cells_before, adata.shape[0], adata.shape[1]
    ),
    type='markdown'
  )

  gn.commit()
def main():
    """Drop genes that are expressed in too few columns of the assay.

    Reads the ``assay`` import and the ``epsilon`` and
    ``min_cells_expressed`` arguments. A row is removed when fewer than
    ``min_cells_expressed`` of its values exceed ``epsilon``. Publishes the
    gene counts before and after, a table of removed genes (or a message
    when none were removed), the per-gene statistics as
    ``gene_selection.csv`` and the filtered assay.
    """
    gn = Granatum()

    df = gn.pandas_from_assay(gn.get_import("assay"))

    epsilon = gn.get_arg('epsilon')
    min_cells_expressed = gn.get_arg('min_cells_expressed')

    # Row-wise statistics are computed on the value matrix in one pass each
    # instead of looping over every element in Python.
    values = df.values
    filter_df = pd.DataFrame({'gene': df.index})
    filter_df['sum_expr'] = values.sum(axis=1)
    filter_df['avg_expr'] = filter_df['sum_expr'] / df.shape[1]
    # Despite the column name, this counts the entries of each row (gene)
    # that exceed epsilon.
    filter_df['num_expressed_genes'] = (values > epsilon).sum(axis=1)
    filter_df['removed'] = (
        filter_df['num_expressed_genes'] < min_cells_expressed)

    removed_mask = filter_df['removed'].values
    new_df = df.loc[~removed_mask, :]

    gn.add_result(
        "\n".join([
            "Number of genes before filtering: **{}**".format(df.shape[0]),
            "",
            "Number of genes after filtering: **{}**".format(new_df.shape[0]),
        ]),
        type="markdown",
    )

    filter_df_deleted = filter_df.loc[removed_mask, :].drop('removed', axis=1)

    # Show the table only when something was actually removed; otherwise
    # say so explicitly.
    if filter_df_deleted.shape[0] > 0:
        gn.add_result(
            {
                'title': f"Removed genes ({filter_df_deleted.shape[0]})",
                'orient': 'split',
                'columns': filter_df_deleted.columns.values.tolist(),
                'data': filter_df_deleted.values.tolist(),
            },
            data_type='table',
        )
    else:
        gn.add_result(
            f"No genes were removed. All {df.shape[0]} genes were kept. "
            f"See attachment **gene_selection.csv** for detail.",
            'markdown',
        )

    gn.export(filter_df.to_csv(index=False),
              'gene_selection.csv',
              kind='raw',
              meta=None,
              raw=True)
    gn.export(gn.assay_from_pandas(new_df), "Filtered Assay", dynamic=False)

    gn.commit()
Пример #12
0
def main():
    """Apply a generalized-log (glog) transformation to the assay matrix.

    Reads the ``assay`` import and the ``log_base``, ``n_top``, ``n_bottom``
    and ``which_mid`` arguments. Among the ``n_top`` rows with the highest
    mean, the one with the largest standard deviation is used as the
    "housekeeping" row; among the ``n_bottom`` rows with the lowest mean,
    the one with the smallest standard deviation is used as the "negative
    control" row. From these an offset ``alphabk`` and a constant ``c`` are
    derived and the matrix is transformed with
    ``log((z + sqrt(z^2 + c)) / 2)`` where ``z = x - alphabk``.

    Raises:
        ValueError: if ``which_mid`` is neither ``'mean'`` nor ``'median'``.
    """
    gn = Granatum()

    assay = gn.get_import('assay')
    # ``np.float`` was removed from NumPy; ``np.float64`` is the same dtype.
    x = np.array(assay.get('matrix')).astype(np.float64)
    log_base = gn.get_arg('log_base')
    n_top = gn.get_arg('n_top')
    n_bottom = gn.get_arg('n_bottom')
    which_mid = gn.get_arg('which_mid')

    gene_df = pd.DataFrame(
        {
            'row_num': range(x.shape[0]),
            'gene_id': assay.get('geneIds'),
            'exp_mean': np.mean(x, axis=1),
            'exp_std': np.std(x, axis=1),
        }
    )
    gene_df = gene_df.sort_values('exp_mean', ascending=False)
    top_gene_row = gene_df.head(n_top).sort_values('exp_std', ascending=False).iloc[0]
    bottom_gene_row = gene_df.tail(n_bottom).sort_values('exp_std').iloc[0]

    # Clip so the housekeeping values stay strictly positive.
    hk_gene = np.clip(x[int(top_gene_row['row_num']), :], a_min=0.00001, a_max=None)

    neg_gene = x[int(bottom_gene_row['row_num']), :]

    if which_mid == 'mean':
        alphabk = np.mean(neg_gene)
    elif which_mid == 'median':
        alphabk = np.median(neg_gene)
    else:
        raise ValueError(
            "which_mid must be 'mean' or 'median', got {!r}".format(which_mid))

    loghkdatabk = np.log(hk_gene - alphabk) / np.log(log_base)

    # Drop NaN values (log of a negative difference).
    loghkdatabk = loghkdatabk[~np.isnan(loghkdatabk)]

    c = (np.std(neg_gene, ddof=1) / np.std(loghkdatabk, ddof=1))**2

    xbk = x - alphabk
    transformed_matrix = np.log((xbk + np.sqrt(xbk**2 + c)) / 2) / np.log(log_base)

    gn.add_result(
        '\n'.join(
            [
                "Selected benchmarking genes:",
                f"  * housekeeping gene: **{top_gene_row['gene_id']}** "
                f"(mean: {top_gene_row['exp_mean']}, std: {top_gene_row['exp_std']}) ",
                f"  * negative control gene: **{bottom_gene_row['gene_id']}**"
                f"(mean: {bottom_gene_row['exp_mean']}, std: {bottom_gene_row['exp_std']})",
                "",
                f"Final formula is `y = log{log_base}((z + sqrt(z^2 + c))/2)`, where `z = x - {alphabk}` and `c = {c}`."
            ]
        ), 'markdown'
    )

    # Only values above the 5th percentile are plotted.
    # NOTE(review): the caption below also mentions a 95th-percentile upper
    # bound, but no such bound is applied here; confirm which is intended.
    non_zero_values_before = x.flatten()
    non_zero_values_before = non_zero_values_before[(non_zero_values_before > np.percentile(non_zero_values_before, 5))]

    non_zero_values_after = transformed_matrix.flatten()
    non_zero_values_after = non_zero_values_after[(non_zero_values_after > np.percentile(non_zero_values_after, 5))]

    plt.figure()

    plt.subplot(2, 1, 1)
    plt.title('Before glog transformation')
    plt.hist(non_zero_values_before, bins=100)
    plt.ylabel('Frequency')
    plt.xlabel('Expression level')

    plt.subplot(2, 1, 2)
    plt.title('After glog transformation')
    plt.hist(non_zero_values_after, bins=100)
    plt.ylabel('Frequency')
    plt.xlabel('Expression level')

    plt.tight_layout()

    caption = (
        'The distribution of expression level before and after glog transformation. Only the values greater '
        'than the 5 percentile (usually zero in single-cell data) and lower than 95 percentile are considered.'
    )
    gn.add_current_figure_to_results(caption, zoom=2, dpi=50)

    assay['matrix'] = transformed_matrix.tolist()
    gn.export_statically(assay, 'GLog transformed assay')

    gn.commit()
Пример #13
0
def main():
    """Run a Fisher-test gene-set enrichment on thresholded gene scores.

    Reads the ``gene_scores`` import ({gene ID: score}) and the ``species``,
    ``gset_group_id``, ``threshold``, ``use_abs`` and ``background``
    arguments. Gene IDs are converted to symbols when
    ``guess_gene_id_type`` reports another type, mouse genes are mapped to
    human homologs, genes whose (optionally absolute) score reaches
    ``threshold`` are selected, and ``zgsea.simple_fisher`` is run on them.
    The selected genes and the result table are exported as CSV
    attachments.

    Raises:
        ValueError: if ``species`` is not ``'human'``/``'mouse'`` or
            ``background`` is not ``'all'``, ``'from_gene_sets'`` or
            ``'from_input'``.
    """
    gn = Granatum()

    gene_scores_dict = gn.get_import("gene_scores")
    species = gn.get_arg("species")
    gset_group_id = gn.get_arg("gset_group_id")
    threshold = gn.get_arg("threshold")
    use_abs = gn.get_arg("use_abs")
    background = gn.get_arg("background")

    # Reject unsupported options before doing any conversion work, and say
    # which value was wrong.
    if species not in ("human", "mouse"):
        raise ValueError(
            f"Unsupported species {species!r}; expected 'human' or 'mouse'.")
    if background not in ('all', 'from_gene_sets', 'from_input'):
        raise ValueError(
            f"Unsupported background {background!r}; expected 'all', "
            f"'from_gene_sets' or 'from_input'.")

    gene_ids = list(gene_scores_dict.keys())
    gene_scores = np.array(list(gene_scores_dict.values()))

    # The ID type is guessed from the first five IDs only.
    gene_id_type = guess_gene_id_type(gene_ids[:5])

    if gene_id_type != 'symbol':
        gene_ids = convert_gene_ids(gene_ids, gene_id_type, 'symbol', species)

    if species == "mouse":
        # NOTE(review): the original author recorded that gene_ids is NaN
        # after this call; that problem is still open.
        gene_ids = zgsea.to_human_homolog(gene_ids, "mouse")

    selected = (np.abs(gene_scores) if use_abs else gene_scores) >= threshold
    input_list = np.array(gene_ids)[selected]

    print(input_list)

    gn.add_result(
        f"""\
Number of genes after thresholding: {len(input_list)} (out of original {len(gene_ids)}).

Please see the attachment `list_of_genes.csv` for the list of genes considered in this enrichment analysis.""",
        'markdown',
    )

    gn.export(pd.Series(input_list).to_csv(index=False),
              'list_of_genes.csv',
              kind='raw',
              meta=None,
              raw=True)

    if background == 'all':
        background_list = get_all_genes('human')
    elif background == 'from_gene_sets':
        background_list = None
    else:  # 'from_input' (validated above)
        background_list = gene_ids

    result_df = zgsea.simple_fisher(input_list,
                                    gset_group_id,
                                    background_list=background_list)
    result_df = result_df.sort_values('fdr')
    result_df = result_df[[
        'gene_set_name',
        'size',
        'p_val',
        'fdr',
        'odds_ratio',
        'n_overlaps',
        'overlapping_genes',
    ]]
    result_df.columns = [
        'Gene set',
        'Gene set size',
        'p-value',
        'FDR',
        'Odds ratio',
        'Number of overlapping genes',
        'Overlapping genes',
    ]

    gn.add_pandas_df(result_df)
    gn.export(result_df.to_csv(index=False),
              'enrichment_results.csv',
              kind='raw',
              meta=None,
              raw=True)

    gn.commit()
Пример #14
0
def main():
    """Compute differential-expression statistics for every gene in parallel.

    Reads the ``assay`` and ``groups`` ({cell: cluster}) imports and the
    ``certainty`` argument (a percentage). Rows whose norm is below 0.1 are
    discarded, then ``compref`` is run once per remaining gene with the
    cluster membership, the complementary ("rest") membership and the
    derived ``alpha`` / ``min_zscore`` thresholds. The concatenated result
    is exported both as an assay and as ``differential_gene_sets.csv``.

    Raises:
        ValueError: if ``groups`` refers to a cell that is not a column of
            the assay.
    """
    tic = time.perf_counter()

    gn = Granatum()

    assay = gn.pandas_from_assay(gn.get_import('assay'))
    # groups is {"cell": "cluster"}
    groups = gn.get_import('groups')

    certainty = gn.get_arg('certainty')
    alpha = 1 - certainty / 100.0

    min_zscore = st.norm.ppf(certainty / 100.0)

    min_dist = 0.1

    # Discard rows (genes) whose norm is too small before computing any
    # statistics on them.
    row_norms = assay.apply(np.linalg.norm, axis=1)
    assay = assay.loc[row_norms >= min_dist, :]

    all_cells = list(assay.columns)
    known_cells = set(all_cells)
    unknown_cells = [cell for cell in groups if cell not in known_cells]
    if unknown_cells:
        raise ValueError(
            "{} cell(s) in groups are not present in the assay, e.g. {!r}".format(
                len(unknown_cells), unknown_cells[0]))

    # inv_map is {"cluster": ["cell", ...]}
    inv_map = {}
    for cell, cluster in groups.items():
        inv_map.setdefault(cluster, []).append(cell)

    # inv_map_rest is {"cluster": [every assay column outside the cluster]},
    # in assay column order. A set lookup per cluster replaces copying and
    # scanning the full column list once per cell.
    inv_map_rest = {}
    for cluster, members in inv_map.items():
        member_set = set(members)
        inv_map_rest[cluster] = [c for c in all_cells if c not in member_set]
    print("Completed setup", flush=True)

    cols = list(inv_map.keys())

    # Every ordered pair of distinct clusters, followed by each cluster
    # against the rest.
    colnames = [
        "{} vs {}".format(coli, colj)
        for coli in cols for colj in cols if coli != colj
    ]
    colnames.extend("{} vs rest".format(coli) for coli in cols)

    total_genes = len(assay.index)
    print("Executing parallel for {} genes".format(total_genes), flush=True)

    results = Parallel(
        n_jobs=math.floor(multiprocessing.cpu_count() * 2 * 9 / 10))(
            delayed(compref)(gene, assay.loc[gene, :], colnames, inv_map,
                             inv_map_rest, alpha, min_dist, min_zscore)
            for gene in tqdm(list(assay.index)))
    result = pd.concat(results, axis=0)

    gn.export_statically(gn.assay_from_pandas(result.T),
                         'Differential expression sets')
    gn.export(result.to_csv(),
              'differential_gene_sets.csv',
              kind='raw',
              meta=None,
              raw=True)

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    timing = "* Finished differential expression sets step in {} seconds*".format(
        time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()
Пример #15
0
def main():
    """Run JAMMIT for one alpha and publish the signal genes and samples.

    Reads the ``assay`` import and the ``alpha`` argument, calls
    ``JAMMIT.run_for_one_alpha`` and takes the ``u`` and ``v`` entries of
    its result. Both vectors are exported as metadata; entries whose
    absolute value exceeds ``EPSILON`` are listed in tables and CSV
    attachments, the assay restricted to those rows and columns is
    exported, and three figures (cluster map, ``u`` scatter, ``v`` line
    plot) are added to the results.
    """
    gn = Granatum()

    df = gn.pandas_from_assay(gn.get_import('assay'))
    alpha = gn.get_arg('alpha')

    model = JAMMIT.from_dfs([df])
    res = model.run_for_one_alpha(
        alpha,
        verbose=1,
        convergence_threshold=0.000000001,
    )
    u, v = res['u'], res['v']

    gn.export(dict(zip(df.index, u)), 'Genes loadings', kind='geneMeta')
    gn.export(dict(zip(df.columns, v)), 'Sample scores', kind='sampleMeta')

    # --- rows of the assay: keep those with |u| above EPSILON -------------
    gene_df = pd.DataFrame({
        'id_': df.index,
        'abs_loading': abs(u),
        'loading': u
    })[['id_', 'abs_loading', 'loading']]
    gene_is_signal = gene_df['loading'].abs() > EPSILON
    gene_df = gene_df.loc[gene_is_signal].sort_values('abs_loading',
                                                      ascending=False)

    gene_table = {
        'title': f"Signal genes ({len(gene_df)})",
        'orient': 'split',
        'columns': gene_df.columns.values.tolist(),
        'data': gene_df.values.tolist(),
    }
    gn.add_result(gene_table, data_type='table')
    gn.export(gene_df.to_csv(index=False),
              'signal_genes.csv',
              kind='raw',
              meta=None,
              raw=True)

    # --- columns of the assay: keep those with |v| above EPSILON ----------
    sample_df = pd.DataFrame({
        'id_': df.columns,
        'abs_score': abs(v),
        'score': v
    })[['id_', 'abs_score', 'score']]
    sample_is_signal = sample_df['score'].abs() > EPSILON
    sample_df = sample_df.loc[sample_is_signal].sort_values('abs_score',
                                                            ascending=False)

    sample_table = {
        'title': f"Signal samples ({len(sample_df)})",
        'orient': 'split',
        'columns': sample_df.columns.values.tolist(),
        'data': sample_df.values.tolist(),
    }
    gn.add_result(sample_table, data_type='table')
    gn.export(sample_df.to_csv(index=False),
              'signal_samples.csv',
              kind='raw',
              meta=None,
              raw=True)

    # --- assay restricted to the signal rows and columns ------------------
    subset_df = df.loc[gene_df['id_'], sample_df['id_']]
    gn.export(gn.assay_from_pandas(subset_df),
              'Assay with only signal genes and samples',
              kind='assay')

    sns.clustermap(subset_df, cmap='RdBu')
    gn.add_current_figure_to_results(
        description='Cluster map of the signal genes and signal samples',
        zoom=2,
        width=750,
        height=850,
        dpi=50,
    )
    plt.close()

    u_positions = range(len(u))
    plt.figure()
    plt.scatter(u_positions, u, s=2, c='red')
    plt.xlabel('index')
    plt.ylabel('value in u')
    gn.add_current_figure_to_results(
        description='The *u* vector (loadings for genes) plotted as a scatter plot.',
        zoom=2,
        width=750,
        height=450,
        dpi=50,
    )
    plt.close()

    v_positions = range(len(v))
    plt.figure()
    plt.plot(v_positions, v)
    plt.scatter(v_positions, v, s=6, c='red')
    plt.xlabel('index')
    plt.ylabel('value in v')
    gn.add_current_figure_to_results(
        description='The *v* vector (scores for samples) plotted as a line plot.',
        zoom=2,
        width=750,
        height=450,
        dpi=50,
    )
    plt.close()

    gn.commit()
Пример #16
0
def main():
    """Draw a 2-D scatter plot of samples coloured by a metadata value.

    Reads the ``viz_data`` import (``coords`` and ``dimNames``) and the
    ``value`` import, plus the arguments ``coloring_type``,
    ``bounding_stdev``, ``label_location``, ``label_transform``,
    ``labelXaxis``, ``labelYaxis``, ``sigfigs``, ``numticks``, ``font`` and
    ``random_seed``.

    * ``coloring_type == "categorical"``: each category gets a colour from
      the ``gist_rainbow`` colormap; categories with more than three points
      additionally get a text label and, when ``bounding_stdev >= 0``, a
      translucent polygon around the convex hull of their points.
    * ``coloring_type == "continuous"``: points are coloured with the
      ``Reds`` colormap and a colour bar is added.

    Axis limits and ticks come from ``resetArray``. If anything in the
    plotting code raises, the exception is printed and a placeholder figure
    suggesting the other colouring type is committed instead.
    """
    tic = time.perf_counter()

    gn = Granatum()
    sample_coords = gn.get_import("viz_data")
    value = gn.get_import("value")
    coloring_type = gn.get_arg("coloring_type")
    bounding_stdev = gn.get_arg("bounding_stdev")
    label_location = gn.get_arg("label_location")
    label_transform = gn.get_arg("label_transform")
    labelXaxis = gn.get_arg("labelXaxis")
    labelYaxis = gn.get_arg("labelYaxis")
    sigfigs = gn.get_arg("sigfigs")
    numticks = gn.get_arg("numticks")
    font = gn.get_arg('font')

    coords = sample_coords.get("coords")
    dim_names = sample_coords.get("dimNames")
    seed = gn.get_arg('random_seed')
    # Both generators are used below (randrange and np.random.choice).
    random.seed(seed)
    np.random.seed(seed)

    df = pd.DataFrame(
        {
            "x": [a[0] for a in coords.values()],
            "y": [a[1] for a in coords.values()],
            "value": pd.Series(value)
        },
        index=coords.keys())

    target_dpi = 300
    target_width = 7.5  # inches
    target_height = 6.5  # inches
    font_size_in_in = font / 72.0  # inches
    font_size_in_px = font_size_in_in * target_dpi

    try:

        if coloring_type == "categorical":
            uniq = df["value"].unique()
            uniq.sort(kind="stable")
            num = uniq.shape[0]
            COLORS2 = plt.get_cmap('gist_rainbow')
            carr = [0] * df.shape[0]
            listcats = list(df["value"])
            miny = min(list(df["y"]))
            maxy = max(list(df["y"]))
            # Data units per output pixel along y; used to offset labels.
            scaley = (maxy - miny) / (target_height * target_dpi)
            print("Scaley = {}".format(scaley))
            colorhash = {}
            colorstep = np.ceil(256.0 / num)
            # randrange() requires a true integer; np.ceil returns a float,
            # which newer Python versions reject with TypeError.
            coffset = randrange(int(colorstep))
            # Random permutation so neighbouring categories do not get
            # neighbouring colours.
            grouptocolor = np.random.choice(np.arange(num), num, replace=False)

            for i, cat in enumerate(uniq):
                dff = df[df["value"] == cat]
                xs = list(dff["x"])
                ys = list(dff["y"])
                colorindex = (coffset + grouptocolor[i] * colorstep) % 256
                colorhash[cat] = colorindex
                craw = COLORS2((colorindex + 0.0) / 256.0)
                clr = [craw[0], craw[1], craw[2], 0.2]
                whitetransparent = [1.0, 1.0, 1.0, 0.5]
                coloropaque = [craw[0], craw[1], craw[2], 1.0]
                if len(xs) > 3:
                    pts = list(zip(xs, ys))
                    cent = np.mean(pts, axis=0)
                    # Distance of every point from the category centroid.
                    lengs = list(
                        map(
                            lambda p: math.sqrt(
                                (p[0] - cent[0]) * (p[0] - cent[0]) +
                                (p[1] - cent[1]) * (p[1] - cent[1])), pts))
                    avgleng = st.mean(lengs)
                    stdleng = st.stdev(lengs) * bounding_stdev
                    rpts = []
                    if (stdleng > 0.0):
                        # Keep only points within the allowed spread around
                        # the mean distance before building the hull.
                        for j, ln in enumerate(lengs):
                            if (ln - avgleng < stdleng):
                                rpts.append(pts[j])
                        pts = rpts
                    cent = np.mean(pts, axis=0)
                    hull = ConvexHull(pts)
                    ptslist = []
                    for pt in hull.simplices:
                        ptslist.append(pts[pt[0]])
                        ptslist.append(pts[pt[1]])
                    # Each hull vertex appears twice (once per adjacent
                    # edge); sort by angle and keep every other entry.
                    ptslist.sort(key=lambda p: np.arctan2(
                        p[1] - cent[1], p[0] - cent[0]))
                    ptslist = ptslist[0::2]
                    # Close the polygon.
                    ptslist.insert(len(ptslist), ptslist[0])
                    lowestpt = ptslist[0]
                    if label_location == 'bottom':
                        for pt in ptslist:
                            if (pt[1] < lowestpt[1]):
                                lowestpt = pt
                    else:
                        lowestpt = ptslist[randrange(len(ptslist))]
                    if (bounding_stdev >= 0.0):
                        # Hull scaled by 1.1 about the centroid.
                        poly = Polygon(1.1 * (np.array(ptslist) - cent) + cent,
                                       facecolor=clr)
                        poly.set_capstyle('round')
                        plt.gca().add_patch(poly)
                        poly.set_color(clr)
                    label_text = cat
                    if label_transform == "numbers":
                        label_text = re.sub("[^0-9]", "", cat)
                    plt.text(lowestpt[0],
                             lowestpt[1] -
                             scaley * font_size_in_px * 1.2,
                             label_text,
                             fontsize=font,
                             fontname="Arial",
                             ha="center",
                             va="center",
                             color="black",
                             bbox=dict(boxstyle="round",
                                       fc=whitetransparent,
                                       ec=coloropaque))
                for j, x in enumerate(listcats):
                    if x == cat:
                        carr[j] = colorhash[cat]

            plt.scatter(x=df["x"],
                        y=df["y"],
                        s=5000 / df.shape[0],
                        c=carr,
                        cmap=COLORS2)
            plt.legend(markerscale=6,
                       loc='upper center',
                       bbox_to_anchor=(0.5, -0.05),
                       ncol=5)
        elif coloring_type == "continuous":
            plt.scatter(x=df["x"],
                        y=df["y"],
                        s=5000 / df.shape[0],
                        c=df["value"],
                        cmap="Reds")
            plt.colorbar()

        xmin, xmax = plt.gca().get_xlim()
        ymin, ymax = plt.gca().get_ylim()
        xtickArray = resetArray(xmin, xmax, numticks, sigfigs)
        ytickArray = resetArray(ymin, ymax, numticks, sigfigs)
        plt.xlim(xtickArray[0], xtickArray[-1])
        plt.ylim(ytickArray[0], ytickArray[-1])
        plt.xticks(xtickArray, fontsize=font, fontname="Arial")
        plt.yticks(ytickArray, fontsize=font, fontname="Arial")
        # An empty axis-label argument falls back to the dimension name.
        if labelXaxis == "":
            plt.xlabel(dim_names[0], fontsize=font, fontname="Arial")
        else:
            plt.xlabel(labelXaxis, fontsize=font, fontname="Arial")

        if labelYaxis == "":
            plt.ylabel(dim_names[1], fontsize=font, fontname="Arial")
        else:
            plt.ylabel(labelYaxis, fontsize=font, fontname="Arial")

        gn.add_current_figure_to_results(
            "Scatter-plot",
            dpi=target_dpi,
            width=target_width * target_dpi,
            height=target_height * target_dpi,
            savefig_kwargs={'bbox_inches': 'tight'})

        toc = time.perf_counter()
        time_passed = round(toc - tic, 2)

        timing = "* Finished sample coloring step in {} seconds*".format(
            time_passed)
        gn.add_result(timing, "markdown")

        gn.commit()

    except Exception as e:
        # Deliberate best-effort fallback: still commit a figure, but record
        # the real cause so failures unrelated to the colouring type can be
        # diagnosed.
        print("Scatter-plot step failed: {!r}".format(e), flush=True)

        plt.figure()
        plt.text(
            0.05, 0.7,
            'Values used as colors and type of sample metadata are incompatible with each other'
        )

        if coloring_type == 'categorical':
            new_coloring_type = 'continuous'
        else:
            new_coloring_type = 'categorical'

        plt.text(
            0.05, 0.5, 'Retry the step with ' + new_coloring_type +
            ' instead of ' + coloring_type)
        plt.axis('off')
        gn.add_current_figure_to_results('Scatter-plot')

        gn.commit()
Пример #17
0
def main():
    """Rank one-vs-rest marker genes per group and export the scores.

    Reads the 'assay' and 'groupVec' imports, builds an AnnData object with
    one observation per sample, runs scanpy's ``rank_genes_groups`` on the
    group labels and, for every group, exports the gene scores both as
    gene metadata and as a raw CSV-like text file. If anything in the
    ranking/export phase raises, a placeholder figure and the error text are
    reported instead.
    """
    tic = time.perf_counter()

    gn = Granatum()

    assay = gn.get_import('assay')
    sample_ids = assay.get('sampleIds')
    group_dict = gn.get_import('groupVec')

    # One label per sample, following the assay's sample order.
    labels = list(map(group_dict.get, sample_ids))
    group_vec = pd.Categorical(labels)
    num_groups = len(group_vec.categories)
    # 400 units of figure height for every started batch of 7 groups.
    figheight = 400 * ((num_groups - 1) // 7 + 1)

    # The matrix is transposed so that samples become observations (rows).
    expression = np.array(assay.get('matrix')).T
    adata = sc.AnnData(expression)
    adata.var_names = assay.get('geneIds')
    adata.obs_names = sample_ids
    adata.obs['groupVec'] = group_vec

    sc.pp.neighbors(adata, n_neighbors=20, use_rep='X', method='gauss')

    try:
        sc.tl.rank_genes_groups(adata, 'groupVec', n_genes=100000)
        sc.pl.rank_genes_groups(adata, n_genes=20)
        gn.add_current_figure_to_results(
            'One-vs-rest marker genes', dpi=75, height=figheight)

        gn._pickle(adata, 'adata')

        ranking = adata.uns['rank_genes_groups']
        ranked_names = ranking['names']
        ranked_scores = ranking['scores']

        # Both arrays are structured: one named field per group.
        for group in ranked_names.dtype.names:
            score_by_gene = {}
            for name_rec, score_rec in zip(ranked_names, ranked_scores):
                score_by_gene[str(name_rec[group])] = float(score_rec[group])

            gn.export(score_by_gene,
                      'Marker score ({} vs. rest)'.format(group),
                      kind='geneMeta')

            # Same scores as text: one '"gene", score' line per gene.
            csv_lines = [
                '"{}", {}'.format(gene, value)
                for gene, value in score_by_gene.items()
            ]
            gn.export("\n".join(csv_lines),
                      'Marker score {} vs rest.csv'.format(group),
                      kind='raw',
                      meta=None,
                      raw=True)

        elapsed = round(time.perf_counter() - tic, 2)
        gn.add_result(
            "* Finished marker gene identification step in {} seconds*".format(
                elapsed), "markdown")

        gn.commit()

    except Exception as e:
        # Report the failure to the user as a text-only figure plus the error.
        plt.figure()
        for y_pos, message in (
            (0.5, 'Incompatible group vector due to insufficent cells'),
            (0.3, 'Please retry the step with a different group vector'),
        ):
            plt.text(0.01, y_pos, message)
        plt.axis('off')
        gn.add_current_figure_to_results('One-vs-rest marker genes')
        gn.add_result('Error = {}'.format(e), "markdown")

        gn.commit()
Пример #18
0
def main():
    """Run the upload step: import an assay table and optional sample metadata.

    The uploaded assay file is read as csv, tsv, xls*, zip or gz (the format
    "und" means "take the format from the file suffix"). Gene IDs are
    optionally converted, the table is exported as an assay, and a preview
    of its first rows/columns is added to the results. When a sample
    metadata file was uploaded, each of its columns is exported as a
    ``sampleMeta`` entry keyed by the assay's sample IDs.
    """
    # Local import: used only to quote file paths handed to the shell below.
    import shlex

    tic = time.perf_counter()

    gn = Granatum()

    assay_file = gn.get_uploaded_file_path("assayFile")
    sample_meta_file = gn.get_uploaded_file_path("sampleMetaFile")
    file_format = gn.get_arg("fileFormat")
    file_format_meta = gn.get_arg("fileFormatMeta")
    species = gn.get_arg("species")

    # Share the email address among other gboxes using a pickle dump #
    email_address = gn.get_arg("email_address")
    shared = {"email_address": email_address}
    with open(gn.swd + "/shared.pkl", "wb") as fp:
        pickle.dump(shared, fp)

    if file_format == "und":
        file_format = Path(assay_file).suffix[1:]

    # Keyword arguments shared by every delimited-text read of the assay.
    assay_csv_kwargs = dict(index_col=0, engine='c', memory_map=True)

    if file_format == "csv":
        tb = pd.read_csv(assay_file, sep=",", **assay_csv_kwargs)
    elif file_format == "tsv":
        tb = pd.read_csv(assay_file, sep="\t", **assay_csv_kwargs)
    elif file_format.startswith("xls"):
        tb = pd.read_excel(assay_file, index_col=0)
    elif file_format == "zip":
        # Quote the path so spaces/metacharacters cannot break the command.
        quoted = shlex.quote(assay_file)
        # Remove macOS resource-fork entries, then dump the archive content
        # next to the upload as "<upload>.csv".
        os.system("zip -d {} __MACOSX/\\*".format(quoted))
        os.system("unzip -p {} > {}.csv".format(quoted, quoted))
        tb = pd.read_csv("{}.csv".format(assay_file),
                         sep=",",
                         **assay_csv_kwargs)
    elif file_format == "gz":
        quoted = shlex.quote(assay_file)
        os.system("gunzip -c {} > {}.csv".format(quoted, quoted))
        tb = pd.read_csv("{}.csv".format(assay_file),
                         sep=",",
                         **assay_csv_kwargs)
    else:
        gn.error("Unknown file format: {}".format(file_format))

    sample_ids = tb.columns.values.tolist()
    gene_ids = tb.index.values.tolist()

    # The ID type is guessed from the first five gene IDs only.
    gene_id_type = guess_gene_id_type(gene_ids[:5])

    whether_convert_id = gn.get_arg("whether_convert_id")

    if whether_convert_id:
        to_id_type = gn.get_arg("to_id_type")
        add_info = gn.get_arg("add_info")

        # if there are duplicated ids, pick the first row
        # TODO: Need to have a more sophisticated handling of duplicated ids

        gene_ids, new_meta = convert_gene_ids(gene_ids,
                                              gene_id_type,
                                              to_id_type,
                                              species,
                                              return_new_meta=True)

        # TODO: remove NaN rows
        # TODO: combine duplicated rows

        if add_info:
            # DataFrame.items() replaces iteritems(), removed in pandas 2.0.
            for col_name, col in new_meta.items():
                gn.export(col.to_dict(), col_name, "geneMeta")

    assay_export_name = "[A]{}".format(basename(assay_file))

    exported_assay = {
        "matrix": tb.values.tolist(),
        "sampleIds": sample_ids,
        "geneIds": gene_ids,
    }

    gn.export(exported_assay, assay_export_name, "assay")

    # Text preview of the top-left 10x10 corner of the table.
    entry_preview = '\n'.join(
        [', '.join(x) for x in tb.values[:10, :10].astype(str).tolist()])

    gn.add_result(
        f"""\
The assay has **{tb.shape[0]}** genes (with inferred ID type: {biomart_col_dict[gene_id_type]}) and **{tb.shape[1]}** samples.

The first few rows and columns:

```
{entry_preview}
```
""",
        "markdown",
    )

    # Collected per metadata column; currently only consumed by the
    # commented-out summary message further down.
    meta_rows = []
    if sample_meta_file is not None:
        if file_format_meta == "und":
            file_format_meta = Path(sample_meta_file).suffix[1:]

        if file_format_meta == "csv":
            sample_meta_tb = pd.read_csv(sample_meta_file)
        elif file_format_meta == "tsv":
            sample_meta_tb = pd.read_csv(sample_meta_file, sep="\t")
        elif file_format_meta.startswith("xls"):
            sample_meta_tb = pd.read_excel(sample_meta_file)
        elif file_format_meta == "zip":
            quoted_meta = shlex.quote(sample_meta_file)
            os.system("unzip -p {} > {}.csv".format(quoted_meta, quoted_meta))
            sample_meta_tb = pd.read_csv("{}.csv".format(sample_meta_file))
        elif file_format_meta == "gz":
            quoted_meta = shlex.quote(sample_meta_file)
            os.system("gunzip -c {} > {}.csv".format(quoted_meta, quoted_meta))
            sample_meta_tb = pd.read_csv("{}.csv".format(sample_meta_file))
        else:
            # Report the metadata format, not the assay format.
            gn.error("Unknown file format: {}".format(file_format_meta))

        for meta_name in sample_meta_tb.columns:
            meta_output_name = "[M]{}".format(meta_name)

            # Metadata rows are paired with assay samples purely by position.
            sample_meta_dict = dict(
                zip(sample_ids, sample_meta_tb[meta_name].values.tolist()))

            gn.export(sample_meta_dict, meta_output_name, "sampleMeta")

            num_sample_values = 5
            sample_values = ", ".join(sample_meta_tb[meta_name].astype(
                str).values[0:num_sample_values].tolist())
            num_omitted_values = len(
                sample_meta_tb[meta_name]) - num_sample_values

            if num_omitted_values > 0:
                etc = ", ... and {} more entries".format(num_omitted_values)
            else:
                etc = ""

            meta_rows.append({
                'meta_name': meta_name,
                'sample_values': str(sample_values) + etc,
            })

    # meta_message = '\n'.join(
    #     "* Sample meta with name **{meta_name}** is accepted ({sample_values}).".format(**x) for x in meta_rows
    # )

    # gn.add_result(meta_message, "markdown")

    # gn.add_result({'columns': []}, 'table')

    # TODO: SAVE assay pickle

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    timing = "* Finished upload step in {} seconds*".format(time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()
Пример #19
0
def main():
    """Build per-gene z-score columns between sample groups and export them.

    For every group the per-gene mean, standard deviation and a confidence
    interval of the mean are computed. Genes whose smallest within-group
    standard deviation is below ``min_expression_variation`` are dropped.
    Z-score-like columns are then produced for every ordered pair of groups
    ("A vs B") and for every group against all remaining samples
    ("A vs rest"), clipped to ``[-max_zscore, max_zscore]``. Genes whose row
    norm over all columns is at least ``min_zscore`` are exported.
    """
    tic = time.perf_counter()

    gn = Granatum()

    assay = gn.pandas_from_assay(gn.get_import('assay'))
    groups = gn.get_import('groups')

    min_zscore = gn.get_arg('min_zscore')
    max_zscore = gn.get_arg('max_zscore')
    min_expression_variation = gn.get_arg('min_expression_variation')

    # Invert sample -> group into group -> [samples].
    inv_map = {}
    for k, v in groups.items():
        inv_map.setdefault(v, []).append(k)

    low_mean_dfs = []
    high_mean_dfs = []
    mean_dfs = []
    std_dfs = []
    colnames = []
    for k, v in inv_map.items():
        group_values = assay.loc[:, v]
        lowbound_clust = {}
        highbound_clust = {}
        for index, row in group_values.iterrows():
            # t-based confidence interval of this gene's mean within the
            # group (library default settings).
            meanbounds = sms.DescrStatsW(row).tconfint_mean()
            lowbound_clust[index] = meanbounds[0]
            highbound_clust[index] = meanbounds[1]
        low_mean_dfs.append(pd.DataFrame.from_dict(lowbound_clust, orient="index", columns=[k]))
        high_mean_dfs.append(pd.DataFrame.from_dict(highbound_clust, orient="index", columns=[k]))
        mean_dfs.append(group_values.mean(axis=1))
        std_dfs.append(group_values.std(axis=1))
        colnames.append(k)
    mean_df = pd.concat(mean_dfs, axis=1)
    mean_df.columns = colnames
    low_mean_df = pd.concat(low_mean_dfs, axis=1)
    low_mean_df.columns = colnames
    high_mean_df = pd.concat(high_mean_dfs, axis=1)
    high_mean_df.columns = colnames
    std_df = pd.concat(std_dfs, axis=1)
    std_df.columns = colnames
    print(std_df)

    # Drop genes whose least-variable group falls below the threshold.
    minvalues = std_df.min(axis=1).to_frame()
    minvalues.columns = ["min"]
    print("Minvalues>>")
    print(minvalues, flush=True)
    genes_below_min = list((minvalues[minvalues["min"] < min_expression_variation]).index)
    print("{} out of {}".format(len(genes_below_min), len(minvalues.index)), flush=True)
    mean_df = mean_df.drop(genes_below_min, axis=0)
    low_mean_df = low_mean_df.drop(genes_below_min, axis=0)
    high_mean_df = high_mean_df.drop(genes_below_min, axis=0)
    std_df = std_df.drop(genes_below_min, axis=0)
    assay = assay.drop(genes_below_min, axis=0)
    print("Filtered assay to get {} columns by {} rows".format(len(assay.columns), len(assay.index)), flush=True)

    # Per-group statistics of "everything except this group".
    mean_rest_dfs = []
    std_rest_dfs = []
    colnames = []
    all_samples = set(assay.columns)
    for k, v in inv_map.items():
        rest_v = list(all_samples.difference(v))
        rest_values = assay.loc[:, rest_v]
        mean_rest_dfs.append(rest_values.mean(axis=1))
        std_rest_dfs.append(rest_values.std(axis=1))
        colnames.append(k)
    mean_rest_df = pd.concat(mean_rest_dfs, axis=1)
    mean_rest_df.columns = colnames
    std_rest_df = pd.concat(std_rest_dfs, axis=1)
    std_rest_df.columns = colnames

    zscore_dfs = []
    cols = colnames
    colnames = []
    for coli in cols:
        for colj in cols:
            if coli != colj:
                # Here we should check significance
                # Fetch most realistic mean comparison set, what is smallest difference between two ranges
                mean_diff_overlap_low_high = (low_mean_df[coli] - high_mean_df[colj])
                mean_diff_overlap_high_low = (high_mean_df[coli] - low_mean_df[colj])
                # range_check is defined elsewhere; it is applied element-wise
                # to the two bound differences.
                diff_df = mean_diff_overlap_low_high.combine(mean_diff_overlap_high_low, range_check)

                zscore_dfs.append((diff_df / (std_df[colj] + std_df[coli] / 4)).fillna(0).clip(-max_zscore, max_zscore))
                colnames.append("{} vs {}".format(coli, colj))
    for coli in cols:
        # Each group is compared with the rest of *that same* group. The
        # original indexed the rest frames with the stale `colj` left over
        # from the loop above, so every column used the last group's rest.
        # NOTE(review): the second denominator term keeps std_rest_df as
        # written; the pairwise formula suggests std_df[coli] / 4 may have
        # been intended - confirm.
        zscore_dfs.append(((mean_df[coli] - mean_rest_df[coli]) / (std_rest_df[coli] + std_rest_df[coli] / 4)).fillna(0).clip(-max_zscore, max_zscore))
        colnames.append("{} vs rest".format(coli))

    zscore_df = pd.concat(zscore_dfs, axis=1)
    zscore_df.columns = colnames
    # Keep genes whose Euclidean norm across all comparisons reaches min_zscore.
    norms_df = zscore_df.apply(np.linalg.norm, axis=1)
    colsmatching = norms_df.T[(norms_df.T >= min_zscore)].index.values
    return_df = zscore_df.T[colsmatching]
    gn.export_statically(gn.assay_from_pandas(return_df), 'Differential expression sets')
    gn.export(return_df.T.to_csv(), 'differential_gene_sets.csv', kind='raw', meta=None, raw=True)

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    timing = "* Finished differential expression sets step in {} seconds*".format(time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()
def main():
    """Draw a cluster network whose edges are labelled with gene-group keys.

    Reads the 'clustersvsgenes' assay (rows are comparisons such as
    "A vs B" / "A vs rest", columns are genes). Every gene with a score of
    at least ``min_zscore`` in some comparison gets a numeric key; a gene
    whose score profile lies within ``max_dist`` (root-mean-square
    difference) of an already keyed gene reuses that gene's key and adds no
    edges of its own. "A vs B" hits become weighted edges B -> A, "A vs rest"
    hits are appended to node A's label. The graph is rendered with
    Graphviz and a footnote listing the genes behind each key is added.
    """
    tic = time.perf_counter()

    gn = Granatum()

    clustersvsgenes = gn.pandas_from_assay(gn.get_import('clustersvsgenes'))
    max_dist = gn.get_arg('max_dist')
    min_zscore = gn.get_arg('min_zscore')

    clustercomparisonstotest = list(clustersvsgenes.index)

    G = nx.MultiDiGraph()
    clusternames = list(clustersvsgenes.T.columns)
    individualclusters = [
        n[:n.index(" vs rest")] for n in clusternames if n.endswith("vs rest")
    ]
    print(individualclusters, flush=True)
    for cl in individualclusters:
        G.add_node(cl)

    relabels = {}  # cluster name -> "cluster: key, key, ..." node label
    keys = {}  # gene id -> numeric key shown on edges/labels
    currentkeyindex = 0
    maxexpression = np.max(np.max(clustersvsgenes))
    print("Max expression = {}".format(maxexpression))
    num_genes = len(clustersvsgenes.columns)
    print("Number to analyze = {}".format(
        num_genes * len(clustercomparisonstotest)),
          flush=True)

    for gene_count, gene_id in enumerate(clustersvsgenes.columns, start=1):
        print("Genecount = {}/{}".format(gene_count, num_genes), flush=True)
        add_all_edges_for_current_gene = True
        # Score profile of this gene over all comparisons; invariant for the
        # distance checks below.
        gene_values = clustersvsgenes.loc[:, gene_id]
        for cluster in clustercomparisonstotest:
            score = clustersvsgenes.loc[cluster, gene_id]
            # Written as a negation so NaN scores are skipped as before.
            if not score >= min_zscore:
                continue

            add_edges = True
            if gene_id not in keys:
                # First check if within distance of another group
                closestkey = None
                closestkeyvalue = 1.0e12
                for key in keys:
                    ref_values = clustersvsgenes.loc[:, key]
                    dist = np.sqrt(
                        np.nansum(np.square(gene_values - ref_values)) /
                        len(gene_values))
                    if dist <= max_dist and dist < closestkeyvalue:
                        closestkeyvalue = dist
                        closestkey = key
                        # Stops at the first keyed gene within max_dist,
                        # which is not necessarily the nearest one.
                        break
                if closestkey is None:
                    keys[gene_id] = currentkeyindex + 1
                else:
                    keys[gene_id] = keys[closestkey]
                    add_edges = False
                    add_all_edges_for_current_gene = False
                    print("Found a near gene: {}".format(closestkey),
                          flush=True)
            else:
                add_edges = add_all_edges_for_current_gene

            if add_edges:
                from_to = re.split(' vs ', cluster)
                key_label = str(keys[gene_id])
                if from_to[1] != 'rest':
                    weight = score / maxexpression * 1.0
                    G.add_weighted_edges_from(
                        [(from_to[1], from_to[0], weight)],
                        label=key_label,
                        penwidth=str(weight))
                else:
                    relabel_dict = relabels.get(from_to[0], "")
                    if relabel_dict == "":
                        relabel_dict = from_to[0] + ": " + key_label
                    else:
                        relabel_dict = relabel_dict + ", " + key_label
                    relabels[from_to[0]] = relabel_dict
            currentkeyindex = max(currentkeyindex, keys[gene_id])

    print("Relabels {}".format(relabels), flush=True)
    G = nx.relabel_nodes(G, relabels)
    write_dot(G, 'plot.dot')
    os.system('dot plot.dot -Kcirco -Tpng -Gsize="6,6" -Gdpi=600 > plot.png')
    with open('plot.png', "rb") as f:
        image_b64 = b64encode(f.read()).decode("utf-8")

    gn.results.append({
        "type": "png",
        "width": 650,
        "height": 480,
        "description": 'Network of clusters based on expression',
        "data": image_b64,
    })

    # Group genes by key for the footnote.
    inv_map = {}
    for k, v in keys.items():
        inv_map.setdefault(v, []).append(k)

    footnote_lines = []
    for k, v in sorted(inv_map.items(), key=lambda item: item[0]):
        vliststr = ", ".join(
            "[{}]({})".format(gene, geturl(gene)) for gene in v)
        # Scores of the key's first gene; ">=" matches the threshold test
        # used when edges were selected above.
        first_gene_scores = clustersvsgenes.loc[
            clustersvsgenes[v[0]] >= min_zscore, v[0]].to_dict()
        footnote_lines.append("{}: {} {}".format(k, first_gene_scores,
                                                 vliststr))
    footnote = "  \n".join(footnote_lines)

    gn.add_result(footnote, "markdown")

    # gn.export(return_df.T.to_csv(), 'differential_gene_sets.csv', kind='raw', meta=None, raw=True)

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    timing = "* Finished differential expression sets step in {} seconds*".format(
        time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()
def main():
    """Draw a cluster network whose edges are labelled with gene-set keys.

    Reads the 'clustersvsgenes' assay (rows are comparisons such as
    "A vs B" / "A vs rest", columns are genes) and the gene sets selected by
    ``gset_group_id``. For every gene set and comparison the mean score over
    the set's genes is computed (NaNs counted as 0); when it reaches
    ``min_zscore`` the gene set receives a numeric key. "A vs B" hits become
    weighted edges B -> A, "A vs rest" hits are appended to node A's label.
    The graph is rendered with Graphviz and a footnote mapping each key to
    its gene set (with link) is added.
    """
    tic = time.perf_counter()

    gn = Granatum()

    clustersvsgenes = gn.pandas_from_assay(gn.get_import('clustersvsgenes'))
    gset_group_id = gn.get_arg('gset_group_id')
    min_zscore = gn.get_arg('min_zscore')

    clustercomparisonstotest = list(clustersvsgenes.index)

    # Load all gene sets
    gsets = load_gsets(gset_group_id)

    G = nx.MultiDiGraph()
    clusternames = list(clustersvsgenes.T.columns)
    individualclusters = [
        n[:n.index(" vs rest")] for n in clusternames if n.endswith("vs rest")
    ]
    print(individualclusters, flush=True)
    for cl in individualclusters:
        G.add_node(cl)

    relabels = {}  # cluster name -> "cluster: key, key, ..." node label
    keys = {}  # gene-set name -> numeric key shown on edges/labels
    urlsforkeys = {}
    currentkeyindex = 0
    for gset in gsets:
        gset_name = gset["name"]
        urlsforkeys[gset_name] = gset["url"]
        for cluster in clustercomparisonstotest:
            # Best effort per (gene set, comparison): a failure (e.g. genes
            # missing from the assay) is logged and the pair is skipped.
            try:
                resultdf = clustersvsgenes.loc[cluster, gset["gene_ids"]]
                resultdf = np.nan_to_num(resultdf)
                score = np.nanmean(resultdf)
                if score >= min_zscore:
                    key_label = str(keys.setdefault(gset_name,
                                                    currentkeyindex + 1))
                    print("Score = {}".format(score), flush=True)
                    from_to = re.split(' vs ', cluster)
                    if from_to[1] != 'rest':
                        G.add_weighted_edges_from(
                            [(from_to[1], from_to[0], score * 2.0)],
                            label=key_label,
                            penwidth=str(score * 2.0))
                    else:
                        relabel_dict = relabels.get(from_to[0], "")
                        if relabel_dict == "":
                            relabel_dict = from_to[0] + ": " + key_label
                        else:
                            relabel_dict = relabel_dict + ", " + key_label
                        relabels[from_to[0]] = relabel_dict
                    currentkeyindex = max(currentkeyindex, keys[gset_name])
            except Exception as inst:
                print("Key error with {}".format(gset_name), flush=True)
                print("Exception: {}".format(inst), flush=True)

    print("Relabels {}".format(relabels), flush=True)
    G = nx.relabel_nodes(G, relabels)
    write_dot(G, 'plot.dot')
    os.system("dot plot.dot -Tpng -Gdpi=600 > plot.png")
    with open('plot.png', "rb") as f:
        image_b64 = b64encode(f.read()).decode("utf-8")

    gn.results.append({
        "type": "png",
        "width": 650,
        "height": 480,
        "description": 'Network of clusters based on expression',
        "data": image_b64,
    })

    # One "key: [gene set](url)" line per keyed gene set, ordered by key.
    footnote = "  \n".join(
        "{}: [{}]({})".format(v, k, urlsforkeys[k])
        for k, v in sorted(keys.items(), key=lambda item: item[1]))

    gn.add_result(footnote, "markdown")

    # gn.export(return_df.T.to_csv(), 'differential_gene_sets.csv', kind='raw', meta=None, raw=True)

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    timing = "* Finished differential expression sets step in {} seconds*".format(
        time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()
Пример #22
0
def main():
    """Drop and merge sample groups, then export the adjusted assay/labels.

    Samples whose label is listed in ``drop_set`` are removed from both the
    assay and the label mapping. Up to three merge sets are then applied:
    every sample whose original label is in a merge set is relabelled with
    the matching ``relabel_set_N`` argument, or with the merge set's labels
    joined by " + " when that argument is empty. Errors in either phase are
    reported as results and end that phase early.
    """
    tic = time.perf_counter()

    gn = Granatum()
    assay = gn.pandas_from_assay(gn.get_import('assay'))
    groups = gn.get_import('groups')

    # Original label -> samples carrying it (built before any adjustment).
    cells_by_label = {}
    for sample, label in groups.items():
        cells_by_label.setdefault(label, []).append(sample)

    drop_set = parse(gn.get_arg('drop_set'))
    merge_sets = [
        parse(gn.get_arg(arg_name))
        for arg_name in ('merge_set_1', 'merge_set_2', 'merge_set_3')
    ]
    new_labels = [
        gn.get_arg(arg_name)
        for arg_name in ('relabel_set_1', 'relabel_set_2', 'relabel_set_3')
    ]

    # A non-empty merge set without an explicit label is named after its
    # members.
    for position, members in enumerate(merge_sets):
        if len(members) > 0 and new_labels[position] == "":
            new_labels[position] = " + ".join(members)

    try:
        for dropped_label in drop_set:
            doomed = cells_by_label[dropped_label]
            gn.add_result(
                "Dropping {} cells that match {}".format(
                    len(doomed), dropped_label), "markdown")
            assay = assay.drop(columns=doomed)
            groups = {
                sample: label
                for sample, label in groups.items() if label != dropped_label
            }
    except Exception as e:
        gn.add_result(
            "Error found in drop set, remember it should be comma separated: {}"
            .format(e), "markdown")

    try:
        for members, target_label in zip(merge_sets, new_labels):
            if len(members) > 0:
                # Collect every affected sample first so an unknown label
                # aborts before any sample of this set is relabelled.
                affected = []
                for old_label in members:
                    affected.extend(cells_by_label[old_label])
                for sample in affected:
                    groups[sample] = target_label
    except Exception as e:
        gn.add_result(
            "Error found in merge sets, remember it should be comma separated: {}"
            .format(e), "markdown")

    # Elapsed time is taken before the exports.
    time_passed = round(time.perf_counter() - tic, 2)

    gn.export_statically(gn.assay_from_pandas(assay), "Label adjusted assay")
    gn.export_statically(groups, "Adjusted labels")

    gn.add_result(
        "* Finished sample coloring step in {} seconds*".format(time_passed),
        "markdown")

    gn.commit()