# Build the simulated datasets in a single pass over `samples`:
#   1. project each sample into 100 dimensions through a shared
#      non-negative random matrix Z,
#   2. add batch effect "noise" (one random 1 x 100 row per dataset,
#      broadcast over that dataset's rows),
#   3. normalize the result along axis 1.
Z = np.abs(np.random.randn(2, 100))
datasets = [
    normalize(np.dot(sample, Z) + np.random.randn(1, 100), axis=1)
    for sample in samples
]

# One t-SNE object is reused (refit) for every embedding below.
tsne = TSNE(
    n_iter=400,
    perplexity=100,
    verbose=2,
    random_state=69,
)

# Joint embedding of every dataset except the first, before correction.
tsne.fit(np.concatenate(datasets[1:]))
plot_clusters(
    tsne.embedding_,
    np.concatenate(clusters[1:]),
    s=500,
)
plt.title('Uncorrected data')
plt.savefig('simulation_uncorrected.svg')

# Assemble datasets.
assembled = assemble(
    datasets[1:],
    verbose=1,
    sigma=1,
    knn=10,
    approx=True,
)

# Dataset 1 embedded on its own.
tsne.fit(datasets[1])
plot_clusters(tsne.embedding_, clusters[1], s=500)
plt.title('Dataset 1')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.savefig('simulation_ds1.svg')

# Dataset 2 embedded on its own.
tsne.fit(datasets[2])
plot_clusters(tsne.embedding_, clusters[2], s=500)
plt.title('Dataset 2')
plt.xlabel('t-SNE 1')
#datasets_dimred, genes = process_data(datasets, genes, hvg=hvg)
datasets, genes = correct(datasets, genes_list)

# Stack the corrected datasets into one dense matrix and clip negative
# values to zero.
X = vstack(datasets).toarray()
X[X < 0] = 0

# Read the whitespace-separated cell labels. The context manager closes
# the file deterministically instead of leaving that to garbage
# collection.
with open('data/cell_labels/pancreas_cluster.txt') as labels_file:
    cell_labels = labels_file.read().rstrip().split()

# Row indices of the two groups being compared.
er_idx = [i for i, cl in enumerate(cell_labels) if cl == 'beta_er']
beta_idx = [i for i, cl in enumerate(cell_labels) if cl == 'beta']

# Convert `genes` to a list once rather than once per lookup.
gene_names = list(genes)
gadd_idx = gene_names.index('GADD45A')
herp_idx = gene_names.index('HERPUD1')

# One box plot per marker gene; the title carries the p-value of Welch's
# t-test (equal_var=False) between the two groups.
for gene, gene_idx in (('GADD45A', gadd_idx), ('HERPUD1', herp_idx)):
    er_expr = X[er_idx, gene_idx]
    beta_expr = X[beta_idx, gene_idx]
    p_value = ttest_ind(er_expr, beta_expr, equal_var=False)[1]

    plt.figure()
    plt.boxplot([er_expr, beta_expr], showmeans=True)
    plt.title('{} (p = {})'.format(gene, p_value))
    plt.xticks([1, 2], ['beta_er', 'beta'])
    plt.ylabel('Scaled gene expression')
    plt.savefig('er_stress_{}.svg'.format(gene))
) # Scanorama. X = np.loadtxt('data/panorama_embedding.txt') idx = np.random.choice(X.shape[0], size=20000, replace=False) sil_pan = sil(X[idx, :], labels[idx]) print(np.median(sil_pan)) # scran MNN. X = np.loadtxt('data/mnn_embedding.txt') idx = np.random.choice(X.shape[0], size=20000, replace=False) sil_mnn = sil(X[idx, :], labels[idx]) print(np.median(sil_mnn)) # Seurat CCA. X = np.loadtxt('data/cca_embedding.txt') idx = np.random.choice(X.shape[0], size=20000, replace=False) sil_cca = sil(X[idx, :], labels[idx]) print(np.median(sil_cca)) print(ttest_ind(sil_pan, sil_mnn)) print(ttest_ind(sil_pan, sil_cca)) plt.figure() plt.boxplot([ sil_pan, sil_mnn, sil_cca ], showmeans=True) plt.title('Distributions of Silhouette Coefficients') plt.xticks([ 1, 2, 3 ], [ 'Scanorama', 'scran MNN', 'Seurat CCA' ]) plt.ylabel('Silhouette Coefficient') plt.savefig('silhouette.svg')
def plot_stats(stat, samp_fns=None, fname=None, dtype=float,
               only_fns=None, only_replace=None, max_N=None):
    """Plot the mean of one statistic against sample size per sampler.

    One line (with markers and a +/- one SEM band) is drawn for every
    ``(samp_fn, replace)`` key of ``samp_fns`` that passes the filters,
    and the figure is saved to ``target/stats_plots/<title>.svg``.

    Args:
        stat: Key looked up in each per-run stats dict; also used as the
            y-axis label and as part of the title.
        samp_fns: Mapping ``(samp_fn, replace) -> {N: [stats dict, ...]}``
            that also holds a ``('_namespace', None)`` entry used for the
            title. When ``None`` it is loaded via ``parse_stats(fname)``.
        fname: Passed to ``parse_stats``; required when ``samp_fns`` is
            ``None``.
        dtype: Callable applied to every raw stat value before averaging.
        only_fns: If given, only samplers contained in it are drawn.
        only_replace: If given, only keys whose ``replace`` equals it are
            drawn, and the value is appended to the title.
        max_N: If given, sample sizes greater than it are ignored.
    """
    if samp_fns is None:
        assert fname is not None
        samp_fns = parse_stats(fname)

    # NOTE(review): palette reconstructed from a line-collapsed source;
    # confirm that '#f781bf', '#a65628' and a second '#984ea3' (right
    # after the first four entries) are meant to stay disabled.
    colors = [
        '#377eb8', '#ff7f00', '#4daf4a', '#984ea3',
        '#999999', '#e41a1c', '#dede00',
        '#ffe119', '#e6194b', '#ffbea3',
        '#911eb4', '#46f0f0', '#f032e6',
        '#d2f53c', '#008080', '#e6beff',
        '#aa6e28', '#800000', '#aaffc3',
        '#808000', '#ffd8b1', '#000080',
        '#808080', '#fabebe', '#a3f4ff',
    ]

    plt.figure()

    n_drawn = 0  # Number of series drawn so far; selects the next color.
    for key in sorted(samp_fns, key=lambda k: '{}_{}'.format(*k)):
        samp_fn, replace = key

        # Keys starting with '_' (e.g. '_namespace') are metadata, not
        # series; the remaining two checks are the optional filters.
        skip = (
            samp_fn.startswith('_')
            or (only_fns is not None and samp_fn not in only_fns)
            or (only_replace is not None and replace != only_replace)
        )
        if skip:
            continue

        per_size = samp_fns[key]

        # Collect one (N, mean, sem) triple per sample size that has at
        # least one run reporting `stat`.
        points = []
        for size in per_size:
            if max_N is not None and size > max_N:
                continue
            values = [
                dtype(run[stat]) for run in per_size[size] if stat in run
            ]
            if not values:
                continue
            points.append((size, np.mean(values), ss.sem(values)))

        # Order the points by sample size so the line is drawn left to
        # right.
        order = np.argsort([size for size, _, _ in points])
        Ns = np.array([size for size, _, _ in points])[order]
        means = np.array([mean for _, mean, _ in points])[order]
        sems = np.array([sem for _, _, sem in points])[order]

        color = colors[n_drawn % len(colors)]
        plt.plot(Ns, means, color=color,
                 label='{}_{}'.format(samp_fn, replace))
        plt.scatter(Ns, means, color=color)
        plt.fill_between(Ns, means - sems, means + sems,
                         alpha=0.3, color=color)
        n_drawn += 1

    namespace = samp_fns[('_namespace', None)]
    title = '{}_{}'.format(namespace, stat)
    if only_replace is not None:
        title = '{}_replace{}'.format(title, only_replace)

    plt.title(title)
    plt.xlabel('Sample size')
    plt.ylabel(stat)
    plt.legend()

    mkdir_p('target/stats_plots')
    plt.savefig('target/stats_plots/{}.svg'.format(title))
open('data/cell_labels/pancreas_cluster.txt') .read().rstrip().split() ) er_idx = [ i for i, cl in enumerate(cell_labels) if cl == 'beta_er' ] beta_idx = [ i for i, cl in enumerate(cell_labels) if cl == 'beta' ] gadd_idx = list(genes).index('GADD45A') herp_idx = list(genes).index('HERPUD1') plt.figure() plt.boxplot([ X[er_idx, gadd_idx], X[beta_idx, gadd_idx] ], showmeans=True) plt.title('GADD45A (p < {})'.format(ttest_ind( X[er_idx, gadd_idx], X[beta_idx, gadd_idx] )[1])) plt.xticks([1, 2], ['beta_er', 'beta']) plt.ylabel('Scaled gene expression') plt.savefig('er_stress_GADD45A.svg') plt.figure() plt.boxplot([ X[er_idx, herp_idx], X[beta_idx, herp_idx] ], showmeans=True) plt.title('HERPUD1 (p < {})'.format(ttest_ind( X[er_idx, herp_idx], X[beta_idx, herp_idx] )[1])) plt.xticks([1, 2], ['beta_er', 'beta']) plt.ylabel('Scaled gene expression') plt.savefig('er_stress_HERPUD1.svg')