Exemplo n.º 1
0
    # Project to higher dimension.
    Z = np.absolute(np.random.randn(2, 100))
    datasets = [np.dot(s, Z) for s in samples]

    # Add batch effect "noise."
    datasets = [ds + np.random.randn(1, 100) for ds in datasets]

    # Normalize datasets.
    datasets = [normalize(ds, axis=1) for ds in datasets]

    tsne = TSNE(n_iter=400, perplexity=100, verbose=2, random_state=69)

    tsne.fit(np.concatenate(datasets[1:]))
    plot_clusters(tsne.embedding_, np.concatenate(clusters[1:]), s=500)
    plt.title('Uncorrected data')
    plt.savefig('simulation_uncorrected.svg')

    # Assemble datasets.
    assembled = assemble(datasets[1:], verbose=1, sigma=1, knn=10, approx=True)
    tsne.fit(datasets[1])
    plot_clusters(tsne.embedding_, clusters[1], s=500)
    plt.title('Dataset 1')
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.savefig('simulation_ds1.svg')

    tsne.fit(datasets[2])
    plot_clusters(tsne.embedding_, clusters[2], s=500)
    plt.title('Dataset 2')
    plt.xlabel('t-SNE 1')
Exemplo n.º 2
0
    #datasets_dimred, genes = process_data(datasets, genes, hvg=hvg)
    datasets, genes = correct(datasets, genes_list)
    X = vstack(datasets).toarray()
    X[X < 0] = 0

    cell_labels = (
        open('data/cell_labels/pancreas_cluster.txt').read().rstrip().split())
    er_idx = [i for i, cl in enumerate(cell_labels) if cl == 'beta_er']
    beta_idx = [i for i, cl in enumerate(cell_labels) if cl == 'beta']

    gadd_idx = list(genes).index('GADD45A')
    herp_idx = list(genes).index('HERPUD1')

    plt.figure()
    plt.boxplot([X[er_idx, gadd_idx], X[beta_idx, gadd_idx]], showmeans=True)
    plt.title('GADD45A (p = {})'.format(
        ttest_ind(X[er_idx, gadd_idx], X[beta_idx, gadd_idx],
                  equal_var=False)[1]))
    plt.xticks([1, 2], ['beta_er', 'beta'])
    plt.ylabel('Scaled gene expression')
    plt.savefig('er_stress_GADD45A.svg')

    plt.figure()
    plt.boxplot([X[er_idx, herp_idx], X[beta_idx, herp_idx]], showmeans=True)
    plt.title('HERPUD1 (p = {})'.format(
        ttest_ind(X[er_idx, herp_idx], X[beta_idx, herp_idx],
                  equal_var=False)[1]))
    plt.xticks([1, 2], ['beta_er', 'beta'])
    plt.ylabel('Scaled gene expression')
    plt.savefig('er_stress_HERPUD1.svg')
Exemplo n.º 3
0
    )
    
    # Scanorama.
    X = np.loadtxt('data/panorama_embedding.txt')
    idx = np.random.choice(X.shape[0], size=20000, replace=False)
    sil_pan = sil(X[idx, :], labels[idx])
    print(np.median(sil_pan))

    # scran MNN.
    X = np.loadtxt('data/mnn_embedding.txt')
    idx = np.random.choice(X.shape[0], size=20000, replace=False)
    sil_mnn = sil(X[idx, :], labels[idx])
    print(np.median(sil_mnn))

    # Seurat CCA.
    X = np.loadtxt('data/cca_embedding.txt')
    idx = np.random.choice(X.shape[0], size=20000, replace=False)
    sil_cca = sil(X[idx, :], labels[idx])
    print(np.median(sil_cca))


    print(ttest_ind(sil_pan, sil_mnn))
    print(ttest_ind(sil_pan, sil_cca))
    
    plt.figure()
    plt.boxplot([ sil_pan, sil_mnn, sil_cca ], showmeans=True)
    plt.title('Distributions of Silhouette Coefficients')
    plt.xticks([ 1, 2, 3 ], [ 'Scanorama', 'scran MNN', 'Seurat CCA' ])
    plt.ylabel('Silhouette Coefficient')
    plt.savefig('silhouette.svg')
Exemplo n.º 4
0
def plot_stats(stat,
               samp_fns=None,
               fname=None,
               dtype=float,
               only_fns=None,
               only_replace=None,
               max_N=None):
    """Plot the mean of one statistic against sample size, one line per series.

    Each ``(samp_fn, replace)`` key of ``samp_fns`` becomes one series.  For
    every sample size ``N`` under that key, the values of ``stat`` found in
    the per-run stat dicts are converted with ``dtype`` and averaged; the
    shaded band around each line is the mean +/- ``ss.sem`` of those values.
    The figure is saved to ``target/stats_plots/<title>.svg``.

    Args:
        stat: Key to look up in each per-run stat dict; also used as the
            y-axis label and as part of the title / output file name.
        samp_fns: Mapping keyed by ``(samp_fn, replace)`` tuples.  Each value
            is iterated for sample sizes ``N`` and indexed by ``N`` to get an
            iterable of stat dicts.  It must also hold a
            ``('_namespace', None)`` entry, which is used in the title.
            When ``None``, it is obtained from ``parse_stats(fname)``.
        fname: Passed to ``parse_stats`` when ``samp_fns`` is ``None``;
            required in that case.
        dtype: Callable applied to each raw stat value before averaging.
        only_fns: If given, only series whose ``samp_fn`` is in this
            container are plotted.
        only_replace: If given, only series whose ``replace`` equals this
            value are plotted.
        max_N: If given, sample sizes greater than this are ignored.

    Raises:
        AssertionError: If both ``samp_fns`` and ``fname`` are ``None``.
        KeyError: If ``samp_fns`` has no ``('_namespace', None)`` entry.
    """
    if samp_fns is None:
        assert fname is not None, 'either samp_fns or fname must be given'
        samp_fns = parse_stats(fname)

    # Line colors, cycled through in order when there are more series than
    # entries.
    colors = [
        '#377eb8',
        '#ff7f00',
        '#4daf4a',
        '#984ea3',
        '#999999',
        '#e41a1c',
        '#dede00',
        '#ffe119',
        '#e6194b',
        '#ffbea3',
        '#911eb4',
        '#46f0f0',
        '#f032e6',
        '#d2f53c',
        '#008080',
        '#e6beff',
        '#aa6e28',
        '#800000',
        '#aaffc3',
        '#808000',
        '#ffd8b1',
        '#000080',
        '#808080',
        '#fabebe',
        '#a3f4ff',
    ]

    plt.figure()

    c_idx = 0

    # Sort on the same '<samp_fn>_<replace>' string used as the legend label
    # so that the series order (and hence color assignment) is stable.
    series_keys = sorted(samp_fns, key=lambda x: '{}_{}'.format(*x))

    for samp_fn, replace in series_keys:

        # Keys starting with '_' (e.g. '_namespace') are metadata, not series.
        if samp_fn.startswith('_'):
            continue
        if only_fns is not None and samp_fn not in only_fns:
            continue
        if only_replace is not None and replace != only_replace:
            continue

        per_size = samp_fns[(samp_fn, replace)]

        Ns = []
        means = []
        sems = []
        for N in per_size:
            if max_N is not None and N > max_N:
                continue
            stat_vals = [
                dtype(stat_dict[stat])
                for stat_dict in per_size[N]
                if stat in stat_dict
            ]
            if not stat_vals:
                continue
            Ns.append(N)
            means.append(np.mean(stat_vals))
            sems.append(ss.sem(stat_vals))

        # A series with no usable data would only add an empty legend entry
        # and consume a color, so leave it out of the figure entirely.
        if not Ns:
            continue

        # Sample sizes are visited in container order; sort them so the
        # line is drawn left to right.
        sort_idx = np.argsort(Ns)
        Ns = np.array(Ns)[sort_idx]
        means = np.array(means)[sort_idx]
        sems = np.array(sems)[sort_idx]

        label = '{}_{}'.format(samp_fn, replace)
        color = colors[c_idx]

        plt.plot(Ns, means, color=color, label=label)
        plt.scatter(Ns, means, color=color)
        plt.fill_between(Ns,
                         means - sems,
                         means + sems,
                         alpha=0.3,
                         color=color)

        c_idx = (c_idx + 1) % len(colors)

    namespace = samp_fns[('_namespace', None)]
    title = '{}_{}'.format(namespace, stat)
    if only_replace is not None:
        title += '_replace{}'.format(only_replace)

    plt.title(title)
    plt.xlabel('Sample size')
    plt.ylabel(stat)
    plt.legend()
    mkdir_p('target/stats_plots')
    plt.savefig('target/stats_plots/{}.svg'.format(title))
Exemplo n.º 5
0
        open('data/cell_labels/pancreas_cluster.txt')
        .read().rstrip().split()
    )
    er_idx = [ i for i, cl in enumerate(cell_labels)
               if cl == 'beta_er' ]
    beta_idx = [ i for i, cl in enumerate(cell_labels)
                 if cl == 'beta' ]

    gadd_idx = list(genes).index('GADD45A')
    herp_idx = list(genes).index('HERPUD1')

    plt.figure()
    plt.boxplot([ X[er_idx, gadd_idx], X[beta_idx, gadd_idx] ],
                showmeans=True)
    plt.title('GADD45A (p < {})'.format(ttest_ind(
        X[er_idx, gadd_idx], X[beta_idx, gadd_idx]
    )[1]))
    plt.xticks([1, 2], ['beta_er', 'beta'])
    plt.ylabel('Scaled gene expression')
    plt.savefig('er_stress_GADD45A.svg')

    plt.figure()
    plt.boxplot([ X[er_idx, herp_idx], X[beta_idx, herp_idx] ],
                showmeans=True)
    plt.title('HERPUD1 (p < {})'.format(ttest_ind(
        X[er_idx, herp_idx], X[beta_idx, herp_idx]
    )[1]))
    plt.xticks([1, 2], ['beta_er', 'beta'])
    plt.ylabel('Scaled gene expression')
    plt.savefig('er_stress_HERPUD1.svg')