Пример #1
0
 def test_1_v_rest_TTest(self):
     """Sanity-check the one-vs-rest test output for every label.

     Verifies that scores and p-values come back with one entry per
     distinct label, that the leading entry of each label has a score of
     at least 1 and a p-value of at most 0.05, and that all p-values lie
     in [0, 1] with non-negative scores. The call is then repeated with
     ``calc_pvals=False`` and the leading score is checked again.
     """
     scores_by_label, pvals_by_label = one_vs_rest_t(self.data, self.labs)
     self.assertEqual(len(scores_by_label), len(pvals_by_label))
     n_labels = len(set(self.labs))
     # Unpacking also confirms that the fixture data is two-dimensional.
     n_genes, n_cells = self.data.shape
     self.assertEqual(len(scores_by_label), n_labels)
     for label in set(self.labs):
         print(label)
         print(scores_by_label[label][:10])
         print(pvals_by_label[label][:10])
         # Index 1 of each entry holds the numeric value (index 0 is
         # presumably the gene id -- confirm against one_vs_rest_t).
         p_values = np.array([entry[1] for entry in pvals_by_label[label]])
         score_values = np.array(
             [entry[1] for entry in scores_by_label[label]])
         # Leading entry of each list must look like a real marker.
         self.assertTrue(scores_by_label[label][0][1] >= 1)
         self.assertTrue(pvals_by_label[label][0][1] <= 0.05)
         # Range checks over the full lists.
         self.assertTrue((p_values >= 0).all())
         self.assertTrue((p_values <= 1).all())
         self.assertTrue((score_values >= 0).all())
     # Same call without p-value computation; only scores are inspected.
     scores_only, _ = one_vs_rest_t(
         self.data, self.labs, calc_pvals=False)
     for label in set(self.labs):
         self.assertTrue(scores_only[label][0][1] >= 1)
Пример #2
0
 def test_1_v_rest_simulated_data(self):
     """Run the one-vs-rest u-test on simulated Poisson data.

     Data is generated from a fixed 3x3 matrix of means (presumably
     genes x clusters -- confirm against ``generate_poisson_data``),
     converted to CSC, and passed to ``one_vs_rest_t`` with ``test='u'``.
     The assertions pin which gene index leads the results for labels 0
     and 1 and bound the associated p-values.
     """
     from uncurl import simulation
     cluster_means = np.array(
         [[0.5, 4.0, 3.0], [10.0, 1.0, 1.0], [0.5, 0.5, 0.5]])
     counts, assignments = simulation.generate_poisson_data(
         cluster_means, 100)
     sparse_counts = sparse.csc_matrix(counts)
     ratios, pvals = one_vs_rest_t(
         sparse_counts, assignments, eps=1e-8, test='u')
     print(ratios)
     # Empirical ratio of row 1's mean in cluster 0 vs cluster 1, for
     # eyeballing against the reported ratios.
     row1_mean_c0 = counts[1, assignments == 0].mean()
     row1_mean_c1 = counts[1, assignments == 1].mean()
     print(row1_mean_c0 / row1_mean_c1)
     print(pvals)
     # Label 0: leading entry must reference index 1.
     self.assertTrue(ratios[0][0][0] == 1)
     # Label 1: leading entry must reference index 0.
     self.assertTrue(ratios[1][0][0] == 0)
     # p-value lists must lead with the same indices and be significant;
     # the third entry for label 0 must not be.
     self.assertTrue(pvals[0][0][0] == 1)
     self.assertTrue(pvals[0][0][1] < 0.05)
     self.assertTrue(pvals[0][2][1] > 0.01)
     self.assertTrue(pvals[1][0][0] == 0)
     self.assertTrue(pvals[1][0][1] < 0.05)
Пример #3
0
def get_genes_tm(all_matrices, genes, all_labels, n_top_per_label=50,
                 n_variable=1000, min_nonzero_frac=0.1):
    """
    This function returns the top genes from Tabula Muris.

    The returned set is the union of two selections:

    1. for every distinct label, the first ``n_top_per_label`` genes of
       that label's one-vs-rest t-test result (presumably ranked
       best-first -- confirm against ``gene_extraction.one_vs_rest_t``);
    2. among the ``n_variable`` genes with the largest variance/mean
       ratio, those that are nonzero in more than ``min_nonzero_frac`` of
       the cells.

    Args:
        all_matrices: expression matrix. It is transposed when its number
            of columns equals ``len(genes)``, so that rows index genes.
        genes: sequence of gene names, indexable by row number.
        all_labels: per-cell labels; must expose ``.shape`` and be
            iterable.
        n_top_per_label: how many leading genes to keep per label.
        n_variable: how many high variance/mean genes to consider.
        min_nonzero_frac: fraction of cells in which a high-variance gene
            must be nonzero (strictly exceeded) to be kept.

    Returns:
        set of gene names.
    """
    # Orient the matrix as genes x cells.
    if len(genes) == all_matrices.shape[1]:
        all_matrices = all_matrices.T
    print(all_matrices.shape)
    print(all_labels.shape)
    n_cells = all_matrices.shape[1]

    from uncurl_analysis import gene_extraction
    scores_t, _ = gene_extraction.one_vs_rest_t(all_matrices,
                                                all_labels,
                                                eps=0.1,
                                                test='t')
    # Take the leading genes for each cell type; entry[0] is the gene index.
    all_top_genes = set()
    for label in set(all_labels):
        all_top_genes.update(
            genes[entry[0]] for entry in scores_t[label][:n_top_per_label])

    # Add highly variable genes, ranked by variance/mean (descending).
    from uncurl import preprocessing
    means, variances = preprocessing.sparse_mean_var(all_matrices)
    nonzero_counts = np.array((all_matrices > 0).sum(1)).flatten()
    dispersion = variances / (means + 1e-8)  # epsilon avoids division by 0
    candidates = dispersion.argsort()[::-1][:n_variable]
    # nonzero_counts holds, per gene, the number of cells with a nonzero
    # value, so the cutoff must scale with the number of cells. The previous
    # code compared against len(genes) / 10, which is unrelated to it.
    min_cells = min_nonzero_frac * n_cells
    candidates = candidates[nonzero_counts[candidates] > min_cells]
    all_top_genes.update(genes[idx] for idx in candidates)
    return all_top_genes
Пример #4
0
#reject, pvals_corrected, a, b = multitest.multipletests(pvals[1,0,:])
#pvals[0,0,:] = pvals_corrected

# calculate actual ratios between means of the two clusters:
# (row mean over columns with label 2 + 0.1) / (row mean over columns with
# label 1 + 0.1), flattened to a 1-D array.
import numpy as np
actual_ratios = (data_csc[:, labels == 2].mean(1) +
                 0.1) / (data_csc[:, labels == 1].mean(1) + 0.1)
actual_ratios = np.array(actual_ratios).flatten()

# calculate p-values
# NOTE(review): `t0`, `time`, `scores` and `pvals` are defined earlier in
# the script (not visible here). The results of this call are overwritten
# by the u-test below, so it is only timed here.
c_scores, c_pvals = c_scores_from_t(scores, pvals)
print('c score time: ', time.time() - t0)

# 1 vs rest u-test (replaces c_scores / c_pvals computed above)
t0 = time.time()
c_scores, c_pvals = one_vs_rest_t(data_csc, labels, eps=0.1, test='u')
print('mann-whitney u-test time: ', time.time() - t0)

# step 3: map gene ids to gene names
# Each c_pvals[k] is iterated as (gene_id, p-value) pairs; the id is
# replaced by the positional lookup table_gene_names.iloc[gene_id].
new_pvals = {k: [] for k in c_pvals.keys()}
for k, p in c_pvals.items():
    for gene_id, pv in p:
        new_pvals[k].append((table_gene_names.iloc[gene_id], pv))

# save new_pvals (keyed by label, with gene names instead of ids)
with open('scde_c_score_pvals.pkl', 'wb') as f:
    pickle.dump(new_pvals, f)

# save the u-test scores as returned (still keyed by gene id)
with open('scde_c_scores.pkl', 'wb') as f:
    pickle.dump(c_scores, f)
# save all_matrices (Matrix Market format) and all_labels (one per line)
scipy.io.mmwrite('mca_fine_all_matrices.mtx', all_matrices)
np.savetxt('mca_fine_all_labels.txt', all_labels, fmt='%s')

############################################################################################

# 1. load gene names, the expression matrix and the per-cell labels from disk
genes = np.loadtxt('genes_mca.txt', dtype=str)
all_matrices = scipy.io.mmread('mca_fine_all_matrices.mtx')
# NOTE(review): delimiter='##' is presumably chosen so that labels
# containing whitespace are kept as a single field -- confirm.
all_labels = np.loadtxt('mca_fine_all_labels.txt', dtype=str, delimiter='##')

# 2. calculate diffexp for each cluster
from uncurl_analysis import gene_extraction
import time
t0 = time.time()
scores_t, pvals_t = gene_extraction.one_vs_rest_t(all_matrices,
                                                  all_labels,
                                                  eps=0.1,
                                                  test='t')
print('diffexp time for t test:', time.time() - t0)
# NOTE(review): the u-test call below is commented out, so the "u test"
# time printed here measures nothing and will be close to zero.
t0 = time.time()
#scores_u, pvals_u = gene_extraction.one_vs_rest_t(all_matrices, all_labels, eps=0.1, test='u')
print('diffexp time for u test:', time.time() - t0)

# persist the t-test scores and p-values
with open('mca_fine_t_scores.pkl', 'wb') as f:
    pickle.dump(scores_t, f)
with open('mca_fine_t_pvals.pkl', 'wb') as f:
    pickle.dump(pvals_t, f)
#with open('mca_fine_u_pvals.pkl', 'wb') as f:
#    pickle.dump(pvals_u, f)

###########################################################################################
# 3. for each cluster, run cellmesh and cellmarker