示例#1
0
 def test_split_delete_cluster(self):
     """
     Split the most populated cluster, then delete ten cells.

     Checks that the split raises the cluster count from 8 to 9, and that
     the delete shrinks ``cell_sample`` by the number of cells removed while
     the cluster count stays at 9.
     """
     options = dict(frac=0.2,
                    clusters=8,
                    data_filename='data.mtx',
                    baseline_dim_red='tsvd',
                    dim_red_option='MDS',
                    cell_frac=1.0,
                    max_iters=20,
                    inner_max_iters=10)
     analysis = sc_analysis.SCAnalysis(self.data_dir, **options)
     analysis.run_full_analysis()
     # pick the label that occurs most often and split that cluster
     label_counts = Counter(analysis.labels)
     biggest_label, biggest_size = label_counts.most_common()[0]
     print(label_counts)
     print(biggest_label, biggest_size)
     analysis.recluster('split', [biggest_label])
     analysis.run_post_analysis()
     self.assertEqual(analysis.params['clusters'], 9)
     self.assertEqual(analysis.w_sampled.shape[0], 9)
     # delete the first ten cells and make sure exactly ten disappear
     cells_before = len(analysis.cell_sample)
     doomed_cells = list(range(10))
     analysis.recluster('delete', doomed_cells)
     analysis.run_post_analysis()
     self.assertEqual(analysis.params['clusters'], 9)
     cells_after = len(analysis.cell_sample)
     self.assertEqual(cells_after, cells_before - len(doomed_cells))
示例#2
0
 def test_run_full_analysis(self):
     """
     Run the complete pipeline with UMAP settings and check its outputs.

     Before the run, checks the shapes of ``cell_subset`` and
     ``data_subset``. After it, checks the ``has_*`` flags, the dimensions
     of both top-gene tables, and that ``dim_red`` has two rows.
     """
     n_clusters = 8
     analysis = sc_analysis.SCAnalysis(self.data_dir,
                                       clusters=n_clusters,
                                       frac=0.2,
                                       data_filename='data.mtx',
                                       baseline_dim_red='umap',
                                       dim_red_option='umap',
                                       normalize=True,
                                       use_fdr=True,
                                       min_reads=10,
                                       max_reads=4000,
                                       cell_frac=0.5,
                                       max_iters=20,
                                       inner_max_iters=20)
     print(analysis.data.shape)
     print(analysis.cell_subset.shape)
     print(analysis.cell_subset)
     print(analysis.data_subset.shape)
     self.assertEqual(analysis.cell_subset.shape[0], 400)
     self.assertTrue(analysis.data_subset.shape[1] > 200)
     analysis.run_full_analysis()
     # every stage of the pipeline should report that it produced output
     for flag in ('has_dim_red', 'has_pvals', 'has_top_genes_1_vs_rest',
                  'has_top_genes', 'has_baseline_vis'):
         self.assertTrue(getattr(analysis, flag))
     ranked = analysis.top_genes
     self.assertEqual(len(ranked), n_clusters)
     self.assertEqual(len(ranked[0]), analysis.data.shape[0])
     ranked_vs_rest = analysis.top_genes_1_vs_rest
     self.assertEqual(len(ranked_vs_rest), n_clusters)
     self.assertEqual(len(ranked_vs_rest[0]), analysis.data.shape[0])
     self.assertEqual(analysis.dim_red.shape[0], 2)
示例#3
0
 def test_merge_new_cluster(self):
     """
     Merge clusters 0 and 1, then create a new cluster from selected cells.

     The merge is expected to take the cluster count from 8 to 7, and the
     'new' operation to bring it back to 8.
     """
     options = dict(frac=0.2,
                    clusters=8,
                    data_filename='data.mtx',
                    baseline_dim_red='tsvd',
                    cell_frac=0.99,
                    max_iters=20,
                    inner_max_iters=20)
     analysis = sc_analysis.SCAnalysis(self.data_dir, **options)
     analysis.run_full_analysis()
     print(analysis.w_sampled.argmax(0))
     # merge the first two clusters
     to_merge = [0, 1]
     analysis.recluster('merge', to_merge)
     print(analysis.w_sampled.argmax(0))
     # the largest cluster is looked up here but no assertion uses it
     assignments = analysis.w.argmax(0)
     assignment_counts = Counter(assignments)
     largest, largest_size = assignment_counts.most_common()[0]
     analysis.run_post_analysis()
     self.assertEqual(analysis.params['clusters'], 7)
     self.assertEqual(analysis.w_sampled.shape[0], 7)
     # TODO: due to sampling, this won't actually be the cluster 7 cells...
     n_sampled = analysis.w_sampled.shape[1]
     new_members = list(range(350, n_sampled))
     analysis.recluster('new', new_members)
     self.assertEqual(analysis.params['clusters'], 8)
     self.assertEqual(analysis.w_sampled.shape[0], 8)
示例#4
0
 def test_add_color_track(self):
     """
     Register color tracks and run differential expression on them.

     Adds the reference labels once as a discrete track and once as a
     non-discrete track, reads both back, and checks the sizes of the
     one-vs-rest and pairwise differential expression results.
     """
     options = dict(frac=0.2,
                    clusters=8,
                    data_filename='data.mtx',
                    baseline_dim_red='tsvd',
                    dim_red_option='MDS',
                    clustering_method='leiden',
                    cell_frac=1.0,
                    max_iters=20,
                    inner_max_iters=10)
     analysis = sc_analysis.SCAnalysis(self.data_dir, **options)
     # discrete track: what comes back must agree with what went in
     analysis.add_color_track('true_labels', self.labs, is_discrete=True)
     stored, is_discrete = analysis.get_color_track('true_labels')
     self.assertTrue(nmi(stored, self.labs) > 0.99)
     genes, pvals = analysis.calculate_diffexp('true_labels')
     self.assertEqual(len(genes), 8)
     self.assertEqual(len(pvals), 8)
     # non-discrete track holding the same values
     analysis.add_color_track('true_labels_2', self.labs, is_discrete=False)
     stored_2, _ = analysis.get_color_track('true_labels_2')
     self.assertTrue((stored_2.astype(int) == self.labs).all())
     # the pairwise request is issued twice, exactly as before
     pair_genes, pair_pvals = analysis.calculate_diffexp('true_labels',
                                                         mode='pairwise')
     self.assertEqual(pair_genes.shape, pair_pvals.shape)
     pair_genes, pair_pvals = analysis.calculate_diffexp('true_labels',
                                                         mode='pairwise')
     self.assertEqual(pair_genes.shape, pair_pvals.shape)
     self.assertEqual(pair_genes.shape[0], 8)
     genes, pvals = analysis.calculate_diffexp('true_labels')
     self.assertEqual(len(genes[0]), len(analysis.gene_names))
     self.assertEqual(len(genes), 8)
     self.assertEqual(len(pvals), 8)
示例#5
0
 def test_load_from_folder(self):
     """
     Build an analysis object from a data folder and check what it holds.

     Checks the stored cluster count, the directory and data-file paths, and
     that ``read_counts`` equals the data matrix summed over axis 0.
     """
     data_name = 'data.mtx'
     analysis = sc_analysis.SCAnalysis(self.data_dir,
                                       clusters=8,
                                       data_filename=data_name)
     self.assertEqual(analysis.params['clusters'], 8)
     self.assertEqual(analysis.data_dir, self.data_dir)
     expected_path = os.path.join(self.data_dir, data_name)
     self.assertEqual(analysis.data_f, expected_path)
     # test read counts
     totals = self.data.sum(0)
     self.assertTrue((analysis.read_counts == totals).all())
示例#6
0
 def test_json(self):
     """
     Save a finished analysis to JSON, reload it, and relabel the cells.

     After the reload, checks the restored parameters, the ``has_*`` flags
     and the shape of ``cluster_means``. Then relabels with 'louvain' and
     'leiden' and compares the new labels against the reference labels.
     """
     analysis = sc_analysis.SCAnalysis(self.data_dir,
                                       frac=0.2,
                                       clusters=8,
                                       data_filename='data.mtx',
                                       baseline_dim_red='tsvd',
                                       dim_red_option='umap',
                                       normalize=1,
                                       cell_frac=0.5,
                                       max_iters=20,
                                       inner_max_iters=20,
                                       use_fdr=1)
     analysis.run_full_analysis()
     analysis.save_json_reset()
     # throw the object away and rebuild it from what was saved
     del analysis
     analysis = sc_analysis.SCAnalysis(self.data_dir)
     analysis = analysis.load_params_from_folder()
     expected_params = [('clusters', 8),
                        ('baseline_dim_red', 'tsvd'),
                        ('dim_red_option', 'umap'),
                        ('cell_frac', 0.5),
                        ('genes_frac', 0.2)]
     for key, value in expected_params:
         self.assertEqual(analysis.params[key], value)
     self.assertTrue(analysis.params['normalize'])
     self.assertTrue(analysis.params['use_fdr'])
     self.assertEqual(analysis.uncurl_kwargs['max_iters'], 20)
     for flag in ('has_dim_red', 'has_w', 'has_m'):
         self.assertTrue(getattr(analysis, flag))
     self.assertEqual(analysis.cell_subset.shape[0], 400)
     centers = analysis.cluster_means
     self.assertEqual(centers.shape[1], 8)
     self.assertEqual(centers.shape[0], self.data.shape[0])
     # TODO: do re-clustering
     analysis.add_color_track('true_labels', self.labs, is_discrete=True)
     labels_before = analysis.labels
     analysis.relabel('louvain')
     # relabeling must change at least one cell's label
     self.assertFalse((labels_before == analysis.labels).all())
     reference, is_discrete = analysis.get_color_track('true_labels')
     self.assertTrue(nmi(analysis.labels, reference) > 0.65)
     analysis.relabel('leiden')
     self.assertTrue(nmi(analysis.labels, reference) > 0.65)
示例#7
0
 def test_dim_red_sample(self):
     """
     Check embedding shapes when ``cell_frac`` is 0.2.

     No analysis step is called explicitly; ``mds_means`` and ``dim_red``
     are read straight from a freshly constructed analysis object.
     """
     cell_fraction = 0.2
     analysis = sc_analysis.SCAnalysis(self.data_dir,
                                       frac=0.2,
                                       clusters=8,
                                       data_filename='data.mtx',
                                       cell_frac=cell_fraction,
                                       max_iters=20,
                                       inner_max_iters=20)
     cluster_embedding = analysis.mds_means
     self.assertEqual(cluster_embedding.shape[0], 2)
     self.assertEqual(cluster_embedding.shape[1], 8)
     cell_embedding = analysis.dim_red
     self.assertEqual(cell_embedding.shape[0], 2)
     # one column per sampled cell
     expected_cells = int(cell_fraction * analysis.data.shape[1])
     self.assertEqual(cell_embedding.shape[1], expected_cells)
 def setUp(self):
     """
     Prepare a clean data directory and a labelled analysis object.

     Loads ``data/10x_pooled_400.mat``, keeps the matrix as a sparse CSC
     matrix in ``self.data`` and the flattened labels in ``self.labels``,
     recreates the test directory from scratch, writes the matrix there as
     ``data.mtx``, and builds ``self.sca`` with the labels attached as the
     discrete color track 'true_labels'.
     """
     dat = loadmat('data/10x_pooled_400.mat')
     self.data = sparse.csc_matrix(dat['data'])
     self.labels = dat['labels'].flatten()
     self.data_dir = '/tmp/uncurl_analysis/test'
     # Remove leftovers from an earlier run. A missing directory is fine,
     # so removal errors are ignored explicitly instead of relying on a
     # bare ``except`` that would also swallow KeyboardInterrupt.
     shutil.rmtree(self.data_dir, ignore_errors=True)
     os.makedirs(self.data_dir)
     scipy.io.mmwrite(os.path.join(self.data_dir, 'data.mtx'), self.data)
     self.sca = sc_analysis.SCAnalysis(self.data_dir,
                                       clusters=8,
                                       data_filename='data.mtx')
     self.sca.add_color_track('true_labels', self.labels, is_discrete=True)
示例#9
0
 def test_run_uncurl(self):
     """
     Run only the uncurl step and check the ``w``/``m`` outputs.

     Checks the result flags, the shape of ``w``, that both output files
     exist on disk, and that the labels agree with the reference labels
     (NMI above 0.65).
     """
     n_clusters = 8
     analysis = sc_analysis.SCAnalysis(self.data_dir,
                                       clusters=n_clusters,
                                       frac=0.2,
                                       data_filename='data.mtx',
                                       max_iters=20,
                                       inner_max_iters=50)
     analysis.run_uncurl()
     self.assertTrue(analysis.has_w)
     self.assertTrue(analysis.has_m)
     # w has one row per cluster and one column per cell of the input
     self.assertTrue(analysis.w.shape[0] == n_clusters)
     self.assertTrue(analysis.w.shape[1] == self.data.shape[1])
     for output_path in (analysis.w_f, analysis.m_f):
         self.assertTrue(os.path.exists(output_path))
     print(nmi(analysis.labels, self.labs))
     agreement = nmi(analysis.labels, self.labs)
     self.assertTrue(agreement > 0.65)
示例#10
0
 def test_dim_red_2(self):
     """
     Check embedding shapes with UMAP for both visualizations.

     ``mds_means``, ``dim_red`` and ``baseline_vis`` are read from a freshly
     constructed analysis object; each must have two rows, and the two
     per-cell embeddings must have one column per sampled cell.
     """
     cell_fraction = 0.2
     analysis = sc_analysis.SCAnalysis(self.data_dir,
                                       clusters=8,
                                       frac=0.2,
                                       data_filename='data.mtx',
                                       dim_red_option='UMAP',
                                       baseline_dim_red='UMAP',
                                       cell_frac=cell_fraction,
                                       max_iters=20,
                                       inner_max_iters=20)
     cluster_embedding = analysis.mds_means
     self.assertEqual(cluster_embedding.shape[0], 2)
     self.assertEqual(cluster_embedding.shape[1], 8)
     cell_embedding = analysis.dim_red
     self.assertEqual(cell_embedding.shape[0], 2)
     expected_cells = int(cell_fraction * analysis.data.shape[1])
     self.assertEqual(cell_embedding.shape[1], expected_cells)
     baseline_embedding = analysis.baseline_vis
     self.assertEqual(baseline_embedding.shape[0], 2)
     expected_cells = int(cell_fraction * analysis.data.shape[1])
     self.assertEqual(baseline_embedding.shape[1], expected_cells)
示例#11
0
 def test_merge_cluster_history(self):
     """
     Merge and split with log entries enabled, then restore a logged state.

     After a logged merge of clusters 0 and 1 and a logged split of cluster
     0, the log must hold two entries. Restoring from the first entry must
     bring back 8 clusters and the labels recorded before the merge.
     """
     options = dict(frac=0.2,
                    clusters=8,
                    data_filename='data.mtx',
                    baseline_dim_red='tsvd',
                    dim_red_option='MDS',
                    cell_frac=1.0,
                    max_iters=20,
                    inner_max_iters=10)
     analysis = sc_analysis.SCAnalysis(self.data_dir, **options)
     analysis.run_full_analysis()
     labels_at_start = analysis.labels.copy()
     # report the cluster sizes; the largest one is printed but not used
     label_counts = Counter(analysis.labels)
     biggest_label, biggest_size = label_counts.most_common()[0]
     print(label_counts)
     print(biggest_label, biggest_size)
     # merge clusters 0 and 1, recording the step in the log
     analysis.recluster('merge', [0, 1], write_log_entry=True)
     analysis.run_post_analysis()
     self.assertEqual(analysis.params['clusters'], 7)
     self.assertEqual(analysis.w_sampled.shape[0], 7)
     # split cluster 0 again, also logged
     analysis.recluster('split', [0], write_log_entry=True)
     analysis.run_post_analysis()
     self.assertEqual(analysis.params['clusters'], 8)
     # TODO: check history
     history = analysis.log
     print(history)
     self.assertEqual(len(history), 2)
     first_entry, second_entry = history[0], history[1]
     self.assertTrue(first_entry[3])
     self.assertTrue(second_entry[3])
     # go back to the state referenced by the first log entry
     analysis.restore_prev(first_entry[1])
     self.assertEqual(analysis.params['clusters'], 8)
     self.assertEqual(analysis.w_sampled.shape[0], 8)
     print(labels_at_start)
     print(analysis.labels)
     self.assertTrue((analysis.labels == labels_at_start).all())
示例#12
0
 def test_gene_names(self):
     """
     Look up 100 randomly chosen genes by name and compare with the matrix.

     For every sampled gene whose name is unique, the values returned by
     ``data_sampled_gene`` must match the corresponding row of
     ``data_sampled_all_genes``. Genes with duplicated names are skipped.
     """
     import random
     options = dict(clusters=8,
                    data_filename='data.mtx',
                    baseline_dim_red='tsvd',
                    dim_red_option='MDS',
                    cell_frac=1.0,
                    max_iters=20,
                    inner_max_iters=10)
     analysis = sc_analysis.SCAnalysis(self.data_dir, **options)
     for idx in random.sample(range(len(analysis.gene_names)), 100):
         name = analysis.gene_names[idx]
         occurrences = (analysis.gene_names == name).sum()
         if occurrences > 1:
             # a name lookup is ambiguous for duplicated names
             print('duplicate gene name')
             continue
         profile = analysis.data_sampled_gene(name)
         expected_row = analysis.data_sampled_all_genes[idx, :]
         deviation = np.abs(profile - expected_row).sum()
         self.assertTrue(deviation < 0.01)
         self.assertEqual(profile.shape[0],
                          analysis.data_sampled_all_genes.shape[1])
示例#13
0
def generate_uncurl_analysis(data, output_dir, **uncurl_kwargs):
    """
    Performs an uncurl analysis of the data, writing the results in the given
    directory. Assumes that output_dir contains a file named params.json, with
    all the parameters.

    Outputs:
        output_dir/data.txt or output_dir/data.mtx
        output_dir/m.txt
        output_dir/w.txt
        output_dir/labels.txt (integer labels)
        output_dir/top_genes.txt (json of a dict mapping cluster ids to a list of (gene_id : c_score) sorted by c_score)
        output_dir/mds_means.txt (mds of the means - 2 x k)
        output_dir/mds_data.txt (mds projection of data - 2 x n)
        output_dir/gene_subset.txt (gene subset selected by uncurl)
        output_dir/gene_names.txt (list of all gene names in data subset)
        output_dir/entropy.txt (entropy of cell labels)

    This function itself also writes output_dir/submitted (an empty marker
    file), output_dir/uncurl_kwargs.json, and, if the analysis raises,
    output_dir/error.txt containing the traceback.

    Args:
        data (array or str): either a data array (numpy array or scipy
            sparse matrix), or a string containing the path to a data file.
            An in-memory array is written to output_dir/data.mtx in Matrix
            Market format before the analysis starts.
        output_dir (str): directory to write output to.
            contains params.json, data.mtx/.txt/.gz, and optionally gene_names.txt.
        **uncurl_kwargs: arguments to pass to uncurl.run_state_estimation.

    Returns:
        None. A failure inside run_full_analysis is not re-raised; it is
        recorded in error.txt and the function returns early.
    """
    # TODO: what about init?
    # Only try to create the directory when it is missing, so that an
    # existing directory (the normal case, since params.json lives there)
    # does not produce a misleading failure message.
    if not os.path.isdir(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError:
            print('could not make output dir: {0}'.format(output_dir))
    with open(os.path.join(output_dir, 'submitted'), 'w') as f:
        f.write('')
    if isinstance(data, (np.ndarray, sparse.spmatrix)):
        # In-memory input: previously data_filename was never assigned on
        # this path, so the call below failed with an unbound-variable error.
        # Persist the matrix under the documented output name and load it
        # like any other sparse data file.
        from scipy.io import mmwrite
        data_filename = 'data.mtx'
        data_is_sparse = True
        mmwrite(os.path.join(output_dir, data_filename),
                sparse.csc_matrix(data))
    else:
        data_filename = data
        # Matrix Market files are sparse; anything else is treated as dense.
        data_is_sparse = data.endswith(('.mtx', '.mtx.gz'))
    with open(os.path.join(output_dir, 'uncurl_kwargs.json'), 'w') as f:
        json.dump(uncurl_kwargs, f)
    sca = sc_analysis.SCAnalysis(output_dir,
                                 data_filename=data_filename,
                                 data_is_sparse=data_is_sparse)
    sca.load_params_json()
    # Register per-cell sample names as a discrete color track when a
    # samples.txt file is present and no such track exists yet.
    samples_path = os.path.join(output_dir, 'samples.txt')
    if os.path.exists(samples_path) and 'samples' not in sca.color_tracks:
        samples = np.loadtxt(samples_path, dtype=str)
        sca.add_color_track('samples', samples, True)
    try:
        sca.run_full_analysis()
    except Exception:
        # Deliberate catch-all at the job boundary: the traceback is stored
        # next to the other outputs instead of propagating to the caller.
        import traceback
        text = traceback.format_exc()
        with open(os.path.join(output_dir, 'error.txt'), 'w') as f:
            f.write(text)
        return
    sca.save_json_reset()
    print('done with generate_analysis')
示例#14
0
# Start from an empty data directory. A missing directory is fine, so removal
# errors are ignored explicitly rather than through a bare ``except``.
shutil.rmtree(data_dir, ignore_errors=True)
os.makedirs(data_dir)
data = sparse.csc_matrix(dat['data'])
# write the full matrix and the gene names into the data directory
scipy.io.mmwrite(os.path.join(data_dir, 'data.mtx'), data)
shutil.copy('data/10x_pooled_400_gene_names.tsv',
            os.path.join(data_dir, 'gene_names.txt'))

sca = sc_analysis.SCAnalysis(data_dir,
                             frac=0.2,
                             clusters=8,
                             data_filename='data.mtx',
                             baseline_dim_red='tsvd',
                             dim_red_option='MDS',
                             cell_frac=1.0,
                             max_iters=20,
                             inner_max_iters=10)

sca.run_full_analysis()
# keep copies of the initial labels and w matrix before reclustering
original_labels = sca.labels.copy()
print(original_labels)
original_w = sca.w.copy()
print(original_w)
# merge clusters 0 and 1, then split cluster 0; both steps are logged
sca.recluster('merge', [0, 1], write_log_entry=True)
sca.run_post_analysis()
sca.recluster('split', [0], write_log_entry=True)
sca.run_post_analysis()
    def test_custom_label(self):
        """
        Exercise custom labels, color maps, their JSON round trip, and
        custom color tracks stored on an analysis object.

        Covers, in order: selecting cells with 'or' criteria on the true
        labels; mixing in an 'and' criterion on the cluster; labelling
        cells through a color map; saving and reloading that color map as
        JSON; registering it on ``self.sca`` and running pairwise
        differential expression on it; and updating one label's criteria.
        """
        def true_label_is(target):
            # 'or' criterion: the cell's 'true_labels' value equals target
            return custom_cell_selection.LabelCriterion(
                selection_type='true_labels',
                comparison='=',
                target=target,
                and_or='or')

        def truth_within(selector, allowed):
            # True when every selected cell has a true label in `allowed`
            chosen = self.labels[selector]
            hits = chosen == allowed[0]
            for value in allowed[1:]:
                hits = hits | (chosen == value)
            return hits.all()

        def check_colormap(assigned):
            # both labels cover 150 cells each, drawn from the right classes
            self.assertTrue((assigned == 'label1').sum() == 150)
            self.assertTrue((assigned == 'label2').sum() == 150)
            self.assertTrue(truth_within(assigned == 'label1', [0, 1, 2]))
            self.assertTrue(truth_within(assigned == 'label2', [4, 6, 7]))

        crit0 = true_label_is('0')
        crit1 = true_label_is('1')
        crit2 = true_label_is('2')
        first_three = custom_cell_selection.CustomLabel(
            'label1', criteria=[crit0, crit1, crit2])
        picked = first_three.select_cells(self.sca)
        self.assertTrue(len(picked) == 150)
        self.assertTrue(truth_within(picked, [0, 1, 2]))

        in_cluster0 = custom_cell_selection.LabelCriterion(
            selection_type='cluster',
            comparison='=',
            target='0',
            and_or='and')
        crit4 = true_label_is('4')
        crit6 = true_label_is('6')
        crit7 = true_label_is('7')
        combined = custom_cell_selection.CustomLabel(
            'label1',
            criteria=[crit0, crit1, crit2, in_cluster0, crit4, crit6, crit7])
        picked = combined.select_cells(self.sca)
        # the selection may be empty, in which case there is nothing to check
        if len(picked) > 0:
            self.assertTrue(truth_within(picked, [0, 1, 2, 4, 6, 7]))
            self.assertTrue((self.sca.labels[picked] == 0).all())

        # test colormaps
        low_label = custom_cell_selection.CustomLabel(
            'label1', criteria=[crit0, crit1, crit2])
        high_label = custom_cell_selection.CustomLabel(
            'label2', criteria=[crit4, crit6, crit7])
        cmap1 = custom_cell_selection.CustomColorMap('cmap1',
                                                     [low_label, high_label])
        check_colormap(cmap1.label_cells(self.sca))
        # test json saving/loading
        json_path = os.path.join(self.data_dir, 'cmap.json')
        custom_cell_selection.save_json(json_path, {'cmap1': cmap1})
        cmap1 = custom_cell_selection.load_json(json_path)['cmap1']
        print(custom_cell_selection.create_json(cmap1))
        check_colormap(cmap1.label_cells(self.sca))

        # test adding colormaps to sca
        self.sca.create_custom_selection('cmap1', cmap1.labels)
        # rebuild the analysis object from the same directory before use
        self.sca = sc_analysis.SCAnalysis(self.data_dir,
                                          clusters=8,
                                          data_filename='data.mtx')
        scores, pvals = self.sca.calculate_diffexp('cmap1', mode='pairwise')
        print(scores.shape)
        expected_shape = (3, 3, self.sca.data.shape[0])
        self.assertTrue(scores.shape == expected_shape)
        print(pvals.shape)
        # update color map criteria
        crit8 = true_label_is('8')
        crit9 = true_label_is('9')
        self.sca.update_custom_color_track_label('cmap1', 'label2',
                                                 [crit4, crit8, crit9])
        data, is_discrete = self.sca.get_color_track('cmap1')
        self.assertTrue((data == 'label1').sum() == 150)
        self.assertTrue((data == 'label2').sum() == 150)
        self.assertTrue(truth_within(data == 'label2', [4, 8, 9]))