def test_run_se(self):
    w, h, cost = uncurl.run_state_estimation(self.data, 2, dist='log-norm')
    labs = h.argmax(0)
    self.assertTrue(uncurl.evaluation.purity(labs, self.labs) > 0.85)
    w1, h1, cost = uncurl.run_state_estimation(self.data, 2, dist='gaussian')
    labs = h1.argmax(0)
    self.assertTrue(uncurl.evaluation.purity(labs, self.labs) > 0.8)
def relabel(data, m_old, w_old, cell_ids, cell_labels, **uncurl_params):
    """
    Re-runs UNCURL on the dataset, after re-initializing W.

    Args:
        data (array): genes x cells
        m_old (array): genes x k
        w_old (array): k x cells
        cell_ids (array): 1d array of cell ids to re-label
        cell_labels (array): 1d array of new cell labels, indexed by cell id
        **uncurl_params: optional kwargs to pass to uncurl

    Returns:
        M_new, W_new
    """
    k = m_old.shape[1]
    # initialize W from the old argmax labels, overriding the labels of the
    # user-selected cells
    init_weights = w_old.argmax(0)
    for c in cell_ids:
        true_label = cell_labels[c]
        init_weights[c] = true_label
    m_new, w_new, ll_new = uncurl.run_state_estimation(
            data,
            clusters=k,
            #init_means=m_old,
            init_weights=init_weights,
            **uncurl_params)
    return m_new, w_new
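# A minimal usage sketch for relabel(), using the simulation helpers that
# appear elsewhere in this codebase (uncurl.simulation). The re-assigned
# cell ids and labels below are arbitrary illustrations, not from any real
# annotation.
import numpy as np
import uncurl
from uncurl import simulation

sim_m, sim_w = simulation.generate_poisson_states(2, 200, 20)
sim_data = simulation.generate_state_data(sim_m, sim_w)
m0, w0, ll0 = uncurl.run_state_estimation(sim_data, 2, max_iters=10)
# suppose a user manually re-assigned the first three cells to cluster 1
cell_ids = np.array([0, 1, 2])
cell_labels = np.ones(sim_data.shape[1], dtype=int)
m2, w2 = relabel(sim_data, m0, w0, cell_ids, cell_labels, max_iters=10)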
def run_uncurl(data_subset, uncurl_kwargs, **kwargs):
    if uncurl_kwargs is None:
        uncurl_kwargs = {}
    m, w, ll = uncurl.run_state_estimation(data_subset,
            clusters=kwargs['clusters'],
            **uncurl_kwargs)
    return m, w
def test_leiden(self):
    m, w, ll = uncurl.run_state_estimation(self.data_subset, 8,
            max_iters=20, inner_max_iters=50)
    print('nmi basic: ' + str(nmi(self.labels, w.argmax(0))))
    g = clustering_methods.create_graph(w.T, metric='cosine')
    leiden_clustering = clustering_methods.run_leiden(g)
    self.assertTrue(nmi(self.labels, leiden_clustering) >= 0.7)
    louvain_clustering = clustering_methods.run_louvain(g)
    self.assertTrue(nmi(self.labels, louvain_clustering) >= 0.7)
def merge_clusters(data, m_old, w_old, clusters_to_merge,
        rerun_uncurl=True, **uncurl_params):
    """
    Merges a given list of clusters, returning the results of an uncurl
    re-initialization. Merging is done by averaging m_old over the clusters
    to be merged, and summing (then re-normalizing) over w_old.

    Args:
        data (array): genes x cells
        m_old (array): genes x k
        w_old (array): k x cells
        clusters_to_merge (list): list of cluster ids to merge
        rerun_uncurl (boolean): if True, re-runs uncurl with the merged
            initialization. If False, this just returns the merged
            initialization matrices.
        **uncurl_params: optional kwargs to pass to uncurl

    Returns:
        M_new, W_new
    """
    # TODO: this doesn't work for merging more than 2 clusters
    k = m_old.shape[1] - len(clusters_to_merge) + 1
    m_init_new_col = np.zeros(m_old.shape[0])
    w_init_new_row = np.zeros(w_old.shape[1])
    # drop the merged clusters from the initialization...
    clusters_to_keep = np.array([True for i in range(m_old.shape[1])])
    clusters_to_keep[list(clusters_to_merge)] = False
    m_init = m_old[:, clusters_to_keep]
    w_init = w_old[clusters_to_keep, :]
    # ...and build the merged column (mean of M) and row (sum of W)
    for c in clusters_to_merge:
        m_init_new_col += m_old[:, c]
        w_init_new_row += w_old[c, :]
    m_init_new_col = m_init_new_col / len(clusters_to_merge)
    m_init_new_col = m_init_new_col.reshape((m_old.shape[0], 1))
    w_init_new_row = w_init_new_row.reshape((1, w_old.shape[1]))
    # insert the merged cluster at the position of the first merged id
    m_init = np.hstack([
        m_init[:, 0:clusters_to_merge[0]],
        m_init_new_col,
        m_init[:, clusters_to_merge[0]:]
    ])
    w_init = np.vstack([
        w_init[0:clusters_to_merge[0], :],
        w_init_new_row,
        w_init[clusters_to_merge[0]:, :]
    ])
    w_init = w_init / w_init.sum(0)
    if not rerun_uncurl:
        # return the merged initialization without re-fitting
        return m_init, w_init
    m_new, w_new, ll_new = uncurl.run_state_estimation(data,
            clusters=k,
            init_means=m_init,
            init_weights=w_init,
            **uncurl_params)
    return m_new, w_new
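# A usage sketch for merge_clusters(), under the same simulated-data setup
# used by the tests in this repo: deliberately over-cluster, then merge two
# of the fitted clusters. The cluster ids being merged are arbitrary.
import uncurl
from uncurl import simulation

sim_m, sim_w = simulation.generate_poisson_states(2, 200, 20)
sim_data = simulation.generate_state_data(sim_m, sim_w)
m0, w0, ll0 = uncurl.run_state_estimation(sim_data, 3, max_iters=10)
# with rerun_uncurl=False only the merged initialization is returned
m_init, w_init = merge_clusters(sim_data, m0, w0, [0, 1],
        rerun_uncurl=False)
assert m_init.shape[1] == 2 and w_init.shape[0] == 2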
def test_real_data_pairwise(self):
    mat = scipy.io.loadmat('data/10x_pooled_400.mat')
    data = mat['data']
    # do uncurl, followed by update_m
    selected_genes = uncurl.max_variance_genes(data)
    data_subset = data[selected_genes, :]
    m, w, ll = uncurl.run_state_estimation(data_subset, 8, max_iters=20,
            inner_max_iters=50)
    m = uncurl.update_m(data, m, w, selected_genes)
    # test pairwise
    all_pvs, all_ratios = poisson_diffexp.uncurl_test_pairwise(m, w,
            mode='counts')
    self.assertEqual(all_pvs.shape, (data.shape[0], 8, 8))
    self.assertEqual(all_ratios.shape, (data.shape[0], 8, 8))
    self.assertTrue((all_pvs < 0.001).sum() < data.shape[0])
    self.assertTrue((all_pvs < 0.01).sum() > 100)
def split_cluster(data, m_old, w_old, cluster_to_split, **uncurl_params):
    """
    Splits a given cluster, returning the results of an uncurl
    re-initialization.

    Args:
        data (array): genes x cells
        m_old (array): genes x k
        w_old (array): k x cells
        cluster_to_split (int): cluster id to split on
        **uncurl_params: optional kwargs to pass to uncurl

    Returns:
        M_new, W_new
    """
    k = m_old.shape[1]
    k += 1
    labels = w_old.argmax(0)
    cell_subset = (labels == cluster_to_split)
    # or (sorted_labels[1,:]==cluster_to_split)
    data_subset = data[:, cell_subset]
    # run a 2-cluster initialization on the cells of the split cluster
    mean_w = np.hstack([w_old[i, labels == i] for i in set(labels)]).mean()
    new_m, new_w = state_estimation.initialize_means_weights(
            data_subset, 2, max_assign_weight=mean_w)
    # one half of the split replaces the old cluster; the other half is
    # appended as the last cluster
    m_init = np.hstack([
        m_old[:, :cluster_to_split],
        new_m[:, 0:1],
        m_old[:, cluster_to_split + 1:],
        new_m[:, 1:]
    ])
    # extend new_w to encompass all cells
    # TODO: set new row as last cluster
    new_w_2 = np.zeros((2, w_old.shape[1]))
    new_w_2[0, :] = w_old[cluster_to_split] / 2
    new_w_2[1, :] = w_old[cluster_to_split] / 2
    new_w_2[:, cell_subset] = new_w
    w_init = np.vstack([
        w_old[:cluster_to_split, :],
        new_w_2[0:1, :],
        w_old[cluster_to_split + 1:, :],
        new_w_2[1:, :]
    ])
    w_init = w_init / w_init.sum(0)
    m_new, w_new, ll_new = uncurl.run_state_estimation(data,
            clusters=k,
            init_means=m_init,
            init_weights=w_init,
            **uncurl_params)
    return m_new, w_new
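# A usage sketch for split_cluster(), again on simulated data; splitting
# cluster 0 is an arbitrary choice. The result has k+1 clusters, with one
# half of the split in place of the old cluster and the other appended last.
import uncurl
from uncurl import simulation

sim_m, sim_w = simulation.generate_poisson_states(2, 200, 20)
sim_data = simulation.generate_state_data(sim_m, sim_w)
m0, w0, ll0 = uncurl.run_state_estimation(sim_data, 2, max_iters=10)
m3, w3 = split_cluster(sim_data, m0, w0, 0, max_iters=10)
assert m3.shape[1] == 3 and w3.shape[0] == 3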
def test_run_se(self):
    """
    test the run_state_estimation function
    """
    sim_m, sim_w = simulation.generate_poisson_states(2, 200, 20)
    sim_data = simulation.generate_state_data(sim_m, sim_w)
    m, w, ll = run_state_estimation(sim_data, 2, dist='Poiss',
            max_iters=10, disp=False)
    # the recovered clusters can come back in any order, so check all
    # permutations of the cluster labels
    means_good = False
    weights_good = False
    for p in itertools.permutations([0, 1]):
        means_good = means_good or (np.mean(np.abs(sim_m - m[:, p])) < 20.0)
        weights_good = weights_good or (np.mean(np.abs(sim_w - w[p, :])) < 0.3)
    self.assertTrue(means_good)
    self.assertTrue(weights_good)
def setUp(self):
    data = scipy.io.loadmat('data/10x_pooled_400.mat')
    data_csc = data['data']
    self.labels = data['labels'].flatten()
    #gene_names = data['gene_names']
    # 2. gene selection
    genes = uncurl.max_variance_genes(data_csc)
    self.data_subset = data_csc[genes, :]
    #gene_names_subset = gene_names[genes]
    # 3. run uncurl
    m, w, ll = uncurl.run_state_estimation(self.data_subset, 8,
            max_iters=20, inner_max_iters=50)
    print('nmi basic: ' + str(nmi(self.labels, w.argmax(0))))
    self.m = m
    self.w = w
def delete_cluster(data, m_old, w_old, cluster_to_delete, **uncurl_params):
    """
    Removes a cluster from the data, and re-runs uncurl on the remaining
    cells.

    Args:
        data (array): genes x cells
        m_old (array): genes x k
        w_old (array): k x cells
        cluster_to_delete (int): cluster id to remove
        **uncurl_params: optional kwargs to pass to uncurl

    Returns:
        M_new, W_new, cells_to_include (boolean mask of retained cells)
    """
    k = m_old.shape[1] - 1
    m_init = np.hstack(
        [m_old[:, :cluster_to_delete], m_old[:, cluster_to_delete + 1:]])
    w_init = np.vstack(
        [w_old[:cluster_to_delete, :], w_old[cluster_to_delete + 1:, :]])
    w_init = w_init / w_init.sum(0)
    # drop the cells assigned to the deleted cluster
    labels = w_old.argmax(0)
    cells_to_include = (labels != cluster_to_delete)
    data_subset = data[:, cells_to_include]
    w_init = w_init[:, cells_to_include]
    m_new, w_new, ll_new = uncurl.run_state_estimation(data_subset,
            clusters=k,
            init_means=m_init,
            init_weights=w_init,
            **uncurl_params)
    return m_new, w_new, cells_to_include
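# A usage sketch for delete_cluster(). Since the deleted cluster's cells are
# dropped from the data, the boolean mask of retained cells is returned
# alongside the new M and W; the cluster id deleted here is arbitrary.
import uncurl
from uncurl import simulation

sim_m, sim_w = simulation.generate_poisson_states(2, 200, 20)
sim_data = simulation.generate_state_data(sim_m, sim_w)
m0, w0, ll0 = uncurl.run_state_estimation(sim_data, 3, max_iters=10)
m2, w2, kept = delete_cluster(sim_data, m0, w0, 2, max_iters=10)
assert w2.shape[1] == kept.sum()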
def test_10x_update_m(self):
    """
    Test after updating M
    """
    from uncurl.state_estimation import update_m
    genes = uncurl.max_variance_genes(self.data)
    data_subset = self.data[genes, :]
    # smaller # of iterations than default so it finishes faster...
    M, W, ll = uncurl.run_state_estimation(data_subset, clusters=0,
            max_iters=10, inner_max_iters=50)
    new_M = update_m(self.data, M, W, genes)
    self.assertEqual(new_M.shape, (self.data.shape[0], W.shape[0]))
    self.assertFalse(np.isnan(new_M).any())
    # test RMSE
    test_data = np.dot(new_M, W)
    error = self.data.toarray() - test_data
    error = np.sqrt(np.mean(error**2))
    print('M update RMSE:', error)
    self.assertTrue(error < 2.0)
def new_cluster(data, m_old, w_old, cells_to_add,
        rerun_uncurl=True, **uncurl_params):
    """
    Creates a new cluster by assigning all the selected cells to the new
    cluster.

    Args:
        data: array of shape (genes, cells)
        m_old: array of shape (genes, k)
        w_old: array of shape (k, cells)
        cells_to_add: array or list of ints representing cell indices to
            assign to the new cluster
        rerun_uncurl (bool): if True, re-runs uncurl with the new
            initialization and returns the fitted m and w. If False,
            returns the initialization matrices directly.
    """
    labels = w_old.argmax(0)
    k = m_old.shape[1] + 1
    genes = m_old.shape[0]
    cells = w_old.shape[1]
    m_init = np.zeros((genes, k))
    w_init = np.zeros((k, cells))
    m_init[:, :k - 1] = m_old
    w_init[:k - 1, :] = w_old
    # the new cluster's mean is the mean over the selected cells
    m_init[:, k - 1] = data[:, cells_to_add].mean(1).A1
    # weight the selected cells toward the new cluster, using the mean
    # argmax weight over all clusters as the assignment weight
    mean_w = np.hstack([w_old[i, labels == i] for i in set(labels)]).mean()
    w_init[k - 1, cells_to_add] = mean_w
    w_init[:k - 1, cells_to_add] = (1 - mean_w) / (k - 1)
    if rerun_uncurl:
        m_new, w_new, ll_new = uncurl.run_state_estimation(data,
                clusters=k,
                init_means=m_init,
                init_weights=w_init,
                **uncurl_params)
        return m_new, w_new
    else:
        return m_init, w_init
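# A usage sketch for new_cluster(). The .A1 call in the function body
# assumes a sparse matrix, so the simulated data is converted to CSC first;
# the cells assigned to the new cluster are arbitrary.
import numpy as np
import uncurl
from scipy import sparse
from uncurl import simulation

sim_m, sim_w = simulation.generate_poisson_states(2, 200, 20)
sim_data = sparse.csc_matrix(simulation.generate_state_data(sim_m, sim_w))
m0, w0, ll0 = uncurl.run_state_estimation(sim_data, 2, max_iters=10)
cells_to_add = np.array([0, 1, 2, 3])
# rerun_uncurl=False returns just the initialization matrices
m_init, w_init = new_cluster(sim_data, m0, w0, cells_to_add,
        rerun_uncurl=False)
assert m_init.shape[1] == 3 and w_init.shape[0] == 3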
def test_real_data_1_vs_rest(self):
    mat = scipy.io.loadmat('data/10x_pooled_400.mat')
    data = mat['data']
    # do uncurl, followed by update_m
    selected_genes = uncurl.max_variance_genes(data)
    data_subset = data[selected_genes, :]
    m, w, ll = uncurl.run_state_estimation(data_subset, 8, max_iters=20,
            inner_max_iters=50)
    m = uncurl.update_m(data, m, w, selected_genes)
    # TODO: how should the p-values be tested?
    all_pvs, all_ratios = poisson_diffexp.uncurl_test_1_vs_rest(m, w)
    all_pvs = np.array(all_pvs)
    all_ratios = np.array(all_ratios)
    self.assertTrue((all_pvs < 0.05).sum() > 100)
    self.assertTrue((all_ratios > 10).sum() > 100)
    self.assertEqual(all_pvs.shape, (data.shape[0], 8))
    all_pvs, all_ratios = poisson_diffexp.uncurl_test_1_vs_rest(m, w,
            mode='counts')
    all_pvs = np.array(all_pvs)
    all_ratios = np.array(all_ratios)
    self.assertEqual(all_pvs.shape, (data.shape[0], 8))
    self.assertTrue((all_pvs < 0.01).sum() > 100)
    self.assertTrue((all_pvs < 0.01).sum() < data.shape[0])
    self.assertTrue((all_ratios > 10).sum() > 100)
def test_10x_auto_cluster(self):
    """
    Test using automatic cluster size determination
    """
    from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
    # gene selection
    genes = uncurl.max_variance_genes(self.data)
    data_subset = self.data[genes, :]
    # smaller # of iterations than default so it finishes faster...
    M, W, ll = uncurl.run_state_estimation(data_subset, clusters=0,
            max_iters=10, inner_max_iters=80)
    labels = W.argmax(0)
    # NMI should be > 0.75 on 10x_pure_pooled with default iterations;
    # the threshold is relaxed to 0.6 to account for the lower iter count
    self.assertTrue(nmi(self.labs, labels) > 0.6)
    # test RMSE
    test_data = np.dot(M, W)
    error = data_subset.toarray() - test_data
    error = np.sqrt(np.mean(error**2))
    print('data subset RMSE:', error)
    self.assertTrue(error < 2.0)
def generate_uncurl_analysis(data, output_dir,
        data_type='dense',
        gene_names=None,
        gene_sub=True,
        **uncurl_kwargs):
    """
    Performs an uncurl analysis of the data, writing the results in the
    given directory.

    Outputs:
        output_dir/m.txt
        output_dir/w.txt
        output_dir/labels.txt (integer labels)
        output_dir/top_genes.txt (json of a dict mapping cluster ids to a
            list of (gene_id : c_score) sorted by c_score)
        output_dir/mds_means.txt (mds of the means)
        output_dir/mds_data.txt (mds projection of data)
        output_dir/gene_subset.txt (gene subset selected by uncurl)
        output_dir/gene_names.txt (list of all gene names in data subset)

    Args:
        data (array or str): either a data array, or a string containing
            the path to a data array.
        output_dir (str): directory to write output to.
        data_type (str): if data is a path, this indicates whether the data
            is a dense or sparse array.
        gene_names (list or array): list of all gene names
        gene_sub (bool): whether or not to use gene subset selection
            (max_variance_genes)
        **uncurl_kwargs: arguments to pass to uncurl.run_state_estimation.
            Has to include clusters=k.
    """
    try:
        os.makedirs(output_dir)
    except OSError:
        print('could not make output dir: {0}'.format(output_dir))
    if isinstance(data, str):
        if data_type == 'dense':
            data = np.loadtxt(data)
        elif data_type == 'sparse':
            data = scipy.io.mmread(data)
            data = sparse.csc_matrix(data)
    if isinstance(gene_names, str):
        gene_names = np.loadtxt(gene_names, dtype=str)
    # run uncurl
    if gene_sub:
        genes_subset = np.array(uncurl.max_variance_genes(data))
        np.savetxt(os.path.join(output_dir, 'gene_subset.txt'),
                genes_subset, fmt='%d')
        data = data[genes_subset, :]
        if gene_names is not None:
            gene_names = gene_names[genes_subset]
    print(uncurl_kwargs)
    m, w, ll = uncurl.run_state_estimation(data, **uncurl_kwargs)
    np.savetxt(os.path.join(output_dir, 'm.txt'), m)
    np.savetxt(os.path.join(output_dir, 'w.txt'), w)
    labels = w.argmax(0)
    np.savetxt(os.path.join(output_dir, 'labels.txt'), labels, fmt='%d')
    # find overexpressed genes for clusters
    top_genes = uncurl_analysis.find_overexpressed_genes(data, w.argmax(0))
    with open(os.path.join(output_dir, 'top_genes.txt'), 'w') as f:
        json.dump(top_genes, f)
    # run mds
    mds_output = uncurl.dim_reduce(m, w, 2)
    print(mds_output.shape)
    np.savetxt(os.path.join(output_dir, 'mds_means.txt'), mds_output.T)
    mds_data = uncurl.mds(m, w, 2)
    np.savetxt(os.path.join(output_dir, 'mds_data.txt'), mds_data)
    if gene_names is not None:
        np.savetxt(os.path.join(output_dir, 'gene_names.txt'),
                gene_names, fmt='%s')
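# A usage sketch for generate_uncurl_analysis() on an in-memory array (the
# docstring allows either an array or a path). The output directory below is
# hypothetical, and the uncurl kwargs must include clusters=k.
import uncurl
from uncurl import simulation

sim_m, sim_w = simulation.generate_poisson_states(2, 200, 20)
sim_data = simulation.generate_state_data(sim_m, sim_w)
generate_uncurl_analysis(sim_data, 'output/sim_analysis',
        clusters=2, max_iters=10, inner_max_iters=50)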
    return m_new, w_new


if __name__ == '__main__':
    from scipy import sparse
    from scipy.io import loadmat
    from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
    # TODO: load dataset
    dat = loadmat('data/10x_pooled_400.mat')
    data = sparse.csc_matrix(dat['data'])
    labs = dat['labels'].flatten()
    genes = uncurl.max_variance_genes(data)
    data_subset = data[genes, :]
    data_subset = uncurl.preprocessing.cell_normalize(data_subset)
    m, w, ll = uncurl.run_state_estimation(data_subset, 8, max_iters=0,
            inner_max_iters=1)
    m1, w1, ll1 = uncurl.run_state_estimation(data_subset, 8, max_iters=10,
            inner_max_iters=25)
    b = data_subset[:, 0].toarray()
    results = irls(m, b.flatten(), Normal(), IDLink(), restrict=False)
    print(results)
    print(lstsq(m, b.flatten())[0])
    print('poisson error, id link')
    results = irls(m, b.flatten(), tol=1e-8)
    print(results)
    m_new, w_new = irls_uncurl(data_subset, 8, m, w)
    #print('poisson error, log link')
    #results = irls(m, b.flatten(), link=LogLink())
import time

import numpy as np
from uncurl_analysis import poisson_diffexp
import scipy.io
import uncurl

mat = scipy.io.loadmat('data/10x_pooled_400.mat')
data = mat['data']

# do uncurl, followed by update_m
selected_genes = uncurl.max_variance_genes(data)
data_subset = data[selected_genes, :]
m, w, ll = uncurl.run_state_estimation(data_subset, 8, max_iters=20,
        inner_max_iters=50)
m = uncurl.update_m(data, m, w, selected_genes)

t0 = time.time()
all_pvs, all_ratios = poisson_diffexp.uncurl_poisson_test_1_vs_rest(
        m, w, mode='counts')
print('diffexp time: ', time.time() - t0)

t0 = time.time()
all_pvs_2, all_ratios_2 = poisson_diffexp.uncurl_poisson_test_pairwise(
        m, w, mode='counts')
print('pairwise diffexp time: ', time.time() - t0)

# test on simulated data

# plotting mw
import matplotlib.pyplot as plt
    start_id = json.load(open("/ti/input/start_id.json"))
else:
    start_id = None

if os.path.exists('/ti/input/groups_n.json'):
    groups_n = json.load(open('/ti/input/groups_n.json'))
    groups_n = groups_n[0]
else:
    groups_n = 0

checkpoints["method_afterpreproc"] = time.time()

## Trajectory inference -----------------------------------

count_data = expression.values
count_data = count_data.T

m, w, ll = uncurl.run_state_estimation(count_data, 2)

checkpoints["method_aftermethod"] = time.time()

# extract the component and use it as pseudotimes
cell_ids = pd.DataFrame({
    "cell_ids": expression.index
})
pseudotime = pd.DataFrame({
    "pseudotime": w[0, :],
    "cell_id": expression.index
})

# use w as dim_red
dimred = pd.DataFrame({
# 1. load data
data = scipy.io.loadmat('data/10x_pooled_400.mat')
data_csc = data['data']
labels = data['labels'].flatten()
gene_names = data['gene_names']

# 2. gene selection
genes = uncurl.max_variance_genes(data_csc)
data_subset = data_csc[genes, :]
gene_names_subset = gene_names[genes]

# 3. run uncurl
m, w, ll = uncurl.run_state_estimation(data_subset, 8)
print('nmi basic: ' + str(nmi(labels, w.argmax(0))))

# 4. run clustering
for metric in ['euclidean', 'cosine']:
    for n_neighbors in [10, 15, 20]:
        print('n_neighbors: ', n_neighbors, ' metric: ', metric)
        w_graph = clustering_methods.create_graph(w.T,
                n_neighbors=n_neighbors, metric=metric)
        clusters = clustering_methods.run_leiden(w_graph)
        print('nmi leiden: ' + str(nmi(labels, clusters)))
        clusters_louvain = clustering_methods.run_louvain(w_graph)
        print('nmi louvain: ' + str(nmi(labels, clusters_louvain)))

# 5. try running clustering w/o uncurl
clustering_result = clustering_methods.baseline_cluster(data_subset)
args = parse_args()
print("run with these parameters: %s" % str(args))

data = pd.read_csv(args.input, index_col=0)
if args.gene_subset == 'non_zero':
    # select nonzero genes
    genes_subset = np.sum(data.values, axis=1) != 0
elif args.gene_subset == 'max_variance':
    # select genes with max variance
    genes_subset = max_variance_genes(data.values, nbins=5, frac=0.2)
else:
    raise NotImplementedError("option `%s` for `gene_subset` not defined."
            % args.gene_subset)
data_subset = data.iloc[genes_subset, :]

M, W, ll = run_state_estimation(data_subset.values,
        clusters=args.clusters,
        dist=args.dist,
        disp=True,
        max_iters=args.max_iters,
        inner_max_iters=args.inner_max_iters,
        initialization=args.initialization,
        threads=args.threads)
print("ll: %f" % ll)
data.iloc[genes_subset, :] = np.matmul(M, W)  # imputation

make_sure_dir_exists(args.outputdir)
# write all outputs into the output directory
np.savetxt(os.path.join(args.outputdir, "genes_subset.csv"),
        genes_subset, delimiter=",")
np.savetxt(os.path.join(args.outputdir, "M.csv"), M, delimiter=",")
np.savetxt(os.path.join(args.outputdir, "W.csv"), W, delimiter=",")
data.to_csv(os.path.join(args.outputdir, "uncurl_output.csv"))
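# A hypothetical invocation of this script. The script name and flag names
# are inferred from the args fields referenced above and should be adjusted
# to match the actual parse_args() definition:
#
#   python run_uncurl_imputation.py \
#       --input counts.csv \
#       --gene_subset max_variance \
#       --clusters 8 --dist Poiss \
#       --max_iters 20 --inner_max_iters 50 \
#       --initialization tsvd --threads 4 \
#       --outputdir results/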