def preproc_data(data):
    """
    Basic data preprocessing: gene selection, normalization and SVD.

    The rows returned by uncurl.max_variance_genes are kept, the result is
    passed through cell_normalize and log1p, transposed, and reduced with
    an 8-component TruncatedSVD.

    Args:
        data: matrix whose rows are indexed by the selected gene subset
            (presumably genes x cells -- confirm against callers).

    Returns:
        The 8-component TruncatedSVD embedding, with one row per column
        of `data`.
    """
    import uncurl
    from uncurl.preprocessing import log1p, cell_normalize
    from sklearn.decomposition import TruncatedSVD

    top_genes = uncurl.max_variance_genes(data)
    reduced_input = data[top_genes, :]
    reducer = TruncatedSVD(8)
    # transpose so that the SVD sees one sample per column of `data`
    return reducer.fit_transform(log1p(cell_normalize(reduced_input)).T)
def one_vs_rest_t(data, labels, eps=1.0, calc_pvals=True, test='t', normalize=False, use_fdr=False):
    """
    Computes 1-vs-rest t-test for all clusters and genes.

    If calc_pvals is False, the pvals will be all 0.

    Args:
        data: matrix of shape (genes, cells); converted to CSC internally.
        labels: sequence with one (sortable, hashable) label per cell.
        eps (float): forwarded unchanged to the underlying test function.
        calc_pvals (bool): forwarded unchanged to the underlying test function.
        test (str): either 't' or 'u'. 't' indicates that the 2-sample
            t-test will be used, and 'u' is the Mann-Whitney U test.
        normalize (bool): if True, uncurl's cell_normalize is applied to the
            data before testing.
        use_fdr (bool): if True, the p-values of every label are replaced by
            the corrected values returned by statsmodels' fdrcorrection.

    Returns:
        scores (dict): map of labels to tuples (array of gene ids, array of
            ratios) sorted by descending ratio
        pvals (dict): map of labels to tuples (array of gene ids, array of
            pvals) sorted by ascending pval

    Raises:
        ValueError: if `test` is not 't' or 'u', or if the number of labels
            does not match the number of cells.
    """
    # Validate up front: previously an unknown `test` value only surfaced
    # later as an UnboundLocalError on `test_func`.
    if test not in ('t', 'u'):
        raise ValueError("test must be either 't' or 'u', got {0!r}".format(test))
    data_csc = sparse.csc_matrix(data)
    genes, cells = data_csc.shape
    if cells != len(labels):
        raise ValueError('length of data not equal to length of labels')
    if normalize:
        from uncurl.preprocessing import cell_normalize
        data_csc = cell_normalize(data_csc)
    # map from label names to contiguous integer indices (sorted label order)
    labels_map = {name: index for index, name in enumerate(sorted(set(labels)))}
    labels_array = np.array([labels_map[name] for name in labels], dtype=int)
    if test == 't':
        test_func = csc_unweighted_1_vs_rest_t_test
    else:
        test_func = csc_unweighted_1_vs_rest_rank_sum_test
    scores, pvals = test_func(data_csc.data,
                              data_csc.indices,
                              data_csc.indptr,
                              labels_array,
                              cells,
                              genes,
                              eps,
                              calc_pvals)
    # map the integer cluster indices back to the original label names
    new_scores = {name: scores[index] for name, index in labels_map.items()}
    new_pvals = {name: pvals[index] for name, index in labels_map.items()}
    if use_fdr:
        from statsmodels.stats.multitest import fdrcorrection
        for name, entry in list(new_pvals.items()):
            corrected = fdrcorrection(entry[1])[1]
            try:
                entry[1] = corrected
            except TypeError:
                # entry is an immutable tuple: rebuild it instead of
                # assigning in place
                new_pvals[name] = (entry[0], corrected)
    return new_scores, new_pvals
def load_data(paths, genes_path, genes_subset_path=None, normalize=True, log=True):
    """
    Loads every '.mtx.gz' matrix found in the given directories and stacks
    them into a single matrix.

    Each file is read as a matrix whose columns are counted as cells; the
    cell label is the part of the file name before the first '.'.

    Args:
        paths: iterable of directories to scan for '.mtx.gz' files.
        genes_path: text file of gene names, read with np.loadtxt.
        genes_subset_path: optional path forwarded to load_genes_subset to
            restrict the matrix and gene list.
        normalize (bool): apply uncurl's cell_normalize
            (multiply_means=False) to the stacked matrix.
        log (bool): apply uncurl's log1p to the stacked matrix.

    Returns:
        data - cells x genes csr matrix
        genes - array of gene names
        all_labels - all cell labels

    Raises:
        ValueError: if no '.mtx.gz' file is found in any of the directories.
    """
    paths = list(paths)
    all_matrices = []
    all_labels = []
    for data_path in paths:
        # os.listdir returns entries in arbitrary order; sort so the order of
        # cells and labels is reproducible across machines and runs.
        for f in sorted(os.listdir(data_path)):
            if not f.endswith('.mtx.gz'):
                continue
            file_path = os.path.join(data_path, f)
            label = f.split('.')[0]
            print(label)
            data = sparse.csc_matrix(scipy.io.mmread(file_path))
            all_matrices.append(data)
            all_labels.extend([label] * data.shape[1])
            print('num cells: ', data.shape[1])
    if not all_matrices:
        # fail with a clear message instead of an opaque hstack error
        raise ValueError('no .mtx.gz files found in: {0}'.format(
            ', '.join(str(p) for p in paths)))
    all_matrices = sparse.hstack(all_matrices)
    all_labels = np.array(all_labels)
    print('data matrix shape:', all_matrices.shape)
    print('number of cells:', all_labels.shape)
    from uncurl import preprocessing
    if normalize:
        all_matrices = preprocessing.cell_normalize(all_matrices, multiply_means=False)
    if log:
        all_matrices = preprocessing.log1p(all_matrices)
    # transpose to cells x genes
    all_matrices = sparse.csr_matrix(all_matrices.T)
    print('matrices normalized')
    genes = np.loadtxt(genes_path, dtype=str)
    if genes_subset_path is not None:
        all_matrices, genes = load_genes_subset(all_matrices, genes, genes_subset_path)
    return all_matrices, genes, all_labels
def pairwise_t(data, w_or_labels, eps=1.0, calc_pvals=True, normalize=False, use_fdr=False):
    """
    Computes pairwise t-test between all pairs of clusters and all genes.

    If calc_pvals is False, the pvals will be all 0.

    Args:
        data: matrix of shape (genes, cells); converted to CSC internally.
        w_or_labels: either a 2-d weight matrix (dispatched to the weighted
            test) or a 1-d sequence with one label per cell (dispatched to
            the unweighted test). Label sequences may be plain lists.
        eps (float): forwarded to the unweighted test only.
        calc_pvals (bool): forwarded to the underlying test function.
        normalize (bool): if True, uncurl's cell_normalize is applied first.
        use_fdr (bool): if True, the p-values of every cluster pair are
            replaced by the corrected values from statsmodels' fdrcorrection.

    Returns:
        ratios, pvals - two arrays of shape (k, k, genes), where k is the
        number of clusters.

    Raises:
        ValueError: if labels are given and their number does not match the
            number of cells.
    """
    data_csc = sparse.csc_matrix(data)
    if normalize:
        from uncurl.preprocessing import cell_normalize
        data_csc = cell_normalize(data_csc)
    genes, cells = data_csc.shape
    # np.ndim also works for lists, which have no .shape attribute
    if np.ndim(w_or_labels) == 2:
        scores, pvals = csc_weighted_t_test(data_csc.data,
                                            data_csc.indices,
                                            data_csc.indptr,
                                            w_or_labels,
                                            cells,
                                            genes,
                                            calc_pvals)
    else:
        if cells != len(w_or_labels):
            raise ValueError('length of data not equal to length of labels')
        # map label names to contiguous integer indices (sorted label order)
        labels_map = {name: index for index, name in enumerate(sorted(set(w_or_labels)))}
        labels_array = np.array([labels_map[name] for name in w_or_labels], dtype=int)
        scores, pvals = csc_unweighted_t_test(data_csc.data,
                                              data_csc.indices,
                                              data_csc.indptr,
                                              labels_array,
                                              cells,
                                              genes,
                                              eps,
                                              calc_pvals)
    # TODO: re-map keys? or would that mess up the c-score calculation?
    if use_fdr:
        from statsmodels.stats.multitest import fdrcorrection
        new_pvals = np.zeros(pvals.shape)
        # correct each (cluster, cluster) gene vector independently
        for k1 in range(pvals.shape[0]):
            for k2 in range(pvals.shape[1]):
                new_pvals[k1, k2, :] = fdrcorrection(pvals[k1, k2, :])[1]
        pvals = new_pvals
    return scores, pvals
def preproc_data(data, gene_subset=False, **kwargs):
    """
    Basic preprocessing applied before running the gap score.

    Assumes that data is a matrix of shape (genes, cells). The data is
    optionally restricted to the genes chosen by uncurl.max_variance_genes,
    then passed through cell_normalize and log1p, transposed, and reduced
    with TruncatedSVD.

    Args:
        data: matrix of shape (genes, cells).
        gene_subset (bool): if True, keep only the max-variance genes.
        **kwargs: accepted but not used.

    Returns:
        A matrix with one row per cell and min(8, n_genes - 1) SVD
        components (8 is an arbitrary choice).
    """
    import uncurl
    from uncurl.preprocessing import log1p, cell_normalize
    from sklearn.decomposition import TruncatedSVD

    if gene_subset:
        selected = uncurl.max_variance_genes(data)
        working = data[selected, :]
    else:
        working = data
    # never ask for more components than the gene count allows
    n_components = min(8, working.shape[0] - 1)
    reducer = TruncatedSVD(n_components)
    return reducer.fit_transform(log1p(cell_normalize(working)).T)
def identify_mnns(data1, data2, k=20, n_components=20, metric='cosine'):
    """
    Identify mutual nearest neighbors in the two datasets.

    Both datasets are stacked, normalized, log-transformed and projected
    with a randomized SVD; nearest neighbors are then searched across the
    datasets in the reduced space using nmslib HNSW indices.

    Args:
        data1, data2: sparse (csc) matrices of shape genes x cells. data1 is
            the target matrix and data2 is the reference matrix.
        k (int): number of nearest neighbors
        n_components (int): number of components for tsvd
        metric (str): cosine or euclidean

    Returns:
        a list of lists with one entry per cell in data1: entry i holds the
        (sorted) indices of the cells in data2 that are mutual nearest
        neighbors of cell i.

    Raises:
        KeyError: if metric is not 'cosine' or 'euclidean'.
    """
    n_cells1 = data1.shape[1]
    # 1. compute tsvd on sparse matrices
    data_combined = scipy.sparse.hstack([data1, data2])
    U, Sigma, VT = randomized_svd(log1p(cell_normalize(data_combined)).T, n_components)
    # rows of data_reduced are cells: first the cells of data1, then data2
    data_reduced = U*Sigma
    reduced1 = data_reduced[:n_cells1, :]
    reduced2 = data_reduced[n_cells1:, :]
    # 2. build nearest-neighbor indices using nmslib
    # space can be cosinesimil or l2
    metric_to_space = {'cosine': 'cosinesimil', 'euclidean': 'l2'}
    space = metric_to_space[metric]
    index1 = nmslib.init(method='hnsw', space=space)
    index1.addDataPointBatch(reduced1)
    # the index has to be built before it can be queried
    index1.createIndex()
    index2 = nmslib.init(method='hnsw', space=space)
    index2.addDataPointBatch(reduced2)
    index2.createIndex()
    # 3. query each dataset against the index of the other one;
    # knnQueryBatch returns one (ids, distances) pair per query row
    neighbors1 = [set(int(j) for j in ids)
                  for ids, _ in index2.knnQueryBatch(reduced1, k=k)]
    neighbors2 = [set(int(j) for j in ids)
                  for ids, _ in index1.knnQueryBatch(reduced2, k=k)]
    # 4. keep only the pairs that are neighbors in both directions
    mnns = [sorted(p for p in neighbors1[i] if i in neighbors2[p])
            for i in range(n_cells1)]
    return mnns
def bnpy_select_clusters(data, max_cells=50000):
    """
    Chooses the number of clusters by fitting a Gaussian DP mixture model
    with bnpy on an 8-component TruncatedSVD embedding of the data.

    Args:
        data: matrix of shape genes x cells
        max_cells (int): if the data has more cells than this, a random
            sample of max_cells cells is used instead of all cells.

    Returns:
        selected k based on converged Gaussian DPMM, and the assigned labels.

    NOTE(review): when cells are sub-sampled, the labels follow the sampled
    cells in sampled order and the sampled ids are not returned.
    """
    n_cells = data.shape[1]
    cell_ids = list(range(n_cells))
    if max_cells < n_cells:
        import random
        cell_ids = random.sample(cell_ids, max_cells)
    data = data[:, cell_ids]
    embedding = TruncatedSVD(8).fit_transform(log1p(cell_normalize(data)).T)
    bnpy_data = bnpy.data.XData(embedding)
    run_options = dict(
        #doSaveToDisk=False,
        doWriteStdOut=False,
        output_path='./temp',
        nLap=100,
        nTask=1,
        nBatch=1,
        sF=0.1,
        ECovMat='eye',
        K=10,
        initname='randexamples',
        moves='birth,merge,shuffle',
        m_startLap=5,
        b_startLap=2,
        b_Kfresh=4,
    )
    trained_model, info_dict = bnpy.run(
        bnpy_data, 'DPMixtureModel', 'Gauss', 'memoVB', **run_options)
    # the last entry of the K history is the number of clusters at the end
    selected_k = info_dict['K_history'][-1]
    local_params = trained_model.calc_local_params(bnpy_data)
    # hard assignment: component with the largest responsibility per cell
    cluster_labels = local_params['resp'].argmax(1)
    return selected_k, cluster_labels
if __name__ == '__main__':
    # Script entry point: loads a .mat dataset, selects genes, normalizes the
    # data, trains an UncurlNet on it and computes the product of the learned
    # M and W matrices.
    # NOTE(review): `np`, `torch` and `UncurlNet` are expected to be in scope
    # from the rest of the module.
    # NOTE(review): `objective`, `KMeans` and `nmi` are imported but not used
    # in the visible part of this block.
    import uncurl
    from uncurl.state_estimation import objective
    from uncurl.preprocessing import cell_normalize, log1p
    import scipy.io
    from sklearn.cluster import KMeans
    from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

    # read the data matrix and the labels from the .mat file
    mat = scipy.io.loadmat('data/10x_pooled_400.mat')
    actual_labels = mat['labels'].squeeze()
    # densify the stored matrix and cast to float32
    X = mat['data'].toarray().astype(np.float32)
    # keep only the rows selected by max_variance_genes
    genes = uncurl.max_variance_genes(X, 5, 0.2)
    X_subset = X[genes, :]
    X_log_norm = log1p(cell_normalize(X_subset)).astype(np.float32)
    # presumably 8 is the number of clusters/components -- TODO confirm
    # against the UncurlNet signature
    uncurl_net = UncurlNet(X_log_norm, 8, use_reparam=False, use_decoder=False, use_batch_norm=True, hidden_layers=2, hidden_units=200, loss='mse')
    # tensor built from the network's M attribute before any training call
    m_init = torch.tensor(uncurl_net.M)
    uncurl_net.pre_train_encoder(None, lr=1e-3, n_epochs=20, log_interval=10)
    uncurl_net.train_model(None, lr=1e-3, n_epochs=50, log_interval=10)
    # swap the two axes of W so that it can be right-multiplied with M
    w = uncurl_net.w_net.get_w(X_log_norm).transpose(1, 0)
    m = uncurl_net.w_net.get_m()
    mw = torch.matmul(m, w)
) genes_seqwell = table_seqwell.index genes_10x = table_10x.index genes_set = set(genes_seqwell).intersection(genes_10x) genes_list = list(genes_set) data_seqwell = table_seqwell.loc[genes_list].values data_10x = table_10x.loc[genes_list].values batch_list = [0] * data_seqwell.shape[1] batch_list += [1] * data_10x.shape[1] data_total = np.hstack([data_seqwell, data_10x]) X_log_norm = log1p(cell_normalize(data_total)).astype(np.float32) """ # TODO: create networks net1 = UncurlNet(X_log_norm, 10, use_reparam=False, use_decoder=False, use_batch_norm=True, hidden_layers=2, hidden_units=400, loss='mse') net1.train_1(X_log_norm, log_interval=10) # TODO: test clustering? w = net1.w_net.get_w(X_log_norm).transpose(1, 0) print(w.argmax(0)) w = w.numpy().T tsne = TSNE(2)
def testCellNormalize(self):
    """
    cell_normalize should give the same result for the sparse and the
    dense version of the test data (Frobenius distance below 1e-6).
    """
    normalized_sparse = cell_normalize(self.data_sparse)
    normalized_dense = cell_normalize(self.data_dense)
    residual = normalized_dense - normalized_sparse.toarray()
    # Frobenius norm of the element-wise difference
    distance = np.sqrt(np.sum(residual**2))
    self.assertTrue(distance < 1e-6)