def test_meld():
    """Check MELD's numerical output and its input-validation messages.

    Fits MELD on a seeded random dataset with both ``use_pygsp=True`` and
    ``use_pygsp=False`` graphs, compares the summed output against a
    reference value chosen by NumPy version, then checks the errors and
    warning raised for bad ``lap_type``, mis-sized ``RES`` and a graph
    built with a different ``lap_type``.
    """
    np.random.seed(42)

    def norm(x):
        # Shift to a zero minimum, then scale by the new maximum.
        shifted = x.copy() - np.min(x)
        return shifted / np.max(shifted)

    D = np.random.normal(0, 2, (1000, 2))
    RES = np.random.binomial(1, norm(D[:, 0]), 1000)
    G = gt.Graph(D, knn=20, decay=10, use_pygsp=True)

    # The reference sum is selected by the installed NumPy version.
    if version.parse(np.__version__) < version.parse('1.17'):
        expected_sum = 532.0001992193013
    else:
        expected_sum = 519.0001572740623

    # pygsp-backed graph
    meld_op = meld.MELD()
    B = meld_op.fit_transform(G, RES)
    np.testing.assert_allclose(np.sum(B), expected_sum)

    # plain graphtools graph
    meld_op = meld.MELD()
    plain_graph = gt.Graph(D, knn=20, decay=10, use_pygsp=False)
    B = meld_op.fit_transform(plain_graph, RES)
    np.testing.assert_allclose(np.sum(B), expected_sum)

    # Unknown lap_type is rejected at fit time.
    lap_type = 'hello world'
    lap_type_message = (
        "lap_type must be 'combinatorial'"
        " or 'normalized'. Got: '{}'".format(lap_type)
    )
    assert_raise_message(
        TypeError,
        lap_type_message,
        meld.MELD(lap_type=lap_type).fit,
        G=G,
    )

    # RES whose size does not match the graph is rejected.
    RES = np.ones([2, G.N + 100])
    size_message = (
        "Input data ({}) and input graph ({}) "
        "are not of the same size".format(RES.shape, G.N)
    )
    assert_raise_message(
        ValueError,
        size_message,
        meld_op.fit_transform,
        RES=RES,
        G=G,
    )

    # Fitting on a graph with a different lap_type emits a warning.
    normalized_graph = gt.Graph(
        D, knn=20, decay=10, use_pygsp=True, lap_type='normalized')
    assert_warns_message(
        RuntimeWarning,
        "Changing lap_type may require recomputing the Laplacian",
        meld_op.fit,
        G=normalized_graph,
    )
def test_meld_labels_non_numeric():
    """MELD accepts string labels; density columns are the label names."""
    data = np.random.normal(size=(100, 2))

    # Two string labels: only checks that fitting runs.
    two_way_labels = np.random.choice(["A", "B"], size=100)
    meld.MELD().fit_transform(data, two_way_labels)

    # Three string labels: the output columns must be the labels.
    three_way_labels = np.random.choice(["A", "B", "C"], size=100)
    sample_densities = meld.MELD().fit_transform(data, three_way_labels)
    assert np.all(sample_densities.columns == ["A", "B", "C"])
def run_meld(X_red_dim, sample_labels, conditions, k=15):
    '''
    Run MELD and return per-condition likelihoods.

    - X_red_dim: c x d matrix of dimensionality reduction to use for graph construction
    - sample_labels: assignment of cells to samples
    - conditions: vector of condition names
    - k: cast to int and passed to ``gt.Graph`` as ``knn``

    Returns the output of ``meld.utils.normalize_densities`` applied to the
    per-condition mean densities, with each column renamed to the part of
    its name before the first underscore.
    '''
    # The operator is handed a ready-made graph instead of being fit on data.
    meld_op = meld.MELD()
    meld_op.graph = gt.Graph(X_red_dim, knn=int(k))

    # Density per sample label
    meld_fit = meld_op.transform(sample_labels=np.array(sample_labels))

    # Average the density columns whose name contains each condition string.
    mean_density = pd.DataFrame(
        np.zeros(shape=(meld_fit.shape[0], len(conditions))),
        index=meld_fit.index,
        columns=conditions,
    )
    for c in conditions:
        column_mask = [c in column_name for column_name in meld_fit.columns]
        mean_density[c] = meld_fit.loc[:, column_mask].mean(1)

    # From density to likelihood per condition
    likelihoods = meld.utils.normalize_densities(mean_density)
    likelihoods.columns = [col.split("_")[0] for col in likelihoods.columns]
    return likelihoods
def test_sample_labels_2d():
    """Two-column labels are rejected by ``_create_sample_indicators``."""
    labels = np.ones((10, 2))
    expected_message = (
        "sample_labels must be a single column. Got"
        "shape={}".format(labels.shape)
    )
    with assert_raises_message(ValueError, expected_message):
        meld.MELD()._create_sample_indicators(labels)
def setUpClass(self):
    """Build the shared fixtures for the VertexFrequencyCluster tests.

    Stores custom window sizes, the batch labels, a pygsp-backed graph
    and the MELD output (``EES``) on ``self``.
    """
    # Custom window sizes
    self.window_sizes = np.array([2, 4, 8, 24])

    data, labels = make_batches(n_pts_per_cluster=100)
    self.labels = labels
    self.G = gt.Graph(data, sample_idx=self.labels, use_pygsp=True)
    self.EES = meld.MELD().fit_transform(G=self.G, RES=self.labels)
def test_utils():
    """Smoke-test ``sort_clusters_by_meld_score`` on clustered MELD output."""
    data, labels = make_batches(n_pts_per_cluster=250)
    G = gt.Graph(data, sample_idx=labels, use_pygsp=True)

    meld_op = meld.MELD()
    EES = meld_op.fit_transform(G, labels)

    vfc_op = meld.VertexFrequencyCluster()
    clusters = vfc_op.fit_predict(G=G, RES=labels, EES=EES)

    meld.utils.sort_clusters_by_meld_score(clusters, EES)
def test_sample_labels_one_sample():
    """Fitting with a single distinct label raises ``ValueError``."""
    data = np.random.normal(size=(100, 2))
    labels = np.ones(100)
    expected_message = (
        "Found only one unqiue sample label. Cannot estimate density "
        "of a single sample."
    )
    with assert_raises_message(ValueError, expected_message):
        meld.MELD().fit_transform(data, labels)
def test_2d(self):
    """Clustering a two-column RES yields one cluster label per point."""
    two_column_res = np.array([self.labels, self.labels]).T

    EES = meld.MELD().fit_transform(G=self.G, RES=two_column_res)

    vfc_op = meld.VertexFrequencyCluster(window_sizes=self.window_sizes)
    clusters = vfc_op.fit_predict(self.G, RES=two_column_res, EES=EES)

    assert len(clusters) == len(self.labels)
def test_mnn():
    """Smoke-test VertexFrequencyCluster on a MELD fit that passes ``sample_idx``."""
    data, labels = make_batches(n_pts_per_cluster=250)

    meld_op = meld.MELD(verbose=0)
    sample_densities = meld_op.fit_transform(data, labels, sample_idx=labels)
    sample_likelihoods = meld.utils.normalize_densities(sample_densities)

    vfc_op = meld.VertexFrequencyCluster()
    vfc_op.fit_transform(
        G=meld_op.graph,
        sample_indicator=meld_op.sample_indicators["expt"],
        likelihood=sample_likelihoods["expt"],
    )
def test_RES_EES_shape(self):
    """``fit_predict`` rejects RES and EES of different shapes."""
    RES = np.array([self.labels, self.labels]).T

    EES = meld.MELD().fit_transform(G=self.G, RES=RES)
    vfc_op = meld.VertexFrequencyCluster(window_sizes=self.window_sizes)

    # Pass a single RES column against the two-column EES.
    expected_message = (
        '`RES` and `EES` must have the same shape.'
        'Got RES: {} and EES: {}'.format(str(RES[:, 1].shape), str(EES.shape))
    )
    assert_raise_message(
        ValueError,
        expected_message,
        vfc_op.fit_predict,
        G=self.G,
        RES=RES[:, 1],
        EES=EES,
    )
def test_meld_invalid_lap_type():
    """An unrecognized ``lap_type`` makes ``fit`` raise ``ValueError``."""
    data = np.random.normal(0, 2, (1000, 2))
    lap_type = "hello world"
    expected_message = (
        "lap_type value {} not recognized. "
        "Choose from ['combinatorial', 'normalized']".format(lap_type)
    )
    with assert_raises_message(ValueError, expected_message):
        meld.MELD(verbose=0, lap_type=lap_type).fit(data)
def setUpClass(self):
    """Build the shared fixtures for the VertexFrequencyCluster tests.

    Stores custom window sizes, the batch data and labels, the MELD
    densities, sample indicators, normalized likelihoods and the fitted
    graph on ``self``.
    """
    # Custom window sizes
    self.window_sizes = np.array([2, 4, 8, 24])

    batch_data, batch_labels = make_batches(n_pts_per_cluster=100)
    self.data = batch_data
    self.sample_labels = batch_labels

    meld_op = meld.MELD(verbose=0)
    self.densities = meld_op.fit_transform(
        self.data, sample_labels=self.sample_labels)
    self.sample_indicators = meld_op.sample_indicators
    self.likelihoods = meld.utils.normalize_densities(self.densities)
    self.G = meld_op.graph
def test_meld(filter):
    """Check MELD's numerical output and parameter-reset behaviour.

    Parameters
    ----------
    filter
        Value forwarded to ``meld.MELD(filter=...)``. The name shadows the
        builtin but is kept because it is this test's external parameter name.

    The summed density of the second output column is compared against a
    reference selected by NumPy version. The original code branched on
    ``meld_op.filter`` as well, but both arms asserted the same value, so
    the reference is chosen once here. The test then checks that changing
    a filter parameter clears ``sample_densities`` and that changing a
    graph parameter clears both ``graph`` and ``sample_densities``.
    """
    np.random.seed(42)

    def norm(x):
        # Shift to a zero minimum, then scale by the new maximum.
        shifted = x.copy() - np.min(x)
        return shifted / np.max(shifted)

    data = np.random.normal(0, 2, (1000, 2))
    binary_labels = np.random.binomial(1, norm(data[:, 0]), 1000)
    sample_labels = np.array(
        ["treat" if val else "ctrl" for val in binary_labels])

    meld_op = meld.MELD(
        verbose=0,
        knn=20,
        decay=10,
        thresh=0,
        anisotropy=0,
        filter=filter,
        solver="exact",
        sample_normalize=False,
    )
    densities = meld_op.fit_transform(data, sample_labels)
    expt_density = densities.iloc[:, 1]

    # NumPy 1.17.x uses one reference; every other version uses the other.
    numpy_version = version.parse(np.__version__)
    if version.parse("1.17") <= numpy_version < version.parse("1.18"):
        expected_sum = 519
    else:
        expected_sum = 532
    np.testing.assert_allclose(np.sum(expt_density), expected_sum)

    # check changing filter params resets filter
    meld_op.set_params(beta=meld_op.beta + 1)
    assert meld_op.sample_densities is None

    meld_op.fit_transform(data, sample_labels)
    assert meld_op.sample_densities is not None

    # check changing graph params resets filter
    meld_op.set_params(knn=meld_op.knn + 1)
    assert meld_op.graph is None
    assert meld_op.sample_densities is None
def test_meld_labels_wrong_shape():
    """Labels whose size differs from the data raise ``ValueError``."""
    data = np.random.normal(0, 2, (100, 2))
    # 101 rows of labels against 100 rows of data
    sample_labels = np.ones([101, 2], dtype=str)
    expected_message = (
        "Input data ({}) and input graph ({}) "
        "are not of the same size".format(sample_labels.shape, data.shape[0])
    )
    with assert_raises_message(ValueError, expected_message):
        meld.MELD(verbose=0).fit_transform(
            X=data,
            sample_labels=sample_labels,
        )
def calculate_EES(self, data=None, **kwargs):
    """Fit MELD on the graph and store the ``"expt"`` column as the EES.

    Parameters
    ----------
    data : optional
        Passed to ``self.fit_graph`` when ``self.graph`` is falsy.
        Required in that case.
    **kwargs
        Forwarded to ``meld.MELD`` (``verbose=False`` is always added).

    Returns
    -------
    The values of the ``"expt"`` column of the MELD output, also stored as
    ``self.EES`` and ``self.estimates["EES"]``.

    Raises
    ------
    NameError
        If no graph has been fit and ``data`` is ``None``.
    """
    np.random.seed(self.seed)

    if not self.graph:
        # ``data`` is a parameter with a default, so a missing argument
        # never raises NameError by itself; test for None explicitly.
        if data is None:
            raise NameError(
                "Must pass `data` unless graph has already been fit")
        self.fit_graph(data)

    self.meld_op = meld.MELD(**kwargs, verbose=False).fit(self.graph)
    self.EES = self.meld_op.transform(self.sample_labels)
    self.EES = self.EES["expt"].values  # Only keep the expt condition
    self.estimates["EES"] = self.EES
    return self.EES
def calculate_MELD_likelihood(self, data=None, **kwargs):
    """Fit MELD on the graph and return the ``"expt"`` likelihood values.

    Parameters
    ----------
    data : optional
        Passed to ``self.fit_graph`` when ``self.graph`` is falsy.
        Required in that case.
    **kwargs
        Forwarded to ``meld.MELD`` (``verbose=False`` is always added).

    Returns
    -------
    The values of the ``"expt"`` column of the normalized densities, also
    stored as ``self.expt_likelihood``.

    Raises
    ------
    NameError
        If no graph has been fit and ``data`` is ``None``.
    """
    np.random.seed(self.seed)

    if not self.graph:
        if data is None:
            raise NameError(
                "Must pass `data` unless graph has already been fit")
        self.fit_graph(data)

    fitted_op = meld.MELD(**kwargs, verbose=False).fit(self.graph)
    self.meld_op = fitted_op
    self.sample_densities = self.meld_op.transform(self.sample_labels)
    self.sample_likelihoods = meld.utils.normalize_densities(
        self.sample_densities)
    # Only keep the expt condition
    self.expt_likelihood = self.sample_likelihoods["expt"].values
    return self.expt_likelihood
def test_meld_label_2d():
    """MELD accepts a one-column DataFrame of labels."""
    data = np.random.normal(0, 2, (100, 2))

    # Labels live in a single-column DataFrame with a named index.
    index = pd.Index(["cell_{}".format(i) for i in range(100)])
    columns = pd.Index(["A"])
    label_values = np.concatenate([np.zeros((50, 1)), np.ones((50, 1))])
    sample_labels = pd.DataFrame(
        label_values,
        index=index,
        columns=columns,
        dtype=str,
    )

    meld.MELD(verbose=0).fit_transform(
        X=data,
        sample_labels=sample_labels,
    )
def test_meld_label_dataframe():
    """DataFrame labels: output keeps the index and uses labels as columns."""
    data = np.random.normal(0, 2, (100, 2))

    index = pd.Index(["cell_{}".format(i) for i in range(100)])
    label_values = np.concatenate([np.zeros(50), np.ones(50)])
    sample_labels = pd.DataFrame(
        label_values,
        index=index,
        columns=["sample_labels"],
        dtype=str,
    )

    sample_densities = meld.MELD(verbose=0).fit_transform(
        X=data,
        sample_labels=sample_labels,
    )

    assert np.all(sample_densities.index == index)
    expected_columns = pd.Index(np.unique(sample_labels))
    assert np.all(sample_densities.columns == expected_columns)
def test_utils():
    """Smoke-test the ``meld.utils`` helpers and VertexFrequencyCluster."""
    data, labels = make_batches(n_pts_per_cluster=250)
    G = gt.Graph(data, sample_idx=labels, use_pygsp=True)

    meld_op = meld.MELD()
    sample_densities = meld_op.fit_transform(G, labels)
    sample_likelihoods = meld.utils.normalize_densities(sample_densities)

    vfc_op = meld.VertexFrequencyCluster()
    vfc_op.fit_predict(
        G=G,
        sample_indicator=meld_op.sample_indicators["expt"],
        likelihood=sample_likelihoods["expt"],
    )

    meld.utils.get_meld_cmap()

    # Test normalize_densities on plain arrays
    # Three samples
    three_sample_densities = np.ones([100, 3])
    meld.utils.normalize_densities(sample_densities=three_sample_densities)

    # Two samples
    two_sample_densities = np.ones([100, 2])
    meld.utils.normalize_densities(sample_densities=two_sample_densities)
gamma=0, n_jobs=-1, random_state=rs) adata.obsm['X_phate']=phate_op.fit_transform(G.K) if True : # save adata obj with batch correction adata.write(os.path.join(pdfp,'mouse_MT_bbknn.h5ad')) print('\n... saved @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S')) print('... full PHATE in {:.2f}-min'.format((time.time() - start)/60)) if True : # MELD adata.obs['res_sca1']=[1 if i=='SCA1' else -1 for i in adata.obs['genotype']] adata.obs['ees_sca1']=meld.MELD().fit_transform(G=G,RES=adata.obs['res_sca1']) adata.obs['ees_sca1']=adata.obs['ees_sca1']-adata.obs['ees_sca1'].mean() # mean center if True : # save adata obj with batch correction adata.write(os.path.join(pdfp,'mouse_MT_bbknn.h5ad')) print('\n... saved @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S')) if True : # MAGIC magic_op=magic.MAGIC().fit(X=adata.X,graph=G) # running fit_transform produces wrong shape adata.layers['imputed_bbknn']=magic_op.transform(adata.X,genes='all_genes') # adata.layers['imputed_bbknn']=sparse.csr_matrix(magic_op.transform(adata.X,genes='all_genes')) # causes memory spike if True : # save adata obj with batch correction & imputation adata.write(os.path.join(pdfp,'mouse_MT_bbknn.h5ad'))
def test_mnn():
    """Smoke-test VertexFrequencyCluster on a graph built with ``sample_idx``."""
    data, labels = make_batches(n_pts_per_cluster=250)
    G = gt.Graph(data, sample_idx=labels, use_pygsp=True)

    EES = meld.MELD().fit_transform(G, labels)

    vfc_op = meld.VertexFrequencyCluster()
    vfc_op.fit_transform(G=G, RES=labels, EES=EES)
if False: wt = utils.adata_phate(wt) mut = utils.adata_phate(mut) # MELD G = gt.Graph(data=wt.obsp['connectivities'] + sparse.diags([1] * wt.shape[0], format='csr'), precomputed='adjacency', use_pygsp=True) G.knn_max = None wt.obs['res_t'] = -1 wt.obs.loc[wt.obs['timepoint'] == '12wk', 'res_t'] = -0.5 wt.obs.loc[wt.obs['timepoint'] == '18wk', 'res_t'] = 0 wt.obs.loc[wt.obs['timepoint'] == '24wk', 'res_t'] = 0.5 wt.obs.loc[wt.obs['timepoint'] == '30wk', 'res_t'] = 1 wt.obs['ees_t'] = meld.MELD().fit_transform(G=G, RES=wt.obs['res_t']) wt.obs['ees_t'] = (wt.obs['ees_t'] - wt.obs['ees_t'].min()) / ( wt.obs['ees_t'].max() - wt.obs['ees_t'].min()) G = gt.Graph(data=mut.obsp['connectivities'] + sparse.diags([1] * mut.shape[0], format='csr'), precomputed='adjacency', use_pygsp=True) G.knn_max = None mut.obs['res_t'] = -1 mut.obs.loc[mut.obs['timepoint'] == '12wk', 'res_t'] = -0.5 mut.obs.loc[mut.obs['timepoint'] == '18wk', 'res_t'] = 0 mut.obs.loc[mut.obs['timepoint'] == '24wk', 'res_t'] = 0.5 mut.obs.loc[mut.obs['timepoint'] == '30wk', 'res_t'] = 1 mut.obs['ees_t'] = meld.MELD().fit_transform(G=G, RES=mut.obs['res_t']) mut.obs['ees_t'] = (mut.obs['ees_t'] - mut.obs['ees_t'].min()) / (
fname = 'scv2_200428.h5ad'
adata = loader(fname, pdfp)

# meld
# Replace each known condition label with its numeric code. The column is
# held as object dtype so numeric codes can be written into it, and `.loc`
# is used because chained indexing (`obs['res_t'][mask] = v`) may assign to
# a temporary copy and leave `adata.obs` unchanged.
condition_codes = {'Mock': 0, '1dpi': 1, '2dpi': 2, '3dpi': 3}
adata.obs['res_t'] = adata.obs['Condition'].astype(str).astype(object)
for condition, code in condition_codes.items():
    adata.obs.loc[adata.obs['Condition'] == condition, 'res_t'] = code

# Precomputed adjacency: stored connectivities plus ones on the diagonal.
G = gt.Graph(data=adata.uns['neighbors']['connectivities'] +
             sparse.diags([1] * adata.shape[0], format='csr'),
             precomputed='adjacency',
             use_pygsp=True)
G.knn_max = None

adata.obs['ees_t'] = meld.MELD().fit_transform(
    G=G, RES=adata.obs['res_t'].to_numpy(dtype=float))
adata.obs['ees_t'] = adata.obs['ees_t'] - adata.obs['ees_t'].mean()  # mean center
del G

# cluster genes
random_genes = False
genes = adata.var_names.to_list()
if random_genes:
    # Sample from the gene list; the flag itself is a bool, not a population.
    genes = random.sample(genes, 10)
# genes=[int(sys.argv[1]:int(sys.argv[2]))]

print('Aggregating data')