def test_forfatal_from_dict(self):
    """
    Smoke-test the dictionary-based constraint interface of de.test.wald().

    The test passes if model fitting and summary generation run without raising.
    """
    for logger_name, level in (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    np.random.seed(1)
    n_cells = 2000
    n_genes = 2

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Build sample description: two conditions of 1000 cells each,
    # with two batches of 500 cells inside every condition.
    cond_labels = ["cond" + str(idx // 1000) for idx in range(n_cells)]
    batch_labels = ["batch" + str(idx // 500) for idx in range(n_cells)]
    sample_description = pd.DataFrame({
        "cond": cond_labels,
        "batch": batch_labels
    })

    test = de.test.wald(
        data=sim.input_data,
        sample_description=sample_description,
        formula_loc="~1+cond+batch",
        formula_scale="~1+cond+batch",
        constraints_loc={"batch": "cond"},
        constraints_scale={"batch": "cond"},
        coef_to_test=["cond[T.cond1]"]
    )
    _ = test.summary()
def test_rank_test_zero_variance(self):
    """
    Check de.test.rank_test() on genes that have zero variance.

    Gene 0 is overwritten with zeros and gene 1 with the constant 5 before
    testing. The test asserts that the p-value of gene 0 is NaN and that the
    p-value of gene 1 equals 1.

    :return: True if the assertion holds.
    """
    logging.getLogger("tensorflow").setLevel(logging.ERROR)
    logging.getLogger("batchglm").setLevel(logging.WARNING)
    logging.getLogger("diffxpy").setLevel(logging.WARNING)

    np.random.seed(1)
    sim = Simulator(num_observations=1000, num_features=10)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Force the first two genes to be constant across all observations.
    sim.input_data.x[:, 0] = 0
    sim.input_data.x[:, 1] = 5

    group_assignment = np.random.randint(2, size=sim.nobs)
    random_sample_description = pd.DataFrame({"condition": group_assignment})

    test = de.test.rank_test(
        data=sim.input_data,
        sample_description=random_sample_description,
        grouping="condition",
        is_sig_zerovar=True
    )

    pval_gene0 = test.pval[0]
    pval_gene1 = test.pval[1]
    assert np.isnan(pval_gene0) and pval_gene1 == 1, \
        "rank test did not assign p-value of zero to groups with zero variance and same mean, %f, %f" % \
        (pval_gene0, pval_gene1)

    return True
def _test_null_distribution_rank(self, n_cells: int, n_genes: int):
    """
    Check that de.test.rank_test() yields uniformly distributed p-values on null data.

    Data are simulated without any condition effect and cells are randomly
    split into two groups. The resulting p-values are compared against a
    uniform distribution with a Kolmogorov-Smirnov test, and the test asserts
    that the KS p-value is larger than 0.05.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the assertion holds.
    """
    from batchglm.api.models.tf1.glm_norm import Simulator

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    group_assignment = np.random.randint(2, size=sim.nobs)
    random_sample_description = pd.DataFrame({"condition": group_assignment})

    test = de.test.rank_test(
        data=sim.input_data,
        sample_description=random_sample_description,
        grouping="condition"
    )
    _ = test.summary()

    # Compare p-value distribution under null model against uniform distribution.
    ks_result = stats.kstest(test.pval, 'uniform')
    pval_h0 = ks_result.pvalue

    diffxpy_logger = logging.getLogger("diffxpy")
    diffxpy_logger.info('KS-test pvalue for null model match of rank_test(): %f' % pval_h0)
    assert pval_h0 > 0.05, ("KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5))

    return True
def simulate(self, n_cells: int = 200, n_genes: int = 2):
    """
    Simulate a data set and draw a random two-group sample description for it.

    :param n_cells: Number of cells (observations) to simulate.
    :param n_genes: Number of genes (features) to simulate.
    :return: Tuple of the simulated data (sim.x) and a data frame with a
        single "condition" column holding random 0/1 group labels.
    """
    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    n_obs = sim.input_data.num_observations
    group_assignment = np.random.randint(2, size=n_obs)
    random_sample_description = pd.DataFrame({"condition": group_assignment})

    return sim.x, random_sample_description
def test_forfatal_from_string(self):
    """
    Smoke-test the string-based constraint interface (constraint_matrix_from_string).

    n_cells is fixed because the hand-built design matrix and the constraints
    depend on it. The test passes if fitting and summary generation run
    without raising.
    """
    for logger_name, level in (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    np.random.seed(1)
    n_cells = 2000
    n_genes = 2

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Build design matrix: an intercept, four biological replicates of
    # 500 consecutive cells each, and a condition effect on the last 1000 cells.
    coefficient_names = ['intercept', 'bio1', 'bio2', 'bio3', 'bio4', 'treatment1']
    dmat = np.zeros([n_cells, len(coefficient_names)])
    dmat[:, 0] = 1
    for rep in range(4):
        dmat[rep * 500:(rep + 1) * 500, rep + 1] = 1  # bio rep (rep + 1)
    dmat[1000:2000, 5] = 1  # condition effect

    dmat_est = pd.DataFrame(data=dmat, columns=coefficient_names)
    dmat_est_loc, _ = de.utils.design_matrix(dmat=dmat_est, return_type="dataframe")
    dmat_est_scale, _ = de.utils.design_matrix(dmat=dmat_est, return_type="dataframe")

    # Build constraints: the same pair of constraints is used for the
    # location and the scale model.
    constraints_loc = de.utils.constraint_matrix_from_string(
        dmat=dmat_est_loc.values,
        coef_names=dmat_est_loc.columns,
        constraints=["bio1+bio2=0", "bio3+bio4=0"]
    )
    constraints_scale = de.utils.constraint_matrix_from_string(
        dmat=dmat_est_scale.values,
        coef_names=dmat_est_scale.columns,
        constraints=["bio1+bio2=0", "bio3+bio4=0"]
    )

    test = de.test.wald(
        data=sim.input_data,
        dmat_loc=dmat_est_loc,
        dmat_scale=dmat_est_scale,
        constraints_loc=constraints_loc,
        constraints_scale=constraints_scale,
        coef_to_test=["treatment1"]
    )
    _ = test.summary()
def test_null_distribution_lrt(self, n_cells: int = 4000, n_genes: int = 200):
    """
    Check that the partitioned likelihood-ratio test yields uniform p-values on null data.

    The data are partitioned by the simulated "condition" column and a
    likelihood-ratio test on a randomly drawn covariate is run within each
    partition. The flattened p-values are compared against a uniform
    distribution with a Kolmogorov-Smirnov test, and the test asserts that
    the KS p-value is larger than 0.05.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the assertion holds.
    """
    logging.getLogger("tensorflow").setLevel(logging.ERROR)
    logging.getLogger("batchglm").setLevel(logging.WARNING)
    logging.getLogger("diffxpy").setLevel(logging.WARNING)

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=2)
    sim.generate()

    # Two random binary covariates plus the simulated condition used for partitioning.
    covar1 = np.random.randint(2, size=sim.nobs)
    covar2 = np.random.randint(2, size=sim.nobs)
    sample_description = pd.DataFrame({
        "covar1": covar1,
        "covar2": covar2
    })
    sample_description["cond"] = sim.sample_description["condition"].values

    partition = de.test.partition(
        data=sim.x,
        parts="cond",
        sample_description=sample_description
    )
    det = partition.lrt(
        full_formula_loc="~ 1 + covar1",
        full_formula_scale="~ 1",
        reduced_formula_loc="~ 1",
        reduced_formula_scale="~ 1",
        training_strategy="DEFAULT",
        dtype="float64"
    )
    _ = det.summary()

    # Compare p-value distribution under null model against uniform distribution.
    observed_pvals = det.pval.flatten()
    pval_h0 = stats.kstest(observed_pvals, 'uniform').pvalue

    diffxpy_logger = logging.getLogger("diffxpy")
    diffxpy_logger.info('KS-test pvalue for null model match of lrt(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)

    return True
def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
    """
    Check that the versus-rest Wald test yields uniform p-values on null data.

    Cells are randomly assigned to n_groups groups and
    de.test.versus_rest() is run with a Wald test and the "nb" noise model.
    The flattened p-values are compared against a uniform distribution with a
    Kolmogorov-Smirnov test, and the test asserts that the KS p-value is
    larger than 0.05.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :param n_groups: Number of random groups the cells are assigned to.
    :return: True if the assertion holds.
    """
    logging.getLogger("tensorflow").setLevel(logging.ERROR)
    logging.getLogger("batchglm").setLevel(logging.WARNING)
    logging.getLogger("diffxpy").setLevel(logging.WARNING)
    from batchglm.api.models.tf1.glm_nb import Simulator

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    group_assignment = np.random.randint(n_groups, size=sim.nobs)
    random_sample_description = pd.DataFrame({"condition": group_assignment})

    test = de.test.versus_rest(
        data=sim.x,
        grouping="condition",
        test="wald",
        noise_model="nb",
        sample_description=random_sample_description,
        batch_size=500,
        training_strategy="DEFAULT",
        dtype="float64"
    )
    # The summary is only generated to make sure the call does not raise.
    _ = test.summary()

    # Compare p-value distribution under null model against uniform distribution.
    observed_pvals = test.pval.flatten()
    pval_h0 = stats.kstest(observed_pvals, 'uniform').pvalue

    diffxpy_logger = logging.getLogger("diffxpy")
    diffxpy_logger.info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)

    return True
def test_null_distribution_wald_constrained(self, n_genes: int = 100):
    """
    Check that de.test.wald() with constraints yields uniform p-values on null data.

    The p-values are compared against a uniform distribution with a
    Kolmogorov-Smirnov test, and the test asserts that the KS p-value is
    larger than 0.05.

    n_cells is constant as the sample description and constraints depend on it.

    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the assertion holds.
    """
    for logger_name, level in (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    np.random.seed(1)
    n_cells = 2000

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Build sample description: two conditions of 1000 cells each,
    # with two batches of 500 cells inside every condition.
    cond_labels = ["cond" + str(idx // 1000) for idx in range(n_cells)]
    batch_labels = ["batch" + str(idx // 500) for idx in range(n_cells)]
    sample_description = pd.DataFrame({
        "cond": cond_labels,
        "batch": batch_labels
    })

    test = de.test.wald(
        data=sim.input_data,
        sample_description=sample_description,
        formula_loc="~1+cond+batch",
        formula_scale="~1+cond+batch",
        constraints_loc={"batch": "cond"},
        constraints_scale={"batch": "cond"},
        coef_to_test=["cond[T.cond1]"]
    )
    _ = test.summary()

    # Compare p-value distribution under null model against uniform distribution.
    ks_result = stats.kstest(test.pval, 'uniform')
    pval_h0 = ks_result.pvalue

    logging.getLogger("diffxpy").info('KS-test pvalue for null model match of wald(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

    return True
def _test_null_distribution_lrt(self, n_cells: int, n_genes: int, noise_model: str):
    """
    Check that de.test.lrt() yields uniformly distributed p-values on null data.

    Cells are randomly split into two groups and a likelihood-ratio test on
    that grouping is run. The p-values are compared against a uniform
    distribution with a Kolmogorov-Smirnov test, and the test asserts that
    the KS p-value is larger than 0.05.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :param noise_model: Noise model to use for data fitting ("nb" or "norm").
    :return: True if the assertion holds.
    :raises ValueError: If noise_model is neither "nb" nor "norm".
    """
    # Reject unknown noise models before importing a simulator.
    if noise_model not in ("nb", "norm"):
        raise ValueError("noise model %s not recognized" % noise_model)
    if noise_model == "nb":
        from batchglm.api.models.tf1.glm_nb import Simulator
    else:
        from batchglm.api.models.tf1.glm_norm import Simulator

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    group_assignment = np.random.randint(2, size=sim.nobs)
    random_sample_description = pd.DataFrame({"condition": group_assignment})

    test = de.test.lrt(
        data=sim.input_data,
        sample_description=random_sample_description,
        full_formula_loc="~ 1 + condition",
        full_formula_scale="~ 1",
        reduced_formula_loc="~ 1",
        reduced_formula_scale="~ 1",
        noise_model=noise_model
    )
    _ = test.summary()

    # Compare p-value distribution under null model against uniform distribution.
    ks_result = stats.kstest(test.pval, 'uniform')
    pval_h0 = ks_result.pvalue

    diffxpy_logger = logging.getLogger("diffxpy")
    diffxpy_logger.info('KS-test pvalue for null model match of lrt(): %f' % pval_h0)
    assert pval_h0 > 0.05, ("KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5))

    return True
def simulate(self):
    """
    Simulate a small data set for the configured noise model and store it on self.sim.

    The simulator class is chosen from self.noise_model ("nb", "norm" or
    "beta"); 500 observations and 4 features are simulated with two
    conditions and two batches.

    :raises ValueError: If self.noise_model is None or not recognized.
    """
    noise_model = self.noise_model
    if noise_model is None:
        raise ValueError("noise_model is None")

    if noise_model == "nb":
        from batchglm.api.models.tf1.glm_nb import Simulator
    elif noise_model == "norm":
        # NOTE(review): this branch imports from batchglm.api.models rather than
        # a tf1 submodule like the other branches - confirm this is intended.
        from batchglm.api.models import Simulator
    elif noise_model == "beta":
        from batchglm.api.models.tf1.glm_beta import Simulator
    else:
        raise ValueError("noise_model not recognized")

    num_observations = 500
    simulator = Simulator(num_observations=num_observations, num_features=4)
    simulator.generate_sample_description(num_conditions=2, num_batches=2)
    simulator.generate()

    self.sim = simulator
def _test_residuals_fit(
        self,
        n_cells: int,
        n_genes: int,
        noise_model: str
):
    """
    Smoke-test de.fit.residuals() on simulated data.

    Cells receive random binary "condition" and "batch" labels and residuals
    are fit with the location formula "~ 1 + condition + batch". The result
    is not inspected; the test passes if the call completes without raising.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :param noise_model: Noise model to use for data fitting ("nb" or "norm").
    :return: True if the call completes.
    :raises ValueError: If noise_model is neither "nb" nor "norm".
    """
    # Reject unknown noise models before importing a simulator.
    if noise_model not in ("nb", "norm"):
        raise ValueError("noise model %s not recognized" % noise_model)
    if noise_model == "nb":
        from batchglm.api.models.tf1.glm_nb import Simulator
    else:
        from batchglm.api.models.tf1.glm_norm import Simulator

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    condition_labels = np.random.randint(2, size=sim.nobs)
    batch_labels = np.random.randint(2, size=sim.nobs)
    random_sample_description = pd.DataFrame({
        "condition": condition_labels,
        "batch": batch_labels
    })

    _ = de.fit.residuals(
        data=sim.input_data,
        sample_description=random_sample_description,
        formula_loc="~ 1 + condition + batch",
        noise_model=noise_model
    )
    return True
def test_for_fatal(self):
    """
    Smoke-test gene set enrichment (de.enrich.test) on a Wald test result.

    A Wald test is fit on a small simulated data set, two manual reference
    gene sets are defined, and the enrichment test is run for every
    combination of the incl_all_zero and clean_ref flags. The test passes if
    all calls complete without raising.

    :return: True if all calls complete.
    """
    for logger_name, level in (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    sim = Simulator(num_observations=50, num_features=10)
    sim.generate_sample_description(num_batches=0, num_conditions=2)
    sim.generate()

    test = de.test.wald(
        data=sim.X,
        factor_loc_totest="condition",
        formula_loc="~ 1 + condition",
        sample_description=sim.sample_description,
        gene_names=[str(gene_idx) for gene_idx in range(sim.X.shape[1])],
        training_strategy="DEFAULT",
        dtype="float64"
    )

    # Set up reference gene sets.
    rs = de.enrich.RefSets()
    rs.add(id="set1", source="manual", gene_ids=["1", "3"])
    rs.add(id="set2", source="manual", gene_ids=["5", "6"])

    # Visit the flag combinations in the order (True, True), (True, False),
    # (False, True), (False, False).
    flag_grid = [
        (incl_all_zero, clean_ref)
        for incl_all_zero in [True, False]
        for clean_ref in [True, False]
    ]
    for incl_all_zero, clean_ref in flag_grid:
        enrich_result = de.enrich.test(
            ref=rs,
            det=test,
            threshold=0.05,
            incl_all_zero=incl_all_zero,
            clean_ref=clean_ref,
        )
        _ = enrich_result.summary()
        _ = enrich_result.significant_set_ids()
        _ = enrich_result.significant_sets()
        _ = enrich_result.set_summary(id="set1")

    return True
def _test_null_distribution_wald_constrained_2layer(self, n_genes: int = 100):
    """
    Check that de.test.wald() with two layers of constraints yields uniform p-values on null data.

    The location design matrix is built by hand with an intercept, one
    condition effect, eight biological replicate columns and four technical
    replicate columns; the scale model only uses the intercept. The p-values
    are compared against a uniform distribution with a Kolmogorov-Smirnov
    test, and the test asserts that the KS p-value is larger than 0.05.

    n_cells is constant as the design matrix and constraints depend on it.

    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the assertion holds.
    """
    logging.getLogger("tensorflow").setLevel(logging.ERROR)
    logging.getLogger("batchglm").setLevel(logging.WARNING)
    logging.getLogger("diffxpy").setLevel(logging.WARNING)

    np.random.seed(1)
    n_cells = 12000

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Build design matrix:
    dmat = np.zeros([n_cells, 14])
    dmat[:, 0] = 1
    dmat[6000:12000, 1] = 1  # condition effect
    # Biological replicates in the rows without the condition effect.
    dmat[:1000, 2] = 1  # bio rep 1
    dmat[1000:3000, 3] = 1  # bio rep 2
    dmat[3000:5000, 4] = 1  # bio rep 3
    dmat[5000:6000, 5] = 1  # bio rep 4
    # Biological replicates in the rows with the condition effect.
    dmat[6000:7000, 6] = 1  # bio rep 5
    dmat[7000:9000, 7] = 1  # bio rep 6
    dmat[9000:11000, 8] = 1  # bio rep 7
    dmat[11000:12000, 9] = 1  # bio rep 8
    # Technical replicates, each spanning one block in either half of the rows.
    dmat[1000:2000, 10] = 1  # tech rep 1
    dmat[7000:8000, 10] = 1  # tech rep 1
    dmat[2000:3000, 11] = 1  # tech rep 2
    dmat[8000:9000, 11] = 1  # tech rep 2
    dmat[3000:4000, 12] = 1  # tech rep 3
    dmat[9000:10000, 12] = 1  # tech rep 3
    dmat[4000:5000, 13] = 1  # tech rep 4
    dmat[10000:11000, 13] = 1  # tech rep 4

    coefficient_names = [
        'intercept', 'treatment1',
        'bio1', 'bio2', 'bio3', 'bio4', 'bio5', 'bio6', 'bio7', 'bio8',
        'tech1', 'tech2', 'tech3', 'tech4'
    ]
    dmat_est = pd.DataFrame(data=dmat, columns=coefficient_names)

    # de.utils.design_matrix() is unpacked as a 2-tuple here, as in
    # test_forfatal_from_string. Without unpacking, the .values / .columns
    # accesses below would be made on the tuple instead of the data frame.
    dmat_est_loc, _ = de.utils.design_matrix(dmat=dmat_est, return_type="dataframe")
    dmat_est_scale, _ = de.utils.design_matrix(dmat=dmat_est.iloc[:, [0]], return_type="dataframe")

    # Build constraints:
    constraints_loc = de.utils.constraint_matrix_from_string(
        dmat=dmat_est_loc.values,
        coef_names=dmat_est_loc.columns,
        constraints=[
            "bio1+bio2=0",
            "bio3+bio4=0",
            "bio5+bio6=0",
            "bio7+bio8=0",
            "tech1+tech2=0",
            "tech3+tech4=0"
        ]
    )
    # The scale model is intercept-only and is passed without constraints.
    constraints_scale = None

    test = de.test.wald(
        data=sim.input_data,
        dmat_loc=dmat_est_loc,
        dmat_scale=dmat_est_scale,
        constraints_loc=constraints_loc,
        constraints_scale=constraints_scale,
        coef_to_test=["treatment1"]
    )
    _ = test.summary()

    # Compare p-value distribution under null model against uniform distribution.
    pval_h0 = stats.kstest(test.pval, 'uniform').pvalue

    logging.getLogger("diffxpy").info('KS-test pvalue for null model match of wald(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

    return True
def _test_compute_hessians(self, sparse):
    """
    Compare hessians obtained in "analytic" mode against those obtained in "tf1" mode.

    Data are simulated for the configured noise model, self.get_hessians()
    is evaluated once per pkg_constants.HESSIAN_MODE setting, and the test
    asserts that the maximum absolute deviation between both results is
    below 1e-15.

    :param sparse: If truthy, the simulated data are passed as a
        scipy.sparse.csr_matrix, otherwise as returned by the simulator.
    :return: True if the assertions hold.
    :raises ValueError: If self.noise_model is None or not recognized.
    """
    noise_model = self.noise_model
    if noise_model is None:
        raise ValueError("noise_model is None")

    if noise_model == "nb":
        from batchglm.api.models.tf1.glm_nb import Simulator, InputDataGLM
    elif noise_model == "norm":
        # NOTE(review): this branch imports from batchglm.api.models rather than
        # a tf1 submodule like the other branches - confirm this is intended.
        from batchglm.api.models import Simulator, InputDataGLM
    elif noise_model == "beta":
        from batchglm.api.models.tf1.glm_beta import Simulator, InputDataGLM
    else:
        raise ValueError("noise_model not recognized")

    num_observations = 500
    num_conditions = 2

    sim = Simulator(num_observations=num_observations, num_features=4)
    sim.generate_sample_description(num_conditions=num_conditions, num_batches=2)
    sim.generate()

    sample_description = data_utils.sample_description_from_xarray(sim.data, dim="observations")
    design_loc = data_utils.design_matrix(sample_description, formula="~ 1 + condition + batch")
    design_scale = data_utils.design_matrix(sample_description, formula="~ 1 + condition")

    # Only the data container differs between the sparse and the dense case.
    observations = scipy.sparse.csr_matrix(sim.X) if sparse else sim.X
    input_data = InputDataGLM(data=observations, design_loc=design_loc, design_scale=design_scale)

    # Compute hessian based on analytic solution.
    pkg_constants.HESSIAN_MODE = "analytic"
    start_analytic = time.time()
    h_analytic = self.get_hessians(input_data)
    t_analytic = time.time() - start_analytic

    # Compute hessian based on tensorflow auto-differentiation.
    pkg_constants.HESSIAN_MODE = "tf1"
    start_tf = time.time()
    h_tf = self.get_hessians(input_data)
    t_tf = time.time() - start_tf

    # Maximum absolute deviation between both hessian estimates.
    mad = np.max(np.abs(h_tf - h_analytic))

    batchglm_logger = logging.getLogger("batchglm")
    batchglm_logger.info("run time observation batch-wise analytic solution: %f" % t_analytic)
    batchglm_logger.info("run time tensorflow solution: %f" % t_tf)
    batchglm_logger.info("MAD: %f" % mad)

    # Make sure that hessians are not all zero which might make evaluation of equality difficult.
    assert np.sum(np.abs(h_analytic)) > 1e-10, \
        "hessians too small to perform test: %f" % np.sum(np.abs(h_analytic))
    assert mad < 1e-15, mad

    return True