def _test_interaction(self, ngenes: int, test: str, constrained: bool, spline_basis: str):
    """
    Simulate data, attach random covariates and fit a continuous interaction model.

    The sample description holds a random numeric "continuous" covariate, a random
    binary "condition" label, a "batch" label built from the condition label plus a
    random digit, and random "size_factors".

    :param ngenes: Number of features (genes) to simulate.
    :param test: Passed through as ``test`` to ``self._fit_continuous_interaction``.
    :param constrained: Passed through as ``constrained``.
    :param spline_basis: Passed through as ``spline_basis``.
    :return: Whatever ``self._fit_continuous_interaction`` returns.
    """
    num_timepoints = 5
    simulator = Simulator(num_observations=num_timepoints * 200, num_features=ngenes)
    simulator.generate_sample_description(num_batches=0, num_conditions=0)
    simulator.generate_params()
    simulator.generate_data()

    # Random time point per observation, stored as float.
    time_values = np.asarray(
        np.random.randint(0, num_timepoints, size=simulator.nobs),
        dtype=float
    )
    covariates = pd.DataFrame({"continuous": time_values})

    # One independent draw per observation for the condition label.
    condition_labels = []
    for _ in covariates["continuous"]:
        condition_labels.append(str(np.random.randint(0, 2)))
    covariates["condition"] = condition_labels

    # Batch labels extend the condition label with a random digit.
    batch_labels = []
    for condition_label in covariates["condition"]:
        batch_labels.append(condition_label + str(np.random.randint(0, 3)))
    covariates["batch"] = batch_labels

    # TODO put into simulation.
    covariates["size_factors"] = np.random.uniform(0.9, 1.1, simulator.nobs)

    return self._fit_continuous_interaction(
        sim=simulator,
        sample_description=covariates,
        test=test,
        constrained=constrained,
        spline_basis=spline_basis,
    )
def test(self):
    """
    Check that factors that are numeric receive the correct number of coefficients.

    Two random numeric covariates are added to the simulated sample description and
    declared via ``as_numeric``; the fitted location model must then have 4 rows.

    :return: True if the coefficient count matches.
    """
    logger_levels = (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    )
    for logger_name, level in logger_levels:
        logging.getLogger(logger_name).setLevel(level)

    simulator = Simulator(num_observations=2000, num_features=2)
    simulator.generate_sample_description(num_batches=0, num_conditions=2)
    simulator.generate_params()
    simulator.generate_data()

    # The simulator's own sample description is extended in place.
    covariates = simulator.sample_description
    for column in ("numeric1", "numeric2"):
        covariates[column] = np.random.random(size=simulator.nobs)

    wald_result = de.test.wald(
        data=simulator.input_data,
        sample_description=covariates,
        formula_loc="~ 1 + condition + numeric1 + numeric2",
        formula_scale="~ 1",
        factor_loc_totest="condition",
        as_numeric=["numeric1", "numeric2"],
        training_strategy="DEFAULT"
    )

    # Check that number of coefficients is correct.
    n_loc_coefficients = wald_result.model_estim.a_var.shape[0]
    assert n_loc_coefficients == 4

    return True
def _prepare_data(self, n_cells: int, n_genes: int, noise_model: str):
    """
    Simulate a two-condition data set for the given noise model.

    Row 1 of the simulated location and scale parameters is set to zero for the first
    half of the genes; ``self.isDE`` flags the remaining genes.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :param noise_model: Noise model to use for data fitting ("nb" or "norm").
    :return: The simulator holding the generated data.
    :raises ValueError: If ``noise_model`` is not recognized.
    """
    if noise_model == "nb":
        from batchglm.api.models.tf1.glm_nb import Simulator
        loc_low, loc_high = 5, 10
    elif noise_model == "norm":
        from batchglm.api.models import Simulator
        loc_low, loc_high = 500, 1000
    else:
        raise ValueError("noise model %s not recognized" % noise_model)

    def draw_loc(shape):
        return np.random.uniform(loc_low, loc_high, shape)

    def draw_scale(shape):
        return np.random.uniform(1, 2, shape)

    n_null_genes = n_genes // 2

    simulator = Simulator(num_observations=n_cells, num_features=n_genes)
    simulator.generate_sample_description(num_batches=0, num_conditions=2)
    simulator.generate_params(rand_fn_loc=draw_loc, rand_fn_scale=draw_scale)

    # Zero out row 1 of both parameter matrices for the leading genes.
    for params in (simulator.a_var, simulator.b_var):
        params[1, :n_null_genes] = 0
    self.isDE = np.arange(n_genes) >= n_null_genes

    simulator.generate_data()
    return simulator
def _test_single_full_rank(self):
    """
    Check how de.test.wald() reacts to rank-deficient and constrained designs.

    Two scenarios are run on the same simulated data set:

    1. ``batch`` is an exact copy of ``condition`` and no constraints are supplied.
       The call is expected to raise a ValueError, which is logged.
    2. ``batch`` is the condition label extended by a random digit and the constraint
       ``{"batch": "condition"}`` is supplied. The call is expected to succeed.

    The noise model is read from ``self.noise_model`` ("nb" or "norm").

    :raises ValueError: If the noise model is not recognized, if scenario 1 does not
        raise, or if scenario 2 raises.
    """
    if self.noise_model == "nb":
        from batchglm.api.models.tf1.glm_nb import Simulator
    elif self.noise_model == "norm":
        from batchglm.api.models import Simulator
    else:
        raise ValueError("noise model %s not recognized" % self.noise_model)

    def rand_fn_scale(shape):
        return np.random.uniform(1, 2, shape)

    sim = Simulator(num_observations=200, num_features=2)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate_params(rand_fn_scale=rand_fn_scale)
    sim.generate_data()

    random_sample_description = pd.DataFrame({
        "condition": [str(x) for x in np.random.randint(2, size=sim.nobs)]
    })

    # Scenario 1: batch duplicates condition and there are no constraints.
    # The column assignment stays outside the try so that only the model call
    # can satisfy the "expected ValueError" branch.
    random_sample_description["batch"] = random_sample_description["condition"]
    try:
        _ = de.test.wald(
            data=sim.input_data,
            sample_description=random_sample_description,
            factor_loc_totest="condition",
            formula_loc="~ 1 + condition + batch",
            noise_model=self.noise_model
        )
    except ValueError as error:
        logging.getLogger("diffxpy").info(error)
    else:
        raise ValueError("rank error was erroneously not thrown on under-determined unconstrained system")

    # Scenario 2: batch labels are built from the condition label and constrained.
    random_sample_description["batch"] = [
        x + str(np.random.randint(0, 2))
        for x in random_sample_description["condition"].values
    ]
    try:
        _ = de.test.wald(
            data=sim.input_data,
            sample_description=random_sample_description,
            factor_loc_totest="condition",
            formula_loc="~ 1 + condition + batch",
            constraints_loc={"batch": "condition"},
            noise_model=self.noise_model
        )
    except ValueError as error:
        # Chain the original error so the underlying cause stays visible.
        raise ValueError("rank error was erroneously thrown on defined constrained system") from error
class TestSimulationGlmAll:
    """
    Checks that data drawn from a GLM simulator reproduces its location parameters.

    ``noise_model`` selects the simulator ("nb", "norm" or "beta"); ``sim`` is set by
    ``_test_all_moments``.
    """
    sim: _SimulatorGLM
    input_data: InputDataGLM
    noise_model: str

    def eval_simulation_mean(self):
        """
        Compare the first row of ``sim.a_var`` with the link-transformed column means
        of the simulated data.

        :return: True if the mean deviation (absolute value) and the standard deviation
            of the differences are both below their thresholds, False otherwise.
        :raises ValueError: If ``noise_model`` is None or not recognized.
        """
        if self.noise_model is None:
            raise ValueError("noise_model is None")

        # (threshold_dev, threshold_std) per noise model.
        tolerances = {
            "nb": (1e-2, 1e-1),
            "norm": (1e-2, 1e-1),
            "beta": (1e-2, 1e-1),
        }
        if self.noise_model not in tolerances:
            raise ValueError("noise_model not recognized")
        threshold_dev, threshold_std = tolerances[self.noise_model]

        means_sim = self.sim.a_var[0, :]
        means_obs = self.sim.link_loc(np.mean(self.sim.input_data.x, axis=0))
        differences = means_sim - means_obs
        mean_dev = np.mean(differences)
        std_dev = np.std(differences)

        batchglm_logger = logging.getLogger("batchglm")
        batchglm_logger.info("mean_dev_a %f" % mean_dev)
        batchglm_logger.info("std_dev_a %f" % std_dev)

        return bool(np.abs(mean_dev) < threshold_dev and std_dev < threshold_std)

    def _test_all_moments(self):
        """
        Simulate a large data set for ``self.noise_model`` and check its means.

        :return: True if the check passes.
        :raises ValueError: If ``noise_model`` is None or not recognized.
        """
        if self.noise_model is None:
            raise ValueError("noise_model is None")

        if self.noise_model == "nb":
            from batchglm.api.models.tf1.glm_nb import Simulator
        elif self.noise_model == "norm":
            from batchglm.api.models import Simulator
        elif self.noise_model == "beta":
            from batchglm.api.models.tf1.glm_beta import Simulator
        else:
            raise ValueError("noise_model not recognized")

        self.sim = Simulator(num_observations=100000, num_features=10)
        self.sim.generate_sample_description(num_batches=1, num_conditions=1)
        self.sim.generate_params()
        self.sim.generate_data()

        assert self.eval_simulation_mean(), "mean of simulation was inaccurate"
        return True
def _test_wald_de(
        self,
        constrained: bool,
        spline_basis: str,
        ngenes: int
):
    """
    Simulate a time series and run de.test.continuous_1d() with a Wald test.

    Half of the genes (rounded) get no condition effect; ``self.isDE`` flags the rest.
    The result is handed to ``self._eval``.

    :param constrained: Whether to add a batch covariate constrained to "continuous".
    :param spline_basis: Passed through as ``spline_basis``.
    :param ngenes: Number of genes (features) to simulate.
    :raises ValueError: If ``self.noise_model`` is not recognized.
    """
    if self.noise_model == "nb":
        from batchglm.api.models.tf1.glm_nb import Simulator
        loc_low, loc_high = 2, 5
    elif self.noise_model == "norm":
        from batchglm.api.models import Simulator
        loc_low, loc_high = 500, 1000
    else:
        raise ValueError("noise model %s not recognized" % self.noise_model)

    def draw_loc(shape):
        return np.random.uniform(loc_low, loc_high, shape)

    def draw_scale(shape):
        return np.random.uniform(1, 2, shape)

    num_timepoints = 7
    simulator = Simulator(num_observations=num_timepoints*200, num_features=ngenes)
    simulator.generate_sample_description(
        num_batches=0,
        num_conditions=num_timepoints
    )
    simulator.generate_params(
        rand_fn_loc=draw_loc,
        rand_fn_scale=draw_scale
    )

    n_null_genes = round(ngenes / 2)
    # Set all condition effects of non DE genes to zero.
    simulator.a_var[1:, :n_null_genes] = 0
    # Use constant dispersion across all conditions.
    simulator.b_var[1:, :] = 0
    self.isDE = np.arange(ngenes) >= n_null_genes
    simulator.generate_data()

    # The simulator's own sample description is extended in place.
    covariates = simulator.sample_description
    covariates["continuous"] = list(map(int, covariates["condition"]))
    batch_labels = []
    for time_point in covariates["continuous"]:
        batch_labels.append(str(int(time_point)) + str(np.random.randint(0, 3)))
    covariates["batch"] = batch_labels

    if constrained:
        formula_loc = "~ 1 + continuous + batch"
        constraints_loc = {"batch": "continuous"}
    else:
        formula_loc = "~ 1 + continuous"
        constraints_loc = None

    gene_names = []
    for gene_index in range(simulator.input_data.num_features):
        gene_names.append("gene" + str(gene_index))

    continuous_result = de.test.continuous_1d(
        data=simulator.input_data,
        sample_description=covariates,
        gene_names=gene_names,
        formula_loc=formula_loc,
        formula_scale="~ 1",
        factor_loc_totest="continuous",
        continuous="continuous",
        constraints_loc=constraints_loc,
        df=5,
        spline_basis=spline_basis,
        test="wald",
        quick_scale=True,
        noise_model=self.noise_model
    )
    self._eval(sim=simulator, test=continuous_result)
def _test_null_distribution_wald(self, n_cells: int, n_genes: int, noise_model: str):
    """
    Check that de.test.wald() yields uniformly distributed p-values under the null.

    Data are simulated without condition or batch effects, random condition and batch
    labels are assigned, and the resulting p-values are compared with a uniform
    distribution using a Kolmogorov-Smirnov test.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :param noise_model: Noise model to use for data fitting ("nb" or "norm").
    :return: True if the KS-test p-value is above 0.05.
    :raises ValueError: If ``noise_model`` is not recognized.
    """
    if noise_model == "nb":
        from batchglm.api.models.tf1.glm_nb import Simulator
    elif noise_model == "norm":
        from batchglm.api.models import Simulator
    else:
        raise ValueError("noise model %s not recognized" % noise_model)

    def draw_scale(shape):
        return np.random.uniform(1, 2, shape)

    simulator = Simulator(num_observations=n_cells, num_features=n_genes)
    simulator.generate_sample_description(num_batches=0, num_conditions=0)
    simulator.generate_params(rand_fn_scale=draw_scale)
    simulator.generate_data()

    # Labels are drawn independently of the data: condition first, then batch.
    condition_labels = np.random.randint(2, size=simulator.nobs)
    batch_labels = np.random.randint(2, size=simulator.nobs)
    covariates = pd.DataFrame({
        "condition": condition_labels,
        "batch": batch_labels
    })
    size_factors = np.random.uniform(0.5, 1.5, simulator.nobs)

    wald_result = de.test.wald(
        data=simulator.input_data,
        sample_description=covariates,
        factor_loc_totest="condition",
        formula_loc="~ 1 + condition + batch",
        size_factors=size_factors,
        batch_size=500,
        noise_model=noise_model,
        training_strategy="DEFAULT",
        dtype="float64"
    )
    _ = wald_result.summary()

    # Compare p-value distribution under null model against uniform distribution.
    ks_pvalue = stats.kstest(wald_result.pval, 'uniform').pvalue

    logging.getLogger("diffxpy").info(
        'KS-test pvalue for null model match of wald(): %f' % ks_pvalue
    )
    assert ks_pvalue > 0.05, ("KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(ks_pvalue, 5))

    return True
def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100):
    """
    Simulate a data set with two conditions and no batches.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: The simulator holding the generated data.
    """
    simulator = Simulator(
        num_observations=n_cells,
        num_features=n_genes
    )
    simulator.generate_sample_description(
        num_batches=0,
        num_conditions=2
    )
    simulator.generate_params()
    simulator.generate_data()
    return simulator
def _test_model_fit_partition(
        self,
        n_cells: int,
        n_genes: int,
        noise_model: str
):
    """
    Check that a model can be fitted on partitions made by de.fit.partition().

    Data are simulated without condition or batch effects and random condition and
    batch labels are assigned. The data are partitioned by "condition" and a model
    with location formula "~ 1 + batch" is fitted through the partition object.
    The fit result is not inspected; the test only requires that no error is raised.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :param noise_model: Noise model to use for data fitting ("nb" or "norm").
    :return: True if the fit completed.
    :raises ValueError: If ``noise_model`` is not recognized.
    """
    if noise_model == "nb":
        from batchglm.api.models.tf1.glm_nb import Simulator
    elif noise_model == "norm":
        from batchglm.api.models.tf1.glm_norm import Simulator
    else:
        raise ValueError("noise model %s not recognized" % noise_model)

    def rand_fn_scale(shape):
        return np.random.uniform(1, 2, shape)

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate_params(rand_fn_scale=rand_fn_scale)
    sim.generate_data()

    random_sample_description = pd.DataFrame({
        "condition": np.random.randint(2, size=sim.nobs),
        "batch": np.random.randint(2, size=sim.nobs)
    })

    partition = de.fit.partition(
        data=sim.input_data,
        sample_description=random_sample_description,
        parts="condition"
    )
    # Only successful completion matters, so the estimator is discarded.
    _ = partition.model(
        formula_loc="~ 1 + batch",
        noise_model=noise_model
    )
    return True
def _prepate_data(self, n_cells: int, n_genes: int, n_groups: int):
    """
    Simulate data without condition effects and assign random group labels.

    The noise model is read from ``self.noise_model``; None is treated like "norm".

    :param n_cells: Number of observations to simulate.
    :param n_genes: Number of features to simulate.
    :param n_groups: Number of distinct group labels to draw from.
    :return: Tuple of the simulator and a DataFrame with a string "condition" column.
    :raises ValueError: If ``self.noise_model`` is not recognized.
    """
    if self.noise_model == "nb":
        from batchglm.api.models.tf1.glm_nb import Simulator
        loc_low, loc_high = 0.1, 1
        scale_low, scale_high = 0.5, 1
    elif self.noise_model == "norm" or self.noise_model is None:
        from batchglm.api.models import Simulator
        loc_low, loc_high = 500, 1000
        scale_low, scale_high = 1, 2
    else:
        raise ValueError("noise model %s not recognized" % self.noise_model)

    def draw_loc(shape):
        return np.random.uniform(loc_low, loc_high, shape)

    def draw_scale(shape):
        return np.random.uniform(scale_low, scale_high, shape)

    simulator = Simulator(num_observations=n_cells, num_features=n_genes)
    simulator.generate_sample_description(num_batches=0, num_conditions=0)
    simulator.generate_params(rand_fn_loc=draw_loc, rand_fn_scale=draw_scale)
    simulator.generate_data()

    # Group labels are drawn independently of the simulated data.
    group_ids = np.random.randint(n_groups, size=simulator.nobs)
    covariates = pd.DataFrame({
        "condition": [str(group_id) for group_id in group_ids]
    })
    return simulator, covariates