def _test_interaction(self, ngenes: int, test: str, constrained: bool, spline_basis: str):
    """
    Simulate unstructured data, attach random covariates and fit a continuous interaction model.

    The simulator is run without batches or conditions. A sample description
    with a random time point ("continuous"), a random two-level "condition",
    a "batch" label derived from the condition and random "size_factors" is
    built and handed to self._fit_continuous_interaction together with the
    simulated data.

    :param ngenes: Number of genes (features) to simulate.
    :param test: Forwarded unchanged to self._fit_continuous_interaction.
    :param constrained: Forwarded unchanged to self._fit_continuous_interaction.
    :param spline_basis: Forwarded unchanged to self._fit_continuous_interaction.
    :return: The value returned by self._fit_continuous_interaction.
    """
    n_timepoints = 5
    simulator = Simulator(num_observations=n_timepoints * 200, num_features=ngenes)
    simulator.generate_sample_description(num_batches=0, num_conditions=0)
    simulator.generate_params()
    simulator.generate_data()

    # Random integer time point per observation, stored as float.
    timepoints = np.random.randint(0, n_timepoints, size=simulator.nobs)
    covariates = pd.DataFrame({"continuous": np.asarray(timepoints, dtype=float)})

    # One independent draw per observation: condition is "0" or "1".
    condition_labels = [str(np.random.randint(0, 2)) for _ in range(len(covariates))]
    covariates["condition"] = condition_labels

    # Batch label is the condition label followed by a random digit in {0, 1, 2}.
    covariates["batch"] = [
        label + str(np.random.randint(0, 3))
        for label in condition_labels
    ]

    # TODO put into simulation.
    covariates["size_factors"] = np.random.uniform(0.9, 1.1, simulator.nobs)

    return self._fit_continuous_interaction(
        sim=simulator,
        sample_description=covariates,
        test=test,
        constrained=constrained,
        spline_basis=spline_basis,
    )
def _prepare_data(self, n_cells: int, n_genes: int, noise_model: str): """ :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": from batchglm.api.models.numpy.glm_nb import Simulator rand_fn_loc = lambda shape: np.random.uniform(5, 10, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": from batchglm.api.models.numpy.glm_norm import Simulator rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % noise_model) num_non_de = n_genes // 2 sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=2) sim.generate_params(rand_fn_loc=rand_fn_loc, rand_fn_scale=rand_fn_scale) sim.a_var[1, :num_non_de] = 0 sim.b_var[1, :num_non_de] = 0 self.isDE = np.arange(n_genes) >= num_non_de sim.generate_data() return sim
def test(self):
    """
    Check that factors that are numeric receive the correct number of coefficients.

    Two random numeric covariates are added to a simulated two-condition
    sample description and a Wald test is fit with both covariates declared
    via as_numeric. The location model is then expected to have four rows in
    a_var.

    :return: True if the coefficient count is as expected.
    """
    logger_levels = (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    )
    for logger_name, level in logger_levels:
        logging.getLogger(logger_name).setLevel(level)

    simulator = Simulator(num_observations=2000, num_features=2)
    simulator.generate_sample_description(num_batches=0, num_conditions=2)
    simulator.generate_params()
    simulator.generate_data()

    # The simulator's own sample description is extended in place.
    covariates = simulator.sample_description
    numeric_columns = ["numeric1", "numeric2"]
    for column in numeric_columns:
        covariates[column] = np.random.random(size=simulator.nobs)

    wald_result = de.test.wald(
        data=simulator.input_data,
        sample_description=covariates,
        formula_loc="~ 1 + condition + numeric1 + numeric2",
        formula_scale="~ 1",
        factor_loc_totest="condition",
        as_numeric=numeric_columns,
        training_strategy="DEFAULT"
    )

    # Check that number of coefficients is correct.
    n_loc_coefficients = wald_result.model_estim.a_var.shape[0]
    assert n_loc_coefficients == 4

    return True
def _test_null_distribution_wald_repeated( self, n_cells: int, n_genes: int, noise_model: str ): """ Test if de.wald() generates a uniform p-value distribution if it is given data simulated based on the null model. Returns the p-value of the two-side Kolmgorov-Smirnov test for equality of the observed p-value distribution and a uniform distribution. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": from batchglm.api.models.numpy.glm_nb import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": from batchglm.api.models.numpy.glm_norm import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % noise_model) sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate_params(rand_fn_scale=rand_fn_scale) sim.generate_data() random_sample_description = pd.DataFrame({ "condition": np.random.randint(2, size=sim.nobs), "batch": np.random.randint(2, size=sim.nobs) }) test1 = de.test.wald( data=sim.input_data, sample_description=random_sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition + batch", noise_model=noise_model ) test = de.test.wald_repeated( det=test1, factor_loc_totest="condition" ) _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue logging.getLogger("diffxpy").info('KS-test pvalue for null model match of wald_repeated(): %f' % pval_h0) assert pval_h0 > 0.05, ("KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)) return True
def _test_wald_de(self, constrained: bool, spline_basis: str, ngenes: int): if self.noise_model == "nb": from batchglm.api.models.numpy.glm_nb import Simulator rand_fn_loc = lambda shape: np.random.uniform(2, 5, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif self.noise_model == "norm": from batchglm.api.models.numpy.glm_norm import Simulator rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % self.noise_model) n_timepoints = 7 sim = Simulator(num_observations=n_timepoints * 200, num_features=ngenes) sim.generate_sample_description(num_batches=0, num_conditions=n_timepoints) sim.generate_params(rand_fn_loc=rand_fn_loc, rand_fn_scale=rand_fn_scale) num_non_de = round(ngenes / 2) sim.a_var[ 1:, : num_non_de] = 0 # Set all condition effects of non DE genes to zero. sim.b_var[1:, :] = 0 # Use constant dispersion across all conditions. self.isDE = np.arange(ngenes) >= num_non_de sim.generate_data() random_sample_description = sim.sample_description random_sample_description["continuous"] = [ int(x) for x in random_sample_description["condition"] ] random_sample_description["batch"] = [ str(int(x)) + str(np.random.randint(0, 3)) for x in random_sample_description["continuous"] ] test = de.test.continuous_1d( data=sim.input_data, sample_description=random_sample_description, gene_names=[ "gene" + str(i) for i in range(sim.input_data.num_features) ], formula_loc="~ 1 + continuous + batch" if constrained else "~ 1 + continuous", formula_scale="~ 1", factor_loc_totest="continuous", continuous="continuous", constraints_loc={"batch": "continuous"} if constrained else None, df=5, spline_basis=spline_basis, test="wald", quick_scale=True, noise_model=self.noise_model) self._eval(sim=sim, test=test)
def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100):
    """
    Simulate a data set with two conditions and no batches using default simulator parameters.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: The simulator after data generation.
    """
    simulator = Simulator(
        num_observations=n_cells,
        num_features=n_genes
    )
    simulator.generate_sample_description(
        num_batches=0,
        num_conditions=2
    )
    simulator.generate_params()
    simulator.generate_data()
    return simulator
def _test_model_fit_partition(self, n_cells: int, n_genes: int, noise_model: str): """ Test if de.wald() generates a uniform p-value distribution if it is given data simulated based on the null model. Returns the p-value of the two-side Kolmgorov-Smirnov test for equality of the observed p-value distribution and a uniform distribution. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": from batchglm.api.models.numpy.glm_nb import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": from batchglm.api.models.numpy.glm_norm import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % noise_model) sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate_params(rand_fn_scale=rand_fn_scale) sim.generate_data() random_sample_description = pd.DataFrame({ "condition": np.random.randint(2, size=sim.nobs), "batch": np.random.randint(2, size=sim.nobs) }) partition = de.fit.partition( data=sim.input_data, sample_description=random_sample_description, parts="condition") estim = partition.model(formula_loc="~ 1 + batch", noise_model=noise_model) return True
def _prepate_data(self, n_cells: int, n_genes: int, n_groups: int): if self.noise_model == "nb": from batchglm.api.models.numpy.glm_nb import Simulator rand_fn_loc = lambda shape: np.random.uniform(0.1, 1, shape) rand_fn_scale = lambda shape: np.random.uniform(0.5, 1, shape) elif self.noise_model == "norm" or self.noise_model is None: from batchglm.api.models.numpy.glm_norm import Simulator rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % self.noise_model) sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate_params(rand_fn_loc=rand_fn_loc, rand_fn_scale=rand_fn_scale) sim.generate_data() random_sample_description = pd.DataFrame({ "condition": [str(x) for x in np.random.randint(n_groups, size=sim.nobs)] }) return sim, random_sample_description