def test_setUp(self, tol=0.02):
    """Integration test: fit a diamond logistic model on R-simulated data.

    Simulates data and estimates covariances in R/lme4 (if not already cached
    on disk), fits the same model in diamond, and asserts the relative error
    between the diamond coefficients and the true simulated parameters is
    below ``tol``.

    Args:
        tol: maximum allowed relative error between estimated and true
            coefficients.
    """
    # assumes working directory is diamond/
    folder = "diamond/integration_tests/logistic"
    simulated_data_loc = "%s/simulated_logistic_df.csv" % folder
    estimated_covariance_loc = "%s/simulated_logistic_covariance.csv" % folder
    resources_exist = (os.path.exists(simulated_data_loc)
                       and os.path.exists(estimated_covariance_loc))
    if not resources_exist:
        logging.info("Simulating data and estimating covariances in R")
        os.system("/usr/local/bin/Rscript %s/logistic_generate_and_fit.R" % folder)
    logging.info("Reading in training data and R::lme4-estimated covariance matrix")
    df_train = pd.read_csv(simulated_data_loc)
    df_estimated_covariance = pd.read_csv(estimated_covariance_loc)
    self.model = LogisticRegression(train_df=df_train,
                                    priors_df=df_estimated_covariance,
                                    copy=True,
                                    test_df=None)
    logging.info("Fitting model in diamond")
    self.formula = "y ~ 1 + x + (1 + x | level)"
    results = self.model.fit(self.formula, tol=1e-4, verbose=True)
    # the format of the coefficient vector is:
    # fixed effects, then [random intercept, random slope] for each level
    beta_hat = np.append(
        results["fixed_effects"].value.values,
        pd.melt(results["level"], "level")
          .sort_values(["level", "variable"]).value.values)
    beta_true = pd.read_csv(
        "%s/simulated_logistic_true_parameters.csv" % folder)["x"].values
    rel_error = np.mean((beta_hat - beta_true) ** 2) / np.mean(abs(beta_true))
    if rel_error > tol:
        # logging.warn is deprecated since Python 3.3: use logging.warning.
        # Lazy %-args avoid formatting the message when the level is disabled.
        logging.warning("relative error = %f > tolerance = %f", rel_error, tol)
    else:
        logging.info("relative error = %f < tolerance = %f", rel_error, tol)
    # make sure the coefficients are very close; assertLess reports both
    # operands on failure, unlike assertTrue(rel_error < tol)
    self.assertLess(rel_error, tol)
def fit_diamond_model(df_train):
    """Fit a diamond logistic regression with random intercepts per song and user.

    Note: mutates ``df_train`` in place by dropping helper columns
    (``row_index``, ``intercept``) after fitting, if present.

    Args:
        df_train: training DataFrame with ``target``, ``song_id`` and
            ``msno`` columns.

    Returns:
        The fitted ``LogisticRegression`` model.
    """
    logging.info('fitting diamond model')
    formula = 'target ~ 1 + (1|song_id) + (1|msno)'
    # prior variance components, fit on a sample of data in R/lme4
    priors = pd.DataFrame({
        'group': ['song_id', 'msno'],
        'var1': ['intercept', 'intercept'],
        'var2': [np.nan, np.nan],
        'vcov': [0.00845, 0.07268],
    })
    model = LogisticRegression(df_train, priors)
    model.fit(formula, tol=1e-5, verbose=False, max_its=200)
    # drop columns added during fitting; errors='ignore' makes this a no-op
    # when they are absent
    df_train.drop(['row_index', 'intercept'], axis=1,
                  inplace=True, errors='ignore')
    return model
def setUp(self):
    """Build a tiny 3-row training frame plus lme4-style priors and
    construct the LogisticRegression model under test."""
    data = {"response": [0, 1, 1],
            "var_a": [21, 32, 10],
            "cyl": [4, 6, 4]}
    df = pd.DataFrame(data, index=[0, 1, 2])
    # one row per (co)variance component: var(intercept),
    # cov(intercept, var_a), var(var_a) for grouping factor "cyl"
    priors_data = {
        "grp": ["cyl", "cyl", "cyl"],
        "var1": ["intercept", "intercept", "var_a"],
        # np.NaN was removed in NumPy 2.0; np.nan is the supported spelling
        "var2": [np.nan, "var_a", np.nan],
        "vcov": [0.123, -1.42, 0.998]
    }
    priors_df = pd.DataFrame(priors_data, index=[0, 1, 2])
    self.formula = "response ~ 1 + var_a + (1 + var_a | cyl)"
    self.model = LogisticRegression(train_df=df, priors_df=priors_df, test_df=None)
class TestGLM(unittest.TestCase):
    """Unit tests for diamond's formula parsing and design/penalty
    matrix construction, using a tiny hand-built dataset."""

    def setUp(self):
        """Create a 3-row training frame, lme4-style priors, and the
        LogisticRegression model under test."""
        data = {"response": [0, 1, 1],
                "var_a": [21, 32, 10],
                "cyl": [4, 6, 4]}
        df = pd.DataFrame(data, index=[0, 1, 2])
        priors_data = {
            "grp": ["cyl", "cyl", "cyl"],
            "var1": ["intercept", "intercept", "var_a"],
            # np.NaN was removed in NumPy 2.0; np.nan is the supported spelling
            "var2": [np.nan, "var_a", np.nan],
            "vcov": [0.123, -1.42, 0.998]
        }
        priors_df = pd.DataFrame(priors_data, index=[0, 1, 2])
        self.formula = "response ~ 1 + var_a + (1 + var_a | cyl)"
        self.model = LogisticRegression(train_df=df, priors_df=priors_df,
                                        test_df=None)

    def test_parse_formula(self):
        """Parsing the formula should populate main- and random-effect metadata."""
        self.model._parse_formula(self.formula)
        self.assertEqual(self.model.num_main, 2)
        self.assertEqual(self.model.response, "response")
        self.assertListEqual(self.model.main_effects, ["intercept", "var_a"])
        self.assertEqual(self.model.total_num_interactions,
                         self.model.train_df.cyl.nunique())
        self.assertListEqual(self.model.grouping_factors, ["cyl"])
        # dict.keys() returns a view object in Python 3, not a list;
        # it must be converted before comparing with assertListEqual
        self.assertListEqual(list(self.model.group_levels.keys()), ["cyl"])
        self.assertListEqual(list(self.model.group_levels["cyl"]), [4, 6])

    def test_create_penalty_matrix(self):
        """The penalty matrix should hold the inverse of the 2x2 prior
        covariance block, repeated once per level of "cyl"."""
        self.model._parse_formula(self.formula)
        self.model._create_penalty_matrix()
        expected_inv_cov_block = np.linalg.inv([[0.123, -1.42],
                                                [-1.42, 0.998]])
        actual_inv_cov_block = self.model.sparse_inv_covs["cyl"]._block
        # convert the dict view to a list before comparing (Python 3)
        self.assertListEqual(list(self.model.sparse_inv_covs.keys()),
                             ["main", "cyl"])
        self.assertEqual(self.model.sparse_inv_covs["cyl"]._num_blocks, 2)
        self.assertEqual(self.model.sparse_inv_covs["cyl"]._block_shape, 2)
        self.assertTrue((expected_inv_cov_block == actual_inv_cov_block).all())

    def test_create_main_design(self):
        """The main-effects design matrix is [1, var_a] per observation."""
        self.model._parse_formula(self.formula)
        self.model._create_design_matrix()
        expected_design = [[1, float(row[1].var_a)]
                           for row in self.model.train_df.iterrows()]
        actual_design = self.model._create_main_design()
        # NOTE(review): cyl.nunique() == 2 happens to equal the number of
        # main effects here; the column count is really num_main
        self.assertEqual(
            actual_design.shape,
            (len(self.model.train_df), self.model.train_df.cyl.nunique()))
        self.assertTrue((expected_design == actual_design.todense()).all())

    def test_create_inter_design(self):
        """The interaction design matrix has one [1, var_a] sub-block per
        observation, placed in the columns of that observation's cyl level."""
        self.model._parse_formula(self.formula)
        self.model._create_design_matrix()
        expected_design = [[1, float(row[1].var_a), 0, 0]
                           if row[1].cyl == 4
                           else [0, 0, 1, float(row[1].var_a)]
                           for row in self.model.train_df.iterrows()]
        actual_design = self.model._create_inter_design(g="cyl")
        # shape is (num observations, num variables * num levels)
        self.assertEqual(
            actual_design.shape,
            (len(self.model.train_df), 2 * self.model.train_df.cyl.nunique()))
        self.assertTrue((expected_design == actual_design.todense()).all())