def test_over_cat_limit(self):
    y = pd.Series(np.random.choice([0, 1], size=(500,)))
    X = pd.DataFrame({
        'a': np.random.normal(size=500),
        'b': np.random.normal(size=500),
        'c': np.random.choice([0, 1], size=500),
        'd': np.random.choice(['a', 'b', 'c', 'd'], size=500),
        'e': np.random.choice([7, 8, 9, 10, 11], size=500),
        'f': np.random.choice(['x', 'y'], size=500),
        'g': np.random.choice([0, 1], size=500),
        'h': np.random.choice(['q', 'r', 's'], size=500)
    })
    inds = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
    cats = ['c', 'd', 'e', 'f', 'g', 'h']
    hinds = ['a', 'd']

    ca = CausalAnalysis(inds, cats, hinds, upper_bound_on_cat_expansion=2)
    ca.fit(X, y)
    # columns 'd', 'e', 'h' have too many values
    self.assertEqual([res.feature_name for res in ca._results],
                     ['a', 'b', 'c', 'f', 'g'])

    ca = CausalAnalysis(inds, cats, hinds, upper_bound_on_cat_expansion=3)
    ca.fit(X, y)
    # columns 'd', 'e' have too many values
    self.assertEqual([res.feature_name for res in ca._results],
                     ['a', 'b', 'c', 'f', 'g', 'h'])

    ca.upper_bound_on_cat_expansion = 2
    ca.fit(X, y, warm_start=True)
    # lowering the bound shouldn't affect already-fit columns when warm starting
    self.assertEqual([res.feature_name for res in ca._results],
                     ['a', 'b', 'c', 'f', 'g', 'h'])

    ca.upper_bound_on_cat_expansion = 4
    ca.fit(X, y, warm_start=True)
    # column 'd' is now okay, too
    self.assertEqual([res.feature_name for res in ca._results],
                     ['a', 'b', 'c', 'd', 'f', 'g', 'h'])
def test_invalid_inds(self):
    X = np.zeros((300, 6))
    y = np.random.normal(size=(300,))

    # first column: 10 ones, this is fine
    X[np.random.choice(300, 10, replace=False), 0] = 1
    # second column: 6 categories, plenty of random instances of each;
    # this is fine only if we increase the category limit
    X[:, 1] = np.random.choice(6, 300)
    # third column: nine ones, lots of twos; not enough unless we disable the check
    X[np.random.choice(300, 100, replace=False), 2] = 2
    X[np.random.choice(300, 9, replace=False), 2] = 1
    # fourth column: 5 ones, also not enough, but barely works even with forest heterogeneity
    X[np.random.choice(300, 5, replace=False), 3] = 1
    # fifth column: 2 ones, ensures that we will change the number of folds for linear
    # heterogeneity; forest heterogeneity won't work
    X[np.random.choice(300, 2, replace=False), 4] = 1
    # sixth column: just 1 one, not enough even without the check
    X[np.random.choice(300, 1), 5] = 1

    col_names = ['a', 'b', 'c', 'd', 'e', 'f']
    X = pd.DataFrame(X, columns=col_names)

    for n in ['linear', 'automl']:
        for h in ['linear', 'forest']:
            for warm_start in [True, False]:
                ca = CausalAnalysis(col_names, col_names, col_names, verbose=1,
                                    nuisance_models=n, heterogeneity_model=h)
                ca.fit(X, y)
                # only the first column is okay
                self.assertEqual(ca.trained_feature_indices_, [0])
                self.assertEqual(ca.untrained_feature_indices_,
                                 [(1, 'upper_bound_on_cat_expansion'), (2, 'cat_limit'),
                                  (3, 'cat_limit'), (4, 'cat_limit'), (5, 'cat_limit')])

                # increase the bound on category expansion
                ca.upper_bound_on_cat_expansion = 6
                ca.fit(X, y, warm_start=warm_start)
                # the second column is okay now, too
                self.assertEqual(ca.trained_feature_indices_, [0, 1])
                self.assertEqual(ca.untrained_feature_indices_,
                                 [(2, 'cat_limit'), (3, 'cat_limit'),
                                  (4, 'cat_limit'), (5, 'cat_limit')])

                # skip the category-count checks (reducing folds accordingly)
                ca.skip_cat_limit_checks = True
                ca.fit(X, y, warm_start=warm_start)
                if h == 'linear':
                    # all but the last column are okay
                    self.assertEqual(ca.trained_feature_indices_, [0, 1, 2, 3, 4])
                    self.assertEqual(ca.untrained_feature_indices_, [(5, 'cat_limit')])
                else:
                    # forest heterogeneity can't handle the last two columns
                    self.assertEqual(ca.trained_feature_indices_, [0, 1, 2, 3])
                    self.assertEqual(ca.untrained_feature_indices_,
                                     [(4, 'cat_limit'), (5, 'cat_limit')])
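# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the test suite) of the warm-start workflow
# exercised above: fit, inspect which columns were skipped and why, relax the
# category-expansion limit, then refit with warm_start=True so already-trained
# columns are kept. The import path is an assumption; adjust it to the
# installed econml version.
#
#     import numpy as np
#     import pandas as pd
#     from econml.solutions.causal_analysis import CausalAnalysis
#
#     X = pd.DataFrame({'a': np.random.normal(size=500),
#                       'd': np.random.choice(['a', 'b', 'c', 'd'], size=500)})
#     y = np.random.choice([0, 1], size=500)
#
#     ca = CausalAnalysis(['a', 'd'], ['d'], ['a'], upper_bound_on_cat_expansion=2)
#     ca.fit(X, y)
#     print(ca.untrained_feature_indices_)  # 'd' skipped: too many categories
#
#     ca.upper_bound_on_cat_expansion = 4   # relax the limit
#     ca.fit(X, y, warm_start=True)         # only the previously skipped column is fit
#     print(ca.trained_feature_indices_)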