# Locations of the pickled converter / learner used as an on-disk cache.
ac_filepath = "tmp/sample_ac.pickle"
al_filepath = "tmp/sample_al.pickle"

df = pd.read_csv('data/train.csv')

# Fit and persist both objects unless *both* pickles are already present;
# a single missing file triggers a full re-fit so the pair stays in sync.
if not os.path.exists(ac_filepath) or not os.path.exists(al_filepath):
    ac = AutoConverter(target='Survived')
    X, y = ac.fit_transform(df)

    al = AutoLearn(
        customized_clf_list=[('LogisticRegression', LogisticRegression())],
        metric='roc_auc',
        cv_num=5,
        pos_label=1,
        n_jobs=1,
        verbose=0,
    )
    results = al.learn(X, y)
    print(results['name'])
    print(results['eval_df'])

    pred = al.predict(X)
    print(pred)

    ac.save(ac_filepath)
    al.save(al_filepath)

# Always continue from the saved files, whether they were just written or
# already existed, so the evaluation below runs on the reloaded objects.
ac = AutoConverter.load(ac_filepath)
al = AutoLearn.load(al_filepath)

e = Evaluate(ac, al)
orig_eval_s = e.evaluate_performance(df)
col_imp_df = e.calculate_column_importance(df)
class EvaluateTestCase(unittest.TestCase):
    """Tests for ``Evaluate`` wrapping AutoLearn models and plain estimators.

    Every test relies on the fixtures built in :meth:`setUp`; the data files
    are read through paths relative to the current working directory.
    """

    def setUp(self):
        """Build the ``Evaluate`` fixtures used by the tests.

        Fixtures created (rebuilt before every test method):

        * ``df1`` / ``e1`` / ``e1a`` / ``e1b`` -- ``data/train.csv`` with
          target ``Survived``; ``e1`` wraps an ``AutoLearn`` model, ``e1a`` a
          fitted ``LogisticRegression``, and ``e1b`` is built without a
          converter (only ``feature_names``), so it carries no data.
        * ``df2`` / ``e2`` / ``e2a`` -- the Iris dataset with target
          ``class``; ``AutoLearn`` and ``LogisticRegression`` variants.
        * ``e3`` -- KKBox churn sample: main table plus two subtables linked
          and grouped on ``msno``.
        * ``df4`` / ``e4`` -- the KKBox main table handled as a regression
          task.
        """
        # --- data/train.csv, target "Survived" ---------------------------
        self.df1 = pd.read_csv('data/train.csv')
        ac1 = AutoConverter(target='Survived')
        X1, y1 = ac1.fit_transform(self.df1)

        al1 = AutoLearn(level=1)
        al1.learn(X1, y1)
        self.e1 = Evaluate(ac=ac1, alearn=al1)

        clf1 = LogisticRegression()
        clf1.fit(X1, y1)
        self.e1a = Evaluate(ac=ac1, alearn=clf1)

        # Without a converter, Evaluate must be told the feature names.
        with self.assertRaises(ValueError):
            Evaluate(alearn=al1)
        self.e1b = Evaluate(alearn=al1, feature_names=ac1.feature_names)

        # --- Iris, target "class" ----------------------------------------
        data = datasets.load_iris()
        # Put the target in the first column, followed by the features.
        self.df2 = pd.DataFrame(
            np.c_[data.target.reshape(-1, 1), data.data],
            columns=["class"] + data.feature_names)
        ac2 = AutoConverter(target="class")
        al2 = AutoLearn(level=1)
        X2, y2 = ac2.fit_transform(self.df2)
        al2.learn(X2, y2)
        self.e2 = Evaluate(ac=ac2, alearn=al2)

        clf2 = LogisticRegression()
        clf2.fit(X2, y2)
        self.e2a = Evaluate(ac=ac2, alearn=clf2)

        # --- KKBox churn sample with subtables ---------------------------
        dirpath = "data/kaggle-kkbox-churn-prediction-challenge-1k"
        members_df = pd.read_csv(os.path.join(dirpath, "members_train.csv"))
        transactions_df = pd.read_csv(os.path.join(dirpath,
                                                   "transactions.csv"))
        user_logs_df = pd.read_csv(os.path.join(dirpath, "user_logs.csv"))
        subtables3 = {
            "transactions": {
                "table": transactions_df,
                "link_key": "msno",
                "group_key": "msno",
            },
            "user_logs": {
                "table": user_logs_df,
                "link_key": "msno",
                "group_key": "msno",
            },
        }
        ac3 = AutoConverter(target="is_churn")
        X3, y3 = ac3.fit_transform(df=members_df, subtables=subtables3)
        al3 = AutoLearn(level=1)
        al3.learn(X3, y3)
        self.e3 = Evaluate(ac=ac3, alearn=al3)

        # --- KKBox main table as a regression task -----------------------
        self.df4 = members_df
        ac4 = AutoConverter(target="is_churn", task_type="regression")
        X4, y4 = ac4.fit_transform(df=members_df)
        al4 = AutoLearn(level=1, task="regression")
        al4.learn(X4, y4)
        self.e4 = Evaluate(alearn=al4, ac=ac4)

    def test_calculate_column_importance(self):
        """``calculate_column_importance`` must not raise for any fixture."""
        evaluators = [self.e1, self.e1a, self.e2, self.e2a, self.e3, self.e4]
        for idx, evaluator in enumerate(evaluators):
            try:
                evaluator.calculate_column_importance()
            except Exception as exc:
                # Use a name distinct from the loop variable: Python 3
                # unbinds the ``as`` target when the handler exits.
                self.fail("evaluator #{}: {}".format(idx, exc))

    def test_evaluate(self):
        """Performance metrics and column-importance columns must line up."""
        for evaluator in [self.e1, self.e1a, self.e2, self.e2a,
                          self.e3, self.e4]:
            orig_eval_s = evaluator.evaluate_performance()
            col_imp_df = evaluator.calculate_column_importance()
            self.assertEqual(orig_eval_s.index.tolist(),
                             col_imp_df.columns.tolist())

        # They should raise Errors as X and y are not given
        with self.assertRaises(ValueError):
            self.e1b.evaluate_performance()
        with self.assertRaises(ValueError):
            self.e1b.calculate_column_importance()

    def test_get_top_column(self):
        """``get_top_columns`` returns ``n`` names prefixed by a known table."""
        self.assertEqual(5, len(self.e1.get_top_columns(n=5)))

        # Column names look like "<table>..<column>"; the table part must be
        # either one of the registered subtables or "main".
        valid_tables = list(self.e3.ac.subtables_.keys()) + ["main"]
        for table_colname in self.e3.get_top_columns(n=3):
            tablename = table_colname.split("..")[0]
            self.assertIn(tablename, valid_tables)

    def test_get_mispredictions(self):
        """Misprediction frames may only contain columns of the input frame."""
        cases = [(self.e1, self.df1), (self.e1a, self.df1),
                 (self.e2, self.df2), (self.e2a, self.df2)]
        for evaluator, df in cases:
            mispred_df = evaluator.get_mispredictions(df)
            orig_colset = set(df.columns.tolist())
            mispred_colset = set(mispred_df.columns.tolist())
            # All columns in mispred_df should be in df; comparing the
            # difference with the empty set names the offenders on failure.
            self.assertEqual(set(), mispred_colset - orig_colset)

    def test_stratify_errors(self):
        """``stratify_errors`` yields an ``ErrorSummary`` or ``None``."""
        for evaluator, df in [(self.e1, self.df1), (self.e1a, self.df1)]:
            es = evaluator.stratify_errors(df)
            self.assertIsNotNone(es)
            self.assertIsInstance(es, ErrorSummary)
            self.assertIsNotNone(es.diversity)
            self.assertIsNotNone(es.error_dist)
            self.assertIsNotNone(es.errors)
            # First index level of error_dist must match diversity's index.
            self.assertEqual(es.error_dist.index.levels[0].tolist(),
                             es.diversity.index.tolist())

        # None should be returned for the Iris dataset
        self.assertIsNone(self.e2.stratify_errors(self.df2))
        self.assertIsNone(self.e2a.stratify_errors(self.df2))

    def test_get_explanations(self):
        """``get_explanations`` returns one row per input row."""
        e_df1 = self.e1.get_explanations(self.df1)
        self.assertEqual(e_df1.shape[0], self.df1.shape[0])

        e_df1a = self.e1a.get_explanations(self.df1)
        self.assertEqual(e_df1a.shape[0], self.df1.shape[0])

        e_df2 = self.e2.get_explanations(self.df2)
        self.assertEqual(e_df2.shape[0], self.df2.shape[0])

        e_df2a = self.e2a.get_explanations(self.df2)
        self.assertEqual(e_df2a.shape[0], self.df2.shape[0])