def setUp(self) -> None:
    """Build a binarized iris train/test split wrapped in data containers."""
    super(TestBalance, self).setUp()
    X, y = load_iris(return_X_y=True)
    # Collapse class 2 into class 1 so the task becomes binary.
    y[y == 2] = 1
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=0)
    self.X_train = DataFrameContainer(
        "TrainSet", dataset_instance=X_tr,
        resource_manager=self.mock_resource_manager)
    self.X_test = DataFrameContainer(
        "TestSet", dataset_instance=X_te,
        resource_manager=self.mock_resource_manager)
    self.y_train = NdArrayContainer(
        "TrainLabel", dataset_instance=y_tr,
        resource_manager=self.mock_resource_manager)
    self.y_test = NdArrayContainer(
        "TestLabel", dataset_instance=y_te,
        resource_manager=self.mock_resource_manager)
    # All four iris features are numerical.
    self.X_train.set_feature_groups(["num"] * 4)
    self.X_test.set_feature_groups(["num"] * 4)
def test_upload_download(self):
    """Round-trip a DataFrameContainer through upload/download and verify
    that data, metadata, and index survive unchanged."""
    titanic_df = load("titanic")
    # Reverse the index so index preservation is actually exercised.
    titanic_df.index = reversed(titanic_df.index)
    dc = DataFrameContainer(dataset_instance=titanic_df,
                            resource_manager=self.mock_resource_manager)
    dc.set_feature_groups([f"feat_{i}" for i in range(dc.shape[1])])
    column_descriptions = dc.column_descriptions
    dc.upload()
    dataset_id = dc.dataset_id
    download_dc = DataFrameContainer(
        "Unittest", dataset_id=dataset_id,
        resource_manager=self.mock_resource_manager)
    # NaNs are filled so element-wise equality is well defined.
    self.assertTrue(
        np.all(download_dc.data.fillna(0) == dc.data.fillna(0)))
    self.assertTrue(
        np.all(download_dc.feature_groups == dc.feature_groups))
    self.assertTrue(np.all(download_dc.columns == dc.columns))
    self.assertTrue(np.all(download_dc.index == dc.index))
    self.assertEqual(download_dc.column_descriptions, dc.column_descriptions)
    self.assertEqual(download_dc.columns_mapper, dc.columns_mapper)
    self.assertEqual(download_dc.dataset_type, dc.dataset_type)
    self.assertEqual(download_dc.dataset_source, dc.dataset_source)
def test_classifier(self):
    """Each text transformer must slot into a tokenize -> reduce -> classify
    workflow and reach a minimum accuracy on the titanic name column."""
    train_df = datasets.load("titanic")[["Name", "Survived"]]
    y = np.array(train_df.pop("Survived"))
    X_train, X_test, y_train, y_test = train_test_split(
        train_df, y, test_size=0.2, random_state=0)
    X_train = DataFrameContainer(
        "TrainSet", dataset_instance=X_train,
        resource_manager=self.mock_resource_manager)
    X_test = DataFrameContainer(
        "TestSet", dataset_instance=X_test,
        resource_manager=self.mock_resource_manager)
    y_train = NdArrayContainer("TrainLabel", dataset_instance=y_train,
                               resource_manager=self.mock_resource_manager)
    y_test = NdArrayContainer("TestLabel", dataset_instance=y_test,
                              resource_manager=self.mock_resource_manager)
    # The single input column is raw text.
    X_train.set_feature_groups(["text"])
    X_test.set_feature_groups(["text"])
    for transformer_cls in (TsvdTransformer, NmfTransformer, LsiTransformer,
                            LdaTransformer, RpTransformer):
        print("=========================")
        print(transformer_cls.__name__)
        print("=========================")
        tokenizer = SimpleTokenlizer(
            **get_default_hp_of_cls(SimpleTokenlizer))
        tokenizer.in_feature_groups = "text"
        tokenizer.out_feature_groups = "token"
        transformer = transformer_cls(**get_default_hp_of_cls(transformer_cls))
        transformer.in_feature_groups = "token"
        transformer.out_feature_groups = "num"
        classifier = RandomForestClassifier(
            **get_default_hp_of_cls(RandomForestClassifier))
        pipeline = ML_Workflow([
            ("tokenizer", tokenizer),
            ("transformer", transformer),
            ("classifier", classifier),
        ], resource_manager=self.mock_resource_manager)
        start = time()
        pipeline.fit(X_train, y_train, X_test, y_test)
        y_pred = pipeline.predict(X_test)
        score = accuracy_score(y_test.data, y_pred)
        end = time()
        print("score:", score)
        print("time:", end - start)
        self.assertGreater(score, 0.6)
        print('\n' * 2)
def process_X(self, X: DataFrameContainer, X_origin):
    """Align ``X``'s columns and feature groups with the train set.

    Behavior depends on the type the container was built from:
    raw ndarrays adopt the train set's column names; DataFrames are
    checked for the same column set and reordered positionally if
    needed; existing DataFrameContainers are taken as-is.
    Returns the (possibly mutated) container, or None for None input.
    """
    if X is None:
        return None
    assert X.shape[1] == len(self.columns)
    if isinstance(X_origin, np.ndarray):
        # Raw arrays carry no column names; adopt the train set's.
        X.columns = self.columns
    elif isinstance(X_origin, pd.DataFrame):
        # Same set of columns required; order may differ.
        assert set(X.columns) == set(self.columns)
        if not np.all(X.columns == self.columns):
            self.logger.warning(
                f"{X.dataset_source}'s columns do not match the TrainSet's columns by position!"
            )
            # Reorder to the train set's column order.
            X.data = X.data[self.columns]
    elif isinstance(X_origin, DataFrameContainer):
        pass
    else:
        raise NotImplementedError
    X.set_feature_groups(self.feature_groups)
    return X
def test_handle_unknown(self):
    """Encoders must cope with categories at transform time that were
    never seen during fit, and transform() must reproduce fit_transform()."""
    columns = ['col1', 'col2', 'col3']
    train_rows = [
        ['A', 'alpha', 9],
        ['A', 'alpha', 1],
        ['B', 'beta', 2],
        ['B', 'beta', 3],
        ['C', 'gamma', 4],
        ['C', 'gamma', 5],
    ]
    # Every category here is unknown to the fitted encoders.
    valid_rows = [
        ['D', 'kappa', 6],
        ['D', 'kappa', 6],
        ['E', 'sigma', 7],
        ['E', 'sigma', 7],
        ['F', 'mu', 8],
        ['F', 'mu', 8],
    ]
    X_train = DataFrameContainer(
        dataset_instance=pd.DataFrame(train_rows, columns=columns))
    X_valid = DataFrameContainer(
        dataset_instance=pd.DataFrame(valid_rows, columns=columns))
    X_train.set_feature_groups(['cat'] * 3)
    X_valid.set_feature_groups(['cat'] * 3)
    y_train = NdArrayContainer(dataset_instance=[0, 1, 0, 1, 0, 1])
    for encoder_cls in [EntityEncoder, OrdinalEncoder, OneHotEncoder,
                        TargetEncoder, CatBoostEncoder]:
        hp = get_default_hp_of_cls(encoder_cls)
        encoder = encoder_cls(**hp)
        encoder.in_feature_groups = "cat"
        encoder.out_feature_groups = "ordinal"
        result = encoder.fit_transform(X_train=X_train, X_valid=X_valid,
                                       y_train=y_train)
        # transform() keys its result dict by 'X_train' regardless of
        # which split is passed in, hence the key below for both calls.
        assert np.all(encoder.transform(X_train)['X_train'].data
                      == result['X_train'].data)
        assert np.all(encoder.transform(X_valid)['X_train'].data
                      == result['X_valid'].data)
def test_ordinal_encode_category(self):
    """OrdinalEncoder must respect pandas CategoricalDtype category order
    rather than lexicographic order."""
    df2 = pd.DataFrame([
        ['C', '3'],
        ['D', '4'],
        ['D', '4'],
    ], columns=['alpha', 'digits'])
    # Deliberately reversed orders: "4" < "3" and "D" < "C".
    df2["digits"] = df2["digits"].astype(
        CategoricalDtype(categories=["4", "3"], ordered=True))
    df2["alpha"] = df2["alpha"].astype(
        CategoricalDtype(categories=["D", "C"], ordered=True))
    # Stack rows 1..2 on top of rows 0..1 -> 4 rows, fresh integer index.
    stacked = pd.concat([df2.loc[1:, :], df2.loc[:1, :]])
    stacked.index = range(4)
    encoder = OrdinalEncoder()
    encoder.in_feature_groups = "cat"
    encoder.out_feature_groups = "ordinal"
    dc = DataFrameContainer(dataset_instance=stacked)
    dc.set_feature_groups(["cat"] * 2)
    encoder.fit(X_train=dc)
    result = encoder.transform(X_train=dc)["X_train"]
    print(result)
    # Codes follow the declared category order: D->0, C->1; "4"->0, "3"->1.
    should_be = pd.DataFrame({
        'alpha': {0: 0, 1: 0, 2: 1, 3: 0},
        'digits': {0: 0, 1: 0, 2: 1, 3: 0},
    })
    assert np.all(result.data == should_be)
def setUp(self) -> None:
    """Load the qsar dataset into containers shared by the feature-selection
    tests, plus a jittered copy of the target for regression cases."""
    super(TestFeatureSelection, self).setUp()
    self.L = 1024
    df = load("qsar")
    y = df.pop("target")
    X = df
    # Replace zeros with -1 and reverse the index so that downstream
    # index handling is exercised.
    X[X == 0] = -1
    X.index = reversed(X.index)
    self.index = deepcopy(X.index)
    X = DataFrameContainer("TrainSet", dataset_instance=X,
                           resource_manager=self.mock_resource_manager)
    X.set_feature_groups(["num"] * X.shape[1])
    self.X = X
    # FIX: label containers are named "TrainLabel" (not "TrainSet"),
    # consistent with the other fixtures in this test suite.
    self.y = NdArrayContainer("TrainLabel", dataset_instance=y,
                              resource_manager=self.mock_resource_manager)
    # FIX: seed the jitter so the regression fixture is reproducible
    # across runs (was unseeded np.random.rand).
    rng = np.random.RandomState(42)
    y_reg = y + rng.rand(*y.shape)
    self.y_reg = NdArrayContainer(
        "TrainLabel", dataset_instance=y_reg,
        resource_manager=self.mock_resource_manager)
def setUp(self) -> None:
    """Prepare a full and a row-truncated copy of the qsar dataset so the
    tests can run against both sizes."""
    super(RunReduce, self).setUp()
    self.L = 1024
    df = load("qsar")
    y = df.pop("target")
    X = df
    X[X == 0] = -1
    X.index = reversed(X.index)
    self.index = deepcopy(X.index)
    X = DataFrameContainer("TrainSet", dataset_instance=X)
    X.set_feature_groups(["num"] * X.shape[1])
    # Second variant: only the first N rows.
    X2 = deepcopy(X)
    y2 = deepcopy(y)
    N = 500
    X2.data = X2.data.iloc[:N, :]
    X2.set_feature_groups(["num"] * X2.shape[1])
    y2 = y2.iloc[:N]
    self.Xs = [X, X2]
    self.ys = [
        NdArrayContainer("TrainLabel", dataset_instance=y),
        NdArrayContainer("TrainLabel", dataset_instance=y2),
    ]