def test_tabular_preprocess(self):
    """When the data is small enough to be preprocessed up front
    (``is_small_preprocess=True``), the fitted pipeline must NOT leave
    ``preprocess_transforms`` in the transformed fit dictionary.
    """
    dataset_properties = {
        'numerical_columns': list(range(15)),
        'categorical_columns': [],
        'task_type': 'tabular_classification',
    }
    fit_dictionary = dict(
        X_train=np.random.random((10, 15)),
        y_train=np.random.random(10),
        train_indices=[0, 1, 2, 3, 4, 5],
        val_indices=[6, 7, 8, 9],
        is_small_preprocess=True,
        numerical_columns=list(range(15)),
        categorical_columns=[],
        num_features=15,
        num_classes=2,
        categories=[],
        # Training configuration
        job_id='test',
        device='cpu',
        budget_type='epochs',
        epochs=10,
        torch_num_threads=1,
        early_stopping=20,
        dataset_properties=dataset_properties,
        split_id=0,
        backend=self.backend,
    )

    pipeline = TabularClassificationPipeline(
        dataset_properties=dataset_properties)
    # Drop the trainer step so fitting stops after the preprocessing stages
    pipeline.steps.pop()

    fitted_pipeline = pipeline.fit(fit_dictionary)
    transformed = fitted_pipeline.transform(fit_dictionary)

    self.assertNotIn('preprocess_transforms', transformed.keys())
def test_pipeline_transform(self, fit_dictionary_tabular):
    """Check that ``transform`` expands a fit dictionary with fitted components.

    In autoPyTorch, ``transform`` enriches a fit dictionary with every
    component that was previously fit, which gives us a convenient way to
    verify that ``fit`` registered its components properly. This test was
    added after a bug where components were not correctly added to the
    fit dictionary.
    """
    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular['dataset_properties'])
    search_space = pipeline.get_hyperparameter_search_space()
    pipeline.set_hyperparameters(search_space.sample_configuration())

    # Mock out the (expensive) epoch training loop; we only care about the
    # bookkeeping that fit() performs on the dictionary.
    with unittest.mock.patch.object(pipeline.named_steps['trainer'].choice,
                                    'train_epoch') as patch_train:
        patch_train.return_value = 1, {}
        # Fit on a copy so the early preprocessing does not mutate the
        # original fixture dictionary.
        pipeline.fit(fit_dictionary_tabular.copy())

    transformed_fit_dictionary_tabular = pipeline.transform(
        fit_dictionary_tabular)

    # Nothing from the original dictionary may be lost
    # (fancy subset-containment check on the items view).
    assert fit_dictionary_tabular.items() \
        <= transformed_fit_dictionary_tabular.items()

    # The pipeline must have contributed at least these keys.
    expected_keys = {
        'imputer', 'encoder', 'scaler', 'tabular_transformer',
        'preprocess_transforms', 'network', 'optimizer', 'lr_scheduler',
        'train_data_loader', 'val_data_loader', 'run_summary',
    }
    assert expected_keys.issubset(
        set(transformed_fit_dictionary_tabular.keys()))

    # Transformations must actually have been created ...
    assert len(
        get_preprocess_transforms(transformed_fit_dictionary_tabular)) > 0
    # ... and remain in the dictionary at any time, for inference.
    assert 'preprocess_transforms' in transformed_fit_dictionary_tabular.keys()
def test_tabular_no_preprocess(self):
    """When the data cannot be preprocessed up front
    (``is_small_preprocess=False``), the fitted pipeline MUST expose
    ``preprocess_transforms`` in the transformed fit dictionary, and the
    last transform must wrap a scikit-learn ``BaseEstimator``.
    """
    dataset_properties = {
        'numerical_columns': list(range(15)),
        'categorical_columns': [],
        'task_type': TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
        'output_type': OUTPUT_TYPES_TO_STRING[MULTICLASS],
        'is_small_preprocess': False,
        'input_shape': (15, ),
        'output_shape': 2,
        'categories': [],
        'issparse': False,
    }
    fit_dictionary = dict(
        X_train=np.random.random((10, 15)),
        y_train=np.random.random(10),
        train_indices=[0, 1, 2, 3, 4, 5],
        val_indices=[6, 7, 8, 9],
        dataset_properties=dataset_properties,
        # Training configuration
        num_run=16,
        device='cpu',
        budget_type='epochs',
        epochs=10,
        torch_num_threads=1,
        early_stopping=20,
        split_id=0,
        backend=self.backend,
    )

    pipeline = TabularClassificationPipeline(
        dataset_properties=dataset_properties)
    # Drop the trainer step so fitting stops after the preprocessing stages
    pipeline.steps.pop()

    fitted_pipeline = pipeline.fit(fit_dictionary)
    transformed = fitted_pipeline.transform(fit_dictionary)

    self.assertIn('preprocess_transforms', transformed.keys())
    self.assertIsInstance(transformed['preprocess_transforms'], list)
    self.assertIsInstance(transformed['preprocess_transforms'][-1].preprocessor,
                          BaseEstimator)