def test_predict_proba_batched(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
    default = cs.get_default_configuration()

    # Multiclass
    cls = SimpleClassificationPipeline(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = SimpleClassificationPipeline(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    Y_train = np.array([(y, 26 - y) for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertIsInstance(prediction, list)
    self.assertEqual(2, len(prediction))
    self.assertEqual((1647, 10), prediction[0].shape)
    self.assertEqual((1647, 10), prediction[1].shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

def test_predict_proba_batched(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        include={'classifier': ['decision_tree']})
    default = cs.get_default_configuration()

    # Multiclass
    cls = SimpleClassificationPipeline(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = unittest.mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = SimpleClassificationPipeline(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                        for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = unittest.mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertIsInstance(prediction, np.ndarray)
    self.assertEqual(prediction.shape, (1647, 10))
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

def test_predict_proba_batched(self):
    # Multiclass
    cls = SimpleClassificationPipeline(include={'classifier': ['sgd']})
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
    cls.steps[-1][-1].predict_proba = cls_predict
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.call_count)
    np.testing.assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = SimpleClassificationPipeline(include={'classifier': ['lda']})
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                        for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
    cls.steps[-1][-1].predict_proba = cls_predict
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.call_count)
    np.testing.assert_array_almost_equal(prediction_, prediction)

def test_predict_proba_batched_sparse(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    config = Configuration(
        cs,
        values={
            "balancing:strategy": "none",
            "classifier:__choice__": "random_forest",
            "imputation:strategy": "mean",
            "one_hot_encoding:minimum_fraction": 0.01,
            "one_hot_encoding:use_minimum_fraction": 'True',
            "preprocessor:__choice__": "no_preprocessing",
            'classifier:random_forest:bootstrap': 'True',
            'classifier:random_forest:criterion': 'gini',
            'classifier:random_forest:max_depth': 'None',
            'classifier:random_forest:min_samples_split': 2,
            'classifier:random_forest:min_samples_leaf': 2,
            'classifier:random_forest:min_weight_fraction_leaf': 0.0,
            'classifier:random_forest:max_features': 0.5,
            'classifier:random_forest:max_leaf_nodes': 'None',
            'classifier:random_forest:n_estimators': 100,
            "rescaling:__choice__": "min/max",
        })

    # Multiclass
    cls = SimpleClassificationPipeline(config)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = SimpleClassificationPipeline(config)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                        for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual(prediction.shape, (1647, 10))
    self.assertIsInstance(prediction, np.ndarray)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier(
        self):
    cs = SimpleClassificationPipeline(
        include={'preprocessor': ['densifier']},
        dataset_properties={'sparse': True}).get_hyperparameter_search_space()
    self.assertEqual(
        cs.get_hyperparameter('classifier:__choice__').default, 'qda')

    cs = SimpleClassificationPipeline(
        include={'preprocessor': ['nystroem_sampler']}).get_hyperparameter_search_space()
    self.assertEqual(
        cs.get_hyperparameter('classifier:__choice__').default, 'sgd')

def test_configurations_signed_data(self):
    # Use a limit of ~4 GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'signed': True})
    print(cs)

    for i in range(10):
        config = cs.sample_configuration()
        config._populate_values()
        # Cap the number of iterations to keep the test fast
        if config['classifier:passive_aggressive:n_iter'] is not None:
            config._values['classifier:passive_aggressive:n_iter'] = 5
        if config['classifier:sgd:n_iter'] is not None:
            config._values['classifier:sgd:n_iter'] = 5

        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls = SimpleClassificationPipeline(config, random_state=1)
        print(config)
        try:
            cls.fit(X_train, Y_train)
            X_test_ = X_test.copy()
            predictions = cls.predict(X_test)
            self.assertIsInstance(predictions, np.ndarray)
            predicted_probabilities = cls.predict_proba(X_test_)
            self.assertIsInstance(predicted_probabilities, np.ndarray)
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in e.args[0] or \
                    "removed all features" in e.args[0] or \
                    "all features are discarded" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except MemoryError:
            continue

def get_models_performance_by_data(input):
    X, y = input[0], input[1]
    model = random_model()

    train_accuracy_score = []
    test_accuracy_score = []
    train_log_loss = []
    test_log_loss = []

    kf = KFold(n_splits=5, random_state=1, shuffle=True)
    time_start = time.time()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        p = SimpleClassificationPipeline(config=model)
        p.fit(X_train, y_train)
        y_train_pred = p.predict(X_train)
        y_test_pred = p.predict(X_test)
        train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
        test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
        # Note: log loss is computed on hard label predictions here, not on
        # predicted probabilities.
        train_log_loss.append(log_loss(y_train, y_train_pred))
        test_log_loss.append(log_loss(y_test, y_test_pred))
    time_end = time.time()
    duration = time_end - time_start

    models_performance = {
        "train_accuracy_score": np.mean(train_accuracy_score),
        "test_accuracy_score": np.mean(test_accuracy_score),
        "train_log_loss": np.mean(train_log_loss),
        "test_log_loss": np.mean(test_log_loss),
        "duration": duration / 5,  # average duration per fold
    }
    return models_performance

def get_performance_of_encoded_model(data_set, encoded_model, verbose=False):
    """Get a model performance array (4 x 1) from an encoded model vector (17 x 1).

    data_set : (X, y) input dataset to evaluate on
    encoded_model : encoded model choice vector (17 x 1)
    verbose : if True, log the decoded model choice and the model performance

    return : model performance vector (4 x 1)
    """
    train_accuracy_score = []
    test_accuracy_score = []
    train_log_loss = []
    test_log_loss = []
    X, y = data_set

    model = decode_model(encoded_model)
    if verbose:
        print('Model choice: {0}'.format(model))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)
    p = SimpleClassificationPipeline(config=model)
    p.fit(X_train, y_train)

    y_train_pred = p.predict(X_train)
    y_test_pred = p.predict(X_test)
    train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
    test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
    # Note: log loss is computed on hard label predictions, not probabilities.
    train_log_loss.append(log_loss(y_train, y_train_pred))
    test_log_loss.append(log_loss(y_test, y_test_pred))

    model_performance = np.array([np.mean(train_accuracy_score),
                                  np.mean(test_accuracy_score),
                                  np.mean(train_log_loss),
                                  np.mean(test_log_loss)])
    if verbose:
        print('Model performance: {0}'.format(model_performance))
    return model_performance

def test_predict_batched_sparse(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        include={'classifier': ['decision_tree']},
        dataset_properties={'sparse': True})
    config = cs.get_default_configuration()
    cls = SimpleClassificationPipeline(config)

    # Multiclass
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = unittest.mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647,), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                        for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = unittest.mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)

def test_fit_instantiates_component(self):
    """Make sure that if a preprocessor is added, its fit method is called."""
    preprocessing_components.add_preprocessor(CrashPreprocessor)

    # We reduce the search space, as forbidden clauses prevent instantiating
    # the user-defined preprocessor manually
    cls = SimpleClassificationPipeline(include={'classifier': ['random_forest']})
    cs = cls.get_hyperparameter_search_space()
    self.assertIn('CrashPreprocessor', str(cs))
    config = cs.sample_configuration()
    try:
        config['feature_preprocessor:__choice__'] = 'CrashPreprocessor'
    except Exception as e:
        # In case of failure, clean up the components and print enough
        # information to allow checking and cleaning up in the future
        del preprocessing_components._addons.components['CrashPreprocessor']
        self.fail("cs={} config={} Exception={}".format(cs, config, e))
    cls.set_hyperparameters(config)
    with self.assertRaisesRegex(
        ValueError,
        "Make sure fit is called"
    ):
        # X must have as many rows as y has labels (4 samples, 3 features)
        cls.fit(
            X=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]),
            y=np.array([1, 0, 1, 1])
        )
    del preprocessing_components._addons.components['CrashPreprocessor']

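# The test above registers a `CrashPreprocessor` fixture that is defined in the
# test module. Below is a minimal sketch of what such a fixture could look like,
# assuming auto-sklearn's component API (AutoSklearnPreprocessingAlgorithm with
# get_properties / get_hyperparameter_search_space); the only behaviour the test
# relies on is that fit() raises ValueError("Make sure fit is called"). This is
# an illustrative assumption, not the project's actual definition.
from ConfigSpace.configuration_space import ConfigurationSpace
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT


class CrashPreprocessor(AutoSklearnPreprocessingAlgorithm):
    def __init__(self, *args, **kwargs):
        pass

    def fit(self, X, y=None):
        # The message that test_fit_instantiates_component matches with
        # assertRaisesRegex
        raise ValueError("Make sure fit is called")

    def transform(self, X):
        return X

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'CrashPreprocessor',
                'name': 'CrashPreprocessor',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        return ConfigurationSpace()
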
def test_configurations_signed_data(self):
    dataset_properties = {'signed': True}
    cs = SimpleClassificationPipeline(dataset_properties=dataset_properties) \
        .get_hyperparameter_search_space()
    self._test_configurations(configurations_space=cs,
                              dataset_properties=dataset_properties)

def test_get_hyperparameter_search_space(self):
    cs = SimpleClassificationPipeline().get_hyperparameter_search_space()
    self.assertIsInstance(cs, ConfigurationSpace)
    conditions = cs.get_conditions()
    forbiddens = cs.get_forbiddens()

    self.assertEqual(
        len(cs.get_hyperparameter(
            'data_preprocessing:numerical_transformer:rescaling:__choice__'
        ).choices), 7)
    self.assertEqual(
        len(cs.get_hyperparameter('classifier:__choice__').choices), 16)
    self.assertEqual(
        len(cs.get_hyperparameter('feature_preprocessor:__choice__').choices), 13)

    hyperparameters = cs.get_hyperparameters()
    self.assertEqual(167, len(hyperparameters))

    # for hp in sorted([str(h) for h in hyperparameters]):
    #     print(hp)

    # The components which are always active (classifier, feature
    # preprocessor, balancing and the data preprocessing pipeline) contribute
    # hyperparameters without conditions.
    self.assertEqual(len(hyperparameters) - 7, len(conditions))

    self.assertEqual(len(forbiddens), 53)

def _get_classification_configuration_space(info, include, exclude):
    task_type = info['task']

    multilabel = False
    multiclass = False
    sparse = False

    if task_type == MULTILABEL_CLASSIFICATION:
        multilabel = True
    if task_type == REGRESSION:
        raise NotImplementedError()
    if task_type == MULTICLASS_CLASSIFICATION:
        multiclass = True
    if task_type == BINARY_CLASSIFICATION:
        pass

    if info['is_sparse'] == 1:
        sparse = True

    dataset_properties = {
        'multilabel': multilabel,
        'multiclass': multiclass,
        'sparse': sparse
    }

    return SimpleClassificationPipeline(
        dataset_properties=dataset_properties,
        include=include, exclude=exclude).get_hyperparameter_search_space()

def test_configurations_categorical_data(self):
    cs = SimpleClassificationPipeline(dataset_properties={'sparse': True}) \
        .get_hyperparameter_search_space()

    categorical = [True, True, True, False, False, True, True, True, False,
                   True, True, True, True, True, True, True, True, True,
                   True, True, True, True, True, True, True, True, True,
                   True, True, True, True, True, False, False, False, True,
                   True, True]
    this_directory = os.path.dirname(__file__)
    X = np.loadtxt(os.path.join(this_directory, "components",
                                "data_preprocessing", "dataset.pkl"))
    y = X[:, -1].copy()
    X = X[:, :-1]
    # In order to usefully test the neural networks
    _, y = np.unique(y, return_inverse=True)
    X_train, X_test, Y_train, Y_test = \
        sklearn.model_selection.train_test_split(X, y)
    data = {'X_train': X_train, 'Y_train': Y_train,
            'X_test': X_test, 'Y_test': Y_test}

    init_params = {'one_hot_encoding:categorical_features': categorical}

    self._test_configurations(configurations_space=cs, make_sparse=True,
                              data=data, init_params=init_params)

def test_add_classifier(self):
    self.assertEqual(len(classification_components._addons.components), 0)
    classification_components.add_classifier(DummyClassifier)
    self.assertEqual(len(classification_components._addons.components), 1)
    cs = SimpleClassificationPipeline().get_hyperparameter_search_space()
    self.assertIn('DummyClassifier', str(cs))
    del classification_components._addons.components['DummyClassifier']

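# `DummyClassifier` is likewise a fixture from the test suite. A minimal sketch
# under the same assumption about auto-sklearn's component API: a classification
# component wrapping scikit-learn's most-frequent-class baseline. Illustrative
# only, not the project's actual fixture.
from ConfigSpace.configuration_space import ConfigurationSpace
from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, PREDICTIONS


class DummyClassifier(AutoSklearnClassificationAlgorithm):
    def __init__(self, random_state=None):
        self.estimator = None

    def fit(self, X, y):
        import sklearn.dummy
        self.estimator = sklearn.dummy.DummyClassifier(strategy='most_frequent')
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'Dummy',
                'name': 'DummyClassifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        return ConfigurationSpace()
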
def test_configurations_categorical_data(self):
    cs = SimpleClassificationPipeline(
        dataset_properties={'sparse': False},
        include={
            'feature_preprocessor': ['no_preprocessing'],
            'classifier': ['sgd', 'adaboost']
        }
    ).get_hyperparameter_search_space()

    categorical = [True, True, True, False, False, True, True, True, False,
                   True, True, True, True, True, True, True, True, True,
                   True, True, True, True, True, True, True, True, True,
                   True, True, True, True, True, False, False, False, True,
                   True, True]
    this_directory = os.path.dirname(__file__)
    X = np.loadtxt(os.path.join(this_directory, "components",
                                "data_preprocessing", "dataset.pkl"))
    y = X[:, -1].copy()
    X = X[:, :-1]
    X_train, X_test, Y_train, Y_test = \
        sklearn.model_selection.train_test_split(X, y)
    data = {'X_train': X_train, 'Y_train': Y_train,
            'X_test': X_test, 'Y_test': Y_test}

    init_params = {'data_preprocessing:categorical_features': categorical}

    self._test_configurations(configurations_space=cs, make_sparse=True,
                              data=data, init_params=init_params)

def _get_classification_configuration_space(
    info: Dict[str, Any],
    include: Optional[Dict[str, List[str]]],
    exclude: Optional[Dict[str, List[str]]]
) -> ConfigurationSpace:
    task_type = info['task']

    multilabel = False
    multiclass = False
    sparse = False

    if task_type == MULTILABEL_CLASSIFICATION:
        multilabel = True
    if task_type == MULTICLASS_CLASSIFICATION:
        multiclass = True
    if task_type == BINARY_CLASSIFICATION:
        pass

    if info['is_sparse'] == 1:
        sparse = True

    dataset_properties = {
        'multilabel': multilabel,
        'multiclass': multiclass,
        'sparse': sparse
    }

    return SimpleClassificationPipeline(
        dataset_properties=dataset_properties,
        include=include, exclude=exclude).get_hyperparameter_search_space()

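# A short usage sketch for _get_classification_configuration_space, assuming the
# task constants come from autosklearn.constants and that include/exclude are
# either None or dicts such as {'classifier': ['random_forest']}.
from autosklearn.constants import MULTICLASS_CLASSIFICATION

info = {'task': MULTICLASS_CLASSIFICATION, 'is_sparse': 0}
cs = _get_classification_configuration_space(info, include=None, exclude=None)
print(cs.get_hyperparameter('classifier:__choice__'))
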
def test_add_preprocessor(self):
    self.assertEqual(len(preprocessing_components._addons.components), 0)
    preprocessing_components.add_preprocessor(DummyPreprocessor)
    self.assertEqual(len(preprocessing_components._addons.components), 1)
    cs = SimpleClassificationPipeline().get_hyperparameter_search_space()
    self.assertIn('DummyPreprocessor', str(cs))
    del preprocessing_components._addons.components['DummyPreprocessor']

def test_predict_batched(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
    default = cs.get_default_configuration()
    cls = SimpleClassificationPipeline(default)

    # Multiclass
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647,), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                        for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)

def test_multilabel(self):
    cache = Memory(location=tempfile.gettempdir())
    cached_func = cache.cache(sklearn.datasets.make_multilabel_classification)
    X, Y = cached_func(
        n_samples=150,
        n_features=20,
        n_classes=5,
        n_labels=2,
        length=50,
        allow_unlabeled=True,
        sparse=False,
        return_indicator=True,
        return_distributions=False,
        random_state=1
    )
    X_train = X[:100, :]
    Y_train = Y[:100, :]
    X_test = X[101:, :]
    Y_test = Y[101:, :]

    data = {'X_train': X_train, 'Y_train': Y_train,
            'X_test': X_test, 'Y_test': Y_test}

    dataset_properties = {'multilabel': True}
    cs = SimpleClassificationPipeline(dataset_properties=dataset_properties) \
        .get_hyperparameter_search_space()
    self._test_configurations(configurations_space=cs, data=data)

def test_default_configuration(self):
    for i in range(2):
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
        auto = SimpleClassificationPipeline()
        auto = auto.fit(X_train, Y_train)
        predictions = auto.predict(X_test)
        self.assertAlmostEqual(
            0.96, sklearn.metrics.accuracy_score(predictions, Y_test))
        auto.predict_proba(X_test)

def get_models_performance(reproduce_num, data_set_idx):
    """
    reproduce_num : the number of model choices for the dataset to reproduce
    data_set_idx : generated dataset index; the tried models for this dataset
        are loaded from the corresponding JSON file

    return : reproduced models performance, also written to a JSON file
    """
    X = np.loadtxt('Data_Set/X_' + str(data_set_idx))
    y = np.loadtxt('Data_Set/y_' + str(data_set_idx))
    probas = np.loadtxt('Data_Set/probas_' + str(data_set_idx))

    tried_models_filename = "./log/classifier_log" + str(data_set_idx) + \
        "/tried_models_for_dataset" + str(data_set_idx) + ".json"
    models_performance = {}
    with open(tried_models_filename) as fp:
        models = json.load(fp)

    reproduce_num_act = min(len(models), reproduce_num)
    for i in range(1, reproduce_num_act + 1):
        model = models[str(i)]
        train_accuracy_score = []
        test_accuracy_score = []
        train_log_loss = []
        test_log_loss = []

        time_start = time.time()
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42, shuffle=True)
        p = SimpleClassificationPipeline(config=model)
        p.fit(X_train, y_train)
        y_train_pred = p.predict(X_train)
        y_test_pred = p.predict(X_test)
        train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
        test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
        # Note: log loss is computed on hard label predictions here, not on
        # predicted probabilities.
        train_log_loss.append(log_loss(y_train, y_train_pred))
        test_log_loss.append(log_loss(y_test, y_test_pred))
        time_end = time.time()
        duration = time_end - time_start

        models_performance[i] = {
            "train_accuracy_score": np.mean(train_accuracy_score),
            "test_accuracy_score": np.mean(test_accuracy_score),
            "train_log_loss": np.mean(train_log_loss),
            "test_log_loss": np.mean(test_log_loss),
            "duration": duration
        }

    reproduce_performance_json_filename = "./log/classifier_log" + \
        str(data_set_idx) + "/reproduce_models_performance" + \
        str(data_set_idx) + ".json"
    with open(reproduce_performance_json_filename, 'w') as fp:
        json.dump(models_performance, fp)
    return models_performance

def test_default_configuration_iterative_fit(self):
    classifier = SimpleClassificationPipeline(
        include={'classifier': ['random_forest'],
                 'feature_preprocessor': ['no_preprocessing']})
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
    classifier.fit_transformer(X_train, Y_train)
    # Each call to iterative_fit should add one tree to the forest
    for i in range(1, 11):
        classifier.iterative_fit(X_train, Y_train)
        self.assertEqual(
            classifier.steps[-1][-1].choice.estimator.n_estimators, i)

def test_default_configuration(self):
    for i in range(2):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
        auto = SimpleClassificationPipeline(default)
        auto = auto.fit(X_train, Y_train)
        predictions = auto.predict(X_test)
        self.assertAlmostEqual(
            0.9599999999999995,
            sklearn.metrics.accuracy_score(predictions, Y_test))
        scores = auto.predict_proba(X_test)

def test_configurations_sparse(self):
    # Use a limit of ~4 GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    print(cs)
    for i in range(10):
        config = cs.sample_configuration()
        config._populate_values()
        # Cap the number of iterations to keep the test fast
        if 'classifier:passive_aggressive:n_iter' in config and \
                config['classifier:passive_aggressive:n_iter'] is not None:
            config._values['classifier:passive_aggressive:n_iter'] = 5
        if 'classifier:sgd:n_iter' in config and \
                config['classifier:sgd:n_iter'] is not None:
            config._values['classifier:sgd:n_iter'] = 5

        print(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls = SimpleClassificationPipeline(config, random_state=1)
        try:
            cls.fit(X_train, Y_train)
            predictions = cls.predict(X_test)
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in e.args[0] or \
                    "removed all features" in e.args[0] or \
                    "all features are discarded" in e.args[0]:
                continue
            else:
                print(config)
                traceback.print_tb(sys.exc_info()[2])
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                raise e

def test_predict_batched_sparse(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    config = Configuration(
        cs,
        values={
            "balancing:strategy": "none",
            "classifier:__choice__": "random_forest",
            "imputation:strategy": "mean",
            "one_hot_encoding:minimum_fraction": 0.01,
            "one_hot_encoding:use_minimum_fraction": "True",
            "preprocessor:__choice__": "no_preprocessing",
            'classifier:random_forest:bootstrap': 'True',
            'classifier:random_forest:criterion': 'gini',
            'classifier:random_forest:max_depth': 'None',
            'classifier:random_forest:min_samples_split': 2,
            'classifier:random_forest:min_samples_leaf': 2,
            'classifier:random_forest:max_features': 0.5,
            'classifier:random_forest:max_leaf_nodes': 'None',
            'classifier:random_forest:n_estimators': 100,
            'classifier:random_forest:min_weight_fraction_leaf': 0.0,
            "rescaling:__choice__": "min/max",
        })
    cls = SimpleClassificationPipeline(config)

    # Multiclass
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647,), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    Y_train = np.array([(y, 26 - y) for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647, 2), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)

def max_estimators_fit_duration(X, y, max_classifier_time_budget, logger,
                                sample_factor=1):
    lo = utl.get_logger(inspect.stack()[0][3])

    lo.info("Constructing preprocessor pipeline and transforming sample data")
    # We don't care about the data here, but we need to preprocess it,
    # otherwise the classifiers crash
    pipeline = SimpleClassificationPipeline(
        include={'imputation': ['most_frequent'],
                 'rescaling': ['standardize']})
    default_cs = pipeline.get_hyperparameter_search_space().get_default_configuration()
    pipeline = pipeline.set_hyperparameters(default_cs)
    pipeline.fit(X, y)
    X_tr, dummy = pipeline.fit_transformer(X, y)

    lo.info("Running estimators on the sample")
    # Go over all default classifiers used by auto-sklearn
    clfs = autosklearn.pipeline.components.classification._classifiers

    processes = []
    with multiprocessing.Manager() as manager:
        max_clf_time = manager.Value('i', 3)  # default 3 sec
        for clf_name, clf_class in clfs.items():
            pr = multiprocessing.Process(target=time_single_estimator,
                                         name=clf_name,
                                         args=(clf_name, clf_class, X_tr, y,
                                               max_clf_time, logger))
            pr.start()
            processes.append(pr)
        for pr in processes:
            # Block for max_classifier_time_budget, or until the classifier
            # fit process finishes. After max_classifier_time_budget we
            # terminate all still-running processes here.
            pr.join(max_classifier_time_budget)
            if pr.is_alive():
                logger.info("Terminating " + pr.name + " process due to timeout")
                pr.terminate()
        result_max_clf_time = max_clf_time.value

    lo.info("Test classifier fit completed")

    per_run_time_limit = int(sample_factor * result_max_clf_time)
    # Never exceed the overall classifier time budget
    return min(per_run_time_limit, max_classifier_time_budget)

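# `time_single_estimator` is defined elsewhere in this project. Below is a
# minimal sketch of such a worker, under the assumption that each component
# class can be instantiated from the defaults of its own hyperparameter search
# space: fit the classifier on the transformed sample and record the slowest
# observed fit time in the shared manager Value. Hypothetical, not the
# project's actual helper.
import time


def time_single_estimator(clf_name, clf_class, X_tr, y, max_clf_time, logger):
    config = clf_class.get_hyperparameter_search_space().get_default_configuration()
    # Component-level hyperparameter names are unprefixed, so they can be
    # passed as keyword arguments (an assumption of this sketch)
    clf = clf_class(**config.get_dictionary())
    start = time.time()
    try:
        clf.fit(X_tr, y)
    except Exception as e:
        logger.info("{} failed on the sample: {}".format(clf_name, e))
        return
    elapsed = int(time.time() - start) + 1
    if elapsed > max_clf_time.value:
        max_clf_time.value = elapsed
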
def test_default_configuration_multilabel(self):
    for i in range(2):
        classifier = SimpleClassificationPipeline(
            random_state=1, dataset_properties={'multilabel': True})
        cs = classifier.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris',
                                                       make_multilabel=True)
        classifier.set_hyperparameters(default)
        classifier = classifier.fit(X_train, Y_train)
        predictions = classifier.predict(X_test)
        self.assertAlmostEqual(
            0.96, sklearn.metrics.accuracy_score(predictions, Y_test))
        classifier.predict_proba(X_test)

def test_get_hyperparameter_search_space_include_exclude_models(self):
    cs = SimpleClassificationPipeline(include={'classifier': ['libsvm_svc']}) \
        .get_hyperparameter_search_space()
    self.assertEqual(
        cs.get_hyperparameter('classifier:__choice__'),
        CategoricalHyperparameter('classifier:__choice__', ['libsvm_svc']))

    cs = SimpleClassificationPipeline(exclude={'classifier': ['libsvm_svc']}) \
        .get_hyperparameter_search_space()
    self.assertNotIn('libsvm_svc', str(cs))

    cs = SimpleClassificationPipeline(
        include={'preprocessor': ['select_percentile_classification']}) \
        .get_hyperparameter_search_space()
    self.assertEqual(
        cs.get_hyperparameter('preprocessor:__choice__'),
        CategoricalHyperparameter('preprocessor:__choice__',
                                  ['select_percentile_classification']))

    cs = SimpleClassificationPipeline(
        exclude={'preprocessor': ['select_percentile_classification']}
    ).get_hyperparameter_search_space()
    self.assertNotIn('select_percentile_classification', str(cs))

def get_performance_of_range_encoded_models(data_set_idx,
                                            encoded_all_model_hyperparameters,
                                            json_model, verbose=False):
    """Get models performance (30 x 5) from an encoded model choice matrix (30 x 38)."""
    X = np.loadtxt('Data_Set/X_' + str(data_set_idx))
    y = np.loadtxt('Data_Set/y_' + str(data_set_idx))
    probas = np.loadtxt('Data_Set/probas_' + str(data_set_idx))

    models_performance = {}
    for i in range(len(encoded_all_model_hyperparameters)):
        encoded_model = encoded_all_model_hyperparameters[i]
        model = decode_model(encoded_model)
        if verbose:
            print('Original json model: ', json_model[str(i + 1)])
            print('Encoded model: ', encoded_model)
            print('Decoded model: ', model)
            print("==========================================================")

        train_accuracy_score = []
        test_accuracy_score = []
        train_log_loss = []
        test_log_loss = []

        time_start = time.time()
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42, shuffle=True)
        p = SimpleClassificationPipeline(config=model)
        p.fit(X_train, y_train)
        y_train_pred = p.predict(X_train)
        y_test_pred = p.predict(X_test)
        train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
        test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
        # Note: log loss is computed on hard label predictions here, not on
        # predicted probabilities.
        train_log_loss.append(log_loss(y_train, y_train_pred))
        test_log_loss.append(log_loss(y_test, y_test_pred))
        time_end = time.time()
        duration = time_end - time_start

        models_performance[i] = {
            "train_accuracy_score": np.mean(train_accuracy_score),
            "test_accuracy_score": np.mean(test_accuracy_score),
            "train_log_loss": np.mean(train_log_loss),
            "test_log_loss": np.mean(test_log_loss),
            "duration": duration
        }

    performance_json_filename = "./log/classifier_log" + str(data_set_idx) + \
        "/reproduce_models_performance" + str(data_set_idx) + ".json"
    with open(performance_json_filename, 'w') as fp:
        json.dump(models_performance, fp)
    return models_performance