def test_predict_proba_batched(self):
    cs = ParamSklearnClassifier.get_hyperparameter_search_space()
    default = cs.get_default_configuration()

    # Multiclass
    cls = ParamSklearnClassifier(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = ParamSklearnClassifier(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    Y_train = np.array([(y, 26 - y) for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertIsInstance(prediction, list)
    self.assertEqual(2, len(prediction))
    self.assertEqual((1647, 10), prediction[0].shape)
    self.assertEqual((1647, 10), prediction[1].shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)
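# Hedged sketch, not ParamSklearn's actual implementation: batched
# predict_proba presumably slices X into batch_size-row chunks and runs the
# pipeline on each chunk, so the mocked final estimator records one
# predict_proba call per batch. (The expected count of 84 for 1647 rows at
# batch_size=20 is one more than ceil(1647 / 20) = 83, which suggests the
# real code issues one extra call, e.g. to probe the output shape.) The
# helper name below is hypothetical.
import numpy as np

def predict_proba_in_batches(model, X, batch_size):
    chunks = []
    for start in range(0, X.shape[0], batch_size):
        # One model call per chunk of at most batch_size rows
        chunks.append(model.predict_proba(X[start:start + batch_size]))
    return np.vstack(chunks)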
def test_predict_proba_batched_sparse(self):
    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    config = Configuration(
        cs,
        values={"balancing:strategy": "none",
                "classifier:__choice__": "random_forest",
                "imputation:strategy": "mean",
                "one_hot_encoding:minimum_fraction": 0.01,
                "one_hot_encoding:use_minimum_fraction": 'True',
                "preprocessor:__choice__": "no_preprocessing",
                'classifier:random_forest:bootstrap': 'True',
                'classifier:random_forest:criterion': 'gini',
                'classifier:random_forest:max_depth': 'None',
                'classifier:random_forest:min_samples_split': 2,
                'classifier:random_forest:min_samples_leaf': 2,
                'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                'classifier:random_forest:max_features': 0.5,
                'classifier:random_forest:max_leaf_nodes': 'None',
                'classifier:random_forest:n_estimators': 100,
                "rescaling:__choice__": "min/max"})

    # Multiclass
    cls = ParamSklearnClassifier(config)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = ParamSklearnClassifier(config)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    Y_train = np.array([(y, 26 - y) for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertIsInstance(prediction, list)
    self.assertEqual(2, len(prediction))
    self.assertEqual((1647, 10), prediction[0].shape)
    self.assertEqual((1647, 10), prediction[1].shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)
def test_configurations_signed_data(self):
    # Cap the address space at ~4 GiB so runaway configurations fail fast
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties={'signed': True})
    print(cs)

    for i in range(10):
        config = cs.sample_configuration()
        config._populate_values()
        # Keep iterative classifiers cheap so the test finishes quickly
        if config['classifier:passive_aggressive:n_iter'] is not None:
            config._values['classifier:passive_aggressive:n_iter'] = 5
        if config['classifier:sgd:n_iter'] is not None:
            config._values['classifier:sgd:n_iter'] = 5

        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls = ParamSklearnClassifier(config, random_state=1)
        print(config)
        try:
            cls.fit(X_train, Y_train)
            X_test_ = X_test.copy()
            predictions = cls.predict(X_test)
            self.assertIsInstance(predictions, np.ndarray)
            predicted_probabilities = cls.predict_proba(X_test_)
            self.assertIsInstance(predicted_probabilities, np.ndarray)
        except ValueError as e:
            if ("Floating-point under-/overflow occurred at epoch" in
                    e.args[0] or
                    "removed all features" in e.args[0] or
                    "all features are discarded" in e.args[0]):
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except MemoryError:
            continue
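# Hypothetical refactoring sketch (not part of ParamSklearn): the except
# chains above repeat verbatim across several tests in this file, so the
# "known, ignorable failure" check could be centralized in a helper like
# this one.
def _is_ignorable_error(e):
    ignorable_messages = (
        "Floating-point under-/overflow occurred at epoch",
        "removed all features",
        "all features are discarded",
        "invalid value encountered in sqrt",
        "divide by zero encountered in",
        "invalid value encountered in divide",
        "invalid value encountered in true_divide",
        "FastICA did not converge",
    )
    return any(message in e.args[0] for message in ignorable_messages)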
def test_default_configuration(self):
    for i in range(2):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
        auto = ParamSklearnClassifier(default)
        auto = auto.fit(X_train, Y_train)
        predictions = auto.predict(X_test)
        self.assertAlmostEqual(
            0.9599999999999995,
            sklearn.metrics.accuracy_score(predictions, Y_test))
        scores = auto.predict_proba(X_test)
# Make predictions and weight them
for weight, configuration in zip(weights, configurations):
    # Configuration values arrive as strings; cast them back to int or
    # float where possible.
    for param in configuration:
        try:
            configuration[param] = int(configuration[param])
        except Exception:
            try:
                configuration[param] = float(configuration[param])
            except Exception:
                pass

    classifier = ParamSklearnClassifier(configuration, 1)
    classifiers.append(classifier)
    try:
        classifier.fit(X.copy(), y.copy())
        predictions_valid.append(
            classifier.predict_proba(X_valid.copy()) * weight)
        predictions_test.append(
            classifier.predict_proba(X_test.copy()) * weight)
    except Exception as e:
        print(e)
        print(configuration)

# Output the predictions
for name, predictions in [('valid', predictions_valid),
                          ('test', predictions_test)]:
    predictions = np.array(predictions)
    predictions = np.sum(predictions, axis=0)
    predictions = predictions[:, 1].reshape((-1, 1))

    filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
    np.savetxt(filepath, predictions, delimiter=' ')
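# Illustrative check of the assumed ensemble semantics: if the weights sum
# to one, summing the weighted predict_proba outputs yields a weighted
# average, and column 1 is the positive-class probability written to the
# .predict file. The arrays below are made-up toy values.
import numpy as np

p1 = np.array([[0.8, 0.2], [0.3, 0.7]])
p2 = np.array([[0.6, 0.4], [0.1, 0.9]])
ensemble = 0.75 * p1 + 0.25 * p2
print(ensemble[:, 1].reshape((-1, 1)))  # [[0.25], [0.75]]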
def test_multilabel(self):
    # Use a limit of ~4 GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    dataset_properties = {'multilabel': True}
    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties=dataset_properties)
    print(cs)
    cs.seed(5)

    for i in range(50):
        X, Y = sklearn.datasets.make_multilabel_classification(
            n_samples=150,
            n_features=20,
            n_classes=5,
            n_labels=2,
            length=50,
            allow_unlabeled=True,
            sparse=False,
            return_indicator=True,
            return_distributions=False,
            random_state=1)
        X_train = X[:100, :]
        Y_train = Y[:100, :]
        X_test = X[101:, :]
        Y_test = Y[101:, :]

        config = cs.sample_configuration()
        config._populate_values()
        if 'classifier:passive_aggressive:n_iter' in config:
            config._values['classifier:passive_aggressive:n_iter'] = 5
        if 'classifier:sgd:n_iter' in config:
            config._values['classifier:sgd:n_iter'] = 5

        cls = ParamSklearnClassifier(config, random_state=1)
        print(config)
        try:
            cls.fit(X_train, Y_train)
            X_test_ = X_test.copy()
            predictions = cls.predict(X_test)
            self.assertIsInstance(predictions, np.ndarray)
            predicted_probabilities = cls.predict_proba(X_test_)
            for proba in predicted_probabilities:
                self.assertIsInstance(proba, np.ndarray)
        except ValueError as e:
            if ("Floating-point under-/overflow occurred at epoch" in
                    e.args[0] or
                    "removed all features" in e.args[0] or
                    "all features are discarded" in e.args[0]):
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except MemoryError:
            continue
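# Hedged illustration of the multilabel predict_proba contract the test
# exercises: scikit-learn estimators fit on a 2-D indicator Y typically
# return a list with one (n_samples, 2) array per label, which is why the
# test iterates over the result instead of checking a single array.
from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, Y_demo = make_multilabel_classification(n_samples=100, n_classes=3,
                                                random_state=1)
rf = RandomForestClassifier(random_state=1).fit(X_demo, Y_demo)
proba = rf.predict_proba(X_demo)
print(type(proba), len(proba), proba[0].shape)
# typically: <class 'list'> 3 (100, 2)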