class AutoClassifier(Classifier):
    """Classifier backed by auto-sklearn, using k-fold cross-validation
    as the resampling strategy during model search."""

    def __init__(self, time_left_for_this_task, per_run_time_limit, folds):
        """Configure the underlying AutoSklearnClassifier.

        :param time_left_for_this_task: total search budget in seconds.
        :param per_run_time_limit: per-model fit budget in seconds.
        :param folds: number of cross-validation folds.
        """
        # FIX: removed an unused local (`now = strftime(...)`) and dead
        # commented-out folder/shared-mode parameters from the original.
        self.automl = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,
            per_run_time_limit=per_run_time_limit,
            resampling_strategy='cv',
            resampling_strategy_arguments={'folds': folds})

    def classify(self, X_train, y_train, X_test):
        """Fit on the training data and return predictions for X_test."""
        # fit() changes the data in place, but refit needs the original data.
        # We therefore copy the data. In practice, one should reload the data.
        self.automl.fit(X_train.copy(), y_train.copy())
        # During fit(), models are fit on individual cross-validation folds.
        # To use all available data, we call refit() which trains all models
        # in the final ensemble on the whole dataset.
        self.automl.refit(X_train.copy(), y_train.copy())
        return self.automl.predict(X_test)

    def show_models(self):
        """Return auto-sklearn's textual description of the final ensemble."""
        return self.automl.show_models()
def zeroconf_fit_ensemble(y):
    """Assemble an ensemble from models already fitted in shared mode.

    Reads per-run results from ``atsklrn_tempdir`` (shared_mode=True) and
    combines them; returns the fitted AutoSklearnClassifier.
    """
    p("Building ensemble")
    seed = 1

    # Collect the constructor configuration in one place for readability.
    builder_config = dict(
        time_left_for_this_task=300,
        per_run_time_limit=150,
        ml_memory_limit=20240,
        ensemble_size=50,
        ensemble_nbest=200,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        seed=seed,
    )
    clf = AutoSklearnClassifier(**builder_config)

    clf.fit_ensemble(
        task=BINARY_CLASSIFICATION,
        y=y,
        metric=F1_METRIC,
        precision='32',
        dataset_name='foobar',
        ensemble_size=10,
        ensemble_nbest=15)

    # Give the ensemble-builder process time to finish writing results.
    sleep(20)
    p("Ensemble built")
    p("Show models")
    p(str(clf.show_models()))
    return clf
def zeroconf_fit_ensemble(y, atsklrn_tempdir):
    """Build an auto-sklearn ensemble from shared-mode runs in *atsklrn_tempdir*.

    Logs progress via the project logger; re-raises any failure from
    ``fit_ensemble`` after logging it. Returns the fitted classifier.
    """
    log = utl.get_logger(inspect.stack()[0][3])
    log.info("Building ensemble")

    seed = 1
    clf = AutoSklearnClassifier(
        time_left_for_this_task=300,
        per_run_time_limit=150,
        ml_memory_limit=20240,
        ensemble_size=50,
        ensemble_nbest=200,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        seed=seed)
    log.info("Done AutoSklearnClassifier - seed:" + str(seed))

    try:
        log.debug("Start ensemble.fit_ensemble - seed:" + str(seed))
        clf.fit_ensemble(task=BINARY_CLASSIFICATION,
                         y=y,
                         metric=autosklearn.metrics.f1,
                         precision='32',
                         dataset_name='foobar',
                         ensemble_size=10,
                         ensemble_nbest=15)
    except Exception:
        # Re-acquire the logger (mirrors the surrounding code's convention)
        # and record the full traceback before propagating.
        log = utl.get_logger(inspect.stack()[0][3])
        log.exception("Error in ensemble.fit_ensemble - seed:" + str(seed))
        raise

    log = utl.get_logger(inspect.stack()[0][3])
    log.debug("Done ensemble.fit_ensemble - seed:" + str(seed))

    # Give the ensemble-builder time to finish writing its output.
    sleep(20)
    log.info("Ensemble built - seed:" + str(seed))

    log.info("Show models - seed:" + str(seed))
    for model_line in str(clf.show_models()).split("\n"):
        log.info(model_line)
    return clf
def main():
    """Run parallel auto-sklearn workers on breast_cancer, then build one ensemble.

    Spawns several worker processes that fit models into a shared folder,
    then combines their runs with ``fit_ensemble`` and reports accuracy.
    """
    X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

    processes = []
    spawn_classifier = get_spawn_classifier(X_train, y_train)
    for i in range(4):  # set this at roughly half of your cores
        p = multiprocessing.Process(
            target=spawn_classifier,
            args=(i, 'breast_cancer'),
        )
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )
    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        # NOTE(review): breast_cancer is a two-class dataset; verify whether
        # BINARY_CLASSIFICATION is the intended task constant here.
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        # BUG FIX: was 'digits' (copy-paste from the digits example); must
        # match the name the spawned workers used ('breast_cancer' above) so
        # their runs are found in the shared folders.
        dataset_name='breast_cancer',
        ensemble_size=20,
        ensemble_nbest=50,
    )
    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
def main():
    """Run parallel auto-sklearn workers on digits, then build one ensemble.

    Spawns worker processes that fit models into shared folders, combines
    their runs via ``fit_ensemble`` and prints the resulting accuracy.
    """
    X, y = sklearn.datasets.load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

    spawn_classifier = get_spawn_classifier(X_train, y_train)
    # set this at roughly half of your cores
    workers = [
        multiprocessing.Process(target=spawn_classifier, args=(i, 'digits'))
        for i in range(4)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )
    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='digits',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
def test_fit(self):
    """Smoke-test fit/score on iris and check the detected task type."""
    work_dir = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
    self._setUp(work_dir)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    clf = AutoSklearnClassifier(time_left_for_this_task=15,
                                per_run_time_limit=5,
                                tmp_folder=work_dir,
                                output_folder=work_dir)
    clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    print(clf.show_models())

    # A short budget should still clear 0.8 accuracy on iris.
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(clf._automl._automl._task, MULTICLASS_CLASSIFICATION)

    del clf
    self._tearDown(work_dir)
def test_fit(self):
    """Smoke-test fit/score on iris; skipped on travis-ci."""
    if self.travis:
        self.skipTest('This test does currently not run on travis-ci. '
                      'Make sure it runs locally on your machine!')

    tmp_dir = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
    self._setUp(tmp_dir)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    estimator = AutoSklearnClassifier(time_left_for_this_task=15,
                                      per_run_time_limit=15,
                                      tmp_folder=tmp_dir,
                                      output_folder=tmp_dir)
    estimator.fit(X_train, Y_train)
    score = estimator.score(X_test, Y_test)
    print(estimator.show_models())

    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(estimator._automl._automl._task,
                     MULTICLASS_CLASSIFICATION)

    del estimator
    self._tearDown(tmp_dir)
def test_fit(self):
    """Smoke-test fit/score on iris; skipped on travis-ci."""
    if self.travis:
        self.skipTest('This test does currently not run on travis-ci. '
                      'Make sure it runs locally on your machine!')

    scratch = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
    self._setUp(scratch)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    model = AutoSklearnClassifier(time_left_for_this_task=15,
                                  per_run_time_limit=15,
                                  tmp_folder=scratch,
                                  output_folder=scratch)
    model.fit(X_train, Y_train)
    score = model.score(X_test, Y_test)
    print(model.show_models())

    self.assertGreaterEqual(score, 0.8)
    # This variant reads the task directly off the estimator.
    self.assertEqual(model._task, MULTICLASS_CLASSIFICATION)

    del model
    self._tearDown(scratch)
initial_configurations_via_metalearning=0, seed=seed) c.fit_ensemble( task = BINARY_CLASSIFICATION ,y = y_train ,metric = F1_METRIC ,precision = '32' ,dataset_name = 'foobar' ,ensemble_size=10 ,ensemble_nbest=15) sleep(20) p("Ensemble built") p("Show models") print(c.show_models()) p("Predicting") y_hat = c.predict(X_test.values) print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat)) if df_unknown.shape[0]==0: p("nothing to predict. Prediction dataset is empty.") exit() p("Re-fitting on full known dataset. This can take long for a large set.") try: c.refit(X.values, y) except Exception as e: p("Refit failed, restarting") print(e) try:
# Combine models previously fitted into the shared folders into one ensemble.
# NOTE(review): shared_mode=True presumably makes this read runs written by
# separate worker processes into tmp_folder/output_folder — confirm against
# the code that spawns those workers.
print("Starting to build an ensemble!")
automl = AutoSklearnClassifier(
    time_left_for_this_task=15,
    per_run_time_limit=15,
    ml_memory_limit=1024,
    shared_mode=True,
    ensemble_size=50,
    ensemble_nbest=200,
    tmp_folder=tmp_folder,
    output_folder=output_folder,
    initial_configurations_via_metalearning=0,
    seed=1,
)
# Both the ensemble_size and ensemble_nbest parameters can be changed now if
# necessary
automl.fit_ensemble(
    y_train,
    task=MULTICLASS_CLASSIFICATION,
    metric=ACC_METRIC,
    precision="32",
    dataset_name="digits",
    ensemble_size=20,
    ensemble_nbest=50,
)
# Evaluate the combined ensemble on the held-out split.
predictions = automl.predict(X_test)
print(automl.show_models())
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
# Load the digits dataset and hold out 20% for testing.
print('[INFO] Loading digits dataset.')
X, y = load_digits(return_X_y=True)
print('[INFO] Splitting.')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,
                                                    train_size=0.8)
print(f'[INFO] Train shape: {X_train.shape}')
print(f'[INFO] Test shape: {X_test.shape}')

# Search budget: 2 hours total, 360 s and 6 GB per candidate model.
print('[INFO] Finding best model...')
classifier = AutoSklearnClassifier(per_run_time_limit=360,
                                   ml_memory_limit=1024 * 6,
                                   time_left_for_this_task=7200)
start = time.time()
# Cast features to float before fitting.
X_train = X_train.astype('float')
classifier.fit(X_train, y_train)
print(
    f'[INFO] Elapsed time finding best model: {time.time() - start} seconds.')

# Report per-class metrics, the selected models, and run statistics.
predictions = classifier.predict(X_test)
print('--- CLASSIFICATION REPORT: ---')
print(classification_report(y_test, predictions))
print('\n\n--- MODELS: ---')
print(classifier.show_models())
print('\n\n--- STATISTICS: ---')
print(classifier.sprint_statistics())
# Combine models previously fitted into the shared folders into one ensemble.
# NOTE(review): shared_mode=True presumably makes this read runs written by
# separate worker processes into tmp_folder/output_folder — confirm against
# the code that spawns those workers.
print('Starting to build an ensemble!')
automl = AutoSklearnClassifier(
    time_left_for_this_task=15,
    per_run_time_limit=15,
    ml_memory_limit=1024,
    shared_mode=True,
    ensemble_size=50,
    ensemble_nbest=200,
    tmp_folder=tmp_folder,
    output_folder=output_folder,
    initial_configurations_via_metalearning=0,
    seed=1,
)
# Both the ensemble_size and ensemble_nbest parameters can be changed now if
# necessary
automl.fit_ensemble(
    y_train,
    task=MULTICLASS_CLASSIFICATION,
    metric=accuracy,
    precision='32',
    dataset_name='digits',
    ensemble_size=20,
    ensemble_nbest=50,
)
# Evaluate the combined ensemble on the held-out split.
predictions = automl.predict(X_test)
print(automl.show_models())
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
class MLClassifier(GenericClassifier):
    """Classifier that fits auto-sklearn in parallel worker processes and then
    builds a single ensemble from their shared-mode runs."""

    def __init__(self, train, dataset_name, weight, num_processes=1):
        """Spawn workers, wait for them, then build the final ensemble.

        :param train: training set; must expose ``X`` and ``y``.
        :param dataset_name: name shared between workers and fit_ensemble.
        :param weight: used only to derive unique /tmp folder names.
        :param num_processes: number of parallel auto-sklearn workers.
        """
        super().__init__(train)
        # init shared tmp folders for parallel automl
        automl_tmp_folder = "/tmp/autosklearn_parallel_tmp_%.1f" % weight
        automl_output_folder = "/tmp/autosklearn_parallel_out_%.1f" % weight
        # FIX: loop variable renamed (was `dir`, shadowing the builtin) and
        # the unused bound exception removed.
        for folder in [automl_tmp_folder, automl_output_folder]:
            try:
                shutil.rmtree(folder)
            except OSError:
                pass  # best-effort cleanup; folder may not exist yet

        # parallel automl: each worker fits models into the shared folders
        processes = []
        spawn_classifier = MLClassifier.__get_spawn_classifier(
            train.X, train.y)
        for i in range(num_processes):
            p = multiprocessing.Process(
                target=spawn_classifier,
                args=(i, dataset_name, automl_tmp_folder,
                      automl_output_folder))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        # Build one ensemble over every run the workers produced.
        self.__cls = AutoSklearnClassifier(
            shared_mode=True,
            ensemble_size=50,
            ensemble_nbest=200,
            tmp_folder=automl_tmp_folder,
            output_folder=automl_output_folder,
            initial_configurations_via_metalearning=0,
            seed=1,
        )
        self.__cls.fit_ensemble(
            train.y,
            task=MULTICLASS_CLASSIFICATION,
            metric=accuracy,
            precision='32',
            dataset_name=dataset_name,
            ensemble_size=20,
            ensemble_nbest=50,
        )

    @property
    def name(self):
        return "MALAISE"

    @property
    def cls(self):
        return self.__cls

    def dump(self, pickle_file):
        """Print the ensemble description and pickle the classifier."""
        # FIX: show_models() returns a string; the original discarded it even
        # though the comment said "print models".
        print(self.__cls.show_models())
        # dump model to file
        with open(pickle_file, 'wb') as fio:
            pickle.dump(self.cls, fio)

    def predict(self, test):
        """Return predictions for ``test.X``."""
        return self.cls.predict(test.X)

    @staticmethod
    def __get_spawn_classifier(X_train, y_train):
        """Return a worker function that fits auto-sklearn in shared mode."""
        def spawn_classifier(seed, dataset_name, automl_tmp_folder,
                             automl_output_folder):
            # Seed 0 uses meta-learning warm starts; other seeds start from a
            # random incumbent so the workers explore different models.
            if seed == 0:
                initial_configurations_via_metalearning = 25
                smac_scenario_args = {}
            else:
                initial_configurations_via_metalearning = 0
                smac_scenario_args = {'initial_incumbent': 'RANDOM'}
            automl = AutoSklearnClassifier(
                shared_mode=True,  # tmp folder will be shared between seeds
                tmp_folder=automl_tmp_folder,
                output_folder=automl_output_folder,
                delete_tmp_folder_after_terminate=False,
                # ensembles will be built when all optimization runs finish
                ensemble_size=0,
                initial_configurations_via_metalearning=(
                    initial_configurations_via_metalearning),
                seed=seed,
                smac_scenario_args=smac_scenario_args,
            )
            automl.fit(X_train, y_train, dataset_name=dataset_name)
        return spawn_classifier
def evaluate_ml_algorithm(dataset, algo, run_id, obj_metric, time_limit=600,
                          seed=1, task_type=None):
    """Benchmark a single estimator family on *dataset* via auto-sklearn.

    Fits auto-sklearn restricted to one estimator (``include_estimators``),
    records the best validation score, refits on the full training data,
    scores balanced accuracy on the test split, and pickles the results.

    :param dataset: dataset identifier understood by load_train_test_data.
    :param algo: estimator name; 'lightgbm' / 'logistic_regression' register
        custom components first.
    :param run_id: identifier used in logging and the output file name.
    :param obj_metric: metric label used only for logging / file naming.
    :param time_limit: total auto-sklearn budget in seconds.
    :param seed: auto-sklearn random seed.
    :param task_type: optional task hint; refined to BINARY_CLS/MULTICLASS_CLS
        for classification tasks.
    """
    # Register custom components for algorithms auto-sklearn lacks natively.
    if algo == 'lightgbm':
        _algo = ['LightGBM']
        add_classifier(LightGBM)
    elif algo == 'logistic_regression':
        _algo = ['Logistic_Regression']
        add_classifier(Logistic_Regression)
    else:
        _algo = [algo]
    print('EVALUATE-%s-%s-%s: run_id=%d' % (dataset, algo, obj_metric, run_id))

    train_data, test_data = load_train_test_data(dataset, task_type=task_type)
    if task_type in CLS_TASKS:
        # Refine the task by counting distinct labels in the training targets.
        task_type = BINARY_CLS if len(set(
            train_data.data[1])) == 2 else MULTICLASS_CLS
        print(set(train_data.data[1]))

    # Reload with a fixed task type to get the raw feature representation.
    raw_data, test_raw_data = load_train_test_data(dataset,
                                                   task_type=MULTICLASS_CLS)
    X, y = raw_data.data
    X_test, y_test = test_raw_data.data
    # Map project feature types onto auto-sklearn's feat_type vocabulary.
    feat_type = [
        'Categorical' if _type == CATEGORICAL else 'Numerical'
        for _type in raw_data.feature_types
    ]

    from autosklearn.metrics import balanced_accuracy as balanced_acc
    # ensemble_size=1 so the result reflects the single best model of the
    # chosen algorithm family rather than an ensemble.
    automl = AutoSklearnClassifier(
        time_left_for_this_task=int(time_limit),
        per_run_time_limit=180,
        n_jobs=1,
        include_estimators=_algo,
        initial_configurations_via_metalearning=0,
        ensemble_memory_limit=16384,
        ml_memory_limit=16384,
        # tmp_folder='/var/folders/0t/mjph32q55hd10x3qr_kdd2vw0000gn/T/autosklearn_tmp',
        ensemble_size=1,
        seed=int(seed),
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67})
    # fit() may mutate its inputs, so pass copies.
    automl.fit(X.copy(), y.copy(), feat_type=feat_type, metric=balanced_acc)
    model_desc = automl.show_models()
    str_stats = automl.sprint_statistics()
    valid_results = automl.cv_results_['mean_test_score']
    print('Eval num: %d' % (len(valid_results)))
    validation_score = np.max(valid_results)

    # Test performance.
    automl.refit(X.copy(), y.copy())
    predictions = automl.predict(X_test)
    test_score = balanced_accuracy_score(y_test, predictions)

    # Print statistics about the auto-sklearn run such as number of
    # iterations, number of models failed with a time out.
    print(str_stats)
    print(model_desc)
    print('Validation Accuracy:', validation_score)
    print("Test Accuracy :", test_score)

    # Persist the scores for later aggregation.
    save_path = save_dir + '%s-%s-%s-%d-%d.pkl' % (dataset, algo, obj_metric,
                                                   run_id, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, algo, validation_score, test_score, task_type], f)
delete_output_folder_after_terminate=False, shared_mode=False) model.fit(data, target, metric='f1_metric', feat_type=None, dataset_name='numerai_20161021') try: report(model.grid_scores_) except: pass with open('result.txt', 'w') as f: f.write(model.show_models()) cv = StratifiedKFold(target, n_folds=3, shuffle=True, random_state=0) for train_idx, test_idx in list(cv)[:1]: model.refit(data.ix[train_idx, :], target[train_idx]) ans = model.predict_proba(data.ix[test_idx, :])[:, 1] score = roc_auc_score(target[test_idx], ans) print(' score: %s' % score) print(' model thresh: %s, score: %s' % mcc_optimize(ans, target[test_idx])) model.refit(data.ix, target) del data gc.collect() try:
# Split the frame into features (all columns but the last) and target (last).
data = dataframe.values
X, y = data[:, :-1], data[:, -1]
# minimally prepare dataset
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=1)
# define search: 10-minute total budget, 45 s per model, 6 parallel jobs
model = AutoSklearnClassifier(time_left_for_this_task=10*60,
                              per_run_time_limit=45,
                              n_jobs=6)
# perform the search
model.fit(X_train, y_train)
# summarize
print(model.sprint_statistics())
# get model and weights
model_weights = model.get_models_with_weights()
for model_weight in model_weights:
    print(model_weight)
print("Show models")
models_def = model.show_models()
print(models_def)
# evaluate best model
y_hat = model.predict(X_test)
acc = accuracy_score(y_test, y_hat)
print("Test Dataset Accuracy: %.3f" % acc)