# Script fragment: records a 0.0 score in `current_dynamic`, prints both score
# lists, then builds a fresh search space with SpaceGenerator and prints the
# names of the tree nodes whose status equals True.  It then runs one MyAutoML
# search (hold_out_fraction=0.33, evaluation budget = 10% of
# `search_time_frozen`) on the `*_hold` training data and scores the best
# pipeline on the `*_hold` test data.  Any exception raised by the search or
# scoring sets the score to 0.0; the score is appended to `current_static`.
# NOTE(review): line breaks and indentation were lost in this fragment, and the
# first statement appears to belong to a block that starts outside this view —
# restore the nesting from the original file before running it.
current_dynamic.append(0.0) print('dynamic: ' + str(current_dynamic)) print('static: ' + str(current_static)) gen_new = SpaceGenerator() space_new = gen_new.generate_params() for pre, _, node in RenderTree(space_new.parameter_tree): if node.status == True: print("%s%s" % (pre, node.name)) try: search = MyAutoML(n_jobs=1, time_search_budget=search_time_frozen, space=space_new, evaluation_budget=int(0.1 * search_time_frozen), main_memory_budget_gb=memory_budget, pipeline_size_limit=pipeline_size, hold_out_fraction=0.33) best_result = search.fit( X_train_hold, y_train_hold, categorical_indicator=categorical_indicator_hold, scorer=my_scorer) test_score = my_scorer(search.get_best_pipeline(), X_test_hold, y_test_hold) except: test_score = 0.0 current_static.append(test_score)
def run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None, categorical_indicator=None):
    """Run the AutoML search five times for one trial and report its success rate.

    A trial without a ``'space'`` user attribute gets a freshly sampled search
    space (stored back on the trial as a deep copy) and its constraints from
    ``generate_parameters``.  Otherwise the stored space is reused and the
    constraints are read from ``trial.params``, falling back to fixed defaults
    for the optional ones.

    Args:
        trial: Trial object providing ``user_attrs``, ``params`` and
            ``set_user_attr``.
        X_train, X_test, y_train, y_test, categorical_indicator: Optional data
            split.  When ``X_train`` is None, all of them are loaded with
            ``get_data`` for the trial's dataset id.

    Returns:
        tuple: ``(success_rate, search)`` where ``success_rate`` is the
        fraction of the five runs whose test score is greater than 0.0 and
        ``search`` is the ``MyAutoML`` instance of the last run.
    """
    n_repetitions = 5

    if 'space' not in trial.user_attrs:
        # which hyperparameters to use
        gen = SpaceGenerator()
        space = gen.generate_params()
        space.sample_parameters(trial)
        trial.set_user_attr('space', copy.deepcopy(space))

        (search_time, evaluation_time, memory_limit, privacy_limit,
         training_time_limit, inference_time_limit, pipeline_size_limit,
         cv, number_of_cvs, hold_out_fraction, sample_fraction,
         dataset_id) = generate_parameters(trial, total_search_time, my_openml_datasets)
    else:
        space = trial.user_attrs['space']
        print(trial.params)
        params = trial.params

        # make this a hyperparameter
        search_time = params['global_search_time_constraint']

        # Optional constraints fall back to fixed defaults when not sampled.
        evaluation_time = params.get('global_evaluation_time_constraint', search_time)
        memory_limit = params.get('global_memory_constraint', 10)
        privacy_limit = params.get('privacy_constraint')
        training_time_limit = params.get('training_time_constraint', search_time)
        inference_time_limit = params.get('inference_time_constraint', 60)
        pipeline_size_limit = params.get('pipeline_size_constraint', 350000000)

        # NOTE(review): nesting reconstructed from whitespace-collapsed source
        # (cross-validation when 'global_cv' was sampled, hold-out otherwise)
        # -- confirm against the original file.
        cv = 1
        number_of_cvs = 1
        hold_out_fraction = None
        if 'global_cv' in params:
            cv = params['global_cv']
            number_of_cvs = params.get('global_number_cv', 1)
        else:
            hold_out_fraction = params['hold_out_fraction']

        sample_fraction = params.get('sample_fraction', 1.0)

        if 'dataset_id' in params:
            dataset_id = params['dataset_id']
        else:
            dataset_id = trial.user_attrs['dataset_id']

    # Print the names of the tree nodes whose status compares equal to True.
    for pre, _, node in RenderTree(space.parameter_tree):
        if node.status == True:
            print("%s%s" % (pre, node.name))

    if X_train is None:
        # Seed from the wall clock unless the trial pins a data seed.
        my_random_seed = int(time.time())
        if 'data_random_seed' in trial.user_attrs:
            my_random_seed = trial.user_attrs['data_random_seed']

        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(
            dataset_id, randomstate=my_random_seed)

    if not isinstance(trial, FrozenTrial):
        # Store the feature representation of this configuration on the trial.
        my_list_constraints_values = [search_time,
                                      evaluation_time,
                                      memory_limit,
                                      cv,
                                      number_of_cvs,
                                      ifNull(privacy_limit, constant_value=1000),
                                      ifNull(hold_out_fraction),
                                      sample_fraction,
                                      training_time_limit,
                                      inference_time_limit,
                                      pipeline_size_limit]

        metafeature_values = data2features(X_train, y_train, categorical_indicator)
        features = space2features(space, my_list_constraints_values, metafeature_values)
        features = FeatureTransformations().fit(features).transform(features, feature_names=feature_names)
        trial.set_user_attr('features', features)

    dynamic_params = []
    for _ in range(n_repetitions):
        search = MyAutoML(cv=cv,
                          number_of_cvs=number_of_cvs,
                          n_jobs=1,
                          evaluation_budget=evaluation_time,
                          time_search_budget=search_time,
                          space=space,
                          main_memory_budget_gb=memory_limit,
                          differential_privacy_epsilon=privacy_limit,
                          hold_out_fraction=hold_out_fraction,
                          sample_fraction=sample_fraction,
                          training_time_limit=training_time_limit,
                          inference_time_limit=inference_time_limit,
                          pipeline_size_limit=pipeline_size_limit)

        test_score = 0.0
        try:
            search.fit(X_train, y_train, categorical_indicator=categorical_indicator, scorer=my_scorer)
            best_pipeline = search.get_best_pipeline()
            if best_pipeline is not None:
                test_score = my_scorer(search.get_best_pipeline(), X_test, y_test)
        except Exception:
            # Best effort: a failed run counts as score 0.0.  Only Exception is
            # caught so KeyboardInterrupt / SystemExit still stop the experiment.
            pass
        dynamic_params.append(test_score)

    count_success = sum(1 for score in dynamic_params if score > 0.0)
    success_rate = float(count_success) / float(len(dynamic_params))
    return success_rate, search
# Dump every node of the search-space tree together with its status.
for pre, _, node in RenderTree(space.parameter_tree):
    print("%s%s: %s" % (pre, node.name, node.status))

# One study object is handed from each search to the next one.
my_study = optuna.create_study(direction='maximize')

validation_scores = []
test_scores = []

# add Caruana ensemble with replacement
# save pipelines to disk

for i in range(1, 10):
    search = MyAutoML(
        cv=10,
        number_of_cvs=1,
        n_jobs=2,
        time_search_budget=2 * 60,
        space=space,
        study=my_study,
        main_memory_budget_gb=4,
    )

    best_result = search.fit(
        X_train,
        y_train,
        categorical_indicator=categorical_indicator,
        scorer=auc,
    )

    # Carry the (updated) study over into the next round.
    my_study = search.study

    test_score = auc(search.get_best_pipeline(), X_test, y_test)

    print("".join(["budget: ", str(i), ' => ', str(best_result), " test: ", str(test_score)]))

    validation_scores.append(best_result)
def run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None, categorical_indicator=None):
    """Evaluate one AutoML configuration and return its test score.

    If the trial already carries a ``'space'`` user attribute, that space and
    the constraints in ``trial.params`` are reused.  Otherwise a new space is
    sampled and stored on the trial (as a deep copy), and the constraints are
    drawn through the trial's ``suggest_*`` methods.

    Args:
        trial: Trial object providing ``user_attrs``, ``params``,
            ``set_user_attr`` and the ``suggest_*`` methods.
        X_train, X_test, y_train, y_test, categorical_indicator: Optional data
            split.  When ``X_train`` is None, all of them are loaded with
            ``get_data`` using a wall-clock based random state.

    Returns:
        The ``auc`` score of the best pipeline on the test split, or 0.0 when
        the search produced no pipeline.
    """
    if 'space' in trial.user_attrs:
        space = trial.user_attrs['space']
        print(trial.params)

        # make this a hyperparameter
        search_time = trial.params['global_search_time_constraint']
        evaluation_time = trial.params['global_evaluation_time_constraint']
        memory_limit = trial.params['global_memory_constraint']
        cv = trial.params['global_cv']
        number_of_cvs = trial.params['global_number_cv']

        # get same random seed
        dataset_id = trial.params['dataset_id'] if 'dataset_id' in trial.params else 31
    else:
        # which hyperparameters to use
        generator = SpaceGenerator()
        space = generator.generate_params()
        space.sample_parameters(trial)
        trial.set_user_attr('space', copy.deepcopy(space))

        # which constraints to use
        search_time = trial.suggest_int(
            'global_search_time_constraint', 10, total_search_time, log=False)

        # how much time for each evaluation
        evaluation_time = trial.suggest_int(
            'global_evaluation_time_constraint', 10, search_time, log=False)

        # how much memory is allowed
        memory_limit = trial.suggest_uniform('global_memory_constraint', 1.5, 4)

        # how many cvs should be used
        # todo: calculate minimum number of splits based on y
        cv = trial.suggest_int('global_cv', 2, 20, log=False)
        number_of_cvs = trial.suggest_int('global_number_cv', 1, 10, log=False)

        dataset_id = trial.suggest_categorical('dataset_id', my_openml_datasets)

    # Print every node of the search-space tree together with its status.
    for prefix, _, tree_node in RenderTree(space.parameter_tree):
        print("%s%s: %s" % (prefix, tree_node.name, tree_node.status))

    # which dataset to use
    # todo: add more datasets
    if X_train is None:
        seed = int(time.time())
        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(
            dataset_id, randomstate=seed)

    if not isinstance(trial, FrozenTrial):
        # Store the feature representation of this configuration on the trial.
        constraint_values = [search_time, evaluation_time, memory_limit, cv, number_of_cvs]
        metafeature_values = data2features(X_train, y_train, categorical_indicator)
        trial.set_user_attr(
            'features',
            space2features(space, constraint_values, metafeature_values))

    search = MyAutoML(
        cv=cv,
        number_of_cvs=number_of_cvs,
        n_jobs=1,
        evaluation_budget=evaluation_time,
        time_search_budget=search_time,
        space=space,
        main_memory_budget_gb=memory_limit,
    )
    search.fit(X_train, y_train, categorical_indicator=categorical_indicator, scorer=auc)

    # No pipeline found within the budget -> score 0.0.
    if search.get_best_pipeline() is None:
        return 0.0
    return auc(search.get_best_pipeline(), X_test, y_test)
def utils_run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None, categorical_indicator=None, my_scorer=None, search_time=None, memory_limit=None, privacy_limit=None, training_time_limit=None, inference_time_limit=None, pipeline_size_limit=None):
    """Run one MyAutoML search for a trial whose search space is already stored.

    The search space is read from ``trial.user_attrs['space']``.  Evaluation
    time, cross-validation settings, hold-out fraction and sample fraction are
    resolved from ``trial.user_attrs`` / ``trial.params``; the remaining
    limits are passed through unchanged from the arguments.

    Args:
        trial: Trial object providing ``user_attrs`` and ``params``.
        X_train, y_train: Data the search is fitted on.
        X_test, y_test: Data the best pipeline is scored on.
        categorical_indicator: Forwarded to ``MyAutoML.fit``.
        my_scorer: Scorer used for the search and for the test score.
        search_time, memory_limit, privacy_limit, training_time_limit,
        inference_time_limit, pipeline_size_limit: Forwarded to ``MyAutoML``.

    Returns:
        tuple: ``(test_score, search)``; ``test_score`` is 0.0 when the search
        produced no pipeline.
    """
    space = trial.user_attrs['space']
    print(trial.params)

    # NOTE(review): the nesting of the two fallback chains below was
    # reconstructed from whitespace-collapsed source -- confirm that values in
    # trial.user_attrs are meant to take precedence over trial.params.
    if 'evaluation_time' in trial.user_attrs:
        evaluation_time = trial.user_attrs['evaluation_time']
    elif 'global_evaluation_time_constraint' in trial.params:
        evaluation_time = trial.params['global_evaluation_time_constraint']
    else:
        evaluation_time = search_time

    cv = 1
    number_of_cvs = 1
    if 'hold_out_fraction' in trial.user_attrs:
        hold_out_fraction = trial.user_attrs['hold_out_fraction']
    else:
        if 'global_cv' in trial.params:
            cv = trial.params['global_cv']
            if 'global_number_cv' in trial.params:
                number_of_cvs = trial.params['global_number_cv']
        # None when no hold-out fraction was sampled.
        hold_out_fraction = trial.params.get('hold_out_fraction')

    sample_fraction = trial.params.get('sample_fraction', 1.0)

    search = MyAutoML(
        cv=cv,
        number_of_cvs=number_of_cvs,
        n_jobs=1,
        evaluation_budget=evaluation_time,
        time_search_budget=search_time,
        space=space,
        main_memory_budget_gb=memory_limit,
        differential_privacy_epsilon=privacy_limit,
        hold_out_fraction=hold_out_fraction,
        sample_fraction=sample_fraction,
        training_time_limit=training_time_limit,
        inference_time_limit=inference_time_limit,
        pipeline_size_limit=pipeline_size_limit,
    )
    search.fit(X_train, y_train, categorical_indicator=categorical_indicator, scorer=my_scorer)

    # No pipeline found within the budget -> score 0.0.
    if search.get_best_pipeline() is None:
        return 0.0, search
    return my_scorer(search.get_best_pipeline(), X_test, y_test), search
def run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None, categorical_indicator=None):
    """Evaluate one AutoML configuration and return its test score and search.

    If the trial already carries a ``'space'`` user attribute, that space is
    reused with fixed budgets (``total_search_time``, 4 GB memory, no privacy
    limit) and the validation settings found in ``trial.params``.  Otherwise a
    new space is sampled, stored on the trial as a deep copy, and the
    constraints come from ``generate_parameters``.

    Args:
        trial: Trial object providing ``user_attrs``, ``params`` and
            ``set_user_attr``.
        X_train, X_test, y_train, y_test, categorical_indicator: Optional data
            split.  When ``X_train`` is None, all of them are loaded with
            ``get_data`` for the trial's dataset id.

    Returns:
        tuple: ``(test_score, search)``; ``test_score`` is 0.0 when the search
        produced no pipeline.
    """
    if 'space' in trial.user_attrs:
        space = trial.user_attrs['space']
        print(trial.params)

        # make this a hyperparameter
        search_time = total_search_time
        evaluation_time = search_time
        memory_limit = 4
        privacy_limit = None

        # NOTE(review): nesting reconstructed from whitespace-collapsed source
        # (cross-validation when 'global_cv' was sampled, hold-out otherwise)
        # -- confirm against the original file.
        cv = 1
        number_of_cvs = 1
        hold_out_fraction = None
        if 'global_cv' in trial.params:
            cv = trial.params['global_cv']
            if 'global_number_cv' in trial.params:
                number_of_cvs = trial.params['global_number_cv']
        else:
            hold_out_fraction = trial.params['hold_out_fraction']

        sample_fraction = trial.params.get('sample_fraction', 1.0)

        # get same random seed
        dataset_id = trial.params['dataset_id'] if 'dataset_id' in trial.params else 31
    else:
        # which hyperparameters to use
        generator = SpaceGenerator()
        space = generator.generate_params()
        space.sample_parameters(trial)
        trial.set_user_attr('space', copy.deepcopy(space))

        (search_time, evaluation_time, memory_limit, privacy_limit,
         cv, number_of_cvs, hold_out_fraction, sample_fraction,
         dataset_id) = generate_parameters(trial)

    # Print the names of the tree nodes whose status compares equal to True.
    for prefix, _, tree_node in RenderTree(space.parameter_tree):
        if tree_node.status == True:
            print("%s%s" % (prefix, tree_node.name))

    if X_train is None:
        # Use the seed pinned on the trial if there is one, else the wall clock.
        if 'data_random_seed' in trial.user_attrs:
            seed = trial.user_attrs['data_random_seed']
        else:
            seed = int(time.time())

        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(
            dataset_id, randomstate=seed)

    if not isinstance(trial, FrozenTrial):
        # Store the feature representation of this configuration on the trial.
        constraint_values = [
            search_time,
            evaluation_time,
            memory_limit,
            cv,
            number_of_cvs,
            ifNull(privacy_limit, constant_value=1000),
            ifNull(hold_out_fraction),
            sample_fraction,
        ]
        metafeature_values = data2features(X_train, y_train, categorical_indicator)
        raw_features = space2features(space, constraint_values, metafeature_values)
        features = FeatureTransformations().fit(raw_features).transform(
            raw_features, feature_names=feature_names)
        trial.set_user_attr('features', features)

    search = MyAutoML(
        cv=cv,
        number_of_cvs=number_of_cvs,
        n_jobs=1,
        evaluation_budget=evaluation_time,
        time_search_budget=search_time,
        space=space,
        main_memory_budget_gb=memory_limit,
        differential_privacy_epsilon=privacy_limit,
        hold_out_fraction=hold_out_fraction,
        sample_fraction=sample_fraction,
    )
    search.fit(X_train, y_train, categorical_indicator=categorical_indicator, scorer=my_scorer)

    # No pipeline found within the budget -> score 0.0.
    if search.get_best_pipeline() is None:
        return 0.0, search
    return my_scorer(search.get_best_pipeline(), X_test, y_test), search