def optimize_uncertainty(trial, dataset_id):
    """Optuna objective: sample a search-space configuration for *dataset_id*
    and return the surrogate model's prediction uncertainty for it.

    Uncertainty is the standard deviation of the per-tree predictions of the
    global forest model (``mp_glob.ml_model``).

    Returns:
        float: std-dev across trees for the sampled configuration, or
        ``-np.inf`` on any failure so the trial ranks last instead of
        aborting the study.
    """
    dataset_id = str(dataset_id)
    try:
        # Sample which hyperparameters to use for this trial.
        gen = SpaceGenerator()
        space = gen.generate_params()
        space.sample_parameters(trial)
        trial.set_user_attr('space', copy.deepcopy(space))

        (search_time, evaluation_time, memory_limit, privacy_limit,
         training_time_limit, inference_time_limit, pipeline_size_limit,
         cv, number_of_cvs, hold_out_fraction, sample_fraction,
         _) = generate_parameters(trial, total_search_time, my_openml_datasets)

        # Time-based seed: each call draws a different train/test split.
        my_random_seed = int(time.time())
        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(
            dataset_id, randomstate=my_random_seed)
        trial.set_user_attr('data_random_seed', my_random_seed)
        trial.set_user_attr('dataset_id', dataset_id)

        # add metafeatures of data
        my_list_constraints_values = [search_time, evaluation_time, memory_limit,
                                      cv, number_of_cvs,
                                      ifNull(privacy_limit, constant_value=1000),
                                      ifNull(hold_out_fraction), sample_fraction,
                                      training_time_limit, inference_time_limit,
                                      pipeline_size_limit]
        metafeature_values = data2features(X_train, y_train, categorical_indicator)
        features = space2features(space, my_list_constraints_values, metafeature_values)
        features = FeatureTransformations().fit(features).transform(
            features, feature_names=feature_names)
        trial.set_user_attr('features', features)

        model = mp_glob.ml_model
        trial.set_user_attr('predicted_target', model.predict(features))

        # One prediction per tree; their spread is the ensemble uncertainty.
        predictions = [predict_range(model.estimators_[tree], features)
                       for tree in range(model.n_estimators)]
        # np.matrix is deprecated; std over axis 0 of a plain ndarray yields
        # the same per-sample std-dev across trees.
        stddev_pred = np.std(np.array(predictions), axis=0)
        return stddev_pred[0]
    except Exception as e:
        print(str(e) + 'except dataset _ uncertainty: ' + str(dataset_id) + '\n\n')
        return -np.inf
def optimize_uncertainty(trial):
    """Optuna objective: suggest constraints plus active-learning
    hyper-parameters and return the prediction uncertainty (per-tree
    std-dev) of the global surrogate ``model`` for the resulting features.

    Returns:
        float: std-dev of the per-tree predictions, or ``-np.inf`` on any
        failure so the trial ranks last.
    """
    # BUGFIX: bind dataset_id before the try-block. If generate_parameters
    # raises, the except-handler below references dataset_id; previously
    # that was an unbound name and the handler itself crashed with a
    # NameError, hiding the original error.
    dataset_id = None
    try:
        (search_time, _, memory_limit, privacy_limit, training_time_limit,
         inference_time_limit, pipeline_size_limit, _, _, _, _,
         dataset_id) = generate_parameters(trial, total_search_time,
                                           my_openml_datasets)

        # Optional weight for the model-comparison term, log-scale sample.
        model_weight = 0
        if trial.suggest_categorical('use_model_weight', [True, False]):
            model_weight = trial.suggest_loguniform('model_weight', 0.0000001, 1000)
        number_trials = trial.suggest_int('number_trials', 10, 500, log=False)

        # Time-based seed: each call draws a different train/test split.
        my_random_seed = int(time.time())
        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(
            dataset_id, randomstate=my_random_seed)
        trial.set_user_attr('data_random_seed', my_random_seed)

        # add metafeatures of data
        my_list_constraints_values = [
            search_time, memory_limit,
            ifNull(privacy_limit, constant_value=1000), training_time_limit,
            inference_time_limit, pipeline_size_limit, model_weight,
            number_trials
        ]
        metafeature_values = data2features(X_train, y_train, categorical_indicator)
        features = merge_features(my_list_constraints_values, metafeature_values)
        features = FeatureTransformations().fit(features).transform(
            features, feature_names=feature_names)
        trial.set_user_attr('features', features)
        trial.set_user_attr('predicted_target', model.predict(features))

        # One prediction per tree; their spread is the ensemble uncertainty.
        predictions = []
        for tree in range(model.n_estimators):
            predictions.append(predict_range(model.estimators_[tree], features))
        stddev_pred = np.std(np.matrix(predictions).transpose(), axis=1)
        return stddev_pred[0]
    except Exception as e:
        print(
            str(e) + 'except dataset _ uncertainty: ' + str(dataset_id) + '\n\n')
        return -np.inf
# Names of the constraint features fed to get_feature_names / the surrogate.
my_list_constraints = [
    'global_search_time_constraint', 'global_evaluation_time_constraint',
    'global_memory_constraint', 'global_cv', 'global_number_cv', 'privacy',
    'hold_out_fraction', 'sample_fraction', 'training_time_constraint',
    'inference_time_constraint', 'pipeline_size_constraint'
]
_, feature_names = get_feature_names(my_list_constraints)

results_dict = {}
for test_holdout_dataset_id in test_holdout_dataset_ids:
    # Fixed seed 42: the hold-out split is identical across experiment runs.
    X_train_hold, X_test_hold, y_train_hold, y_test_hold, categorical_indicator_hold, attribute_names_hold = get_data(
        test_holdout_dataset_id, randomstate=42)
    metafeature_values_hold = data2features(X_train_hold, y_train_hold,
                                            categorical_indicator_hold)

    #plot_most_important_features(model, feature_names, k=len(feature_names))

    dynamic_approach = []
    static_approach = []

    minutes_to_search = 5
    memory_budget = 8

    # Sweep over pipeline-size budgets.
    # NOTE(review): the loop body appears to continue past this chunk.
    for pipeline_size in [2583, 2971, 3008, 3724, 5621, 26540, 65529]:  #bytes
        current_dynamic = []
        current_static = []

        search_time_frozen = minutes_to_search * 60
def run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None,
               categorical_indicator=None):
    """Run the AutoML search 5 times for an Optuna trial and report how
    often a working pipeline was found.

    For a live trial the configuration is sampled; for a replayed trial it
    is reconstructed from ``trial.params`` / ``trial.user_attrs``. Data is
    downloaded via ``get_data`` when ``X_train`` is None.

    Returns:
        tuple: (success_rate, search) — fraction of the 5 repetitions whose
        test score was > 0, and the last ``MyAutoML`` instance.
    """
    space = None
    search_time = None
    if 'space' not in trial.user_attrs:
        # Live trial: sample which hyperparameters to use.
        gen = SpaceGenerator()
        space = gen.generate_params()
        space.sample_parameters(trial)
        trial.set_user_attr('space', copy.deepcopy(space))

        (search_time, evaluation_time, memory_limit, privacy_limit,
         training_time_limit, inference_time_limit, pipeline_size_limit,
         cv, number_of_cvs, hold_out_fraction, sample_fraction,
         dataset_id) = generate_parameters(trial, total_search_time,
                                           my_openml_datasets)
    else:
        # Replayed trial: rebuild every constraint from trial.params,
        # falling back to the defaults used during sampling.
        space = trial.user_attrs['space']
        print(trial.params)

        #make this a hyperparameter
        search_time = trial.params['global_search_time_constraint']

        evaluation_time = search_time
        if 'global_evaluation_time_constraint' in trial.params:
            evaluation_time = trial.params['global_evaluation_time_constraint']

        memory_limit = 10
        if 'global_memory_constraint' in trial.params:
            memory_limit = trial.params['global_memory_constraint']

        privacy_limit = None
        if 'privacy_constraint' in trial.params:
            privacy_limit = trial.params['privacy_constraint']

        training_time_limit = search_time
        if 'training_time_constraint' in trial.params:
            training_time_limit = trial.params['training_time_constraint']

        inference_time_limit = 60
        if 'inference_time_constraint' in trial.params:
            inference_time_limit = trial.params['inference_time_constraint']

        pipeline_size_limit = 350000000
        if 'pipeline_size_constraint' in trial.params:
            pipeline_size_limit = trial.params['pipeline_size_constraint']

        # CV and hold-out are mutually exclusive.
        cv = 1
        number_of_cvs = 1
        hold_out_fraction = None
        if 'global_cv' in trial.params:
            cv = trial.params['global_cv']
            if 'global_number_cv' in trial.params:
                number_of_cvs = trial.params['global_number_cv']
        else:
            hold_out_fraction = trial.params['hold_out_fraction']

        sample_fraction = 1.0
        if 'sample_fraction' in trial.params:
            sample_fraction = trial.params['sample_fraction']

        if 'dataset_id' in trial.params:
            dataset_id = trial.params['dataset_id']
        else:
            dataset_id = trial.user_attrs['dataset_id']

    # Print the active nodes of the search space.
    for pre, _, node in RenderTree(space.parameter_tree):
        if node.status:
            print("%s%s" % (pre, node.name))

    if X_train is None:
        # Reuse the trial's recorded seed when replaying, else draw a new one.
        my_random_seed = int(time.time())
        if 'data_random_seed' in trial.user_attrs:
            my_random_seed = trial.user_attrs['data_random_seed']
        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(
            dataset_id, randomstate=my_random_seed)

    if not isinstance(trial, FrozenTrial):
        # Record the feature vector so the surrogate can be trained later.
        my_list_constraints_values = [search_time, evaluation_time, memory_limit,
                                      cv, number_of_cvs,
                                      ifNull(privacy_limit, constant_value=1000),
                                      ifNull(hold_out_fraction), sample_fraction,
                                      training_time_limit, inference_time_limit,
                                      pipeline_size_limit]
        metafeature_values = data2features(X_train, y_train, categorical_indicator)
        features = space2features(space, my_list_constraints_values, metafeature_values)
        features = FeatureTransformations().fit(features).transform(
            features, feature_names=feature_names)
        trial.set_user_attr('features', features)

    dynamic_params = []
    for random_i in range(5):  # 5 repetitions to smooth out search randomness
        search = MyAutoML(cv=cv,
                          number_of_cvs=number_of_cvs,
                          n_jobs=1,
                          evaluation_budget=evaluation_time,
                          time_search_budget=search_time,
                          space=space,
                          main_memory_budget_gb=memory_limit,
                          differential_privacy_epsilon=privacy_limit,
                          hold_out_fraction=hold_out_fraction,
                          sample_fraction=sample_fraction,
                          training_time_limit=training_time_limit,
                          inference_time_limit=inference_time_limit,
                          pipeline_size_limit=pipeline_size_limit)

        test_score = 0.0
        try:
            search.fit(X_train, y_train,
                       categorical_indicator=categorical_indicator,
                       scorer=my_scorer)
            best_pipeline = search.get_best_pipeline()
            if best_pipeline is not None:
                test_score = my_scorer(search.get_best_pipeline(), X_test, y_test)
        except Exception:
            # Best-effort: a failed repetition simply counts as score 0.
            # (Narrowed from a bare except so Ctrl-C still interrupts.)
            pass
        dynamic_params.append(test_score)

    # Fraction of repetitions that produced any working pipeline.
    count_success = sum(1 for score in dynamic_params if score > 0.0)
    success_rate = float(count_success) / float(len(dynamic_params))
    return success_rate, search
# NOTE(review): tail of the my_openml_datasets list opened above this chunk.
    40669, 40680, 40681, 40690, 40693, 40701, 40705, 40706, 40710, 40713,
    40714, 40900, 40910, 40922, 40999, 41005, 41007, 41138, 41142, 41144,
    41145, 41146, 41147, 41150, 41156, 41158, 41159, 41160, 41161, 41162,
    41228, 41430, 41521, 41538, 41976, 42172, 42477
]
my_openml_datasets = np.array(my_openml_datasets)

# Cache per-dataset meta-features: computing them requires downloading every
# dataset, so reuse /tmp/feature_matrix.p when it already exists.
if not os.path.isfile("/tmp/feature_matrix.p"):
    metadata_matrix = []
    for dataset_id in my_openml_datasets:
        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(
            str(dataset_id), randomstate=42)
        metadata_matrix.append(
            data2features(X_train, y_train, categorical_indicator)[0])
    pickle.dump(metadata_matrix, open("/tmp/feature_matrix.p", "wb"))
else:
    metadata_matrix = pickle.load(open("/tmp/feature_matrix.p", "rb"))

metadata_matrix = StandardScaler().fit_transform(X=np.array(metadata_matrix))
print(metadata_matrix.shape)

# Cluster the datasets by their (standardized) meta-features.
NUM_COLORS = 30
kmeans = KMeans(n_clusters=NUM_COLORS, random_state=0).fit(X=metadata_matrix)
labels = kmeans.predict(metadata_matrix)
print(labels)
def run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None,
               categorical_indicator=None):
    """Run one AutoML search for a (live or replayed) Optuna trial.

    For a live trial the configuration is sampled; for a replayed trial it
    is reconstructed from ``trial.params`` with fixed defaults. Data is
    downloaded via ``get_data`` when ``X_train`` is None.

    Returns:
        tuple: (test_score, search) — hold-out test score of the best
        pipeline (0.0 when none was found) and the ``MyAutoML`` instance.
    """
    space = None
    search_time = None
    if 'space' not in trial.user_attrs:
        # Live trial: sample which hyperparameters to use.
        gen = SpaceGenerator()
        space = gen.generate_params()
        space.sample_parameters(trial)
        trial.set_user_attr('space', copy.deepcopy(space))

        (search_time, evaluation_time, memory_limit, privacy_limit, cv,
         number_of_cvs, hold_out_fraction, sample_fraction,
         dataset_id) = generate_parameters(trial)
    else:
        # Replayed trial: fixed budgets, remaining values from trial.params.
        space = trial.user_attrs['space']
        print(trial.params)

        #make this a hyperparameter
        search_time = total_search_time
        evaluation_time = search_time
        memory_limit = 4
        privacy_limit = None

        # CV and hold-out are mutually exclusive.
        cv = 1
        number_of_cvs = 1
        hold_out_fraction = None
        if 'global_cv' in trial.params:
            cv = trial.params['global_cv']
            if 'global_number_cv' in trial.params:
                number_of_cvs = trial.params['global_number_cv']
        else:
            hold_out_fraction = trial.params['hold_out_fraction']

        sample_fraction = 1.0
        if 'sample_fraction' in trial.params:
            sample_fraction = trial.params['sample_fraction']

        if 'dataset_id' in trial.params:
            dataset_id = trial.params['dataset_id']  #get same random seed
        else:
            dataset_id = 31

    # Print the active nodes of the search space.
    for pre, _, node in RenderTree(space.parameter_tree):
        if node.status:
            print("%s%s" % (pre, node.name))

    if X_train is None:
        # Reuse the trial's recorded seed when replaying, else draw a new one.
        my_random_seed = int(time.time())
        if 'data_random_seed' in trial.user_attrs:
            my_random_seed = trial.user_attrs['data_random_seed']
        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(
            dataset_id, randomstate=my_random_seed)

    if not isinstance(trial, FrozenTrial):
        # Record the feature vector so the surrogate can be trained later.
        my_list_constraints_values = [search_time, evaluation_time, memory_limit,
                                      cv, number_of_cvs,
                                      ifNull(privacy_limit, constant_value=1000),
                                      ifNull(hold_out_fraction), sample_fraction]
        metafeature_values = data2features(X_train, y_train, categorical_indicator)
        features = space2features(space, my_list_constraints_values, metafeature_values)
        features = FeatureTransformations().fit(features).transform(
            features, feature_names=feature_names)
        trial.set_user_attr('features', features)

    search = MyAutoML(cv=cv,
                      number_of_cvs=number_of_cvs,
                      n_jobs=1,
                      evaluation_budget=evaluation_time,
                      time_search_budget=search_time,
                      space=space,
                      main_memory_budget_gb=memory_limit,
                      differential_privacy_epsilon=privacy_limit,
                      hold_out_fraction=hold_out_fraction,
                      sample_fraction=sample_fraction)
    search.fit(X_train, y_train,
               categorical_indicator=categorical_indicator, scorer=my_scorer)

    best_pipeline = search.get_best_pipeline()
    test_score = 0.0
    if best_pipeline is not None:
        test_score = my_scorer(search.get_best_pipeline(), X_test, y_test)
    return test_score, search
def run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None,
               categorical_indicator=None):
    """Compare the dynamically-weighted pruning strategy against the static
    one on the same data and constraints, over 5 repetitions.

    Returns:
        tuple: (comparison, search) — mean(dynamic scores) minus
        mean(static scores), and the last search object produced by
        ``utils_run_AutoML`` (None when every repetition failed).
    """
    search_time = None
    if not isinstance(trial, FrozenTrial):
        # Live trial: sample constraints + active-learning hyper-parameters.
        (search_time, _, memory_limit, privacy_limit, training_time_limit,
         inference_time_limit, pipeline_size_limit, _, _, _, _,
         dataset_id) = generate_parameters(trial, total_search_time,
                                           my_openml_datasets)

        model_weight = 0
        if trial.suggest_categorical('use_model_weight', [True, False]):
            model_weight = trial.suggest_loguniform('model_weight', 0.0000001, 1000)
        number_trials = trial.suggest_int('number_trials', 10, 500, log=False)
    else:
        # Replayed trial: rebuild every value from trial.params / defaults.
        #make this a hyperparameter
        search_time = trial.params['global_search_time_constraint']

        memory_limit = 10
        if 'global_memory_constraint' in trial.params:
            memory_limit = trial.params['global_memory_constraint']

        privacy_limit = None
        if 'privacy_constraint' in trial.params:
            privacy_limit = trial.params['privacy_constraint']

        training_time_limit = search_time
        if 'training_time_constraint' in trial.params:
            training_time_limit = trial.params['training_time_constraint']

        inference_time_limit = 60
        if 'inference_time_constraint' in trial.params:
            inference_time_limit = trial.params['inference_time_constraint']

        pipeline_size_limit = 350000000
        if 'pipeline_size_constraint' in trial.params:
            pipeline_size_limit = trial.params['pipeline_size_constraint']

        model_weight = 0
        if 'model_weight' in trial.params:
            # BUGFIX: this previously assigned to pipeline_size_limit,
            # clobbering the size constraint and leaving model_weight at 0.
            model_weight = trial.params['model_weight']
        number_trials = trial.params['number_trials']

        if 'dataset_id' in trial.params:
            dataset_id = trial.params['dataset_id']  #get same random seed
        else:
            dataset_id = 31

    if X_train is None:
        # Reuse the trial's recorded seed when replaying, else draw a new one.
        my_random_seed = int(time.time())
        if 'data_random_seed' in trial.user_attrs:
            my_random_seed = trial.user_attrs['data_random_seed']
        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(
            dataset_id, randomstate=my_random_seed)

    my_list_constraints_values = [
        search_time, memory_limit,
        ifNull(privacy_limit, constant_value=1000), training_time_limit,
        inference_time_limit, pipeline_size_limit, model_weight, number_trials
    ]

    metafeature_values = data2features(X_train, y_train, categorical_indicator)
    features = merge_features(my_list_constraints_values, metafeature_values)
    features = FeatureTransformations().fit(features).transform(
        features, feature_names=feature_names)

    # Sanity-check the feature vector against the expected feature names.
    print(len(features[0]))
    print(len(feature_names_new))
    print(feature_names_new)
    assert len(feature_names_new) == features.shape[1], 'feature error'

    try:
        trial.set_user_attr('features', features)
    except Exception:
        # Frozen trials reject new user attrs; this record is best-effort.
        pass

    # Load the surrogate models: dev machine path first, server path second.
    try:
        model_compare = pickle.load(
            open(
                '/home/felix/phd2/picture_progress/al_only/my_great_model_compare.p',
                "rb"))
        model_success = pickle.load(
            open(
                '/home/felix/phd2/picture_progress/al_only/my_great_model_success.p',
                "rb"))
    except Exception:
        model_compare = pickle.load(
            open('/home/neutatz/data/my_models/my_great_model_compare.p', "rb"))
        model_success = pickle.load(
            open('/home/neutatz/data/my_models/my_great_model_success.p', "rb"))

    dynamic_params = []
    static_params = []
    # BUGFIX: bind search up-front — previously, if every utils_run_AutoML
    # call below raised, the final `return comparison, search` hit an
    # unbound name and crashed with a NameError.
    search = None
    for random_i in range(5):
        # Dynamic strategy: pruning study using the comparison weight.
        study_prune = optuna.create_study(direction='maximize')
        study_prune.optimize(
            lambda trial: optimize_accuracy_under_constraints2(
                trial=trial,
                metafeature_values_hold=metafeature_values,
                search_time=search_time,
                model_compare=model_compare,
                model_success=model_success,
                memory_limit=memory_limit,
                privacy_limit=privacy_limit,
                comparison_weight=model_weight),
            n_trials=number_trials,
            n_jobs=1)

        result = 0
        try:
            result, search = utils_run_AutoML(
                study_prune.best_trial,
                X_train=X_train,
                X_test=X_test,
                y_train=y_train,
                y_test=y_test,
                categorical_indicator=categorical_indicator,
                my_scorer=my_scorer,
                search_time=search_time,
                memory_limit=memory_limit,
                privacy_limit=privacy_limit)
        except Exception:
            result = 0
        dynamic_params.append(result)

        # Static strategy: fixed 500 pruning trials, no comparison weight.
        study_prune = optuna.create_study(direction='maximize')
        study_prune.optimize(
            lambda trial: optimize_accuracy_under_constraints2(
                trial=trial,
                metafeature_values_hold=metafeature_values,
                search_time=search_time,
                model_compare=model_compare,
                model_success=model_success,
                memory_limit=memory_limit,
                privacy_limit=privacy_limit,
            ),
            n_trials=500,
            n_jobs=1)

        success_result = 0
        try:
            success_result, search = utils_run_AutoML(
                study_prune.best_trial,
                X_train=X_train,
                X_test=X_test,
                y_train=y_train,
                y_test=y_test,
                categorical_indicator=categorical_indicator,
                my_scorer=my_scorer,
                search_time=search_time,
                memory_limit=memory_limit,
                privacy_limit=privacy_limit)
        except Exception:
            success_result = 0
        static_params.append(success_result)

    comparison = np.mean(dynamic_params) - np.mean(static_params)
    return comparison, search