示例#1
0
                current_dynamic.append(0.0)

            print('dynamic: ' + str(current_dynamic))
            print('static: ' + str(current_static))

            gen_new = SpaceGenerator()
            space_new = gen_new.generate_params()
            for pre, _, node in RenderTree(space_new.parameter_tree):
                if node.status == True:
                    print("%s%s" % (pre, node.name))

            try:
                search = MyAutoML(n_jobs=1,
                                  time_search_budget=search_time_frozen,
                                  space=space_new,
                                  evaluation_budget=int(0.1 *
                                                        search_time_frozen),
                                  main_memory_budget_gb=memory_budget,
                                  pipeline_size_limit=pipeline_size,
                                  hold_out_fraction=0.33)

                best_result = search.fit(
                    X_train_hold,
                    y_train_hold,
                    categorical_indicator=categorical_indicator_hold,
                    scorer=my_scorer)

                test_score = my_scorer(search.get_best_pipeline(), X_test_hold,
                                       y_test_hold)
            except:
                test_score = 0.0
            current_static.append(test_score)
def run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None, categorical_indicator=None):
    """Run the constrained AutoML search five times and report its success rate.

    If the trial carries no ``'space'`` user attribute, a new search space is
    generated and sampled through the trial, a deep copy of it is stored on
    the trial, and the constraints come from ``generate_parameters``.
    Otherwise the stored space is reused and the constraints are read back
    from ``trial.params``, with fixed fallbacks for entries that are missing.

    When ``X_train`` is None the data is loaded with ``get_data``; in that
    case, for trials that are not ``FrozenTrial`` instances, the transformed
    feature vector is stored as the ``'features'`` user attribute.

    Args:
        trial: Trial object exposing ``params``, ``user_attrs`` and
            ``set_user_attr``.
        X_train: Training features, or None to load the trial's dataset.
        X_test: Test features used to score the best pipeline.
        y_train: Training labels.
        y_test: Test labels.
        categorical_indicator: Forwarded unchanged to ``MyAutoML.fit``.

    Returns:
        A tuple ``(success_rate, search)``. ``success_rate`` is the fraction
        of the five runs whose test score is greater than 0.0; ``search`` is
        the ``MyAutoML`` instance of the last run.

    Raises:
        KeyError: If a stored trial lacks ``'global_search_time_constraint'``,
            or has neither ``'global_cv'`` nor ``'hold_out_fraction'`` in its
            params, or has no ``'dataset_id'`` in params or user attributes.
    """
    if 'space' not in trial.user_attrs:
        # Fresh trial: sample the search space, then the constraints.
        space = SpaceGenerator().generate_params()
        space.sample_parameters(trial)

        # Store an independent copy of the sampled space on the trial.
        trial.set_user_attr('space', copy.deepcopy(space))

        (search_time, evaluation_time, memory_limit, privacy_limit,
         training_time_limit, inference_time_limit, pipeline_size_limit,
         cv, number_of_cvs, hold_out_fraction, sample_fraction,
         dataset_id) = generate_parameters(trial, total_search_time, my_openml_datasets)
    else:
        # Stored trial: reuse its space and read the constraints back.
        space = trial.user_attrs['space']

        # Bind once; every lookup below reads the same mapping.
        params = trial.params
        print(params)

        # TODO: make this a hyperparameter
        search_time = params['global_search_time_constraint']

        # Constraints missing from the trial fall back to fixed defaults.
        evaluation_time = params.get('global_evaluation_time_constraint', search_time)
        memory_limit = params.get('global_memory_constraint', 10)
        privacy_limit = params.get('privacy_constraint')
        training_time_limit = params.get('training_time_constraint', search_time)
        inference_time_limit = params.get('inference_time_constraint', 60)
        pipeline_size_limit = params.get('pipeline_size_constraint', 350000000)

        # Either cross-validation or a hold-out split is configured.
        cv = 1
        number_of_cvs = 1
        hold_out_fraction = None
        if 'global_cv' in params:
            cv = params['global_cv']
            number_of_cvs = params.get('global_number_cv', 1)
        else:
            hold_out_fraction = params['hold_out_fraction']

        sample_fraction = params.get('sample_fraction', 1.0)

        if 'dataset_id' in params:
            dataset_id = params['dataset_id']
        else:
            dataset_id = trial.user_attrs['dataset_id']

    # Print only the nodes of the parameter tree whose status equals True.
    for pre, _, node in RenderTree(space.parameter_tree):
        if node.status == True:
            print("%s%s" % (pre, node.name))

    if X_train is None:
        # Use the seed stored on the trial if there is one, else the clock.
        my_random_seed = int(time.time())
        if 'data_random_seed' in trial.user_attrs:
            my_random_seed = trial.user_attrs['data_random_seed']

        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(dataset_id, randomstate=my_random_seed)

        if not isinstance(trial, FrozenTrial):
            my_list_constraints_values = [search_time,
                                          evaluation_time,
                                          memory_limit, cv,
                                          number_of_cvs,
                                          ifNull(privacy_limit, constant_value=1000),
                                          ifNull(hold_out_fraction),
                                          sample_fraction,
                                          training_time_limit,
                                          inference_time_limit,
                                          pipeline_size_limit]

            metafeature_values = data2features(X_train, y_train, categorical_indicator)
            features = space2features(space, my_list_constraints_values, metafeature_values)
            features = FeatureTransformations().fit(features).transform(features, feature_names=feature_names)
            trial.set_user_attr('features', features)

    # Repeat the identical search and record one test score per repetition.
    number_of_repetitions = 5
    dynamic_params = []
    for _ in range(number_of_repetitions):
        search = MyAutoML(cv=cv,
                          number_of_cvs=number_of_cvs,
                          n_jobs=1,
                          evaluation_budget=evaluation_time,
                          time_search_budget=search_time,
                          space=space,
                          main_memory_budget_gb=memory_limit,
                          differential_privacy_epsilon=privacy_limit,
                          hold_out_fraction=hold_out_fraction,
                          sample_fraction=sample_fraction,
                          training_time_limit=training_time_limit,
                          inference_time_limit=inference_time_limit,
                          pipeline_size_limit=pipeline_size_limit)

        test_score = 0.0
        try:
            search.fit(X_train, y_train, categorical_indicator=categorical_indicator, scorer=my_scorer)

            best_pipeline = search.get_best_pipeline()
            if best_pipeline is not None:
                test_score = my_scorer(best_pipeline, X_test, y_test)
        except Exception as error:
            # A failed run keeps the score 0.0 and counts as unsuccessful.
            # Unlike a bare `except:`, KeyboardInterrupt and SystemExit
            # still propagate, so the experiment can be aborted.
            print('AutoML run failed: ' + repr(error))
        dynamic_params.append(test_score)

    # A run is successful when it produced a strictly positive test score.
    count_success = sum(1 for score in dynamic_params if score > 0.0)
    success_rate = float(count_success) / float(len(dynamic_params))

    return success_rate, search
示例#3
0
# Dump the whole parameter tree, one node per line, with each node's status.
for pre, _, node in RenderTree(space.parameter_tree):
    print("%s%s: %s" % (pre, node.name, node.status))

my_study = optuna.create_study(direction='maximize')

# NOTE(review): test_scores is never appended to in the visible code;
# confirm whether each test_score was meant to be recorded here as well.
validation_scores = []
test_scores = []

# TODO: add Caruana ensemble with replacement
# TODO: save pipelines to disk

# Settings that are identical for every round; only the study changes.
fixed_automl_settings = dict(
    cv=10,
    number_of_cvs=1,
    n_jobs=2,
    time_search_budget=2 * 60,
    space=space,
    main_memory_budget_gb=4,
)

for i in range(1, 10):
    # Every round gets the study left behind by the previous round.
    search = MyAutoML(study=my_study, **fixed_automl_settings)
    best_result = search.fit(
        X_train,
        y_train,
        categorical_indicator=categorical_indicator,
        scorer=auc,
    )
    my_study = search.study

    test_score = auc(search.get_best_pipeline(), X_test, y_test)

    progress_line = "".join([
        "budget: ", str(i),
        ' => ', str(best_result),
        " test: ", str(test_score),
    ])
    print(progress_line)

    validation_scores.append(best_result)
示例#4
0
def run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None, categorical_indicator=None):
    """Run one constrained AutoML search and return its AUC test score.

    A trial without a ``'space'`` user attribute gets a newly generated
    search space (a deep copy is stored on the trial) and has its constraints
    and dataset suggested through the trial. A trial that already stores a
    space is replayed: the constraints are read from ``trial.params`` and the
    dataset id falls back to 31 when none was recorded.

    When ``X_train`` is None the data is loaded with ``get_data`` using the
    current time as seed; in that case, for trials that are not
    ``FrozenTrial`` instances, the feature vector is stored as the
    ``'features'`` user attribute.

    Returns:
        The score of the best pipeline on the test data, or 0.0 when the
        search yields no pipeline.
    """
    if 'space' in trial.user_attrs:
        # Replay a stored trial: reuse its space and recorded constraints.
        space = trial.user_attrs['space']

        recorded = trial.params
        print(recorded)

        # TODO: make this a hyperparameter
        search_time = recorded['global_search_time_constraint']
        evaluation_time = recorded['global_evaluation_time_constraint']
        memory_limit = recorded['global_memory_constraint']
        cv = recorded['global_cv']
        number_of_cvs = recorded['global_number_cv']

        dataset_id = recorded['dataset_id'] if 'dataset_id' in recorded else 31
    else:
        # New trial: pick the hyperparameters that make up the search space.
        generator = SpaceGenerator()
        space = generator.generate_params()
        space.sample_parameters(trial)

        trial.set_user_attr('space', copy.deepcopy(space))

        # Constraints: total search time, then the time for one evaluation.
        search_time = trial.suggest_int('global_search_time_constraint', 10, total_search_time, log=False)
        evaluation_time = trial.suggest_int('global_evaluation_time_constraint', 10, search_time, log=False)

        # Allowed main memory.
        memory_limit = trial.suggest_uniform('global_memory_constraint', 1.5, 4)

        # Cross-validation setup.
        # TODO: calculate the minimum number of splits based on y
        cv = trial.suggest_int('global_cv', 2, 20, log=False)
        number_of_cvs = trial.suggest_int('global_number_cv', 1, 10, log=False)

        # Dataset to run on. TODO: add more datasets
        dataset_id = trial.suggest_categorical('dataset_id', my_openml_datasets)

    for prefix, _, tree_node in RenderTree(space.parameter_tree):
        print("%s%s: %s" % (prefix, tree_node.name, tree_node.status))

    if X_train is None:
        seed = int(time.time())
        X_train, X_test, y_train, y_test, categorical_indicator, _attribute_names = get_data(dataset_id, randomstate=seed)

        if not isinstance(trial, FrozenTrial):
            constraint_values = [search_time, evaluation_time, memory_limit, cv, number_of_cvs]
            metafeatures = data2features(X_train, y_train, categorical_indicator)
            trial.set_user_attr('features', space2features(space, constraint_values, metafeatures))

    search = MyAutoML(
        cv=cv,
        number_of_cvs=number_of_cvs,
        n_jobs=1,
        evaluation_budget=evaluation_time,
        time_search_budget=search_time,
        space=space,
        main_memory_budget_gb=memory_limit,
    )
    search.fit(X_train, y_train, categorical_indicator=categorical_indicator, scorer=auc)

    # No pipeline found: report the neutral score.
    if search.get_best_pipeline() is None:
        return 0.0
    return auc(search.get_best_pipeline(), X_test, y_test)
示例#5
0
def utils_run_AutoML(trial,
                     X_train=None,
                     X_test=None,
                     y_train=None,
                     y_test=None,
                     categorical_indicator=None,
                     my_scorer=None,
                     search_time=None,
                     memory_limit=None,
                     privacy_limit=None,
                     training_time_limit=None,
                     inference_time_limit=None,
                     pipeline_size_limit=None):
    """Re-run the AutoML search stored on a trial under explicit constraints.

    The search space is read from the trial's ``'space'`` user attribute.
    Evaluation time and hold-out fraction come from the trial's user
    attributes when present there; otherwise they are derived from
    ``trial.params``. The remaining limits are the values passed by the
    caller.

    Returns:
        A tuple ``(test_score, search)``. ``test_score`` is the score of the
        best pipeline on the test data, or 0.0 when there is no best
        pipeline; ``search`` is the fitted ``MyAutoML`` instance.
    """
    attrs = trial.user_attrs
    space = attrs['space']

    params = trial.params
    print(params)

    # A value in the user attributes overrides the sampled parameter.
    if 'evaluation_time' in attrs:
        evaluation_time = attrs['evaluation_time']
    else:
        evaluation_time = params.get('global_evaluation_time_constraint', search_time)

    cv = 1
    number_of_cvs = 1
    if 'hold_out_fraction' in attrs:
        hold_out_fraction = attrs['hold_out_fraction']
    else:
        # The number of CV repetitions is only read when cv itself is set.
        if 'global_cv' in params:
            cv = params['global_cv']
            number_of_cvs = params.get('global_number_cv', 1)
        hold_out_fraction = params.get('hold_out_fraction')

    sample_fraction = params.get('sample_fraction', 1.0)

    search = MyAutoML(
        cv=cv,
        number_of_cvs=number_of_cvs,
        n_jobs=1,
        evaluation_budget=evaluation_time,
        time_search_budget=search_time,
        space=space,
        main_memory_budget_gb=memory_limit,
        differential_privacy_epsilon=privacy_limit,
        hold_out_fraction=hold_out_fraction,
        sample_fraction=sample_fraction,
        training_time_limit=training_time_limit,
        inference_time_limit=inference_time_limit,
        pipeline_size_limit=pipeline_size_limit,
    )
    search.fit(X_train, y_train, categorical_indicator=categorical_indicator, scorer=my_scorer)

    # Without a best pipeline the score stays at the neutral 0.0.
    if search.get_best_pipeline() is None:
        return 0.0, search
    return my_scorer(search.get_best_pipeline(), X_test, y_test), search
示例#6
0
def run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None, categorical_indicator=None):
    """Run one constrained AutoML search and return its test score.

    A trial without a ``'space'`` user attribute gets a newly generated
    search space (a deep copy is stored on the trial) and its constraints
    from ``generate_parameters``. A trial that already stores a space is
    replayed with fixed limits (``total_search_time``, 4 for memory, no
    privacy limit); validation setup, sample fraction and dataset id are
    read from ``trial.params``, with dataset id defaulting to 31.

    When ``X_train`` is None the data is loaded with ``get_data``; in that
    case, for trials that are not ``FrozenTrial`` instances, the transformed
    feature vector is stored as the ``'features'`` user attribute.

    Returns:
        A tuple ``(test_score, search)``. ``test_score`` is the score of the
        best pipeline on the test data, or 0.0 when there is no best
        pipeline; ``search`` is the fitted ``MyAutoML`` instance.
    """
    if 'space' in trial.user_attrs:
        # Replay a stored trial with fixed resource limits.
        space = trial.user_attrs['space']

        stored_params = trial.params
        print(stored_params)

        # TODO: make this a hyperparameter
        search_time = total_search_time
        evaluation_time = search_time
        memory_limit = 4
        privacy_limit = None

        # Either cross-validation or a hold-out split is configured.
        cv, number_of_cvs, hold_out_fraction = 1, 1, None
        if 'global_cv' in stored_params:
            cv = stored_params['global_cv']
            number_of_cvs = stored_params.get('global_number_cv', number_of_cvs)
        else:
            hold_out_fraction = stored_params['hold_out_fraction']

        sample_fraction = stored_params.get('sample_fraction', 1.0)
        dataset_id = stored_params.get('dataset_id', 31)
    else:
        # New trial: pick the hyperparameters that make up the search space.
        generator = SpaceGenerator()
        space = generator.generate_params()
        space.sample_parameters(trial)

        trial.set_user_attr('space', copy.deepcopy(space))

        (search_time, evaluation_time, memory_limit, privacy_limit,
         cv, number_of_cvs, hold_out_fraction, sample_fraction,
         dataset_id) = generate_parameters(trial)

    # Print only the nodes of the parameter tree whose status equals True.
    for prefix, _, tree_node in RenderTree(space.parameter_tree):
        if tree_node.status == True:
            print("%s%s" % (prefix, tree_node.name))

    if X_train is None:
        # Use the seed stored on the trial if there is one, else the clock.
        seed = int(time.time())
        if 'data_random_seed' in trial.user_attrs:
            seed = trial.user_attrs['data_random_seed']

        X_train, X_test, y_train, y_test, categorical_indicator, _attribute_names = get_data(dataset_id, randomstate=seed)

        if not isinstance(trial, FrozenTrial):
            constraint_values = [
                search_time,
                evaluation_time,
                memory_limit,
                cv,
                number_of_cvs,
                ifNull(privacy_limit, constant_value=1000),
                ifNull(hold_out_fraction),
                sample_fraction,
            ]

            metafeatures = data2features(X_train, y_train, categorical_indicator)
            raw_features = space2features(space, constraint_values, metafeatures)
            transformed = FeatureTransformations().fit(raw_features).transform(raw_features, feature_names=feature_names)
            trial.set_user_attr('features', transformed)

    search = MyAutoML(
        cv=cv,
        number_of_cvs=number_of_cvs,
        n_jobs=1,
        evaluation_budget=evaluation_time,
        time_search_budget=search_time,
        space=space,
        main_memory_budget_gb=memory_limit,
        differential_privacy_epsilon=privacy_limit,
        hold_out_fraction=hold_out_fraction,
        sample_fraction=sample_fraction,
    )
    search.fit(X_train, y_train, categorical_indicator=categorical_indicator, scorer=my_scorer)

    # Without a best pipeline the score stays at the neutral 0.0.
    if search.get_best_pipeline() is None:
        return 0.0, search
    return my_scorer(search.get_best_pipeline(), X_test, y_test), search