Example #1
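Builds a six-step PipelineSpace (one-hot encoding, imputation, rescaling, balancing, feature preprocessing, classification), derives a configuration space from it with ConfigSpaceBuilder, and benchmarks two versions of combine_configurations_batch as well as plain configuration sampling. The snippet assumes time, numpy (as np), and the project's pipeline-space and ConfigSpace helpers are already imported.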
    def _build_pipeline_space(self):
        ps = PipelineSpace()
        o_s = OneHotEncodingStep()
        i_s = ImputationStep()
        r_s = RescalingStep()
        b_s = BalancingStep()
        p_s = PreprocessingStep()
        c_s = ClassificationStep()
        ps.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
        return ps
def run_experiment_vector():
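    """Benchmark combine_configurations_batch_version1 against _version2:
    each of 10 runs samples a start configuration plus 1000 configurations,
    recombines them with both versions, validates the version-2 results, and
    prints the running mean timings."""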
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(0, 10):
        print("Run: {}".format(i))
        # sample 1 start config
        start_config = config_space.sample_configuration(size=1)
        sample_configs = config_space.sample_configuration(size=1000)
        sample_configs_values = [get_values(evaluation_config.get_dictionary(), variable_pipeline_steps) \
                                 for evaluation_config in sample_configs]

        # version 1
        start_time = time.time()
        new_configurations = combine_configurations_batch_version1(config_space=config_space,
                                                                   start_config=start_config,
                                                                   complemented_configs_values=sample_configs_values,
                                                                   constant_pipeline_steps=constant_pipeline_steps)
        timing_v_1.append(time.time() - start_time)

        # version 2
        print("VERSION2")
        start_config = config_space.sample_configuration(size=1)
        sample_configs = config_space.sample_configuration(size=1000)
        sample_configs_values = [get_values(evaluation_config.get_dictionary(), variable_pipeline_steps) \
                                for evaluation_config in sample_configs]
        start_time = time.time()
        new_configurations_2 = combine_configurations_batch_version2(config_space=config_space,
                                                                     start_config=start_config,
                                                                     complemented_configs_values=sample_configs_values,
                                                                     constant_pipeline_steps=constant_pipeline_steps)
        timing_v_2.append(time.time() - start_time)
        #print(len(new_configurations), len(new_configurations_2))

        # Check new configs
        for config in new_configurations_2:
            config.is_valid_configuration()

        print(np.mean(timing_v_1))
        print(np.mean(timing_v_2))
def run_experiment_sampling():
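    """Time config_space.sample_configuration(size=1000) and print the mean
    timing; the alternative version-2 sampling path is left commented out
    below."""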
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(0, 1):
        print("Run: {}".format(i))
        # sample 1 start config

        # version 1
        start_time = time.time()
        sample_configs = config_space.sample_configuration(size=1000)
        timing_v_1.append(time.time() - start_time)

        # version 2
        print("VERSION2")
        # start_config = config_space.sample_configuration(size=1)
        # sample_configs = config_space.sample_configuration(size=2)
        # sample_configs_values = [get_values(evaluation_config.get_dictionary(), variable_pipeline_steps) \
        #                         for evaluation_config in sample_configs]
        # start_time = time.time()
        # sample_configs_2 = config_space.sample_configuration_forbidden(size=500)
        # timing_v_2.append(time.time() - start_time)

        # invalid_configs = []
        # for config in sample_configs:
        #     try:
        #         config.is_valid_configuration()
        #     except ValueError as v:
        #         exc_info = sys.exc_info()
        #         # Display the *original* exception
        #         traceback.print_exception(*exc_info)
        #         del exc_info
        #
        #         invalid_configs.append(config)
        #         print("Config not valid: {}".format(config))

        # print("Nb of invalid configs: {}".format(len(invalid_configs)))
        #print(len(sample_configs), len(sample_configs_2))

        print(np.mean(timing_v_1))
Example #4
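Shows a helper that assembles a PipelineSpace from named preprocessor and classifier nodes, followed by two timing experiments: repeated check_configuration calls on sampled configurations, and get_one_exchange_neighbourhood versus its vector-checking variant. The same imports as in Example #1 are assumed.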
    def build_pipeline_space(self, preprocessor_names, classifier_names):
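        """Assemble a PipelineSpace whose feature_preprocessor and classifier
        steps contain the nodes registered under the given names; the
        preprocessing step is marked for caching."""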
        prepr_nodes = [self.preprocessor_nodes[pname] for pname in preprocessor_names]
        class_nodes = [self.classifier_nodes[cname] for cname in classifier_names]

        pipeline_space = PipelineSpace()
        o_s = OneHotEncodingStep()
        i_s = ImputationStep()
        r_s = RescalingStep()
        b_s = BalancingStep()
        p_s = PipelineStep(name='feature_preprocessor', nodes=prepr_nodes, caching=True)
        c_s = PipelineStep(name='classifier', nodes=class_nodes)
        pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

        return pipeline_space
def run_experiment_check_configurations():
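    """Time config_space.check_configuration over two batches of 500 freshly
    sampled configurations and print the mean timings (both branches currently
    run the same check)."""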
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(0, 20):
        print("Run: {}".format(i))
        # sample 1 start config

        sample_configs = config_space.sample_configuration(size=500)
        for config in sample_configs:
            config.get_dictionary()

        # version 1
        start_time = time.time()
        for config in sample_configs:
            config_space.check_configuration(config)
        timing_v_1.append(time.time() - start_time)

        # version 2
        print("VERSION2")

        sample_configs = config_space.sample_configuration(size=500)

        start_time = time.time()
        for config in sample_configs:
            config_space.check_configuration(config)
        timing_v_2.append(time.time() - start_time)

        print(np.mean(timing_v_1))
        print(np.mean(timing_v_2))
def run_experiment_get_one_exchange_neighbourhood():
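    """Compare get_one_exchange_neighbourhood with
    get_one_exchange_neighbourhood_vector_checking on 1000 sampled
    configurations per run and print the mean timings."""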
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(0, 10):
        print("Run: {}".format(i))
        # sample 1 start config
        sample_configs = config_space.sample_configuration(size=1000)

        # version 1
        start_time = time.time()
        for config in sample_configs:
            get_one_exchange_neighbourhood(config, seed=1)
        timing_v_1.append(time.time() - start_time)

        # version 2
        print("VERSION2")
        sample_configs = config_space.sample_configuration(size=1000)

        start_time = time.time()
        for config in sample_configs:
            get_one_exchange_neighbourhood_vector_checking(config, seed=1)
        timing_v_2.append(time.time() - start_time)
        #print(len(new_configurations), len(new_configurations_2))

        print(np.mean(timing_v_1))
        print(np.mean(timing_v_2))
Example #7
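Embeds the pipeline configuration space in a SMAC-style optimization loop (random-forest EPM, EI acquisition, local search, and configuration selection with marginalization over the classifier step) and also defines a random-search driver with optional caching of preprocessing steps. The SMAC and project-specific classes referenced below are assumed to be importable in the surrounding module.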
def run_experiment():
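    """Set up a SMAC-style configuration selector (random-forest EPM, EI
    acquisition wrapped for partial pipelines, local search with
    marginalization over the classifier step), seed the run history with 10
    random configurations, and print the first selected challenger."""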
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    runhistory = PCRunHistory(average_cost)

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    args = {
        'cs': config_space,
        'run_obj': "quality",
        'runcount_limit': 100,
        'wallclock_limit': 100,
        'memory_limit': 100,
        'cutoff_time': 100,
        'deterministic': "true"
    }
    scenario = Scenario(args)

    # Build stats
    stats = Stats(scenario, output_dir=None, stamp="")

    types, bounds = get_types(scenario.cs, scenario.feature_array)

    model = RandomForestWithInstances(types=types, bounds=bounds)

    constant_pipeline_steps = [
        "one_hot_encoder", "imputation", "rescaling", "balancing",
        "feature_preprocessor"
    ]
    variable_pipeline_steps = ["classifier"]
    rng = np.random.RandomState()
    num_params = len(scenario.cs.get_hyperparameters())

    acquisition_func = EI(model)
    acq_func_wrapper = PCAquisitionFunctionWrapper(
        acquisition_func=acquisition_func,
        config_space=scenario.cs,
        runhistory=runhistory,
        constant_pipeline_steps=constant_pipeline_steps,
        variable_pipeline_steps=variable_pipeline_steps)
    runhistory2epm = RunHistory2EPM4Cost(scenario,
                                         num_params,
                                         success_states=[StatusType.SUCCESS])
    local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                               config_space=scenario.cs)
    select_configuration = SelectConfigurationsWithMarginalization(
        scenario=scenario,
        stats=stats,
        runhistory=runhistory,
        model=model,
        acq_optimizer=local_search,
        acquisition_func=acq_func_wrapper,
        rng=rng,
        constant_pipeline_steps=constant_pipeline_steps,
        variable_pipeline_steps=variable_pipeline_steps,
        num_marginalized_configurations_by_random_search=40,
        num_configs_for_marginalization=200)

    # sample configurations to fill runhistory
    sample_configs = config_space.sample_configuration(size=10)
    for config in sample_configs:
        runhistory.add(config, 1, 1, StatusType.SUCCESS)

    # test select_configurations procedure
    X, Y = runhistory2epm.transform(runhistory)
    challengers = select_configuration.run(
        X,
        Y,
        sample_configs[0],
        num_configurations_by_random_search_sorted=100,
        num_configurations_by_local_search=10,
        random_leaf_size=1)

    print(challengers[0])
def run_random_search(stamp,
                      data_path,
                      version,
                      wallclock_limit,
                      run_limit,
                      memory_limit,
                      cutoff,
                      splitting_number,
                      random_splitting_enabled,
                      seed=None,
                      output_dir=None,
                      cache_directory=None,
                      downsampling=None):
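    """Run random search over the pipeline configuration space. The '2step'
    and 'sigmoid' versions use a CachedPipelineRunner with cached preprocessing
    steps; any other version falls back to plain RandomSearch. The incumbent
    trajectory is re-evaluated via run_tests, saved through Statistics, and the
    final incumbent is returned."""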
    # data set
    data_set = data_path.split("/")[-1]

    # cache directory
    try:
        if cache_directory is not None and not os.path.exists(cache_directory):
            os.makedirs(cache_directory)
    except FileExistsError:
        pass

    # output directory
    try:
        if output_dir is None:
            output_dir = os.path.dirname(
                os.path.abspath(__file__)) + "/results/"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    except FileExistsError:
        pass

    # load data
    data_loader = DataLoader(data_path)
    data = data_loader.get_data()
    dataset_properties = data_loader.info

    # Build pipeline space
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()  # PipelineStep(name='feature_preprocessor', nodes=[KernelPcaNode()])
    c_s = ClassificationStep()  # PipelineStep(name='classifier', nodes=[SGDNode()])
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    # Build configuration space
    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space(
        seed=seed, dataset_properties=dataset_properties)

    # Build statistics
    info = {
        'data_location': data_path,
        'stamp': stamp,
        'version': version,
        'wallclock_limit': wallclock_limit,
        'memory_limit': memory_limit,
        'cutoff': cutoff,
        'seed': seed,
        'downsampling': downsampling
    }
    statistics = Statistics(stamp,
                            output_dir,
                            information=info,
                            total_runtime=wallclock_limit,
                            run_limit=run_limit)
    statistics.clean_files()

    # The pipeline parts that can get cached
    cached_pipeline_steps = [[
        "one_hot_encoder", "imputation", "rescaling", "balancing",
        "feature_preprocessor"
    ]]

    num_cross_validation_folds = 10
    # Build pipeline runner
    if version == '2step':
        pipeline_runner = CachedPipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            cached_pipeline_steps=cached_pipeline_steps,
            statistics=statistics,
            cache_directory=cache_directory,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = TreeRandomSearch(
            config_space=config_space,
            pipeline_runner=pipeline_runner,
            wallclock_limit=wallclock_limit,
            memory_limit=memory_limit,
            statistics=statistics,
            constant_pipeline_steps=[
                "one_hot_encoder", "imputation", "rescaling", "balancing",
                "feature_preprocessor"
            ],
            variable_pipeline_steps=["classifier"],
            splitting_number=splitting_number,
            random_splitting_enabled=random_splitting_enabled)
    elif version == 'sigmoid':
        pipeline_runner = CachedPipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            cached_pipeline_steps=cached_pipeline_steps,
            statistics=statistics,
            cache_directory=cache_directory,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = SigmoidRandomSearch(
            config_space=config_space,
            pipeline_runner=pipeline_runner,
            wallclock_limit=wallclock_limit,
            memory_limit=memory_limit,
            statistics=statistics,
            constant_pipeline_steps=[
                "one_hot_encoder", "imputation", "rescaling", "balancing",
                "feature_preprocessor"
            ],
            variable_pipeline_steps=["classifier"],
            splitting_number=splitting_number,
            random_splitting_enabled=False)
    else:
        pipeline_runner = PipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            statistics=statistics,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = RandomSearch(config_space=config_space,
                                     pipeline_runner=pipeline_runner,
                                     wallclock_limit=wallclock_limit,
                                     memory_limit=memory_limit,
                                     statistics=statistics)

    # Run random search
    print("start random search")
    incumbent = random_search.run(cutoff=cutoff)
    print("... end random search")

    # test performance of incumbents
    incumbent_trajectory = statistics.get_incumbent_trajectory(
        config_space=config_space)
    trajectory = run_tests(data,
                           dataset_properties,
                           incumbent_trajectory,
                           pipeline_space,
                           downsampling=downsampling)
    print(trajectory)

    # Save new trajectory to output directory
    # First transform the configuration to a dictionary
    for traj in trajectory:
        traj['incumbent'] = traj['incumbent'].get_dictionary()
    statistics.add_incumbents_trajectory(trajectory)

    return incumbent
Example #9
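Samples a batch of random pipeline configurations for a chosen preprocessor/classifier pair (or all registered nodes) and evaluates them on a data set via run_experiment_on_data, writing results to a directory derived from the data set, preprocessor, and classifier names.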
def run_experiment(data_id,
                   location,
                   output_dir,
                   prepr_name=None,
                   class_name=None,
                   nb_configs=100,
                   seed=None,
                   cache_directory=None,
                   downsampling=None):
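    """Sample nb_configs random pipeline configurations for the requested
    preprocessor/classifier (or all registered nodes when prepr_name /
    class_name is None) and evaluate them on the given data set via
    run_experiment_on_data."""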

    preprocessor_names = [
        'extra_rand_trees', 'fast_ica', 'feature_agglomeration', 'kernel_pca',
        'kitchen_sinks', 'linear_svm', 'no_preprocessing', 'nystroem_sampler',
        'pca', 'polynomial_features', 'rand_trees_embedding',
        'select_percentile', 'select_rates'
    ]
    class_names = [
        'adaboost', 'bernoulli_nb', 'decision_tree', 'extra_trees',
        'gaussian_nb', 'gradient_boosting', 'k_nearest_neighbors', 'lda',
        'liblinear_svc', 'libsvm_svc', 'multinomial_nb', 'passive_aggresive',
        'qda', 'random_forest', 'sgd'
    ]
    #preprocessor_names = ['extra_rand_trees']

    preprocessor_nodes = {
        'extra_rand_trees': ExtraTreesNode(),
        'fast_ica': FastICANode(),
        'feature_agglomeration': FeatureAgglomerationNode(),
        'kernel_pca': KernelPcaNode(),
        'kitchen_sinks': RandomKitchenSinksNode(),
        'linear_svm': LinearSVMNode(),
        'no_preprocessing': NoPreprocessingNode(),
        'nystroem_sampler': NystroemSamplerNode(),
        'pca': PcaNode(),
        'polynomial_features': PolynomialFeaturesNode(),
        'rand_trees_embedding': RandomTreesEmbeddingNode(),
        'select_percentile': SelectPercentileNode(),
        'select_rates': SelectRatesNode()
    }

    classifier_nodes = {
        'adaboost': AdaBoostNode(),
        'bernoulli_nb': BernoulliNBNode(),
        'decision_tree': DecisionTreeNode(),
        'extra_trees': ExtraTreesClassifierNode(),
        'gaussian_nb': GaussianNBNode(),
        'gradient_boosting': GradientBoostingNode(),
        'k_nearest_neighbors': KNearestNeighborsNode(),
        'lda': LDANode(),
        'liblinear_svc': LibLinear_SVC_Node(),
        'libsvm_svc': LibSVM_SVC_Node(),
        'multinomial_nb': MultinomialNBNode(),
        'passive_aggresive': PassiveAggresiveNode(),
        'qda': QDANode(),
        'random_forest': RandomForestNode(),
        'sgd': SGDNode()
    }

    if prepr_name is not None:
        prepr_nodes = [preprocessor_nodes[prepr_name]]
    else:
        prepr_nodes = []
        for prepr in preprocessor_names:
            prepr_nodes.append(preprocessor_nodes[prepr])
        prepr_name = 'all'

    if class_name is not None:
        class_nodes = [classifier_nodes[class_name]]
    else:
        class_nodes = []
        for c_name in class_names:
            class_nodes.append(classifier_nodes[c_name])
        class_name = 'all'

    # output directory
    if output_dir is None:
        output_dir = os.path.dirname(os.path.abspath(__file__)) + "/results/"

    # Build pipeline space
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PipelineStep(name='feature_preprocessor',
                       nodes=prepr_nodes,
                       caching=True)
    c_s = PipelineStep(name='classifier', nodes=class_nodes)
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    # Build configuration space
    cs_builder = ConfigSpaceBuilder(pipeline_space)
    #print("SEED: {}".format(seed)) if seed else print("NOT SEED: {}".format(seed))
    config_space = cs_builder.build_config_space(seed=seed)

    # Sample configurations from configuration space
    rand_configs = (config_space.sample_configuration(size=nb_configs)
                    if nb_configs > 1
                    else [config_space.sample_configuration(size=nb_configs)])

    # Run the random configurations = pipelines on data set
    data_path = location + str(data_id) if location[-1] == "/" else location + "/" + str(data_id)
    data_set = data_path.split("/")[-1]
    output_dir = (output_dir + data_set + "/" + str(prepr_name) + "_" + str(class_name) + "/"
                  if output_dir[-1] == "/"
                  else output_dir + "/" + data_set + "/" + str(prepr_name) + "_" + str(class_name) + "/")
    stamp = data_set + "_seed_" + str(seed)
    run_experiment_on_data(stamp=stamp,
                           data_path=data_path,
                           output_dir=output_dir,
                           pipeline_space=pipeline_space,
                           configs=rand_configs,
                           cache_directory=cache_directory,
                           downsampling=downsampling)