Example #1
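# Imports assumed by the snippets below. Standard-library and numpy imports
# are given explicitly; the SMAC names (Scenario, Stats, EI, LocalSearch,
# RandomForestWithInstances, RunHistory2EPM4Cost, StatusType, get_types,
# average_cost) live in version-dependent SMAC modules, and the remaining
# classes and helpers (PipelineSpace, the *Step classes, ConfigSpaceBuilder,
# PCRunHistory, the pipeline runners and search strategies, get_values,
# combine_configurations_batch_version1/2,
# get_one_exchange_neighbourhood_vector_checking, DataLoader, Statistics,
# run_tests) are assumed to come from the surrounding project.
import os
import time

import numpy as np

# Location may differ across ConfigSpace versions.
from ConfigSpace.util import get_one_exchange_neighbourhood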
# Excerpt: a helper method (from the experiment class) that assembles the
# six-step pipeline space used by all experiments below.
def _build_pipeline_space(self):
    ps = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    ps.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    return ps
def run_experiment_vector():
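    """Benchmark two implementations of combining a start configuration's
    constant pipeline steps with the variable-step values of 1000 sampled
    configurations, and report the mean runtime of each version."""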
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

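    # Translate the pipeline space into a ConfigSpace configuration space.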
    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(10):
        print("Run: {}".format(i))
        # Sample one start configuration and 1000 configurations whose
        # variable-step (classifier) values will be recombined with it.
        start_config = config_space.sample_configuration(size=1)
        sample_configs = config_space.sample_configuration(size=1000)
        sample_configs_values = [get_values(evaluation_config.get_dictionary(), variable_pipeline_steps)
                                 for evaluation_config in sample_configs]

        # version 1
        start_time = time.time()
        new_configurations = combine_configurations_batch_version1(config_space=config_space,
                                                                   start_config=start_config,
                                                                   complemented_configs_values=sample_configs_values,
                                                                   constant_pipeline_steps=constant_pipeline_steps)
        timing_v_1.append(time.time() - start_time)

        # version 2
        print("VERSION2")
        start_config = config_space.sample_configuration(size=1)
        sample_configs = config_space.sample_configuration(size=1000)
        sample_configs_values = [get_values(evaluation_config.get_dictionary(), variable_pipeline_steps)
                                 for evaluation_config in sample_configs]
        start_time = time.time()
        new_configurations_2 = combine_configurations_batch_version2(config_space=config_space,
                                                                     start_config=start_config,
                                                                     complemented_configs_values=sample_configs_values,
                                                                     constant_pipeline_steps=constant_pipeline_steps)
        timing_v_2.append(time.time() - start_time)

        # Validate the combined configurations; this raises a ValueError
        # if a configuration violates the configuration space.
        for config in new_configurations_2:
            config.is_valid_configuration()

        print("Mean time version 1: {}".format(np.mean(timing_v_1)))
        print("Mean time version 2: {}".format(np.mean(timing_v_2)))
def run_experiment_sampling():
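    """Time plain configuration sampling. A second version based on
    forbidden-clause-aware sampling plus per-configuration validity checks
    was disabled in the original experiment."""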
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    for i in range(1):
        print("Run: {}".format(i))
        # sample 1 start config

        # version 1
        start_time = time.time()
        sample_configs = config_space.sample_configuration(size=1000)
        timing_v_1.append(time.time() - start_time)

        # Version 2 (disabled in the original experiment): sample with
        # config_space.sample_configuration_forbidden and validate each
        # configuration via config.is_valid_configuration(), printing any
        # configurations that fail.

        print("Mean time version 1: {}".format(np.mean(timing_v_1)))
def run_experiment_check_configurations():
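    """Time config_space.check_configuration over freshly sampled batches
    of 500 configurations. Both timed loops run the same check, so the
    two reported means should differ only by sampling noise."""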
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(20):
        print("Run: {}".format(i))

        sample_configs = config_space.sample_configuration(size=500)
        # Materialize each configuration's value dictionary up front so the
        # timed loops below measure only the validity check itself.
        for config in sample_configs:
            config.get_dictionary()

        # version 1
        start_time = time.time()
        for config in sample_configs:
            config_space.check_configuration(config)
        timing_v_1.append(time.time() - start_time)

        # version 2
        print("VERSION2")

        sample_configs = config_space.sample_configuration(size=500)

        start_time = time.time()
        for config in sample_configs:
            config_space.check_configuration(config)
        timing_v_2.append(time.time() - start_time)

        print("Mean time version 1: {}".format(np.mean(timing_v_1)))
        print("Mean time version 2: {}".format(np.mean(timing_v_2)))
def run_experiment_get_one_exchange_neighbourhood():
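    """Benchmark ConfigSpace's get_one_exchange_neighbourhood against the
    project's vector-checking variant on batches of 1000 sampled
    configurations."""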
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(10):
        print("Run: {}".format(i))
        sample_configs = config_space.sample_configuration(size=1000)

        # version 1
        start_time = time.time()
        for config in sample_configs:
            get_one_exchange_neighbourhood(config, seed=1)
        timing_v_1.append(time.time() - start_time)

        # version 2
        print("VERSION2")
        sample_configs = config_space.sample_configuration(size=1000)

        start_time = time.time()
        for config in sample_configs:
            get_one_exchange_neighbourhood_vector_checking(config, seed=1)
        timing_v_2.append(time.time() - start_time)

        print("Mean time version 1: {}".format(np.mean(timing_v_1)))
        print("Mean time version 2: {}".format(np.mean(timing_v_2)))
def run_experiment():
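    """Smoke-test the SMAC-based configuration selection: build the pipeline
    and configuration spaces, wire up the EPM, acquisition function, and
    local search, seed the run history with 10 dummy runs, and print the
    first challenger proposed by the marginalization-based selector."""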
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    runhistory = PCRunHistory(average_cost)

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

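    # SMAC scenario: optimize for quality under run-count, wallclock,
    # memory, and per-run cutoff limits.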
    args = {
        'cs': config_space,
        'run_obj': "quality",
        'runcount_limit': 100,
        'wallclock_limit': 100,
        'memory_limit': 100,
        'cutoff_time': 100,
        'deterministic': "true"
    }
    scenario = Scenario(args)

    # Build stats
    stats = Stats(scenario, output_dir=None, stamp="")

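    # Feature types and bounds for the random-forest EPM.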
    types, bounds = get_types(scenario.cs, scenario.feature_array)

    model = RandomForestWithInstances(types=types, bounds=bounds)

    constant_pipeline_steps = [
        "one_hot_encoder", "imputation", "rescaling", "balancing",
        "feature_preprocessor"
    ]
    variable_pipeline_steps = ["classifier"]
    rng = np.random.RandomState()
    num_params = len(scenario.cs.get_hyperparameters())

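    # Expected Improvement over the random-forest model, wrapped by a
    # project-specific adapter that handles the constant/variable
    # pipeline-step split when scoring configurations.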
    acquisition_func = EI(model)
    acq_func_wrapper = PCAquisitionFunctionWrapper(
        acquisition_func=acquisition_func,
        config_space=scenario.cs,
        runhistory=runhistory,
        constant_pipeline_steps=constant_pipeline_steps,
        variable_pipeline_steps=variable_pipeline_steps)
    runhistory2epm = RunHistory2EPM4Cost(scenario,
                                         num_params,
                                         success_states=[StatusType.SUCCESS])
    local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                               config_space=scenario.cs)
    select_configuration = SelectConfigurationsWithMarginalization(
        scenario=scenario,
        stats=stats,
        runhistory=runhistory,
        model=model,
        acq_optimizer=local_search,
        acquisition_func=acq_func_wrapper,
        rng=rng,
        constant_pipeline_steps=constant_pipeline_steps,
        variable_pipeline_steps=variable_pipeline_steps,
        num_marginalized_configurations_by_random_search=40,
        num_configs_for_marginalization=200)

    # Seed the run history with 10 sampled configurations recorded as
    # successful dummy runs (cost=1, time=1).
    sample_configs = config_space.sample_configuration(size=10)
    for config in sample_configs:
        runhistory.add(config, 1, 1, StatusType.SUCCESS)

    # Exercise the selection procedure on the EPM-transformed run history.
    X, Y = runhistory2epm.transform(runhistory)
    challengers = select_configuration.run(
        X,
        Y,
        sample_configs[0],
        num_configurations_by_random_search_sorted=100,
        num_configurations_by_local_search=10,
        random_leaf_size=1)

    print(challengers[0])
def run_random_search(stamp,
                      data_path,
                      version,
                      wallclock_limit,
                      run_limit,
                      memory_limit,
                      cutoff,
                      splitting_number,
                      random_splitting_enabled,
                      seed=None,
                      output_dir=None,
                      cache_directory=None,
                      downsampling=None):
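    """Run a random search (plain, '2step', or 'sigmoid') over the pipeline
    space on the given data set, evaluate the incumbent trajectory, store
    the results via Statistics, and return the final incumbent."""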
    # Data set name (the last component of the data path)
    data_set = os.path.basename(data_path)

    # Create the cache directory if needed; tolerate concurrent creation.
    if cache_directory is not None and not os.path.exists(cache_directory):
        try:
            os.makedirs(cache_directory)
        except FileExistsError:
            pass

    # Output directory (defaults to a results/ folder next to this file)
    try:
        if output_dir is None:
            output_dir = os.path.dirname(
                os.path.abspath(__file__)) + "/results/"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    except FileExistsError:
        pass

    # load data
    data_loader = DataLoader(data_path)
    data = data_loader.get_data()
    dataset_properties = data_loader.info

    # Build pipeline space
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()  # or PipelineStep(name='feature_preprocessor', nodes=[KernelPcaNode()])
    c_s = ClassificationStep()  # or PipelineStep(name='classifier', nodes=[SGDNode()])
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    # Build configuration space
    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space(
        seed=seed, dataset_properties=dataset_properties)

    # Build statistics
    info = {
        'data_location': data_path,
        'stamp': stamp,
        'version': version,
        'wallclock_limit': wallclock_limit,
        'memory_limit': memory_limit,
        'cutoff': cutoff,
        'seed': seed,
        'downsampling': downsampling
    }
    statistics = Statistics(stamp,
                            output_dir,
                            information=info,
                            total_runtime=wallclock_limit,
                            run_limit=run_limit)
    statistics.clean_files()

    # The pipeline parts that can get cached
    cached_pipeline_steps = [[
        "one_hot_encoder", "imputation", "rescaling", "balancing",
        "feature_preprocessor"
    ]]

    num_cross_validation_folds = 10

    # Build the pipeline runner and search strategy for the requested
    # version. Both caching variants share the same cached runner and
    # differ only in the search strategy layered on top; the 'sigmoid'
    # variant always disables random splitting.
    if version in ('2step', 'sigmoid'):
        pipeline_runner = CachedPipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            cached_pipeline_steps=cached_pipeline_steps,
            statistics=statistics,
            cache_directory=cache_directory,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        search_class = (TreeRandomSearch
                        if version == '2step' else SigmoidRandomSearch)
        random_search = search_class(
            config_space=config_space,
            pipeline_runner=pipeline_runner,
            wallclock_limit=wallclock_limit,
            memory_limit=memory_limit,
            statistics=statistics,
            constant_pipeline_steps=[
                "one_hot_encoder", "imputation", "rescaling", "balancing",
                "feature_preprocessor"
            ],
            variable_pipeline_steps=["classifier"],
            splitting_number=splitting_number,
            random_splitting_enabled=(random_splitting_enabled
                                      if version == '2step' else False))
    else:
        pipeline_runner = PipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            statistics=statistics,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = RandomSearch(config_space=config_space,
                                     pipeline_runner=pipeline_runner,
                                     wallclock_limit=wallclock_limit,
                                     memory_limit=memory_limit,
                                     statistics=statistics)

    # Run random search
    print("start random search")
    incumbent = random_search.run(cutoff=cutoff)
    print("... end random search")

    # Evaluate the incumbent trajectory on test data
    incumbent_trajectory = statistics.get_incumbent_trajectory(
        config_space=config_space)
    trajectory = run_tests(data,
                           dataset_properties,
                           incumbent_trajectory,
                           pipeline_space,
                           downsampling=downsampling)
    print(trajectory)

    # Save new trajectory to output directory
    # First transform the configuration to a dictionary
    for traj in trajectory:
        traj['incumbent'] = traj['incumbent'].get_dictionary()
    statistics.add_incumbents_trajectory(trajectory)

    return incumbent
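

# A minimal usage sketch (assumed entry point). The data path, stamp, and
# limits below are illustrative placeholders, not values from the project.
if __name__ == "__main__":
    incumbent = run_random_search(stamp="example-run",
                                  data_path="/path/to/dataset",
                                  version="2step",
                                  wallclock_limit=3600,
                                  run_limit=100,
                                  memory_limit=4000,
                                  cutoff=300,
                                  splitting_number=5,
                                  random_splitting_enabled=True,
                                  seed=1)
    print(incumbent)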