def run_experiment_sampling():
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(0, 1):
        print("Run: {}".format(i))
        # sample 1 start config

        # version 1
        start_time = time.time()
        sample_configs = config_space.sample_configuration(size=1000)
        timing_v_1.append(time.time() - start_time)

        # version 2
        print("VERSION2")
        # start_config = config_space.sample_configuration(size=1)
        # sample_configs = config_space.sample_configuration(size=2)
        # sample_configs_values = [get_values(evaluation_config.get_dictionary(), variable_pipeline_steps) \
        #                         for evaluation_config in sample_configs]
        # start_time = time.time()
        # sample_configs_2 = config_space.sample_configuration_forbidden(size=500)
        # timing_v_2.append(time.time() - start_time)

        # invalid_configs = []
        # for config in sample_configs:
        #     try:
        #         config.is_valid_configuration()
        #     except ValueError as v:
        #         exc_info = sys.exc_info()
        #         # Display the *original* exception
        #         traceback.print_exception(*exc_info)
        #         del exc_info
        #
        #         invalid_configs.append(config)
        #         print("Config not valid: {}".format(config))

        # print("Nb of invalid configs: {}".format(len(invalid_configs)))
        #print(len(sample_configs), len(sample_configs_2))

        print(np.mean(timing_v_1))


def run_experiment_vector():
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(0, 10):
        print("Run: {}".format(i))
        # sample 1 start config
        start_config = config_space.sample_configuration(size=1)
        sample_configs = config_space.sample_configuration(size=1000)
        sample_configs_values = [get_values(evaluation_config.get_dictionary(), variable_pipeline_steps)
                                 for evaluation_config in sample_configs]

        # version 1
        start_time = time.time()
        new_configurations = combine_configurations_batch_version1(config_space=config_space,
                                                                   start_config=start_config,
                                                                   complemented_configs_values=sample_configs_values,
                                                                   constant_pipeline_steps=constant_pipeline_steps)
        timing_v_1.append(time.time() - start_time)

        # version 2
        print("VERSION2")
        start_config = config_space.sample_configuration(size=1)
        sample_configs = config_space.sample_configuration(size=1000)
        sample_configs_values = [get_values(evaluation_config.get_dictionary(), variable_pipeline_steps)
                                 for evaluation_config in sample_configs]
        start_time = time.time()
        new_configurations_2 = combine_configurations_batch_version2(config_space=config_space,
                                                                     start_config=start_config,
                                                                     complemented_configs_values=sample_configs_values,
                                                                     constant_pipeline_steps=constant_pipeline_steps)
        timing_v_2.append(time.time() - start_time)
        #print(len(new_configurations), len(new_configurations_2))

        # Check new configs
        for config in new_configurations_2:
            config.is_valid_configuration()

        print(np.mean(timing_v_1))
        print(np.mean(timing_v_2))
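
# `get_values` is a project-local helper that is not shown in this listing. A plausible
# sketch of what it could look like, assuming hyperparameter names follow the usual
# "<step>:<parameter>" convention, is given below under a different name so it does not
# shadow the real helper; this is an illustration, not the actual implementation.
def get_values_sketch(config_dict, variable_pipeline_steps):
    # Keep only the hyperparameters that belong to one of the variable pipeline steps.
    return {key: value
            for key, value in config_dict.items()
            if key.split(":")[0] in variable_pipeline_steps}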


def run_experiment_check_configurations():
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(0, 20):
        print("Run: {}".format(i))
        # sample 1 start config

        sample_configs = config_space.sample_configuration(size=500)
        for config in sample_configs:
            config.get_dictionary()

        # version 1
        start_time = time.time()
        for config in sample_configs:
            config_space.check_configuration(config)
        timing_v_1.append(time.time() - start_time)

        # version 2
        # NOTE: this branch currently times the same check_configuration call as version 1
        print("VERSION2")

        sample_configs = config_space.sample_configuration(size=500)

        start_time = time.time()
        for config in sample_configs:
            config_space.check_configuration(config)
        timing_v_2.append(time.time() - start_time)

        print(np.mean(timing_v_1))
        print(np.mean(timing_v_2))


def run_experiment_get_one_exchange_neighbourhood():
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(0, 10):
        print("Run: {}".format(i))
        # sample 1 start config
        sample_configs = config_space.sample_configuration(size=1000)

        # version 1
        start_time = time.time()
        for config in sample_configs:
            get_one_exchange_neighbourhood(config, seed=1)
        timing_v_1.append(time.time() - start_time)

        # version 2
        print("VERSION2")
        sample_configs = config_space.sample_configuration(size=1000)

        start_time = time.time()
        for config in sample_configs:
            get_one_exchange_neighbourhood_vector_checking(config, seed=1)
        timing_v_2.append(time.time() - start_time)
        #print(len(new_configurations), len(new_configurations_2))

        print(np.mean(timing_v_1))
        print(np.mean(timing_v_2))
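
# If ConfigSpace's get_one_exchange_neighbourhood returns a generator (as it does in
# recent ConfigSpace releases), the timing loops above mostly measure generator creation.
# An illustrative helper (not part of the original source) that forces full enumeration:
def _time_full_neighbourhood(configs):
    start = time.time()
    for config in configs:
        list(get_one_exchange_neighbourhood(config, seed=1))
    return time.time() - start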


# Example #6
def run_experiment():
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    runhistory = PCRunHistory(average_cost)

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    args = {
        'cs': config_space,
        'run_obj': "quality",
        'runcount_limit': 100,
        'wallclock_limit': 100,
        'memory_limit': 100,
        'cutoff_time': 100,
        'deterministic': "true"
    }
    scenario = Scenario(args)

    # Build stats
    stats = Stats(scenario, output_dir=None, stamp="")

    types, bounds = get_types(scenario.cs, scenario.feature_array)

    model = RandomForestWithInstances(types=types, bounds=bounds)

    constant_pipeline_steps = [
        "one_hot_encoder", "imputation", "rescaling", "balancing",
        "feature_preprocessor"
    ]
    variable_pipeline_steps = ["classifier"]
    rng = np.random.RandomState()
    num_params = len(scenario.cs.get_hyperparameters())

    acquisition_func = EI(model)
    acq_func_wrapper = PCAquisitionFunctionWrapper(
        acquisition_func=acquisition_func,
        config_space=scenario.cs,
        runhistory=runhistory,
        constant_pipeline_steps=constant_pipeline_steps,
        variable_pipeline_steps=variable_pipeline_steps)
    runhistory2epm = RunHistory2EPM4Cost(scenario,
                                         num_params,
                                         success_states=[StatusType.SUCCESS])
    local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                               config_space=scenario.cs)
    select_configuration = SelectConfigurationsWithMarginalization(
        scenario=scenario,
        stats=stats,
        runhistory=runhistory,
        model=model,
        acq_optimizer=local_search,
        acquisition_func=acq_func_wrapper,
        rng=rng,
        constant_pipeline_steps=constant_pipeline_steps,
        variable_pipeline_steps=variable_pipeline_steps,
        num_marginalized_configurations_by_random_search=40,
        num_configs_for_marginalization=200)

    # Sample a few configurations and fill the runhistory with dummy results (cost=1, time=1)
    sample_configs = config_space.sample_configuration(size=10)
    for config in sample_configs:
        runhistory.add(config, 1, 1, StatusType.SUCCESS)

    # test select_configurations procedure
    X, Y = runhistory2epm.transform(runhistory)
    challengers = select_configuration.run(
        X,
        Y,
        sample_configs[0],
        num_configurations_by_random_search_sorted=100,
        num_configurations_by_local_search=10,
        random_leaf_size=1)

    print(challengers[0])
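
    # Optional sanity check (illustrative, not part of the original source): verify that
    # every proposed challenger is a valid member of the configuration space, mirroring
    # run_experiment_check_configurations above.
    for challenger in challengers:
        config_space.check_configuration(challenger)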


# Example #7
class Driver:
    def __init__(self, data_path, output_dir=None, pipeline_space_string=None):
        if data_path is None:
            current_directory = dirname(dirname(os.path.abspath(__file__)))
            self.data_path = os.path.join(current_directory, 'data/46_bac')
        else:
            self.data_path = data_path

        self.data_loader = DataLoader(self.data_path)

        self.pipeline_space = (self._build_pipeline_space()
                               if pipeline_space_string is None else
                               self._parse_pipeline_space(pipeline_space_string))
        self.cs_builder = ConfigSpaceBuilder(self.pipeline_space)
        self.config_space = self.cs_builder.build_config_space(
            dataset_properties=self.data_loader.info)

        self.output_dir = output_dir if output_dir else os.path.join(
            dirname(dirname(os.path.abspath(__file__))), 'output')
        try:
            if not os.path.exists(self.output_dir):
                os.makedirs(self.output_dir)
        except FileExistsError:
            pass

    def initialize(self, stamp, acq_func, double_intensification,
                   cache_directory, wallclock_limit, runcount_limit, cutoff,
                   memory_limit, downsampling, intensification_fold_size,
                   random_splitting_number, random_splitting_enabled):
        # Check if caching is enabled
        caching = acq_func.startswith("pc")

        # Make a cache directory
        if cache_directory is None:
            current_directory = dirname(dirname(os.path.abspath(__file__)))
            self.cache_directory = os.path.join(current_directory, 'cache')
        else:
            self.cache_directory = cache_directory

        # Check if cache_directory exists
        try:
            if not os.path.exists(self.cache_directory):
                os.makedirs(self.cache_directory)
        except FileExistsError:
            pass

        # Load data
        self.data = self.data_loader.get_data()

        # Build runhistory
        # TODO Does this work correctly for non-caching?
        runhistory = PCRunHistory(average_cost)

        # Setup statistics
        info = {
            'stamp': stamp,
            'caching': caching,
            'acquisition_function': acq_func,
            'cache_directory': self.cache_directory,
            'wallclock_limit': wallclock_limit,
            'downsampling': downsampling
        }

        self.statistics = Statistics(stamp,
                                     self.output_dir,
                                     information=info,
                                     total_runtime=wallclock_limit)

        # The pipeline parts that get marginalized
        constant_pipeline_steps = [
            "one_hot_encoder", "imputation", "rescaling", "balancing",
            "feature_preprocessor"
        ]

        variable_pipeline_steps = ["classifier"]

        # The pipeline parts that can get cached
        cached_pipeline_steps = [["one_hot_encoder", "imputation"],
                                 [
                                     "one_hot_encoder", "imputation",
                                     "rescaling", "balancing",
                                     "feature_preprocessor"
                                 ]]

        # Set cache directory
        if caching:
            pr = CachedPipelineRunner(
                self.data,
                self.data_loader.info,
                self.pipeline_space,
                runhistory,
                self.statistics,
                cached_pipeline_steps=cached_pipeline_steps,
                cache_directory=self.cache_directory,
                downsampling=downsampling,
                num_cross_validation_folds=intensification_fold_size)
        else:
            pr = PipelineRunner(
                self.data,
                self.data_loader.info,
                self.pipeline_space,
                runhistory,
                self.statistics,
                downsampling=downsampling,
                num_cross_validation_folds=intensification_fold_size)

        # Choose acquisition function
        if acq_func in [
                "eips", "pc-eips", "m-eips", "pc-m-eips", "pceips",
                "pc-m-pceips"
        ]:
            model_target_names = ['cost', 'time']
        elif acq_func in ["ei", "pc-ei", "m-ei", "pc-m-ei"]:
            model_target_names = ['cost']
        elif acq_func in ["roar", "pc-roar-mrs", "pc-roar-sigmoid-rs"]:
            model_target_names = []
        else:
            # Not a valid acquisition function
            raise ValueError("The provided acquisition function is not valid")

        trajectory_path = self.output_dir + "/logging/" + stamp  # + self.data_path.split("/")[-1] + "/" + str(stamp)
        if not os.path.exists(trajectory_path):
            os.makedirs(trajectory_path)
        self.trajectory_path_json = trajectory_path + "/traj_aclib2.json"
        self.trajectory_path_csv = trajectory_path + "/traj_old.csv"

        # Build scenario
        intensification_instances = ([[1]] if intensification_fold_size is None
                                     else [[i] for i in range(intensification_fold_size)])
        args = {
            'cs': self.config_space,
            'run_obj': "quality",
            'runcount_limit': runcount_limit,
            'wallclock_limit': wallclock_limit,
            'memory_limit': memory_limit,
            'cutoff_time': cutoff,
            'deterministic': "true",
            'abort_on_first_run_crash': "false",
            'instances': intensification_instances
        }
        scenario = Scenario(args)

        # Build stats
        stats = Stats(scenario,
                      output_dir=self.output_dir + "/smac/",
                      stamp=stamp)

        # Build tae runner
        tae_runner = ExecuteTAFuncDict(ta=pr.run,
                                       stats=stats,
                                       runhistory=runhistory,
                                       run_obj=scenario.run_obj,
                                       memory_limit=scenario.memory_limit)

        # Build SMBO object
        intensification_instances = ([1] if intensification_fold_size is None
                                     else list(range(intensification_fold_size)))

        smbo_builder = SMBOBuilder()
        self.smbo = smbo_builder.build_pc_smbo(
            tae_runner=tae_runner,
            stats=stats,
            scenario=scenario,
            runhistory=runhistory,
            aggregate_func=average_cost,
            acq_func_name=acq_func,
            model_target_names=model_target_names,
            logging_directory=trajectory_path,
            double_intensification=double_intensification,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps,
            cached_pipeline_steps=cached_pipeline_steps,
            intensification_instances=intensification_instances,
            num_marginalized_configurations_by_random_search=20,
            num_configs_for_marginalization=40,
            random_splitting_number=random_splitting_number,
            random_splitting_enabled=random_splitting_enabled)

    def run(self,
            stamp=None,
            acq_func="ei",
            double_intensification=False,
            wallclock_limit=3600,
            runcount_limit=10000,
            memory_limit=6000,
            cutoff=3600,
            cache_directory=None,
            downsampling=None,
            intensification_fold_size=None,
            random_splitting_number=5,
            random_splitting_enabled=False):

        # Resolve the stamp at call time: a time.time() default in the signature would be
        # evaluated only once, when the method is defined.
        if stamp is None:
            stamp = time.time()

        random_leaf_size = None

        # Initialize SMBO
        self.initialize(stamp=stamp,
                        acq_func=acq_func,
                        double_intensification=double_intensification,
                        cache_directory=cache_directory,
                        wallclock_limit=wallclock_limit,
                        runcount_limit=runcount_limit,
                        memory_limit=memory_limit,
                        cutoff=cutoff,
                        downsampling=downsampling,
                        intensification_fold_size=intensification_fold_size,
                        random_splitting_number=random_splitting_number,
                        random_splitting_enabled=random_splitting_enabled)

        # clean trajectory files
        self._clean_trajectory_files()

        # Start timer and clean statistics files
        self.statistics.start_timer()
        self.statistics.clean_files()

        # Run SMBO
        incumbent = self.smbo.run()

        # Save statistics
        # self.statistics.save()

        # Read trajectory files with incumbents and retrieve test performances
        self.trajectory = TrajLogger.read_traj_aclib_format(
            self.trajectory_path_json, self.config_space)
        trajectory = self.run_tests(self.trajectory, downsampling=downsampling)

        # Save new trajectory to output directory
        # First transform the configuration to a dictionary
        for traj in trajectory:
            traj['incumbent'] = traj['incumbent'].get_dictionary()
        self.statistics.add_incumbents_trajectory(trajectory)

        # Clean cache after running
        # shutil.rmtree(dir_name)

        return incumbent

    def run_tests(self, trajectory, downsampling=None):
        pt = PipelineTester(self.data,
                            self.data_loader.info,
                            self.pipeline_space,
                            downsampling=downsampling)

        for traj in trajectory:
            traj['test_performance'] = pt.get_error(traj['incumbent'])

        return trajectory

    #### INTERNAL METHODS ####

    def _clean_trajectory_files(self):
        # Truncate the trajectory files and close the handles explicitly
        open(self.trajectory_path_json, 'w').close()
        open(self.trajectory_path_csv, 'w').close()

    def _build_pipeline_space(self):
        ps = PipelineSpace()
        o_s = OneHotEncodingStep()
        i_s = ImputationStep()
        r_s = RescalingStep()
        b_s = BalancingStep()
        p_s = PreprocessingStep()
        c_s = ClassificationStep()
        ps.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])  #[p_s, c_s])
        return ps

    def _parse_pipeline_space(self, pipeline_space_string):
        preprocessor_names = pipeline_space_string.split("-")[0].split(",")
        classifier_names = pipeline_space_string.split("-")[1].split(",")

        pipeline_space_builder = PipelineSpaceBuilder()

        return pipeline_space_builder.build_pipeline_space(
            preprocessor_names=preprocessor_names,
            classifier_names=classifier_names)
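
# A minimal usage sketch (illustrative, not part of the original source): data_path=None
# falls back to the default data/46_bac layout, "pc-m-ei" is one of the acquisition
# function names accepted by initialize(), and the limits below are placeholder values.
def _demo_driver_run():
    driver = Driver(data_path=None)
    incumbent = driver.run(stamp="demo", acq_func="pc-m-ei",
                           wallclock_limit=600, runcount_limit=100)
    return incumbent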


def run_random_search(stamp,
                      data_path,
                      version,
                      wallclock_limit,
                      run_limit,
                      memory_limit,
                      cutoff,
                      splitting_number,
                      random_splitting_enabled,
                      seed=None,
                      output_dir=None,
                      cache_directory=None,
                      downsampling=None):
    # data set
    data_set = data_path.split("/")[-1]

    # cache directory (may be None when caching is not used)
    if cache_directory is not None:
        try:
            if not os.path.exists(cache_directory):
                os.makedirs(cache_directory)
        except FileExistsError:
            pass

    # output directory
    try:
        if output_dir is None:
            output_dir = os.path.dirname(
                os.path.abspath(__file__)) + "/results/"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    except FileExistsError:
        pass

    # load data
    data_loader = DataLoader(data_path)
    data = data_loader.get_data()
    dataset_properties = data_loader.info

    # Build pipeline space
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()  # PipelineStep(name='feature_preprocessor', nodes=[KernelPcaNode()])
    c_s = ClassificationStep()  # PipelineStep(name='classifier', nodes=[SGDNode()])
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    # Build configuration space
    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space(
        seed=seed, dataset_properties=dataset_properties)

    # Build statistics
    info = {
        'data_location': data_path,
        'stamp': stamp,
        'version': version,
        'wallclock_limit': wallclock_limit,
        'memory_limit': memory_limit,
        'cutoff': cutoff,
        'seed': seed,
        'downsampling': downsampling
    }
    statistics = Statistics(stamp,
                            output_dir,
                            information=info,
                            total_runtime=wallclock_limit,
                            run_limit=run_limit)
    statistics.clean_files()

    # The pipeline parts that can get cached
    cached_pipeline_steps = [[
        "one_hot_encoder", "imputation", "rescaling", "balancing",
        "feature_preprocessor"
    ]]

    num_cross_validation_folds = 10
    # Build pipeline runner
    if version == '2step':
        pipeline_runner = CachedPipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            cached_pipeline_steps=cached_pipeline_steps,
            statistics=statistics,
            cache_directory=cache_directory,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = TreeRandomSearch(
            config_space=config_space,
            pipeline_runner=pipeline_runner,
            wallclock_limit=wallclock_limit,
            memory_limit=memory_limit,
            statistics=statistics,
            constant_pipeline_steps=[
                "one_hot_encoder", "imputation", "rescaling", "balancing",
                "feature_preprocessor"
            ],
            variable_pipeline_steps=["classifier"],
            splitting_number=splitting_number,
            random_splitting_enabled=random_splitting_enabled)
    elif version == 'sigmoid':
        pipeline_runner = CachedPipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            cached_pipeline_steps=cached_pipeline_steps,
            statistics=statistics,
            cache_directory=cache_directory,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = SigmoidRandomSearch(
            config_space=config_space,
            pipeline_runner=pipeline_runner,
            wallclock_limit=wallclock_limit,
            memory_limit=memory_limit,
            statistics=statistics,
            constant_pipeline_steps=[
                "one_hot_encoder", "imputation", "rescaling", "balancing",
                "feature_preprocessor"
            ],
            variable_pipeline_steps=["classifier"],
            splitting_number=splitting_number,
            random_splitting_enabled=False)
    else:
        pipeline_runner = PipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            statistics=statistics,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = RandomSearch(config_space=config_space,
                                     pipeline_runner=pipeline_runner,
                                     wallclock_limit=wallclock_limit,
                                     memory_limit=memory_limit,
                                     statistics=statistics)

    # Run random search
    print("start random search")
    incumbent = random_search.run(cutoff=cutoff)
    print("... end random search")

    # test performance of incumbents
    incumbent_trajectory = statistics.get_incumbent_trajectory(
        config_space=config_space)
    trajectory = run_tests(data,
                           dataset_properties,
                           incumbent_trajectory,
                           pipeline_space,
                           downsampling=downsampling)
    print(trajectory)

    # Save new trajectory to output directory
    # First transform the configuration to a dictionary
    for traj in trajectory:
        traj['incumbent'] = traj['incumbent'].get_dictionary()
    statistics.add_incumbents_trajectory(trajectory)

    return incumbent
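
# Illustrative call sketch (not part of the original source): paths, limits, and the seed
# are placeholders; version selects TreeRandomSearch ('2step'), SigmoidRandomSearch
# ('sigmoid'), or the plain RandomSearch (any other value).
def _demo_run_random_search():
    return run_random_search(stamp="rs_demo",
                             data_path="data/46_bac",   # hypothetical data location
                             version="2step",
                             wallclock_limit=600,
                             run_limit=100,
                             memory_limit=6000,
                             cutoff=300,
                             splitting_number=5,
                             random_splitting_enabled=False,
                             seed=1,
                             output_dir=None,
                             cache_directory="cache")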


# Example #9
def run_experiment(data_id,
                   location,
                   output_dir,
                   prepr_name=None,
                   class_name=None,
                   nb_configs=100,
                   seed=None,
                   cache_directory=None,
                   downsampling=None):

    preprocessor_names = [
        'extra_rand_trees', 'fast_ica', 'feature_agglomeration', 'kernel_pca',
        'kitchen_sinks', 'linear_svm', 'no_preprocessing', 'nystroem_sampler',
        'pca', 'polynomial_features', 'rand_trees_embedding',
        'select_percentile', 'select_rates'
    ]
    class_names = [
        'adaboost', 'bernoulli_nb', 'decision_tree', 'extra_trees',
        'gaussian_nb', 'gradient_boosting', 'k_nearest_neighbors', 'lda',
        'liblinear_svc', 'libsvm_svc', 'multinomial_nb', 'passive_aggresive',
        'qda', 'random_forest', 'sgd'
    ]
    #preprocessor_names = ['extra_rand_trees']

    preprocessor_nodes = {
        'extra_rand_trees': ExtraTreesNode(),
        'fast_ica': FastICANode(),
        'feature_agglomeration': FeatureAgglomerationNode(),
        'kernel_pca': KernelPcaNode(),
        'kitchen_sinks': RandomKitchenSinksNode(),
        'linear_svm': LinearSVMNode(),
        'no_preprocessing': NoPreprocessingNode(),
        'nystroem_sampler': NystroemSamplerNode(),
        'pca': PcaNode(),
        'polynomial_features': PolynomialFeaturesNode(),
        'rand_trees_embedding': RandomTreesEmbeddingNode(),
        'select_percentile': SelectPercentileNode(),
        'select_rates': SelectRatesNode()
    }

    classifier_nodes = {
        'adaboost': AdaBoostNode(),
        'bernoulli_nb': BernoulliNBNode(),
        'decision_tree': DecisionTreeNode(),
        'extra_trees': ExtraTreesClassifierNode(),
        'gaussian_nb': GaussianNBNode(),
        'gradient_boosting': GradientBoostingNode(),
        'k_nearest_neighbors': KNearestNeighborsNode(),
        'lda': LDANode(),
        'liblinear_svc': LibLinear_SVC_Node(),
        'libsvm_svc': LibSVM_SVC_Node(),
        'multinomial_nb': MultinomialNBNode(),
        'passive_aggresive': PassiveAggresiveNode(),
        'qda': QDANode(),
        'random_forest': RandomForestNode(),
        'sgd': SGDNode()
    }

    if prepr_name is not None:
        prepr_nodes = [preprocessor_nodes[prepr_name]]
    else:
        prepr_nodes = [preprocessor_nodes[prepr] for prepr in preprocessor_names]
        prepr_name = 'all'

    if class_name is not None:
        class_nodes = [classifier_nodes[class_name]]
    else:
        class_nodes = [classifier_nodes[c_name] for c_name in class_names]
        class_name = 'all'

    # output directory
    if output_dir is None:
        output_dir = os.path.dirname(os.path.abspath(__file__)) + "/results/"

    # Build pipeline space
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PipelineStep(name='feature_preprocessor',
                       nodes=prepr_nodes,
                       caching=True)
    c_s = PipelineStep(name='classifier', nodes=class_nodes)
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    # Build configuration space
    cs_builder = ConfigSpaceBuilder(pipeline_space)
    #print("SEED: {}".format(seed)) if seed else print("NOT SEED: {}".format(seed))
    config_space = cs_builder.build_config_space(seed=seed)

    # Sample configurations from the configuration space
    # (sample_configuration returns a single Configuration when size == 1, so wrap it in a list)
    rand_configs = (config_space.sample_configuration(size=nb_configs)
                    if nb_configs > 1
                    else [config_space.sample_configuration(size=nb_configs)])

    # Run the random configurations (pipelines) on the data set
    data_path = os.path.join(location, str(data_id))
    data_set = data_path.split("/")[-1]
    output_dir = os.path.join(output_dir, data_set,
                              "{}_{}".format(prepr_name, class_name)) + "/"
    stamp = data_set + "_seed_" + str(seed)
    run_experiment_on_data(stamp=stamp,
                           data_path=data_path,
                           output_dir=output_dir,
                           pipeline_space=pipeline_space,
                           configs=rand_configs,
                           cache_directory=cache_directory,
                           downsampling=downsampling)
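
# Illustrative call sketch (not part of the original source): the data id and location are
# placeholders; prepr_name and class_name must be keys of the node dictionaries above,
# or None to include every preprocessor/classifier.
def _demo_run_experiment():
    run_experiment(data_id=46,
                   location="data",   # hypothetical dataset root
                   output_dir=None,
                   prepr_name="pca",
                   class_name="sgd",
                   nb_configs=10,
                   seed=1)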