def _build_pipeline_space(self):
    ps = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    ps.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])  # [p_s, c_s]
    return ps
def run_experiment_vector():
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(0, 10):
        print("Run: {}".format(i))

        # Sample 1 start config and a batch of configs to combine with it.
        start_config = config_space.sample_configuration(size=1)
        sample_configs = config_space.sample_configuration(size=1000)
        sample_configs_values = [get_values(evaluation_config.get_dictionary(), variable_pipeline_steps)
                                 for evaluation_config in sample_configs]

        # Version 1
        start_time = time.time()
        new_configurations = combine_configurations_batch_version1(
            config_space=config_space,
            start_config=start_config,
            complemented_configs_values=sample_configs_values,
            constant_pipeline_steps=constant_pipeline_steps)
        timing_v_1.append(time.time() - start_time)

        # Version 2
        print("VERSION2")
        start_config = config_space.sample_configuration(size=1)
        sample_configs = config_space.sample_configuration(size=1000)
        sample_configs_values = [get_values(evaluation_config.get_dictionary(), variable_pipeline_steps)
                                 for evaluation_config in sample_configs]
        start_time = time.time()
        new_configurations_2 = combine_configurations_batch_version2(
            config_space=config_space,
            start_config=start_config,
            complemented_configs_values=sample_configs_values,
            constant_pipeline_steps=constant_pipeline_steps)
        timing_v_2.append(time.time() - start_time)
        # print(len(new_configurations), len(new_configurations_2))

        # Check that the combined configurations are valid.
        for config in new_configurations_2:
            config.is_valid_configuration()

    print(np.mean(timing_v_1))
    print(np.mean(timing_v_2))
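# Note: get_values() is imported from elsewhere in this repository and is not defined
# in this section. The sketch below is only a hypothetical illustration of the behaviour
# the experiment above relies on: reducing a configuration dictionary to the
# hyperparameters that belong to the variable pipeline steps (here, "classifier").
# The real helper may differ in naming conventions and return type.
def _get_values_sketch(config_dict, pipeline_steps):
    # Keep only hyperparameters whose names start with one of the given
    # pipeline-step prefixes, e.g. "classifier:__choice__" for the "classifier" step.
    return {name: value
            for name, value in config_dict.items()
            if any(name.startswith(step) for step in pipeline_steps)}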
def run_experiment_sampling():
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(0, 1):
        print("Run: {}".format(i))

        # Version 1: time plain batch sampling.
        start_time = time.time()
        sample_configs = config_space.sample_configuration(size=1000)
        timing_v_1.append(time.time() - start_time)

        # Version 2 (currently disabled)
        print("VERSION2")
        # start_config = config_space.sample_configuration(size=1)
        # sample_configs = config_space.sample_configuration(size=2)
        # sample_configs_values = [get_values(evaluation_config.get_dictionary(), variable_pipeline_steps)
        #                          for evaluation_config in sample_configs]
        # start_time = time.time()
        # sample_configs_2 = config_space.sample_configuration_forbidden(size=500)
        # timing_v_2.append(time.time() - start_time)

        # invalid_configs = []
        # for config in sample_configs:
        #     try:
        #         config.is_valid_configuration()
        #     except ValueError as v:
        #         exc_info = sys.exc_info()
        #         # Display the *original* exception
        #         traceback.print_exception(*exc_info)
        #         del exc_info
        #         # invalid_configs.append(config)
        #         print("Config not valid: {}".format(config))
        # print("Nb of invalid configs: {}".format(len(invalid_configs)))

        # print(len(sample_configs), len(sample_configs_2))

    print(np.mean(timing_v_1))
def run_experiment_check_configurations():
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(0, 20):
        print("Run: {}".format(i))

        # Sample configurations and materialise their dictionaries up front.
        sample_configs = config_space.sample_configuration(size=500)
        for config in sample_configs:
            config.get_dictionary()

        # Version 1
        start_time = time.time()
        for config in sample_configs:
            config_space.check_configuration(config)
        timing_v_1.append(time.time() - start_time)

        # Version 2
        print("VERSION2")
        sample_configs = config_space.sample_configuration(size=500)
        start_time = time.time()
        for config in sample_configs:
            config_space.check_configuration(config)
        timing_v_2.append(time.time() - start_time)

    print(np.mean(timing_v_1))
    print(np.mean(timing_v_2))
def run_experiment_get_one_exchange_neighbourhood():
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    timing_v_1 = []
    timing_v_2 = []
    for i in range(0, 10):
        print("Run: {}".format(i))

        # Sample the configurations whose neighbourhoods will be computed.
        sample_configs = config_space.sample_configuration(size=1000)

        # Version 1
        start_time = time.time()
        for config in sample_configs:
            get_one_exchange_neighbourhood(config, seed=1)
        timing_v_1.append(time.time() - start_time)

        # Version 2
        print("VERSION2")
        sample_configs = config_space.sample_configuration(size=1000)
        start_time = time.time()
        for config in sample_configs:
            get_one_exchange_neighbourhood_vector_checking(config, seed=1)
        timing_v_2.append(time.time() - start_time)

    print(np.mean(timing_v_1))
    print(np.mean(timing_v_2))
def run_experiment():
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    runhistory = PCRunHistory(average_cost)

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    args = {
        'cs': config_space,
        'run_obj': "quality",
        'runcount_limit': 100,
        'wallclock_limit': 100,
        'memory_limit': 100,
        'cutoff_time': 100,
        'deterministic': "true"
    }
    scenario = Scenario(args)

    # Build stats
    stats = Stats(scenario, output_dir=None, stamp="")

    types, bounds = get_types(scenario.cs, scenario.feature_array)
    model = RandomForestWithInstances(types=types, bounds=bounds)

    constant_pipeline_steps = ["one_hot_encoder", "imputation", "rescaling",
                               "balancing", "feature_preprocessor"]
    variable_pipeline_steps = ["classifier"]
    rng = np.random.RandomState()
    num_params = len(scenario.cs.get_hyperparameters())

    acquisition_func = EI(model)
    acq_func_wrapper = PCAquisitionFunctionWrapper(
        acquisition_func=acquisition_func,
        config_space=scenario.cs,
        runhistory=runhistory,
        constant_pipeline_steps=constant_pipeline_steps,
        variable_pipeline_steps=variable_pipeline_steps)
    runhistory2epm = RunHistory2EPM4Cost(scenario, num_params,
                                         success_states=[StatusType.SUCCESS])
    local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                               config_space=scenario.cs)
    select_configuration = SelectConfigurationsWithMarginalization(
        scenario=scenario,
        stats=stats,
        runhistory=runhistory,
        model=model,
        acq_optimizer=local_search,
        acquisition_func=acq_func_wrapper,
        rng=rng,
        constant_pipeline_steps=constant_pipeline_steps,
        variable_pipeline_steps=variable_pipeline_steps,
        num_marginalized_configurations_by_random_search=40,
        num_configs_for_marginalization=200)

    # Sample configurations to fill the runhistory.
    sample_configs = config_space.sample_configuration(size=10)
    for config in sample_configs:
        runhistory.add(config, 1, 1, StatusType.SUCCESS)

    # Test the select_configurations procedure.
    X, Y = runhistory2epm.transform(runhistory)
    challengers = select_configuration.run(
        X, Y,
        sample_configs[0],
        num_configurations_by_random_search_sorted=100,
        num_configurations_by_local_search=10,
        random_leaf_size=1)
    print(challengers[0])
def run_random_search(stamp, data_path, version, wallclock_limit, run_limit,
                      memory_limit, cutoff, splitting_number,
                      random_splitting_enabled, seed=None, output_dir=None,
                      cache_directory=None, downsampling=None):
    # Data set
    data_set = data_path.split("/")[-1]

    # Cache directory
    try:
        if not os.path.exists(cache_directory):
            os.makedirs(cache_directory)
    except FileExistsError:
        pass

    # Output directory
    try:
        if output_dir is None:
            output_dir = os.path.dirname(os.path.abspath(__file__)) + "/results/"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    except FileExistsError:
        pass

    # Load data
    data_loader = DataLoader(data_path)
    data = data_loader.get_data()
    dataset_properties = data_loader.info

    # Build pipeline space
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()   # PipelineStep(name='feature_preprocessor', nodes=[KernelPcaNode()])
    c_s = ClassificationStep()  # PipelineStep(name='classifier', nodes=[SGDNode()])
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    # Build configuration space
    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space(seed=seed,
                                                 dataset_properties=dataset_properties)

    # Build statistics
    info = {
        'data_location': data_path,
        'stamp': stamp,
        'version': version,
        'wallclock_limit': wallclock_limit,
        'memory_limit': memory_limit,
        'cutoff': cutoff,
        'seed': seed,
        'downsampling': downsampling
    }
    statistics = Statistics(stamp,
                            output_dir,
                            information=info,
                            total_runtime=wallclock_limit,
                            run_limit=run_limit)
    statistics.clean_files()

    # The pipeline parts that can get cached
    cached_pipeline_steps = [["one_hot_encoder", "imputation", "rescaling",
                              "balancing", "feature_preprocessor"]]

    num_cross_validation_folds = 10

    # Build pipeline runner
    if version == '2step':
        pipeline_runner = CachedPipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            cached_pipeline_steps=cached_pipeline_steps,
            statistics=statistics,
            cache_directory=cache_directory,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = TreeRandomSearch(
            config_space=config_space,
            pipeline_runner=pipeline_runner,
            wallclock_limit=wallclock_limit,
            memory_limit=memory_limit,
            statistics=statistics,
            constant_pipeline_steps=["one_hot_encoder", "imputation",
                                     "rescaling", "balancing",
                                     "feature_preprocessor"],
            variable_pipeline_steps=["classifier"],
            splitting_number=splitting_number,
            random_splitting_enabled=random_splitting_enabled)
    elif version == 'sigmoid':
        pipeline_runner = CachedPipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            cached_pipeline_steps=cached_pipeline_steps,
            statistics=statistics,
            cache_directory=cache_directory,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = SigmoidRandomSearch(
            config_space=config_space,
            pipeline_runner=pipeline_runner,
            wallclock_limit=wallclock_limit,
            memory_limit=memory_limit,
            statistics=statistics,
            constant_pipeline_steps=["one_hot_encoder", "imputation",
                                     "rescaling", "balancing",
                                     "feature_preprocessor"],
            variable_pipeline_steps=["classifier"],
            splitting_number=splitting_number,
            random_splitting_enabled=False)
    else:
        pipeline_runner = PipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            statistics=statistics,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = RandomSearch(
            config_space=config_space,
            pipeline_runner=pipeline_runner,
            wallclock_limit=wallclock_limit,
            memory_limit=memory_limit,
            statistics=statistics)

    # Run random search
    print("start random search")
    incumbent = random_search.run(cutoff=cutoff)
    print("... end random search")

    # Test the performance of the incumbents
    incumbent_trajectory = statistics.get_incumbent_trajectory(config_space=config_space)
    trajectory = run_tests(data,
                           dataset_properties,
                           incumbent_trajectory,
                           pipeline_space,
                           downsampling=downsampling)
    print(trajectory)

    # Save the new trajectory to the output directory;
    # first convert each incumbent configuration to a dictionary.
    for traj in trajectory:
        traj['incumbent'] = traj['incumbent'].get_dictionary()
    statistics.add_incumbents_trajectory(trajectory)

    return incumbent
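# Hypothetical entry point (a minimal sketch, not confirmed by this section): one way
# the micro-benchmark functions above could be invoked when this file is run directly.
# The repository may instead drive these experiments from a separate script or CLI, and
# run_experiment()/run_random_search() need additional setup and arguments, so they are
# omitted here.
if __name__ == "__main__":
    run_experiment_vector()
    run_experiment_sampling()
    run_experiment_check_configurations()
    run_experiment_get_one_exchange_neighbourhood()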