def _build_pipeline_space(self):
    ps = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    ps.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
    return ps
def run_random_search(stamp, data_path, version, wallclock_limit, run_limit,
                      memory_limit, cutoff, splitting_number,
                      random_splitting_enabled, seed=None, output_dir=None,
                      cache_directory=None, downsampling=None):
    # Cache directory
    if cache_directory is not None:
        os.makedirs(cache_directory, exist_ok=True)

    # Output directory
    if output_dir is None:
        output_dir = os.path.dirname(os.path.abspath(__file__)) + "/results/"
    os.makedirs(output_dir, exist_ok=True)

    # Load data
    data_loader = DataLoader(data_path)
    data = data_loader.get_data()
    dataset_properties = data_loader.info

    # Build pipeline space
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    # Build configuration space
    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space(
        seed=seed, dataset_properties=dataset_properties)

    # Build statistics
    info = {
        'data_location': data_path,
        'stamp': stamp,
        'version': version,
        'wallclock_limit': wallclock_limit,
        'memory_limit': memory_limit,
        'cutoff': cutoff,
        'seed': seed,
        'downsampling': downsampling
    }
    statistics = Statistics(stamp,
                            output_dir,
                            information=info,
                            total_runtime=wallclock_limit,
                            run_limit=run_limit)
    statistics.clean_files()

    # The pipeline steps whose intermediate results can be cached
    cached_pipeline_steps = [[
        "one_hot_encoder", "imputation", "rescaling", "balancing",
        "feature_preprocessor"
    ]]

    num_cross_validation_folds = 10

    # Build the pipeline runner and search strategy for the requested version
    if version == '2step':
        pipeline_runner = CachedPipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            cached_pipeline_steps=cached_pipeline_steps,
            statistics=statistics,
            cache_directory=cache_directory,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = TreeRandomSearch(
            config_space=config_space,
            pipeline_runner=pipeline_runner,
            wallclock_limit=wallclock_limit,
            memory_limit=memory_limit,
            statistics=statistics,
            constant_pipeline_steps=[
                "one_hot_encoder", "imputation", "rescaling", "balancing",
                "feature_preprocessor"
            ],
            variable_pipeline_steps=["classifier"],
            splitting_number=splitting_number,
            random_splitting_enabled=random_splitting_enabled)
    elif version == 'sigmoid':
        pipeline_runner = CachedPipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            cached_pipeline_steps=cached_pipeline_steps,
            statistics=statistics,
            cache_directory=cache_directory,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        # The sigmoid variant always runs with random splitting disabled.
        random_search = SigmoidRandomSearch(
            config_space=config_space,
            pipeline_runner=pipeline_runner,
            wallclock_limit=wallclock_limit,
            memory_limit=memory_limit,
            statistics=statistics,
            constant_pipeline_steps=[
                "one_hot_encoder", "imputation", "rescaling", "balancing",
                "feature_preprocessor"
            ],
            variable_pipeline_steps=["classifier"],
            splitting_number=splitting_number,
            random_splitting_enabled=False)
    else:
        pipeline_runner = PipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            statistics=statistics,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = RandomSearch(config_space=config_space,
                                     pipeline_runner=pipeline_runner,
                                     wallclock_limit=wallclock_limit,
                                     memory_limit=memory_limit,
                                     statistics=statistics)

    # Run random search
    print("start random search")
    incumbent = random_search.run(cutoff=cutoff)
    print("... end random search")

    # Test the performance of the incumbents found during the search
    incumbent_trajectory = statistics.get_incumbent_trajectory(
        config_space=config_space)
    trajectory = run_tests(data,
                           dataset_properties,
                           incumbent_trajectory,
                           pipeline_space,
                           downsampling=downsampling)
    print(trajectory)

    # Save the new trajectory to the output directory. The configurations are
    # first transformed to dictionaries so they can be serialized.
    for traj in trajectory:
        traj['incumbent'] = traj['incumbent'].get_dictionary()
    statistics.add_incumbents_trajectory(trajectory)

    return incumbent
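# A minimal usage sketch for run_random_search (not part of the original
# script): the data path, budgets, and cache location below are hypothetical
# placeholders and would need to be adapted to a real setup.
#
#     run_random_search(stamp="38_seed_1",
#                       data_path="/data/datasets/38",
#                       version="2step",
#                       wallclock_limit=3600,     # seconds of search budget
#                       run_limit=1000,           # max evaluated configurations
#                       memory_limit=3072,        # MB per evaluation
#                       cutoff=600,               # seconds per pipeline run
#                       splitting_number=5,
#                       random_splitting_enabled=True,
#                       seed=1,
#                       cache_directory="/tmp/pipeline_cache")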
def run_experiment(data_id, location, output_dir, prepr_name=None,
                   class_name=None, nb_configs=100, seed=None,
                   cache_directory=None, downsampling=None):
    preprocessor_names = [
        'extra_rand_trees', 'fast_ica', 'feature_agglomeration', 'kernel_pca',
        'kitchen_sinks', 'linear_svm', 'no_preprocessing', 'nystroem_sampler',
        'pca', 'polynomial_features', 'rand_trees_embedding',
        'select_percentile', 'select_rates'
    ]
    class_names = [
        'adaboost', 'bernoulli_nb', 'decision_tree', 'extra_trees',
        'gaussian_nb', 'gradient_boosting', 'k_nearest_neighbors', 'lda',
        'liblinear_svc', 'libsvm_svc', 'multinomial_nb', 'passive_aggresive',
        'qda', 'random_forest', 'sgd'
    ]

    preprocessor_nodes = {
        'extra_rand_trees': ExtraTreesNode(),
        'fast_ica': FastICANode(),
        'feature_agglomeration': FeatureAgglomerationNode(),
        'kernel_pca': KernelPcaNode(),
        'kitchen_sinks': RandomKitchenSinksNode(),
        'linear_svm': LinearSVMNode(),
        'no_preprocessing': NoPreprocessingNode(),
        'nystroem_sampler': NystroemSamplerNode(),
        'pca': PcaNode(),
        'polynomial_features': PolynomialFeaturesNode(),
        'rand_trees_embedding': RandomTreesEmbeddingNode(),
        'select_percentile': SelectPercentileNode(),
        'select_rates': SelectRatesNode()
    }

    classifier_nodes = {
        'adaboost': AdaBoostNode(),
        'bernoulli_nb': BernoulliNBNode(),
        'decision_tree': DecisionTreeNode(),
        'extra_trees': ExtraTreesClassifierNode(),
        'gaussian_nb': GaussianNBNode(),
        'gradient_boosting': GradientBoostingNode(),
        'k_nearest_neighbors': KNearestNeighborsNode(),
        'lda': LDANode(),
        'liblinear_svc': LibLinear_SVC_Node(),
        'libsvm_svc': LibSVM_SVC_Node(),
        'multinomial_nb': MultinomialNBNode(),
        'passive_aggresive': PassiveAggresiveNode(),
        'qda': QDANode(),
        'random_forest': RandomForestNode(),
        'sgd': SGDNode()
    }

    # Select a single preprocessor/classifier, or fall back to all of them.
    if prepr_name is not None:
        prepr_nodes = [preprocessor_nodes[prepr_name]]
    else:
        prepr_nodes = [preprocessor_nodes[prepr] for prepr in preprocessor_names]
        prepr_name = 'all'

    if class_name is not None:
        class_nodes = [classifier_nodes[class_name]]
    else:
        class_nodes = [classifier_nodes[c_name] for c_name in class_names]
        class_name = 'all'

    # Output directory
    if output_dir is None:
        output_dir = os.path.dirname(os.path.abspath(__file__)) + "/results/"

    # Build pipeline space
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PipelineStep(name='feature_preprocessor',
                       nodes=prepr_nodes,
                       caching=True)
    c_s = PipelineStep(name='classifier', nodes=class_nodes)
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    # Build configuration space
    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space(seed=seed)

    # Sample configurations from the configuration space. ConfigSpace returns
    # a single Configuration instead of a list when size is 1, so wrap it.
    sampled = config_space.sample_configuration(size=nb_configs)
    rand_configs = sampled if nb_configs > 1 else [sampled]

    # Run the random configurations (= pipelines) on the data set
    data_path = location + str(data_id) if location.endswith("/") \
        else location + "/" + str(data_id)
    data_set = data_path.split("/")[-1]
    if not output_dir.endswith("/"):
        output_dir += "/"
    output_dir += data_set + "/" + str(prepr_name) + "_" + str(class_name) + "/"
    stamp = data_set + "_seed_" + str(seed)

    run_experiment_on_data(stamp=stamp,
                           data_path=data_path,
                           output_dir=output_dir,
                           pipeline_space=pipeline_space,
                           configs=rand_configs,
                           cache_directory=cache_directory,
                           downsampling=downsampling)
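# Minimal entry-point sketch (an assumption, not part of the original module):
# the data id and location are hypothetical placeholders. Leaving prepr_name
# or class_name as None makes run_experiment fall back to the full node lists.
if __name__ == "__main__":
    run_experiment(data_id=38,              # hypothetical data set id
                   location="/data/datasets",
                   output_dir=None,         # defaults to <module dir>/results/
                   prepr_name="pca",
                   class_name="sgd",
                   nb_configs=100,
                   seed=1)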