Example #1
    def _build_pipeline_space(self):
        ps = PipelineSpace()
        o_s = OneHotEncodingStep()
        i_s = ImputationStep()
        r_s = RescalingStep()
        b_s = BalancingStep()
        p_s = PreprocessingStep()
        c_s = ClassificationStep()
        ps.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
        return ps
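# _build_pipeline_space above is a method excerpt (note `self`); it assembles
# the same six fixed steps that run_random_search below constructs inline:
# one-hot encoding, imputation, rescaling, balancing, feature preprocessing
# and classification.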
def run_random_search(stamp,
                      data_path,
                      version,
                      wallclock_limit,
                      run_limit,
                      memory_limit,
                      cutoff,
                      splitting_number,
                      random_splitting_enabled,
                      seed=None,
                      output_dir=None,
                      cache_directory=None,
                      downsampling=None):
    # data set
    data_set = data_path.split("/")[-1]

    # cache directory (may be None, in which case no cache directory is created)
    if cache_directory is not None and not os.path.exists(cache_directory):
        try:
            os.makedirs(cache_directory)
        except FileExistsError:
            # directory was created concurrently by another run
            pass

    # output directory
    if output_dir is None:
        output_dir = os.path.dirname(
            os.path.abspath(__file__)) + "/results/"
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except FileExistsError:
            pass

    # load data
    data_loader = DataLoader(data_path)
    data = data_loader.get_data()
    dataset_properties = data_loader.info

    # Build pipeline space
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    # Concrete step choices could also be given explicitly, e.g.
    # PipelineStep(name='feature_preprocessor', nodes=[KernelPcaNode()]) and
    # PipelineStep(name='classifier', nodes=[SGDNode()]).
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    # Build configuration space
    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space(
        seed=seed, dataset_properties=dataset_properties)
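    # The builder derives a single configuration space over the hyperparameters
    # of all pipeline steps; the dataset properties are passed along, presumably
    # so that components unsuitable for the data can be excluded (assumption
    # based on the argument name).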

    # Build statistics
    info = {
        'data_location': data_path,
        'stamp': stamp,
        'version': version,
        'wallclock_limit': wallclock_limit,
        'memory_limit': memory_limit,
        'cutoff': cutoff,
        'seed': seed,
        'downsampling': downsampling
    }
    statistics = Statistics(stamp,
                            output_dir,
                            information=info,
                            total_runtime=wallclock_limit,
                            run_limit=run_limit)
    statistics.clean_files()
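    # Statistics records the run metadata above and later receives the incumbent
    # trajectory; clean_files() appears to clear leftovers from a previous run
    # with the same stamp (assumption based on the method name).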

    # The pipeline parts that can get cached
    cached_pipeline_steps = [[
        "one_hot_encoder", "imputation", "rescaling", "balancing",
        "feature_preprocessor"
    ]]
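    # Everything up to and including the feature preprocessor is cached as one
    # group, so its transformed output can be reused while only the classifier
    # step varies (see constant_pipeline_steps / variable_pipeline_steps below).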

    num_cross_validation_folds = 10
    # Build pipeline runner
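    # Three variants: '2step' uses TreeRandomSearch and 'sigmoid' uses
    # SigmoidRandomSearch, both on top of a CachedPipelineRunner that reuses
    # the cached preprocessing steps; any other version string falls back to a
    # plain RandomSearch with an uncached PipelineRunner.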
    if version == '2step':
        pipeline_runner = CachedPipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            cached_pipeline_steps=cached_pipeline_steps,
            statistics=statistics,
            cache_directory=cache_directory,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = TreeRandomSearch(
            config_space=config_space,
            pipeline_runner=pipeline_runner,
            wallclock_limit=wallclock_limit,
            memory_limit=memory_limit,
            statistics=statistics,
            constant_pipeline_steps=[
                "one_hot_encoder", "imputation", "rescaling", "balancing",
                "feature_preprocessor"
            ],
            variable_pipeline_steps=["classifier"],
            splitting_number=splitting_number,
            random_splitting_enabled=random_splitting_enabled)
    elif version == 'sigmoid':
        pipeline_runner = CachedPipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            cached_pipeline_steps=cached_pipeline_steps,
            statistics=statistics,
            cache_directory=cache_directory,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = SigmoidRandomSearch(
            config_space=config_space,
            pipeline_runner=pipeline_runner,
            wallclock_limit=wallclock_limit,
            memory_limit=memory_limit,
            statistics=statistics,
            constant_pipeline_steps=[
                "one_hot_encoder", "imputation", "rescaling", "balancing",
                "feature_preprocessor"
            ],
            variable_pipeline_steps=["classifier"],
            splitting_number=splitting_number,
            random_splitting_enabled=False)
    else:
        pipeline_runner = PipelineRunner(
            data=data,
            data_info=dataset_properties,
            pipeline_space=pipeline_space,
            runhistory=None,
            statistics=statistics,
            downsampling=downsampling,
            num_cross_validation_folds=num_cross_validation_folds)
        random_search = RandomSearch(config_space=config_space,
                                     pipeline_runner=pipeline_runner,
                                     wallclock_limit=wallclock_limit,
                                     memory_limit=memory_limit,
                                     statistics=statistics)

    # Run random search
    print("start random search")
    incumbent = random_search.run(cutoff=cutoff)
    print("... end random search")

    # test performance of incumbents
    incumbent_trajectory = statistics.get_incumbent_trajectory(
        config_space=config_space)
    trajectory = run_tests(data,
                           dataset_properties,
                           incumbent_trajectory,
                           pipeline_space,
                           downsampling=downsampling)
    print(trajectory)

    # Save new trajectory to output directory
    # First transform the configuration to a dictionary
    for traj in trajectory:
        traj['incumbent'] = traj['incumbent'].get_dictionary()
    statistics.add_incumbents_trajectory(trajectory)

    return incumbent
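# A minimal invocation sketch for run_random_search. All values below
# (dataset path, limits, cache location) are illustrative placeholders and
# not taken from the original experiment scripts.
if __name__ == "__main__":
    incumbent = run_random_search(stamp="dataset_42_seed_1",
                                  data_path="/path/to/datasets/42",
                                  version="2step",
                                  wallclock_limit=3600,
                                  run_limit=1000,
                                  memory_limit=6000,
                                  cutoff=600,
                                  splitting_number=5,
                                  random_splitting_enabled=False,
                                  seed=1,
                                  cache_directory="/tmp/pipeline_cache",
                                  downsampling=None)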
Example #3
def run_experiment(data_id,
                   location,
                   output_dir,
                   prepr_name=None,
                   class_name=None,
                   nb_configs=100,
                   seed=None,
                   cache_directory=None,
                   downsampling=None):

    preprocessor_names = [
        'extra_rand_trees', 'fast_ica', 'feature_agglomeration', 'kernel_pca',
        'kitchen_sinks', 'linear_svm', 'no_preprocessing', 'nystroem_sampler',
        'pca', 'polynomial_features', 'rand_trees_embedding',
        'select_percentile', 'select_rates'
    ]
    class_names = [
        'adaboost', 'bernoulli_nb', 'decision_tree', 'extra_trees',
        'gaussian_nb', 'gradient_boosting', 'k_nearest_neighbors', 'lda',
        'liblinear_svc', 'libsvm_svc', 'multinomial_nb', 'passive_aggresive',
        'qda', 'random_forest', 'sgd'
    ]

    preprocessor_nodes = {
        'extra_rand_trees': ExtraTreesNode(),
        'fast_ica': FastICANode(),
        'feature_agglomeration': FeatureAgglomerationNode(),
        'kernel_pca': KernelPcaNode(),
        'kitchen_sinks': RandomKitchenSinksNode(),
        'linear_svm': LinearSVMNode(),
        'no_preprocessing': NoPreprocessingNode(),
        'nystroem_sampler': NystroemSamplerNode(),
        'pca': PcaNode(),
        'polynomial_features': PolynomialFeaturesNode(),
        'rand_trees_embedding': RandomTreesEmbeddingNode(),
        'select_percentile': SelectPercentileNode(),
        'select_rates': SelectRatesNode()
    }

    classifier_nodes = {
        'adaboost': AdaBoostNode(),
        'bernoulli_nb': BernoulliNBNode(),
        'decision_tree': DecisionTreeNode(),
        'extra_trees': ExtraTreesClassifierNode(),
        'gaussian_nb': GaussianNBNode(),
        'gradient_boosting': GradientBoostingNode(),
        'k_nearest_neighbors': KNearestNeighborsNode(),
        'lda': LDANode(),
        'liblinear_svc': LibLinear_SVC_Node(),
        'libsvm_svc': LibSVM_SVC_Node(),
        'multinomial_nb': MultinomialNBNode(),
        'passive_aggresive': PassiveAggresiveNode(),
        'qda': QDANode(),
        'random_forest': RandomForestNode(),
        'sgd': SGDNode()
    }
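    # The two lookup tables map command-line style component names to pipeline
    # nodes; if no explicit name is given, all available nodes for that step
    # are used (handled below).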

    if prepr_name is not None:
        prepr_nodes = [preprocessor_nodes[prepr_name]]
    else:
        prepr_nodes = []
        for prepr in preprocessor_names:
            prepr_nodes.append(preprocessor_nodes[prepr])
        prepr_name = 'all'

    if class_name is not None:
        class_nodes = [classifier_nodes[class_name]]
    else:
        class_nodes = []
        for c_name in class_names:
            class_nodes.append(classifier_nodes[c_name])
        class_name = 'all'

    # output directory
    if output_dir is None:
        output_dir = os.path.dirname(os.path.abspath(__file__)) + "/results/"

    # Build pipeline space
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PipelineStep(name='feature_preprocessor',
                       nodes=prepr_nodes,
                       caching=True)
    c_s = PipelineStep(name='classifier', nodes=class_nodes)
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])
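    # Unlike the fixed PreprocessingStep/ClassificationStep used in the random
    # search example, the preprocessor and classifier steps here are built from
    # the explicitly selected node lists, and caching=True marks the feature
    # preprocessor output as cacheable.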

    # Build configuration space
    cs_builder = ConfigSpaceBuilder(pipeline_space)
    #print("SEED: {}".format(seed)) if seed else print("NOT SEED: {}".format(seed))
    config_space = cs_builder.build_config_space(seed=seed)

    # Sample configurations from the configuration space. sample_configuration
    # returns a single Configuration when size is 1, so wrap that case in a
    # list to keep the downstream interface uniform.
    if nb_configs > 1:
        rand_configs = config_space.sample_configuration(size=nb_configs)
    else:
        rand_configs = [config_space.sample_configuration(size=nb_configs)]

    # Run the random configurations (i.e. candidate pipelines) on the data set
    if not location.endswith("/"):
        location = location + "/"
    data_path = location + str(data_id)
    data_set = data_path.split("/")[-1]
    if not output_dir.endswith("/"):
        output_dir = output_dir + "/"
    output_dir = output_dir + data_set + "/" + str(prepr_name) + "_" + str(class_name) + "/"
    stamp = data_set + "_seed_" + str(seed)
    run_experiment_on_data(stamp=stamp,
                           data_path=data_path,
                           output_dir=output_dir,
                           pipeline_space=pipeline_space,
                           configs=rand_configs,
                           cache_directory=cache_directory,
                           downsampling=downsampling)
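# A minimal invocation sketch for run_experiment. The data id, locations and
# component names below are illustrative placeholders, not values from the
# original experiments.
if __name__ == "__main__":
    run_experiment(data_id=42,
                   location="/path/to/datasets",
                   output_dir=None,
                   prepr_name="pca",
                   class_name="sgd",
                   nb_configs=100,
                   seed=1,
                   cache_directory="/tmp/pipeline_cache",
                   downsampling=None)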