def write_to_database(scenario: ASlibScenario, approach, fold: int, on_training=False):
    metrics = list()
    metrics.append(Par10Metric())
    metrics.append(NumberUnsolvedInstances(False))
    metrics.append(NumberUnsolvedInstances(True))

    scenario_name = scenario.scenario
    scenario = ASlibScenario()
    if scenario_name == 'GLUHACK-18':
        scenario_name = 'GLUHACK-2018'
    scenario.read_scenario('data/aslib_data-master/' + scenario_name)

    metric_results = _evaluate_train_test_split_mod(scenario, approach, metrics, fold, on_training)

    db_config = load_configuration()
    for i, result in enumerate(metric_results):
        if on_training:
            name = 'training_' + approach.get_name()
            publish_results_to_database(db_config, scenario.scenario, fold, name,
                                        metrics[i].get_name(), result)
        else:
            publish_results_to_database(db_config, scenario.scenario, fold, approach.get_name(),
                                        metrics[i].get_name(), result)
def _transform_aslib_scenario_to_kebi_format(self, scenario_folder_path):
    # read scenario
    scenario = ASlibScenario()
    scenario.logger.disabled = True
    scenario.read_scenario(dn=str(scenario_folder_path))

    # prepare performance data and ranking data in the XY_concatination DataFrame
    X = scenario.feature_data
    Y = self._performances_to_rankings(scenario)
    X, Y = self._adapt_column_names_according_to_the_output_format(X, Y)
    XY_concatination = pd.concat([X, Y], axis=1, join_axes=[X.index])

    # save in CSV file
    output_file_path = os.path.join(str(self.absolute_path_output_folder), scenario.scenario + ".csv")
    XY_concatination.to_csv(output_file_path,
                            sep=self.separator,
                            encoding='UTF-8',
                            index=False,
                            float_format='%g',
                            na_rep=self.replacement_string_null_feature_values)

    # post step: add column types and an empty line according to the KEBI format to the exported csv file
    self._add_value_type_column_name_line_in_kebi_formatted_csv(output_file_path, X.columns, Y.columns)

    return scenario
def evaluate_scenario(scenario_name: str, approach, metrics,
                      amount_of_training_scenario_instances: int, fold: int,
                      db_config, tune_hyperparameters: bool):
    scenario = ASlibScenario()
    scenario.read_scenario('data/aslib_data-master/' + scenario_name)
    print_stats_of_scenario(scenario)
    evaluate(scenario, approach, metrics, amount_of_training_scenario_instances,
             fold, db_config, tune_hyperparameters)
    return scenario_name
def read_scenario_ASlib(self, scenario_dn: str):
    '''
        Read scenario from ASlib format

        Arguments
        ---------
        scenario_dn: str
            scenario directory name
    '''
    self.scenario = ASlibScenario()
    self.scenario.read_scenario(dn=scenario_dn)
def evaluate_scenario(scenario_name: str, approach, metrics,
                      amount_of_training_scenario_instances: int, fold: int,
                      db_config, tune_hyperparameters: bool):
    scenario = ASlibScenario()
    scenario.read_scenario('data/aslib_data-master/' + scenario_name)
    if scenario_name in ['OPENML-WEKA-2017', 'TTP-2016']:
        metrics = list()
        metrics.append(PerformanceMetric())
    evaluate(scenario, approach, metrics, amount_of_training_scenario_instances,
             fold, db_config, tune_hyperparameters)
    return scenario_name
def _save_model(self, out_fn: str, scenario: ASlibScenario, feature_pre_pipeline: list,
                pre_solver: Aspeed, selector, config: Configuration):
    '''
        save all pipeline objects for predictions

        Arguments
        ---------
        out_fn: str
            filename of output file
        scenario: ASlibScenario
            ASlib scenario with all the data
        feature_pre_pipeline: list
            list of preprocessing objects
        pre_solver: Aspeed
            aspeed object with pre-solving schedule
        selector: autofolio.selector.*
            fitted selector object
        config: Configuration
            parameter setting configuration
    '''
    scenario.logger = None
    for fpp in feature_pre_pipeline:
        fpp.logger = None
    if pre_solver:
        pre_solver.logger = None
    selector.logger = None
    model = [scenario, feature_pre_pipeline, pre_solver, selector, config]
    with open(out_fn, "bw") as fp:
        pickle.dump(model, fp)
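# The file written by _save_model() above is a pickled list of five objects:
# [scenario, feature_pre_pipeline, pre_solver, selector, config]. The sketch below
# is not part of the original code; it only illustrates how such a file could be
# unpickled again (the project itself exposes read_model_and_predict() for actual use).
import pickle


def _load_model_sketch(model_fn: str):
    """Hypothetical helper: unpickle the five pipeline objects saved by _save_model()."""
    with open(model_fn, "rb") as fp:
        scenario, feature_pre_pipeline, pre_solver, selector, config = pickle.load(fp)
    return scenario, feature_pre_pipeline, pre_solver, selector, config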
def transform(self, scenario: ASlibScenario):
    '''
        transform ASLib scenario data

        Arguments
        ---------
        scenario: data.aslib_scenario.ASlibScenario
            ASlib Scenario with all data in pandas

        Returns
        -------
        data.aslib_scenario.ASlibScenario
    '''
    if self.scaler:
        self.logger.debug("Applying StandardScaler")
        values = self.scaler.transform(np.array(scenario.feature_data.values))

        scenario.feature_data = pd.DataFrame(
            data=values,
            index=scenario.feature_data.index,
            columns=scenario.feature_data.columns)

    return scenario
def read_scenario_CSV(self, csv_data: namedtuple):
    '''
        Read scenario from CSV files

        Arguments
        ---------
        csv_data: namedtuple
            namedtuple with the following fields: "perf_csv", "feat_csv",
            "obj", "cutoff", "maximize", "cv_csv" ("cv_csv" can be None)
    '''
    self.scenario = ASlibScenario()
    self.scenario.read_from_csv(perf_fn=csv_data.perf_csv,
                                feat_fn=csv_data.feat_csv,
                                objective=csv_data.obj,
                                runtime_cutoff=csv_data.cutoff,
                                maximize=csv_data.maximize,
                                cv_fn=csv_data.cv_csv)
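# read_scenario_CSV() above expects a namedtuple with the six fields listed in its
# docstring. A minimal usage sketch; the namedtuple name, file names, and the
# 'reader' object are illustrative assumptions, not part of the original code.
from collections import namedtuple

CSVData = namedtuple("CSVData", ["perf_csv", "feat_csv", "obj", "cutoff", "maximize", "cv_csv"])

csv_data = CSVData(perf_csv="perf.csv", feat_csv="features.csv",
                   obj="runtime", cutoff=3600, maximize=False, cv_csv=None)
# reader.read_scenario_CSV(csv_data)  # 'reader' is whatever object defines the method above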
def __init__(self, path):
    # read the parts of the aslib scenario which are present; this is adapted from
    # the example here (in the predict method):
    #
    # https://github.com/mlindauer/OASC_starterkit/blob/master/oasc_starterkit/single_best.py
    scenario = ASlibScenario()
    scenario.read_description(fn=os.path.join(path, "description.txt"))
    scenario.read_feature_values(fn=os.path.join(path, "feature_values.arff"))
    scenario.read_feature_runstatus(fn=os.path.join(path, "feature_runstatus.arff"))

    scenario.instances = scenario.feature_data.index
    self.scenario = scenario
def __init__(self, perf_fn: str, feat_fn: str, objective: str = "solution_quality",
             runtime_cutoff: float = None, maximize: bool = True,
             cv_fn: str = None, seed: int = 12345):
    """ Constructor """
    self.scenario = ASlibScenario()
    self.scenario.read_from_csv(perf_fn=perf_fn,
                                feat_fn=feat_fn,
                                objective=objective,
                                runtime_cutoff=runtime_cutoff,
                                maximize=maximize,
                                cv_fn=cv_fn)
    self.seed = seed
    self.af = AutoFolio(random_seed=seed)
    self.logger = logging.getLogger("AF Facade")
def fit(self, scenario: ASlibScenario, fold: int, amount_of_training_instances: int):
    print("Run fit on " + self.get_name() + " for fold " + str(fold))
    self.num_algorithms = len(scenario.algorithms)

    # create all bootstrap samples
    bootstrap_samples, out_of_sample_samples = self.generate_bootstrap_sample(
        scenario, fold, self.num_base_learner)

    weights_denorm = list()

    # train each base learner on a different sample
    for index in range(self.num_base_learner):
        self.current_iteration = index + 1
        self.base_learners.append(copy.deepcopy(self.base_learner))
        original_scenario = copy.deepcopy(scenario)
        scenario.feature_data, scenario.performance_data, scenario.runstatus_data, \
            scenario.feature_runstatus_data, scenario.feature_cost_data = bootstrap_samples[index]
        self.base_learners[index].fit(scenario, fold, amount_of_training_instances)

        if self.weighting:
            if self.weight_type == 'oos':
                scenario.feature_data, scenario.performance_data, scenario.runstatus_data, \
                    scenario.feature_runstatus_data, scenario.feature_cost_data = out_of_sample_samples[index]
            elif self.weight_type == 'original_set':
                scenario = original_scenario
            weights_denorm.append(
                base_learner_performance(scenario, len(scenario.feature_data),
                                         self.base_learners[index]))

        # if self.current_iteration != self.num_base_learner:
        #     write_to_database(scenario, self, fold)

    # turn around values (lowest, i.e. best, gets the highest weight) and normalize
    weights_denorm = [max(weights_denorm) / float(i + 1) for i in weights_denorm]
    self.weights = [float(i) / max(weights_denorm) for i in weights_denorm]
def _outer_cv(solver_fold, args, config):
    solver, fold = solver_fold

    # there are problems serializing the aslib scenario, so just read it again
    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    msg = "Solver: {}, Fold: {}".format(solver, fold)
    logger.info(msg)

    msg = "Constructing template pipeline"
    logger.info(msg)
    pipeline = _get_pipeline(args, config, scenario)

    msg = "Extracting solver and fold performance data"
    logger.info(msg)
    testing, training = scenario.get_split(fold)
    X_train = training.feature_data
    y_train = training.performance_data[solver].values

    if 'log_performance_data' in config:
        y_train = np.log1p(y_train)

    msg = "Fitting the pipeline"
    logger.info(msg)
    pipeline = pipeline.fit(X_train, y_train)

    out = string.Template(args.out)
    out = out.substitute(solver=solver, fold=fold)

    msg = "Writing fit pipeline to disk: {}".format(out)
    logger.info(msg)
    joblib.dump(pipeline, out)

    return pipeline
def split_scenario(scenario: ASlibScenario, sub_fold: int, num_instances: int):
    fold_len = int(num_instances / 10)
    instances = scenario.instances
    if sub_fold < 10:
        test_insts = instances[(sub_fold - 1) * fold_len:sub_fold * fold_len]
        training_insts = instances[:(sub_fold - 1) * fold_len]
        training_insts = np.append(training_insts, instances[sub_fold * fold_len:])
    else:
        test_insts = instances[(sub_fold - 1) * fold_len:]
        training_insts = instances[:(sub_fold - 1) * fold_len]

    test = copy.copy(scenario)
    training = copy.copy(scenario)

    # feature_data
    test.feature_data = test.feature_data.drop(training_insts).sort_index()
    training.feature_data = training.feature_data.drop(test_insts).sort_index()

    # performance_data
    test.performance_data = test.performance_data.drop(training_insts).sort_index()
    training.performance_data = training.performance_data.drop(test_insts).sort_index()

    # runstatus_data
    test.runstatus_data = test.runstatus_data.drop(training_insts).sort_index()
    training.runstatus_data = training.runstatus_data.drop(test_insts).sort_index()

    # feature_runstatus_data
    test.feature_runstatus_data = test.feature_runstatus_data.drop(training_insts).sort_index()
    training.feature_runstatus_data = training.feature_runstatus_data.drop(test_insts).sort_index()

    # feature_cost_data
    if scenario.feature_cost_data is not None:
        test.feature_cost_data = test.feature_cost_data.drop(training_insts).sort_index()
        training.feature_cost_data = training.feature_cost_data.drop(test_insts).sort_index()

    # ground_truth_data
    if scenario.ground_truth_data is not None:
        test.ground_truth_data = test.ground_truth_data.drop(training_insts).sort_index()
        training.ground_truth_data = training.ground_truth_data.drop(test_insts).sort_index()

    test.cv_data = None
    training.cv_data = None

    test.instances = test_insts
    training.instances = training_insts

    scenario.used_feature_groups = None

    return test, training
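# split_scenario() above yields a (test, training) pair for an inner 10-fold split,
# where sub_fold runs from 1 to 10 and the tenth fold absorbs any remainder. A minimal
# usage sketch; the helper name iterate_inner_folds is an illustrative assumption, but
# the loop mirrors the pattern used by the ensemble fit() further below.
def iterate_inner_folds(scenario):
    num_instances = len(scenario.instances)
    for sub_fold in range(1, 11):
        test_scenario, training_scenario = split_scenario(scenario, sub_fold, num_instances)
        # train a base learner on training_scenario and predict on test_scenario here
        yield test_scenario, training_scenario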
def _evaluate_train_test_split_mod(scenario: ASlibScenario, approach, metrics, fold: int, on_training):
    test_scenario, train_scenario = scenario.get_split(indx=fold)
    if on_training:
        test_scenario = train_scenario

    approach_metric_values = np.zeros(len(metrics))

    num_counted_test_values = 0

    feature_data = test_scenario.feature_data.to_numpy()
    performance_data = test_scenario.performance_data.to_numpy()
    feature_cost_data = test_scenario.feature_cost_data.to_numpy() \
        if test_scenario.feature_cost_data is not None else None

    for instance_id in range(0, len(test_scenario.instances)):
        X_test = feature_data[instance_id]
        y_test = performance_data[instance_id]

        accumulated_feature_time = 0
        if test_scenario.feature_cost_data is not None \
                and approach.get_name() != 'sbs' and approach.get_name() != 'oracle':
            feature_time = feature_cost_data[instance_id]
            accumulated_feature_time = np.sum(feature_time)

        contains_non_censored_value = False
        for y_element in y_test:
            if y_element < test_scenario.algorithm_cutoff_time:
                contains_non_censored_value = True
        if contains_non_censored_value:
            num_counted_test_values += 1
            predicted_scores = approach.predict(X_test, instance_id)
            for i, metric in enumerate(metrics):
                runtime = metric.evaluate(y_test, predicted_scores, accumulated_feature_time,
                                          scenario.algorithm_cutoff_time)
                approach_metric_values[i] = approach_metric_values[i] + runtime

    approach_metric_values = np.true_divide(approach_metric_values, num_counted_test_values)

    print('PAR10: {0:.10f}'.format(approach_metric_values[0]))

    return approach_metric_values
def get_par10(self, scenario: ASlibScenario, fold: int):
    metrics = list()
    metrics.append(Par10Metric())

    test_scenario, train_scenario = scenario.get_split(indx=fold)
    approach_metric_values = np.zeros(len(metrics))

    num_counted_test_values = 0

    feature_data = train_scenario.feature_data.to_numpy()
    performance_data = train_scenario.performance_data.to_numpy()
    feature_cost_data = train_scenario.feature_cost_data.to_numpy() \
        if train_scenario.feature_cost_data is not None else None

    for instance_id in range(0, len(train_scenario.instances)):
        X_test = feature_data[instance_id]
        y_test = performance_data[instance_id]

        accumulated_feature_time = 0
        if train_scenario.feature_cost_data is not None \
                and self.get_name() != 'sbs' and self.get_name() != 'oracle':
            feature_time = feature_cost_data[instance_id]
            accumulated_feature_time = np.sum(feature_time)

        contains_non_censored_value = False
        for y_element in y_test:
            if y_element < train_scenario.algorithm_cutoff_time:
                contains_non_censored_value = True
        if contains_non_censored_value:
            num_counted_test_values += 1
            predicted_scores = self.predict(X_test, instance_id, opt=True)
            for i, metric in enumerate(metrics):
                runtime = metric.evaluate(y_test, predicted_scores, accumulated_feature_time,
                                          scenario.algorithm_cutoff_time)
                approach_metric_values[i] = approach_metric_values[i] + runtime

    approach_metric_values = np.true_divide(approach_metric_values, num_counted_test_values)

    return approach_metric_values
def run_fold(self, config: Configuration, scenario: ASlibScenario, fold: int):
    '''
        run a given fold of cross validation

        Arguments
        ---------
        scenario: aslib_scenario.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration to use for preprocessing
        fold: int
            fold id

        Returns
        -------
        Stats()
    '''
    self.logger.info("CV-Iteration: %d" % (fold))
    test_scenario, training_scenario = scenario.get_split(indx=fold)

    feature_pre_pipeline, pre_solver, selector = self.fit(
        scenario=training_scenario, config=config)

    schedules = self.predict(test_scenario, config, feature_pre_pipeline, pre_solver, selector)

    val = Validator()
    if scenario.performance_type[0] == "runtime":
        stats = val.validate_runtime(schedules=schedules, test_scenario=test_scenario)
    elif scenario.performance_type[0] == "solution_quality":
        stats = val.validate_quality(schedules=schedules, test_scenario=test_scenario)
    else:
        raise ValueError("Unknown: %s" % (scenario.performance_type[0]))

    return stats
def transform(self, scenario: ASlibScenario):
    '''
        transform ASLib scenario data

        Arguments
        ---------
        scenario: data.aslib_scenario.ASlibScenario
            ASlib Scenario with all data in pandas

        Returns
        -------
        data.aslib_scenario.ASlibScenario
    '''
    self.logger.debug("Impute Missing Feature Values")

    values = self.imputer.transform(np.array(scenario.feature_data.values))

    scenario.feature_data = pd.DataFrame(
        data=values,
        index=scenario.feature_data.index,
        columns=scenario.feature_data.columns)

    return scenario
def transform(self, scenario: ASlibScenario):
    '''
        transform ASLib scenario data

        Arguments
        ---------
        scenario: data.aslib_scenario.ASlibScenario
            ASlib Scenario with all data in pandas

        Returns
        -------
        data.aslib_scenario.ASlibScenario
    '''
    if self.pca:
        self.logger.debug("Applying PCA")
        values = self.pca.transform(np.array(scenario.feature_data.values))

        scenario.feature_data = pd.DataFrame(
            data=values,
            index=scenario.feature_data.index,
            columns=["f%d" % (i) for i in range(values.shape[1])])

    return scenario
def run_cli(self):
    '''
        main method of AutoFolio based on command line interface
    '''
    cmd_parser = CMDParser()
    args_, self.overwrite_args = cmd_parser.parse()

    self._root_logger.setLevel(args_.verbose)

    if args_.load:
        self.read_model_and_predict(
            model_fn=args_.load, feature_vec=list(map(float, args_.feature_vec)))
    else:
        scenario = ASlibScenario()
        if args_.scenario:
            scenario.read_scenario(args_.scenario)
        elif args_.performance_csv and args_.feature_csv:
            scenario.read_from_csv(perf_fn=args_.performance_csv,
                                   feat_fn=args_.feature_csv,
                                   objective=args_.objective,
                                   runtime_cutoff=args_.runtime_cutoff,
                                   maximize=args_.maximize,
                                   cv_fn=args_.cv_csv)
        else:
            raise ValueError("Missing inputs to read scenario data.")

        self.cs = self.get_cs(scenario)

        if args_.tune:
            config = self.get_tuned_config(scenario)
        else:
            config = self.cs.get_default_configuration()
        self.logger.debug(config)

        if args_.save:
            feature_pre_pipeline, pre_solver, selector = self.fit(
                scenario=scenario, config=config)
            self._save_model(
                args_.save, scenario, feature_pre_pipeline, pre_solver, selector, config)
        else:
            self.run_cv(config=config, scenario=scenario, folds=scenario.cv_data.max().max())
def run_cli(self):
    '''
        main method of AutoFolio based on command line interface
    '''
    cmd_parser = CMDParser()
    args_, self.overwrite_args = cmd_parser.parse()

    self._root_logger.setLevel(args_.verbose)

    if args_.load:
        pred = self.read_model_and_predict(
            model_fn=args_.load, feature_vec=list(map(float, args_.feature_vec.split(" "))))
        print("Selected Schedule [(algorithm, budget)]: %s" % (pred))
    else:
        scenario = ASlibScenario()
        if args_.scenario:
            scenario.read_scenario(args_.scenario)
        elif args_.performance_csv and args_.feature_csv:
            scenario.read_from_csv(perf_fn=args_.performance_csv,
                                   feat_fn=args_.feature_csv,
                                   objective=args_.objective,
                                   runtime_cutoff=args_.runtime_cutoff,
                                   maximize=args_.maximize,
                                   cv_fn=args_.cv_csv)
        else:
            raise ValueError("Missing inputs to read scenario data.")

        test_scenario = None
        if args_.performance_test_csv and args_.feature_test_csv:
            test_scenario = ASlibScenario()
            test_scenario.read_from_csv(perf_fn=args_.performance_test_csv,
                                        feat_fn=args_.feature_test_csv,
                                        objective=args_.objective,
                                        runtime_cutoff=args_.runtime_cutoff,
                                        maximize=args_.maximize,
                                        cv_fn=None)

        config = {}
        if args_.config is not None:
            self.logger.info("Reading yaml config file")
            config = yaml.load(open(args_.config))
        if not config.get("wallclock_limit"):
            config["wallclock_limit"] = args_.wallclock_limit
        if not config.get("runcount_limit"):
            config["runcount_limit"] = args_.runcount_limit
        if not config.get("output-dir"):
            config["output-dir"] = args_.output_dir

        self.cs = self.get_cs(scenario, config)

        if args_.outer_cv:
            self._outer_cv(scenario, config, args_.outer_cv_fold,
                           args_.out_template, smac_seed=args_.smac_seed)
            return 0

        if args_.tune:
            config = self.get_tuned_config(scenario,
                                           wallclock_limit=args_.wallclock_limit,
                                           runcount_limit=args_.runcount_limit,
                                           autofolio_config=config,
                                           seed=args_.smac_seed)
        else:
            config = self.cs.get_default_configuration()
        self.logger.debug(config)

        if args_.save:
            feature_pre_pipeline, pre_solver, selector = self.fit(
                scenario=scenario, config=config)
            self._save_model(
                args_.save, scenario, feature_pre_pipeline, pre_solver, selector, config)
        else:
            self.run_cv(config=config, scenario=scenario,
                        folds=int(scenario.cv_data.max().max()))

        if test_scenario is not None:
            stats = self.run_fold(config=config, fold=0, return_fit=False,
                                  scenario=scenario, test_scenario=test_scenario)
def run_fold(self, config: Configuration, scenario: ASlibScenario, fold: int,
             test_scenario=None, return_fit: bool = False):
    '''
        run a given fold of cross validation

        Arguments
        ---------
        scenario: aslib_scenario.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration to use for preprocessing
        fold: int
            fold id
        test_scenario: aslib_scenario.aslib_scenario.ASlibScenario
            aslib scenario with test data for validation;
            generated from <scenario> if None
        return_fit: bool
            optionally, the learned preprocessing options, presolver and
            selector can be returned

        Returns
        -------
        Stats()
        (pre_pipeline, pre_solver, selector):
            only present if return_fit is True;
            the pipeline components fit with the configuration options
        schedule: dict of string -> list of (solver, cutoff) pairs
            only present if return_fit is True;
            the solver choices for each instance
    '''
    if test_scenario is None:
        self.logger.info("CV-Iteration: %d" % (fold))
        test_scenario, training_scenario = scenario.get_split(indx=fold)
    else:
        self.logger.info("Validation on test data")
        training_scenario = scenario

    feature_pre_pipeline, pre_solver, selector = self.fit(
        scenario=training_scenario, config=config)

    schedules = self.predict(test_scenario, config, feature_pre_pipeline, pre_solver, selector)

    val = Validator()
    if scenario.performance_type[0] == "runtime":
        stats = val.validate_runtime(
            schedules=schedules, test_scenario=test_scenario, train_scenario=training_scenario)
    elif scenario.performance_type[0] == "solution_quality":
        stats = val.validate_quality(
            schedules=schedules, test_scenario=test_scenario, train_scenario=training_scenario)
    else:
        raise ValueError("Unknown: %s" % (scenario.performance_type[0]))

    if return_fit:
        return stats, (feature_pre_pipeline, pre_solver, selector), schedules
    else:
        return stats
if __name__ == "__main__":
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--result_fn", help="Result json file with predictions for each test instance")
    parser.add_argument("--test_as", help="Directory with *all* test data in ASlib format")
    parser.add_argument("--train_as", help="Directory with *all* train data in ASlib format")

    args_ = parser.parse_args()

    start_time_fold = tm.time()

    # read scenarios
    test_scenario = ASlibScenario()
    test_scenario.read_scenario(dn=args_.test_as)
    train_scenario = ASlibScenario()
    train_scenario.read_scenario(dn=args_.train_as)

    # read result file
    with open(args_.result_fn) as fp:
        schedules = json.load(fp)

    validator = Validator()
    if test_scenario.performance_type[0] == "runtime":
        validator.validate_runtime(schedules=schedules,
                                   test_scenario=test_scenario,
                                   train_scenario=train_scenario)
    else:
        validator.validate_quality(schedules=schedules,
                                   test_scenario=test_scenario,
                                   train_scenario=train_scenario)
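# The __main__ block above is a stand-alone validation script. A typical invocation
# might look as follows; the script filename and directory layout are illustrative
# assumptions, not part of the original code:
#
#   python validate_results.py --result_fn results.json \
#       --test_as aslib/SCENARIO-TEST --train_as aslib/SCENARIO-TRAIN
#
# where results.json holds the schedules dictionary that json.load() reads above.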
def _outer_cv(self, scenario: ASlibScenario, autofolio_config: dict = None,
              outer_cv_fold: int = None, out_template: str = None,
              smac_seed: int = 42):
    '''
        Evaluate on a scenario using an "outer" cross-fold validation scheme.
        In particular, this ensures that SMAC does not use the test set during
        hyperparameter optimization.

        Arguments
        ---------
        scenario: ASlibScenario
            ASlib scenario at hand
        autofolio_config: dict, or None
            an optional dictionary of configuration options
        outer_cv_fold: int, or None
            if given, then only the single outer-cv fold is processed
        out_template: str, or None
            if given, the learned configurations are written to the specified
            locations; the string is considered a template, and "%fold%" will
            be replaced with the fold
        smac_seed: int
            random seed for SMAC

        Returns
        -------
        stats: validate.Stats
            performance over all outer-cv folds
    '''
    import string

    outer_stats = None

    # for each outer split
    outer_cv_folds = range(1, 11)
    if outer_cv_fold is not None:
        outer_cv_folds = range(outer_cv_fold, outer_cv_fold + 1)

    for cv_fold in outer_cv_folds:

        # use ASlibScenario.get_split() to get the outer split
        outer_testing, outer_training = scenario.get_split(cv_fold)

        msg = ">>>>> Outer CV fold: {} <<<<<".format(cv_fold)
        self.logger.info(msg)

        # use ASlibScenario.create_cv_splits() to get an inner-cv
        outer_training.create_cv_splits(n_folds=10)

        # use AutoFolio.get_tuned_config() to tune on the inner-cv
        config = self.get_tuned_config(
            outer_training,
            autofolio_config=autofolio_config,
            seed=smac_seed
        )

        # use AutoFolio.run_fold() to get the performance on the outer split
        stats, fit, schedule = self.run_fold(config, scenario, cv_fold, return_fit=True)
        feature_pre_pipeline, pre_solver, selector = fit

        if outer_stats is None:
            outer_stats = stats
        else:
            outer_stats.merge(stats)

        # save the model, if given an output location
        if out_template is not None:
            out_template_ = string.Template(out_template)
            model_fn = out_template_.substitute(fold=cv_fold, type="pkl")

            msg = "Writing model to: {}".format(model_fn)
            self.logger.info(msg)

            self._save_model(model_fn, scenario, feature_pre_pipeline,
                             pre_solver, selector, config)

            # convert the schedule to a data frame
            schedule_df = pd.Series(schedule, name="solver")
            schedule_df.index.name = "instance"
            schedule_df = schedule_df.reset_index()

            # just keep the solver name; we don't care about the time
            # x[0] gets the first pair in the schedule list
            # and x[0][0] gets the name of the solver from that pair
            schedule_df['solver'] = schedule_df['solver'].apply(lambda x: x[0][0])

            selections_fn = out_template_.substitute(fold=cv_fold, type="csv")
            msg = "Writing solver choices to: {}".format(selections_fn)
            self.logger.info(msg)

            schedule_df.to_csv(selections_fn, index=False)

    self.logger.info(">>>>> Final Stats <<<<<")
    outer_stats.show()
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Validate the algorithm selection performance of the "
        "predictions made using test-as-auto-sklearn using "
        "autofolio.validation.validate.Validator.")

    parser.add_argument('scenario', help="The ASlib scenario")
    parser.add_argument('predictions', help="The predictions file, from "
        "test-as-auto-sklearn")

    parser.add_argument('--config', help="A (yaml) config file which "
        "specifies options controlling the learner behavior")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ASlib scenario"
    logger.info(msg)

    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    if args.config is not None:
        msg = "Loading yaml config file"
        logger.info(msg)
        config = yaml.load(open(args.config))
    else:
        config = {}
        config['allowed_feature_groups'] = [scenario.feature_group_dict.keys()]

    # either way, update the scenario with the features used during training
    scenario.used_feature_groups = config['allowed_feature_groups']

    msg = "Reading predictions"
    logger.info(msg)
    predictions = pd.read_csv(args.predictions)

    msg = "Selecting the algorithm with smallest prediction for each instance"
    logger.info(msg)
    algorithm_selections = pandas_utils.get_group_extreme(
        predictions, "predicted", ex_type="min", group_fields="instance_id")

    msg = "Creating the schedules for the validator"
    logger.info(msg)
    schedules = parallel.apply_df_simple(
        algorithm_selections, _get_schedule, scenario.algorithm_cutoff_time)
    schedules = utils.merge_dicts(*schedules)

    val = Validator()
    performance_type = scenario.performance_type[0]

    if performance_type == "runtime":
        stats = val.validate_runtime(schedules=schedules, test_scenario=scenario)
    elif performance_type == "solution_quality":
        stats = val.validate_quality(schedules=schedules, test_scenario=scenario)
    else:
        msg = "Unknown performance type: {}".format(performance_type)
        raise ValueError(msg)

    msg = "=== RESULTS ==="
    logger.info(msg)
    stats.show()
class AFCsvFacade(object):

    def __init__(self,
                 perf_fn: str,
                 feat_fn: str,
                 objective: str = "solution_quality",
                 runtime_cutoff: float = None,
                 maximize: bool = True,
                 cv_fn: str = None,
                 seed: int = 12345):
        """ Constructor """
        self.scenario = ASlibScenario()
        self.scenario.read_from_csv(perf_fn=perf_fn,
                                    feat_fn=feat_fn,
                                    objective=objective,
                                    runtime_cutoff=runtime_cutoff,
                                    maximize=maximize,
                                    cv_fn=cv_fn)
        self.seed = seed
        self.af = AutoFolio(random_seed=seed)
        self.logger = logging.getLogger("AF Facade")

    def fit(self, config: Configuration = None, save_fn: str = None):
        """ Train AutoFolio on data from init """
        self.logger.info("Fit")
        if config is None:
            cs = self.af.get_cs(self.scenario, {})
            config = cs.get_default_configuration()
        feature_pre_pipeline, pre_solver, selector = self.af.fit(
            scenario=self.scenario, config=config)
        if save_fn:
            self.af._save_model(save_fn, self.scenario, feature_pre_pipeline,
                                pre_solver, selector, config)
            self.logger.info("AutoFolio model saved to %s" % (save_fn))

    def tune(self, wallclock_limit: int = 1200, runcount_limit: int = np.inf):
        config = self.af.get_tuned_config(self.scenario,
                                          wallclock_limit=wallclock_limit,
                                          runcount_limit=runcount_limit,
                                          autofolio_config={},
                                          seed=self.seed)
        self.logger.info("Optimized Configuration: %s" % (config))
        return config

    def cross_validation(self, config: Configuration):
        """ run a cross validation on a given AutoFolio configuration """
        score = -1 * self.af.run_cv(config=config, scenario=self.scenario,
                                    folds=int(self.scenario.cv_data.max().max()))
        self.logger.info("AF's final performance %f" % (score))
        return score

    @staticmethod
    def load_and_predict(vec: np.ndarray, load_fn: str):
        """ get predicted algorithm for a given meta-feature vector """
        af = AutoFolio(random_seed=42)  # random seed doesn't matter here
        pred = af.read_model_and_predict(model_fn=load_fn, feature_vec=vec)
        print("Selected Schedule [(algorithm, budget)]: %s" % (pred))
        return pred[0][0]
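# AFCsvFacade above wraps the full AutoFolio workflow behind a small API. A minimal
# usage sketch; the CSV file names, cutoff, feature vector, and model path are
# illustrative assumptions, not values from the original code.
af_facade = AFCsvFacade(perf_fn="perf.csv", feat_fn="features.csv",
                        objective="runtime", runtime_cutoff=3600, maximize=False)
config = af_facade.tune(wallclock_limit=300)          # optional SMAC tuning
score = af_facade.cross_validation(config=config)     # cross-validated performance
af_facade.fit(config=config, save_fn="af_model.pkl")  # train and persist the model
pred = AFCsvFacade.load_and_predict(vec=np.array([3.0, 1.5]), load_fn="af_model.pkl")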
def evaluate_train_test_split(scenario: ASlibScenario, approach, metrics, fold: int,
                              amount_of_training_instances: int, train_status: str):
    test_scenario, train_scenario = scenario.get_split(indx=fold)

    if train_status != 'all':
        train_scenario = copy.deepcopy(train_scenario)
        threshold = train_scenario.algorithm_cutoff_time
        if train_status == 'clip_censored':
            train_scenario.performance_data = train_scenario.performance_data.clip(upper=threshold)
        elif train_status == 'ignore_censored':
            train_scenario.performance_data = train_scenario.performance_data.replace(
                10 * threshold, np.nan)

    if approach.get_name() == 'oracle' or approach.get_name() == 'virtual_sbs_with_feature_costs':
        approach.fit(test_scenario, fold, amount_of_training_instances)
    else:
        approach.fit(train_scenario, fold, amount_of_training_instances)

    approach_metric_values = np.zeros(len(metrics))

    num_counted_test_values = 0

    feature_data = test_scenario.feature_data.to_numpy()
    performance_data = test_scenario.performance_data.to_numpy()
    feature_cost_data = test_scenario.feature_cost_data.to_numpy() \
        if test_scenario.feature_cost_data is not None else None

    instancewise_result_strings = list()
    simple_runtime_metric = RuntimeMetric()

    for instance_id in range(0, len(test_scenario.instances)):
        X_test = feature_data[instance_id]
        y_test = performance_data[instance_id]

        # compute feature time
        accumulated_feature_time = 0
        if test_scenario.feature_cost_data is not None \
                and approach.get_name() != 'sbs' and approach.get_name() != 'oracle':
            feature_time = feature_cost_data[instance_id]
            accumulated_feature_time = np.sum(feature_time)

        # compute the values of the different metrics
        predicted_scores = approach.predict(X_test, instance_id)
        num_counted_test_values += 1
        for i, metric in enumerate(metrics):
            runtime = metric.evaluate(y_test, predicted_scores, accumulated_feature_time,
                                      scenario.algorithm_cutoff_time)
            approach_metric_values[i] = approach_metric_values[i] + runtime

        # store runtimes on a per-instance basis in ASlib format
        runtime = simple_runtime_metric.evaluate(y_test, predicted_scores,
                                                 accumulated_feature_time,
                                                 scenario.algorithm_cutoff_time)
        run_status_to_print = "ok" if runtime < scenario.algorithm_cutoff_time else "timeout"
        line_to_store = test_scenario.instances[instance_id] + ",1," + approach.get_name() \
            + "," + str(runtime) + "," + run_status_to_print
        instancewise_result_strings.append(line_to_store)

    write_instance_wise_results_to_file(instancewise_result_strings, scenario.scenario)

    approach_metric_values = np.true_divide(approach_metric_values, num_counted_test_values)

    for i, metric in enumerate(metrics):
        print(metrics[i].get_name() + ': {0:.10f}'.format(approach_metric_values[i]))

    return approach_metric_values
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Test models learned with train-as-auto-sklearn. It "
        "writes the predictions to disk as a \"long\" data frame. The output "
        "file is in gzipped csv format.")

    parser.add_argument('scenario', help="The ASlib scenario")
    parser.add_argument('model_template', help="A template string for the filenames for "
        "the learned models. ${solver} and ${fold} are the template parts of "
        "the string. It is probably necessary to surround this argument with "
        "single quotes in order to prevent shell replacement of the template "
        "parts.")

    parser.add_argument('out', help="The output csv file")

    parser.add_argument('--config', help="A (yaml) config file which "
        "specifies options controlling the learner behavior")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ASlib scenario"
    logger.info(msg)

    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    if args.config is not None:
        msg = "Loading yaml config file"
        logger.info(msg)
        config = yaml.load(open(args.config))
    else:
        config = {}

    msg = "Creating string templates"
    logger.info(msg)
    model_template = string.Template(args.model_template)

    msg = "Finding folds from ASlib scenario"
    logger.info(msg)
    folds = [int(i) for i in scenario.cv_data['fold'].unique()]
    folds = sorted(folds)

    msg = "Making predictions"
    logger.info(msg)
    all_predictions = []
    it = itertools.product(scenario.algorithms, folds)

    for solver, fold in it:
        model_file = model_template.substitute(solver=solver, fold=fold)

        if not os.path.exists(model_file):
            msg = "Could not find model file. Skipping: {}".format(model_file)
            logger.warning(msg)
            continue

        try:
            model = joblib.load(model_file)
        except:
            msg = "Problem loading the model file. Skipping: {}".format(model_file)
            logger.warning(msg)
            continue

        msg = "Processing. solver: {}. fold: {}".format(solver, fold)
        logger.info(msg)

        testing, training = scenario.get_split(fold)
        y_pred = model.predict(testing.feature_data)

        if 'log_performance_data' in config:
            # exp transform it back out
            y_pred = np.expm1(y_pred)

        pred_df = pd.DataFrame()
        pred_df['instance_id'] = testing.feature_data.index
        pred_df['solver'] = solver
        pred_df['fold'] = fold
        pred_df['actual'] = testing.performance_data[solver].values
        pred_df['predicted'] = y_pred

        all_predictions.append(pred_df)

    msg = "Joining all predictions into a long data frame"
    logger.info(msg)
    all_predictions = pd.concat(all_predictions)

    msg = "Writing predictions to disk"
    logger.info(msg)
    utils.write_df(all_predictions, args.out, index=False)
def main(self, train_scenario_dn: str, test_scenario_dn: str = None):
    '''
        main method

        Arguments
        ---------
        train_scenario_dn: str
            directory name with ASlib scenario training data
        test_scenario_dn: str
            directory name with ASlib scenario test data (performance data is missing)
    '''
    # read scenario files
    scenario = ASlibScenario()
    scenario.read_scenario(dn=train_scenario_dn)

    # fit on training data
    self.fit(scenario=scenario)

    # read test files;
    # ASlibScenario is not designed to read partial scenarios,
    # therefore we have to cheat a bit
    scenario = ASlibScenario()
    scenario.read_description(fn=os.path.join(test_scenario_dn, "description.txt"))
    scenario.read_feature_values(fn=os.path.join(test_scenario_dn, "feature_values.arff"))
    scenario.read_feature_runstatus(fn=os.path.join(test_scenario_dn, "feature_runstatus.arff"))

    # predict on test data
    self.predict(scenario=scenario)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script trains a model to predict the runtime for a "
        "solver from an ASlib scenario using autosklearn. It assumes an "
        "\"outer\" cross-validation strategy, and it only trains a model for "
        "the indicated folds and solvers. It then writes the learned model to "
        "disk. It *does not* collect any statistics, make predictions, etc.")

    parser.add_argument('scenario', help="The ASlib scenario")
    parser.add_argument('out', help="A template string for the filenames for "
        "the learned models. They are written with joblib.dump, so they need "
        "to be read back in with joblib.load. ${solver} and ${fold} are the "
        "template parts of the string. It is probably necessary to surround "
        "this argument with single quotes in order to prevent shell "
        "replacement of the template parts.")

    parser.add_argument('--config', help="A (yaml) config file which specifies "
        "options controlling the learner behavior")

    parser.add_argument('--solvers', help="The solvers for which models will "
        "be learned. By default, models for all solvers are learned",
        nargs='*', default=[])

    parser.add_argument('--folds', help="The outer-cv folds for which a model "
        "will be learned. By default, models for all folds are learned",
        type=int, nargs='*', default=[])

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use "
        "for parallel solver/fold training", type=int, default=default_num_cpus)

    parser.add_argument('--num-blas-threads', help="The number of threads to "
        "use for parallelizing BLAS. The total number of CPUs will be "
        "\"num_cpus * num_blas_cpus\". Currently, this flag only affects "
        "OpenBLAS and MKL.", type=int, default=default_num_blas_cpus)

    parser.add_argument('--do-not-update-env', help="By default, num-blas-threads "
        "requires that relevant environment variables are updated. Likewise, "
        "if num-cpus is greater than one, it is necessary to turn off python "
        "assertions due to an issue with multiprocessing. If this flag is "
        "present, then the script assumes those updates are already handled. "
        "Otherwise, the relevant environment variables are set, and a new "
        "process is spawned with this flag and otherwise the same "
        "arguments. This flag is not intended for external users.",
        action='store_true')

    automl_utils.add_automl_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # see which folds to run
    folds = args.folds
    if len(folds) == 0:
        folds = range(1, 11)

    for f in folds:
        math_utils.check_range(f, 1, 10, variable_name="fold")

    # and which solvers
    msg = "Reading ASlib scenario"
    logger.info(msg)

    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    # ensure the selected solvers are present
    solvers = args.solvers
    if len(solvers) == 0:
        solvers = scenario.algorithms

    for solver in solvers:
        if solver not in scenario.algorithms:
            solver_str = ','.join(scenario.algorithms)
            msg = ("[train-as-auto-sklearn]: the solver is not present in the "
                   "ASlib scenario. given: {}. choices: {}".format(solver, solver_str))
            raise ValueError(msg)

    if args.config is not None:
        msg = "Reading config file"
        logger.info(msg)
        config = yaml.load(open(args.config))
    else:
        config = {}

    # everything is present, so update the environment variables and spawn a
    # new process, if necessary
    if not args.do_not_update_env:
        ###
        # There is a lot going on with setting these environment variables.
        # Please see the following references:
        #
        # Turning off assertions so we can parallelize sklearn across
        # multiple CPUs for different solvers/folds
        #   https://github.com/celery/celery/issues/1709
        #
        # Controlling OpenBLAS threads
        #   https://github.com/automl/auto-sklearn/issues/166
        #
        # Other environment variables controlling thread usage
        #   http://stackoverflow.com/questions/30791550
        ###

        # we only need to turn off the assertions if we parallelize across cpus
        if args.num_cpus > 1:
            os.environ['PYTHONOPTIMIZE'] = "1"

        # openblas
        os.environ['OPENBLAS_NUM_THREADS'] = str(args.num_blas_threads)

        # mkl blas
        os.environ['MKL_NUM_THREADS'] = str(args.num_blas_threads)

        # other stuff from the SO post
        os.environ['OMP_NUM_THREADS'] = str(args.num_blas_threads)
        os.environ['NUMEXPR_NUM_THREADS'] = str(args.num_blas_threads)

        cmd = ' '.join(shlex.quote(a) for a in sys.argv)
        cmd += " --do-not-update-env"
        shell_utils.check_call(cmd)
        return

    msg = "Learning regressors"
    logger.info(msg)

    it = itertools.product(solvers, folds)
    regressors = parallel.apply_parallel_iter(
        it,
        args.num_cpus,
        _outer_cv,
        args,
        config,
        progress_bar=True
    )
def fit(self, scenario: ASlibScenario, fold: int, amount_of_training_instances: int):
    # setup the ensemble
    self.create_base_learner()
    self.scenario_name = scenario.scenario
    self.fold = fold
    self.num_algorithms = len(scenario.algorithms)
    num_instances = len(scenario.instances)
    feature_data = scenario.feature_data.to_numpy()
    performance_data = scenario.performance_data.to_numpy()

    # new features in matrix [instances x predictions]
    if self.new_feature_type == 'full':
        new_feature_data = np.zeros((num_instances, self.num_algorithms * len(self.base_learners)))
    elif self.new_feature_type == 'small':
        new_feature_data = np.zeros((num_instances, len(self.base_learners)))

    # if predictions are precomputed
    if self.pre_computed:
        for base_learner in self.base_learners:
            self.predictions.append(
                load_pickle(filename='predictions/' + base_learner.get_name() + '_'
                            + scenario.scenario + '_' + str(fold)))

    # create new features for every base learner on each instance
    for learner_index, base_learner in enumerate(self.base_learners):

        # load pre-computed predictions
        if self.pre_computed:
            if self.cross_validation:
                predictions = load_pickle(
                    filename='predictions/cross_validation_' + base_learner.get_name()
                    + '_' + scenario.scenario + '_' + str(fold))
            else:
                predictions = load_pickle(
                    filename='predictions/full_trainingdata_' + base_learner.get_name()
                    + '_' + scenario.scenario + '_' + str(fold))

        # create predictions, if they are not pre-computed
        else:
            # if cross validation is used (h2o)
            if self.cross_validation:
                # allocate the prediction matrix (added: otherwise the assignment
                # below would reference an undefined name)
                predictions = np.zeros((num_instances, self.num_algorithms))
                instance_counter = 0

                for sub_fold in range(1, 11):
                    test_scenario, training_scenario = split_scenario(scenario, sub_fold, num_instances)

                    # train base learner
                    base_learner.fit(training_scenario, fold, amount_of_training_instances)

                    # create new feature data
                    for instance_number in range(instance_counter,
                                                 instance_counter + len(test_scenario.instances)):
                        prediction = base_learner.predict(feature_data[instance_number],
                                                          instance_number)
                        predictions[instance_number] = prediction.flatten()

                    instance_counter = instance_counter + len(test_scenario.instances)

                # fit base learner on the original training data
                self.create_base_learner()
                for base_learner in self.base_learners:
                    base_learner.fit(scenario, fold, amount_of_training_instances)

            # if no cross validation is used
            else:
                base_learner.fit(scenario, fold, amount_of_training_instances)
                predictions = np.zeros((len(scenario.instances), self.num_algorithms))
                for instance_id, instance_feature in enumerate(feature_data):
                    predictions[instance_id] = base_learner.predict(instance_feature, instance_id)

        # insert predictions into the new feature data matrix
        for i in range(num_instances):
            if self.new_feature_type == 'full':
                for alo_num in range(self.num_algorithms):
                    new_feature_data[i][alo_num + self.num_algorithms * learner_index] = \
                        predictions[i][alo_num]
            elif self.new_feature_type == 'small':
                new_feature_data[i][learner_index] = np.argmin(predictions[i])

    # add predictions to the features of the instances
    if self.new_feature_type == 'full':
        new_columns = np.arange(self.num_algorithms * len(self.base_learners))
    elif self.new_feature_type == 'small':
        new_columns = np.arange(len(self.base_learners))
    new_feature_data = pd.DataFrame(new_feature_data,
                                    index=scenario.feature_data.index,
                                    columns=new_columns)

    if self.meta_learner_input == 'full':
        new_feature_data = pd.concat([scenario.feature_data, new_feature_data], axis=1, sort=False)
    elif self.meta_learner_input == 'predictions_only':
        pass
    else:
        sys.exit('Wrong meta learner input type option')

    scenario.feature_data = new_feature_data

    # meta learner selection
    if self.meta_learner_type == 'per_algorithm_regressor':
        self.meta_learner = PerAlgorithmRegressor(feature_importances=self.feature_importance)
        self.algorithm_selection_algorithm = True
    elif self.meta_learner_type == 'SUNNY':
        self.meta_learner = SUNNY()
        self.algorithm_selection_algorithm = True
    elif self.meta_learner_type == 'ISAC':
        self.meta_learner = ISAC()
        self.algorithm_selection_algorithm = True
    elif self.meta_learner_type == 'SATzilla-11':
        self.meta_learner = SATzilla11()
        self.algorithm_selection_algorithm = True
    elif self.meta_learner_type == 'multiclass':
        self.meta_learner = MultiClassAlgorithmSelector(feature_importance=self.feature_importance)
        self.algorithm_selection_algorithm = True
    elif self.meta_learner_type == 'Expectation':
        self.meta_learner = SurrogateSurvivalForest(criterion='Expectation')
        self.algorithm_selection_algorithm = True
    elif self.meta_learner_type == 'PAR10':
        self.meta_learner = SurrogateSurvivalForest(criterion='PAR10')
        self.algorithm_selection_algorithm = True
    elif self.meta_learner_type == 'RandomForest':
        self.meta_learner = RandomForestClassifier(random_state=fold)
    elif self.meta_learner_type == 'SVM':
        self.meta_learner = LinearSVC(random_state=fold, max_iter=10000)

    # feature selection
    if self.feature_selection == 'variance_threshold':
        self.feature_selector = VarianceThreshold(threshold=.8 * (1 - .8))
        self.feature_selector.fit(scenario.feature_data)
        scenario.feature_data = pd.DataFrame(
            data=self.feature_selector.transform(scenario.feature_data))
    elif self.feature_selection == 'select_k_best':
        self.feature_selector = SelectKBest(f_classif, k=self.num_algorithms)
        label_performance_data = [np.argmin(x) for x in performance_data]
        self.imputer = SimpleImputer()
        scenario.feature_data = self.imputer.fit_transform(scenario.feature_data)
        self.feature_selector.fit(scenario.feature_data, label_performance_data)
        scenario.feature_data = pd.DataFrame(
            data=self.feature_selector.transform(scenario.feature_data))

    # fit meta learner
    if self.algorithm_selection_algorithm:
        self.meta_learner.fit(scenario, fold, amount_of_training_instances)
    else:
        label_performance_data = [np.argmin(x) for x in performance_data]

        self.pipe = Pipeline([('imputer', SimpleImputer()),
                              ('standard_scaler', StandardScaler())])
        x_train = self.pipe.fit_transform(scenario.feature_data.to_numpy(),
                                          label_performance_data)

        self.meta_learner.fit(x_train, label_performance_data)