def _assert_data_paths(dataset, definition):
    assert dataset.train.path == definition.train
    assert dataset.test.path == definition.test
    sp = split_path(definition.train)
    fmt = sp.extension[1:]
    for f in ['arff', 'csv', 'parquet']:
        if f == fmt:
            assert dataset.train.data_path(f) == dataset.train.path
        else:
            s = copy.copy(sp)
            s.extension = f
            assert dataset.train.data_path(f) == path_from_split(s)
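# The test above relies on the benchmark's `split_path`/`path_from_split` utilities.
# A minimal sketch of what they are assumed to do (hypothetical stand-ins; the real
# helpers live in the benchmark's utils module and may differ in detail):
import os
from types import SimpleNamespace

def _split_path_sketch(path):
    # Decompose e.g. "/data/iris_train.arff" into dirname, basename and extension.
    dirname, filename = os.path.split(path)
    basename, extension = os.path.splitext(filename)
    return SimpleNamespace(dirname=dirname, basename=basename, extension=extension)

def _path_from_split_sketch(split):
    # Reassemble the path, e.g. after swapping ".arff" for "csv".
    ext = split.extension if split.extension.startswith('.') else '.' + split.extension
    return os.path.join(split.dirname, split.basename + ext)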
def run(dataset: Dataset, config: TaskConfig):
    log.info(f"\n**** AutoWEKA [v{config.framework_version}]****\n")
    save_metadata(config)

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    # Mapping of benchmark metrics to Weka metrics
    metrics_mapping = dict(
        acc='errorRate',
        auc='areaUnderROC',
        logloss='kBInformation'
    )
    metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if metric is None:
        raise ValueError("Performance metric {} not supported.".format(config.metric))

    train_file = dataset.train.path
    test_file = dataset.test.path
    # Weka requires the target to be the last attribute.
    if dataset.target.index != len(dataset.predictors):
        train_file = reorder_dataset(dataset.train.path, target_src=dataset.target.index)
        test_file = reorder_dataset(dataset.test.path, target_src=dataset.target.index)

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    parallelRuns = config.framework_params.get('_parallelRuns', config.cores)

    memLimit = config.framework_params.get('_memLimit', 'auto')
    if memLimit == 'auto':
        memLimit = max(min(config.max_mem_size_mb, math.ceil(config.max_mem_size_mb / parallelRuns)),
                       1024)  # AutoWEKA default memLimit
    log.info("Using %sMB memory per run on %s parallel runs.", memLimit, parallelRuns)

    f = split_path(config.output_predictions_file)
    f.extension = '.weka_pred.csv'
    weka_file = path_from_split(f)

    cmd_root = "java -cp {here}/lib/autoweka/autoweka.jar weka.classifiers.meta.AutoWEKAClassifier ".format(here=dir_of(__file__))
    cmd_params = dict(
        t='"{}"'.format(train_file),
        T='"{}"'.format(test_file),
        memLimit=memLimit,
        classifications='"weka.classifiers.evaluation.output.prediction.CSV -distribution -file \\\"{}\\\""'.format(weka_file),
        timeLimit=int(config.max_runtime_seconds / 60),
        parallelRuns=parallelRuns,
        metric=metric,
        seed=config.seed % (1 << 16),  # Weka accepts only int16 as seeds
        **training_params
    )
    cmd = cmd_root + ' '.join(["-{} {}".format(k, v) for k, v in cmd_params.items()])

    with Timer() as training:
        run_cmd(cmd, _live_output_=True)

    # If target values are not sorted alphabetically in the ARFF file, then class probabilities
    # are returned in the original order (interestingly, other frameworks seem to always sort
    # the target values first). That's why we need to specify the probabilities labels here:
    # sorting+formatting is done in the saving function.
    probabilities_labels = dataset.target.values
    if not os.path.exists(weka_file):
        raise NoResultError("AutoWEKA failed producing any prediction.")
    with open(weka_file, 'r') as weka_file:
        probabilities = []
        predictions = []
        truth = []
        for line in weka_file.readlines()[1:-1]:
            inst, actual, predicted, error, *distribution = line.split(',')
            pred_probabilities = [pred_probability.replace('*', '').replace('\n', '')
                                  for pred_probability in distribution]
            _, pred = predicted.split(':')
            _, tru = actual.split(':')
            probabilities.append(pred_probabilities)
            predictions.append(pred)
            truth.append(tru)

    save_predictions(dataset=dataset,
                     output_file=config.output_predictions_file,
                     probabilities=probabilities,
                     predictions=predictions,
                     truth=truth,
                     probabilities_labels=probabilities_labels)

    return dict(
        training_duration=training.duration
    )
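# Illustration of the prediction-CSV parsing above on a single, made-up Weka output line.
# Weka's prediction.CSV writer emits `inst#,actual,predicted,error,*distribution`, with the
# winning class probability starred; the labels and values below are hypothetical.
sample_line = "1,2:Iris-versicolor,2:Iris-versicolor,,0.01,*0.97,0.02\n"
inst, actual, predicted, error, *distribution = sample_line.split(',')
probabilities_row = [p.replace('*', '').replace('\n', '') for p in distribution]  # ['0.01', '0.97', '0.02']
predicted_label = predicted.split(':')[1]  # 'Iris-versicolor'
actual_label = actual.split(':')[1]        # 'Iris-versicolor'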
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** AutoSklearn ****\n")
    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=DeprecationWarning)

    is_classification = config.type == 'classification'

    # Mapping of benchmark metrics to autosklearn metrics
    metrics_mapping = dict(
        acc=metrics.accuracy,
        auc=metrics.roc_auc,
        f1=metrics.f1,
        logloss=metrics.log_loss,
        mae=metrics.mean_absolute_error,
        mse=metrics.mean_squared_error,
        r2=metrics.r2
    )
    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    # Set resources based on data size
    log.info("Running auto-sklearn with a maximum time of %ss on %s cores with %sMB, optimizing %s.",
             config.max_runtime_seconds, config.cores, config.max_mem_size_mb, perf_metric)
    log.info("Environment: %s", os.environ)

    X_train = dataset.train.X_enc
    y_train = dataset.train.y_enc
    # log.info("finite=%s", np.isfinite(X_train))
    predictors_type = ['Categorical' if p.is_categorical() else 'Numerical' for p in dataset.predictors]

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    n_jobs = config.framework_params.get('_n_jobs', config.cores)
    ml_memory_limit = config.framework_params.get('_ml_memory_limit', 'auto')
    ensemble_memory_limit = config.framework_params.get('_ensemble_memory_limit', 'auto')

    # When memory is large enough, we should have:
    # (cores - 1) * ml_memory_limit_mb + ensemble_memory_limit_mb = config.max_mem_size_mb
    total_memory_mb = system_memory_mb().total
    if ml_memory_limit == 'auto':
        ml_memory_limit = max(min(config.max_mem_size_mb, math.ceil(total_memory_mb / n_jobs)),
                              3072)  # 3072 MB is the autosklearn default
    if ensemble_memory_limit == 'auto':
        ensemble_memory_limit = max(math.ceil(ml_memory_limit - (total_memory_mb - config.max_mem_size_mb)),
                                    math.ceil(ml_memory_limit / 3),  # default proportions
                                    1024)  # 1024 MB is the autosklearn default
    log.info("Using %sMB memory per ML job and %sMB for ensemble job on a total of %s jobs.",
             ml_memory_limit, ensemble_memory_limit, n_jobs)

    log.warning("Using meta-learned initialization, which might be bad (leakage).")
    # TODO: do we need to set per_run_time_limit too?
    estimator = AutoSklearnClassifier if is_classification else AutoSklearnRegressor
    auto_sklearn = estimator(time_left_for_this_task=config.max_runtime_seconds,
                             n_jobs=n_jobs,
                             ml_memory_limit=ml_memory_limit,
                             ensemble_memory_limit=ensemble_memory_limit,
                             seed=config.seed,
                             **training_params)
    with Timer() as training:
        auto_sklearn.fit(X_train, y_train, metric=perf_metric, feat_type=predictors_type)

    # Save a description of the fitted ensemble next to the predictions.
    models_repr = auto_sklearn.show_models()
    log.debug("Trained Ensemble:\n%s", models_repr)
    models_file = split_path(config.output_predictions_file)
    models_file.extension = '.models.txt'
    models_file = path_from_split(models_file)
    with open(models_file, 'w') as f:
        f.write(models_repr)

    log.info("Predicting on the test set.")
    X_test = dataset.test.X_enc
    y_test = dataset.test.y_enc
    predictions = auto_sklearn.predict(X_test)
    probabilities = auto_sklearn.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(
        models_count=len(auto_sklearn.get_models_with_weights()),
        training_duration=training.duration
    )
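# Worked example of the 'auto' memory sizing above, using hypothetical numbers:
# a 32 GB benchmark allowance on a 64 GB machine, running 8 parallel jobs.
import math

max_mem_size_mb = 32768   # hypothetical config.max_mem_size_mb
total_memory_mb = 65536   # hypothetical system_memory_mb().total
n_jobs = 8

ml_memory_limit = max(min(max_mem_size_mb, math.ceil(total_memory_mb / n_jobs)), 3072)
# min(32768, 8192) = 8192 -> each ML job may use up to 8192 MB

ensemble_memory_limit = max(math.ceil(ml_memory_limit - (total_memory_mb - max_mem_size_mb)),
                            math.ceil(ml_memory_limit / 3),
                            1024)
# the first term is negative here (8192 - 32768), so the 1/3 proportion wins: 2731 MB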
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** TPOT ****\n")

    is_classification = config.type == 'classification'
    # Mapping of benchmark metrics to TPOT metrics
    metrics_mapping = dict(
        acc='accuracy',
        auc='roc_auc',
        f1='f1',
        logloss='neg_log_loss',
        mae='neg_mean_absolute_error',
        mse='neg_mean_squared_error',
        msle='neg_mean_squared_log_error',
        r2='r2'
    )
    scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(config.metric))

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    n_jobs = config.framework_params.get('_n_jobs', config.cores)  # useful to disable multicore, regardless of the dataset config

    log.info('Running TPOT with a maximum time of %ss on %s cores, optimizing %s.',
             config.max_runtime_seconds, n_jobs, scoring_metric)
    runtime_min = config.max_runtime_seconds / 60
    estimator = TPOTClassifier if is_classification else TPOTRegressor
    tpot = estimator(n_jobs=n_jobs,
                     max_time_mins=runtime_min,
                     scoring=scoring_metric,
                     random_state=config.seed,
                     **training_params)

    with Timer() as training:
        tpot.fit(X_train, y_train)

    log.debug("All individuals:\n%s", list(tpot.evaluated_individuals_.items()))
    models = tpot.pareto_front_fitted_pipelines_
    hall_of_fame = list(zip(reversed(tpot._pareto_front.keys), tpot._pareto_front.items))
    models_file = split_path(config.output_predictions_file)
    models_file.extension = '.models.txt'
    models_file = path_from_split(models_file)
    with open(models_file, 'w') as f:
        for m in hall_of_fame:
            pprint.pprint(dict(
                fitness=str(m[0]),
                model=str(m[1]),
                pipeline=models[str(m[1])],
            ), stream=f)

    log.info('Predicting on the test set.')
    predictions = tpot.predict(X_test)
    try:
        probabilities = tpot.predict_proba(X_test) if is_classification else None
    except RuntimeError:
        # TPOT throws a RuntimeError if the optimized pipeline does not support `predict_proba`.
        target_values_enc = dataset.target.label_encoder.transform(dataset.target.values)
        probabilities = Encoder('one-hot', target=False, encoded_type=float)\
            .fit(target_values_enc)\
            .transform(predictions)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=is_classification)

    return dict(
        models_count=len(tpot.evaluated_individuals_),
        training_duration=training.duration
    )
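# The `Encoder('one-hot', ...)` fallback above is a benchmark helper. A minimal numpy
# sketch of the same idea (hypothetical stand-in, assuming the predictions are already
# encoded as integer class indices): degrade gracefully to hard 0/1 "probabilities"
# when the optimized pipeline has no `predict_proba`.
import numpy as np

def one_hot_probabilities_sketch(predictions, n_classes):
    # Each row gets probability 1.0 for the predicted class, 0.0 elsewhere.
    proba = np.zeros((len(predictions), n_classes), dtype=float)
    proba[np.arange(len(predictions)), np.asarray(predictions, dtype=int)] = 1.0
    return proba

# e.g. one_hot_probabilities_sketch([2, 0, 1], n_classes=3)
# -> [[0., 0., 1.], [1., 0., 0.], [0., 1., 0.]]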
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(
        acc='mean_per_class_error',
        auc='AUC',
        logloss='logloss',
        mae='mae',
        mse='mse',
        rmse='rmse',
        rmsle='rmsle'
    )
    sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
        nthreads = config.framework_params.get('_nthreads', config.cores)

        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads, config.max_mem_size_mb)
        h2o.init(nthreads=nthreads,
                 min_mem_size=str(config.max_mem_size_mb) + "M",
                 max_mem_size=str(config.max_mem_size_mb) + "M",
                 log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold)))

        # Load train and test as H2O Frames (test predictions are converted to a pandas frame later).
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path)
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError("H2O could not produce any model in the requested time.")

        lb = aml.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        lbf = split_path(config.output_predictions_file)
        lbf.extension = '.leaderboard.csv'
        lbf = path_from_split(lbf)
        write_csv(lb, lbf)

        h2o_preds = aml.predict(test).as_data_frame(use_pandas=False)
        preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
        y_pred = preds.iloc[:, 0]

        h2o_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False, header=False)
        y_truth = to_data_frame(h2o_truth)

        predictions = y_pred.values
        probabilities = preds.iloc[:, 1:].values
        truth = y_truth.values

        save_predictions_to_file(dataset=dataset,
                                 output_file=config.output_predictions_file,
                                 probabilities=probabilities,
                                 predictions=predictions,
                                 truth=truth)

        return dict(
            models_count=len(aml.leaderboard),
            training_duration=training.duration
        )

    finally:
        if h2o.connection():
            h2o.remove_all()
            h2o.connection().close()
            if h2o.connection().local_server:
                h2o.connection().local_server.shutdown()
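# Illustration of the prediction post-processing above on a made-up binary task.
# `as_data_frame(use_pandas=False)` is assumed to return a header row followed by data rows:
# the first column holds the predicted label, the remaining columns per-class probabilities.
# Class names and values below are hypothetical.
import pandas as pd

h2o_preds_example = [
    ['predict', 'no', 'yes'],   # header row
    ['yes', '0.12', '0.88'],
    ['no', '0.91', '0.09'],
]
preds_example = pd.DataFrame(h2o_preds_example[1:], columns=h2o_preds_example[0])
predictions_example = preds_example.iloc[:, 0].values     # ['yes', 'no']
probabilities_example = preds_example.iloc[:, 1:].values  # per-class probability columns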