def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Decision Tree (sklearn) ****\n") is_classification = config.type == 'classification' X_train, X_test = impute(dataset.train.X, dataset.test.X) y_train, y_test = dataset.train.y, dataset.test.y estimator = DecisionTreeClassifier if is_classification else DecisionTreeRegressor predictor = estimator(random_state=config.seed, **config.framework_params) with Timer() as training: predictor.fit(X_train, y_train) predictions = predictor.predict(X_test) probabilities = predictor.predict_proba(X_test) if is_classification else None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test) return dict( models_count=1, training_duration=training.duration )
def run(dataset, config): log.info("\n**** Random Forest (sklearn %s) ****\n", sklearn.__version__) is_classification = config.type == 'classification' # Impute any missing data (can test using -t 146606) X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y, dataset.test.y log.info( "Running RandomForest with a maximum time of {}s on {} cores.".format( config.max_runtime_seconds, config.cores)) log.warning( "We completely ignore the requirement to stay within the time limit.") log.warning( "We completely ignore the advice to optimize towards metric: {}.". format(config.metric)) estimator = RandomForestClassifier if is_classification else RandomForestRegressor rfc = estimator(n_jobs=config.cores, **config.framework_params) rfc.fit(X_train, y_train) predictions = rfc.predict(X_test) probabilities = rfc.predict_proba(X_test) if is_classification else None return ns(output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=False)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** TPOT ****\n") is_classification = config.type == 'classification' # Mapping of benchmark metrics to TPOT metrics metrics_mapping = dict(acc='accuracy', auc='roc_auc', f1='f1', logloss='neg_log_loss', mae='neg_mean_absolute_error', mse='neg_mean_squared_error', msle='neg_mean_squared_log_error', r2='r2') scoring_metric = metrics_mapping[ config.metric] if config.metric in metrics_mapping else None if scoring_metric is None: raise ValueError("Performance metric {} not supported.".format( config.metric)) X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y_enc, dataset.test.y_enc log.info( 'Running TPOT with a maximum time of %ss on %s cores, optimizing %s.', config.max_runtime_seconds, config.cores, scoring_metric) runtime_min = (config.max_runtime_seconds / 60) estimator = TPOTClassifier if is_classification else TPOTRegressor tpot = estimator(n_jobs=config.cores, max_time_mins=runtime_min, scoring=scoring_metric, random_state=config.seed, **config.framework_params) with Timer() as training: tpot.fit(X_train, y_train) log.info('Predicting on the test set.') predictions = tpot.predict(X_test) try: probabilities = tpot.predict_proba( X_test) if is_classification else None except RuntimeError: # TPOT throws a RuntimeError if the optimized pipeline does not support `predict_proba`. probabilities = Encoder('one-hot', target=False, encoded_type=float).fit_transform(predictions) save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=is_classification) return dict(models_count=len(tpot.evaluated_individuals_), training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Random Forest (sklearn) ****\n") is_classification = config.type == 'classification' X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y_enc, dataset.test.y_enc training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } n_jobs = config.framework_params.get( '_n_jobs', config.cores ) # useful to disable multicore, regardless of the dataset config log.info( "Running RandomForest with a maximum time of {}s on {} cores.".format( config.max_runtime_seconds, n_jobs)) log.warning( "We completely ignore the requirement to stay within the time limit.") log.warning( "We completely ignore the advice to optimize towards metric: {}.". format(config.metric)) estimator = RandomForestClassifier if is_classification else RandomForestRegressor rf = estimator(n_jobs=n_jobs, random_state=config.seed, **training_params) with Timer() as training: rf.fit(X_train, y_train) predictions = rf.predict(X_test) probabilities = rf.predict_proba(X_test) if is_classification else None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=True) return dict(models_count=len(rf), training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Oboe ****\n") is_classification = config.type == 'classification' if not is_classification: # regression currently fails (as of 29.01.2019: still under development state by oboe team) raise ValueError( 'Regression is not yet supported (under development).') X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y_enc, dataset.test.y_enc log.info('Running oboe with a maximum time of {}s on {} cores.'.format( config.max_runtime_seconds, config.cores)) log.warning( 'We completely ignore the advice to optimize towards metric: {}.'. format(config.metric)) aml = AutoLearner( p_type='classification' if is_classification else 'regression', n_cores=config.cores, runtime_limit=config.max_runtime_seconds, **config.framework_params) with Timer() as training: aml.fit(X_train, y_train) predictions = aml.predict(X_test).reshape(len(X_test)) probabilities = Encoder('one-hot', target=False, encoded_type=float).fit_transform( predictions) if is_classification else None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=True) return dict(models_count=len(aml.get_models()), training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Decision Tree (sklearn) ****\n") is_classification = config.type == 'classification' X_train, X_test = impute(dataset.train.X, dataset.test.X) y_train, y_test = dataset.train.y, dataset.test.y estimator = DecisionTreeClassifier if is_classification else DecisionTreeRegressor predictor = estimator(**config.framework_params) predictor.fit(X_train, y_train) predictions = predictor.predict(X_test) probabilities = predictor.predict_proba( X_test) if is_classification else None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Random Forest (sklearn) ****\n") is_classification = config.type == 'classification' # Impute any missing data (can test using -t 146606) X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y, dataset.test.y log.info( "Running RandomForest with a maximum time of {}s on {} cores.".format( config.max_runtime_seconds, config.cores)) log.warning( "We completely ignore the requirement to stay within the time limit.") log.warning( "We completely ignore the advice to optimize towards metric: {}.". format(config.metric)) estimator = RandomForestClassifier if is_classification else RandomForestRegressor rf = estimator(n_jobs=config.cores, random_state=config.seed, **config.framework_params) with Timer() as training: rf.fit(X_train, y_train) predictions = rf.predict(X_test) probabilities = rf.predict_proba(X_test) if is_classification else None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=False) return dict(models_count=len(rf), training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Hyperopt-sklearn ****\n") is_classification = config.type == 'classification' default = lambda: 0 metrics_to_loss_mapping = dict( acc=(default, False), # lambda y, pred: 1.0 - accuracy_score(y, pred) auc=(lambda y, pred: 1.0 - roc_auc_score(y, pred), False), f1=(lambda y, pred: 1.0 - f1_score(y, pred), False), # logloss=(log_loss, True), mae=(mean_absolute_error, False), mse=(mean_squared_error, False), msle=(mean_squared_log_error, False), r2=(default, False), # lambda y, pred: 1.0 - r2_score(y, pred) ) loss_fn, continuous_loss_fn = metrics_to_loss_mapping[ config.metric] if config.metric in metrics_to_loss_mapping else (None, False) if loss_fn is None: log.warning("Performance metric %s not supported: defaulting to %s.", config.metric, 'accuracy' if is_classification else 'r2') if loss_fn is default: loss_fn = None training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } log.warning("Ignoring cores constraint of %s cores.", config.cores) log.info( "Running hyperopt-sklearn with a maximum time of %ss on %s cores, optimizing %s.", config.max_runtime_seconds, 'all', config.metric) X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y_enc, dataset.test.y_enc if is_classification: classifier = any_classifier('clf') regressor = None else: classifier = None regressor = any_regressor('rgr') estimator = HyperoptEstimator(classifier=classifier, regressor=regressor, algo=tpe.suggest, loss_fn=loss_fn, continuous_loss_fn=continuous_loss_fn, trial_timeout=config.max_runtime_seconds, seed=config.seed, **training_params) with InterruptTimeout(config.max_runtime_seconds * 4 / 3, sig=signal.SIGQUIT): with InterruptTimeout(config.max_runtime_seconds, before_interrupt=ft.partial( kill_proc_tree, timeout=5, include_parent=False)): with Timer() as training: estimator.fit(X_train, y_train) predictions = estimator.predict(X_test) if is_classification: target_values_enc = dataset.target.label_encoder.transform( dataset.target.values) probabilities = Encoder( 'one-hot', target=False, encoded_type=float).fit(target_values_enc).transform(predictions) else: probabilities = None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=True) return dict(models_count=len(estimator.trials), training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Tuned Random Forest (sklearn) ****\n") is_classification = config.type == 'classification' training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } tuning_params = config.framework_params.get('_tuning', training_params) n_jobs = config.framework_params.get( '_n_jobs', config.cores ) # useful to disable multicore, regardless of the dataset config # Impute any missing data (can test using -t 146606) X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y_enc, dataset.test.y_enc log.info( "Running RandomForest with a maximum time of {}s on {} cores.".format( config.max_runtime_seconds, n_jobs)) estimator = RandomForestClassifier if is_classification else RandomForestRegressor metric = dict(auc='roc_auc', logloss='neg_log_loss', acc='accuracy')[config.metric] n_features = X_train.shape[1] default_value = max(1, int(math.sqrt(n_features))) below_default = pick_values_uniform(start=1, end=default_value, length=5 + 1)[:-1] # 5 below above_default = pick_values_uniform(start=default_value, end=n_features, length=10 + 1 - len(below_default))[1:] # 5 above # Mix up the order of `max_features` to try, so that a fair range is tried even if we have too little time # to try all possible values. Order: [sqrt(p), 1, p, random order for remaining values] # max_features_to_try = below_default[1:] + above_default[:-1] # max_features_values = ([default_value, 1, n_features] # + random.sample(max_features_to_try, k=len(max_features_to_try))) max_features_values = [default_value] + below_default + above_default # Define up to how much of total time we spend 'optimizing' `max_features`. # (the remainder if used for fitting the final model). 
safety_factor = 0.85 with stopit.ThreadingTimeout(seconds=int(config.max_runtime_seconds * safety_factor)): log.info("Evaluating multiple values for `max_features`: %s.", max_features_values) max_feature_scores = [] tuning_durations = [] for i, max_features_value in enumerate(max_features_values): log.info("[{:2d}/{:2d}] Evaluating max_features={}".format( i + 1, len(max_features_values), max_features_value)) imputation = Imputer() random_forest = estimator(n_jobs=n_jobs, random_state=config.seed, max_features=max_features_value, **tuning_params) pipeline = Pipeline(steps=[('preprocessing', imputation), ('learning', random_forest)]) with Timer() as cv_scoring: try: scores = cross_val_score(estimator=pipeline, X=dataset.train.X_enc, y=dataset.train.y_enc, scoring=metric, cv=5) max_feature_scores.append( (statistics.mean(scores), max_features_value)) except stopit.utils.TimeoutException as toe: log.error( "Failed CV scoring for max_features=%s : Timeout", max_features_value) tuning_durations.append( (max_features_value, cv_scoring.duration)) raise toe except Exception as e: log.error("Failed CV scoring for max_features=%s :\n%s", max_features_value, e) log.debug("Exception:", exc_info=True) tuning_durations.append((max_features_value, cv_scoring.duration)) log.info("Tuning scores:\n%s", sorted(max_feature_scores)) log.info("Tuning durations:\n%s", sorted(tuning_durations)) _, best_max_features_value = max( max_feature_scores) if len(max_feature_scores) > 0 else (math.nan, 'auto') log.info("Training final model with `max_features={}`.".format( best_max_features_value)) rf = estimator(n_jobs=n_jobs, random_state=config.seed, max_features=best_max_features_value, **training_params) with Timer() as training: rf.fit(X_train, y_train) predictions = rf.predict(X_test) probabilities = rf.predict_proba(X_test) if is_classification else None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=True) return dict(models_count=len(rf), training_duration=training.duration + sum(map(lambda t: t[1], tuning_durations)))
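# `pick_values_uniform` is a benchmark helper that spreads a given number of integer
# candidates evenly between `start` and `end` (both inclusive), which is how the
# `max_features` grid around sqrt(n_features) is built above. The function below is a
# minimal sketch of the assumed behaviour (it also deduplicates collapsed values), not
# necessarily the benchmark's exact implementation.

import numpy as np


def pick_values_uniform(start, end, length):
    """Return up to `length` distinct integers spread uniformly over [start, end]."""
    values = np.linspace(start, end, num=length)
    return sorted({int(round(v)) for v in values})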
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** TPOT ****\n") is_classification = config.type == 'classification' # Mapping of benchmark metrics to TPOT metrics metrics_mapping = dict(acc='accuracy', auc='roc_auc', f1='f1', logloss='neg_log_loss', mae='neg_mean_absolute_error', mse='neg_mean_squared_error', msle='neg_mean_squared_log_error', r2='r2') scoring_metric = metrics_mapping[ config.metric] if config.metric in metrics_mapping else None if scoring_metric is None: raise ValueError("Performance metric {} not supported.".format( config.metric)) X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y_enc, dataset.test.y_enc training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } n_jobs = config.framework_params.get( '_n_jobs', config.cores ) # useful to disable multicore, regardless of the dataset config log.info( 'Running TPOT with a maximum time of %ss on %s cores, optimizing %s.', config.max_runtime_seconds, n_jobs, scoring_metric) runtime_min = (config.max_runtime_seconds / 60) estimator = TPOTClassifier if is_classification else TPOTRegressor tpot = estimator(n_jobs=n_jobs, max_time_mins=runtime_min, scoring=scoring_metric, random_state=config.seed, **training_params) with Timer() as training: tpot.fit(X_train, y_train) log.debug("All individuals :\n%s", list(tpot.evaluated_individuals_.items())) models = tpot.pareto_front_fitted_pipelines_ hall_of_fame = list( zip(reversed(tpot._pareto_front.keys), tpot._pareto_front.items)) models_file = split_path(config.output_predictions_file) models_file.extension = '.models.txt' models_file = path_from_split(models_file) with open(models_file, 'w') as f: for m in hall_of_fame: pprint.pprint(dict( fitness=str(m[0]), model=str(m[1]), pipeline=models[str(m[1])], ), stream=f) log.info('Predicting on the test set.') predictions = tpot.predict(X_test) try: probabilities = tpot.predict_proba( X_test) if is_classification else None except RuntimeError: # TPOT throws a RuntimeError if the optimized pipeline does not support `predict_proba`. target_values_enc = dataset.target.label_encoder.transform( dataset.target.values) probabilities = Encoder( 'one-hot', target=False, encoded_type=float).fit(target_values_enc).transform(predictions) save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=is_classification) return dict(models_count=len(tpot.evaluated_individuals_), training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Oboe ****\n") is_classification = config.type == 'classification' if not is_classification: # regression currently fails (as of 26.02.2019: still under development state by oboe team) raise ValueError( 'Regression is not yet supported (under development).') X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y_enc, dataset.test.y_enc training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } n_cores = config.framework_params.get('_n_cores', config.cores) log.info('Running oboe with a maximum time of {}s on {} cores.'.format( config.max_runtime_seconds, n_cores)) log.warning( 'We completely ignore the advice to optimize towards metric: {}.'. format(config.metric)) aml = AutoLearner( p_type='classification' if is_classification else 'regression', n_cores=n_cores, runtime_limit=config.max_runtime_seconds, **training_params) aml_models = lambda: [aml.ensemble, *aml.ensemble.base_learners] if len( aml.ensemble.base_learners) > 0 else [] with Timer() as training: try: aml.fit(X_train, y_train) except IndexError as e: if len( aml_models() ) == 0: # incorrect handling of some IndexError in oboe if ensemble is empty raise NoResultError( "Oboe could not produce any model in the requested time." ) from e raise e predictions = aml.predict(X_test).reshape(len(X_test)) if is_classification: target_values_enc = dataset.target.label_encoder.transform( dataset.target.values) probabilities = Encoder( 'one-hot', target=False, encoded_type=float).fit(target_values_enc).transform(predictions) else: probabilities = None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=True) return dict(models_count=len(aml_models()), training_duration=training.duration)