Example #1
def run(dataset: Dataset, config: TaskConfig):
    log.info(
        f"\n**** Gradient Boosting [sklearn v{sklearn.__version__}] ****\n")
    save_metadata(config, version=sklearn.__version__)

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = GradientBoostingClassifier if is_classification else GradientBoostingRegressor
    predictor = estimator(random_state=config.seed, **config.framework_params)

    with Timer() as training:
        predictor.fit(X_train, y_train)
    predictions = predictor.predict(X_test)
    probabilities = predictor.predict_proba(X_test) if is_classification else None

    save_predictions(dataset=dataset,
                     output_file=config.output_predictions_file,
                     probabilities=probabilities,
                     predictions=predictions,
                     truth=y_test)

    return dict(models_count=1, training_duration=training.duration)
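These scripts all rely on a `Timer` context manager from the benchmark's utilities; below is a minimal sketch of the behaviour they assume (an elapsed-time `duration` attribute set on exit), not the actual amlb implementation:

import time

class Timer:
    """Minimal stand-in: records wall-clock duration of the `with` block."""
    def __enter__(self):
        self.start = time.time()
        self.duration = None
        return self

    def __exit__(self, *exc):
        self.duration = time.time() - self.start
        return False  # never suppress exceptions

# usage: `with Timer() as training: predictor.fit(...)`, then read `training.duration`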
Example #2
def run(dataset: Dataset, config: TaskConfig):
    from frameworks.shared.caller import run_in_venv

    X_train_enc, X_test_enc = impute(dataset.train.X_enc, dataset.test.X_enc)
    data = dict(
        train=dict(
            X_enc=X_train_enc,
            y_enc=dataset.train.y_enc
        ),
        test=dict(
            X_enc=X_test_enc,
            y_enc=dataset.test.y_enc
        )
    )

    def process_results(results):
        if results.probabilities is not None and not results.probabilities.shape:  # numpy load always returns an array
            prob_format = results.probabilities.item()
            if prob_format == "predictions":
                target_values_enc = dataset.target.label_encoder.transform(dataset.target.values)
                results.probabilities = Encoder('one-hot', target=False, encoded_type=float).fit(target_values_enc).transform(results.predictions)
            else:
                raise ValueError(f"Unknown probabilities format: {prob_format}")
        return results

    return run_in_venv(__file__, "exec.py",
                       input_data=data, dataset=dataset, config=config,
                       process_results=process_results)
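When the sub-process only returns hard class predictions, the `Encoder('one-hot', ...)` fallback above expands them into a degenerate probability matrix (1.0 for the predicted class, 0.0 elsewhere). A rough equivalent with plain scikit-learn, assuming integer-encoded labels (sketch only, not amlb's Encoder):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

def predictions_to_probabilities(predictions, all_classes_enc):
    """One column per known class, 1.0 on the predicted class, 0.0 elsewhere."""
    encoder = OneHotEncoder(categories=[np.unique(all_classes_enc)],
                            sparse_output=False, dtype=float)  # use sparse=False on scikit-learn < 1.2
    return encoder.fit_transform(np.asarray(predictions).reshape(-1, 1))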
Example #3
def run(dataset, config):
    log.info("\n**** Random Forest (sklearn %s) ****\n", sklearn.__version__)

    is_classification = config.type == 'classification'

    # Impute any missing data (can test using -t 146606)
    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y, dataset.test.y

    log.info(
        "Running RandomForest with a maximum time of {}s on {} cores.".format(
            config.max_runtime_seconds, config.cores))
    log.warning("We completely ignore the requirement to stay within the time limit.")
    log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric))

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    rfc = estimator(n_jobs=config.cores, **config.framework_params)

    rfc.fit(X_train, y_train)

    predictions = rfc.predict(X_test)
    probabilities = rfc.predict_proba(X_test) if is_classification else None

    return ns(output_file=config.output_predictions_file,
              probabilities=probabilities,
              predictions=predictions,
              truth=y_test,
              target_is_encoded=False)
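`impute` comes from `amlb.datautils` and fills missing values in the encoded train and test matrices; a plausible sketch of what these scripts assume (fit on the training split, same statistics applied to the test split; the real helper may use a different strategy):

import numpy as np
from sklearn.impute import SimpleImputer

def impute(X_train, X_test, strategy='mean'):
    """Return train/test copies with NaNs replaced using statistics learned on train."""
    imputer = SimpleImputer(strategy=strategy)
    return imputer.fit_transform(X_train), imputer.transform(X_test)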
Example #4
def run(dataset: Dataset, config: TaskConfig):
    X_train_enc, X_test_enc = impute(dataset.train.X_enc, dataset.test.X_enc)
    data = dict(train=dict(X_enc=X_train_enc, y_enc=dataset.train.y_enc),
                test=dict(X_enc=X_test_enc, y_enc=dataset.test.y_enc))

    return run_in_venv(__file__,
                       "exec.py",
                       input_data=data,
                       dataset=dataset,
                       config=config)
Example #5
def run(dataset: Dataset, config: TaskConfig):
    from amlb.datautils import impute
    from frameworks.shared.caller import run_in_venv

    X_train_enc, X_test_enc = impute(dataset.train.X_enc, dataset.test.X_enc)
    data = dict(train=dict(X_enc=X_train_enc, y_enc=dataset.train.y_enc),
                test=dict(X_enc=X_test_enc, y_enc=dataset.test.y_enc))

    return run_in_venv(__file__,
                       "exec.py",
                       input_data=data,
                       dataset=dataset,
                       config=config)
Example #6
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Oboe ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        # regression currently fails (as of 26.02.2019: still under development by the oboe team)
        raise ValueError('Regression is not yet supported (under development).')

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    n_cores = config.framework_params.get('_n_cores', config.cores)

    log.info('Running oboe with a maximum time of {}s on {} cores.'.format(config.max_runtime_seconds, n_cores))
    log.warning('We completely ignore the advice to optimize towards metric: {}.'.format(config.metric))

    aml = AutoLearner(p_type='classification' if is_classification else 'regression',
                      n_cores=n_cores,
                      runtime_limit=config.max_runtime_seconds,
                      **training_params)

    aml_models = lambda: [aml.ensemble, *aml.ensemble.base_learners] if len(aml.ensemble.base_learners) > 0 else []

    with Timer() as training:
        try:
            aml.fit(X_train, y_train)
        except IndexError as e:
            if len(aml_models()) == 0:  # incorrect handling of some IndexError in oboe if ensemble is empty
                raise NoResultError("Oboe could not produce any model in the requested time.") from e
            raise e

    predictions = aml.predict(X_test).reshape(len(X_test))

    if is_classification:
        target_values_enc = dataset.target.label_encoder.transform(dataset.target.values)
        probabilities = Encoder('one-hot', target=False, encoded_type=float).fit(target_values_enc).transform(predictions)
    else:
        probabilities = None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(
        models_count=len(aml_models()),
        training_duration=training.duration
    )
Example #7
def run_random_forest(dataset, config, tuner, log):
    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X, dataset.test.X)
    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor

    best_score, best_params, best_model = None, None, None
    score_higher_better = True

    tuner.update_search_space(SEARCH_SPACE)
    start_time = time.time()
    while True:
        try:
            param_idx, cur_params = tuner.generate_parameters()
            cur_model = estimator(random_state=config.seed, **cur_params)
            # Here score is the output of score() from the estimator
            cur_score = cross_val_score(cur_model, X_train, y_train)
            cur_score = sum(cur_score) / float(len(cur_score))
            if (best_score is None
                    or (score_higher_better and cur_score > best_score)
                    or (not score_higher_better and cur_score < best_score)):
                best_score, best_params, best_model = cur_score, cur_params, cur_model

            log.info("Trial {}: \n{}\nScore: {}\n".format(
                param_idx, cur_params, cur_score))
            tuner.receive_trial_result(param_idx, cur_params, cur_score)

            current_time = time.time()
            elapsed_time = current_time - start_time
            if elapsed_time > config.max_runtime_seconds:
                break
        except Exception:
            # stop tuning on any failure (e.g. the tuner cannot generate more parameters)
            break

    # This line is required to fully terminate some advisors
    tuner.handle_terminate()

    log.info("Tuning done, the best parameters are:\n{}\n".format(best_params))

    # retrain on the whole dataset
    with Timer() as training:
        best_model.fit(X_train, y_train)
    predictions = best_model.predict(X_test)
    probabilities = best_model.predict_proba(X_test) if is_classification else None

    return probabilities, predictions, training, y_test
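`SEARCH_SPACE` is defined elsewhere in the integration; for NNI tuners it is a plain dict in NNI's search-space format (`_type`/`_value` per hyperparameter). A hypothetical example with illustrative ranges, not the benchmark's actual space:

SEARCH_SPACE = {
    "n_estimators": {"_type": "randint", "_value": [16, 512]},
    "max_depth": {"_type": "choice", "_value": [4, 8, 16, 32]},
    "min_samples_leaf": {"_type": "randint", "_value": [1, 8]},
}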
Example #8
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Random Forest (sklearn) ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config

    log.info(
        "Running RandomForest with a maximum time of {}s on {} cores.".format(
            config.max_runtime_seconds, n_jobs))
    log.warning("We completely ignore the requirement to stay within the time limit.")
    log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric))

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    rf = estimator(n_jobs=n_jobs, random_state=config.seed, **training_params)

    with Timer() as training:
        rf.fit(X_train, y_train)

    predictions = rf.predict(X_test)
    probabilities = rf.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(models_count=len(rf), training_duration=training.duration)
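`models_count=len(rf)` works because scikit-learn ensembles implement `__len__` as the number of fitted sub-estimators; a quick illustration:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=100, random_state=0)
rf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
assert len(rf) == len(rf.estimators_) == 10  # one entry per fitted tree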
Example #9
def run(dataset: Dataset, config: TaskConfig):
    log.info("****TabNet****")
    save_metadata(config)

    is_classification = config.type == 'classification'
    X_train, X_test = dataset.train.X, dataset.test.X

    X_train, X_test = impute(X_train, X_test)

    X = np.concatenate((X_train, X_test), axis=0)
    enc = OrdinalEncoder()
    enc.fit(X)
    X_train = enc.transform(X_train)
    X_test = enc.transform(X_test)

    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = TabNetClassifier if is_classification else TabNetRegressor
    predictor = estimator()  # you can change hyperparameters

    if not is_classification:
        y_train = np.reshape(y_train.astype(np.float32), (-1, 1))
        y_test = np.reshape(y_test.astype(np.float32), (-1, 1))

    with Timer() as training:
        predictor.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_test, y_test)])
    with Timer() as predict:
        predictions = predictor.predict(X_test)
    probabilities = predictor.predict_proba(X_test) if is_classification else None

    save_predictions(dataset=dataset,
                     output_file=config.output_predictions_file,
                     probabilities=probabilities,
                     predictions=predictions,
                     truth=y_test)
    return dict(models_count=1,
                training_duration=training.duration,
                predict_duration=predict.duration)
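The OrdinalEncoder above is fit on the concatenated train and test features so that categories appearing only in the test split do not raise at transform time. An alternative that keeps the encoder fit on training data alone (a suggestion, not part of the original script) is to map unseen categories to a sentinel value:

import numpy as np
from sklearn.preprocessing import OrdinalEncoder

X_train = np.array([['a'], ['b'], ['a']])
X_test = np.array([['a'], ['c']])          # 'c' never seen during training

enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train_enc = enc.fit_transform(X_train)   # [[0.], [1.], [0.]]
X_test_enc = enc.transform(X_test)         # [[0.], [-1.]]  -> unseen category becomes -1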
Example #10
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Decision Tree (sklearn) ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = DecisionTreeClassifier if is_classification else DecisionTreeRegressor
    predictor = estimator(random_state=config.seed, **config.framework_params)

    with Timer() as training:
        predictor.fit(X_train, y_train)
    predictions = predictor.predict(X_test)
    probabilities = predictor.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    return dict(models_count=1, training_duration=training.duration)
Example #11
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Hyperopt-sklearn ****\n")

    is_classification = config.type == 'classification'

    # sentinel marking metrics that already map to hyperopt-sklearn's built-in default loss
    default = lambda: 0
    metrics_to_loss_mapping = dict(
        acc=(default, False),  # lambda y, pred: 1.0 - accuracy_score(y, pred)
        auc=(lambda y, pred: 1.0 - roc_auc_score(y, pred), False),
        f1=(lambda y, pred: 1.0 - f1_score(y, pred), False),
        # logloss=(log_loss, True),
        mae=(mean_absolute_error, False),
        mse=(mean_squared_error, False),
        msle=(mean_squared_log_error, False),
        r2=(default, False),  # lambda y, pred: 1.0 - r2_score(y, pred)
    )
    loss_fn, continuous_loss_fn = metrics_to_loss_mapping.get(config.metric, (None, False))
    if loss_fn is None:
        log.warning("Performance metric %s not supported: defaulting to %s.",
                    config.metric, 'accuracy' if is_classification else 'r2')
    if loss_fn is default:
        loss_fn = None

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    log.warning("Ignoring cores constraint of %s cores.", config.cores)
    log.info(
        "Running hyperopt-sklearn with a maximum time of %ss on %s cores, optimizing %s.",
        config.max_runtime_seconds, 'all', config.metric)

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    if is_classification:
        classifier = any_classifier('clf')
        regressor = None
    else:
        classifier = None
        regressor = any_regressor('rgr')

    estimator = HyperoptEstimator(classifier=classifier,
                                  regressor=regressor,
                                  algo=tpe.suggest,
                                  loss_fn=loss_fn,
                                  continuous_loss_fn=continuous_loss_fn,
                                  trial_timeout=config.max_runtime_seconds,
                                  seed=config.seed,
                                  **training_params)

    with InterruptTimeout(config.max_runtime_seconds * 4 / 3,
                          sig=signal.SIGQUIT):
        with InterruptTimeout(config.max_runtime_seconds,
                              before_interrupt=ft.partial(
                                  kill_proc_tree,
                                  timeout=5,
                                  include_parent=False)):
            with Timer() as training:
                estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)

    if is_classification:
        target_values_enc = dataset.target.label_encoder.transform(
            dataset.target.values)
        probabilities = Encoder(
            'one-hot', target=False,
            encoded_type=float).fit(target_values_enc).transform(predictions)
    else:
        probabilities = None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(models_count=len(estimator.trials),
                training_duration=training.duration)
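HyperoptEstimator minimises `loss_fn`, so "higher is better" metrics such as AUC or F1 are flipped into losses as `1.0 - score`, as in the mapping above. The same pattern as a small helper (illustrative only):

from sklearn.metrics import roc_auc_score

def make_loss(score_fn):
    """Turn a 'higher is better' score into a 'lower is better' loss."""
    return lambda y_true, y_pred: 1.0 - score_fn(y_true, y_pred)

auc_loss = make_loss(roc_auc_score)  # equivalent to the `auc` entry in the mapping above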
Example #12
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** TPOT ****\n")

    is_classification = config.type == 'classification'
    # Mapping of benchmark metrics to TPOT metrics
    metrics_mapping = dict(acc='accuracy',
                           auc='roc_auc',
                           f1='f1',
                           logloss='neg_log_loss',
                           mae='neg_mean_absolute_error',
                           mse='neg_mean_squared_error',
                           msle='neg_mean_squared_log_error',
                           r2='r2')
    scoring_metric = metrics_mapping.get(config.metric)
    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config

    log.info(
        'Running TPOT with a maximum time of %ss on %s cores, optimizing %s.',
        config.max_runtime_seconds, n_jobs, scoring_metric)
    runtime_min = (config.max_runtime_seconds / 60)

    estimator = TPOTClassifier if is_classification else TPOTRegressor
    tpot = estimator(n_jobs=n_jobs,
                     max_time_mins=runtime_min,
                     scoring=scoring_metric,
                     random_state=config.seed,
                     **training_params)

    with Timer() as training:
        tpot.fit(X_train, y_train)

    log.info('Predicting on the test set.')
    predictions = tpot.predict(X_test)
    try:
        probabilities = tpot.predict_proba(X_test) if is_classification else None
    except RuntimeError:
        # TPOT throws a RuntimeError if the optimized pipeline does not support `predict_proba`.
        target_values_enc = dataset.target.label_encoder.transform(
            dataset.target.values)
        probabilities = Encoder(
            'one-hot', target=False,
            encoded_type=float).fit(target_values_enc).transform(predictions)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=is_classification)

    save_artifacts(tpot, config)

    return dict(models_count=len(tpot.evaluated_individuals_),
                training_duration=training.duration)
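`save_artifacts` typically persists what the search produced; TPOT itself can export the best evolved pipeline as a standalone script, one artifact such a helper might save (illustrative usage, not the benchmark's helper):

# after tpot.fit(...): write the best evolved pipeline out as runnable Python code
tpot.export('tpot_best_pipeline.py')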
Example #13
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Tuned Random Forest (sklearn) ****\n")

    is_classification = config.type == 'classification'

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    tuning_params = config.framework_params.get('_tuning', training_params)
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config

    # Impute any missing data (can test using -t 146606)
    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    log.info(
        "Running RandomForest with a maximum time of {}s on {} cores.".format(
            config.max_runtime_seconds, n_jobs))

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    metric = dict(auc='roc_auc', logloss='neg_log_loss',
                  acc='accuracy')[config.metric]

    n_features = X_train.shape[1]
    default_value = max(1, int(math.sqrt(n_features)))
    below_default = pick_values_uniform(start=1, end=default_value, length=5 + 1)[:-1]  # 5 below
    above_default = pick_values_uniform(start=default_value, end=n_features,
                                        length=10 + 1 - len(below_default))[1:]  # 5 above
    # Alternative (disabled): mix up the order of `max_features` values so that a fair range
    # is still covered when there is too little time to try them all.
    # Order would be: [sqrt(p), 1, p, random order for remaining values]
    # max_features_to_try = below_default[1:] + above_default[:-1]
    # max_features_values = ([default_value, 1, n_features]
    #                        + random.sample(max_features_to_try, k=len(max_features_to_try)))
    max_features_values = [default_value] + below_default + above_default
    # Define up to how much of the total time we spend 'optimizing' `max_features`
    # (the remainder is used for fitting the final model).
    safety_factor = 0.85
    with stopit.ThreadingTimeout(seconds=int(config.max_runtime_seconds *
                                             safety_factor)):
        log.info("Evaluating multiple values for `max_features`: %s.",
                 max_features_values)
        max_feature_scores = []
        tuning_durations = []
        for i, max_features_value in enumerate(max_features_values):
            log.info("[{:2d}/{:2d}] Evaluating max_features={}".format(
                i + 1, len(max_features_values), max_features_value))
            imputation = Imputer()
            random_forest = estimator(n_jobs=n_jobs,
                                      random_state=config.seed,
                                      max_features=max_features_value,
                                      **tuning_params)
            pipeline = Pipeline(steps=[('preprocessing', imputation),
                                       ('learning', random_forest)])
            with Timer() as cv_scoring:
                try:
                    scores = cross_val_score(estimator=pipeline,
                                             X=dataset.train.X_enc,
                                             y=dataset.train.y_enc,
                                             scoring=metric,
                                             cv=5)
                    max_feature_scores.append(
                        (statistics.mean(scores), max_features_value))
                except stopit.utils.TimeoutException as toe:
                    log.error(
                        "Failed CV scoring for max_features=%s : Timeout",
                        max_features_value)
                    tuning_durations.append(
                        (max_features_value, cv_scoring.duration))
                    raise toe
                except Exception as e:
                    log.error("Failed CV scoring for max_features=%s :\n%s",
                              max_features_value, e)
                    log.debug("Exception:", exc_info=True)
            tuning_durations.append((max_features_value, cv_scoring.duration))

    log.info("Tuning scores:\n%s", sorted(max_feature_scores))
    log.info("Tuning durations:\n%s", sorted(tuning_durations))
    _, best_max_features_value = (max(max_feature_scores) if max_feature_scores
                                  else (math.nan, 'auto'))
    log.info("Training final model with `max_features={}`.".format(
        best_max_features_value))
    rf = estimator(n_jobs=n_jobs,
                   random_state=config.seed,
                   max_features=best_max_features_value,
                   **training_params)
    with Timer() as training:
        rf.fit(X_train, y_train)

    predictions = rf.predict(X_test)
    probabilities = rf.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(models_count=len(rf),
                training_duration=training.duration + sum(d for _, d in tuning_durations))
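`pick_values_uniform` is another benchmark helper; from its use above it returns `length` integer values spread evenly between `start` and `end` inclusive. A plausible sketch (not the actual implementation; de-duplication may shorten the list):

import numpy as np

def pick_values_uniform(start, end, length):
    """Evenly spaced integer values from start to end inclusive, duplicates removed."""
    values = np.linspace(start, end, num=length)
    return sorted({int(round(v)) for v in values})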
Example #14
def run(dataset: Dataset, config: TaskConfig):
    log.info(
        "\n**** Random Forest (sklearn) Tuned with NNI EvolutionTuner ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X, dataset.test.X)
    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    # model = estimator(random_state=config.seed, **config.framework_params)
    best_score, best_params, best_model = None, None, None
    score_higher_better = True

    log.info(
        "Tuning hyperparameters with NNI EvolutionTuner with a maximum time of {}s\n"
        .format(config.max_runtime_seconds))
    tuner = EvolutionTuner()
    tuner.update_search_space(SEARCH_SPACE)
    start_time = time.time()
    param_idx = 0
    while True:
        try:
            cur_params = tuner.generate_parameters(param_idx)
            cur_model = estimator(random_state=config.seed,
                                  **cur_params,
                                  **config.framework_params)
            # Here score is the output of score() from the estimator
            cur_score = cross_val_score(cur_model, X_train, y_train)
            cur_score = sum(cur_score) / float(len(cur_score))
            if (best_score is None
                    or (score_higher_better and cur_score > best_score)
                    or (not score_higher_better and cur_score < best_score)):
                best_score, best_params, best_model = cur_score, cur_params, cur_model

            log.info("Trial {}: \n{}\nScore: {}\n".format(
                param_idx, cur_params, cur_score))
            tuner.receive_trial_result(param_idx, cur_params, cur_score)
            param_idx += 1
            current_time = time.time()
            elapsed_time = current_time - start_time
            if elapsed_time > config.max_runtime_seconds:
                break
        except Exception:
            # stop tuning on any failure (e.g. the tuner cannot generate more parameters)
            break

    log.info("Tuning done, the best parameters are:\n{}\n".format(best_params))

    # retrain on the whole dataset
    with Timer() as training:
        best_model.fit(X_train, y_train)
    predictions = best_model.predict(X_test)
    probabilities = best_model.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    return dict(models_count=1, training_duration=training.duration)