Example #1
    def fit(self, X, Y, sample_weight=None):
        if sample_weight is not None:
            Xt, Xtest, Yt, Ytest, sample_weightt, sample_weighttest = model_selection.train_test_split(X, Y, sample_weight, test_size=0.1, random_state=0)
        else:
            Xt, Xtest, Yt, Ytest  = model_selection.train_test_split(X, Y, test_size=0.2, random_state=0)
            sample_weightt = None
            sample_weighttest = None

        self.clf.fit(Xt, Yt, sample_weightt)
        #Y1 = self.clf.predict(Xtest)
        logging.info("Doing scoring Xtest=%s Ytest=%s" % (str(Xtest.shape), str(Ytest.shape)))
        score1 = self.scorer(self.clf, Xtest, Ytest)
        should_stop_count = 0

        improvement_buffer = []

        for i in xrange(0,1000):
            logging.info("IML training iteration %d (should_stop=%d)" % (i, should_stop_count))
            t1 = unix_time_millis()
            clf2 = self.model(self.params)
            t2 = unix_time_millis()
            clf2.fit(Xt, Yt, sample_weightt)
            t3 = unix_time_millis()
            self.merge(clf2)
            t4 = unix_time_millis()
            #Y2 = self.clf.predict(Xtest)
            score2 = self.scorer(self.clf, Xtest, Ytest)
            t5 = unix_time_millis()
            self.last_increase = score2/score1

            improvement_buffer.append(self.last_increase)
            if len(improvement_buffer) > improvement_buffer_size:
                improvement_buffer.pop(0)

            cum_improvement = reduce(lambda cum, x: cum * x, improvement_buffer)

            logging.info("IML run done, score: %f -> %f last_inc=%.3f imp_buf=%s cum_imp=%.3f" % (score1, score2, self.last_increase, improvement_buffer, cum_improvement))
            logging.info(" IML run timing : create=%f fit=%f merge=%f score=%f total=%f" % (t2-t1, t3-t2, t4-t3, t5-t4, t5-t1))

            #if not self.last_increase > step_improvement_min:
            #    should_stop_count = should_stop_count + 1
            #else:
            #    should_stop_count = 0
            #if i > nb_trees_per_steps and should_stop_count >= no_improvement_steps_threshold:
            #    break
            if i > min_steps and cum_improvement <= min_improvement_over_buffer:
                break
            #Y1 = Y2
            score1 = score2
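
The stopping rule above multiplies the last few score ratios and halts once their product falls to the threshold. A minimal sketch of that criterion in isolation, assuming illustrative values for the module-level constants improvement_buffer_size, min_steps and min_improvement_over_buffer (their real values live elsewhere in the original module):

from functools import reduce  # built-in on Python 2, import needed on Python 3

# Assumed illustrative values for the module-level constants used by fit() above.
improvement_buffer_size = 5
min_steps = 10
min_improvement_over_buffer = 1.001

def should_stop(step, improvement_buffer):
    # Cumulative improvement = product of the buffered score ratios.
    cum_improvement = reduce(lambda cum, x: cum * x, improvement_buffer, 1.0)
    return step > min_steps and cum_improvement <= min_improvement_over_buffer

print(should_stop(20, [1.01, 1.002, 1.005, 1.001, 1.003]))  # False: still improving
print(should_stop(20, [1.0, 0.999, 1.0, 1.0, 1.0]))         # True: window has stagnated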
Example #2
def main(exec_folder, output_dataset, keptInputColumns):
    start = unix_time_millis()
    listener = ProgressListener()

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))
    modeling_params = json.load(open(osp.join(exec_folder, "rmodeling_params.json")))

    with listener.push_state(constants.STATE_LOADING_SRC):
        input_df = df_from_split_desc_no_normalization(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % input_df.shape)
        input_df_orig = input_df.copy()
        input_df = utils.normalize_dataframe(input_df, preprocessing_params["per_feature"])        

    with listener.push_state("Collecting preprocessing data"):
        collector = ClusteringPreprocessingDataCollector(input_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        transformed_train = pipeline.fit_and_process(input_df)

    start_train = unix_time_millis()

    (clf, actual_params, cluster_labels, additional_columns) = clustering_fit(modeling_params, transformed_train)

    # if model has custom labels, use them
    try:
        cluster_names = clf.get_cluster_labels()
    except AttributeError:
        cluster_names = ["cluster_%s" % i for i in range(len(np.unique(cluster_labels)))]
    cl = pd.Series(data=cluster_labels, name="cluster_labels").map(lambda i: cluster_names[i])
    cl.index = transformed_train["TRAIN"].index

    final_df = pd.concat([input_df_orig.join(cl, how='left'), additional_columns], axis=1)

    if keptInputColumns is not None:
        final_df = final_df[keptInputColumns + ['cluster_labels']]

    if preprocessing_params["outliers"]["method"] == "CLUSTER":
        final_df['cluster_labels'].fillna(constants.CLUSTER_OUTLIERS, inplace=True)

    dataiku.Dataset(output_dataset).write_from_dataframe(final_df)

    end = unix_time_millis()

    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
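
The labeling steps above map integer cluster ids to display names, left-join them onto the original rows, and backfill rows that received no cluster (the CLUSTER outliers method) with a dedicated label. A small standalone pandas sketch of the same idea, with an illustrative outlier constant standing in for constants.CLUSTER_OUTLIERS:

import pandas as pd

cluster_labels = [0, 2, 1, 0]
cluster_names = ["cluster_0", "cluster_1", "cluster_2"]

# Map integer ids to names, exactly like the Series.map call above.
cl = pd.Series(cluster_labels, name="cluster_labels").map(lambda i: cluster_names[i])
df = pd.DataFrame({"x": [10, 20, 30, 40, 50]})  # the last row gets no cluster label

final = df.join(cl, how="left")  # left join keeps all input rows
final["cluster_labels"].fillna("cluster_outliers", inplace=True)  # illustrative constant
print(final)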
Example #3
def main(exec_folder):
    start = unix_time_millis()
    listener = ProgressListener()

    def update_fn():
        utils.write_running_traininfo(exec_folder, start, listener)

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    preprocessing_params = json.load(
        open(osp.join(exec_folder, "rpreprocessing_params.json")))
    modeling_params = json.load(
        open(osp.join(exec_folder, "rmodeling_params.json")))

    with listener.push_state(constants.STATE_LOADING_SRC):
        update_fn()
        train_df = df_from_split_desc(split_desc, "full",
                                      preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % train_df.shape)

    with listener.push_state("Collecting preprocessing data"):
        update_fn()
        collector = ClusteringPreprocessingDataCollector(
            train_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params,
                                                     exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        orig_index = train_df.index.copy()
        transformed_train = pipeline.fit_and_process(train_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    start_train = unix_time_millis()

    clustering_train_score_save(transformed_train, orig_index,
                                preprocessing_params, modeling_params,
                                exec_folder, listener, update_fn, pipeline)

    end = unix_time_millis()

    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
Example #4
        def __init__(self, run_folder, modeling_params, validation_sequence,
                     prediction_type, test_df_index, target_map,
                     use_multi_gpus, base_model):
            self.run_folder = run_folder
            self.modeling_params = modeling_params
            self.validation_sequence = validation_sequence
            self.prediction_type = prediction_type
            self.test_df_index = test_df_index
            self.target_map = target_map
            self.use_multi_gpus = use_multi_gpus
            self.base_model = base_model

            self.epoch_start = None
            self.all_scorers = get_grid_scorers(
                self.modeling_params,
                self.prediction_type,
                self.target_map,
                custom_make_scorer=self._scorer_func)
            self.model_best_score = None

            # Share the name of the metric used to optimize the model
            # The user can then retrieve it, e.g. to write their own callback
            self.evaluation_metric = self.modeling_params['metrics'][
                'evaluationMetric']
            set_variable(
                "DKU_MODEL_METRIC",
                "Test {}".format(METRICS_NAMES[self.evaluation_metric]))
            set_variable(
                "DKU_MODEL_METRIC_GREATER_IS_BETTER",
                greater_is_better(
                    self.evaluation_metric,
                    self.modeling_params["metrics"].get(
                        "customEvaluationMetricGIB", True)))

            # Initialize model info
            self.model_training_info = {
                "startedAt": unix_time_millis(),
                "epochs": [],
                'metric': modeling_params["metrics"]["evaluationMetric"],
            }

            # We want to compute the metrics on the training data as well. To do it in a Keras way,
            # we retrieve, after each batch, the values of y and y_pred for that batch (for the model at
            # that stage of the training), accumulate them, and then compute the score on all the values
            # retrieved during the epoch. This means the result does not correspond exactly to the score
            # on the training data with a fixed model at the end of an epoch, but to the score of an
            # evolving model. Those values are stored in TensorFlow Variables in the model, so we need
            # to tell TensorFlow that we want to retrieve them.

            # Variables to accumulate values of y and y_pred after each batch
            self.y_list = None
            self.y_pred_list = None

            # TensorFlow Variables that are placeholders for values of y and y_pred
            self.var_y = tf.Variable(0., validate_shape=False)
            self.var_y_pred = tf.Variable(0., validate_shape=False)
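
The comment block above describes the accumulation trick: collect the per-batch y / y_pred pairs during the epoch, then score their concatenation once at epoch end, accepting that the result reflects an evolving model. The pattern can be illustrated without Keras or TensorFlow; the scorer and batch source below are stand-ins, not the original API:

import numpy as np

def epoch_train_score(batches, scorer):
    # batches yields (y, y_pred) pairs as captured after each batch;
    # the score therefore mixes predictions from an evolving model.
    y_list, y_pred_list = [], []
    for y, y_pred in batches:
        y_list.append(y)
        y_pred_list.append(y_pred)
    return scorer(np.concatenate(y_list), np.concatenate(y_pred_list))

# Toy usage: an accuracy scorer over two fake batches.
acc = lambda y, y_pred: float(np.mean(y == (y_pred > 0.5)))
fake_batches = [(np.array([0, 1]), np.array([0.2, 0.9])),
                (np.array([1, 0]), np.array([0.6, 0.4]))]
print(epoch_train_score(fake_batches, acc))  # 1.0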
Example #5
        def on_epoch_begin(self, epoch, logs=None):
            self.epoch_start = unix_time_millis()
            self.model_training_info["currentNumStepsTraining"] = 0
            self.model_training_info["currentNumStepsScoring"] = 0
            self.model_training_info["currentEpoch"] = epoch
            self._update_model_info()

            # Reinitialize the accumulators of y and y_pred at the beginning of each epoch.
            self.y_list = []
            self.y_pred_list = []
Example #6
        def _update_epoch_graph(self, train_score, test_score, epoch):
            epoch_finish_time = unix_time_millis()

            new_point = {
                'time': epoch_finish_time - self.epoch_start,
                'index': epoch + 1,
                'trainScore': train_score,
                'testScore': test_score,
                "epoch": epoch
            }
            self.model_training_info['epochs'].append(new_point)
            self._update_model_info()
Example #7
    def __init__(self,
                 parallel,
                 m_folder=None,
                 n_splits=None,
                 n_candidates=None,
                 timeout=None,
                 n_jobs=None,
                 evaluation_metric=None,
                 metric_sign=1):
        self.parallel = parallel
        self.m_folder = m_folder
        self.n_splits = n_splits
        self.n_candidates = n_candidates
        self._watching = self.m_folder is not None
        self.grid_search_summary = []
        # NB: timeout is expressed in minutes
        self.end_time = time.time() + timeout * 60 if timeout is not None else None
        self.initial_grid_points = []
        self.initial_grid_point_ids = []
        self.n_jobs = n_jobs
        self.evaluation_metric = evaluation_metric
        self.metric_sign = metric_sign
        self.start_time = unix_time_millis()
        self.is_interrupted = False

        if self._watching:
            self.grid_folder = os.path.join(self.m_folder, 'grid')
            self.grid_tmp_folder = os.path.join(self.m_folder, 'grid.tmp')
            interrupt_optimization.set_interrupt_folder(self.m_folder)
            self.grid_search_file = os.path.join(self.m_folder,
                                                 'grid_search_done_py.json')
            self.grid_search_summary = dkujson.load_from_filepath(self.grid_search_file) \
                if os.path.exists(self.grid_search_file) else []
            self.initial_grid_point_ids = [
                x['grid_point_id'] for x in self.grid_search_summary
            ]
            self.initial_grid_points = self.grid_search_summary[:]
            for grid_point_id in self.initial_grid_point_ids:
                logging.info(
                    "Using precomputed score for Grid point {}".format(
                        grid_point_id))

        super(CVInterruptWatcherThread, self).__init__()
Example #8
def _dku_fit_and_score(estimator,
                       X,
                       y,
                       scorer,
                       train,
                       test,
                       verbose,
                       is_interruptible,
                       parameters,
                       cvwatcher,
                       fit_params,
                       error_score='raise',
                       m_folder=None,
                       split_id=None,
                       parameter_id=None,
                       sample_weight=None,
                       algo_supports_weight=True):
    if cvwatcher.is_interrupted and is_interruptible:
        return None

    current_thread = threading.current_thread()
    current_thread.name = "GS-%s" % (current_thread.ident)

    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        logging.info("Fit  p=%s s=%s: %s %s" % (parameter_id, split_id, msg,
                                                (64 - len(msg)) * '.'))

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = unix_time_millis()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    fit_params = fit_params if fit_params is not None else {}

    # XGBoost early stopping
    if fit_params.get("early_stopping_rounds") is not None:
        if fit_params.get("eval_set") is None:
            # log the train and test objective but optimize on the test (last tuple used for early stopping eval)
            fit_params["eval_set"] = [(X_train, y_train), (X_test, y_test)]
        else:
            pass  # still keep the possibility to use a fixed eval_set

    if sample_weight is not None:
        w_train, _ = _safe_split(estimator, sample_weight, y, train)
        w_test, _ = _safe_split(estimator, sample_weight, y, test)
        if algo_supports_weight:
            # fit with sample weights whenever they are enabled AND the algorithm supports them
            fit_params["sample_weight"] = np.array(w_train)

    # Adjust fit params (e.g. sample weights) to the length of the train split
    fit_params = dict([(k, _dku_index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = unix_time_millis() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = unix_time_millis() - start_time
        if sample_weight is not None:
            # score with sample weights whenever they are enabled, regardless of the support by the algorithm
            test_score = _dku_score(estimator,
                                    X_test,
                                    y_test,
                                    scorer,
                                    sample_weight=w_test,
                                    indices=test)
            train_score = _dku_score(estimator,
                                     X_train,
                                     y_train,
                                     scorer,
                                     sample_weight=w_train,
                                     indices=train)
        else:
            test_score = _dku_score(estimator,
                                    X_test,
                                    y_test,
                                    scorer,
                                    indices=test)
            train_score = _dku_score(estimator,
                                     X_train,
                                     y_train,
                                     scorer,
                                     indices=train)
        score_time = unix_time_millis() - start_time - fit_time
    if verbose > 1:
        end_msg = "%s (ft=%.1fs st=%.1fs sc=%s)" % (
            msg, fit_time / 1000, score_time / 1000, test_score)
        logging.info("Done p=%s s=%s: %s" % (parameter_id, split_id, end_msg))
    num_samples = _num_samples(X_test)
    best_iteration = getattr(estimator, 'best_iteration', None)
    ret = {
        "train_score": train_score,
        "test_score": test_score,
        "num_samples": num_samples,
        "fit_time": fit_time,
        "score_time": score_time,
        "time": fit_time + score_time,
        "parameters": parameters,
        "parameter_id": parameter_id,
        "grid_point_id": get_grid_point_id(parameters, split_id),
        "best_iteration": best_iteration,
        "done_at": unix_time_millis()
    }
    if m_folder is not None:
        tmp_file = os.path.join(
            m_folder, 'grid.tmp/grid_search_{}.{}.gridpoint'.format(
                parameter_id, split_id))
        dest_file = os.path.join(
            m_folder,
            'grid/grid_search_{}.{}.gridpoint'.format(parameter_id, split_id))
        dkujson.dump_to_filepath(tmp_file, ret)
        os.rename(tmp_file, dest_file)
    return ret
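
The early-stopping branch above only fills in a default eval_set; what XGBoost then does with it can be sketched directly. A hedged example of the same convention (log both splits, let the last tuple drive early stopping), using the pre-2.0 xgboost API in which early_stopping_rounds is a fit() argument, consistent with the fit_params usage above; in xgboost >= 2.0 it moved to the estimator constructor:

import numpy as np
from xgboost import XGBRegressor  # assumed available

rng = np.random.RandomState(0)
X = rng.rand(200, 4)
y = X.dot(np.array([1.0, -2.0, 0.5, 0.0])) + 0.01 * rng.randn(200)
X_train, X_test = X[:160], X[160:]
y_train, y_test = y[:160], y[160:]

model = XGBRegressor(n_estimators=500)
# Same convention as above: the train objective is logged, and the *last*
# tuple (the test split) is the one early stopping evaluates.
model.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_test, y_test)],
          early_stopping_rounds=10,
          verbose=False)
print(model.best_iteration)  # number of boosting rounds actually kept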
Example #9
def main(exec_folder, selection_state_folder, operation_mode):
    """The whole execution of the saved model train takes place in a single folder ?"""
    start = unix_time_millis()
    start_train = start
    listener = ProgressListener()

    def update_fn():
        utils.write_running_traininfo(exec_folder, start, listener)

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    core_params = json.load(open(osp.join(exec_folder, "core_params.json")))
    preprocessing_params = json.load(
        open(osp.join(exec_folder, "rpreprocessing_params.json")))
    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {
        "SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"
    }
    with_class_weight = weight_method in {
        "CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"
    }
    calibrate_proba = core_params.get("calibration",
                                      {}).get("calibrationMethod",
                                              None) in ["SIGMOID", "ISOTONIC"]
    modeling_params = json.load(
        open(osp.join(exec_folder, "rmodeling_params.json")))

    # For KERAS backend, need to tag special features, because they are only processed with the
    # process function, not fit_and_process
    if modeling_params["algorithm"] == "KERAS_CODE":
        tag_special_features(preprocessing_params['per_feature'])

    def do_full_fit_and_save():
        """Fit on 100% and save the clf and out params"""
        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            full_df = df_from_split_desc(split_desc, "full",
                                         preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded FULL df: shape=(%d,%d)" % full_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = ClusteringPreprocessingDataCollector(
                full_df, preprocessing_params)
            collector_data = collector.build()

            pipeline, preproc_handler = build_pipeline_and_handler(
                collector_data,
                core_params,
                exec_folder,
                preprocessing_params,
                selection_state_folder=selection_state_folder,
                allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")

            # TODO
            if core_params["prediction_type"] in (
                    constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                target_map = preproc_handler.target_map
            else:
                target_map = None

        with listener.push_state("Preprocessing full set"):
            preprocessor_fit_full_df = full_df

            # For KERAS backend, we might need to take a subsample of the input_df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                need_subsampling = preprocessing_params[
                    "preprocessingFitSampleRatio"] < 1
                full_df_orig = full_df.copy()
                if need_subsampling:
                    preprocessor_fit_full_df = preprocessor_fit_full_df.sample(
                        frac=preprocessing_params[
                            "preprocessingFitSampleRatio"],
                        random_state=preprocessing_params[
                            "preprocessingFitSampleSeed"])

            transformed_full = pipeline.fit_and_process(
                preprocessor_fit_full_df)

            if with_sample_weight:
                assert transformed_full["weight"].values.min(
                ) > 0, "Sample weights must be positive"

            preproc_handler.save_data()
            preproc_handler.report(pipeline)

        if modeling_params["algorithm"] == "KERAS_CODE":

            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():

                status = utils.make_running_traininfo(
                    modeling_set["run_folder"], start,
                    modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            empty_df = pd.DataFrame()

            return prediction_train_model_keras(
                transformed_full, full_df_orig, empty_df, pipeline,
                modeling_params, core_params,
                preprocessing_params["per_feature"], exec_folder, listener,
                update_modeling_state, preproc_handler.target_map,
                pipeline.generated_features_mapping)

        else:
            return fit_score_save(pipeline, target_map, transformed_full)

    def fit_score_save(pipeline, target_map, transformed_full):
        with listener.push_state(constants.STATE_FITTING):
            update_fn()
            if core_params["prediction_type"] in (
                    constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                (clf, out_params, prepared_X, iipd) = classification_fit(
                    modeling_params,
                    split_desc,
                    transformed_full,
                    core_params["prediction_type"],
                    exec_folder,
                    target_map=target_map,
                    with_sample_weight=with_sample_weight,
                    with_class_weight=with_class_weight)
                if calibrate_proba:
                    method = core_params.get(
                        "calibration", {}).get("calibrationMethod").lower()
                    calibrated_clf = CalibratedClassifierCV(clf,
                                                            cv="prefit",
                                                            method=method)
                    test_X = transformed_full["TRAIN"]
                    # NB: this scope has no modeling_set; the local modeling_params dict is used instead
                    test_X, is_sparse = prepare_multiframe(test_X, modeling_params)
                    test_y = transformed_full["target"].astype(int)
                    if with_sample_weight:
                        test_weight = transformed_full["weight"].astype(float)
                        calibrated_clf.fit(test_X,
                                           test_y,
                                           sample_weight=test_weight)
                    else:
                        calibrated_clf.fit(test_X, test_y)
                    clf = calibrated_clf
            else:
                (clf, out_params, prepared_X, iipd) = regression_fit_single(
                    modeling_params,
                    split_desc,
                    transformed_full,
                    exec_folder,
                    with_sample_weight=with_sample_weight)

        with listener.push_state(constants.STATE_SAVING):
            save_prediction_model(clf, out_params, listener, update_fn,
                                  exec_folder)
        with listener.push_state(constants.STATE_SCORING):
            train_X = transformed_full["TRAIN"]
            train_y = transformed_full["target"]
            if core_params[
                    "prediction_type"] == constants.BINARY_CLASSIFICATION:
                ClassificationModelIntrinsicScorer(modeling_params, clf,
                                                   train_X, train_y, pipeline,
                                                   exec_folder, prepared_X,
                                                   iipd,
                                                   calibrate_proba).score()
                BinaryModelSerializer(train_X.columns(), clf, modeling_params,
                                      exec_folder, target_map,
                                      calibrate_proba).serialize()
            elif core_params["prediction_type"] == constants.MULTICLASS:
                ClassificationModelIntrinsicScorer(modeling_params, clf,
                                                   train_X, train_y, pipeline,
                                                   exec_folder, prepared_X,
                                                   iipd,
                                                   calibrate_proba).score()
                MulticlassModelSerializer(train_X.columns(), clf,
                                          modeling_params, exec_folder,
                                          target_map,
                                          calibrate_proba).serialize()
            else:
                RegressionModelIntrinsicScorer(modeling_params, clf, train_X,
                                               train_y, pipeline, exec_folder,
                                               prepared_X, iipd).score()
                RegressionModelSerializer(train_X.columns(), clf,
                                          modeling_params,
                                          exec_folder).serialize()
        return out_params

    if operation_mode == "TRAIN_SPLITTED_ONLY":

        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            train_df = df_from_split_desc(split_desc, "train",
                                          preprocessing_params["per_feature"],
                                          core_params["prediction_type"])
            logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

        with listener.push_state(constants.STATE_LOADING_TEST):
            update_fn()
            test_df = df_from_split_desc(split_desc, "test",
                                         preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = PredictionPreprocessingDataCollector(
                train_df, preprocessing_params)
            collector_data = collector.build()
            pipeline, preproc_handler = build_pipeline_and_handler(
                collector_data,
                core_params,
                exec_folder,
                preprocessing_params,
                selection_state_folder=selection_state_folder,
                allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")

            # TODO
            if core_params["prediction_type"] in (
                    constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                target_map = preproc_handler.target_map
            else:
                target_map = None

        with listener.push_state("Preprocessing train set"):
            preprocessor_fit_df = train_df

            # For KERAS backend, we might need to take a subsample of the input_df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                train_df_orig = train_df.copy()
                need_subsampling = preprocessing_params[
                    "preprocessingFitSampleRatio"] < 1
                if need_subsampling:
                    preprocessor_fit_df = preprocessor_fit_df.sample(
                        frac=preprocessing_params[
                            "preprocessingFitSampleRatio"],
                        random_state=preprocessing_params[
                            "preprocessingFitSampleSeed"])

            transformed_train = pipeline.fit_and_process(preprocessor_fit_df)
            if with_sample_weight:
                assert transformed_train["weight"].values.min(
                ) > 0, "Sample weights must be positive"

            preproc_handler.save_data()
            preproc_handler.report(pipeline)

        # For KERAS backend, cannot process test directly, because it may have special features
        # that may not fit in memory
        if modeling_params["algorithm"] != "KERAS_CODE":
            with listener.push_state("Preprocessing test set"):
                test_df_index = test_df.index.copy()
                transformed_test = pipeline.process(test_df)
                if with_sample_weight:
                    assert transformed_test["weight"].values.min(
                    ) > 0, "Sample weights must be positive"

        if modeling_params["algorithm"] == "PYTHON_ENSEMBLE":
            prediction_train_score_save_ensemble(train_df, test_df,
                                                 core_params, split_desc,
                                                 modeling_params, exec_folder,
                                                 listener, target_map,
                                                 update_fn, pipeline,
                                                 with_sample_weight)
        elif modeling_params["algorithm"] == "KERAS_CODE":
            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():
                status = utils.make_running_traininfo(
                    modeling_set["run_folder"], start,
                    modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            prediction_train_model_keras(
                transformed_train, train_df_orig, test_df, pipeline,
                modeling_params, core_params,
                preprocessing_params["per_feature"], exec_folder, listener,
                update_modeling_state, preproc_handler.target_map,
                pipeline.generated_features_mapping)
        else:
            prediction_train_score_save(transformed_train, transformed_test,
                                        test_df_index, core_params, split_desc,
                                        modeling_params, exec_folder, listener,
                                        target_map, update_fn, pipeline,
                                        exec_folder)

    elif operation_mode == "TRAIN_FULL_ONLY":
        # Not yet functional ...
        do_full_fit_and_save()

    elif operation_mode == "TRAIN_KFOLD":
        out_params = do_full_fit_and_save()

        full_df_clean = df_from_split_desc(split_desc, "full",
                                           preprocessing_params["per_feature"],
                                           core_params["prediction_type"])

        optimized_params = out_params["resolved"]

        logging.info("Regridifying post-train params: %s" %
                     json.dumps(optimized_params))

        # Regridify to a unary grid the optimized params
        optimized_params_grid = intercom.backend_json_call(
            "ml/prediction/regridify-to-pretrain", {
                "preTrain": json.dumps(modeling_params),
                "postTrain": json.dumps(optimized_params)
            })
        logging.info("Using unary grid params: %s" %
                     json.dumps(optimized_params_grid))

        prediction_train_model_kfold(full_df_clean, core_params, split_desc,
                                     preprocessing_params,
                                     optimized_params_grid, exec_folder,
                                     exec_folder, listener, update_fn,
                                     with_sample_weight, with_class_weight,
                                     calibrate_proba)

    else:
        do_full_fit_and_save()
        # Do the split and scoring but don't save data
        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            train_df = df_from_split_desc(split_desc, "train",
                                          preprocessing_params["per_feature"],
                                          core_params["prediction_type"])
            logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

        with listener.push_state(constants.STATE_LOADING_TEST):
            update_fn()
            test_df = df_from_split_desc(split_desc, "test",
                                         preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = PredictionPreprocessingDataCollector(
                train_df, preprocessing_params)
            collector_data = collector.build()

            pipeline, preproc_handler = build_pipeline_and_handler(
                collector_data,
                core_params,
                exec_folder,
                preprocessing_params,
                selection_state_folder=selection_state_folder,
                allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")

            # TODO
            if core_params["prediction_type"] in (
                    constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                target_map = preproc_handler.target_map
            else:
                target_map = None

        with listener.push_state("Preprocessing train set"):
            preprocessor_fit_df = train_df

            # For KERAS backend, we might need to take a subsample of the input_df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                need_subsampling = preprocessing_params[
                    "preprocessingFitSampleRatio"] < 1
                train_df_orig = train_df.copy()
                if need_subsampling:
                    preprocessor_fit_df = preprocessor_fit_df.sample(
                        frac=preprocessing_params[
                            "preprocessingFitSampleRatio"],
                        random_state=preprocessing_params[
                            "preprocessingFitSampleSeed"])

            transformed_train = pipeline.fit_and_process(preprocessor_fit_df)

        # For KERAS backend, cannot process test directly, because it may have special features
        # that may not fit in memory
        if modeling_params["algorithm"] != "KERAS_CODE":
            with listener.push_state("Preprocessing test set"):
                test_df_index = test_df.index.copy()
                transformed_test = pipeline.process(test_df)

        if modeling_params["algorithm"] == "KERAS_CODE":
            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():
                status = utils.make_running_traininfo(
                    modeling_set["run_folder"], start,
                    modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            prediction_train_model_keras(transformed_train,
                                         train_df_orig,
                                         test_df,
                                         pipeline,
                                         modeling_params,
                                         core_params,
                                         preprocessing_params["per_feature"],
                                         exec_folder,
                                         listener,
                                         update_modeling_state,
                                         preproc_handler.target_map,
                                         pipeline.generated_features_mapping,
                                         save_model=False)
        else:
            with listener.push_state(constants.STATE_FITTING):
                update_fn()
                if core_params["prediction_type"] in (
                        constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                    (clf, out_params, prepared_X, iipd) = classification_fit(
                        modeling_params,
                        split_desc,
                        transformed_train,
                        core_params["prediction_type"],
                        target_map=target_map,
                        with_sample_weight=with_sample_weight,
                        with_class_weight=with_class_weight)
                else:
                    (clf, out_params, prepared_X,
                     iipd) = regression_fit_single(
                         modeling_params,
                         split_desc,
                         transformed_train,
                         exec_folder,
                         with_sample_weight=with_sample_weight)
            with listener.push_state(constants.STATE_SCORING):
                train_X = transformed_train["TRAIN"]
                train_y = transformed_train["target"]
                if core_params[
                        "prediction_type"] == constants.BINARY_CLASSIFICATION:
                    ClassificationModelIntrinsicScorer(
                        modeling_params, clf, train_X, train_y, pipeline,
                        exec_folder, prepared_X, iipd,
                        calibrate_proba).score()
                    BinaryModelSerializer(train_X.columns(), clf,
                                          modeling_params, exec_folder,
                                          target_map).serialize()
                    binary_classification_scorer_with_valid(
                        modeling_params,
                        clf,
                        transformed_test,
                        exec_folder,
                        test_df_index,
                        target_map=target_map,
                        with_sample_weight=with_sample_weight).score()
                elif core_params["prediction_type"] == constants.MULTICLASS:
                    ClassificationModelIntrinsicScorer(
                        modeling_params, clf, train_X, train_y, pipeline,
                        exec_folder, prepared_X, iipd,
                        calibrate_proba).score()
                    MulticlassModelSerializer(train_X.columns(), clf,
                                              modeling_params, exec_folder,
                                              target_map).serialize()
                    multiclass_scorer_with_valid(
                        modeling_params,
                        clf,
                        transformed_test,
                        exec_folder,
                        test_df_index,
                        target_map=target_map,
                        with_sample_weight=with_sample_weight).score()
                else:
                    RegressionModelIntrinsicScorer(modeling_params, clf,
                                                   train_X, train_y, pipeline,
                                                   exec_folder, prepared_X,
                                                   iipd).score()
                    RegressionModelSerializer(train_X.columns(), clf,
                                              modeling_params,
                                              exec_folder).serialize()
                    regression_scorer_with_valid(modeling_params, clf,
                                                 transformed_test, exec_folder,
                                                 test_df_index,
                                                 with_sample_weight).score()

    end = unix_time_millis()

    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
Example #10
def train_prediction_kfold(core_params, preprocessing_set, split_desc):

    start = unix_time_millis()
    preprocessing_params = preprocessing_set['preprocessing_params']
    modeling_sets = preprocessing_set["modelingSets"]

    logging.info("PPS is %s" % preprocessing_params)
    preprocessing_listener = ProgressListener()
    preprocessing_listener.add_future_steps(constants.PRED_KFOLD_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.PRED_KFOLD_TRAIN_STATES)
        modeling_set["listener"] = listener

    def update_one_preprocessing_state(modeling_set):
        status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                              (preprocessing_listener, modeling_set["listener"]))
        utils.write_model_status(modeling_set, status)

    def update_preprocessing_state():
        # Explicit loop: a bare map() would be lazily evaluated (and thus a no-op) on Python 3
        for m_set in modeling_sets:
            update_one_preprocessing_state(m_set)


    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        full_df = df_from_split_desc(split_desc,
                                     "full",
                                     preprocessing_params["per_feature"],
                                     core_params["prediction_type"])
        logging.info("Loaded full_df df: shape=(%d,%d)" % full_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(full_df, preprocessing_params)
        collector_data = collector.build()

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params, preprocessing_set['run_folder'],
                                                           preprocessing_params)

    with preprocessing_listener.push_state(constants.KFOLDSTATE_PREPROCESS_GLOBAL):
        update_preprocessing_state()
        transformed_full = pipeline.fit_and_process(full_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    train_X = transformed_full["TRAIN"]
    train_y = transformed_full["target"]

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    calibrate_proba = core_params.get("calibration", {}).get("calibrationMethod", None) in ["SIGMOID", "ISOTONIC"]

    if with_sample_weight:
        assert transformed_full["weight"].values.min() > 0, "Sample weights must be positive"

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        update_fn = lambda: update_one_preprocessing_state(modeling_set)
        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                # no out-fold available, so calibrate through classification_fit on a random split
                if calibrate_proba:
                    calibration_method = core_params.get("calibration", {}).get("calibrationMethod").lower()
                else:
                    calibration_method = None
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = classification_fit(modeling_set['modelingParams'], split_desc,
                                                                         transformed_full,
                                                                         core_params["prediction_type"],
                                                                         modeling_set['run_folder'],
                                                                         target_map=preproc_handler.target_map,
                                                                         with_sample_weight=with_sample_weight,
                                                                         with_class_weight=with_class_weight,
                                                                         calibration=calibration_method)
            save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                  modeling_set['run_folder'])

            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                ClassificationModelIntrinsicScorer(modeling_set['modelingParams'], clf,
                         train_X, train_y, pipeline, modeling_set['run_folder'], prepared_X, iipd, calibrate_proba).score()
                if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                    BinaryModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder'], preproc_handler.target_map, calibrate_proba).serialize()
                else:
                    MulticlassModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                              modeling_set['run_folder'], preproc_handler.target_map, calibrate_proba).serialize()
        else:
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = regression_fit_single(modeling_set['modelingParams'],
                                                                            split_desc, transformed_full, modeling_set["run_folder"],
                                                                            with_sample_weight=with_sample_weight)
            save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                  modeling_set['run_folder'])

            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_fn()
                RegressionModelIntrinsicScorer(modeling_set['modelingParams'], clf, train_X, train_y, pipeline,
                                               modeling_set['run_folder'], prepared_X, iipd).score()
                # serialize the model if possible
                RegressionModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder']).serialize()

        full_df_clean = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                           core_params["prediction_type"])
        optimized_params = out_params["resolved"]

        logging.info("Regridifying post-train params: %s" % json.dumps(optimized_params))

        # Regridify to a unary grid the optimized params
        optimized_params_grid = intercom.backend_json_call("ml/prediction/regridify-to-pretrain", {
            "preTrain" : json.dumps(modeling_set["modelingParams"]),
            "postTrain" : json.dumps(optimized_params)
        })
        logging.info("Using unary grid params: %s" % json.dumps(optimized_params_grid))

        prediction_train_model_kfold(full_df_clean, core_params, split_desc, preprocessing_params, optimized_params_grid,
                                     preprocessing_set['run_folder'], modeling_set['run_folder'],
                                     modeling_set["listener"], update_fn, with_sample_weight, with_class_weight, calibrate_proba)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set['run_folder'], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

        return "ok"
Example #11
def create_ensemble(split_desc, core_params, model_folder, preprocessing_folder, model_folders, preprocessing_folders):
    listener = ProgressListener()
    listener.add_future_steps(constants.ENSEMBLE_STATES)
    start = unix_time_millis()

    def update_preprocessing_state():
        utils.write_running_traininfo(model_folder, start, listener)

    split_desc = dkujson.loads(split_desc)
    core_params = dkujson.loads(core_params)
    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    # TODO: update downstream
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    preprocessing_folders = dkujson.loads(preprocessing_folders)
    model_folders = dkujson.loads(model_folders)
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "rmodeling_params.json"))
    ensemble_params = modeling_params["ensemble_params"]
    logging.info("creating ensemble")
    with listener.push_state(constants.STATE_ENSEMBLING):
        update_preprocessing_state()
        from dataiku.doctor.prediction.ensembles import ensemble_from_fitted
        train = df_from_split_desc(split_desc, "train", ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        iperf = {
            "modelInputNRows" : train.shape[0], #todo : not the right count as may have dropped ...
            "modelInputNCols" : -1, # makes no sense for an ensemble as may have different preprocessings
            "modelInputIsSparse" : False
        }
        dkujson.dump_to_filepath(osp.join(model_folder, "iperf.json"), iperf)
        clf = ensemble_from_fitted(core_params, ensemble_params, preprocessing_folders, model_folders, train, with_sample_weight, with_class_weight)

    logging.info("saving model")
    with listener.push_state(constants.STATE_SAVING):
        update_preprocessing_state()
        with open(osp.join(model_folder, "clf.pkl"), dku_write_mode_for_pickling()) as f:
            pickle.dump(clf, f, 2)

    logging.info("scoring model")
    with listener.push_state(constants.STATE_SCORING):
        update_preprocessing_state()
        test = df_from_split_desc(split_desc, "test", ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        # this is annoying, but we have to use one of the previous preprocessings in order to get the target
        prep_folder = preprocessing_folders[0]
        rppp = dkujson.load_from_filepath(osp.join(prep_folder, "rpreprocessing_params.json"))
        collector_data = dkujson.load_from_filepath(osp.join(prep_folder, "collector_data.json"))
        preprocessing_handler = PreprocessingHandler.build(core_params, rppp, prep_folder)
        preprocessing_handler.collector_data = collector_data
        pipe = preprocessing_handler.build_preprocessing_pipeline(with_target=True)
        transformed = pipe.process(test)
        y = transformed["target"]

        if with_sample_weight:
            sample_weight = transformed["weight"]
        else:
            sample_weight = None

        # Now that the CLF with scorable pipelines has been saved, set it in "pipelines with target" mode
        # to be able to compute metrics
        clf.set_with_target_pipelines_mode(True)

        pred = clf.predict(test)
        probas = None if core_params["prediction_type"] == "REGRESSION" else clf.predict_proba(test)
        target_map = None if core_params["prediction_type"] == "REGRESSION" else \
            {t["sourceValue"]: t["mappedValue"] for t in ensemble_params["preprocessing_params"][0]["target_remapping"]}
        prediction_type = core_params["prediction_type"]
        if prediction_type == "REGRESSION":
            RegressionModelScorer(modeling_params, clf, pred, y, model_folder, transformed, test.index.copy(), sample_weight).score()
        elif prediction_type == "BINARY_CLASSIFICATION":
            BinaryClassificationModelScorer(modeling_params, clf, model_folder, pred, probas, y, target_map, transformed, test.index.copy(), sample_weight).score()
        else:
            MulticlassModelScorer(modeling_params, clf, model_folder, pred, probas, y.astype(int), target_map, transformed, test.index.copy(), sample_weight).score()

    update_preprocessing_state()
    end = unix_time_millis()
    dkujson.dump_to_filepath(osp.join(model_folder, "actual_params.json"), {"resolved": modeling_params})
    dkujson.dump_to_filepath(osp.join(preprocessing_folder, "preprocessing_report.json"), {})
    utils.write_done_traininfo(model_folder, start, end, end, listener, end_preprocessing_time=start)

    return "ok"
Example #12
def train_clustering_models_nosave(
                            split_desc,
                            preprocessing_set):
    """Regular (mode 1) train:
      - Non streamed single split + fit preprocess on train + preprocess test
      - Fit N models sequentially
         - Fit
         - Save clf
         - Compute and save clf performance
         - Score, save scored test set + scored performance
    """

    start = unix_time_millis()
    preprocessing_listener = ProgressListener()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]

    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.CLUSTERING_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.ALL_CLUSTERING_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = {
                "modelId" : modeling_set["modelId"],
                "state": "RUNNING",
                "startTime": start,
                "progress" : merge_listeners(preprocessing_listener, modeling_set["listener"])
            }
            utils.write_model_status(modeling_set, status)

    logging.info("START TRAIN :" + preprocessing_set["description"])
    preprocessing_params = preprocessing_set["preprocessing_params"]

    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        source_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])

        logging.info("Loaded source df: shape=(%d,%d)" % source_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = ClusteringPreprocessingDataCollector(source_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({},
                        preprocessing_set["preprocessing_params"],
                        preprocessing_set["run_folder"])

    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_SRC):
        update_preprocessing_state()
        source_df_index = source_df.index.copy()
        # TODO: fit_and_process should take an update_fn argument
        transformed_source = pipeline.fit_and_process(source_df)
        # Saves fitted resources and collector data
        preproc_handler.save_data()
        # Report on work
        report = {}
        pipeline.report_fit(report, {})
        utils.write_preproc_file(preprocessing_set["run_folder"], "preprocessing_report.json", report)

    update_preprocessing_state()

    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()
        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)
        clustering_train_score_save(transformed_source, source_df_index,
                                    preprocessing_set["preprocessing_params"],
                                    modeling_set["modelingParams"],
                                    modeling_set["run_folder"],
                                    modeling_set["listener"],
                                    update_modeling_state,
                                    pipeline)

        model_end = end = unix_time_millis()

        # Write the final model training info
        status = {
            "modelId": modeling_set["modelId"],
            "state": "DONE",
            "startTime": start,
            "endTime": end,
            "preprocessingTime": preprocessing_end - start,
            "trainingTime": model_end - model_start,
            "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
        }
        utils.write_model_status(modeling_set, status)

    return "ok"
Example #13
def train_prediction_keras(core_params, preprocessing_set, split_desc):

    start = unix_time_millis()

    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]
    run_folder = preprocessing_set["run_folder"]

    logging.info("PPS is %s" % preprocessing_params)
    preprocessing_listener = ProgressListener()
    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.PRED_KERAS_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.PRED_KERAS_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(split_desc, "train", preprocessing_params['per_feature'], core_params["prediction_type"])
        train_df_orig = train_df.copy()
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)
        
        # Not implemented in the UI so far, so processor_fit_df will always be train_df
        preprocessor_fit_df = train_df
        need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
        if need_subsampling:
            preprocessor_fit_df = preprocessor_fit_df.sample(frac=preprocessing_params["preprocessingFitSampleRatio"],
                                                             random_state=preprocessing_params["preprocessingFitSampleSeed"])

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(split_desc, "test", preprocessing_params['per_feature'], core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(preprocessor_fit_df, preprocessing_params)
        collector_data = collector.build()

    # Tagging special features to take them into account only in special_preproc_handler/special_pipeline
    per_feature = preprocessing_params["per_feature"]
    tag_special_features(per_feature)

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params, run_folder,
                                                           preprocessing_params, allow_empty_mf=True)

    with preprocessing_listener.push_state(constants.KERASSTATE_FIT_NORMAL_PREPROCESSING):
        update_preprocessing_state()

        # Retrieve transformed values to get the shape of all regular inputs, even though they
        # won't actually be used here, as each batch of data will be preprocessed again
        transformed_normal = pipeline.fit_and_process(preprocessor_fit_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    # TODO: REVIEW STATES OF TRAINING
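    # Note: for Keras, train/test data is preprocessed again batch by batch
    # during fit (see the comment above), so these two states are only stepped
    # through to keep the progress display consistent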
    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TRAIN):
        update_preprocessing_state()

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TEST):
        update_preprocessing_state()

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        # Set environment variables that may be accessed in user-defined code
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_ANALYSIS_ID, modeling_set["fullId"]["taskLoc"]["analysisId"])
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_MLTASK_ID, modeling_set["fullId"]["taskLoc"]["mlTaskId"])

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        prediction_train_model_keras(transformed_normal, train_df_orig, test_df, pipeline, modeling_set["modelingParams"],
                                     core_params, per_feature, modeling_set["run_folder"], modeling_set["listener"],
                                     update_modeling_state, preproc_handler.target_map,
                                     pipeline.generated_features_mapping)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set["run_folder"], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
Example #14
def train_prediction_models_nosave(core_params, preprocessing_set, split_desc):
    """Regular (mode 1) train:
      - Non streamed single split + fit preprocess on train + preprocess test
      - Fit N models sequentially
         - Fit
         - Save clf
         - Compute and save clf performance
         - Score, save scored test set + scored performnace
    """

    start = unix_time_millis()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]

    logging.info("PPS is %s" % preprocessing_params)
    preprocessing_listener = ProgressListener()
    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.PRED_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        if modeling_set.get('modelingParams', {}).get('gridLength', 1) > 1:
            listener.add_future_step(constants.STATE_GRIDSEARCHING)
        listener.add_future_steps(constants.PRED_REGULAR_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(split_desc, "train", preprocessing_params['per_feature'], core_params["prediction_type"])
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

        for col in train_df:
            logging.info("Train col: %s (%s)" % (col, train_df[col].dtype))

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(split_desc, "test", preprocessing_params['per_feature'], core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(train_df, preprocessing_params)
        collector_data = collector.build()

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params, preprocessing_set['run_folder'],
                                                           preprocessing_params)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TRAIN):
        update_preprocessing_state()
        # TODO: fit_and_process should take an update_fn argument
        transformed_train = pipeline.fit_and_process(train_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TEST):
        update_preprocessing_state()
        test_df_index = test_df.index.copy()
        transformed_test = pipeline.process(test_df)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        # Since ensembles are never fitted through the doctor, there is no need to distinguish them here
        prediction_train_score_save(transformed_train,
                                    transformed_test, test_df_index,
                                    core_params, split_desc,
                                    modeling_set["modelingParams"],
                                    modeling_set["run_folder"],
                                    modeling_set["listener"],
                                    preproc_handler.target_map,
                                    update_modeling_state,
                                    pipeline,
                                    modeling_set["run_folder"])

        end = unix_time_millis()

        utils.write_done_traininfo(modeling_set["run_folder"], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"