Пример #1
0
def train(fpath, max_depth, max_features, n_estimators):
    """
    Fit a random forest on the data at ``fpath`` and record the run in MLflow.

    The three hyperparameters, the accuracy on the test split returned by
    ``load_data`` and the fitted model itself are all logged to MLflow.

    :param fpath: Path or URL for the training data used with the model.
    :param max_depth: RF max_depth parameter
    :param max_features: RF max_features parameter
    :param n_estimators: RF n_estimators parameter
    :return: trained model
    """
    features_fit, features_eval, labels_fit, labels_eval = load_data(fpath)

    # One mapping drives both the estimator constructor and the MLflow log.
    hyperparams = dict(max_depth=max_depth,
                       max_features=max_features,
                       n_estimators=n_estimators)

    model = RandomForestClassifier(**hyperparams)
    model.fit(features_fit, labels_fit)

    accuracy = accuracy_score(labels_eval, model.predict(features_eval))

    # Parameters are logged in their string form.
    mlflow.log_params(
        {name: str(value) for name, value in hyperparams.items()})
    mlflow.log_metric("accuracy", accuracy)
    mlflow.sklearn.log_model(model, "saved_models")

    return model
Пример #2
0
    def fit(self, X_train, y_train):
        """
        Fit the model selected by ``self.hpo_config.model_type``.

        The model type is matched by substring: 'XGBoost', 'RandomForest'
        or 'KMeans'. Hyperparameters are read from
        ``self.hpo_config.model_params``.

        :param X_train: training features
        :param y_train: training labels (cast to int32 for the random
            forest; not used by KMeans)
        :return: the trained model
        :raises ValueError: if the configured model type matches none of
            the supported models
        """
        model_type = self.hpo_config.model_type
        model_params = self.hpo_config.model_params

        if 'XGBoost' in model_type:
            hpo_log.info('> fit xgboost model')
            dtrain = xgboost.DMatrix(data=X_train, label=y_train)
            num_boost_round = model_params['num_boost_round']
            trained_model = xgboost.train(dtrain=dtrain,
                                          params=model_params,
                                          num_boost_round=num_boost_round)

        elif 'RandomForest' in model_type:
            hpo_log.info('> fit randomforest model')
            trained_model = RandomForestClassifier(
                n_estimators=model_params['n_estimators'],
                max_depth=model_params['max_depth'],
                max_features=model_params['max_features'],
                n_bins=model_params['n_bins']).fit(
                    X_train, y_train.astype('int32'))

        elif 'KMeans' in model_type:
            hpo_log.info('> fit kmeans model')
            trained_model = KMeans(
                n_clusters=model_params['n_clusters'],
                max_iter=model_params['max_iter'],
                random_state=model_params['random_state'],
                init=model_params['init']).fit(X_train)

        else:
            # Without this branch the return below would fail with an
            # UnboundLocalError that hides the real cause.
            raise ValueError(
                f"Unsupported model type: {model_type!r}; expected one "
                "containing 'XGBoost', 'RandomForest' or 'KMeans'")

        return trained_model
def GridSearch_random_forest(X_train, y_train):
    """
    Grid-search random-forest hyperparameters with 5-fold cross-validation.

    Every combination of ``n_estimators`` in [128, 256, 512, 1024] and
    ``max_features`` in ['sqrt', 'log2'] is evaluated. For each combination
    the out-of-fold predictions of all folds are pooled and accuracy, F1 and
    ROC AUC are computed once on the pooled predictions.

    :param X_train: training features; must provide ``to_numpy()``
    :param y_train: training labels; must provide ``to_numpy()``
    :return: DataFrame with one row per combination and the columns
        'n_estimators' (int), 'max_features', 'mean_accuracy', 'mean_f1'
        and 'mean_auc'; the metric columns keep their numeric dtype
    """
    # Encode as float32
    X_train = X_train.to_numpy().astype('float32')
    y_train = y_train.to_numpy().astype('float32')

    # Init Kfolds
    folds = KFold(n_splits=5)

    # Init hyperparam vals
    n_estimators_lst = [128, 256, 512, 1024]
    max_features_lst = ['sqrt', 'log2']

    rows = []

    # Run GridSearch for all hyperparam combos
    for n_estimators in n_estimators_lst:
        for max_features in max_features_lst:
            clf = RandomForestClassifier(n_estimators=n_estimators,
                                         max_features=max_features)

            predicted_y = []
            true_y = []
            # Collect out-of-fold predictions and their true labels
            for train, holdout in folds.split(X_train):
                clf.fit(X_train[train], y_train[train])
                predicted_y.append(clf.predict(X_train[holdout]))
                true_y.append(y_train[holdout])

            predicted_y = np.concatenate(predicted_y)
            true_y = np.concatenate(true_y)

            rows.append([
                n_estimators,
                max_features,
                accuracy_score(true_y, predicted_y),
                f1_score(true_y, predicted_y),
                roc_auc_score(true_y, predicted_y),
            ])

    columns = [
        'n_estimators', 'max_features', 'mean_accuracy', 'mean_f1', 'mean_auc'
    ]

    # Build the frame straight from the row list. Going through np.array()
    # would coerce the mixed int/str/float rows to strings, and the metric
    # columns would then sort lexicographically instead of numerically.
    results = pd.DataFrame(data=rows, columns=columns)
    results.n_estimators = results.n_estimators.astype(int)

    return results
Пример #4
0
def train_and_eval(X_param, y_param, max_depth=16, n_estimators=100):
    """
    Train a random forest on a split of the data and score it.

    The data is split with ``train_test_split`` using a fixed
    ``random_state`` of 77; the classifier is fit on the training part and
    scored on the validation part.

    :param X_param: features
    :param y_param: labels
    :param max_depth: RF max_depth parameter
    :param n_estimators: RF n_estimators parameter
    :return: accuracy on the validation part
    """
    split = train_test_split(X_param, y_param, random_state=77)
    features_fit, features_val, labels_fit, labels_val = split

    forest = RandomForestClassifier(max_depth=max_depth,
                                    n_estimators=n_estimators)
    forest.fit(features_fit, labels_fit)

    return accuracy_score(labels_val, forest.predict(features_val))
Пример #5
0
def fit(X, y):
    """
    Build the module-level random forest ``clf`` and fit it.

    The constructor arguments are read from the module-level ``params``
    object, and the new classifier is stored in the global ``clf``.

    :param X: training features
    :param y: training labels
    :return: whatever ``clf.fit(X, y)`` returns
    """
    global clf

    # Map each constructor keyword to the setting that supplies it.
    forest_options = {
        "split_criterion": params.criterion,
        "split_algo": params.split_algorithm,
        "n_estimators": params.num_trees,
        "max_depth": params.max_depth,
        "max_features": params.max_features,
        "min_samples_split": params.min_samples_split,
        "max_leaves": params.max_leaf_nodes,
        "min_impurity_decrease": params.min_impurity_decrease,
        "bootstrap": params.bootstrap,
    }
    clf = RandomForestClassifier(**forest_options)

    return clf.fit(X, y)
    def fit(self, X_train, y_train):
        """
        Fit the model selected by ``self.hpo_config.model_type``.

        The model type is matched by substring: "XGBoost" or "RandomForest".
        Hyperparameters are read from ``self.hpo_config.model_params``.

        :param X_train: training features
        :param y_train: training labels (cast to int32 for the random forest)
        :return: the trained model
        :raises ValueError: if the configured model type matches neither
            supported model
        """
        model_type = self.hpo_config.model_type
        model_params = self.hpo_config.model_params

        if "XGBoost" in model_type:
            hpo_log.info("> fit xgboost model")
            dtrain = xgboost.DMatrix(data=X_train, label=y_train)
            num_boost_round = model_params["num_boost_round"]
            trained_model = xgboost.train(dtrain=dtrain,
                                          params=model_params,
                                          num_boost_round=num_boost_round)

        elif "RandomForest" in model_type:
            hpo_log.info("> fit randomforest model")
            trained_model = RandomForestClassifier(
                n_estimators=model_params["n_estimators"],
                max_depth=model_params["max_depth"],
                max_features=model_params["max_features"],
                n_bins=model_params["n_bins"],
            ).fit(X_train, y_train.astype("int32"))

        else:
            # Without this branch the return below would fail with an
            # UnboundLocalError that hides the real cause.
            raise ValueError(
                f"Unsupported model type: {model_type!r}; expected one "
                "containing 'XGBoost' or 'RandomForest'")

        return trained_model
Пример #7
0
def _train(params, fpath, hyperopt=False):
    """
    Train a random forest, log the run to MLflow and report the result.

    :param params: hyperparameters as a ``(max_depth, max_features, n_estimators)``
                   sequence. Its structure is consistent with how search space is defined.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: the trained model when ``hyperopt`` is false; otherwise a dict with
             fields 'loss' (negated test accuracy, so that minimising the loss
             maximises accuracy) and 'status' (success/failure status of run)
    """
    max_depth, max_features, n_estimators = params
    max_depth, max_features, n_estimators = (int(max_depth),
                                             float(max_features),
                                             int(n_estimators))

    # Log all of our training parameters for this run.
    pyver = sys.version_info
    mlparams = {
        'cudf_version': str(cudf.__version__),
        'cuml_version': str(cuml.__version__),
        'max_depth': str(max_depth),
        'max_features': str(max_features),
        'n_estimators': str(n_estimators),
        'python_version': f"{pyver[0]}.{pyver[1]}.{pyver[2]}.{pyver[3]}",
    }
    mlflow.log_params(mlparams)

    X_train, X_test, y_train, y_test = load_data(fpath)
    mod = RandomForestClassifier(max_depth=max_depth,
                                 max_features=max_features,
                                 n_estimators=n_estimators)

    mod.fit(X_train, y_train)
    preds = mod.predict(X_test)
    acc = accuracy_score(y_test, preds)

    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(mod, "saved_models")

    if not hyperopt:
        return mod

    # hyperopt minimises 'loss'; returning the raw accuracy would steer the
    # search towards the least accurate model, so the accuracy is negated.
    return {"loss": -acc, "status": STATUS_OK}
Пример #8
0
def _train(params, fpath, hyperopt=False):
    """
    Train a random forest, log the run to MLflow and report the result.

    :param params: hyperparameters as a ``(max_depth, max_features, n_estimators)``
                   sequence. Its structure is consistent with how search space is defined.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: the trained model when ``hyperopt`` is false; otherwise a dict with
             fields 'loss' (negated test accuracy, so that minimising the loss
             maximises accuracy) and 'status' (success/failure status of run)
    """
    max_depth, max_features, n_estimators = params
    max_depth, max_features, n_estimators = (int(max_depth),
                                             float(max_features),
                                             int(n_estimators))

    X_train, X_test, y_train, y_test = load_data(fpath)

    mod = RandomForestClassifier(max_depth=max_depth,
                                 max_features=max_features,
                                 n_estimators=n_estimators)

    mod.fit(X_train, y_train)
    preds = mod.predict(X_test)
    acc = accuracy_score(y_test, preds)

    # Parameters are logged in their string form.
    mlparams = {
        "max_depth": str(max_depth),
        "max_features": str(max_features),
        "n_estimators": str(n_estimators)
    }
    mlflow.log_params(mlparams)

    mlflow.log_metric("accuracy", acc)

    mlflow.sklearn.log_model(mod, "saved_models")

    if not hyperopt:
        return mod

    # hyperopt minimises 'loss'; returning the raw accuracy would steer the
    # search towards the least accurate model, so the accuracy is negated.
    return {'loss': -acc, 'status': STATUS_OK}
Пример #9
0
# Report the type and shape of the training labels.
print('cu_y_train:', type(cu_y_train), 'shape:', cu_y_train.shape)

# t0 is assigned before this excerpt; this reports the time elapsed since then.
print('Copying data to GPU done in {:.2f} seconds'.format(time() - t0))

# ### Learning
#
# Random forest classifiers are quick to train, quite robust to
# hyperparameter values, and often work relatively well.

print()
print('Learning begins')
# Restart the timer so the next report covers only the training phase.
t0 = time()

# Forest hyperparameters passed to the classifier below.
n_estimators = 100
max_depth = 16
clf_rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
print(clf_rf)
clf_rf.fit(cu_X_train, cu_y_train)

print('Learning done in {:.2f} seconds'.format(time() - t0))

# ### Inference
#
# We will use GPU-based inference to predict the classes for the test
# data.

print()
print('Inference begins')
# Restart the timer for the inference phase.
t0 = time()

# NOTE(review): X_test lacks the cu_ prefix used for the training data;
# confirm it is the intended input for predict_model='GPU'.
pred_rf = clf_rf.predict(X_test, predict_model='GPU')
def _best_forest(results, metric_column):
    """Return an unfitted forest configured like the top row for ``metric_column``."""
    best = results.sort_values(by=metric_column, ascending=False).iloc[0]
    return RandomForestClassifier(n_estimators=int(best.n_estimators),
                                  max_features=best.max_features)


def run_random_forest(scaled_df, n_trials=5, train_size=5000):
    """
    Repeatedly tune, refit and score random forests on random splits.

    In each trial the data is split with ``train_test_split``,
    ``GridSearch_random_forest`` is run on the training part, and the best
    configuration for each of accuracy, F1 and AUC is refit on the whole
    training part and scored on both the training and the held-out part.

    :param scaled_df: DataFrame whose ``y`` column holds the labels; every
        column except the last is used as a feature (presumably ``y`` is the
        last column; verify against the caller)
    :param n_trials: number of split/tune/score rounds (default 5)
    :param train_size: forwarded to ``train_test_split`` (default 5000)
    :return: tuple ``(raw_train_df, raw_test_df)``; each has one row per
        trial and the columns 'accuracy', 'f1' and 'auc'
    """
    score_columns = ['accuracy', 'f1', 'auc']
    search_columns = ['mean_accuracy', 'mean_f1', 'mean_auc']

    raw_train_arr = []
    raw_test_arr = []

    for _ in range(n_trials):
        # Split data into train and test
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_df.iloc[:, :-1], scaled_df.y, train_size=train_size)

        # Run GridSearch
        results = GridSearch_random_forest(X_train, y_train)

        # The search table may carry its metrics as strings; convert them so
        # the ranking below is numeric rather than lexicographic.
        results = results.astype({column: float for column in search_columns})

        # One classifier per metric, each configured from that metric's best row
        opt_acc_clf, opt_f1_clf, opt_auc_clf = (
            _best_forest(results, column) for column in search_columns)

        # Encode as float32 for cuML
        X_train_np = X_train.to_numpy().astype('float32')
        y_train_np = y_train.to_numpy().astype('float32')

        X_test_np = X_test.to_numpy().astype('float32')
        y_test_np = y_test.to_numpy().astype('float32')

        # Fit clfs
        for clf in (opt_acc_clf, opt_f1_clf, opt_auc_clf):
            clf.fit(X_train_np, y_train_np)

        # Get train and test metrics
        raw_train_arr.append([
            opt_acc_clf.score(X_train_np, y_train_np),
            f1_score(y_train_np, opt_f1_clf.predict(X_train_np)),
            roc_auc_score(y_train_np, opt_auc_clf.predict(X_train_np)),
        ])
        raw_test_arr.append([
            opt_acc_clf.score(X_test_np, y_test_np),
            f1_score(y_test_np, opt_f1_clf.predict(X_test_np)),
            roc_auc_score(y_test_np, opt_auc_clf.predict(X_test_np)),
        ])

    raw_train_arr = np.array(raw_train_arr).reshape(n_trials, len(score_columns))
    raw_test_arr = np.array(raw_test_arr).reshape(n_trials, len(score_columns))

    raw_train_df = pd.DataFrame(data=raw_train_arr, columns=score_columns)
    raw_test_df = pd.DataFrame(data=raw_test_arr, columns=score_columns)

    return raw_train_df, raw_test_df
Пример #11
0
         from sklearn.linear_model import LogisticRegression
     model = LogisticRegression(**alg.input_variables.__dict__)
 elif alg.name == 'AdaBoost' and alg.type == 'classification':
     from sklearn.ensemble import AdaBoostClassifier
     model = AdaBoostClassifier(**alg.input_variables.__dict__)
     warn_not_gpu_support(alg)
 elif alg.name == 'GradientBoosting' and alg.type == 'classification':
     from sklearn.ensemble import GradientBoostingClassifier
     model = GradientBoostingClassifier(**alg.input_variables.__dict__)
     warn_not_gpu_support(alg)
 elif alg.name == 'RandomForest' and alg.type == 'classification':
     if NVIDIA_RAPIDS_ENABLED:
         from cuml.ensemble import RandomForestClassifier
     else:
         from sklearn.ensemble import RandomForestClassifier
     model = RandomForestClassifier(**alg.input_variables.__dict__)
 elif alg.name == 'XGBoost' and alg.type == 'classification':
     from xgboost.sklearn import XGBClassifier
     """
     Note from NVIDIA RAPIDS >= 0.17 (no error for == 0.13)
     ValueError: The option use_label_encoder=True is incompatible with inputs of type cuDF or cuPy. 
     Please set use_label_encoder=False when constructing XGBClassifier object. 
     NOTE: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. 
     To remove this warning, do the following: 
     1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 
     2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]
     """
     if NVIDIA_RAPIDS_ENABLED:
         model = XGBClassifier(**alg.input_variables.__dict__, use_label_encoder=False, tree_method="gpu_hist")
     else:
         model = XGBClassifier(**alg.input_variables.__dict__)