Пример #1
0
    def evaluate_test_perf(self, trained_model, X_test, y_test, threshold=0.5):
        """
        Score a trained model on the test set and report how long it took.

        The code path is selected by ``self.model_type`` and
        ``self.compute_type``:

        * XGBoost, 'multi': wraps the data in a ``DaskDMatrix``, predicts
          through ``xgboost.dask.predict`` and scores the thresholded output.
        * XGBoost, 'single': same idea with a plain ``DMatrix`` and
          ``trained_model.predict``.
        * RandomForest, 'multi': predicts with the model, materializes the
          result with ``.compute()`` and calls ``accuracy_score``.
        * RandomForest, 'single': delegates to ``trained_model.score``.

        Any exception raised while predicting or scoring is written to the
        log and not re-raised; the returned accuracy is then ``None``. The
        accuracy is also ``None`` when no branch matches.

        Parameters
        ----------
        trained_model : The object of the trained model either of XGBoost or RandomForest
        X_test : dataframe
                  The data for testing
        y_test : dataframe
                  The label to be used for testing.
        threshold : float
                  XGBoost predictions strictly greater than this value are
                  mapped to 1.0, all others to 0.0. Not used for RandomForest.
        Returns
        ----------
        test_accuracy : float
                        The accuracy achieved on test set
        duration : float
                   The duration reported by the PerfTimer wrapped around inference
        """
        self.log_to_file('\n> Inferencing on test set')
        test_accuracy = None

        with PerfTimer() as inference_timer:
            try:
                model_kind = self.model_type

                if model_kind == 'XGBoost':
                    if 'multi' in self.compute_type:
                        dask_matrix = xgboost.dask.DaskDMatrix(
                            self.client, data=X_test, label=y_test)
                        raw_scores = xgboost.dask.predict(
                            self.client, trained_model, dask_matrix).compute()
                        # Turn scores into hard 0.0 / 1.0 labels.
                        hard_labels = (raw_scores > threshold) * 1.0
                        test_accuracy = accuracy_score(y_test.compute(), hard_labels)
                    elif 'single' in self.compute_type:
                        local_matrix = xgboost.DMatrix(data=X_test, label=y_test)
                        raw_scores = trained_model.predict(local_matrix)
                        # Turn scores into hard 0.0 / 1.0 labels.
                        hard_labels = (raw_scores > threshold) * 1.0
                        test_accuracy = accuracy_score(y_test, hard_labels)

                elif model_kind == 'RandomForest':
                    if 'multi' in self.compute_type:
                        forest_labels = trained_model.predict(X_test).compute()
                        self.log_to_file("\n\tPrediction complete")
                        test_accuracy = accuracy_score(
                            y_test.compute(), forest_labels, convert_dtype=True)
                    elif 'single' in self.compute_type:
                        test_accuracy = trained_model.score(
                            X_test, y_test.astype('int32'))

            except Exception as error:
                # Best effort: record the failure and fall through with
                # whatever accuracy value is currently set.
                self.log_to_file(f'\n\n!error during inference: {error!s}')

        self.log_to_file(f'\n\tFinished inference in {inference_timer.duration:.4f} s')
        self.log_to_file(f'\n\tTest-accuracy: {test_accuracy}')
        return test_accuracy, inference_timer.duration
Пример #2
0
def train(fpath, max_depth, max_features, n_estimators):
    """
    Fit a random forest classifier, score it on the test split returned by
    ``load_data`` and record the run in MLflow.

    The three hyperparameters are logged as strings, the test accuracy is
    logged under the metric name "accuracy", and the fitted model is logged
    under the artifact path "saved_models".

    :param fpath: Path or URL for the training data used with the model.
    :param max_depth: RF max_depth parameter
    :param max_features: RF max_features parameter
    :param n_estimators: RF n_estimators parameter
    :return: trained model
    """
    X_train, X_test, y_train, y_test = load_data(fpath)

    hyperparams = {
        "max_depth": max_depth,
        "max_features": max_features,
        "n_estimators": n_estimators,
    }
    classifier = RandomForestClassifier(**hyperparams)
    classifier.fit(X_train, y_train)

    accuracy = accuracy_score(y_test, classifier.predict(X_test))

    # MLflow receives the hyperparameters in their string form.
    mlflow.log_params({name: str(value) for name, value in hyperparams.items()})
    mlflow.log_metric("accuracy", accuracy)
    mlflow.sklearn.log_model(classifier, "saved_models")

    return classifier
Пример #3
0
    def score(self, X, y, **kwargs):
        """Compute the mean accuracy of ``self.predict(X)`` against ``y``.

        Both inputs are converted with ``input_to_dev_array`` before use, and
        the predictions are passed through ``cuda.to_device`` before being
        handed to ``accuracy_score``.

        Parameters
        ----------
        X : [cudf.DataFrame]
            Test samples on which we predict
        y : [cudf.Series, device array, or numpy array]
            Ground truth values for predict(X)
        **kwargs
            Accepted but not used by this implementation.

        Returns
        -------
        score : float
            Accuracy of self.predict(X) wrt. y (fraction where y == pred_y)
        """
        from cuml.metrics.accuracy import accuracy_score
        from cuml.utils import input_to_dev_array

        # Only the first element of each conversion result is needed.
        samples_dev = input_to_dev_array(X)[0]
        truth_dev = input_to_dev_array(y)[0]

        # Pass the estimator's own handle when it has one, otherwise None.
        handle = getattr(self, 'handle', None)

        predicted_dev = cuda.to_device(self.predict(samples_dev))
        return accuracy_score(truth_dev, predicted_dev, handle=handle)
Пример #4
0
def _train(params, fpath, hyperopt=False):
    """
    Train a random forest classifier for one hyperparameter setting and log
    the run (versions, parameters, accuracy, model) to MLflow.

    :param params: sequence of three values ``(max_depth, max_features, n_estimators)``,
                   consistent with how the search space is defined. They are cast to
                   ``int``, ``float`` and ``int`` respectively before use.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: the trained model when ``hyperopt`` is false; otherwise a dict with fields
             'loss' (negated test accuracy, so that minimizing it maximizes accuracy)
             and 'status' (``STATUS_OK``).
    """
    max_depth, max_features, n_estimators = params
    max_depth, max_features, n_estimators = (int(max_depth),
                                             float(max_features),
                                             int(n_estimators))

    # Log all of our training parameters for this run.
    pyver = sys.version_info
    mlparams = {
        'cudf_version': str(cudf.__version__),
        'cuml_version': str(cuml.__version__),
        'max_depth': str(max_depth),
        'max_features': str(max_features),
        'n_estimators': str(n_estimators),
        # Fourth component of sys.version_info is the release level (e.g. "final").
        'python_version': f"{pyver[0]}.{pyver[1]}.{pyver[2]}.{pyver[3]}",
    }
    mlflow.log_params(mlparams)

    X_train, X_test, y_train, y_test = load_data(fpath)
    mod = RandomForestClassifier(max_depth=max_depth,
                                 max_features=max_features,
                                 n_estimators=n_estimators)

    mod.fit(X_train, y_train)
    preds = mod.predict(X_test)
    acc = accuracy_score(y_test, preds)

    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(mod, "saved_models")

    if not hyperopt:
        return mod

    # hyperopt minimizes 'loss'; returning the raw accuracy would steer the
    # search toward the least accurate model, so report its negation.
    return {"loss": -acc, "status": STATUS_OK}
Пример #5
0
def _train(params, fpath, hyperopt=False):
    """
    Train a random forest classifier for one hyperparameter setting, then log
    its parameters, test accuracy and fitted model to MLflow.

    :param params: sequence of three values ``(max_depth, max_features, n_estimators)``,
                   consistent with how the search space is defined. They are cast to
                   ``int``, ``float`` and ``int`` respectively before use.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: the trained model when ``hyperopt`` is false; otherwise a dict with fields
             'loss' (negated test accuracy, so that minimizing it maximizes accuracy)
             and 'status' (``STATUS_OK``).
    """
    max_depth, max_features, n_estimators = params
    max_depth, max_features, n_estimators = (int(max_depth),
                                             float(max_features),
                                             int(n_estimators))

    X_train, X_test, y_train, y_test = load_data(fpath)

    mod = RandomForestClassifier(max_depth=max_depth,
                                 max_features=max_features,
                                 n_estimators=n_estimators)

    mod.fit(X_train, y_train)
    preds = mod.predict(X_test)
    acc = accuracy_score(y_test, preds)

    mlparams = {
        "max_depth": str(max_depth),
        "max_features": str(max_features),
        "n_estimators": str(n_estimators),
    }
    mlflow.log_params(mlparams)

    mlflow.log_metric("accuracy", acc)

    mlflow.sklearn.log_model(mod, "saved_models")

    if not hyperopt:
        return mod

    # hyperopt minimizes 'loss'; returning the raw accuracy would steer the
    # search toward the least accurate model, so report its negation.
    return {'loss': -acc, 'status': STATUS_OK}
Пример #6
0
def main():
    """
    Train and evaluate a GPU random forest on the airline dataset.

    Steps, in order:

    1. Parse the command-line options (``--data_dir``, ``--n_estimators``,
       ``--max_depth``, ``--n_bins``, ``--max_features``).
    2. Read ``airline_20m.parquet`` from ``data_dir`` with cuDF; every column
       except ``ArrDelay`` and ``ArrDelayBinary`` is a feature, and
       ``ArrDelayBinary`` cast to int32 is the label.
    3. Log the hyperparameters through the module-level ``run`` object.
    4. Repeat five shuffled 80/20 train/test splits, seeded by the fold
       index, training a fresh forest on each and timing fit and predict.
    5. Log and print the best accuracy (in percent), the per-fold
       accuracies and timings, and the total runtime.

    NOTE(review): ``run``, ``cuRF``, ``train_test_split`` and
    ``accuracy_score`` are expected to be defined at module level; they are
    not visible from this function.
    """
    start_script = time.time()

    parser = argparse.ArgumentParser()

    parser.add_argument('--data_dir', type=str, help='location of data')
    parser.add_argument('--n_estimators', type=int, default=100, help='Number of trees in RF')
    parser.add_argument('--max_depth', type=int, default=16, help='Max depth of each tree')
    parser.add_argument('--n_bins', type=int, default=8, help='Number of bins used in split point calculation')
    parser.add_argument('--max_features', type=float, default=1.0, help='Number of features for best split')

    args = parser.parse_args()
    data_dir = args.data_dir

    print('\n---->>>> cuDF version <<<<----\n', cudf.__version__)
    print('\n---->>>> cuML version <<<<----\n', cuml.__version__)

    t1 = time.time()
    df = cudf.read_parquet(os.path.join(data_dir, 'airline_20m.parquet'))
    # Alternative input: cudf.read_orc(os.path.join(data_dir, 'airline_20000000.orc'))
    t2 = time.time()
    print('\n---->>>> cuDF time: {:.2f} <<<<----\n'.format(t2 - t1))

    X = df[df.columns.difference(['ArrDelay', 'ArrDelayBinary'])]
    y = df['ArrDelayBinary'].astype(np.int32)
    # Drop the reference to the full frame once features and label are extracted.
    del df

    # The removed NumPy aliases np.int / np.float / np.str were the builtins
    # themselves, so the builtins are used directly.
    n_estimators = args.n_estimators
    run.log('n_estimators', int(args.n_estimators))
    max_depth = args.max_depth
    run.log('max_depth', int(args.max_depth))
    n_bins = args.n_bins
    run.log('n_bins', int(args.n_bins))
    max_features = args.max_features
    run.log('max_features', str(args.max_features))

    print('\n---->>>> Training using GPUs <<<<----\n')

    # ----------------------------------------------------------------------------------------------------
    # cross-validation folds
    # ----------------------------------------------------------------------------------------------------
    n_folds = 5
    accuracy_per_fold = []
    train_time_per_fold = []
    infer_time_per_fold = []
    global_best_test_accuracy = 0

    traintime = time.time()
    for i_train_fold in range(n_folds):
        print(f"\n CV fold {i_train_fold} of {n_folds}\n")

        # split data; the fold index seeds the shuffle so each fold differs
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=i_train_fold, shuffle=True)

        # train model
        cu_rf = cuRF(n_estimators=n_estimators, max_depth=max_depth,
                     n_bins=n_bins, max_features=max_features)
        start1 = time.time()
        cu_rf.fit(X_train, y_train)
        training_time = time.time() - start1
        train_time_per_fold.append(round(training_time, 4))

        # evaluate perf
        start2 = time.time()
        cuml_pred = cu_rf.predict(X_test)
        infer_time = time.time() - start2

        # Accuracy is expressed in percent.
        cuml_accuracy = accuracy_score(cuml_pred, y_test) * 100

        accuracy_per_fold.append(round(cuml_accuracy, 4))
        infer_time_per_fold.append(round(infer_time, 4))

        # update best accuracy [ assumes maximization of perf metric ]
        if cuml_accuracy > global_best_test_accuracy:
            global_best_test_accuracy = cuml_accuracy

    total_train_inference_time = time.time() - traintime
    run.log('Total training inference time', float(total_train_inference_time))
    run.log('Accuracy', float(global_best_test_accuracy))
    print('\n Accuracy             :', global_best_test_accuracy)
    print('\n accuracy per fold    :', accuracy_per_fold)
    print('\n train-time per fold  :', train_time_per_fold)
    print('\n train-time all folds  :', sum(train_time_per_fold))
    print('\n infer-time per fold  :', infer_time_per_fold)
    print('\n infer-time all folds  :', sum(infer_time_per_fold))

    end_script = time.time()
    print('Total runtime: {:.2f}'.format(end_script - start_script))
    run.log('Total runtime', float(end_script - start_script))

    print('\n Exiting script')