def train_rfg(X_train,
              y_train,
              X_test,
              y_test,
              sample_weight=None,
              uncertainty=False):
    rfg = RandomForestRegressor(n_estimators=300,
                                random_state=0).fit(X_train, y_train,
                                                    sample_weight=sample_weight)
    preds = [rfg.predict(X_train), rfg.predict(X_test)]

    variance_tr = fci.random_forest_error(rfg, X_train, X_train)
    variance_te = fci.random_forest_error(rfg, X_train, X_test)

    if uncertainty:
        sw_tr = variance_tr
        sw_te = variance_te
    else:
        sw_tr = (preds[0] - y_train)**2
        sw_te = (preds[1] - y_test)**2
    variance = [variance_tr, variance_te]
    sws = [sw_tr, sw_te]
    # print("Train rmse: ", mean_squared_error(preds[0], y_train, squared=False))
    # print("Test rmse: ", mean_squared_error(preds[1], y_test, squared=False))

    return preds, variance, sws
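# A minimal usage sketch for train_rfg (illustrative only; the synthetic data
# and split below are assumptions, not from the original source). It assumes
# numpy as np, RandomForestRegressor and forestci as fci are imported in the
# same module as train_rfg.
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 5)
y_demo = X_demo.sum(axis=1) + rng.normal(scale=0.1, size=200)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)

# preds[0]/preds[1] are the train/test predictions; variance and sws follow
# the same [train, test] ordering.
preds, variance, sws = train_rfg(X_tr, y_tr, X_te, y_te, uncertainty=True)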
Example #2
def confidence_interval(model, Xtrain, Xtest):
    inbag = fci.calc_inbag(Xtrain.shape[0], model)
    ci = fci.random_forest_error(model,
                                 Xtrain.values,
                                 Xtest.values,
                                 inbag=inbag)
    return ci
def do_fci(n_trees):
    # Calculate the variance
    # mpg_V_IJ_unbiased = fci.random_forest_error(rfr, X_train, X_test)
    rfr.n_estimators = n_trees
    rfr.fit(X_train, Y_train)
    pred_test = rfr.predict(X_test).round(0).astype(int)
    # mpg_V_IJ_unbiased = fci.random_forest_error(rfr, X_train, X_test).round(0).astype(int)
    mpg_V_IJ_unbiased = fci.random_forest_error(
        rfr,
        X_train,
        X_test,
        memory_constrained=True,
        memory_limit=1024,
        calibration_scale=calibration_scale).round(0).astype(int)

    # mpg_V_IJ_unbiased = fci.random_forest_error(rfr, X_train, X_test, calibrate=False)

    df_test['pred_test'] = pred_test
    df_test['mpg_V_IJ_unbiased'] = mpg_V_IJ_unbiased
    df_test['mpg_V_IJ_unbiased_sqrt'] = np.sqrt(mpg_V_IJ_unbiased).round(
        0).astype(int)
    # df_test['lower'] = interval[0]
    # df_test['upper'] = interval[1]
    # df_test['diff'] = df_test['yield_pred'] - mpg_y_hat
    # df_test['stderr'] = stderr

    pd.options.display.max_columns = df_test.shape[1]
    print(df_test.describe())
    out_csv = r"out.rs/out.{0}.{1}.csv".format(n_trees, calibration_scale)
    df_test.describe().to_csv(out_csv,
                              index=True,
                              header=True,
                              sep=',',
                              float_format='%.0f')
def test_random_forest_error():
    X = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])

    y = np.array([70, 100, 60, 100, 120])

    train_idx = [2, 3, 4]
    test_idx = [0, 1]

    y_test = y[test_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    X_train = X[train_idx]

    n_trees = 4
    forest = RandomForestRegressor(n_estimators=n_trees)
    forest.fit(X_train, y_train)
    inbag = fci.calc_inbag(X_train.shape[0], forest)
    V_IJ_unbiased = fci.random_forest_error(forest, X_train, X_test)
    npt.assert_equal(V_IJ_unbiased.shape[0], y_test.shape[0])

    # We cannot calculate inbag from a non-bootstrapped forest, because
    # scikit-learn trees do not store their own sample weights. If you did this
    # some other way, you can still use your own inbag.
    non_bootstrap_forest = RandomForestRegressor(n_estimators=n_trees,
                                                 bootstrap=False)

    npt.assert_raises(ValueError, fci.calc_inbag, X_train.shape[0],
                      non_bootstrap_forest)
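# If you did the resampling some other way, you can build the inbag matrix
# yourself and pass it via the inbag= keyword. A sketch, assuming the same
# layout as fci.calc_inbag returns: shape (n_train_samples, n_trees), where
# entry [i, j] counts how often sample i was drawn for tree j.
def manual_inbag(n_samples, n_trees, seed=0):
    rng = np.random.RandomState(seed)
    inbag = np.zeros((n_samples, n_trees))
    for t in range(n_trees):
        draws = rng.randint(0, n_samples, n_samples)  # one bootstrap draw per tree
        inbag[:, t] = np.bincount(draws, minlength=n_samples)
    return inbag

# e.g. fci.random_forest_error(forest, X_train, X_test,
#                              inbag=manual_inbag(X_train.shape[0], n_trees))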
def calibration_isotonic_regression(model_name, model, prob_model,
                                    X_calibration, y_calibration, X_train):
    # 1. Trains the calibration regressor, using calibration data as input in the first instance.
    # 2. It then takes the model's probability output on the test set and returns calibrated
    #    probabilities for further calculation of the calibrated std.
    # ref: https://arxiv.org/abs/1807.00263
    if model_name == 'Bayes_Ridge_model':
        y_hat_calibration, sem_hat_calibration = model.predict(X_calibration,
                                                               return_std=True)

    elif model_name == 'RF_model':
        y_hat_calibration = model.predict(X_calibration)
        sem_hat_calibration = np.sqrt(
            fci.random_forest_error(model, X_train, X_calibration))

    else:
        raise ValueError('Not able to calculate variance for model %s!' %
                         model_name)

    prob_per_int_y_calibration, prob_y_calibration, prob_y_calibration_expected, prob = count_entries_per_interval(
        y_calibration, y_hat_calibration, sem_hat_calibration)
    prob_model_y_calibration = predict_prob(y_calibration, y_hat_calibration,
                                            sem_hat_calibration)

    # isotonic regression
    from sklearn.isotonic import IsotonicRegression as IR
    ir = IR(out_of_bounds='clip')
    ir.fit(prob_model_y_calibration, prob_y_calibration)

    prob_test_calibrated = ir.transform(prob_model)
    return prob_test_calibrated
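# count_entries_per_interval and predict_prob are helpers not shown in this
# snippet. A plausible minimal sketch of predict_prob (an assumption, following
# the recalibration recipe of arXiv:1807.00263): evaluate the model's Gaussian
# predictive CDF N(y_hat, sem_hat**2) at each observed y.
from scipy.stats import norm

def predict_prob_sketch(y, y_hat, sem_hat):
    # predicted cumulative probability of each observed value under the model
    return norm.cdf(y, loc=y_hat, scale=sem_hat)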
Example #6
    def confidence_cal(self, train_data, test_data, rf):
        import forestci as fci
        # calculate inbag and unbiased variance
        inbag = fci.calc_inbag(train_data.shape[0], rf)
        V_IJ, V_IJ_unbiased = fci.random_forest_error(rf, train_data,
                                                      test_data)

        return V_IJ, V_IJ_unbiased
Example #7
	def predict(self, X):
		# compute predictions
		y_bar = super(RandomForestRegressorWithIntervals, self).predict(X)

		# compute variance estimate
		y_var = forestci.random_forest_error(self, self.X_train, X)
		y_std = np.sqrt(y_var)

		return y_bar, y_std
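	# predict() above relies on self.X_train; a minimal sketch (an assumption,
	# not shown in the original snippet) of the fit() override that stores it:
	def fit(self, X, y, sample_weight=None):
		self.X_train = X
		return super(RandomForestRegressorWithIntervals, self).fit(
			X, y, sample_weight=sample_weight)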
Example #8
    def confidence_cal(train_data, test_data, rf):
        import forestci as fci
        from matplotlib import pyplot as plt
        import numpy as np
        # calculate inbag and unbiased variance
        spam_inbag = fci.calc_inbag(train_data.shape[0], rf)
        V_IJ, V_IJ_unbiased = fci.random_forest_error(rf, train_data,
                                                      test_data)

        return V_IJ, V_IJ_unbiased
Example #9
def rf_predict_proba(self, x, return_var=False, train_x=None):
    predictions = self.predict_proba_orig(x)

    import forestci as fci

    if return_var:
        assert train_x is not None

        var = fci.random_forest_error(self, train_x, x)
        return predictions, var
    else:
        return predictions
Example #10
    def pred_int_calc(self, calcV_IJ=False):
        trueV = self.yP
        self.df = pd.DataFrame()
        self.df['v'] = trueV
        if calcV_IJ:
            self.df['V_IJ_unbiased'] = fci.random_forest_error(
                self.clf, self.X, self.XP)
        self.df['p_d'] = self.err_dn
        self.df['p_u'] = self.err_up
        self.df['p_m'] = self.err_mean
        incorrect = ((np.sum(self.df.v > self.df.p_u) +
                      np.sum(self.df.v < self.df.p_d)) / self.df.shape[0])
        return 1 - incorrect
Example #11
    def confidence_cal(train_data, test_data, rf):
        import forestci as fci
        from matplotlib import pyplot as plt
        import numpy as np
        # calculate inbag and unbiased variance
        inbag = fci.calc_inbag(train_data.shape[0], rf)
        V_IJ_unbiased = fci.random_forest_error(rf, train_data, test_data)

        print("inbag: {}".format(inbag))
        print("V_IJ_unbiased: {}".format(V_IJ_unbiased))
        # Plot error bars for predicted MPG using unbiased variance

        return inbag, V_IJ_unbiased
def do_fci():
    # Calculate the variance
    mpg_V_IJ_unbiased, pred_mean_t = fci.random_forest_error(
        rfr, X_train, X_test)
    # mpg_V_IJ_unbiased = fci.random_forest_error(rfr, X_train, X_test, calibrate=False)
    print(mpg_V_IJ_unbiased.shape)
    print(mpg_V_IJ_unbiased)

    pred_rf = rfr.predict(X_test)

    import pandas as pd
    df = pd.DataFrame()
    df['pred_rf'] = pred_rf
    df['pred_mean_t'] = pred_mean_t
    df['mpg_V_IJ_unbiased'] = mpg_V_IJ_unbiased
    df['mpg_V_IJ_unbiased_sqrt'] = np.sqrt(mpg_V_IJ_unbiased)
    pd.options.display.max_columns = df.shape[1]
    print(df.describe())
Example #13
def test_random_forest_error():
    X = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])

    y = np.array([70, 100, 60, 100, 120])

    train_idx = [2, 3, 4]
    test_idx = [0, 1]

    y_test = y[test_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    X_train = X[train_idx]

    n_trees = 4
    forest = RandomForestRegressor(n_estimators=n_trees)
    forest.fit(X_train, y_train)
    inbag = fci.calc_inbag(X_train.shape[0], forest)
    V_IJ_unbiased = fci.random_forest_error(forest, X_train, X_test,
                                            inbag=inbag)
    npt.assert_equal(V_IJ_unbiased.shape[0], y_test.shape[0])
Example #14
def test_with_calibration():
    # Test both with and without interpolation:
    for n in [25 * 5, 205 * 5]:
        X = np.random.rand(n).reshape(n // 5, 5)
        y = np.random.rand(n // 5)

        train_idx = np.arange(int(n // 5 * 0.75))
        test_idx = np.arange(int(n//5 * 0.75), n//5)

        y_test = y[test_idx]
        y_train = y[train_idx]
        X_test = X[test_idx]
        X_train = X[train_idx]

        n_trees = 4
        forest = RandomForestRegressor(n_estimators=n_trees)
        forest.fit(X_train, y_train)
        V_IJ_unbiased = fci.random_forest_error(forest, X_train, X_test)
        npt.assert_equal(V_IJ_unbiased.shape[0], y_test.shape[0])
Example #16
    def confidenceCal(self, train_data, test_data, predictions, test_y, rf):
        pmax = np.amax(predictions)
        tmax = np.amax(test_y)

        axismax = max(pmax, tmax)

        import forestci as fci
        # calculate inbag and unbiased variance
        inbag = fci.calc_inbag(train_data.shape[0], rf)
        V_IJ, V_IJ_unbiased = fci.random_forest_error(rf, train_data,
                                                      test_data)

        # print "inbag: {}".format(inbag)
        # print "V_IJ_unbiased: {}".format(V_IJ_unbiased)
        # # Plot error bars for predicted MPG using unbiased variance
        (_, caps, _) = plt.errorbar(predictions,
                                    test_y,
                                    yerr=np.sqrt(V_IJ),
                                    fmt='o',
                                    markersize=4,
                                    capsize=10,
                                    mfc='red',
                                    mec='green')
        for cap in caps:
            cap.set_markeredgewidth(1)
        plt.title('Error bars for Patient: ' + str(self.patient_id))

        plt.xlabel('Actual BG')
        plt.ylabel('Predicted BG')
        plt.xlim(0, axismax)
        plt.ylim(0, axismax)

        plt.savefig(
            "prediction/tmp/confidence_intervals_bias_patient{}.png".format(
                self.patient_id))
        plt.close()

        return V_IJ, V_IJ_unbiased
Example #17
    def confidence_cal(train_data, train_y, test_data, test_y, predictions, rf, patientID):
        import forestci as fci
        from matplotlib import pyplot as plt
        import numpy as np
        # calculate inbag and unbiased variance
        spam_inbag = fci.calc_inbag(train_data.shape[0], rf)
        V_IJ_unbiased = fci.random_forest_error(rf, train_data,
                                                test_data)

        # Plot forest prediction for emails and standard deviation for estimates
        # Blue points are spam emails; Green points are non-spam emails
        idx = np.where(test_y == 1)[0]
        plt.errorbar(predictions[idx, 1], np.sqrt(V_IJ_unbiased[idx]),
                     fmt='.', alpha=0.75, label='Hyper')

        idx = np.where(test_y == 0)[0]
        plt.errorbar(predictions[idx, 1], np.sqrt(V_IJ_unbiased[idx]),
                     fmt='.', alpha=0.75, label='Non')

        plt.xlabel('Prediction (hyper probability)')
        plt.ylabel('Standard deviation')
        plt.legend()
        plt.show()
Example #18
    def compute(self, X_test):
        if self.model_type == "gp":
            self.model.gp.fit(self.X_train, self.y_train)
            y_mean, y_std = self.model.gp.predict(X_test, return_std=True)
            y_variance = y_std**2
        else:
            self.model.rf.fit(self.X_train, self.y_train)
            y_mean = self.model.rf.predict(X_test)
            y_variance = fci.random_forest_error(self.model.rf, self.X_train,
                                                 X_test)
            y_std = np.sqrt(y_variance)

        z = (y_mean - self.current_optimal - self.trade_off) / y_std

        if self.mode == "ei":
            if y_std < 0.000001:
                return 0, y_mean, y_variance
            result = y_std * (z * norm.cdf(z) + norm.pdf(z))
        elif self.mode == "pi":
            result = norm.cdf(z)
        else:
            result = -(y_mean - self.trade_off * y_std)
        return np.squeeze(result), np.squeeze(y_mean), np.squeeze(y_variance)
def test_bagging_svr_error():
    X = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])

    y = np.array([70, 100, 60, 100, 120])

    train_idx = [2, 3, 4]
    test_idx = [0, 1]

    y_test = y[test_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    X_train = X[train_idx]

    n_trees = 4
    bagger = BaggingRegressor(base_estimator=SVR(), n_estimators=n_trees)
    bagger.fit(X_train, y_train)
    inbag = fci.calc_inbag(X_train.shape[0], bagger)
    for ib in [inbag, None]:
        for calibrate in [True, False]:
            V_IJ_unbiased = fci.random_forest_error(
                bagger, X_train, X_test, inbag=ib, calibrate=calibrate
            )
        npt.assert_equal(V_IJ_unbiased.shape[0], y_test.shape[0])
Example #20
def analyze_on_test_data(rf_model, X_test, X_train, y_test=None):
    plot_settings = {'c': '#ff5e78',
                     's': 10}
    error_plot_settings = {'ecolor': '#ff5e78',
                           'elinewidth': 0.5,
                           'alpha': 0.4,
                           'fmt': 'o'}
    y_test_pred = rf_model.predict(X_test)
    prediction_variance = fci.random_forest_error(rf_model, X_train, X_test)
    cli_95 = 1.96 * np.sqrt(prediction_variance)
    
    plt.rcParams['svg.fonttype'] = 'none'
    plt.scatter(range(len(y_test_pred)), y_test_pred, **plot_settings, alpha=0.6)
    plt.errorbar(range(len(y_test_pred)), 
                 y_test_pred, 
                 yerr=cli_95, 
                 **error_plot_settings)
    plt.xlabel('Sample Index', fontsize=20)
    plt.ylabel('Predicted Response', fontsize=20)
    plt.show()

    if y_test is not None:
        mae = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
        r2 = r2_score(y_true=y_test, y_pred=y_test_pred)
        plot_parity(x=y_test, 
                    y=y_test_pred, 
                    xlabel='True response',
                    ylabel='Predicted response',
                    **plot_settings, 
                    show_plot=False,
                    text='MAE: {:.2f} R2: {:.2f}'.format(mae, r2),
                    text_x=0.1,
                    text_y=0.9)
        plt.errorbar(y_test, y_test_pred, yerr=cli_95, **error_plot_settings)
        plt.show()
Example #21
def test_random_forest_error():
    X = np.array([[5, 2],
                  [5, 5],
                  [3, 3],
                  [6, 4],
                  [6, 6]])

    y = np.array([70, 100, 60, 100, 120])

    train_idx = [2, 3, 4]
    test_idx = [0, 1]

    y_test = y[test_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    X_train = X[train_idx]

    n_trees = 4
    forest = RandomForestRegressor(n_estimators=n_trees)
    forest.fit(X_train, y_train)
    inbag = fci.calc_inbag(X_train.shape[0], forest)
    for ib in [inbag, None]:
        for calibrate in [True, False]:
            V_IJ_unbiased = fci.random_forest_error(forest, X_train, X_test,
                                                    inbag=ib,
                                                    calibrate=calibrate)
        npt.assert_equal(V_IJ_unbiased.shape[0], y_test.shape[0])

    # We cannot calculate inbag from a non-bootstrapped forest, because
    # scikit-learn trees do not store their own sample weights. If you did this
    # some other way, you can still use your own inbag.
    non_bootstrap_forest = RandomForestRegressor(n_estimators=n_trees,
                                                 bootstrap=False)

    npt.assert_raises(ValueError, fci.calc_inbag, X_train.shape[0],
                      non_bootstrap_forest)
def demo_variance_prediction(classifier, X_Train, X_Test):
    prediction_variance = fci.random_forest_error(classifier, X_Train, X_Test)
    print({
        "prediction_mean": classifier.predict(X_Test),
        "prediction_variance": prediction_variance
    })
Example #23
# split mpg data into training and test set
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(
                                                   mpg_X, mpg_y,
                                                   test_size=0.25,
                                                   random_state=42
                                                   )

# create RandomForestRegressor
n_trees = 2000
mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42)
mpg_forest.fit(mpg_X_train, mpg_y_train)
mpg_y_hat = mpg_forest.predict(mpg_X_test)

# Plot predicted MPG without error bars
plt.scatter(mpg_y_test, mpg_y_hat)
plt.plot([5, 45], [5, 45], 'k--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()

# Calculate the variance:
mpg_V_IJ_unbiased = fci.random_forest_error(mpg_forest, mpg_X_train,
                                            mpg_X_test)

# Plot error bars for predicted MPG using unbiased variance
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o')
plt.plot([5, 45], [5, 45], 'k--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()
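# Optional follow-up (a sketch, assuming approximate normality of the forest
# predictions): turn the unbiased variance into 95% prediction intervals and
# check their empirical coverage on the test set.
mpg_std = np.sqrt(mpg_V_IJ_unbiased)
mpg_lower = mpg_y_hat - 1.96 * mpg_std
mpg_upper = mpg_y_hat + 1.96 * mpg_std
coverage = np.mean((mpg_y_test >= mpg_lower) & (mpg_y_test <= mpg_upper))
print('Empirical coverage of the 95% intervals:', coverage)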
Example #24
def predict(json):

    print('\n-----------------------')
    print('Started prediction')
    print('-----------------------')

    modelname = json['model']
    datloc = json['data_location']
    cases = json['data_cases']
    savloc = json['save_location']
    dist = False
    uq = False
    os.makedirs(savloc, exist_ok=True)

    compare_predict = False
    if "prediction_accuracy" in json:
        compare_predict = True
        target = json['prediction_accuracy']['target']
        type = json['prediction_accuracy']['type']

    thresh = json.get('prediction_threshold', 0.5)

    features_to_drop = json.get('features_to_drop')
    features_to_keep = json.get('features_to_keep')
    if features_to_drop and features_to_keep:
        quit('features_to_drop and features_to_keep both set')

    if json.get('uq', False):
        uq = True
        import forestci as fci

    if json.get('dist', False):
        dist = True
        from cfd2ml.utilities import mahalanobis

    # Read in ML model


#    filename = modelname + '.joblib'
    filename = modelname + '.p'
    print('\nReading model from ', filename)
    #    model = load(filename)
    model = pickle.load(open(filename, 'rb'))
    if isinstance(model, MondrianForestRegressor) or (
            isinstance(model, RandomForestRegressor)
            and uq is True):  #training data needed in these instances
        X_train = pd.read_csv(modelname + '_Xdat.csv')
        Y_train = pd.read_csv(modelname + '_Ydat.csv')[target]
        if (features_to_drop is not None):
            X_train = X_train.drop(columns=features_to_drop)
        elif (features_to_keep is not None):
            X_train = X_train[features_to_keep]
        # TODO: required for now, as pickle/joblib do not save a fitted MF properly
        if isinstance(model, MondrianForestRegressor):
            model.fit(X_train, Y_train)
    cmap = plt.get_cmap('tab10')
    # Open a figure axes
    fig1, ax1 = plt.subplots()
    fig2, ax2 = plt.subplots()

    # Read in each X_data, predict Y, write predicted Y
    for caseno, case in enumerate(cases):
        # Read in RANS (X) data
        filename = os.path.join(datloc, case + '_X.pkl')
        X_case = CaseData(filename)

        print('\n***********************')
        print(' Case %d: %s ' % (caseno + 1, case))
        print('***********************')

        X_pred = X_case.pd
        if (features_to_drop is not None):
            X_pred = X_pred.drop(columns=features_to_drop)
        elif (features_to_keep is not None):
            X_pred = X_pred[features_to_keep]

        # Predict HiFi (Y) data and store add to vtk
        Y_pred = CaseData(case + '_pred')
        Y_pred.vtk = vista.UnstructuredGrid(X_case.vtk.offset,
                                            X_case.vtk.cells,
                                            X_case.vtk.celltypes,
                                            X_case.vtk.points)
        if (type == 'classification'):
            Y_prob = pd.Series(
                model.predict_proba(X_pred)[:, 1]
            )  # only needed as a numpy ndarray, but converted to a pd Series for consistency
            Y_pred.pd = pd.Series(predict_with_threshold(Y_prob, thresh))
            Y_pred.vtk.point_arrays['Y_prob'] = Y_prob.to_numpy()
        elif (type == 'regression'):
            if isinstance(model, RandomForestRegressor):
                y_pred = model.predict(X_pred)
            elif isinstance(
                    model, MondrianForestRegressor
            ) and uq is False:  #if uq true prediction made below
                y_pred = model.predict(X_pred)

            # Uncertainty quantification
            if (uq is True):
                if isinstance(model, RandomForestRegressor):
                    print('Calculating infinitesimal jackknife variance')
                    y_var = fci.random_forest_error(model,
                                                    X_train,
                                                    X_pred,
                                                    calibrate=True)
                    y_sd = np.sqrt(np.maximum(y_var, 0))
                elif isinstance(model, MondrianForestRegressor):
                    print(
                        'Calculating mondrian forest posterior mean and standard deviation'
                    )
                    y_pred, y_sd = model.predict(X_pred, return_std=True)
                Y_pred.vtk.point_arrays['Y_std'] = y_sd
                # Print out rms of var
                sd_mean = np.mean(y_sd)
                y_mean = np.mean(y_pred)
                print('sd_mean/y_mean = ', 100 * sd_mean / y_mean, '%')

            if (dist is True):
                mah_dist = mahalanobis(x=X_pred, data=X_train)
                Y_pred.vtk.point_arrays['mah_dist'] = mah_dist
                print('Mean mahalanobis distance = ', np.mean(mah_dist))

        Y_pred.pd = pd.Series(
            y_pred
        )  # only needed as a numpy ndarray, but converted to a pd Series for consistency
        Y_pred.vtk.point_arrays['Y_pred'] = y_pred

        # Read in true HiFi (Y) data and compare to predict
        if compare_predict:
            filename = os.path.join(datloc, case + '_Y.pkl')
            Y_true = CaseData(filename)

            # Write Y_true to vtk for analysis
            index = Y_true.pd.columns.get_loc(target)
            Y_pred.vtk.point_arrays['Y_true'] = Y_true.pd.to_numpy()[:, index]

            # accuracy metrics
            if (type == 'classification'):
                predict_classifier_accuracy(Y_pred.pd, Y_true.pd[target])

                # Write TP, TN, FP, FN to vtk
                Y_pred.vtk.point_arrays['confuse'] = confusion_labels(
                    Y_pred.pd, Y_true.pd[target])

                # Calc precision, recall and decision thresholds
                precisions, recalls, thresholds = precision_recall_curve(
                    Y_true.pd[target], Y_prob)
                c = cmap(caseno)

                # Plot precision-recall curve with current decision threshold marked
                plot_precision_recall_threshold(precisions,
                                                recalls,
                                                thresholds,
                                                t=thresh,
                                                ax=ax1,
                                                c=c)

                # Plot precision and recall vs decision threshold
                plot_precision_recall_vs_threshold(precisions,
                                                   recalls,
                                                   thresholds,
                                                   ax=ax2,
                                                   c=c,
                                                   t=thresh,
                                                   case=case)

            elif (type == 'regression'):
                predict_regressor_accuracy(Y_pred.pd, Y_true.pd[target])
                Y_pred.vtk.point_arrays['error'] = local_error(
                    Y_pred.pd, Y_true.pd[target])

        filename = os.path.join(savloc, Y_pred.name + '.vtk')
        Y_pred.WriteVTK(filename)

    if (type == 'classification'):
        ax1.legend()
        ax2.legend()
        plt.show()

    print('\n-----------------------')
    print('Finished prediction')
    print('-----------------------')
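# A hypothetical input dict for predict() (illustrative only; all paths and
# names below are placeholders, not from the original source). The keys match
# what the function reads above.
example_json = {
    "model": "models/my_rf_model",       # loaded from models/my_rf_model.p
    "data_location": "data/",            # reads <case>_X.pkl (and <case>_Y.pkl)
    "data_cases": ["caseA", "caseB"],
    "save_location": "predictions/",
    "prediction_accuracy": {"target": "my_target", "type": "regression"},
    "prediction_threshold": 0.5,
    "uq": True,                          # infinitesimal-jackknife UQ via forestci
    "dist": False,
}
# predict(example_json)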
Example #25
    def predict(self, X_test):
        y_mean = self.rf.predict(X_test)
        y_variance = fci.random_forest_error(self.rf, self.X_train, X_test)
        y_std = np.sqrt(y_variance)
        return y_mean, y_std, y_variance
Example #26
spam_RFC.fit(spam_X_train, spam_y_train)
spam_y_hat = spam_RFC.predict_proba(spam_X_test)

idx_spam = np.where(spam_y_test == 1)[0]
idx_ham = np.where(spam_y_test == 0)[0]

# Histogram predictions without error bars:
fig, ax = plt.subplots(1)
ax.hist(spam_y_hat[idx_spam, 1], histtype='step', label='spam')
ax.hist(spam_y_hat[idx_ham, 1], histtype='step', label='not spam')
ax.set_xlabel('Prediction (spam probability)')
ax.set_ylabel('Number of observations')
plt.legend()

# Calculate the variance
spam_V_IJ_unbiased = fci.random_forest_error(spam_RFC, spam_X_train,
                                             spam_X_test)

# Plot forest prediction for emails and standard deviation for estimates
# Blue points are spam emails; Green points are non-spam emails
fig, ax = plt.subplots(1)
ax.scatter(spam_y_hat[idx_spam, 1],
           np.sqrt(spam_V_IJ_unbiased[idx_spam]),
           label='spam')

ax.scatter(spam_y_hat[idx_ham, 1],
           np.sqrt(spam_V_IJ_unbiased[idx_ham]),
           label='not spam')

ax.set_xlabel('Prediction (spam probability)')
ax.set_ylabel('Standard deviation')
plt.legend()
Example #27
def get_forest_conf_interval(rf_model, X_test, X_train):
    prediction_variance = fci.random_forest_error(rf_model, X_train, X_test)
    cli_95 = get_confidence_interval_from_std(np.sqrt(prediction_variance))
    return cli_95
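# get_confidence_interval_from_std is not shown here; a plausible minimal
# version (an assumption) is the usual normal-approximation half-width:
def get_confidence_interval_from_std(std, z=1.96):
    return z * std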
# split mpg data into training and test set
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(
    mpg_X, mpg_y, test_size=0.25, random_state=42)

# Create BaggingRegressor with SVR base estimators
n_estimators = 1000
mpg_bagger = BaggingRegressor(base_estimator=SVR(),
                              n_estimators=n_estimators,
                              random_state=42)
mpg_bagger.fit(mpg_X_train, mpg_y_train)
mpg_y_hat = mpg_bagger.predict(mpg_X_test)

# Plot predicted MPG without error bars
plt.scatter(mpg_y_test, mpg_y_hat)
plt.plot([5, 45], [5, 45], "k--")
plt.xlabel("Reported MPG")
plt.ylabel("Predicted MPG")
plt.show()

# Calculate the variance
mpg_V_IJ_unbiased = fci.random_forest_error(mpg_bagger, mpg_X_train,
                                            mpg_X_test)

# Plot error bars for predicted MPG using unbiased variance
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt="o")
plt.plot([5, 45], [5, 45], "k--")
plt.xlabel("Reported MPG")
plt.ylabel("Predicted MPG")
plt.show()
Example #30
    def _get_model_errors(cls, model, X, X_train, X_test, error_method='stdev_weak_learners', remove_outlier_learners=False):

        err_down = list()
        err_up = list()
        indices_TF = list()
        X_aslist = X.values.tolist()
        if model.model.__class__.__name__ in ['RandomForestRegressor', 'GradientBoostingRegressor', 'ExtraTreesRegressor',
                                              'BaggingRegressor', 'AdaBoostRegressor']:

            if error_method == 'jackknife_after_bootstrap':
                model_errors_var = random_forest_error(forest=model.model, X_test=X_test, X_train=X_train)
                # Wager method returns the variance. Take sqrt to turn into stdev
                model_errors = np.sqrt(model_errors_var)
                num_removed_learners = list()
                if remove_outlier_learners is True:
                    print("Warning: removal of outlier learners isn't supported with jackknife after bootstrap")
                for _ in model_errors:
                    num_removed_learners.append(0)

            elif error_method == 'stdev_weak_learners':
                num_removed_learners = list()
                for x in range(len(X_aslist)):
                    preds = list()
                    if model.model.__class__.__name__ == 'GradientBoostingRegressor':
                        # GradientBoostingRegressor stores its trees in an
                        # (n_estimators, 1) array
                        for pred in model.model.estimators_.tolist():
                            preds.append(pred[0].predict(np.array(X_aslist[x]).reshape(1, -1))[0])
                    else:
                        # RandomForest, Bagging, ExtraTrees and AdaBoost expose
                        # their weak learners directly in estimators_
                        for pred in model.model.estimators_:
                            preds.append(pred.predict(np.array(X_aslist[x]).reshape(1, -1))[0])

                    # Flag outlier predictions, perhaps the result of e.g. numerical issues in the ensemble of models
                    if remove_outlier_learners is True:
                        preds, num_outliers = cls._remove_outlier_preds(preds=preds)
                        num_removed_learners.append(num_outliers)
                    else:
                        num_removed_learners.append(0)

                    e_down = np.std(preds)
                    e_up = np.std(preds)
                    err_down.append(e_down)
                    err_up.append(e_up)

                nan_indices = np.where(np.isnan(err_up))
                nan_indices_sorted = np.array(sorted(nan_indices[0], reverse=True))
                for i, val in enumerate(list(err_up)):
                    if i in nan_indices_sorted:
                        indices_TF.append(False)
                    else:
                        indices_TF.append(True)

                model_errors = (np.array(err_up) + np.array(err_down)) / 2

            else:
                print('ERROR: error_method must be one of "stdev_weak_learners" or "jackknife_after_bootstrap"')
                sys.exit()

        if model.model.__class__.__name__ == 'GaussianProcessRegressor':
            preds = model.model.predict(X, return_std=True)[1]  # Get the stdev model error from the predictions of GPR
            err_up = preds
            err_down = preds
            model_errors = (np.array(err_up) + np.array(err_down)) / 2
            nan_indices = np.where(np.isnan(err_up))
            nan_indices_sorted = np.array(sorted(nan_indices[0], reverse=True))
            num_removed_learners = list()
            for i, val in enumerate(list(err_up)):
                num_removed_learners.append(0)
                if i in nan_indices_sorted:
                    indices_TF.append(False)
                else:
                    indices_TF.append(True)

        model_errors = pd.Series(model_errors, name='model_errors')
        num_removed_learners = pd.Series(num_removed_learners, name='num_removed_learners')

        return model_errors, num_removed_learners
Example #31
# compute errors
errors = yhat_reais - y_reais

# compute median absolute error
median_abs_error = np.median(np.absolute(errors))
print('median absolute error (in R$):', median_abs_error)

# compute proportional error (error / asking price)
proportional_errors = errors / y_reais
median_prop_error = np.median(np.absolute(proportional_errors))
mean_prop_error = np.mean(np.absolute(proportional_errors))
print('median absolute error (in %):', median_prop_error)
print('mean absolute error (in %):', mean_prop_error)

# estimate uncertainty
variances = fci.random_forest_error(model, X_train, X_test)
plt.errorbar(y_test, yhat, yerr=np.sqrt(variances), fmt='o', ecolor='red')
plt.plot([10, 16], [10, 16], 'k--')
plt.xlabel('actual price, in log(R$)')
plt.ylabel('predicted price, in log(R$)')
plt.show()

# check interval predictions
lower = yhat - np.sqrt(variances)
upper = yhat + np.sqrt(variances)
corrects = 0
for y_i, l, u in zip(y_test, lower, upper):
    if l <= y_i <= u:
        corrects += 1
print(corrects, 'corrects out of', len(yhat))
Example #32
def get_RF_ci(RF_type,RF_classi,X_train,X_test,y_test,y_score,
                classes=['yes','no'],plot_fh=None):
    """
    Get confidence intervals for predicted classifications

    :param RF_type: type of random forest algorithm
    :param RF_classi: Classification estimator object
    :param X_train: pandas dataframe, Training data  
    :param X_test: pandas dataframe, Testing data
    :param y_test: pandas dataframe with the target values
    :param y_score: pandas dataframe with the y score values
    :param classes: class labels used in the plot legend
    :param plot_fh: output path prefix for saving the plot
    """
    # calculate inbag and unbiased variance
    inbag = fci.calc_inbag(X_train.shape[0], RF_classi)
    V_IJ_unbiased = fci.random_forest_error(RF_classi, X_train, X_test,
                                            inbag=inbag)
    # Plot forest prediction for emails and standard deviation for estimates
    # Blue points are spam emails; Green points are non-spam emails
    idx = np.where(y_test == 1)[0]
    fig=plt.figure(figsize=[3,3])
    ax=plt.subplot(111)
    if RF_type=='classi':
        ax.errorbar(y_score[idx, 1], np.sqrt(V_IJ_unbiased[idx]),
                     fmt='.', alpha=0.75, label=classes[0])

        idx = np.where(y_test == 0)[0]
        ax.errorbar(y_score[idx, 1], np.sqrt(V_IJ_unbiased[idx]),
                     fmt='.', alpha=0.75, label=classes[1])

        ax.set_xlabel('Prediction probability')
        ax.set_ylabel('Standard deviation')
        space=0.3
        ax.set_ylim([ax.get_ylim()[0]*(1+space),
                     ax.get_ylim()[1]*(1+space)])
        leg=ax.legend(loc='upper right',frameon=True)
        leg.get_frame().set_alpha(0.5)
        # plt.axis('equal')
    if RF_type=='regress':
        # Plot error bars for predicted MPG using unbiased variance
        ax.errorbar(y_test, y_score, yerr=np.sqrt(V_IJ_unbiased), fmt='o')
        xlim,ylim=get_axlims(y_test,y_score,
                             space=0.1,equal=True)
        ax.plot(xlim,xlim, '--',color='gray')
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_xlabel('Test')
        ax.set_ylabel('Predicted')
        results,_,_=get_regression_metrics(y_test,y_score)
        logging.info(results.replace('\n',' '))
        ax.text(0, 1, results,
            horizontalalignment='left',
            verticalalignment='top',
            transform=ax.transAxes)
        data_regress=pd.DataFrame({'y_test':y_test,
                                    'y_pred':y_score,
                                    'err':np.sqrt(V_IJ_unbiased)
                                    })
        if plot_fh is not None:
            data_regress.to_csv('%s.csv' % plot_fh)
    ax.grid(True)
    saveplot(plot_fh)
Example #33
def get_RF_ci(RF_type,
              RF_classi,
              X_train,
              X_test,
              y_test,
              y_score,
              classes=['yes', 'no'],
              plot_fh=None):
    """
    Get confidence intervals for predicted classifications

    :param RF_type: type of random forest algorithm
    :param RF_classi: Classification estimator object
    :param X_train: pandas dataframe, Training data  
    :param X_test: pandas dataframe, Testing data
    :param y_test: pandas dataframe with the target values
    :param y_score: pandas dataframe with the y score values
    :param classes: class labels used in the plot legend
    :param plot_fh: output path prefix for saving the plot
    """
    # calculate inbag and unbiased variance
    inbag = fci.calc_inbag(X_train.shape[0], RF_classi)
    V_IJ_unbiased = fci.random_forest_error(RF_classi, X_train, X_test,
                                            inbag=inbag)
    # Plot forest prediction for emails and standard deviation for estimates
    # Blue points are spam emails; Green points are non-spam emails
    idx = np.where(y_test == 1)[0]
    fig = plt.figure(figsize=[3, 3])
    ax = plt.subplot(111)
    if RF_type == 'classi':
        ax.errorbar(y_score[idx, 1],
                    np.sqrt(V_IJ_unbiased[idx]),
                    fmt='.',
                    alpha=0.75,
                    label=classes[0])

        idx = np.where(y_test == 0)[0]
        ax.errorbar(y_score[idx, 1],
                    np.sqrt(V_IJ_unbiased[idx]),
                    fmt='.',
                    alpha=0.75,
                    label=classes[1])

        ax.set_xlabel('Prediction probability')
        ax.set_ylabel('Standard deviation')
        space = 0.3
        ax.set_ylim(
            [ax.get_ylim()[0] * (1 + space),
             ax.get_ylim()[1] * (1 + space)])
        leg = ax.legend(loc='upper right', frameon=True)
        leg.get_frame().set_alpha(0.5)
        # plt.axis('equal')
    if RF_type == 'regress':
        # Plot error bars for predicted MPG using unbiased variance
        ax.errorbar(y_test, y_score, yerr=np.sqrt(V_IJ_unbiased), fmt='o')
        xlim, ylim = get_axlims(y_test, y_score, space=0.1, equal=True)
        ax.plot(xlim, xlim, '--', color='gray')
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_xlabel('Test')
        ax.set_ylabel('Predicted')
        results, _, _ = get_regression_metrics(y_test, y_score)
        logging.info(results.replace('\n', ' '))
        ax.text(0,
                1,
                results,
                horizontalalignment='left',
                verticalalignment='top',
                transform=ax.transAxes)
        data_regress = pd.DataFrame({
            'y_test': y_test,
            'y_pred': y_score,
            'err': np.sqrt(V_IJ_unbiased)
        })
        if plot_fh is not None:
            data_regress.to_csv('%s.csv' % plot_fh)
    ax.grid(True)
    saveplot(plot_fh)
import forestci as fci

# retrieve mpg data from machine learning library
mpg_data = fetch_mldata('mpg')

# separate mpg data into predictors and outcome variable
mpg_X = mpg_data["data"]
mpg_y = mpg_data["target"]

# split mpg data into training and test set
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(
    mpg_X, mpg_y, test_size=0.25, random_state=42)

# create RandomForestRegressor
n_trees = 2000
mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42)
mpg_forest.fit(mpg_X_train, mpg_y_train)
mpg_y_hat = mpg_forest.predict(mpg_X_test)

# calculate inbag and unbiased variance
mpg_inbag = fci.calc_inbag(mpg_X_train.shape[0], mpg_forest)
mpg_V_IJ_unbiased = fci.random_forest_error(mpg_forest, mpg_X_train,
                                            mpg_X_test, inbag=mpg_inbag)

# Plot error bars for predicted MPG using unbiased variance
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o')
plt.plot([5, 45], [5, 45], '--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()
Example #35
def train_cv_one_fold(arg):
    g = None
    if len(arg) == 6:
        # group information is available
        x, y, h, one_kf, g, args = arg
    else:
        # no group information
        x, y, h, one_kf, args = arg
    pipeline = []
    ##
    ## Split into training and test sets
    ##
    train_idx, test_idx = one_kf
    if args.train_data_sample is not None:
        train_idx = np.random.choice(train_idx,
                                     args.train_data_sample,
                                     replace=False)
    train_x = np.copy(x[train_idx])
    train_y = y[train_idx]
    test_x = np.copy(x[test_idx])
    test_y = y[test_idx]
    test_g = g[test_idx] if g is not None else None
    ##
    ## Select the model
    ##
    if args.task == "regression":
        clf, param_grid = get_regressor_model(args)
    else:
        clf, param_grid = get_classifier_model(args)
    result = {}
    ##
    ## Perform feature selection
    ##
    selected_feature = None
    if args.feature_selection:
        ##
        ## Run feature selection and predict with the selected features
        ##
        if args.num_features is not None:
            rfe = RFE(clf, args.num_features)
        else:
            rfe = RFECV(clf, cv=3)
        mask = ~np.isnan(train_y)
        rfe = rfe.fit(train_x[mask, :], train_y[mask])
        """
        # Uncomment to save the predictions made after feature selection
        result["feature_selection_pred_y"] = rfe.predict(test_x)
        prob_y = rfe.predict_proba(test_x) if hasattr(clf, "predict_proba") else None
        result["feature_selection_prob_y"] = prob_y
        """
        ##
        ## Save the selected features
        ##
        selected_feature = rfe.support_
        print("=== selected feature ===")
        if h is None:
            selected_feature_name = [
                i for i, el in enumerate(selected_feature) if el
            ]
            print(len(selected_feature_name), ":", selected_feature_name)
        else:
            selected_feature_name = [
                attr for attr, el in zip(h, selected_feature) if el
            ]
            print(len(selected_feature_name), ":", selected_feature_name)
            result["selected_feature_name"] = selected_feature_name
        result["selected_feature"] = selected_feature
        result["feature_name"] = selected_feature
        ##
        ## Restrict the training/test data in this fold to the selected features
        ##
        train_x = rfe.transform(train_x)
        test_x = rfe.transform(test_x)
        pipeline.append(rfe)
    if h is not None:
        result["feature_name"] = h

    if args.grid_search:
        ##
        ## Select hyperparameters by grid search.
        ## To evaluate hyperparameters, the training set is further split by
        ## cross-validation into a set for fitting parameters and a validation
        ## set for assessing them.
        ##
        grid_search = sklearn.model_selection.GridSearchCV(
            clf, param_grid, cv=args.param_search_splits)
        mask = ~np.isnan(train_y)
        grid_search.fit(train_x[mask, :], train_y[mask])

        ##
        ## Save the best hyperparameters and score
        ##
        print("Best parameters: {}".format(grid_search.best_params_))
        print("Best cross-validation: {}".format(grid_search.best_score_))
        result.update({
            "param": grid_search.best_params_,
            "best_score": grid_search.best_score_,
        })
        """
        ## Uncomment to evaluate the model with the best hyperparameters on the test data and save the results
        pred_y = grid_search.predict(test_x)
        prob_y = grid_search.predict_proba(test_x) if hasattr(grid_search, "predict_proba") else  None
        result["grid_search_pred_y"] = pred_y
        prob_y = rfe.predict_proba(test_x) if hasattr(clf, "predict_proba") else None
        result["grid_search_prob_y"] = prob_y
        """
        ##
        ## Keep the estimator with the best hyperparameters
        ## (no refitting on the whole training set at this stage)
        ##
        clf = grid_search.best_estimator_
    if args.opt:
        clf = optimize(train_x, train_y)

    ##
    ## Refit clf on the whole training set
    ##
    mask = ~np.isnan(train_y)
    clf.fit(train_x[mask, :], train_y[mask])
    ##
    ## Output results specific to each type of predictor
    ##
    # Predictive standard deviation for Bayesian ridge regression
    if isinstance(clf, sklearn.linear_model.BayesianRidge):
        pred_y, pred_y_std = clf.predict(test_x, return_std=True)
        result["pred_y_std"] = pred_y_std
    else:
        pred_y = clf.predict(test_x)

    # Feature importances
    if hasattr(clf, "feature_importances_"):
        fi = clf.feature_importances_
        result["feature_importance"] = fi
        fi_str = ",".join(map(str, fi))
        print("feature_importance", len(fi), ":", fi_str)

    # Predictive standard deviation for the random forest
    if isinstance(clf, RandomForestRegressor):
        if args.fci:
            import forestci as fci
            unbiased_var = fci.random_forest_error(clf, train_x, test_x)
            result["test_y_std"] = np.sqrt(unbiased_var)

    ##
    ## Save predictions and indices
    ##
    result["test_y"] = test_y
    result["test_idx"] = test_idx
    result["test_group"] = test_g
    result["pred_y"] = pred_y
    prob_y = None
    if hasattr(clf, "predict_proba"):
        prob_y = clf.predict_proba(test_x)
    result["prob_y"] = prob_y
    pipeline.append(clf)
    ##
    ## Evaluation
    ##
    #if test_g is not None:
    #    result=evaluate_group(test_y, pred_y, prob_y, test_g, args, result=result)
    result = evaluate(test_y, pred_y, prob_y, args, result)
    if "accuracy" in result:
        if args.task == "binary":
            print("Cross-validation test accuracy: %3f" % (result["accuracy"]))
            print("Cross-validation test AUC: %3f" % (result["auc"]))
        if args.task == "multiclass":
            for i, auc in enumerate(result["auc"]):
                print("Task %d Cross-validation test AUC: %3f" % (i, auc))
            acc = result["accuracy"]
            print("Cross-validation test accuracy: %3f" % (acc))
    else:
        print("Cross-validation r2: %3f" % (result["r2"]))

    return (result, pipeline)