Пример #1
0
def exp_space_leakage(data):
    """ Checks whether the proximity features "leak" spatial information
        to GBRT.

        Delegates to `plot_space_leakage`, which scatter-plots spatial
        distance against Euclidean distance in the subspace spanned by
        PROXIMITY_FEATURES for 20000 random pairs of data points. Results
        are dumped to 'space_leakage.txt' and the figure is saved to
        <OUT_DIR>/space-leakage.png.
    """
    n_pairs = 20000
    plot_space_leakage(
        data,
        n_pairs,
        features=PROXIMITY_FEATURES,
        dumpfile='space_leakage.txt',
        replot=False,
    )
    save_cur_fig(
        'space-leakage.png',
        title='Spatial information leakage through proximity features',
        set_title_for=None,
    )
Пример #2
0
def exp_sensitivity(data):
    """ Measures how sensitive GBRT and linear regression are to
        perturbations in the training GHF.

        ROIs use the Greenland radius and sample density (11.3), 50 centers
        are used, and noise amplitudes run from 0.025 up to (but excluding)
        0.31 in steps of 0.025. Results are dumped to 'sensitivity.txt' and
        the figure is saved to <OUT_DIR>/sensitivity.png.
    """
    greenland_density = 11.3
    amplitudes = np.arange(0.025, .31, .025)
    n_centers = 50
    plot_sensitivity_analysis(
        data,
        greenland_density,
        GREENLAND_RADIUS,
        amplitudes,
        n_centers,
        dumpfile='sensitivity.txt',
        replot=False,
    )
    save_cur_fig(
        'sensitivity.png',
        title='GBRT prediction sensitivity to noise in training GHF',
        set_title_for=None,
    )
Пример #3
0
def exp_feature_importance(data):
    """ Plots feature importances averaged over 50 ROIs that share the
        radius and sample density of the GrIS train/validation split.

        The maximum tree depth is raised to 8 so that a few influential
        features do not take over all trees. Results are dumped to
        'feature_importances.txt' and the figure is saved as
        'feature-importance.png'.
    """
    n_centers = 50
    greenland_density = 11.3
    tree_depth = 8
    plot_feature_importance_analysis(
        data,
        greenland_density,
        GREENLAND_RADIUS,
        n_centers,
        dumpfile='feature_importances.txt',
        max_depth=tree_depth,
        replot=False,
    )
    save_cur_fig(
        'feature-importance.png',
        title='Relative feature importances in GBRT',
        set_title_for=None,
    )
Пример #4
0
def exp_partial_dependence():
    """ Produces one-way and two-way partial dependence plots for GBRT.

        The Greenland training set is loaded and its 'lat'/'lon' columns are
        dropped before plotting. The one-way plot is requested with no
        feature restriction (`include_features=None`); the two-way plot is
        restricted to a fixed list of six top features. Figures are saved as
        'partial-dependence-one-way.png' and 'partial-dependence-two-way.png'.
    """
    features, targets, _unused = greenland_train_test_sets()
    features = features.drop(['lat', 'lon'], axis=1)

    top_features = ['age', 'G_d_2yng_r', 'd_2trench', 'litho_asth', 'ETOPO_1deg', 'moho_GReD']

    # (n_ways, feature subset, output file, figure title), run in order.
    runs = (
        (1, None, 'partial-dependence-one-way.png', 'One way partial dependences'),
        (2, top_features, 'partial-dependence-two-way.png', 'Two way partial dependences'),
    )
    for n_ways, subset, png_name, fig_title in runs:
        plot_partial_dependence(features, targets, n_ways=n_ways, include_features=subset)
        save_cur_fig(png_name, title=fig_title, set_title_for=None)
Пример #5
0
def exp_generalization(data):
    """ Evaluates the generalization power of GBRT with increasing complexity
        (number of regression trees). This is used to verify that GBRT is
        robust against overfitting and to pick an appropriate number of trees
        for reported results and used in all other experiments (cf. `util.GBRT_params`).

        The numbers of estimators tried are 50..650 in steps of 100 followed
        by 750..3000 in steps of 750. Results are dumped to
        'generalization.txt' and the figure is saved as 'generalization.png'.
    """
    radius = GREENLAND_RADIUS
    ncenters = 50
    roi_density = 11.3 # Greenland
    # `range` objects cannot be concatenated with `+` on Python 3, so both
    # are materialized as lists first; this is equally valid on Python 2.
    ns_estimators = list(range(50, 750, 100)) + list(range(750, 3001, 750))
    dumpfile = 'generalization.txt'
    plotfile = 'generalization.png'
    plot_generalization_analysis(data, roi_density, radius, ncenters, ns_estimators, dumpfile=dumpfile, replot=False)
    save_cur_fig(plotfile, title='GBRT generalization power', set_title_for=None)
Пример #6
0
def exp_error_by_density(data):
    """ Evaluates prediction error (normalized rmse and r2) for GBRT, linear
        regression and constant predictor over increasingly large ROI sample
        densities, constrained to one region and using the Greenland radius.

        Densities are 1 followed by 5..50 in steps of 5, with 50 centers.
        The region may be 'NA-WE', 'NA', 'WE' or None (i.e. all); it is
        fixed to 'NA-WE' here. Results are dumped to
        error_by_density[<region>].txt and the plot is saved to
        <OUT_DIR>/error_by_density[<region>].png.
    """
    region = 'NA-WE'
    n_centers = 50
    sample_densities = np.append(np.array([1]), np.arange(5, 51, 5))
    plot_error_by_density(
        data,
        sample_densities,
        GREENLAND_RADIUS,
        n_centers,
        region=region,
        dumpfile='error_by_density[%s].txt' % region,
        replot=False,
    )
    save_cur_fig('error_by_density[%s].png' % region)
Пример #7
0
def exp_error_by_radius(data):
    """ Evaluates prediction error (normalized rmse and r2) for GBRT, linear
        regression and constant predictor by using increasingly large radii for
        ROIs, constrained to the specified region, with sample density equal to
        that of Greenland. Plot is saved to <OUT_DIR>/error_by_radius[<region>].png.

        Radii are 500..4000 in steps of 500, with 50 centers and the region
        fixed to 'NA-WE'. A one-line progress banner is written to stderr
        before the analysis starts.
    """
    # No single fixed radius is needed here: the experiment sweeps `radii`.
    roi_density = 11.3 # Greenland
    ncenters = 50
    radii = np.arange(500, 4001, 500)
    region = 'NA-WE'
    dumpfile = 'error_by_radius[%s].txt' % region
    plotfile = 'error_by_radius[%s].png' % region

    sys.stderr.write('=> Experiment: Error by Radius (region: %s, no. centers: %d, no. radii: %d)\n' % (region, ncenters, len(radii)))
    plot_error_by_radius(data, roi_density, radii, ncenters, region=region, dumpfile=dumpfile, replot=False)
    save_cur_fig(plotfile)
Пример #8
0

if __name__ == '__main__':
    # Script entry point: plot the Greenland training GHF, the prescribed GHF
    # around measurements, and the GBRT predictions on the test points.
    X_train, y_train, X_test = greenland_train_test_sets()

    # `Series.as_matrix()` was removed in pandas 1.0; `.values` yields the
    # same underlying array and is available on old and new pandas alike.
    # Coordinates are pulled out before the columns are dropped so they are
    # only used for plotting, not as model inputs.
    train_lons = X_train.lon.values
    train_lats = X_train.lat.values
    X_train = X_train.drop(['lat', 'lon'], axis=1)

    test_lons = X_test.lon.values
    test_lats = X_test.lat.values
    X_test = X_test.drop(['lat', 'lon'], axis=1)

    # -------------------- Plot training data  -------------------------
    plot_training_GHF(train_lons, train_lats, y_train)
    save_cur_fig('greenland_training_GHF.png', title='GHF at training set')

    plot_gaussian_prescribed_GHF(train_lons, train_lats, y_train)
    save_cur_fig(
        'greenland_prescribed_GHF.png',
        title=
        'Points with prescribed GHF \n around GHF measurements (mW m$^{-2}$)')

    # -------------------- Plot predicted results ----------------------
    reg = train_gbrt(X_train, y_train)
    y_pred = reg.predict(X_test)

    plot_prediction_points(test_lons, test_lats, y_pred)
    save_cur_fig('greenland_prediction_points.png',
                 title='GHF predicted for Greenland (mW m$^{-2}$)')
Пример #9
0
                       parallel_step=10.,
                       meridian_step=10.,
                       colorbar_args=COLORBAR_ARGS,
                       scatter_args=scatter_args)

    equi(m,
         center[0],
         center[1],
         GREENLAND_RADIUS,
         lw=2,
         linestyle='-',
         color='black',
         alpha=.5)
    title = r'$GHF - \widehat{GHF}$ on validation set with ' + \
            r'$\rho_{ROI}$ = %d'%roi_density
    save_cur_fig('%d-diff-map.png' % roi_density, title=title)

    plot_test_pred_linregress(y_test, y_pred, label='GBRT', color='b')
    save_cur_fig('%d_linear_correlation.png' % roi_density,
                 title=r'$\rho_{ROI}$ = %i, $r^2=%.2f, RMSE=%.2f$' %
                 (roi_density, r2, rmse))

### Main Text Figure 1
## GBRT regression: separate the GHF target from the remaining columns and
## hold out 10% of the rows for testing (fixed seed for reproducibility).
y = data.GHF
X = data.drop(['GHF'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=11)