# Shared imports for the examples in this section (inferred from usage below;
# helpers such as PreprocessData, PreprocessData_tr_ts, BuildNN,
# BuildRandomForest, train_est, save_est and write_netcdf_rf are defined
# elsewhere in the source modules).
import pickle

import numpy as np
import matplotlib.pyplot as plt
from netCDF4 import Dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

import ml_load
import ml_plot
import ml_train

# n_cv and n_jobs are module-level settings in the source; example values (assumed):
n_cv = 5      # number of cross-validation folds
n_jobs = -1   # use all available cores for cross-validation


def plot_train_test_error_nocv_vs_n_trn_exs(f_ppi, o_ppi, max_z=40000.0,
                                            training_expt='abs1.0_norf_spinup',
                                            n_trees=10, min_samples_leaf=10,
                                            rain_only=False,
                                            no_cos=True,
                                            use_rh=False):

    datadir, trainfile, testfile, _ = ml_load.GetDataPath(training_expt)

    f_test, o_test, _, z, _, _ = ml_load.LoadData(testfile, max_z, n_trn_exs=None, rain_only=rain_only, no_cos=no_cos, use_rh=use_rh)

    rf = RandomForestRegressor(n_estimators=n_trees, min_samples_leaf=min_samples_leaf, random_state=123, warm_start=False)

    n_trn_exs = np.array([1, 5, 10, 30, 50, 70, 80, 90])*10000

    test_error = np.zeros(len(n_trn_exs))
    train_error = np.zeros(len(n_trn_exs))

    for i in range(len(n_trn_exs)):
       f, o, _, z, rho, _  = ml_load.LoadData(trainfile, max_z, n_trn_exs=n_trn_exs[i], rain_only=rain_only, no_cos=no_cos, use_rh=use_rh)
       f_pp = ml_load.init_pp(f_ppi, f)
       o_pp = ml_load.init_pp(o_ppi, o)
       f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
       o_scl = ml_load.transform_data(o_ppi, o_pp, o, z)
       rf.fit(f_scl, o_scl)
       f_test_scl = ml_load.transform_data(f_ppi, f_pp, f_test, z)
       o_test_scl = ml_load.transform_data(o_ppi, o_pp, o_test, z)
       test_error[i] = 1.0 - rf.score(f_test_scl, o_test_scl)  # error = 1 - R^2
       train_error[i] = 1.0 - rf.score(f_scl, o_scl)
       print(str(n_trn_exs[i]) + ' test error: ' + str(test_error[i]))
       print(str(n_trn_exs[i]) + ' train error: ' + str(train_error[i]))

    print(test_error)
    print(train_error)
    fig = plt.figure()
    fscale = 100000.0
    plt.plot(n_trn_exs/fscale, test_error, '-o', label='test')
    plt.plot(n_trn_exs/fscale, train_error, '-o', label='train')
    plt.xlim(-0.15, 9.5)
    plt.ylim(0,0.51) 
    plt.xlabel("n_trn_exs")
    plt.ylabel("Error")
    plt.legend(loc="upper right")
    plt.legend(frameon=False)
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    plt.tight_layout() # avoid overlap
    plt.show()
    fig.savefig('figs_errors/error_test_train_nocv_vs_n_trn_exs.eps', bbox_inches='tight')
    plt.close()
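
# Usage sketch (assumed preprocessor dicts; 'NoScaler' features suffice for a
# random forest, matching the convention used later in this section):
#   plot_train_test_error_nocv_vs_n_trn_exs({'name': 'NoScaler'},
#                                           {'name': 'StandardScaler'})
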
def plot_error_vs_n_trn_exs(f_ppi, o_ppi, max_z=40000.0,
                            training_expt='abs1.0_norf_spinup',
                            n_trees=10, min_samples_leaf=10,
                            rain_only=False,
                            no_cos=True,
                            use_rh=False,
                            load_results=True,
                            save_results=False):
   

    if load_results:
     print('loading results')
     n_trn_exs, cv_error, cv_error_std = pickle.load(open('figs_errors/error_vs_n_trn_exs.pkl', 'rb'))
    else:
     datadir, trainfile, _, _ = ml_load.GetDataPath(training_expt)

     rf = RandomForestRegressor(n_estimators=n_trees, min_samples_leaf=min_samples_leaf, max_features=1.0/3.0, random_state=123, warm_start=False)

     n_trn_exs = np.array([1, 5, 10, 30, 50, 70, 80, 90])*10000

     cv_error = np.zeros(len(n_trn_exs)) 
     cv_error_std = np.zeros(len(n_trn_exs)) # standard deviation of error estimate across folds

     for i in range(len(n_trn_exs)):
        f, o, _, z, rho, _  = ml_load.LoadData(trainfile, max_z, n_trn_exs=n_trn_exs[i], rain_only=rain_only, no_cos=no_cos, use_rh=use_rh)
        f_pp = ml_load.init_pp(f_ppi, f)
        o_pp = ml_load.init_pp(o_ppi, o)
        f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
        o_scl = ml_load.transform_data(o_ppi, o_pp, o, z)
        scores = cross_val_score(rf, f_scl, o_scl, cv=n_cv, n_jobs=n_jobs)
        cv_error[i] = 1-scores.mean()
        cv_error_std[i] = scores.std()
        print(str(n_trn_exs[i]) + ': ' + str(cv_error[i]))

     if save_results:
      print('saving results')
      pickle.dump([n_trn_exs, cv_error, cv_error_std], open('figs_errors/error_vs_n_trn_exs.pkl', 'wb'))


    print(n_trn_exs)
    print(cv_error)
    print(np.max(cv_error_std/np.sqrt(n_cv)))  # max standard error of the CV estimate

    fig = plt.figure(figsize=(3.0,2.25))
    fscale = 100000.0
    plt.plot(n_trn_exs/fscale, cv_error, '-o')
    plt.xlim(-0.15, 9.5)
    plt.ylim(0,0.51) 
    plt.xlabel("Number of training examples ($10^5$)")
    plt.ylabel("Error")
    #plt.legend(loc="upper right")
    #plt.legend(frameon=False)
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    plt.tight_layout() # avoid overlap
    plt.show()
    fig.savefig('figs_errors/error_vs_n_trn_exs.eps', bbox_inches='tight')
    plt.close()
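
# Typical workflow (a sketch): compute and cache the curve once, then replot
# from the pickle on later runs.
#   plot_error_vs_n_trn_exs({'name': 'NoScaler'}, {'name': 'StandardScaler'},
#                           load_results=False, save_results=True)  # first run
#   plot_error_vs_n_trn_exs({'name': 'NoScaler'}, {'name': 'StandardScaler'})  # replot
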
# Example #3
def train_wrapper(f_ppi,
                  o_ppi,
                  training_expt,
                  input_vert_dim,
                  output_vert_dim,
                  input_vert_vars,
                  output_vert_vars,
                  flag_dict,
                  do_nn=False,
                  n_iter=None,
                  do_train=True,
                  no_cos=True,
                  use_rh=False,
                  max_z=40000.0,
                  rain_only=False,
                  n_trn_exs=None,
                  plot_training_results=False,
                  n_trees=100,
                  min_samples_leaf=10,
                  max_depth=25,
                  n_layers=2,
                  n_hid_neur=10,
                  n_stable=None,
                  weight_decay=0.0,
                  do_wind_input=False,
                  do_diffusion=True,
                  scale_level=False,
                  rewight_outputs=False,
                  weight_list=[1, 1],
                  is_cheyenne=False,
                  only_plot=False):
    """Loads training data and trains and stores estimator

    Args:
        f_ppi (dict): The type of preprocessing to do to the features (inputs)
        o_ppi (dict): The type of preprocessing to do to the targets (outputs)
        n_layers (int): Number of layers in the NN
        n_hid_neur (int): Number of hidden neurons in each layer
        n_iter (int): Number of iterations
        n_stable (int): Number of iterations after stability reached
        max_z (float): Don't train on data above this level
        weight_decay (float): Regularization strength. 0 is no regularization
        rain_only (bool): Only train on precipitating examples
        n_trn_exs (int): Number of training examples to learn on
        do_nn (bool): Use an ANN instead of a random forest 
        no_cos (bool): If true, don't weight by cosine(latitude)
        min_samples_leaf (int): minimum samples per leaf
        plot_training_results (bool): Whether to also plot the model on training data
        use_rh (bool): use generalized relative humidity instead of total non-precip water as feature
        do_train (bool): whether to train (just plot the results if false)
    Returns:
        str: String id of trained NN
    """
    # Load data (note LoadData seeds the random number generator)

    if not only_plot:
        datadir, trainfile, testfile, pp_str = ml_load.GetDataPath(
            training_expt, wind_input=do_wind_input, is_cheyenne=is_cheyenne)

        f, o, y, z, rho, p = ml_load.LoadData(
            trainfile,
            max_z,
            input_vert_vars=input_vert_vars,
            output_vert_vars=output_vert_vars,
            rain_only=rain_only,
            n_trn_exs=n_trn_exs,
            no_cos=no_cos,
            use_rh=use_rh,
            wind_input=do_wind_input,
            exclusion_flag=flag_dict['exclusion_flag'])

        #load test data
        tf, to, ty, tz, trho, tp = ml_load.LoadData(
            testfile,
            max_z,
            input_vert_vars=input_vert_vars,
            output_vert_vars=output_vert_vars,
            rain_only=rain_only,
            n_trn_exs=n_trn_exs,
            no_cos=no_cos,
            use_rh=use_rh)

        # Scale data (both train and test)
        f_pp, f_scl, tf_scl, o_pp, o_scl, to_scl, pp_str = PreprocessData_tr_ts(
            f_ppi,
            f,
            tf,
            o_ppi,
            o,
            to,
            pp_str,
            n_trn_exs,
            z,
            input_vert_dim,
            input_vert_vars,
            output_vert_dim,
            output_vert_vars,
            scale_level,
            rewight_outputs=rewight_outputs,
            weight_list=weight_list)  #Yani TO DO!!!

        #Scale test data
        # tf_pp, tf_scl, to_pp, to_scl, tpp_str = PreprocessData(f_ppi, tf, o_ppi, to, pp_str, n_trn_exs, tz)

        # # Scale data
        # f_pp, f_scl, o_pp, o_scl, pp_str = PreprocessData(f_ppi, f, o_ppi, o, pp_str, n_trn_exs, z)

        # Either build a random forest or build a neural network
        if do_nn:
            regularize = CatchRegularization(weight_decay)
            est, est_str = BuildNN(max_z,
                                   n_layers,
                                   'Rectifier',
                                   n_hid_neur,
                                   'momentum',
                                   pp_str,
                                   batch_size=100,
                                   n_stable=n_stable,
                                   n_iter=n_iter,
                                   learning_momentum=0.9,
                                   learning_rate=0.01,
                                   regularize=regularize,
                                   weight_decay=weight_decay,
                                   valid_size=0.2)
        else:
            est, est_str = BuildRandomForest(max_z, n_trees, min_samples_leaf,
                                             pp_str, max_depth, do_diffusion)

        est_str = UpdateName(no_cos, use_rh, rain_only, est_str)

        # Print details about the ML algorithm we are using
        print(est_str + ' Using ' + str(f.shape[0]) +
              ' training examples with ' + str(f.shape[1]) +
              ' input features and ' + str(o.shape[1]) + ' output targets')

        # Train the estimator
        if do_train:
            est, est_errors, train_score, test_score = train_est(
                est, est_str, f_scl, o_scl, tf_scl, to_scl, do_nn)

            # Encode the first two decimal digits of the test and train scores
            # into the estimator name (e.g. scores of 0.85 / 0.88 -> 'te85_tr88')
            est_str = est_str + 'te' + str(int(
                str(test_score)[2:4])) + '_tr' + str(int(
                    str(train_score)[2:4]))

            # Save the estimator to access it later
            save_est(est, est_str, est_errors, f_ppi, o_ppi, f_pp, o_pp, y, z,
                     p, rho, train_score, test_score, is_cheyenne)
        # Write a netcdf file for the gcm
        if do_nn:
            write_netcdf_nn(est_str, trainfile, rain_only, no_cos, use_rh,
                            is_cheyenne)
        else:
            write_netcdf_rf(est_str,
                            trainfile,
                            output_vert_vars,
                            output_vert_dim,
                            rain_only,
                            no_cos,
                            use_rh,
                            scale_level,
                            rewight_outputs=rewight_outputs,
                            weight_list=weight_list,
                            is_cheyenne=is_cheyenne)

    # Plot figures with the testing data (using all of it)
    if only_plot:
        # Earlier experiments (kept for reference; overridden by the last set):
        # trainfile = '/glade/work/janniy/mldata/training_data/qobsTTFFFFFTF26TTTFTF48TFFFFFTFTFFF40FFTFTTF4848_training_x_no_subsampling.pkl'
        # testfile = '/glade/work/janniy/mldata/training_data/qobsTTFFFFFTF26TTTFTF48TFFFFFTFTFFF40FFTFTTF4848_testing_x_no_subsampling.pkl'
        # est_str = 'qobsTTFFFFFTF26TTTFTF48TFFFFFTFTFFF40FFTFTTF4848_F-NoSc_O-Stan_Ntr5000000_Nte972360_F_Tin_qin_qpin_latin_O_Tout_qout_qpout_RF_NTr10_MinS20max_d27_maxzinf_nocos_te50_tr54'

        # trainfile = '/glade/work/janniy/mldata/training_data/qobsFFTFTFTFF0FFTFTF15FFFFFTFFFFTF815FFTFTTF00_training.pkl'
        # testfile = '/glade/work/janniy/mldata/training_data/qobsFFTFTFTFF0FFTFTF15FFFFFTFFFFTF815FFTFTTF00_testing.pkl'
        # est_str = 'qobsFFTFTFTFF0FFTFTF15FFFFFTFFFFTF815FFTFTTF00_F-NoSc_O-Stan_Ntr5000002_Nte972360_F_Tin_qin_uin_vinMinusSH_usurf_latin_O_tsurfCorr_qsurfCorr_tkz_RF_NTr10_MinS20max_d27_maxzinf_nocos_te70_tr75'

        trainfile = '/glade/scratch/janniy/mldata_tmp/training_data/qobsTTFFFFFTF26TTTFTF48TFFFFFTFTFFF320FFTFTTF4848_training_x_no_subsampling.pkl'
        testfile = '/glade/scratch/janniy/mldata_tmp/training_data/qobsTTFFFFFTF26TTTFTF48TFFFFFTFTFFF320FFTFTTF4848_testing_x_no_subsampling.pkl'
        est_str = 'qobsTTFFFFFTF26TTTFTF48TFFFFFTFTFFF320FFTFTTF4848_F-NoSc_O-Stan_Ntr1969020_Nte218790_F_Tin_qin_qpin_latin_O_Tout_qout_qpout_RF_NTr10_MinS7max_d27_maxzinf_nocos_te85_tr88'

    if only_plot:
        figpath = '/glade/scratch/janniy/figs_tmp_xy' + est_str + '/'
    else:
        figpath = './figs/' + est_str + '/'
    ml_plot.PlotAllFigs(est_str,
                        testfile,
                        do_nn,
                        figpath,
                        input_vert_vars,
                        output_vert_vars,
                        input_vert_dim,
                        output_vert_dim,
                        rain_only=rain_only,
                        n_trn_exs=n_trn_exs,
                        no_cos=no_cos,
                        use_rh=use_rh,
                        wind_input=do_wind_input,
                        scale_per_column=scale_level,
                        rewight_outputs=rewight_outputs,
                        weight_list=weight_list,
                        is_cheyenne=is_cheyenne)

    if plot_training_results:  # note: n_trn_exs examples from the training data are used here
        figpath = figpath + 'training_data/'
        ml_plot.PlotAllFigs(est_str,
                            trainfile,
                            do_nn,
                            figpath,
                            input_vert_vars,
                            output_vert_vars,
                            input_vert_dim,
                            output_vert_dim,
                            rain_only=rain_only,
                            n_trn_exs=n_trn_exs,
                            no_cos=no_cos,
                            use_rh=use_rh,
                            wind_input=do_wind_input,
                            rewight_outputs=rewight_outputs,
                            weight_list=weight_list,
                            is_cheyenne=is_cheyenne)
    return est_str
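
# A hypothetical invocation (a sketch; the variable names and dimensions follow
# the 'Tin_qin ... Tout_qout' convention visible in the est_str examples above):
#   est_id = train_wrapper({'name': 'NoScaler'}, {'name': 'StandardScaler'},
#                          'qobs',
#                          input_vert_dim=[48, 48], output_vert_dim=[48, 48],
#                          input_vert_vars=['Tin', 'qin'],
#                          output_vert_vars=['Tout', 'qout'],
#                          flag_dict={'exclusion_flag': False},
#                          n_trees=10, min_samples_leaf=10)
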
# Example #4
# max_z, f_ppi and o_ppi are module-level settings not shown in this excerpt;
# elsewhere in this section they are e.g. max_z = 40000.0,
# f_ppi = {'name': 'NoScaler'} and o_ppi = {'name': 'StandardScaler'}.
min_samples_leaf = 10
n_trees = 11
n_trn_exs = 10000000000  # effectively "use all available training examples"
use_rh = False
no_cos = True

training_expt = 'qobs'
# training_expt2 = 'qobs4K'
rain_only = False

datadir, trainfile, testfile, pp_str = ml_load.GetDataPath(training_expt)

#Train data
f, o, y, z, rho, p = ml_load.LoadData(trainfile,
                                      max_z,
                                      rain_only=rain_only,
                                      n_trn_exs=n_trn_exs,
                                      no_cos=no_cos,
                                      use_rh=use_rh)
f_pp, f_scl, o_pp, o_scl, pp_str = PreprocessData(f_ppi, f, o_ppi, o, pp_str,
                                                  n_trn_exs, z)
print('read training data')
#test data
tf, to, ty, tz, trho, tp = ml_load.LoadData(testfile,
                                            max_z,
                                            rain_only=rain_only,
                                            n_trn_exs=n_trn_exs,
                                            no_cos=no_cos,
                                            use_rh=use_rh)
tf_pp, tf_scl, to_pp, to_scl, tpp_str = PreprocessData(f_ppi, tf, o_ppi, to,
                                                       pp_str, n_trn_exs, tz)
print('read test data')
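
# The excerpt stops after preprocessing. A minimal continuation (a sketch using
# the hyperparameters set above): fit the forest and report R^2 on the test data.
rf = RandomForestRegressor(n_estimators=n_trees,
                           min_samples_leaf=min_samples_leaf,
                           random_state=123)
rf.fit(f_scl, o_scl)
print('test R^2:', rf.score(tf_scl, to_scl))
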
def write_netcdf_nn(est_str,
                    datasource,
                    rain_only=False,
                    no_cos=False,
                    use_rh=False,
                    is_cheyenne=False):
    # Set output filename
    if not is_cheyenne:  # On aimsir/esker
        base_dir = '/net/aimsir/archive1/janniy/'
    else:
        base_dir = '/glade/work/janniy/'

    output_filename = base_dir + 'mldata/gcm_regressors/' + est_str + '.nc'
    # Load rf and preprocessors
    est, _, errors, f_ppi, o_ppi, f_pp, o_pp, y, z, p, rho = \
        pickle.load(open(base_dir + 'mldata/regressors/' + est_str + '.pkl', 'rb'))
    # Need to transform some data for preprocessors to be able to export params
    f, o, _, _, _, _ = ml_load.LoadData(datasource,
                                        max_z=max(z),
                                        rain_only=rain_only,
                                        no_cos=no_cos,
                                        use_rh=use_rh)
    f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
    _ = ml_load.transform_data(o_ppi, o_pp, o, z)
    # Also need to use the predict method to be able to export ANN params
    _ = est.predict(f_scl)

    # Grab weights
    w1 = est.get_parameters()[0].weights
    w2 = est.get_parameters()[1].weights
    b1 = est.get_parameters()[0].biases
    b2 = est.get_parameters()[1].biases

    # Grab input and output normalization
    if f_ppi['name'] == 'StandardScaler':
        fscale_mean = f_pp.mean_
        fscale_stnd = f_pp.scale_
    else:
        raise ValueError('Incorrect scaler name')

    if o_ppi['name'] == 'SimpleO':
        Nlev = len(z)
        oscale = np.zeros(b2.shape)
        oscale[:Nlev] = 1.0 / o_pp[0]
        oscale[Nlev:] = 1.0 / o_pp[1]
    elif o_ppi['name'] == 'StandardScaler':
        oscale_mean = o_pp.mean_
        oscale_stnd = o_pp.scale_
    else:
        raise ValueError('Incorrect scaler name')

    # Write weights to file
    ncfile = Dataset(output_filename, 'w', format="NETCDF3_CLASSIC")
    # Write the dimensions
    ncfile.createDimension('N_in', w1.shape[0])
    ncfile.createDimension('N_h1', w1.shape[1])
    ncfile.createDimension('N_out', w2.shape[1])
    # Create variable entries in the file
    nc_w1 = ncfile.createVariable('w1',
                                  np.dtype('float32').char,
                                  ('N_h1', 'N_in'))  # Reverse dims
    nc_w2 = ncfile.createVariable('w2',
                                  np.dtype('float32').char, ('N_out', 'N_h1'))
    nc_b1 = ncfile.createVariable('b1', np.dtype('float32').char, ('N_h1',))
    nc_b2 = ncfile.createVariable('b2', np.dtype('float32').char, ('N_out',))
    nc_fscale_mean = ncfile.createVariable('fscale_mean',
                                           np.dtype('float32').char, ('N_in',))
    nc_fscale_stnd = ncfile.createVariable('fscale_stnd',
                                           np.dtype('float32').char, ('N_in',))
    if o_ppi['name'] == 'SimpleO':
        nc_oscale = ncfile.createVariable('oscale',
                                          np.dtype('float32').char, ('N_out',))
    else:
        nc_oscale_mean = ncfile.createVariable('oscale_mean',
                                               np.dtype('float32').char,
                                               ('N_out',))
        nc_oscale_stnd = ncfile.createVariable('oscale_stnd',
                                               np.dtype('float32').char,
                                               ('N_out',))

    # Write variables and close file - transpose because fortran reads it in
    # "backwards"
    nc_w1[:] = w1.T
    nc_w2[:] = w2.T
    nc_b1[:] = b1
    nc_b2[:] = b2
    nc_fscale_mean[:] = fscale_mean
    nc_fscale_stnd[:] = fscale_stnd

    if o_ppi['name'] == 'SimpleO':
        nc_oscale[:] = oscale
    else:
        nc_oscale_mean[:] = oscale_mean
        nc_oscale_stnd[:] = oscale_stnd

    # Write global file attributes
    ncfile.description = est_str
    ncfile.close()
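
# Sanity-check sketch (path shown for illustration): read the file back and
# confirm the transposed shapes match what the Fortran reader expects.
#   with Dataset(base_dir + 'mldata/gcm_regressors/' + est_str + '.nc', 'r') as nc:
#       print(nc.variables['w1'].shape)  # (N_h1, N_in)
#       print(nc.variables['w2'].shape)  # (N_out, N_h1)
#       print(nc.variables['b1'].shape)  # (N_h1,)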
# Define preprocessors
f_ppi = {'name': 'NoScaler'}  # currently only NoScaler is handled here (for an RF, feature scaling doesn't matter)
# f_ppi = {'name': 'StandardScaler'}

# o_ppi = {'name': 'SimpleO'}
o_ppi = {'name': 'StandardScaler'}
rewight_outputs = False  # set True to give more weight to certain outputs
weight_list = [1, 1]

rain_only = False


datadir, trainfile, testfile, pp_str = ml_load.GetDataPath(training_expt1, is_cheyenne=is_cheyenne)

f, o, y, z, rho, p = ml_load.LoadData(trainfile, max_z, input_vert_vars=input_vert_vars,
                                      output_vert_vars=output_vert_vars,
                                      rain_only=rain_only, n_trn_exs=n_trn_exs, no_cos=no_cos, use_rh=use_rh,
                                      wind_input=do_wind_input)
print('read train data')

# load test data
tf, to, ty, tz, trho, tp = ml_load.LoadData(testfile, max_z, input_vert_vars=input_vert_vars,
                                            output_vert_vars=output_vert_vars,
                                            rain_only=rain_only, n_trn_exs=n_trn_exs, no_cos=no_cos, use_rh=use_rh)
print('read test data')

# Scale data (both train and test)
f_pp, f_scl, tf_scl, o_pp, o_scl, to_scl, pp_str = ml_train.PreprocessData_tr_ts(f_ppi, f, tf, o_ppi, o, to, pp_str,
                                                                        n_trn_exs, z, input_vert_dim, input_vert_vars,
                                                                        output_vert_dim, output_vert_vars, scale_level,
                                                                        rewight_outputs=rewight_outputs,
                                                                        weight_list=weight_list)  # Yani TO DO!!!
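
# As in the previous example, the excerpt ends before training; in the source
# the scaled arrays feed an estimator (cf. train_wrapper above). A sketch:
rf = RandomForestRegressor(n_estimators=10, min_samples_leaf=10, random_state=123)
rf.fit(f_scl, o_scl)
print('test R^2:', rf.score(tf_scl, to_scl))
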
def plot_error_vs_min_samples_leaf(f_ppi, o_ppi, max_z=20000.0,
                                   training_expt='abs1.0_norf_spinup',
                                   n_trees=10,
                                   n_trn_exs=None, rain_only=False,
                                   no_cos=True, use_rh=False,
                                   load_results=True,
                                   save_results=False):

    if load_results:
     print('loading results')
     min_samples_leaf, cv_error, cv_error_std = pickle.load(open('figs_errors/error_vs_min_samples_leaf.pkl', 'rb'))
    else:

     datadir, trainfile, _, _ = ml_load.GetDataPath(training_expt)

     f, o, _, z, rho, _  = ml_load.LoadData(trainfile, max_z, n_trn_exs=n_trn_exs, rain_only=rain_only, no_cos=no_cos, use_rh=use_rh)

     # scale data
     f_pp = ml_load.init_pp(f_ppi, f)
     o_pp = ml_load.init_pp(o_ppi, o)
     f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
     o_scl = ml_load.transform_data(o_ppi, o_pp, o, z)

     rf = RandomForestRegressor(n_estimators=n_trees, random_state=123, max_features=1.0/3.0, warm_start=False)

     min_min_samples_leaf = 1
     max_min_samples_leaf = 16
     step_min_samples_leaf = 3

     min_samples_leaf = range(min_min_samples_leaf, max_min_samples_leaf + 1, step_min_samples_leaf)

     cv_error = np.zeros(len(min_samples_leaf))
     cv_error_std = np.zeros(len(min_samples_leaf))

     for i in range(len(min_samples_leaf)):
        print(min_samples_leaf[i])
        rf.set_params(min_samples_leaf=min_samples_leaf[i])
        scores = cross_val_score(rf, f_scl, o_scl, cv=n_cv, n_jobs=n_jobs)
        cv_error[i] = 1-scores.mean()
        cv_error_std[i] = scores.std()
        print(str(min_samples_leaf[i]) + ': ' + str(cv_error[i]))

     if save_results:
      print('saving results')
      pickle.dump([min_samples_leaf, cv_error, cv_error_std], open('figs_errors/error_vs_min_samples_leaf.pkl', 'wb'))

    print(list(min_samples_leaf))
    print(cv_error)
    print(np.max(cv_error_std/np.sqrt(n_cv)))
    fig = plt.figure(figsize=(3.0,2.25))
    plt.plot(min_samples_leaf, cv_error, '-o')
    plt.xlim(0, 16.5)
    plt.ylim(0,0.51) 
    plt.xlabel("Minimum sample size for a leaf")
    plt.ylabel("Error")
    #plt.legend(loc="upper right")
    #plt.legend(frameon=False)
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    plt.tight_layout() # avoid overlap
    plt.show()
    fig.savefig('figs_errors/error_vs_min_samples_leaf.eps', bbox_inches='tight')
    plt.close()
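
# Usage sketch (first run computes and caches; later runs replot from the pickle):
#   plot_error_vs_min_samples_leaf({'name': 'NoScaler'}, {'name': 'StandardScaler'},
#                                  load_results=False, save_results=True)
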
def plot_error_vs_n_trees(f_ppi, o_ppi, max_z=20000.0,
                          training_expt='abs1.0_norf_ras',
                          min_samples_leaf=10,
                          n_trn_exs=None, rain_only=False,
                          no_cos=True,
                          use_rh=False,
                          load_results=True,
                          save_results=False):

    if load_results:
     print('loading results')
     n_trees, cv_error, cv_error_std = pickle.load(open('figs_errors/error_vs_n_trees.pkl', 'rb'))
    else:
     datadir, trainfile, _, _ = ml_load.GetDataPath(training_expt)

     f, o, y, z, rho, p  = ml_load.LoadData(trainfile, max_z, n_trn_exs=n_trn_exs, rain_only=rain_only, no_cos=no_cos, use_rh=use_rh)


     # scale data
     f_pp = ml_load.init_pp(f_ppi, f)
     o_pp = ml_load.init_pp(o_ppi, o)
     f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
     o_scl = ml_load.transform_data(o_ppi, o_pp, o, z)

     rf = RandomForestRegressor(min_samples_leaf=min_samples_leaf, max_features=1.0/3.0, random_state=123, warm_start=False)

     min_n_trees = 1
     max_n_trees = 21

     n_trees = range(min_n_trees, max_n_trees + 1, 2)
     cv_error = np.zeros(len(n_trees))
     cv_error_std = np.zeros(len(n_trees)) # standard deviation across folds

     for i in range(len(n_trees)):
        rf.set_params(n_estimators=n_trees[i])
        scores = cross_val_score(rf, f_scl, o_scl, cv=n_cv, n_jobs=n_jobs)
        cv_error[i] = 1-scores.mean()
        cv_error_std[i] = scores.std()
        print(str(n_trees[i]) + ': ' + str(cv_error[i]))

     if save_results:
      print('saving results')
      pickle.dump([n_trees, cv_error, cv_error_std], open('figs_errors/error_vs_n_trees.pkl', 'wb'))

    print(list(n_trees))
    print(cv_error)
    # There is some disagreement in the literature over whether the standard
    # error should include a factor of 1/sqrt(n_cv)
    print(np.max(cv_error_std/np.sqrt(n_cv))) # max of standard error
    fig = plt.figure(figsize=(3.0,2.25))
    plt.plot(n_trees, cv_error, 'o-')
    plt.xlim(0, 22.8)
    plt.ylim(0,0.51) 
    plt.xlabel("Number of trees")
    plt.ylabel("Error")
    #plt.legend(loc="upper right")
    #plt.legend(frameon=False)
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    plt.tight_layout() # avoid overlap
    plt.show()
    fig.savefig('figs_errors/error_vs_ntrees.eps', bbox_inches='tight')
    plt.close()
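
# Usage sketch, analogous to the min_samples_leaf sweep above:
#   plot_error_vs_n_trees({'name': 'NoScaler'}, {'name': 'StandardScaler'},
#                         load_results=False, save_results=True)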