def r2_score(X, y):
    y_rfbc1 = rff.rfbc_predict(rfbc1['rf1'], rfbc1['rf2'], X[val_blocks == 0])
    y_rfbc2 = rff.rfbc_predict(rfbc2['rf1'], rfbc2['rf2'], X[val_blocks == 1])
    y_rfbc3 = rff.rfbc_predict(rfbc3['rf1'], rfbc3['rf2'], X[val_blocks == 2])
    y_rfbc4 = rff.rfbc_predict(rfbc4['rf1'], rfbc4['rf2'], X[val_blocks == 3])
    y_rfbc5 = rff.rfbc_predict(rfbc5['rf1'], rfbc5['rf2'], X[val_blocks == 4])
    y_obs = np.hstack((y[val_blocks == 0], y[val_blocks == 1], y[val_blocks == 2],
                       y[val_blocks == 3], y[val_blocks == 4]))
    y_mod = np.hstack((y_rfbc1, y_rfbc2, y_rfbc3, y_rfbc4, y_rfbc5))
    temp1, temp2, r, temp3, temp4 = stats.linregress(y_obs, y_mod)
    return r**2
def f(params):
    global best_mse
    global fail_count
    # check the hyperparameter set is sensible
    # - check 1: min_samples_split must not be smaller than min_samples_leaf
    if params['min_samples_split'] < params['min_samples_leaf']:
        fail_count += 1
        #print("INVALID HYPERPARAMETER SELECTION", params)
        return {'loss': None, 'status': STATUS_FAIL}

    rf = RandomForestRegressor(**params)
    r2_scores = np.zeros(k)
    MSE_scores = np.zeros(k)
    for kk in range(k):
        train_mask = cal_blocks != kk
        test_mask = val_blocks == kk
        rf1, rf2 = rff.rfbc_fit(rf, X[train_mask], y[train_mask])
        y_rf = rff.rfbc_predict(rf1, rf2, X[test_mask])
        temp1, temp2, r, temp3, temp4 = stats.linregress(y[test_mask], y_rf)
        r2_scores[kk] = r**2
        MSE_scores[kk] = np.mean((y[test_mask] - y_rf)**2)
    r2_score = r2_scores.mean()
    mse = MSE_scores.mean()
    # - if error reduced, then update best model accordingly
    if mse < best_mse:
        best_mse = mse
        print('new best r^2: ', r2_score, '; best RMSE: ', np.sqrt(mse), params)
    return {'loss': mse, 'status': STATUS_OK}
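# A minimal illustration (not from the original script) of how the objective
# f() above could be driven by hyperopt's TPE optimiser. The search-space
# bounds and max_evals below are assumptions made for this sketch only.
from hyperopt import fmin, tpe, hp, Trials
from hyperopt.pyll.base import scope

space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 80, 500, 10)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 40, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 50, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 100, 1)),
}

best_mse = np.inf
fail_count = 0
trials = Trials()
best_params = fmin(fn=f, space=space, algo=tpe.suggest, max_evals=200, trials=trials)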
def r2_score(X, y):
    # pool the out-of-block predictions from the k buffered folds and compute R^2
    y_obs = np.zeros(y.size)
    y_mod = np.zeros(y.size)
    index = 0
    for ii in range(0, k):
        n = val_blocks['iter%i' % (ii + 1)].sum()
        y_mod[index:index + n] = rff.rfbc_predict(
            rfbc_k['rfbc%i' % (ii + 1)]['rf1'],
            rfbc_k['rfbc%i' % (ii + 1)]['rf2'],
            X[val_blocks['iter%i' % (ii + 1)]])
        y_obs[index:index + n] = y[val_blocks['iter%i' % (ii + 1)]]
        index += n
    temp1, temp2, r, temp3, temp4 = stats.linregress(y_obs, y_mod)
    return r**2
def f(params):
    global best_mse
    global fail_count
    # check the hyperparameter set is sensible
    # - check 1: min_samples_split must not be smaller than min_samples_leaf
    if params['min_samples_split'] < params['min_samples_leaf']:
        fail_count += 1
        return {'loss': None, 'status': STATUS_FAIL}

    params['random_state'] = int(np.random.random() * 10**6)
    rf = RandomForestRegressor(**params)
    r2_scores = np.zeros((k, 2))
    MSE_scores = np.zeros((k, 2))
    grad_scores = np.zeros((k, 2))
    for kk in range(k):
        train_mask = cal_blocks != kk
        test_mask = val_blocks == kk
        rf1, rf2 = rff.rfbc_fit(rf, X[train_mask], y[train_mask])
        y_rf = rff.rfbc_predict(rf1, rf2, X)
        m, temp1, r, temp2, temp3 = stats.linregress(y[test_mask], y_rf[test_mask])
        r2_scores[kk, 0] = r**2
        MSE_scores[kk, 0] = np.mean((y[test_mask] - y_rf[test_mask])**2)
        grad_scores[kk, 0] = m
        m, temp1, r, temp2, temp3 = stats.linregress(y[train_mask], y_rf[train_mask])
        r2_scores[kk, 1] = r**2
        MSE_scores[kk, 1] = np.mean((y[train_mask] - y_rf[train_mask])**2)
        grad_scores[kk, 1] = m
    r2_score = r2_scores[:, 0].mean()
    mse = MSE_scores[:, 0].mean()
    # - if error reduced, then update best model accordingly
    if mse < best_mse:
        best_mse = mse
        print('new best r^2: %.5f; best RMSE: %.5f' % (r2_score, np.sqrt(mse)))
        print(params)
    return {
        'loss': mse,
        'status': STATUS_OK,
        'mse_test': MSE_scores[:, 0].mean(),
        'mse_train': MSE_scores[:, 1].mean(),
        'gradient_test': grad_scores[:, 0].mean(),
        'gradient_train': grad_scores[:, 1].mean(),
        'r2_test': r2_scores[:, 0].mean(),
        'r2_train': r2_scores[:, 1].mean()
    }
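# A hedged follow-up (assumes the objective above was optimised via hyperopt
# with a Trials object named `trials`, as in the earlier sketch): the extra
# diagnostics returned by f() can be read back from trials.results, e.g. to
# compare calibration vs. validation error and flag over-fitting.
ok_results = [res for res in trials.results if res['status'] == STATUS_OK]
mse_test = np.array([res['mse_test'] for res in ok_results])
mse_train = np.array([res['mse_train'] for res in ok_results])
best_idx = int(np.argmin(mse_test))
print('best trial: test RMSE = %.2f; train RMSE = %.2f' %
      (np.sqrt(mse_test[best_idx]), np.sqrt(mse_train[best_idx])))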
# Take best hyperparameter set and apply cal-val on full training set
print('Applying buffered k-fold cross validation')
rfbc_k = {}
y_obs = np.zeros(y.size)
y_mod = np.zeros(y.size)
index = 0
for ii in range(0, k):
    n = val_blocks['iter%i' % (ii + 1)].sum()
    print(n)
    rfbc_k['rfbc%i' % (ii + 1)] = {}
    rfbc_k['rfbc%i' % (ii + 1)]['rf1'], rfbc_k['rfbc%i' % (ii + 1)]['rf2'] = rff.rfbc_fit(
        rf, X[cal_blocks['iter%i' % (ii + 1)]], y[cal_blocks['iter%i' % (ii + 1)]])
    y_mod[index:index + n] = rff.rfbc_predict(
        rfbc_k['rfbc%i' % (ii + 1)]['rf1'],
        rfbc_k['rfbc%i' % (ii + 1)]['rf2'],
        X[val_blocks['iter%i' % (ii + 1)]])
    y_obs[index:index + n] = y[val_blocks['iter%i' % (ii + 1)]]
    index += n

temp1, temp2, r, temp3, temp4 = stats.linregress(y_obs, y_mod)
r2 = r**2
rmse = np.sqrt(np.mean((y_mod - y_obs)**2))
rel_rmse = rmse / np.mean(y_obs)
print("Validation\n\tR^2 = %.02f" % r2)
print("\tRMSE = %.02f" % rmse)
print("\trelative RMSE = %.02f" % rel_rmse)

annotation = 'R$^2$ = %.2f\nRMSE = %.1f Mg ha$^{-1}$\nrelative RMSE = %.1f%s' % (
    r2, rmse, rel_rmse * 100, '%')
fig1, axes1 = gplt.plot_validation(y_obs, y_mod, annotation=annotation)
fig1.savefig('%s%s_%s_%sm_validation_blocked_kfold.png' %
             (path2fig, site_id, version, resolution.zfill(3)))
    axes.set_title('Sentinel 2 only')
elif sensor == 'alos':
    axes.set_title('ALOS only')
axes.set_xlim(0, 0.5)
fig5.tight_layout()
fig5.savefig('%s%s_%s_permutation_importances_by_texture_single_sensor_%s.png' %
             (path2fig, site_id, version, sensor))

"""
#===============================================================================
PART C: VALIDATION
- Observed vs. modelled biomass using the blocked buffered strategy to avoid
  bias due to spatial autocorrelation
#-------------------------------------------------------------------------------
"""
y_rfbc1 = rff.rfbc_predict(rfbc1['rf1'], rfbc1['rf2'], X[val_blocks == 0])
y_rfbc2 = rff.rfbc_predict(rfbc2['rf1'], rfbc2['rf2'], X[val_blocks == 1])
y_rfbc3 = rff.rfbc_predict(rfbc3['rf1'], rfbc3['rf2'], X[val_blocks == 2])
y_rfbc4 = rff.rfbc_predict(rfbc4['rf1'], rfbc4['rf2'], X[val_blocks == 3])
y_rfbc5 = rff.rfbc_predict(rfbc5['rf1'], rfbc5['rf2'], X[val_blocks == 4])
y_obs = np.hstack((y[val_blocks == 0], y[val_blocks == 1], y[val_blocks == 2],
                   y[val_blocks == 3], y[val_blocks == 4]))
y_mod = np.hstack((y_rfbc1, y_rfbc2, y_rfbc3, y_rfbc4, y_rfbc5))
temp1, temp2, r, temp3, temp4 = stats.linregress(y_obs, y_mod)
r2 = r**2
rmse = np.sqrt(np.mean((y_mod - y_obs)**2))
rel_rmse = rmse / np.mean(y_obs)
print("Validation\n\tR^2 = %.02f" % r2)
print("\tRMSE = %.02f" % rmse)
print("\trelative RMSE = %.02f" % rel_rmse)
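# Optional extra (illustrative, not in the original script): the same
# observed-vs-modelled scatter plot produced in the blocked k-fold script could
# be generated here by reusing gplt.plot_validation and the annotation string
# shown there; the figure name and output filename below are assumptions.
annotation = 'R$^2$ = %.2f\nRMSE = %.1f Mg ha$^{-1}$\nrelative RMSE = %.1f%s' % (
    r2, rmse, rel_rmse * 100, '%')
fig6, axes6 = gplt.plot_validation(y_obs, y_mod, annotation=annotation)
fig6.savefig('%s%s_%s_validation_blocked.png' % (path2fig, site_id, version))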
"""
#===============================================================================
PART C: MONTECARLO UPSCALING
Fit RF model for 100 AGB maps
Save RFs for future reference
#-------------------------------------------------------------------------------
"""
# We'll load in an existing dataset to get the georeferencing information
template = io.load_geotiff(agb_list[0], option=1)
rows, cols = template.shape
agb_stack = np.zeros((N_iter, rows, cols))
#pca_predictors = pca.transform(predictors)
#predictors=None
for ii, agb_file in enumerate(agb_list):
    print('Iteration %i of %i' % (ii + 1, N_iter))
    rf_dict = joblib.load('%s%s_%s_optimised_rfbc_sentinel_alos_lidar_%s.pkl' %
                          (path2alg, site_id, version, str(ii + 1).zfill(3)))
    agb_mod = rff.rfbc_predict(rf_dict['rf1'], rf_dict['rf2'], predictors)
    # copy predictions to a new xarray for this AGB realisation
    agb = io.copy_xarray_template(template)
    agb.values[forest_mask] = agb_mod.copy()
    agb.values[agb.values == -9999] = np.nan
    agb.values[agb.values < 0] = 0
    outfile_prefix = '%s%s_%s_rfbc_agb_upscaled_%s' % (path2output, site_id, version,
                                                       str(ii + 1).zfill(3))
    io.write_xarray_to_GeoTiff(agb, outfile_prefix)
    agb_stack[ii] = agb.values

# summary arrays
agb_med = io.copy_xarray_template(template)
agb_med.values = np.median(agb_stack, axis=0)
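# A possible extension (not in the original script): summarise the pixel-wise
# spread of the Monte Carlo stack alongside the median, e.g. the 2.5th and
# 97.5th percentiles, reusing the same template and io helpers as above; the
# output filenames are assumptions.
agb_lower = io.copy_xarray_template(template)
agb_lower.values = np.percentile(agb_stack, 2.5, axis=0)
agb_upper = io.copy_xarray_template(template)
agb_upper.values = np.percentile(agb_stack, 97.5, axis=0)
io.write_xarray_to_GeoTiff(agb_lower, '%s%s_%s_rfbc_agb_lower' % (path2output, site_id, version))
io.write_xarray_to_GeoTiff(agb_upper, '%s%s_%s_rfbc_agb_upper' % (path2output, site_id, version))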