def select_block_size_rf(nruns, group_type, loc_dict, Cvar_dict, idw_example_grid, shapefile,
                         file_path_elev, idx_list, cluster_num1, cluster_num2, cluster_num3,
                         expand_area, boreal_shapefile):
    '''Evaluate the standard deviation of MAE values based on consecutive runs of the
    cross-validation, in order to select the block/cluster size

    Parameters
    ----------
    nruns : int
        number of repetitions, at least 2 are needed to compute a standard deviation
    group_type : string
        whether using 'clusters' or 'blocks'
    loc_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    idw_example_grid : ndarray
        used for reference of study area grid size
    shapefile : string
        path to the study area shapefile
    file_path_elev : string
        path to the elevation lookup file
    idx_list : int
        position of the elevation column in the lookup file
    cluster_num1-3 : int
        three cluster numbers to test, for blocking this must be one of three: 25, 16, 9
        when group_type is 'blocks' a value of None is replaced by 25, 16 and 9 respectively
    expand_area : bool
        expand area by 200km
    boreal_shapefile : string
        path to shapefile with the boreal zone

    Returns
    ----------
    int
        - block/cluster number with lowest stdev
    float
        - average MAE of all the runs for that cluster/block number
    float
        - standard deviation of the MAE for that cluster/block number

    Notes
    ----------
    The process is terminated through sys.exit() when group_type is not recognised
    or when nruns is smaller than 2.
    '''
    # Validate the cheap arguments first so that a bad call does not pay for
    # the clustering / blocking work below before exiting.
    if group_type not in ('blocks', 'clusters'):
        print('Thats not a valid group type')
        sys.exit()

    if nruns <= 1:
        print('That is not enough runs to calculate the standard deviation!')
        sys.exit()

    cluster_nums = [cluster_num1, cluster_num2, cluster_num3]

    # Get group dictionaries
    if group_type == 'blocks':
        # Fall back to the documented defaults when no block number is given
        default_block_nums = (25, 16, 9)
        cluster_nums = [default if num is None else num
                        for num, default in zip(cluster_nums, default_block_nums)]
        group_dicts = []
        for num in cluster_nums:
            # Build the blocks from the requested number so that the grouping,
            # the fold count and the returned label all describe the same run.
            folds = mbk.make_block(idw_example_grid, num)
            # loc_dict is passed to match the four-argument call used by the
            # spatial_kfold_* functions in this module.
            # NOTE(review): confirm against the signature of mbk.sorting_stations
            group_dicts.append(
                mbk.sorting_stations(folds, shapefile, loc_dict, Cvar_dict))
    else:
        if expand_area:
            inBoreal = GD.is_station_in_boreal(loc_dict, Cvar_dict, boreal_shapefile)
            # Overwrite Cvar_dict so that only the stations flagged by the helper are kept
            Cvar_dict = {k: v for k, v in Cvar_dict.items() if k in inBoreal}
        group_dicts = [
            c3d.spatial_cluster(loc_dict, Cvar_dict, shapefile, num, file_path_elev,
                                idx_list, False, False, False)
            for num in cluster_nums
        ]

    # We want same number of stations selected for each cluster number
    # We need to calculate, 5 folds x 25 clusters = 125 stations;
    # 8 folds x 16 clusters = 128 stations, etc.
    # What is 30% of the stations
    target_stations = len(Cvar_dict) * 0.3
    fold_nums = [int(round(target_stations / num)) for num in cluster_nums]

    list_error = [[] for _ in cluster_nums]
    for _ in range(nruns):
        for num, fold_num, groups, run_errors in zip(cluster_nums, fold_nums,
                                                     group_dicts, list_error):
            run_errors.append(
                spatial_groups_rf(idw_example_grid, loc_dict, Cvar_dict, shapefile,
                                  num, fold_num, True, groups, file_path_elev,
                                  idx_list, expand_area))

    list_stdev = [statistics.stdev(run_errors) for run_errors in list_error]
    index_min = list_stdev.index(min(list_stdev))

    selected_errors = list_error[index_min]
    lowest_stdev = list_stdev[index_min]
    ave_MAE = sum(selected_errors) / len(selected_errors)
    cluster_select = cluster_nums[index_min]

    print(selected_errors)
    print(ave_MAE)
    print(lowest_stdev)
    print(cluster_select)

    return cluster_select, ave_MAE, lowest_stdev
def spatial_kfold_idw(idw_example_grid, loc_dict, Cvar_dict, shapefile, d, file_path_elev,
                      idx_list, block_num, blocking_type, return_error):
    '''Spatially blocked k-fold cross-validation procedure for IDW

    Each block/cluster is withheld in turn, an IDW surface is interpolated from
    the remaining stations and the absolute error is taken at the pixel of every
    withheld station.

    Parameters
    ----------
    idw_example_grid : ndarray
        used for reference of study area grid size
    loc_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    d : int
        the weighting for IDW interpolation
    file_path_elev : string
        path to the elevation lookup file
    idx_list : int
        position of the elevation column in the lookup file
    block_num : int
        number of blocks/clusters
    blocking_type : string
        whether to use clusters ('cluster') or blocks ('block')
    return_error : bool
        whether or not to return the error dictionary

    Returns
    ----------
    int
        - Return the block number just so we can later write it into the file to keep track
    float
        - MAE estimate for entire surface
    dictionary
        - if return_error = True, a dictionary of the absolute error at each
          station when its fold was left out

    Notes
    ----------
    The process is terminated through sys.exit() when blocking_type is not recognised.
    '''
    if blocking_type == 'cluster':
        cluster = c3d.spatial_cluster(loc_dict, Cvar_dict, shapefile, block_num,
                                      file_path_elev, idx_list, False, False, False)
    elif blocking_type == 'block':
        # Get the numpy array that delineates the blocks
        np_array_blocks = mbk.make_block(idw_example_grid, block_num)
        # Now get the dictionary
        cluster = mbk.sorting_stations(np_array_blocks, shapefile, loc_dict, Cvar_dict)
    else:
        print('That is not a valid blocking method')
        sys.exit()

    # Everything below up to the fold loop does not depend on the withheld
    # group, so it is computed once instead of once per fold.
    pixel_size = 10000
    to_projected = pyproj.Proj('esri:102001')

    # Projected (x, y) position of every station that has a known location
    projected_xy = {}
    for station_name in Cvar_dict:
        if station_name in loc_dict:
            loc = loc_dict[station_name]
            proj_x, proj_y = to_projected(loc[1], loc[0])  # (longitude, latitude)
            projected_xy[station_name] = [float(proj_x), float(proj_y)]

    # total_bounds yields plain scalars for the whole layer, which avoids
    # converting single-element pandas Series with int()/float().
    na_map = gpd.read_file(shapefile)
    xmin, ymin, xmax, ymax = (float(edge) for edge in na_map.total_bounds)
    num_col = int((xmax - xmin) / pixel_size)
    num_row = int((ymax - ymin) / pixel_size)

    located_stations = [name for name in sorted(Cvar_dict) if name in loc_dict]

    absolute_error_dictionary = {}
    groups_complete = []  # keep a record of the groups that were already withheld

    for group in cluster.values():
        if group in groups_complete:
            continue
        groups_complete.append(group)

        station_list = [k for k, v in cluster.items() if v == group]
        held_out = set(station_list)

        # This is the step where we hold back the fold
        train_names = [name for name in located_stations if name not in held_out]
        y = np.array([float(loc_dict[name][0]) for name in train_names])
        x = np.array([float(loc_dict[name][1]) for name in train_names])
        z = np.array([Cvar_dict[name] for name in train_names])

        # We need to project to a projected system before making distance matrix
        xProj, yProj = to_projected(x, y)

        # The grid spans the study area and any training station outside it
        yProj_extent = np.append(yProj, [ymax, ymin])
        xProj_extent = np.append(xProj, [xmax, xmin])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)
        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        # Euclidean distance between every station (rows) and every cell (columns)
        distance_matrix = np.hypot(np.subtract.outer(xProj, Xi),
                                   np.subtract.outer(yProj, Yi))

        # A distance of 0 produces an infinite weight; that is expected here and
        # handled right below, so the divide warning is silenced.
        with np.errstate(divide='ignore'):
            weights = 1 / (distance_matrix**d)
        # Making sure to assign the value of the weather station above the
        # pixel directly to the pixel underneath
        weights[np.isinf(weights)] = 1 / (1.0E-50)
        weights /= weights.sum(axis=0)

        idw_grid = np.dot(weights.T, z).reshape(num_row, num_col)

        # Compare at the pixel of every withheld station
        for statLoc in station_list:
            proj_x, proj_y = projected_xy[statLoc]
            x_orig = int((proj_x - xmin) / pixel_size)  # lon
            y_orig = int((proj_y - ymin) / pixel_size)  # lat
            interpolated_val = idw_grid[y_orig][x_orig]
            absolute_error_dictionary[statLoc] = abs(interpolated_val - Cvar_dict[statLoc])

    # average of all the withheld stations
    MAE = sum(absolute_error_dictionary.values()) / len(absolute_error_dictionary)

    if return_error:
        return block_num, MAE, absolute_error_dictionary
    return block_num, MAE
def spatial_kfold_rf(idw_example_grid, loc_dict, Cvar_dict, shapefile, file_path_elev,
                     elev_array, idx_list, block_num, blocking_type, return_error):
    '''Spatially blocked k-fold cross-validation procedure for RF

    Each block/cluster is withheld in turn, a random forest is trained on the
    projected position and elevation of the remaining stations, predicted over
    the grid, and the absolute error is taken at the pixel of every withheld
    station.

    Parameters
    ----------
    idw_example_grid : ndarray
        used for reference of study area grid size
    loc_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        array for elevation, create using IDEW interpolation
        NOTE: this argument is kept for interface compatibility but is not read,
        the grid elevations are always taken from the lookup file
    idx_list : int
        position of the elevation column in the lookup file
    block_num : int
        number of blocks/clusters
    blocking_type : string
        whether to use clusters ('cluster') or blocks ('block')
    return_error : bool
        whether or not to return the error dictionary

    Returns
    ----------
    int
        - Return the block number just so we can later write it into the file to keep track
    float
        - MAE estimate for entire surface
    dictionary
        - if return_error = True, a dictionary of the absolute error at each
          station when its fold was left out

    Notes
    ----------
    The process is terminated through sys.exit() when blocking_type is not recognised.
    '''
    # Selecting blocknum
    if blocking_type == 'cluster':
        cluster = c3d.spatial_cluster(loc_dict, Cvar_dict, shapefile, block_num,
                                      file_path_elev, idx_list, False, False, False)
    elif blocking_type == 'block':
        # Get the numpy array that delineates the blocks
        np_array_blocks = mbk.make_block(idw_example_grid, block_num)
        # Now get the dictionary
        cluster = mbk.sorting_stations(np_array_blocks, shapefile, loc_dict, Cvar_dict)
    else:
        print('That is not a valid blocking method')
        sys.exit()

    # Everything below up to the fold loop does not depend on the withheld
    # group, so it is computed once instead of once per fold.
    pixel_size = 10000
    to_projected = pyproj.Proj('esri:102001')

    # Projected (x, y) position of every station that has a known location
    projected_xy = {}
    for station_name in Cvar_dict:
        if station_name in loc_dict:
            loc = loc_dict[station_name]
            proj_x, proj_y = to_projected(loc[1], loc[0])  # (longitude, latitude)
            projected_xy[station_name] = [float(proj_x), float(proj_y)]

    # total_bounds yields plain scalars for the whole layer, which avoids
    # converting single-element pandas Series with int()/float().
    na_map = gpd.read_file(shapefile)
    xmin, ymin, xmax, ymax = (float(edge) for edge in na_map.total_bounds)
    num_col = int((xmax - xmin) / pixel_size)
    num_row = int((ymax - ymin) / pixel_size)

    located_stations = [name for name in sorted(Cvar_dict) if name in loc_dict]

    absolute_error_dictionary = {}
    groups_complete = []  # keep a record of the groups that were already withheld

    for group in cluster.values():
        if group in groups_complete:
            continue
        groups_complete.append(group)

        station_list = [k for k, v in cluster.items() if v == group]
        held_out = set(station_list)

        # Hold back the fold: only the remaining stations are used for training
        train_names = [name for name in located_stations if name not in held_out]
        lat = np.array([float(loc_dict[name][0]) for name in train_names])
        lon = np.array([float(loc_dict[name][1]) for name in train_names])
        z = np.array([Cvar_dict[name] for name in train_names])

        # We need to project to a projected system before making distance matrix
        xProj, yProj = to_projected(lon, lat)

        # The grid spans the study area and any training station outside it
        yProj_extent = np.append(yProj, [ymax, ymin])
        xProj_extent = np.append(xProj, [xmax, xmin])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)
        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        # Elevation
        # Preparing the coordinates to send to the function that will get the
        # elevation grid, the elevation function takes tuples
        send_to_tuple = list(zip(Xi.tolist(), Yi.tolist()))

        # Get the elevations from the lookup file
        elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple, file_path_elev, idx_list)

        # The keys are each coordinate pair, the values the elevation found for it
        Xi1_grd = np.array([coords[0] for coords in elev_grd_dict])
        Yi1_grd = np.array([coords[1] for coords in elev_grd_dict])
        grid_elev = np.array([elev_grd_dict[coords] for coords in elev_grd_dict])

        # Repeat process for just the stations not the whole grid
        elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj), file_path_elev, idx_list)
        source_elev = np.array([elev_dict[coords] for coords in zip(xProj, yProj)])

        df_trainX = pd.DataFrame({
            'xProj': xProj,
            'yProj': yProj,
            'elevS': source_elev,
            'var': z
        })
        df_testX = pd.DataFrame({'Xi': Xi1_grd, 'Yi': Yi1_grd, 'elev': grid_elev})

        reg = RandomForestRegressor(n_estimators=100,
                                    max_features='sqrt',
                                    random_state=1)

        # The target is handed over as a 1-D array, a column vector only
        # triggers a conversion warning before being treated the same way.
        target = np.array(df_trainX['var'])
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

        reg.fit(X_train, target)

        rf_grid = reg.predict(X_test).reshape(num_row, num_col)

        # Calc the MAE at the pixel loc of every withheld station
        for statLoc in station_list:
            proj_x, proj_y = projected_xy[statLoc]
            x_orig = int((proj_x - xmin) / pixel_size)  # lon
            y_orig = int((proj_y - ymin) / pixel_size)  # lat
            interpolated_val = rf_grid[y_orig][x_orig]
            absolute_error_dictionary[statLoc] = abs(interpolated_val - Cvar_dict[statLoc])

    # average of all the withheld stations
    MAE = sum(absolute_error_dictionary.values()) / len(absolute_error_dictionary)

    if return_error:
        return block_num, MAE, absolute_error_dictionary
    return block_num, MAE
def spatial_kfold_tps(idw_example_grid, loc_dict, Cvar_dict, shapefile, phi, file_path_elev,
                      idx_list, block_num, blocking_type, return_error, calc_phi):
    '''Spatially blocked k-folds cross-validation procedure for thin plate splines

    Each block/cluster is withheld in turn, a thin plate spline is fitted to the
    remaining stations after they are placed on the grid, and the absolute error
    is taken at the pixel of every withheld station.

    Parameters
    ----------
    idw_example_grid : ndarray
        used for reference of study area grid size
    loc_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    phi : float
        smoothing parameter for the thin plate spline, if 0 no smoothing
        it is passed unchanged as the smooth argument of the spline
    file_path_elev : string
        path to the elevation lookup file
    idx_list : int
        position of the elevation column in the lookup file
    block_num : int
        number of blocks/clusters
    blocking_type : string
        whether to use clusters ('cluster') or blocks ('block')
    return_error : bool
        whether or not to return the error dictionary
    calc_phi : bool
        whether to calculate phi in the function
        NOTE: this flag is accepted but not read by the function, so phi is
        never calculated here and has to be supplied by the caller

    Returns
    ----------
    int
        - Return the block number just so we can later write it into the file to keep track
    float
        - MAE estimate for entire surface
    dictionary
        - if return_error = True, a dictionary of the absolute error at each
          station when its fold was left out

    Notes
    ----------
    The process is terminated through sys.exit() when blocking_type is not recognised.
    '''
    if blocking_type == 'cluster':
        cluster = c3d.spatial_cluster(loc_dict, Cvar_dict, shapefile, block_num,
                                      file_path_elev, idx_list, False, False, False)
    elif blocking_type == 'block':
        # Get the numpy array that delineates the blocks
        np_array_blocks = mbk.make_block(idw_example_grid, block_num)
        # Now get the dictionary
        cluster = mbk.sorting_stations(np_array_blocks, shapefile, loc_dict, Cvar_dict)
    else:
        print('That is not a valid blocking method')
        sys.exit()

    # Everything below up to the fold loop does not depend on the withheld
    # group, so it is computed once instead of once (or twice) per fold.
    pixel_size = 10000
    to_projected = pyproj.Proj('esri:102001')

    # Projected (x, y) position of every station that has a known location
    projected_xy = {}
    for station_name in Cvar_dict:
        if station_name in loc_dict:
            loc = loc_dict[station_name]
            proj_x, proj_y = to_projected(loc[1], loc[0])  # (longitude, latitude)
            projected_xy[station_name] = [float(proj_x), float(proj_y)]

    # total_bounds yields plain scalars for the whole layer, which avoids
    # converting single-element pandas Series with int()/float().
    na_map = gpd.read_file(shapefile)
    xmin, ymin, xmax, ymax = (float(edge) for edge in na_map.total_bounds)
    num_col = int((xmax - xmin) / pixel_size)
    num_row = int((ymax - ymin) / pixel_size)

    located_stations = [name for name in sorted(Cvar_dict) if name in loc_dict]

    absolute_error_dictionary = {}
    groups_complete = []  # keep a record of the groups that were already withheld

    for group in cluster.values():
        if group in groups_complete:
            continue
        groups_complete.append(group)

        station_list = [k for k, v in cluster.items() if v == group]
        held_out = set(station_list)

        # Hold back the fold: only the remaining stations are used for fitting
        train_names = [name for name in located_stations if name not in held_out]
        lat = np.array([float(loc_dict[name][0]) for name in train_names])
        lon = np.array([float(loc_dict[name][1]) for name in train_names])

        # For preparing the empty grid w/ the values inserted for the rbf function
        x_origin_list = [int((projected_xy[name][0] - xmin) / pixel_size)
                         for name in train_names]  # lon
        y_origin_list = [int((projected_xy[name][1] - ymin) / pixel_size)
                         for name in train_names]  # lat
        z_origin_list = [Cvar_dict[name] for name in train_names]

        # We need to project to a projected system before building the grid
        xProj, yProj = to_projected(lon, lat)

        # The grid spans the study area and any training station outside it
        yProj_extent = np.append(yProj, [ymax, ymin])
        xProj_extent = np.append(xProj, [xmax, xmin])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)
        Xi, Yi = np.meshgrid(Xi, Yi)

        # Place every training value in its pixel, a later station that falls
        # in the same pixel replaces the earlier one
        empty_grid = np.full((num_row, num_col), np.nan)
        for col, row, value in zip(x_origin_list, y_origin_list, z_origin_list):
            empty_grid[row][col] = value

        vals = ~np.isnan(empty_grid)

        func = interpolate.Rbf(Xi[vals], Yi[vals], empty_grid[vals],
                               function='thin_plate', smooth=phi)
        spline = func(Xi, Yi).reshape(num_row, num_col)

        # Calc the MAE at the pixel loc of every withheld station
        for statLoc in station_list:
            proj_x, proj_y = projected_xy[statLoc]
            x_orig = int((proj_x - xmin) / pixel_size)  # lon
            y_orig = int((proj_y - ymin) / pixel_size)  # lat
            interpolated_val = spline[y_orig][x_orig]
            absolute_error_dictionary[statLoc] = abs(interpolated_val - Cvar_dict[statLoc])

    # average of all the withheld stations
    MAE = sum(absolute_error_dictionary.values()) / len(absolute_error_dictionary)

    if return_error:
        return block_num, MAE, absolute_error_dictionary
    return block_num, MAE