Example #1
def shuffle_split_rf(latlon_dict,
                     Cvar_dict,
                     shapefile,
                     file_path_elev,
                     elev_array,
                     idx_list,
                     rep,
                     res=10000):
    '''Shuffle-split cross-validation with a 50/50 train/test split

    Parameters
    ----------

    latlon_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        array for elevation, created using IDEW interpolation (passed in to avoid recomputing it)
    idx_list : int
        position of the elevation column in the lookup file
    rep : int
        number of replications
    res : int
        pixel size of the interpolation grid in metres (default 10000)
    Returns
    ----------
    float
        - MAE estimate for entire surface (average of replications)
    '''
    count = 1
    error_dictionary = {}
    while count <= rep:
        x_origin_list = []
        y_origin_list = []

        absolute_error_dictionary = {}  # for plotting
        station_name_list = []
        projected_lat_lon = {}

        for station_name in Cvar_dict.keys():
            if station_name in latlon_dict.keys():
                station_name_list.append(station_name)

                loc = latlon_dict[station_name]
                latitude = loc[0]
                longitude = loc[1]
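                # Note: pyproj returns projected (x, y); despite the names,
                # Plat holds the easting and Plon the northing here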
                Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
                Plat = float(Plat)
                Plon = float(Plon)
                projected_lat_lon[station_name] = [Plat, Plon]

        # We can't just use Cvar_dict.keys() because some stations lack a valid lat/lon
        stations_input = []
        for station_code in Cvar_dict.keys():
            if station_code in latlon_dict.keys():
                stations_input.append(station_code)
        # Split the stations in two
        stations = np.array(stations_input)
        # The split won't be exactly 50/50 if the number of stations is odd
        splits = ShuffleSplit(n_splits=1, train_size=.5)

        for train_index, test_index in splits.split(stations):

            train_stations = stations[train_index]
            # print(train_stations)
            test_stations = stations[test_index]
            # print(test_stations)

        # Sanity check: the train and test sets must not overlap

        for val in train_stations:
            if val in test_stations:
                print('Error, the train and test sets overlap!')
                sys.exit()

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in latlon_dict.keys():
                if station_name not in test_stations:
                    loc = latlon_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]
                    lat.append(float(latitude))
                    lon.append(float(longitude))
                    Cvar.append(cvar_val)
                else:
                    pass

        y = np.array(lat)
        x = np.array(lon)
        z = np.array(Cvar)

        na_map = gpd.read_file(shapefile)
        bounds = na_map.bounds
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']
        pixelHeight = res
        pixelWidth = res

        num_col = int((xmax - xmin) / pixelHeight) + 1
        num_row = int((ymax - ymin) / pixelWidth) + 1

        # Project to a planar coordinate system before computing the distance matrix
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        maxmin = [
            np.min(yProj_extent),
            np.max(yProj_extent),
            np.max(xProj_extent),
            np.min(xProj_extent)
        ]

        # Elevation
        # Preparing the coordinates to send to the function that will get the elevation grid
        concat = np.array((Xi.flatten(), Yi.flatten())).T
        send_to_list = concat.tolist()
        # The elevation function takes a tuple
        send_to_tuple = [tuple(x) for x in send_to_list]

        Xi1_grd = []
        Yi1_grd = []
        elev_grd = []
        # Get the elevations from the lookup file
        elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple,
                                                   file_path_elev, idx_list)

        for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
            x = keys[0]
            y = keys[1]
            Xi1_grd.append(x)
            Yi1_grd.append(y)
            # Append the elevation data to the empty list
            elev_grd.append(elev_grd_dict[keys])

        elev_array = np.array(elev_grd)  # make an elevation array

        elev_dict = GD.finding_data_frm_lookup(
            zip(xProj, yProj), file_path_elev,
            idx_list)  # Get the elevations for the stations

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(
                xProj, yProj
        ):  # Repeat process for just the stations not the whole grid
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        Xi1_grd = np.array(Xi1_grd)
        Yi1_grd = np.array(Yi1_grd)

        df_trainX = pd.DataFrame({
            'xProj': xProj,
            'yProj': yProj,
            'elevS': source_elev,
            'var': z
        })

        df_testX = pd.DataFrame({
            'Xi': Xi1_grd,
            'Yi': Yi1_grd,
            'elev': elev_array
        })

        reg = RandomForestRegressor(n_estimators=100,
                                    max_features='sqrt',
                                    random_state=1)

        y = np.array(df_trainX['var']).reshape(-1, 1)
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

        reg.fit(X_train, y)

        Zi = reg.predict(X_test)

        rf_grid = Zi.reshape(num_row, num_col)

        # Compute the absolute error at each withheld station's pixel
        for statLoc in test_stations:
            coord_pair = projected_lat_lon[statLoc]

            x_orig = int(
                (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
            y_orig = int(
                (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
            x_origin_list.append(x_orig)
            y_origin_list.append(y_orig)

            try:

                interpolated_val = rf_grid[y_orig][x_orig]

                original_val = Cvar_dict[statLoc]
                absolute_error = abs(interpolated_val - original_val)
                absolute_error_dictionary[statLoc] = absolute_error
            except IndexError:
                pass  # the withheld station falls outside the interpolation grid

        # Average absolute error over all withheld stations for this replication
        error_dictionary[count] = (sum(absolute_error_dictionary.values()) /
                                   len(absolute_error_dictionary))
        count += 1

    overall_error = sum(error_dictionary.values()) / rep

    return overall_error
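
A minimal usage sketch, assuming the function above is importable; every path and dictionary below is a placeholder, not real data:

# Hypothetical inputs for illustration only
latlon_dict = {'STN_A': (49.1, -84.2), 'STN_B': (50.3, -86.7)}  # station -> (lat, lon)
Cvar_dict = {'STN_A': 12.4, 'STN_B': 10.9}  # station -> weather variable value
mae = shuffle_split_rf(latlon_dict, Cvar_dict,
                       'study_area.shp',    # placeholder shapefile path
                       'elev_lookup.csv',   # placeholder elevation lookup file
                       None,                # elev_array is rebuilt inside the function
                       1,                   # assumed elevation column index
                       rep=10)
print('Mean MAE over 10 replications:', mae)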
Example #2
def spatial_kfold_rf(idw_example_grid, loc_dict, Cvar_dict, shapefile,
                     file_path_elev, elev_array, idx_list, block_num,
                     blocking_type, return_error):
    '''Spatially blocked k-fold cross-validation procedure for RF

    Parameters
    ----------
         idw_example_grid  : ndarray
              used for reference of study area grid size
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         file_path_elev : string
              path to the elevation lookup file
         elev_array : ndarray
              array for elevation, created using IDEW interpolation (passed in to avoid recomputing it)
         idx_list : int
              position of the elevation column in the lookup file
         block_num : int
              number of blocks/clusters
         blocking_type : string
              whether to use clusters or blocks
         return_error : bool
              whether or not to return the error dictionary
              
    Returns
    ----------
         float
              - MAE estimate for entire surface
         int
              - the block number, returned so it can later be written to file for tracking
         dictionary
              - if return_error = True, a dictionary of the absolute error at each fold when it was left out
    '''
    # If not using replacement, keep a record of which groups have been done
    groups_complete = []
    error_dictionary = {}

    x_origin_list = []
    y_origin_list = []

    absolute_error_dictionary = {}
    projected_lat_lon = {}

    # Select the blocking scheme
    if blocking_type == 'cluster':
        cluster = c3d.spatial_cluster(loc_dict, Cvar_dict, shapefile,
                                      block_num, file_path_elev, idx_list,
                                      False, False, False)
    elif blocking_type == 'block':
        # Get the numpy array that delineates the blocks
        np_array_blocks = mbk.make_block(idw_example_grid, block_num)
        cluster = mbk.sorting_stations(np_array_blocks, shapefile, loc_dict,
                                       Cvar_dict)  # Now get the dictionary
    else:
        print('That is not a valid blocking method')
        sys.exit()

    for group in cluster.values():
        if group not in groups_complete:
            station_list = [k for k, v in cluster.items() if v == group]
            groups_complete.append(group)

    for station_name in Cvar_dict.keys():
        if station_name in loc_dict.keys():

            loc = loc_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
            Plat = float(Plat)
            Plon = float(Plon)
            projected_lat_lon[station_name] = [Plat, Plon]

    lat = []
    lon = []
    Cvar = []
    for station_name in sorted(Cvar_dict.keys()):
        if station_name in loc_dict.keys():
            if station_name not in station_list:
                loc = loc_dict[station_name]
                latitude = loc[0]
                longitude = loc[1]
                cvar_val = Cvar_dict[station_name]
                lat.append(float(latitude))
                lon.append(float(longitude))
                Cvar.append(cvar_val)
            else:
                pass

    y = np.array(lat)
    x = np.array(lon)
    z = np.array(Cvar)

    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds
    xmax = bounds['maxx']
    xmin = bounds['minx']
    ymax = bounds['maxy']
    ymin = bounds['miny']
    pixelHeight = 10000
    pixelWidth = 10000

    num_col = int((xmax - xmin) / pixelHeight)
    num_row = int((ymax - ymin) / pixelWidth)

    # Project to a planar coordinate system before computing the distance matrix
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProj, yProj = pyproj.Proj('esri:102001')(x, y)

    df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

    yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
    xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

    Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
    Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

    Xi, Yi = np.meshgrid(Xi, Yi)
    Xi, Yi = Xi.flatten(), Yi.flatten()

    maxmin = [
        np.min(yProj_extent),
        np.max(yProj_extent),
        np.max(xProj_extent),
        np.min(xProj_extent)
    ]

    # Elevation
    # Preparing the coordinates to send to the function that will get the elevation grid
    concat = np.array((Xi.flatten(), Yi.flatten())).T
    send_to_list = concat.tolist()
    # The elevation function takes a tuple
    send_to_tuple = [tuple(x) for x in send_to_list]

    Xi1_grd = []
    Yi1_grd = []
    elev_grd = []
    # Get the elevations from the lookup file
    elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple, file_path_elev,
                                               idx_list)

    for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
        x = keys[0]
        y = keys[1]
        Xi1_grd.append(x)
        Yi1_grd.append(y)
        # Append the elevation data to the empty list
        elev_grd.append(elev_grd_dict[keys])

    elev_array = np.array(elev_grd)  # make an elevation array

    elev_dict = GD.finding_data_frm_lookup(
        zip(xProj, yProj), file_path_elev,
        idx_list)  # Get the elevations for the stations

    xProj_input = []
    yProj_input = []
    e_input = []

    for keys in zip(
            xProj,
            yProj):  # Repeat process for just the stations not the whole grid
        x = keys[0]
        y = keys[1]
        xProj_input.append(x)
        yProj_input.append(y)
        e_input.append(elev_dict[keys])

    source_elev = np.array(e_input)

    Xi1_grd = np.array(Xi1_grd)
    Yi1_grd = np.array(Yi1_grd)

    df_trainX = pd.DataFrame({
        'xProj': xProj,
        'yProj': yProj,
        'elevS': source_elev,
        'var': z
    })

    df_testX = pd.DataFrame({'Xi': Xi1_grd, 'Yi': Yi1_grd, 'elev': elev_array})

    reg = RandomForestRegressor(n_estimators=100,
                                max_features='sqrt',
                                random_state=1)

    y = np.array(df_trainX['var']).reshape(-1, 1)
    X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
    X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

    reg.fit(X_train, y)

    Zi = reg.predict(X_test)

    rf_grid = Zi.reshape(num_row, num_col)

    # Compute the absolute error at each withheld station's pixel
    for statLoc in station_list:
        coord_pair = projected_lat_lon[statLoc]

        x_orig = int(
            (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
        y_orig = int(
            (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
        x_origin_list.append(x_orig)
        y_origin_list.append(y_orig)

        interpolated_val = rf_grid[y_orig][x_orig]

        original_val = Cvar_dict[statLoc]
        absolute_error = abs(interpolated_val - original_val)
        absolute_error_dictionary[statLoc] = absolute_error

    # MAE = average absolute error over all withheld stations
    MAE = (sum(absolute_error_dictionary.values()) /
           len(absolute_error_dictionary))
    if return_error:
        return block_num, MAE, absolute_error_dictionary
    else:
        return block_num, MAE
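
A minimal usage sketch for the blocked variant (placeholder inputs; idw_example_grid would normally come from a prior IDW interpolation of the same study area):

block_num, mae = spatial_kfold_rf(idw_example_grid,  # e.g. output grid of an IDW run
                                  latlon_dict, Cvar_dict,
                                  'study_area.shp',  # placeholder paths
                                  'elev_lookup.csv',
                                  None, 1,           # elev_array is rebuilt internally; assumed column index
                                  block_num=25,
                                  blocking_type='block',
                                  return_error=False)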
Example #3
def cross_validate_rf(latlon_dict, Cvar_dict, shapefile, file_path_elev,
                      elev_array, idx_list, pass_to_plot):
    '''Leave-one-out cross-validation procedure for RF

    Parameters
    ----------
    
         latlon_dict : dictionary
              the latitude and longitudes of the stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile, including its name
         file_path_elev : string
              path to the elevation lookup file
         elev_array : ndarray
              array for elevation, created using IDEW interpolation (passed in to avoid recomputing it)
         idx_list : int
              position of the elevation column in the lookup file
         pass_to_plot : bool
              whether the errors will be plotted, in which case a version without the absolute value is also returned (e.g. for fire season days)
              
    Returns
    ----------
         dictionary
              - a dictionary of the absolute error at each station when it was left out
         dictionary
              - if pass_to_plot = True, returns a dictionary without the absolute value of the error, for example for plotting fire season error
     '''
    x_origin_list = []
    y_origin_list = []

    absolute_error_dictionary = {}  # for plotting
    no_absolute_value_dict = {}  # to see whether under or over estimation
    station_name_list = []
    projected_lat_lon = {}

    for station_name in Cvar_dict.keys():
        if station_name in latlon_dict.keys():
            station_name_list.append(station_name)

            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
            Plat = float(Plat)
            Plon = float(Plon)
            projected_lat_lon[station_name] = [Plat, Plon]

    for station_name_hold_back in station_name_list:

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in latlon_dict.keys():
                if station_name != station_name_hold_back:
                    loc = latlon_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]
                    lat.append(float(latitude))
                    lon.append(float(longitude))
                    Cvar.append(cvar_val)
                else:
                    pass

        y = np.array(lat)
        x = np.array(lon)
        z = np.array(Cvar)

        na_map = gpd.read_file(shapefile)
        bounds = na_map.bounds
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']
        pixelHeight = 10000
        pixelWidth = 10000

        num_col = int((xmax - xmin) / pixelHeight)
        num_row = int((ymax - ymin) / pixelWidth)

        # Project to a planar coordinate system before computing the distance matrix
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        maxmin = [
            np.min(yProj_extent),
            np.max(yProj_extent),
            np.max(xProj_extent),
            np.min(xProj_extent)
        ]

        # Elevation
        # Preparing the coordinates to send to the function that will get the elevation grid
        concat = np.array((Xi.flatten(), Yi.flatten())).T
        send_to_list = concat.tolist()
        # The elevation function takes a tuple
        send_to_tuple = [tuple(x) for x in send_to_list]

        Xi1_grd = []
        Yi1_grd = []
        elev_grd = []
        # Get the elevations from the lookup file
        elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple,
                                                   file_path_elev, idx_list)

        for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
            x = keys[0]
            y = keys[1]
            Xi1_grd.append(x)
            Yi1_grd.append(y)
            # Append the elevation data to the empty list
            elev_grd.append(elev_grd_dict[keys])

        elev_array = np.array(elev_grd)  # make an elevation array

        elev_dict = GD.finding_data_frm_lookup(
            zip(xProj, yProj), file_path_elev,
            idx_list)  # Get the elevations for the stations

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(
                xProj, yProj
        ):  # Repeat process for just the stations not the whole grid
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        Xi1_grd = np.array(Xi1_grd)
        Yi1_grd = np.array(Yi1_grd)

        df_trainX = pd.DataFrame({
            'xProj': xProj,
            'yProj': yProj,
            'elevS': source_elev,
            'var': z
        })

        df_testX = pd.DataFrame({
            'Xi': Xi1_grd,
            'Yi': Yi1_grd,
            'elev': elev_array
        })

        reg = RandomForestRegressor(n_estimators=100,
                                    max_features='sqrt',
                                    random_state=1)

        y = np.array(df_trainX['var']).reshape(-1, 1)
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

        reg.fit(X_train, y)

        Zi = reg.predict(X_test)

        rf_grid = Zi.reshape(num_row, num_col)

        # Compute the absolute error at the withheld station's pixel
        coord_pair = projected_lat_lon[station_name_hold_back]

        x_orig = int(
            (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
        y_orig = int(
            (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
        x_origin_list.append(x_orig)
        y_origin_list.append(y_orig)

        interpolated_val = rf_grid[y_orig][x_orig]

        original_val = Cvar_dict[station_name_hold_back]
        absolute_error = abs(interpolated_val - original_val)
        absolute_error_dictionary[station_name_hold_back] = absolute_error
        no_absolute_value_dict[
            station_name_hold_back] = interpolated_val - original_val
    if pass_to_plot:
        return absolute_error_dictionary, no_absolute_value_dict
    else:
        return absolute_error_dictionary
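
A minimal leave-one-out sketch (placeholder inputs); the returned dictionary is keyed by the withheld station:

abs_err = cross_validate_rf(latlon_dict, Cvar_dict,
                            'study_area.shp',   # placeholder paths
                            'elev_lookup.csv',
                            None, 1,            # elev_array is rebuilt internally; assumed column index
                            pass_to_plot=False)
worst = max(abs_err, key=abs_err.get)
print('Largest LOO error:', worst, abs_err[worst])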
Example #4
def random_forest_interpolator(latlon_dict, Cvar_dict, input_date, var_name,
                               shapefile, show, file_path_elev, idx_list,
                               expand_area, res=10000):
    '''Random forest interpolation

    Parameters
    ----------
         latlon_dict : dictionary
              the latitude and longitudes of the stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         input_date : string
              the date you want to interpolate for
         var_name : string
              the name of the variable you are interpolating
         shapefile : string
              path to the study area shapefile, including its name
         show : bool
              whether you want to plot a map
         file_path_elev : string
              path to the elevation lookup file
         idx_list : int
              position of the elevation column in the lookup file
         expand_area : bool
              whether to expand the study area by 200 km so that more stations are taken into account
         res : int
              pixel size of the interpolation grid in metres (default 10000)
              
    Returns
    ----------
         ndarray
              - the array of values for the interpolated surface
         list
              - the bounds of the array surface, for use in other functions
     '''
    lat = []
    lon = []
    Cvar = []

    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds
    if expand_area:
        xmax = bounds['maxx'] + 200000
        xmin = bounds['minx'] - 200000
        ymax = bounds['maxy'] + 200000
        ymin = bounds['miny'] - 200000
    else:
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']

    for station_name in Cvar_dict.keys():
        if station_name in latlon_dict.keys():

            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            # Filter out stations outside of grid
            proj_coord = pyproj.Proj('esri:102001')(longitude, latitude)
            if (proj_coord[1] <= float(ymax[0])
                    and proj_coord[1] >= float(ymin[0])
                    and proj_coord[0] <= float(xmax[0])
                    and proj_coord[0] >= float(xmin[0])):
                cvar_val = Cvar_dict[station_name]
                lat.append(float(latitude))
                lon.append(float(longitude))
                Cvar.append(cvar_val)

    y = np.array(lat)
    x = np.array(lon)
    z = np.array(Cvar)

    pixelHeight = res
    pixelWidth = res

    num_col = int((xmax - xmin) / pixelHeight)
    num_row = int((ymax - ymin) / pixelWidth)

    # Project to a planar coordinate system before computing the distance matrix
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProj, yProj = pyproj.Proj('esri:102001')(x, y)

    df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

    if expand_area:

        yProj_extent = np.append(
            yProj, [bounds['maxy'] + 200000, bounds['miny'] - 200000])
        xProj_extent = np.append(
            xProj, [bounds['maxx'] + 200000, bounds['minx'] - 200000])

    else:
        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

    Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row + 1)
    Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col + 1)

    Xi, Yi = np.meshgrid(Xi, Yi)
    Xi, Yi = Xi.flatten(), Yi.flatten()

    maxmin = [
        np.min(yProj_extent),
        np.max(yProj_extent),
        np.max(xProj_extent),
        np.min(xProj_extent)
    ]

    # Elevation
    # Preparing the coordinates to send to the function that will get the elevation grid
    concat = np.array((Xi.flatten(), Yi.flatten())).T
    send_to_list = concat.tolist()
    # The elevation function takes a tuple
    send_to_tuple = [tuple(x) for x in send_to_list]

    Xi1_grd = []
    Yi1_grd = []
    elev_grd = []
    # Get the elevations from the lookup file
    elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple, file_path_elev,
                                               idx_list)

    for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
        x = keys[0]
        y = keys[1]
        Xi1_grd.append(x)
        Yi1_grd.append(y)
        # Append the elevation data to the empty list
        elev_grd.append(elev_grd_dict[keys])

    elev_array = np.array(elev_grd)  # make an elevation array

    elev_dict = GD.finding_data_frm_lookup(
        zip(xProj, yProj), file_path_elev,
        idx_list)  # Get the elevations for the stations

    xProj_input = []
    yProj_input = []
    e_input = []

    for keys in zip(
            xProj,
            yProj):  # Repeat process for just the stations not the whole grid
        x = keys[0]
        y = keys[1]
        xProj_input.append(x)
        yProj_input.append(y)
        e_input.append(elev_dict[keys])

    source_elev = np.array(e_input)

    Xi1_grd = np.array(Xi1_grd)
    Yi1_grd = np.array(Yi1_grd)

    df_trainX = pd.DataFrame({
        'xProj': xProj,
        'yProj': yProj,
        'elevS': source_elev,
        'var': z
    })

    df_testX = pd.DataFrame({'Xi': Xi1_grd, 'Yi': Yi1_grd, 'elev': elev_array})

    reg = RandomForestRegressor(n_estimators=100,
                                max_features='sqrt',
                                random_state=1)

    y = np.array(df_trainX['var']).reshape(-1, 1)
    X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
    X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

    reg.fit(X_train, y)

    Zi = reg.predict(X_test)

    rf_grid = Zi.reshape(num_row + 1, num_col + 1)

    if show:
        fig, ax = plt.subplots(figsize=(15, 15))
        crs = {'init': 'esri:102001'}

        na_map = gpd.read_file(shapefile)

        plt.imshow(rf_grid,
                   extent=(xProj_extent.min() - 1, xProj_extent.max() + 1,
                           yProj_extent.max() - 1, yProj_extent.min() + 1))
        na_map.plot(ax=ax,
                    color='white',
                    edgecolor='k',
                    linewidth=2,
                    zorder=10,
                    alpha=0.1)

        plt.scatter(xProj, yProj, c=z, edgecolors='k')

        plt.gca().invert_yaxis()
        cbar = plt.colorbar()
        cbar.set_label(var_name)

        title = 'RF Interpolation for %s on %s' % (var_name, input_date)
        fig.suptitle(title, fontsize=14)
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')

        plt.show()

    return rf_grid, maxmin
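
A minimal sketch of generating one surface (all inputs below are placeholders):

surface, surface_bounds = random_forest_interpolator(
    latlon_dict, Cvar_dict,
    '2018-07-01',        # hypothetical date string
    'Temperature',       # hypothetical variable name
    'study_area.shp',    # placeholder paths
    show=True,
    file_path_elev='elev_lookup.csv',
    idx_list=1,          # assumed column index
    expand_area=False)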
Example #5
def spatial_cluster(loc_dict, Cvar_dict, shapefile, cluster_num, file_path_elev, idx_list,
                    plot_2D, plot_3D, return_all):
    '''Spatial clustering based on scikit learn's agglomerative clustering

    Parameters
    ----------
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         cluster_num : int
              number of clusters
         file_path_elev : string
              path to the elevation lookup file
         idx_list : int
              position of the elevation column in the lookup file
         plot_2D : bool
              whether to plot maps of the clusters in 2d
         plot_3D : bool
              whether to plot maps of the clusters in 3d             
         return_all : bool
            whether or not to return all the outputs (needed for selecting cluster size)
            
    Returns
    ----------
         dictionary
             - a dictionary of the cluster that each station is in
    '''

    x = []
    y = []

    proj_stations = {}
    for station in Cvar_dict.keys():
        if station in loc_dict.keys():
            coord = loc_dict[station]
            Plon1, Plat1 = pyproj.Proj('esri:102001')(
                coord[1], coord[0])  # longitude,lat
            Plat = float(Plat1)
            Plon = float(Plon1)
            x.append([Plon])
            y.append([Plat])
            proj_stations[station] = [Plat, Plon]
    X = [val+y[i] for i, val in enumerate(x)]
    X = np.array(X)
    # print(X)
    # Make the longitudinal transect of distance (lon, elev)

    Xi1_grd = []
    Yi1_grd = []
    elev_grd = []
    # Preparing the coordinates to send to the function that will get the elevation grid
    concat = np.array((x, y)).T
    send_to_list = concat[0].tolist()
    send_to_tuple = [tuple(x) for x in send_to_list]
    # Get the elevations from the lookup file
    elev_grd_dict = GD.finding_data_frm_lookup(
        send_to_tuple, file_path_elev, idx_list)

    for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
        x = keys[0]
        y = keys[1]
        Xi1_grd.append(x)
        Yi1_grd.append(y)
        # Append the elevation data to the empty list
        elev_grd.append(elev_grd_dict[keys])

    lon_list = [[i] for i in Xi1_grd]
    lat_list = [[i] for i in Yi1_grd]
    elev = [[i] for i in elev_grd]  # wrap in sublists so we can build (lon, lat, elev) triples
    Xelev = [val+lat_list[i]+elev[i] for i, val in enumerate(lon_list)]
    Xelev = np.array(Xelev)

    # This is where we make the connectivity graph based on elevation

    knn_graph = kneighbors_graph(Xelev, 10, include_self=False)
    connectivity = knn_graph
    n_clusters = cluster_num

    linkage = 'ward'

    model = AgglomerativeClustering(
        linkage=linkage, connectivity=connectivity, n_clusters=n_clusters)

    model.fit(Xelev)  # fit with lat lon elev
    label = model.labels_

    if plot_3D:
        fig = plt.figure()
        ax = p3.Axes3D(fig)
        ax.view_init(7, -80)
        for l in np.unique(label):
            ax.scatter(Xelev[label == l, 0], Xelev[label == l, 1], Xelev[label == l, 2],
                       color=plt.cm.jet(float(l) / np.max(label + 1)),
                       s=20, edgecolor='k')
        plt.title('With connectivity constraints, Elevation inc.')
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
        ax.set_zlabel('Elevation (m)')

        plt.show()

    # This is where we make the connectivity graph where we can see on the map
    if plot_2D:

        fig, ax = plt.subplots(figsize=(15, 15))
        crs = {'init': 'esri:102001'}
        na_map = gpd.read_file(shapefile)

        na_map.plot(ax=ax, color='white', edgecolor='k', linewidth=1, alpha=1)

        plt.scatter(Xelev[:, 0], Xelev[:, 1], c=model.labels_,
                    cmap=plt.cm.tab20b, s=20, edgecolor='k')

        ax.tick_params(axis='both', which='both', bottom=False, top=False,
                       labelbottom=False, right=False, left=False, labelleft=False)
        ax.ticklabel_format(useOffset=False, style='plain')

        # plt.subplots_adjust(bottom=0, top=.83, wspace=0,
        # left=0, right=1)
        # plt.suptitle('n_cluster=%i, connectivity=%r' %
        # (n_clusters, connectivity is not None), size=17)

        plt.show()

    # Make a dictionary with each class
    station_class = {}

    count = 0
    for val in Xelev:
        key = [k for k, v in proj_stations.items()
               if v == [val[1], val[0]]]
        if 1 <= len(key) <= 3:
            # We add 1, because for the random selection the groups start at 1
            for station in key:
                station_class[station] = label[count] + 1
        else:
            print('Too many stations have the same lat lon.')
        count += 1

    if count != label.shape[0]:
        print('The groups and label matrix do not match')

    if return_all:
        return label, Xelev, station_class
    else:

        return station_class
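
The core of spatial_cluster is scikit-learn's connectivity-constrained agglomerative clustering. A self-contained toy demo of that idea on synthetic (x, y, elevation) triples, independent of the lookup-file machinery above:

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph

rng = np.random.default_rng(0)
pts = rng.uniform(0, 1000, size=(60, 3))  # synthetic (x, y, elevation) triples
conn = kneighbors_graph(pts, 10, include_self=False)  # 10-nearest-neighbour graph
model = AgglomerativeClustering(linkage='ward', connectivity=conn, n_clusters=4)
labels = model.fit_predict(pts)
print(np.bincount(labels))  # number of points per cluster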
Example #6
def spatial_groups_rf(idw_example_grid, loc_dict, Cvar_dict, shapefile,
                      blocknum, nfolds, replacement, dictionary_Groups,
                      file_path_elev, idx_list, expand_area):
    '''Stratified shuffle-split cross-validation procedure

    Parameters
    ----------
         idw_example_grid  : ndarray
              used for reference of study area grid size
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         blocknum : int
              number of blocks/clusters
         nfolds : int
              number of folds to create (essentially repetitions)
         replacement : bool
              whether or not to use replacement between folds, should usually be true
         dictionary_Groups : dictionary
              dictionary of what groups (clusters) the stations belong to
         file_path_elev : string
              path to the elevation lookup file
         idx_list : int
              position of the elevation column in the lookup file
         expand_area : bool
              whether to expand the study area by 200 km so that more stations are taken into account
              
    Returns
    ----------
         dictionary
              - a dictionary of the absolute error at each fold when it was left out
    '''
    # If not using replacement, keep a record of which stations have been used
    station_list_used = []
    count = 1
    error_dictionary = {}

    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds
    if expand_area:
        xmax = bounds['maxx'] + 200000
        xmin = bounds['minx'] - 200000
        ymax = bounds['maxy'] + 200000
        ymin = bounds['miny'] - 200000
    else:
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']

    while count <= nfolds:
        x_origin_list = []
        y_origin_list = []

        absolute_error_dictionary = {}
        projected_lat_lon = {}

        station_list = Eval.select_random_station(dictionary_Groups, blocknum,
                                                  replacement,
                                                  station_list_used).values()

        if not replacement:
            station_list_used.append(list(station_list))
        # print(station_list_used)

        for station_name in Cvar_dict.keys():

            if station_name in loc_dict.keys():

                loc = loc_dict[station_name]
                latitude = loc[0]
                longitude = loc[1]
                Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
                Plat = float(Plat)
                Plon = float(Plon)
                # Filter out stations outside the grid (Plat, Plon already hold the projected x, y)
                proj_coord = (Plat, Plon)
                if (proj_coord[1] <= float(ymax[0])
                        and proj_coord[1] >= float(ymin[0])
                        and proj_coord[0] <= float(xmax[0])
                        and proj_coord[0] >= float(xmin[0])):
                    projected_lat_lon[station_name] = [Plat, Plon]

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in loc_dict.keys():
                if station_name not in station_list:  # This is the step where we hold back the fold
                    loc = loc_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]

                    # Filter out stations outside of grid
                    proj_coord = pyproj.Proj('esri:102001')(longitude,
                                                            latitude)
                    if (proj_coord[1] <= float(ymax[0])
                            and proj_coord[1] >= float(ymin[0])
                            and proj_coord[0] <= float(xmax[0])
                            and proj_coord[0] >= float(xmin[0])):
                        lat.append(float(latitude))
                        lon.append(float(longitude))
                        Cvar.append(cvar_val)
                else:
                    pass  # Skip the station

        y = np.array(lat)
        x = np.array(lon)
        z = np.array(Cvar)

        pixelHeight = 10000
        pixelWidth = 10000
        num_col = int((xmax - xmin) / pixelHeight) + 1
        num_row = int((ymax - ymin) / pixelWidth) + 1

        # Project to a planar coordinate system before computing the distance matrix
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

        if expand_area:

            yProj_extent = np.append(
                yProj, [bounds['maxy'] + 200000, bounds['miny'] - 200000])
            xProj_extent = np.append(
                xProj, [bounds['maxx'] + 200000, bounds['minx'] - 200000])
        else:
            yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
            xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent),
                         num_row + 1)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent),
                         num_col + 1)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        maxmin = [
            np.min(yProj_extent),
            np.max(yProj_extent),
            np.max(xProj_extent),
            np.min(xProj_extent)
        ]

        # Elevation
        # Preparing the coordinates to send to the function that will get the elevation grid
        concat = np.array((Xi.flatten(), Yi.flatten())).T
        send_to_list = concat.tolist()
        # The elevation function takes a tuple
        send_to_tuple = [tuple(x) for x in send_to_list]

        Xi1_grd = []
        Yi1_grd = []
        elev_grd = []
        # Get the elevations from the lookup file
        elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple,
                                                   file_path_elev, idx_list)

        for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
            x = keys[0]
            y = keys[1]
            Xi1_grd.append(x)
            Yi1_grd.append(y)
            # Append the elevation data to the empty list
            elev_grd.append(elev_grd_dict[keys])

        elev_array = np.array(elev_grd)  # make an elevation array

        elev_dict = GD.finding_data_frm_lookup(
            zip(xProj, yProj), file_path_elev,
            idx_list)  # Get the elevations for the stations

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(
                xProj, yProj
        ):  # Repeat process for just the stations not the whole grid
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        Xi1_grd = np.array(Xi1_grd)
        Yi1_grd = np.array(Yi1_grd)

        df_trainX = pd.DataFrame({
            'xProj': xProj,
            'yProj': yProj,
            'elevS': source_elev,
            'var': z
        })

        df_testX = pd.DataFrame({
            'Xi': Xi1_grd,
            'Yi': Yi1_grd,
            'elev': elev_array
        })

        reg = RandomForestRegressor(n_estimators=100,
                                    max_features='sqrt',
                                    random_state=1)

        y = np.array(df_trainX['var']).reshape(-1, 1)
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

        reg.fit(X_train, y)

        Zi = reg.predict(X_test)

        rf_grid = Zi.reshape(num_row + 1, num_col + 1)

        # Compare at a certain point
        for statLoc in station_list:

            coord_pair = projected_lat_lon[statLoc]

            x_orig = int((coord_pair[0] - float(xmin)) / pixelHeight)  # lon
            y_orig = int((coord_pair[1] - float(ymin)) / pixelWidth)  # lat
            x_origin_list.append(x_orig)
            y_origin_list.append(y_orig)

            interpolated_val = rf_grid[y_orig][x_orig]

            original_val = Cvar_dict[statLoc]
            absolute_error = abs(interpolated_val - original_val)
            absolute_error_dictionary[statLoc] = absolute_error

        error_dictionary[count] = sum(
            absolute_error_dictionary.values()) / len(
                absolute_error_dictionary.values(
                ))  # average of all the withheld stations
        # print(absolute_error_dictionary)
        count += 1
    overall_error = sum(error_dictionary.values()) / \
        nfolds  # average of all the runs
    # print(overall_error)
    return overall_error
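
A minimal sketch chaining the clustering and the grouped cross-validation (placeholder inputs; spatial_cluster is shown in Example #5):

groups = spatial_cluster(loc_dict, Cvar_dict, 'study_area.shp',
                         10,                  # number of clusters
                         'elev_lookup.csv', 1, False, False, False)
mae = spatial_groups_rf(idw_example_grid, loc_dict, Cvar_dict,
                        'study_area.shp',
                        blocknum=10, nfolds=30,
                        replacement=True,
                        dictionary_Groups=groups,
                        file_path_elev='elev_lookup.csv',
                        idx_list=1,
                        expand_area=False)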
Example #7
def shuffle_split_IDEW(latlon_dict,
                       Cvar_dict,
                       shapefile,
                       file_path_elev,
                       elev_array,
                       idx_list,
                       d,
                       rep,
                       res=10000):
    '''Shuffle-split cross-validation with a 50/50 train/test split

    Parameters
    ----------
    latlon_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        array for elevation, created using IDEW interpolation (passed in to avoid recomputing it)
    idx_list : int
        position of the elevation column in the lookup file
    d : int
        the weighting for IDW interpolation
    rep : int
        number of replications
    res : int
        pixel size of the interpolation grid in metres (default 10000)

    Returns
    ----------
    float
        - MAE estimate for entire surface (average of replications)
    '''
    count = 1
    error_dictionary = {}
    while count <= rep:
        x_origin_list = []
        y_origin_list = []

        absolute_error_dictionary = {}
        station_name_list = []
        projected_lat_lon = {}

        # We can't just use Cvar_dict.keys() because some stations lack a valid lat/lon
        stations_input = []
        for station_code in Cvar_dict.keys():
            if station_code in latlon_dict.keys():
                stations_input.append(station_code)
        # Split the stations in two
        stations = np.array(stations_input)
        # The split won't be exactly 50/50 if the number of stations is odd
        splits = ShuffleSplit(n_splits=1, train_size=.5)

        for train_index, test_index in splits.split(stations):

            train_stations = stations[train_index]
            # print(train_stations)
            test_stations = stations[test_index]
            # print(test_stations)

        # Sanity check: the train and test sets must not overlap

        for val in train_stations:
            if val in test_stations:
                print('Error, the train and test sets overlap!')
                sys.exit()

        for station_name in Cvar_dict.keys():
            if station_name in latlon_dict.keys():
                station_name_list.append(station_name)

                loc = latlon_dict[station_name]
                latitude = loc[0]
                longitude = loc[1]
                Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
                Plat = float(Plat)
                Plon = float(Plon)
                projected_lat_lon[station_name] = [Plat, Plon]

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in latlon_dict.keys():
                if station_name not in test_stations:
                    loc = latlon_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]
                    lat.append(float(latitude))
                    lon.append(float(longitude))
                    Cvar.append(cvar_val)
                else:
                    pass

        y = np.array(lat)
        x = np.array(lon)
        # Note: adding the bounding locations to the array would be extrapolation, not interpolation
        z = np.array(Cvar)

        na_map = gpd.read_file(shapefile)
        bounds = na_map.bounds
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']
        pixelHeight = res
        pixelWidth = res

        num_col = int((xmax - xmin) / pixelHeight)
        num_row = int((ymax - ymin) / pixelWidth)

        # Project to a planar coordinate system before computing the distance matrix.
        # The source CRS is not recorded in the data; assume NAD83 geographic coordinates
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()
        maxmin = [
            np.min(yProj_extent),
            np.max(yProj_extent),
            np.max(xProj_extent),
            np.min(xProj_extent)
        ]

        vals = np.vstack((xProj, yProj)).T

        interpol = np.vstack((Xi, Yi)).T
        # Length of the triangle side from the cell to the point with data
        dist_not = np.subtract.outer(vals[:, 0], interpol[:, 0])
        # Length of the triangle side from the cell to the point with data
        dist_one = np.subtract.outer(vals[:, 1], interpol[:, 1])
        # euclidean distance, getting the hypotenuse
        distance_matrix = np.hypot(dist_not, dist_one)

        weights = 1 / (distance_matrix**d)
        # A zero distance yields an infinite weight; cap it so the station's
        # value is assigned directly to the pixel underneath it
        weights[np.where(np.isinf(weights))] = 1 / (1.0E-50)
        weights /= weights.sum(axis=0)

        Zi = np.dot(weights.T, z)
        idw_grid = Zi.reshape(num_row, num_col)

        elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj),
                                               file_path_elev, idx_list)

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(
                xProj,
                yProj):  # in case there are two stations at the same lat\lon
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        vals2 = np.vstack(source_elev).T

        interpol2 = np.vstack(elev_array).T

        dist_not2 = np.subtract.outer(vals2[0], interpol2[0])
        dist_not2 = np.absolute(dist_not2)
        weights2 = 1 / (dist_not2**d)

        weights2[np.where(np.isinf(weights2))] = 1
        weights2 /= weights2.sum(axis=0)

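        # Blend the two surfaces: 80% spatial IDW, 20% elevation-difference
        # weighting (the elevation term is what makes this IDEW rather than IDW)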
        fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)

        fin = fin.reshape(num_row, num_col)

        # Compute the absolute error at each withheld station's pixel
        for statLoc in test_stations:
            coord_pair = projected_lat_lon[statLoc]

            x_orig = int(
                (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
            y_orig = int(
                (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
            x_origin_list.append(x_orig)
            y_origin_list.append(y_orig)

            interpolated_val = fin[y_orig][x_orig]

            original_val = Cvar_dict[statLoc]
            absolute_error = abs(interpolated_val - original_val)
            absolute_error_dictionary[statLoc] = absolute_error

        # Average absolute error over all withheld stations for this replication
        error_dictionary[count] = (sum(absolute_error_dictionary.values()) /
                                   len(absolute_error_dictionary))
        count += 1

    overall_error = sum(error_dictionary.values()) / rep

    return overall_error
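
The inverse-distance weighting at the heart of this function, reduced to a self-contained toy example (three stations, four target cells, power d = 2):

import numpy as np

stations = np.array([[0.0, 0.0], [10.0, 0.0], [0.0, 10.0]])  # station (x, y)
z = np.array([1.0, 2.0, 3.0])  # observed values at the stations
cells = np.array([[1.0, 1.0], [5.0, 5.0], [9.0, 1.0], [1.0, 9.0]])  # grid cells
d = 2
dx = np.subtract.outer(stations[:, 0], cells[:, 0])
dy = np.subtract.outer(stations[:, 1], cells[:, 1])
dist = np.hypot(dx, dy)  # stations x cells distance matrix
w = 1 / dist**d
w /= w.sum(axis=0)  # normalize the weights for each cell
print(np.dot(w.T, z))  # interpolated value at each cell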
Example #8
def spatial_kfold_IDEW(idw_example_grid, loc_dict, Cvar_dict, shapefile,
                       file_path_elev, elev_array, idx_list, d, block_num,
                       blocking_type):
    '''Spatially blocked k-folds cross-validation procedure for IDEW

    Parameters
    ----------
         idw_example_grid  : ndarray
              used for reference of study area grid size
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         d : int
              the weighting for IDW interpolation
         file_path_elev : string
              path to the elevation lookup file
         elev_array : ndarray
              array for elevation, created using IDEW interpolation (passed in to avoid recomputing it)
         idx_list : int
              position of the elevation column in the lookup file
         block_num : int
              number of blocks/clusters
         blocking_type : string
              whether to use clusters or blocks
              
    Returns
    ----------
         float
              - MAE estimate for entire surface
         int
              - the block number, returned so it can later be written to file for tracking
    '''
    # If not using replacement, keep a record of which groups have been done
    groups_complete = []
    error_dictionary = {}

    x_origin_list = []
    y_origin_list = []

    absolute_error_dictionary = {}
    projected_lat_lon = {}

    if blocking_type == 'cluster':
        cluster = c3d.spatial_cluster(loc_dict, Cvar_dict, shapefile,
                                      block_num, file_path_elev, idx_list,
                                      False, False, False)
    elif blocking_type == 'block':
        # Get the numpy array that delineates the blocks
        np_array_blocks = mbk.make_block(idw_example_grid, block_num)
        cluster = mbk.sorting_stations(np_array_blocks, shapefile, loc_dict,
                                       Cvar_dict)  # Now get the dictionary
    else:
        print('That is not a valid blocking method')
        sys.exit()

    for group in cluster.values():
        if group not in groups_complete:
            station_list = [k for k, v in cluster.items() if v == group]
            groups_complete.append(group)

    for station_name in Cvar_dict.keys():
        if station_name in loc_dict.keys():

            loc = loc_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
            Plat = float(Plat)
            Plon = float(Plon)
            projected_lat_lon[station_name] = [Plat, Plon]

    lat = []
    lon = []
    Cvar = []
    for station_name in sorted(Cvar_dict.keys()):
        if station_name in loc_dict.keys():
            if station_name not in station_list:
                loc = loc_dict[station_name]
                latitude = loc[0]
                longitude = loc[1]
                cvar_val = Cvar_dict[station_name]
                lat.append(float(latitude))
                lon.append(float(longitude))
                Cvar.append(cvar_val)
            else:
                pass

    y = np.array(lat)
    x = np.array(lon)
    # Note: adding the bounding locations to the array would be extrapolation, not interpolation
    z = np.array(Cvar)

    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds
    xmax = bounds['maxx']
    xmin = bounds['minx']
    ymax = bounds['maxy']
    ymin = bounds['miny']
    pixelHeight = 10000
    pixelWidth = 10000

    num_col = int((xmax - xmin) / pixelHeight)
    num_row = int((ymax - ymin) / pixelWidth)

    # Project to a planar coordinate system before computing the distance matrix.
    # The source CRS is not recorded in the data; assume NAD83 geographic coordinates
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProj, yProj = pyproj.Proj('esri:102001')(x, y)

    yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
    xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

    Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
    Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

    Xi, Yi = np.meshgrid(Xi, Yi)
    Xi, Yi = Xi.flatten(), Yi.flatten()
    maxmin = [
        np.min(yProj_extent),
        np.max(yProj_extent),
        np.max(xProj_extent),
        np.min(xProj_extent)
    ]

    vals = np.vstack((xProj, yProj)).T

    interpol = np.vstack((Xi, Yi)).T
    # Length of the triangle side from the cell to the point with data
    dist_not = np.subtract.outer(vals[:, 0], interpol[:, 0])
    # Length of the triangle side from the cell to the point with data
    dist_one = np.subtract.outer(vals[:, 1], interpol[:, 1])
    # euclidean distance, getting the hypotenuse
    distance_matrix = np.hypot(dist_not, dist_one)

    weights = 1 / (distance_matrix**d)
    # A zero distance yields an infinite weight; cap it so the station's
    # value is assigned directly to the pixel underneath it
    weights[np.where(np.isinf(weights))] = 1 / (1.0E-50)
    weights /= weights.sum(axis=0)

    Zi = np.dot(weights.T, z)
    idw_grid = Zi.reshape(num_row, num_col)

    elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj), file_path_elev,
                                           idx_list)

    xProj_input = []
    yProj_input = []
    e_input = []

    for keys in zip(
            xProj,
            yProj):  # in case there are two stations at the same lat/lon
        x = keys[0]
        y = keys[1]
        xProj_input.append(x)
        yProj_input.append(y)
        e_input.append(elev_dict[keys])

    source_elev = np.array(e_input)

    vals2 = np.vstack(source_elev).T

    interpol2 = np.vstack(elev_array).T

    dist_not2 = np.subtract.outer(vals2[0], interpol2[0])
    dist_not2 = np.absolute(dist_not2)
    weights2 = 1 / (dist_not2**d)

    weights2[np.where(np.isinf(weights2))] = 1
    weights2 /= weights2.sum(axis=0)

    fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)

    fin = fin.reshape(num_row, num_col)

    # Calculate the absolute error at the pixel location of each withheld station
    for statLoc in station_list:
        coord_pair = projected_lat_lon[statLoc]

        x_orig = int(
            (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
        y_orig = int(
            (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
        x_origin_list.append(x_orig)
        y_origin_list.append(y_orig)

        interpolated_val = fin[y_orig][x_orig]

        original_val = Cvar_dict[statLoc]
        absolute_error = abs(interpolated_val - original_val)
        absolute_error_dictionary[statLoc] = absolute_error

    # average of all the withheld stations
    MAE = sum(absolute_error_dictionary.values()) / \
        len(absolute_error_dictionary.values())

    return block_num, MAE
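
# A minimal, self-contained sketch (not part of the original module) of the
# inverse-distance weighting step used above, so the weights/Zi lines can be
# sanity-checked in isolation; the function and array names here are
# illustrative only.
import numpy as np

def idw_sketch(station_xy, station_vals, grid_xy, d=2):
    """Plain IDW: weight each grid cell by 1/distance**d to every station."""
    dx = np.subtract.outer(station_xy[:, 0], grid_xy[:, 0])
    dy = np.subtract.outer(station_xy[:, 1], grid_xy[:, 1])
    dist = np.hypot(dx, dy)            # Euclidean distance, station x cell
    w = 1.0 / (dist ** d)
    w[np.isinf(w)] = 1.0 / 1.0e-50     # a station sitting exactly on a cell
    w /= w.sum(axis=0)                 # normalize so each cell's weights sum to 1
    return w.T @ station_vals          # one interpolated value per grid cell
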
Example No. 9
def IDEW(latlon_dict,
         Cvar_dict,
         input_date,
         var_name,
         shapefile,
         show,
         file_path_elev,
         idx_list,
         d,
         expand_area,
         res=10000):
    '''Inverse distance elevation weighting

    Parameters
    ----------
         latlon_dict : dictionary
              the latitude and longitudes of the stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         input_date : string
              the date you want to interpolate for
         var_name : string
              the name of the variable you are interpolating
         shapefile : string
              path to the study area shapefile, including its name
         show : bool
              whether you want to plot a map
         file_path_elev : string
              path to the elevation lookup file
         idx_list : int
              position of the elevation column in the lookup file
         d : int
              the weighting for IDW interpolation
         expand_area : bool
              whether to expand the study area by 200 km per side so that more stations are taken into account
         res : int
              resolution of the grid in metres (default 10000)

    Returns
    ----------
         ndarray
              - the array of values for the interpolated surface
         list
              - the bounds of the array surface, for use in other functions
         ndarray
              - elevation array (for use in the random forest module)
    '''

    # Input: lat lon of station, variable (start day, rainfall, etc), date of interest,variable name (for plotting), show (bool true/false), file path to elevation lookup file
    # idx_list (for the column containing the elevation data), d is the power applied to get the weight
    lat = []  # Initialize empty lists to store data
    lon = []
    Cvar = []
    # lat/lon/Cvar are filled below, once the bounding box is known, so that
    # stations outside of the (possibly expanded) grid can be filtered out
    # and no station is appended twice

    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds  # Get the bounding box of the shapefile
    if expand_area:
        xmax = bounds['maxx'] + 200000
        xmin = bounds['minx'] - 200000
        ymax = bounds['maxy'] + 200000
        ymin = bounds['miny'] - 200000
    else:
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']

    for station_name in Cvar_dict.keys():

        if station_name in latlon_dict.keys():

            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            proj_coord = pyproj.Proj('esri:102001')(
                longitude, latitude)  # Filter out stations outside of grid
            if (proj_coord[1] <= float(ymax[0])
                    and proj_coord[1] >= float(ymin[0])
                    and proj_coord[0] <= float(xmax[0])
                    and proj_coord[0] >= float(xmin[0])):
                cvar_val = Cvar_dict[station_name]
                lat.append(float(latitude))
                lon.append(float(longitude))
                Cvar.append(cvar_val)
    y = np.array(lat)
    x = np.array(lon)
    z = np.array(Cvar)

    pixelHeight = res
    pixelWidth = res

    num_col = int((xmax - xmin) / pixelHeight) + 1
    num_row = int((ymax - ymin) / pixelWidth) + 1

    # We need to project to a projected system before making distance matrix
    # We don't know the source datum, so assume NAD83
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProj, yProj = pyproj.Proj('esri:102001')(
        x, y)  # Convert to Canada Albers Equal Area

    # Add the bounding box coords to the dataset so we can extrapolate the interpolation to cover whole area
    if expand_area:

        yProj_extent = np.append(
            yProj, [bounds['maxy'] + 200000, bounds['miny'] - 200000])
        xProj_extent = np.append(
            xProj, [bounds['maxx'] + 200000, bounds['minx'] - 200000])
    else:
        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

    # Get the value for lat lon in each cell we just made
    Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
    Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

    # Make a rectangular grid (because eventually we will map the values)
    Xi, Yi = np.meshgrid(Xi, Yi)
    # Then we flatten the arrays for easier processing
    Xi, Yi = Xi.flatten(), Yi.flatten()
    maxmin = [
        np.min(yProj_extent),
        np.max(yProj_extent),
        np.max(xProj_extent),
        np.min(xProj_extent)
    ]  # We will later return this for use in other functions

    # vertically stack station x and y vals and then transpose them so they are in pairs
    vals = np.vstack((xProj, yProj)).T

    # Do the same thing for the grid x and y vals
    interpol = np.vstack((Xi, Yi)).T
    # Length of the triangle side from the cell to the point with data
    dist_not = np.subtract.outer(vals[:, 0], interpol[:, 0])
    # Length of the triangle side from the cell to the point with data
    dist_one = np.subtract.outer(vals[:, 1], interpol[:, 1])
    # Euclidean distance, getting the hypotenuse
    distance_matrix = np.hypot(dist_not, dist_one)

    # what if distance is 0 --> np.inf? have to account for the pixel underneath
    weights = 1 / (distance_matrix**d)
    # Making sure to assign the value of the weather station above the pixel directly to the pixel underneath
    weights[np.where(np.isinf(weights))] = 1 / (1.0E-50)
    weights /= weights.sum(axis=0)  # The weights must add up to 1

    # Take the dot product of the weights and the values, in this case the dot product is the sum product over the last axis of Weights.T and z
    Zi = np.dot(weights.T, z)

    # reshape the array into the proper format for the map
    idw_grid = Zi.reshape(num_row, num_col)

    # Elevation weights
    # Lon (X) goes in first deliberately: it matches the coordinate order in the lookup file.
    # Preparing the coordinates to send to the function that will get the elevation grid
    concat = np.array((Xi.flatten(), Yi.flatten())).T
    send_to_list = concat.tolist()
    # The elevation function takes a tuple
    send_to_tuple = [tuple(x) for x in send_to_list]

    Xi1_grd = []
    Yi1_grd = []
    elev_grd = []
    # Get the elevations from the lookup file
    elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple, file_path_elev,
                                               idx_list)

    for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
        x = keys[0]
        y = keys[1]
        Xi1_grd.append(x)
        Yi1_grd.append(y)
        # Append the elevation data to the empty list
        elev_grd.append(elev_grd_dict[keys])

    elev_array = np.array(elev_grd)  # make an elevation array

    elev_dict = GD.finding_data_frm_lookup(
        zip(xProj, yProj), file_path_elev,
        idx_list)  # Get the elevations for the stations

    xProj_input = []
    yProj_input = []
    e_input = []

    for keys in zip(
            xProj,
            yProj):  # Repeat process for just the stations not the whole grid
        x = keys[0]
        y = keys[1]
        xProj_input.append(x)
        yProj_input.append(y)
        e_input.append(elev_dict[keys])

    source_elev = np.array(e_input)

    vals2 = np.vstack(source_elev).T

    interpol2 = np.vstack(elev_array).T

    # Get distance in terms of the elevation (vertical distance) from the station to the point to be interpolated
    dist_not2 = np.subtract.outer(vals2[0], interpol2[0])
    # Take the absolute value, we just care about what is the difference
    dist_not2 = np.absolute(dist_not2)
    weights2 = 1 / (dist_not2**d)  # Get the inverse distance weight
    # In the case of no elevation change
    weights2[np.where(np.isinf(weights2))] = 1
    weights2 /= weights2.sum(axis=0)  # Make weights add up to 1

    # Weight distance as 0.8 and elevation as 0.2
    fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)

    idew_grid = fin.reshape(num_row, num_col)  # Reshape the final array

    if show:  # Plot if show == True
        fig, ax = plt.subplots(figsize=(15, 15))
        crs = {'init': 'esri:102001'}

        na_map = gpd.read_file(shapefile)

        plt.imshow(elev_array.reshape(num_row, num_col),
                   extent=(xProj_extent.min() - 1, xProj_extent.max() + 1,
                           yProj_extent.max() - 1, yProj_extent.min() + 1))
        na_map.plot(ax=ax,
                    color='white',
                    edgecolor='k',
                    linewidth=2,
                    zorder=10,
                    alpha=0.1)

        plt.scatter(xProj, yProj, c=z, edgecolors='k')

        plt.gca().invert_yaxis()
        cbar = plt.colorbar()
        cbar.set_label(var_name)

        title = 'IDEW Interpolation for %s on %s' % (var_name, input_date)
        fig.suptitle(title, fontsize=14)
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')

        plt.show()

    return idew_grid, maxmin, elev_array
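
# Hedged usage sketch for IDEW: the dictionaries, file names, and column index
# below are hypothetical placeholders, shown only to illustrate the argument
# order; real inputs come from the project's station-loading utilities.
#
# latlon = {'STATION_A': [49.1, -84.2], 'STATION_B': [50.3, -86.7]}
# temps = {'STATION_A': 18.5, 'STATION_B': 16.2}
# idew_grid, maxmin, elev_array = IDEW(latlon, temps, '2018-07-01',
#                                      'Temperature', 'study_area.shp',
#                                      show=False,
#                                      file_path_elev='lookup_elev.csv',
#                                      idx_list=9, d=2, expand_area=False)
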
Example No. 10
def cross_validate_IDEW(latlon_dict, Cvar_dict, shapefile, file_path_elev,
                        elev_array, idx_list, d):
    '''Leave-one-out cross-validation procedure for IDEW

    Parameters
    ----------
         latlon_dict : dictionary
              the latitude and longitudes of the stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile, including its name
         file_path_elev : string
              path to the elevation lookup file
         elev_array : ndarray
              array for elevation, created using IDEW interpolation (this is a trick to speed up the code)
         idx_list : int
              position of the elevation column in the lookup file
         d : int
              the weighting for IDW interpolation
              
    Returns
    ----------
         dictionary
              - a dictionary of the absolute error at each station when it was left out
    '''
    x_origin_list = []
    y_origin_list = []

    absolute_error_dictionary = {}  # for plotting
    station_name_list = []
    projected_lat_lon = {}

    for station_name in Cvar_dict.keys():
        if station_name in latlon_dict.keys():
            station_name_list.append(station_name)

            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
            Plat = float(Plat)
            Plon = float(Plon)
            projected_lat_lon[station_name] = [Plat, Plon]

    # Pre-make the elev_dict to speed up code

    latO = []
    lonO = []
    for station_name in sorted(Cvar_dict.keys()):
        if station_name in latlon_dict.keys():
            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            cvar_val = Cvar_dict[station_name]
            latO.append(float(latitude))
            lonO.append(float(longitude))
        else:
            pass

    yO = np.array(latO)
    xO = np.array(lonO)

    # We need to project to a projected system before making distance matrix
    # We don't know the source datum, so assume NAD83
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProjO, yProjO = pyproj.Proj('esri:102001')(xO, yO)
    elev_dict = GD.finding_data_frm_lookup(zip(xProjO, yProjO), file_path_elev,
                                           idx_list)

    for station_name_hold_back in station_name_list:

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in latlon_dict.keys():
                if station_name != station_name_hold_back:
                    loc = latlon_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]
                    lat.append(float(latitude))
                    lon.append(float(longitude))
                    Cvar.append(cvar_val)
                else:

                    pass

        y = np.array(lat)
        x = np.array(lon)
        # what if we add the bounding locations to the array??? ==> that would be extrapolation not interpolation?
        z = np.array(Cvar)

        na_map = gpd.read_file(shapefile)
        bounds = na_map.bounds
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']
        pixelHeight = 10000
        pixelWidth = 10000

        num_col = int((xmax - xmin) / pixelHeight)
        num_row = int((ymax - ymin) / pixelWidth)

        # We need to project to a projected system before making distance matrix
        # We don't know the source datum, so assume NAD83
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()
        maxmin = [
            np.min(yProj_extent),
            np.max(yProj_extent),
            np.max(xProj_extent),
            np.min(xProj_extent)
        ]

        vals = np.vstack((xProj, yProj)).T

        interpol = np.vstack((Xi, Yi)).T
        # Length of the triangle side from the cell to the point with data
        dist_not = np.subtract.outer(vals[:, 0], interpol[:, 0])
        # Length of the triangle side from the cell to the point with data
        dist_one = np.subtract.outer(vals[:, 1], interpol[:, 1])
        # euclidean distance, getting the hypotenuse
        distance_matrix = np.hypot(dist_not, dist_one)

        # what if distance is 0 --> np.inf? have to account for the pixel underneath
        weights = 1 / (distance_matrix**d)
        # Making sure to assign the value of the weather station above the pixel directly to the pixel underneath
        weights[np.where(np.isinf(weights))] = 1 / (1.0E-50)
        weights /= weights.sum(axis=0)

        Zi = np.dot(weights.T, z)
        idw_grid = Zi.reshape(num_row, num_col)

        #elev_dict= GD.finding_data_frm_lookup(zip(xProj, yProj),file_path_elev,idx_list)

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(
                xProj,
                yProj):  # in case there are two stations at the same lat/lon
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        vals2 = np.vstack(source_elev).T

        interpol2 = np.vstack(elev_array).T

        dist_not2 = np.subtract.outer(vals2[0], interpol2[0])
        dist_not2 = np.absolute(dist_not2)
        weights2 = 1 / (dist_not2**d)

        weights2[np.where(np.isinf(weights2))] = 1
        weights2 /= weights2.sum(axis=0)

        fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)

        fin = fin.reshape(num_row, num_col)

        # Calculate the absolute error at the pixel location of the withheld station
        coord_pair = projected_lat_lon[station_name_hold_back]

        x_orig = int(
            (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
        y_orig = int(
            (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
        x_origin_list.append(x_orig)
        y_origin_list.append(y_orig)

        interpolated_val = fin[y_orig][x_orig]

        # Get the original value
        original_val = Cvar_dict[station_name_hold_back]
        # Calc the difference
        absolute_error = abs(interpolated_val - original_val)
        absolute_error_dictionary[station_name_hold_back] = absolute_error

    return absolute_error_dictionary
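
# Hedged usage sketch: reducing the per-station absolute errors returned by
# cross_validate_IDEW to a single MAE (all inputs are placeholders).
#
# errors = cross_validate_IDEW(latlon, temps, 'study_area.shp',
#                              'lookup_elev.csv', elev_array, idx_list=9, d=2)
# mae = sum(errors.values()) / len(errors)
# worst_station = max(errors, key=errors.get)  # largest leave-one-out error
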
Example No. 11
def spatial_groups_IDEW(idw_example_grid, loc_dict, Cvar_dict, shapefile, d,
                        blocknum, nfolds, replacement, dictionary_Groups,
                        file_path_elev, idx_list, elev_array):
    '''Stratified shuffle-split cross-validation procedure

    Parameters
    ----------
         idw_example_grid  : ndarray
              used for reference of study area grid size
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         d : int
              the weighting for IDW interpolation
         blocknum : int
              number of blocks/clusters
         nfolds : int
              number of folds to create (essentially repetitions)
         replacement : bool
              whether or not to use replacement between folds, should usually be true
         dictionary_Groups : dictionary
              dictionary of what groups (clusters) the stations belong to
         file_path_elev : string
              path to the elevation lookup file
         idx_list : int
              position of the elevation column in the lookup file
         elev_array : ndarray
              array for elevation, created using IDEW interpolation (this is a trick to speed up the code)

    Returns
    ----------
         dictionary
              - a dictionary of the absolute error at each fold when it was left out
    '''
    # If not using replacement, keep a record of the folds we have already used
    station_list_used = []
    count = 1
    error_dictionary = {}

    # Premake elevation dictionary to speed up code

    latO = []
    lonO = []
    for station_name in sorted(Cvar_dict.keys()):
        if station_name in loc_dict.keys():
            loc = loc_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            cvar_val = Cvar_dict[station_name]
            latO.append(float(latitude))
            lonO.append(float(longitude))
        else:
            pass

    yO = np.array(latO)
    xO = np.array(lonO)

    # We need to project to a projected system before making distance matrix
    # We don't know the source datum, so assume NAD83
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProjO, yProjO = pyproj.Proj('esri:102001')(xO, yO)
    elev_dict = GD.finding_data_frm_lookup(zip(xProjO, yProjO), file_path_elev,
                                           idx_list)
    while count <= nfolds:
        x_origin_list = []
        y_origin_list = []

        absolute_error_dictionary = {}
        projected_lat_lon = {}

        station_list = Eval.select_random_station(dictionary_Groups, blocknum,
                                                  replacement,
                                                  station_list_used).values()
        if not replacement:
            station_list_used.append(list(station_list))
        # print(station_list_used)

        for station_name in Cvar_dict.keys():

            if station_name in loc_dict.keys():

                loc = loc_dict[station_name]
                latitude = loc[0]
                longitude = loc[1]
                Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
                Plat = float(Plat)
                Plon = float(Plon)
                projected_lat_lon[station_name] = [Plat, Plon]

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in loc_dict.keys():
                if station_name not in station_list:
                    loc = loc_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]
                    lat.append(float(latitude))
                    lon.append(float(longitude))
                    Cvar.append(cvar_val)
                else:

                    pass

        y = np.array(lat)
        x = np.array(lon)
        # what if we add the bounding locations to the array??? ==> that would be extrapolation not interpolation?
        z = np.array(Cvar)

        na_map = gpd.read_file(shapefile)
        bounds = na_map.bounds
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']
        pixelHeight = 10000
        pixelWidth = 10000

        num_col = int((xmax - xmin) / pixelHeight)
        num_row = int((ymax - ymin) / pixelWidth)

        # We need to project to a projected system before making distance matrix
        # We don't know the source datum, so assume NAD83
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()
        maxmin = [
            np.min(yProj_extent),
            np.max(yProj_extent),
            np.max(xProj_extent),
            np.min(xProj_extent)
        ]

        vals = np.vstack((xProj, yProj)).T

        interpol = np.vstack((Xi, Yi)).T
        # Length of the triangle side from the cell to the point with data
        dist_not = np.subtract.outer(vals[:, 0], interpol[:, 0])
        # Length of the triangle side from the cell to the point with data
        dist_one = np.subtract.outer(vals[:, 1], interpol[:, 1])
        # euclidean distance, getting the hypotenuse
        distance_matrix = np.hypot(dist_not, dist_one)

        # what if distance is 0 --> np.inf? have to account for the pixel underneath
        weights = 1 / (distance_matrix**d)
        # Making sure to assign the value of the weather station above the pixel directly to the pixel underneath
        weights[np.where(np.isinf(weights))] = 1 / (1.0E-50)
        weights /= weights.sum(axis=0)

        Zi = np.dot(weights.T, z)
        idw_grid = Zi.reshape(num_row, num_col)

        elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj),
                                               file_path_elev, idx_list)

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(
                xProj,
                yProj):  # in case there are two stations at the same lat/lon
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        vals2 = np.vstack(source_elev).T

        interpol2 = np.vstack(elev_array).T

        dist_not2 = np.subtract.outer(vals2[0], interpol2[0])
        dist_not2 = np.absolute(dist_not2)
        weights2 = 1 / (dist_not2**d)

        weights2[np.where(np.isinf(weights2))] = 1
        weights2 /= weights2.sum(axis=0)

        fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)

        fin = fin.reshape(num_row, num_col)

        # Compare at a certain point
        for statLoc in station_list:

            coord_pair = projected_lat_lon[statLoc]

            x_orig = int(
                (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
            y_orig = int(
                (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
            x_origin_list.append(x_orig)
            y_origin_list.append(y_orig)

            interpolated_val = fin[y_orig][x_orig]

            original_val = Cvar_dict[statLoc]
            absolute_error = abs(interpolated_val - original_val)
            absolute_error_dictionary[statLoc] = absolute_error

        # Average of all the withheld stations in this fold
        error_dictionary[count] = sum(absolute_error_dictionary.values()) / \
            len(absolute_error_dictionary.values())
        # print(absolute_error_dictionary)
        count += 1
    overall_error = sum(error_dictionary.values()) / \
        nfolds  # average of all the runs
    # print(overall_error)
    return overall_error
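
# Hedged usage sketch for spatial_groups_IDEW (placeholder inputs):
# dictionary_Groups would come from the clustering step that assigns each
# station to a spatial block, and idw_grid from a prior IDW/IDEW run.
#
# overall_mae = spatial_groups_IDEW(idw_grid, latlon, temps, 'study_area.shp',
#                                   d=2, blocknum=25, nfolds=10,
#                                   replacement=True,
#                                   dictionary_Groups=groups,
#                                   file_path_elev='lookup_elev.csv',
#                                   idx_list=9, elev_array=elev_array)
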
Example No. 12
def GPR_interpolator(latlon_dict, Cvar_dict, input_date, var_name, shapefile, show,
                     file_path_elev, idx_list, expand_area, kernel_object, restarts, \
                     report_params, optimizer, param_initiate=None, cov_type='RBF',res=10000):
    '''Base interpolator function for gaussian process regression

    Parameters
    ----------
    latlon_dict : dictionary
        the latitude and longitudes of the stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    input_date : string
        the date you want to interpolate for
    var_name : string
        the name of the variable you are interpolating
    shapefile : string
        path to the study area shapefile, including its name
    show : bool
        whether you want to plot a map    
    file_path_elev : string
        file path to the elevation lookup file 
    idx_list : list
        the index of the elevation data column in the lookup file 
    expand_area : bool
        function will expand the study area so that more stations are taken into account (200 km)   
    kernel_object : list
        list containing a string describing the kernel you want to use; if optimizing a set of parameters instead, you can pass an empty list
    restarts : int
        number of times to restart to avoid local optima
    report_params : bool
        if True, outputs optimized values for kernel hyperparameters
    optimizer : bool
        if False, fix parameters of covariance function
    param_initiate : list
        starting parameters for the optimization; these control the extent of the spatial autocorrelation modelled by the process.
        Whether the spatial autocorrelation is the same in all directions depends on the inputs: the kernel parameters must be
        given as a vector, not a scalar. Since we are working in 3D (latitude, longitude, elevation), an anisotropic kernel needs
        a length-3 vector corresponding to [x, y, z]. For an isotropic kernel use [1] (or, with 2 parameters, [[1], [1]]); for an
        anisotropic kernel use [1, 1, 1] (or [[1, 1], [1, 1], [1, 1]])
    cov_type : str
        type of covariance function to use if you have not specified a kernel object
    res : int
        resolution of the grid in metres (default 10000)

    Returns
    ----------
    ndarray
        - an array of the interpolated values
    list
        - the bounds of the array surface, for use in other functions
        (if report_params is True, the fitted kernel parameters are returned instead)
    '''
    lat = []
    lon = []
    Cvar = []

    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds
    if expand_area:
        xmax = bounds['maxx']+200000
        xmin = bounds['minx']-200000
        ymax = bounds['maxy']+200000
        ymin = bounds['miny']-200000
    else:
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']

    for station_name in Cvar_dict.keys():
        if station_name in latlon_dict.keys():

            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            # Filter out stations outside of grid
            proj_coord = pyproj.Proj('esri:102001')(longitude, latitude)
            if (proj_coord[1] <= float(ymax[0]) and proj_coord[1] >=
                float(ymin[0]) and proj_coord[0] <= float(xmax[0]) and
                    proj_coord[0] >= float(xmin[0])):
                cvar_val = Cvar_dict[station_name]
                lat.append(float(latitude))
                lon.append(float(longitude))
                Cvar.append(cvar_val)

    y = np.array(lat)
    x = np.array(lon)
    z = np.array(Cvar)

    pixelHeight = res
    pixelWidth = res

    num_col = int((xmax - xmin) / pixelHeight)
    num_row = int((ymax - ymin) / pixelWidth)

    # We need to project to a projected system before making distance matrix
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProj, yProj = pyproj.Proj('esri:102001')(x, y)

    df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

    if expand_area:

        yProj_extent = np.append(
            yProj, [bounds['maxy']+200000, bounds['miny']-200000])
        xProj_extent = np.append(
            xProj, [bounds['maxx']+200000, bounds['minx']-200000])

    else:
        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

    Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row+1)
    Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col+1)

    Xi, Yi = np.meshgrid(Xi, Yi)
    Xi, Yi = Xi.flatten(), Yi.flatten()

    maxmin = [np.min(yProj_extent), np.max(yProj_extent),
              np.max(xProj_extent), np.min(xProj_extent)]

    # Elevation
    # Preparing the coordinates to send to the function that will get the elevation grid
    concat = np.array((Xi.flatten(), Yi.flatten())).T
    send_to_list = concat.tolist()
    # The elevation function takes a tuple
    send_to_tuple = [tuple(x) for x in send_to_list]

    Xi1_grd = []
    Yi1_grd = []
    elev_grd = []
    # Get the elevations from the lookup file
    elev_grd_dict = GD.finding_data_frm_lookup(
        send_to_tuple, file_path_elev, idx_list)

    for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
        x = keys[0]
        y = keys[1]
        Xi1_grd.append(x)
        Yi1_grd.append(y)
        # Append the elevation data to the empty list
        elev_grd.append(elev_grd_dict[keys])

    elev_array = np.array(elev_grd)  # make an elevation array

    elev_dict = GD.finding_data_frm_lookup(zip(
        xProj, yProj), file_path_elev, idx_list)  # Get the elevations for the stations

    xProj_input = []
    yProj_input = []
    e_input = []

    for keys in zip(xProj, yProj):  # Repeat process for just the stations not the whole grid
        x = keys[0]
        y = keys[1]
        xProj_input.append(x)
        yProj_input.append(y)
        e_input.append(elev_dict[keys])

    source_elev = np.array(e_input)

    Xi1_grd = np.array(Xi1_grd)
    Yi1_grd = np.array(Yi1_grd)

    df_trainX = pd.DataFrame(
        {'xProj': xProj, 'yProj': yProj, 'elevS': source_elev, 'var': z})

    df_testX = pd.DataFrame({'Xi': Xi1_grd, 'Yi': Yi1_grd, 'elev': elev_array})

    if param_initiate is not None:

        if len(param_initiate) > 1:

            kernels = [1.0 * RBF(length_scale=param_initiate[0]), 1.0 * RationalQuadratic(length_scale=param_initiate[0][0], alpha=param_initiate[0][1]),
                       1.0 * Matern(length_scale=param_initiate[0], nu=param_initiate[1], length_scale_bounds=(1000, 500000))]  # Temp =(100,500000) #RH = (1000,500000)
        # Optimizer: the 'L-BFGS-B' algorithm
        else:

            kernels = [1.0 * RBF(length_scale=param_initiate[0])]

        if cov_type == 'RationalQuadratic':
            if optimizer:
                # Updated Nov 23 for fire season manuscript to make 3 restarts, Dec 9 = 5
                reg = GaussianProcessRegressor(
                    kernel=kernels[1], normalize_y=True, n_restarts_optimizer=restarts)
            else:
                reg = GaussianProcessRegressor(
                    kernel=kernels[1], normalize_y=True, n_restarts_optimizer=restarts, optimizer=None)
        elif cov_type == 'RBF':
            if optimizer:
                # Updated Nov 23 for fire season manuscript to make 3 restarts, Dec 9 = 5
                reg = GaussianProcessRegressor(
                    kernel=kernels[0], normalize_y=True, n_restarts_optimizer=restarts)
            else:
                reg = GaussianProcessRegressor(
                    kernel=kernels[0], normalize_y=True, n_restarts_optimizer=restarts, optimizer=None)
        elif cov_type == 'Matern':

            if optimizer:
                # Updated Nov 23 for fire season manuscript to make 3 restarts, Dec 9 = 5
                reg = GaussianProcessRegressor(
                    kernel=kernels[2], normalize_y=True, n_restarts_optimizer=restarts)
            else:
                #kernels = [307**2 * Matern(length_scale=[5e+05, 6.62e+04, 1.07e+04], nu=0.5)]
                #kernels = [316**2 * Matern(length_scale=[5e+05, 5e+05, 6.01e+03], nu=0.5)]
                #kernels = [316**2 * Matern(length_scale=[5e+05, 5e+05, 4.67e+05], nu=0.5)]
                # Use the Matern kernel built above (index 2), not the RBF at index 0
                reg = GaussianProcessRegressor(
                    kernel=kernels[2], normalize_y=True, n_restarts_optimizer=restarts, optimizer=None)
    else:
        kernels = [eval(kernel_object[0])]
        reg = GaussianProcessRegressor(
            kernel=kernels[0], normalize_y=True, n_restarts_optimizer=0, optimizer=None)

    y = np.array(df_trainX['var']).reshape(-1, 1)
    X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
    X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

    reg.fit(X_train, y)
    fitted_params = reg.kernel_
    score = reg.score(X_train, y)
    print(fitted_params)
    print(score)

    Zi = reg.predict(X_test)

    gpr_grid = Zi.reshape(num_row+1, num_col+1)

    if show:
        fig, ax = plt.subplots(figsize=(15, 15))
        crs = {'init': 'esri:102001'}

        na_map = gpd.read_file(shapefile)

        plt.imshow(gpr_grid, extent=(xProj_extent.min(
        )-1, xProj_extent.max()+1, yProj_extent.max()-1, yProj_extent.min()+1))
        na_map.plot(ax=ax, color='white', edgecolor='k',
                    linewidth=2, zorder=10, alpha=0.1)

        plt.scatter(xProj, yProj, c=z, edgecolors='k')

        plt.gca().invert_yaxis()
        cbar = plt.colorbar()
        cbar.set_label(var_name)

        title = 'GPR Interpolation for %s on %s' % (var_name, input_date)
        fig.suptitle(title, fontsize=14)
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')

        plt.show()

    if report_params:
        return fitted_params

    else:
        return gpr_grid, maxmin
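
# Hedged sketch of two ways to call GPR_interpolator (all inputs are
# placeholders). An anisotropic Matern kernel needs one length scale per input
# dimension (x, y, elevation), matching the param_initiate note in the
# docstring.
#
# (1) Let the optimizer fit an anisotropic Matern from a starting guess:
# grid, maxmin = GPR_interpolator(latlon, temps, '2018-07-01', 'Temperature',
#                                 'study_area.shp', show=False,
#                                 file_path_elev='lookup_elev.csv', idx_list=9,
#                                 expand_area=False, kernel_object=[],
#                                 restarts=5, report_params=False,
#                                 optimizer=True,
#                                 param_initiate=[[1e5, 1e5, 1e3], 0.5],
#                                 cov_type='Matern')
#
# (2) Fix a previously fitted kernel, passed as a string to be eval'd:
# grid, maxmin = GPR_interpolator(latlon, temps, '2018-07-01', 'Temperature',
#                                 'study_area.shp', show=False,
#                                 file_path_elev='lookup_elev.csv', idx_list=9,
#                                 expand_area=False,
#                                 kernel_object=['316**2 * Matern(length_scale=[5e+05, 5e+05, 6.01e+03], nu=0.5)'],
#                                 restarts=0, report_params=False,
#                                 optimizer=False)
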
Example No. 13
def cross_validate_gpr(latlon_dict, Cvar_dict, shapefile, file_path_elev, elev_array, idx_list, cov_function):
    '''Leave-one-out cross-validation procedure for GPR

    Parameters
    ----------

    latlon_dict : dictionary
        the latitude and longitudes of the stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile, including its name
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        array for elevation, created using IDEW interpolation (this is a trick to speed up the code)
    idx_list : int
        position of the elevation column in the lookup file
    cov_function : list
        list containing a string that describes the input covariance function, similar to: ['316**2 * Matern(length_scale=[5e+05, 5e+05, 6.01e+03], nu=0.5)']

    Returns
    ----------
    dictionary
        - a dictionary of the absolute error at each station when it was left out
    '''
    x_origin_list = []
    y_origin_list = []

    absolute_error_dictionary = {}  # for plotting
    station_name_list = []
    projected_lat_lon = {}

    for station_name in Cvar_dict.keys():
        if station_name in latlon_dict.keys():
            station_name_list.append(station_name)

            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
            Plat = float(Plat)
            Plon = float(Plon)
            projected_lat_lon[station_name] = [Plat, Plon]

    for station_name_hold_back in station_name_list:

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in latlon_dict.keys():
                if station_name != station_name_hold_back:
                    loc = latlon_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]
                    lat.append(float(latitude))
                    lon.append(float(longitude))
                    Cvar.append(cvar_val)
                else:

                    pass

        y = np.array(lat)
        x = np.array(lon)
        z = np.array(Cvar)

        na_map = gpd.read_file(shapefile)
        bounds = na_map.bounds
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']
        pixelHeight = 10000
        pixelWidth = 10000

        num_col = int((xmax - xmin) / pixelHeight)+1
        num_row = int((ymax - ymin) / pixelWidth)+1

        # We need to project to a projected system before making distance matrix
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        maxmin = [np.min(yProj_extent), np.max(yProj_extent),
                  np.max(xProj_extent), np.min(xProj_extent)]

        # Elevation
        # Preparing the coordinates to send to the function that will get the elevation grid
        concat = np.array((Xi.flatten(), Yi.flatten())).T
        send_to_list = concat.tolist()
        # The elevation function takes a tuple
        send_to_tuple = [tuple(x) for x in send_to_list]

        Xi1_grd = []
        Yi1_grd = []
        elev_grd = []
        # Get the elevations from the lookup file
        elev_grd_dict = GD.finding_data_frm_lookup(
            send_to_tuple, file_path_elev, idx_list)

        for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
            x = keys[0]
            y = keys[1]
            Xi1_grd.append(x)
            Yi1_grd.append(y)
            # Append the elevation data to the empty list
            elev_grd.append(elev_grd_dict[keys])

        elev_array = np.array(elev_grd)  # make an elevation array

        elev_dict = GD.finding_data_frm_lookup(zip(
            xProj, yProj), file_path_elev, idx_list)  # Get the elevations for the stations

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(xProj, yProj):  # Repeat process for just the stations not the whole grid
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        Xi1_grd = np.array(Xi1_grd)
        Yi1_grd = np.array(Yi1_grd)

        df_trainX = pd.DataFrame(
            {'xProj': xProj, 'yProj': yProj, 'elevS': source_elev, 'var': z})

        df_testX = pd.DataFrame(
            {'Xi': Xi1_grd, 'Yi': Yi1_grd, 'elev': elev_array})

        #kernels = [1.0 * RationalQuadratic(length_scale=1.0, alpha=alpha_input)]
        #kernels = [multiplier**exponent * Matern(length_scale=length_scale_list,nu=param_initiate[1],length_scale_bounds='fixed')]
        #kernels = [params]

        # Temperature
        #kernels = [316**2 * Matern(length_scale=[5e+05, 5e+05, 6.01e+03], nu=0.5)]

        # RH
        #kernels = [307**2 * Matern(length_scale=[9.51e+04, 9.58e+04, 3.8e+05], nu=0.5)]

        # Wind

        #kernels = [316**2 * Matern(length_scale=[5e+05, 6.62e+04, 1.07e+04], nu=0.5)]
        kernels = [eval(cov_function[0])]
        reg = GaussianProcessRegressor(
            kernel=kernels[0], normalize_y=True, n_restarts_optimizer=0, optimizer=None)

        y = np.array(df_trainX['var']).reshape(-1, 1)
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

        reg.fit(X_train, y)

        Zi = reg.predict(X_test)

        gpr_grid = Zi.reshape(num_row, num_col)

        # Calculate the absolute error at the pixel location of the withheld station
        coord_pair = projected_lat_lon[station_name_hold_back]

        x_orig = int(
            (coord_pair[0] - float(bounds['minx']))/pixelHeight)  # lon
        y_orig = int((coord_pair[1] - float(bounds['miny']))/pixelWidth)  # lat
        x_origin_list.append(x_orig)
        y_origin_list.append(y_orig)

        interpolated_val = gpr_grid[y_orig][x_orig]

        original_val = Cvar_dict[station_name_hold_back]
        absolute_error = abs(interpolated_val-original_val)
        absolute_error_dictionary[station_name_hold_back] = absolute_error

    return absolute_error_dictionary
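
# Hedged usage sketch: leave-one-out GPR validation with a fixed covariance
# function, reduced to an MAE (all inputs are placeholders).
#
# cov = ['316**2 * Matern(length_scale=[5e+05, 5e+05, 6.01e+03], nu=0.5)']
# errors = cross_validate_gpr(latlon, temps, 'study_area.shp',
#                             'lookup_elev.csv', elev_array, idx_list=9,
#                             cov_function=cov)
# mae = sum(errors.values()) / len(errors)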