Example #1
def shuffle_split_rf(latlon_dict,
                     Cvar_dict,
                     shapefile,
                     file_path_elev,
                     elev_array,
                     idx_list,
                     rep,
                     res=10000):
    '''Shuffle-split cross-validation with a 50/50 train/test split

    Parameters
    ----------

    latlon_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        array for elevation, created using IDEW interpolation (passed in to avoid recomputing it)
    idx_list : int
        position of the elevation column in the lookup file
    rep : int
        number of replications
    res : int
        pixel size of the interpolation grid in metres (default 10000)
    Returns
    ----------
    float
        - MAE estimate for entire surface (average of replications)
    '''
    count = 1
    error_dictionary = {}
    while count <= rep:
        x_origin_list = []
        y_origin_list = []

        absolute_error_dictionary = {}  # for plotting
        station_name_list = []
        projected_lat_lon = {}

        for station_name in Cvar_dict.keys():
            if station_name in latlon_dict.keys():
                station_name_list.append(station_name)

                loc = latlon_dict[station_name]
                latitude = loc[0]
                longitude = loc[1]
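                # Note: pyproj returns projected (x, y); despite the names,
                # Plat holds the easting and Plon the northing here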
                Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
                Plat = float(Plat)
                Plon = float(Plon)
                projected_lat_lon[station_name] = [Plat, Plon]

        # We can't just use Cvar_dict.keys() because some stations lack a valid lat/lon
        stations_input = []
        for station_code in Cvar_dict.keys():
            if station_code in latlon_dict.keys():
                stations_input.append(station_code)
        # Split the stations in two
        stations = np.array(stations_input)
        # The split won't be exactly 50/50 if the number of stations is odd
        splits = ShuffleSplit(n_splits=1, train_size=.5)

        for train_index, test_index in splits.split(stations):

            train_stations = stations[train_index]
            # print(train_stations)
            test_stations = stations[test_index]
            # print(test_stations)

        # Sanity check: the train and test sets must not overlap

        for val in train_stations:
            if val in test_stations:
                print('Error, the train and test sets overlap!')
                sys.exit()

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in latlon_dict.keys():
                if station_name not in test_stations:
                    loc = latlon_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]
                    lat.append(float(latitude))
                    lon.append(float(longitude))
                    Cvar.append(cvar_val)
                else:
                    pass

        y = np.array(lat)
        x = np.array(lon)
        z = np.array(Cvar)

        na_map = gpd.read_file(shapefile)
        bounds = na_map.bounds
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']
        pixelHeight = res
        pixelWidth = res

        num_col = int((xmax - xmin) / pixelHeight) + 1
        num_row = int((ymax - ymin) / pixelWidth) + 1

        # Project to a planar coordinate system before computing the distance matrix
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        maxmin = [
            np.min(yProj_extent),
            np.max(yProj_extent),
            np.max(xProj_extent),
            np.min(xProj_extent)
        ]

        # Elevation
        # Preparing the coordinates to send to the function that will get the elevation grid
        concat = np.array((Xi.flatten(), Yi.flatten())).T
        send_to_list = concat.tolist()
        # The elevation function takes a tuple
        send_to_tuple = [tuple(x) for x in send_to_list]

        Xi1_grd = []
        Yi1_grd = []
        elev_grd = []
        # Get the elevations from the lookup file
        elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple,
                                                   file_path_elev, idx_list)

        for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
            x = keys[0]
            y = keys[1]
            Xi1_grd.append(x)
            Yi1_grd.append(y)
            # Append the elevation data to the empty list
            elev_grd.append(elev_grd_dict[keys])

        elev_array = np.array(elev_grd)  # make an elevation array

        elev_dict = GD.finding_data_frm_lookup(
            zip(xProj, yProj), file_path_elev,
            idx_list)  # Get the elevations for the stations

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(
                xProj, yProj
        ):  # Repeat process for just the stations not the whole grid
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        Xi1_grd = np.array(Xi1_grd)
        Yi1_grd = np.array(Yi1_grd)

        df_trainX = pd.DataFrame({
            'xProj': xProj,
            'yProj': yProj,
            'elevS': source_elev,
            'var': z
        })

        df_testX = pd.DataFrame({
            'Xi': Xi1_grd,
            'Yi': Yi1_grd,
            'elev': elev_array
        })

        reg = RandomForestRegressor(n_estimators=100,
                                    max_features='sqrt',
                                    random_state=1)

        y = np.array(df_trainX['var']).reshape(-1, 1)
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

        reg.fit(X_train, y)

        Zi = reg.predict(X_test)

        rf_grid = Zi.reshape(num_row, num_col)

        # Compute the absolute error at each withheld station's pixel
        for statLoc in test_stations:
            coord_pair = projected_lat_lon[statLoc]

            x_orig = int(
                (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
            y_orig = int(
                (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
            x_origin_list.append(x_orig)
            y_origin_list.append(y_orig)

            try:

                interpolated_val = rf_grid[y_orig][x_orig]

                original_val = Cvar_dict[statLoc]
                absolute_error = abs(interpolated_val - original_val)
                absolute_error_dictionary[statLoc] = absolute_error
            except IndexError:
                pass  # the withheld station falls outside the interpolation grid

        # Average absolute error over all withheld stations for this replication
        error_dictionary[count] = (sum(absolute_error_dictionary.values()) /
                                   len(absolute_error_dictionary))
        count += 1

    overall_error = sum(error_dictionary.values()) / rep

    return overall_error
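
A minimal usage sketch, assuming the function above is importable; every path and dictionary below is a placeholder, not real data:

# Hypothetical inputs for illustration only
latlon_dict = {'STN_A': (49.1, -84.2), 'STN_B': (50.3, -86.7)}  # station -> (lat, lon)
Cvar_dict = {'STN_A': 12.4, 'STN_B': 10.9}  # station -> weather variable value
mae = shuffle_split_rf(latlon_dict, Cvar_dict,
                       'study_area.shp',    # placeholder shapefile path
                       'elev_lookup.csv',   # placeholder elevation lookup file
                       None,                # elev_array is rebuilt inside the function
                       1,                   # assumed elevation column index
                       rep=10)
print('Mean MAE over 10 replications:', mae)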
Example #2
def spatial_kfold_rf(idw_example_grid, loc_dict, Cvar_dict, shapefile,
                     file_path_elev, elev_array, idx_list, block_num,
                     blocking_type, return_error):
    '''Spatially blocked k-fold cross-validation procedure for RF

    Parameters
    ----------
         idw_example_grid  : ndarray
              used for reference of study area grid size
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         file_path_elev : string
              path to the elevation lookup file
         elev_array : ndarray
              array for elevation, created using IDEW interpolation (passed in to avoid recomputing it)
         idx_list : int
              position of the elevation column in the lookup file
         block_num : int
              number of blocks/clusters
         blocking_type : string
              whether to use clusters or blocks
         return_error : bool
              whether or not to return the error dictionary
              
    Returns
    ----------
         float
              - MAE estimate for entire surface
         int
              - the block number, returned so it can later be written to file for tracking
         dictionary
              - if return_error = True, a dictionary of the absolute error at each fold when it was left out
    '''
    # If not using replacement, keep a record of which groups have been done
    groups_complete = []
    error_dictionary = {}

    x_origin_list = []
    y_origin_list = []

    absolute_error_dictionary = {}
    projected_lat_lon = {}

    # Select the blocking scheme
    if blocking_type == 'cluster':
        cluster = c3d.spatial_cluster(loc_dict, Cvar_dict, shapefile,
                                      block_num, file_path_elev, idx_list,
                                      False, False, False)
    elif blocking_type == 'block':
        # Get the numpy array that delineates the blocks
        np_array_blocks = mbk.make_block(idw_example_grid, block_num)
        cluster = mbk.sorting_stations(np_array_blocks, shapefile, loc_dict,
                                       Cvar_dict)  # Now get the dictionary
    else:
        print('That is not a valid blocking method')
        sys.exit()

    for group in cluster.values():
        if group not in groups_complete:
            station_list = [k for k, v in cluster.items() if v == group]
            groups_complete.append(group)

    for station_name in Cvar_dict.keys():
        if station_name in loc_dict.keys():

            loc = loc_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
            Plat = float(Plat)
            Plon = float(Plon)
            projected_lat_lon[station_name] = [Plat, Plon]

    lat = []
    lon = []
    Cvar = []
    for station_name in sorted(Cvar_dict.keys()):
        if station_name in loc_dict.keys():
            if station_name not in station_list:
                loc = loc_dict[station_name]
                latitude = loc[0]
                longitude = loc[1]
                cvar_val = Cvar_dict[station_name]
                lat.append(float(latitude))
                lon.append(float(longitude))
                Cvar.append(cvar_val)
            else:
                pass

    y = np.array(lat)
    x = np.array(lon)
    z = np.array(Cvar)

    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds
    xmax = bounds['maxx']
    xmin = bounds['minx']
    ymax = bounds['maxy']
    ymin = bounds['miny']
    pixelHeight = 10000
    pixelWidth = 10000

    num_col = int((xmax - xmin) / pixelHeight)
    num_row = int((ymax - ymin) / pixelWidth)

    # Project to a planar coordinate system before computing the distance matrix
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProj, yProj = pyproj.Proj('esri:102001')(x, y)

    df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

    yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
    xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

    Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
    Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

    Xi, Yi = np.meshgrid(Xi, Yi)
    Xi, Yi = Xi.flatten(), Yi.flatten()

    maxmin = [
        np.min(yProj_extent),
        np.max(yProj_extent),
        np.max(xProj_extent),
        np.min(xProj_extent)
    ]

    # Elevation
    # Preparing the coordinates to send to the function that will get the elevation grid
    concat = np.array((Xi.flatten(), Yi.flatten())).T
    send_to_list = concat.tolist()
    # The elevation function takes a tuple
    send_to_tuple = [tuple(x) for x in send_to_list]

    Xi1_grd = []
    Yi1_grd = []
    elev_grd = []
    # Get the elevations from the lookup file
    elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple, file_path_elev,
                                               idx_list)

    for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
        x = keys[0]
        y = keys[1]
        Xi1_grd.append(x)
        Yi1_grd.append(y)
        # Append the elevation data to the empty list
        elev_grd.append(elev_grd_dict[keys])

    elev_array = np.array(elev_grd)  # make an elevation array

    elev_dict = GD.finding_data_frm_lookup(
        zip(xProj, yProj), file_path_elev,
        idx_list)  # Get the elevations for the stations

    xProj_input = []
    yProj_input = []
    e_input = []

    for keys in zip(
            xProj,
            yProj):  # Repeat process for just the stations not the whole grid
        x = keys[0]
        y = keys[1]
        xProj_input.append(x)
        yProj_input.append(y)
        e_input.append(elev_dict[keys])

    source_elev = np.array(e_input)

    Xi1_grd = np.array(Xi1_grd)
    Yi1_grd = np.array(Yi1_grd)

    df_trainX = pd.DataFrame({
        'xProj': xProj,
        'yProj': yProj,
        'elevS': source_elev,
        'var': z
    })

    df_testX = pd.DataFrame({'Xi': Xi1_grd, 'Yi': Yi1_grd, 'elev': elev_array})

    reg = RandomForestRegressor(n_estimators=100,
                                max_features='sqrt',
                                random_state=1)

    y = np.array(df_trainX['var']).reshape(-1, 1)
    X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
    X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

    reg.fit(X_train, y)

    Zi = reg.predict(X_test)

    rf_grid = Zi.reshape(num_row, num_col)

    # Compute the absolute error at each withheld station's pixel
    for statLoc in station_list:
        coord_pair = projected_lat_lon[statLoc]

        x_orig = int(
            (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
        y_orig = int(
            (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
        x_origin_list.append(x_orig)
        y_origin_list.append(y_orig)

        interpolated_val = rf_grid[y_orig][x_orig]

        original_val = Cvar_dict[statLoc]
        absolute_error = abs(interpolated_val - original_val)
        absolute_error_dictionary[statLoc] = absolute_error

    # MAE = average absolute error over all withheld stations
    MAE = (sum(absolute_error_dictionary.values()) /
           len(absolute_error_dictionary))
    if return_error:
        return block_num, MAE, absolute_error_dictionary
    else:
        return block_num, MAE
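
A minimal usage sketch for the blocked variant (placeholder inputs; idw_example_grid would normally come from a prior IDW interpolation of the same study area):

block_num, mae = spatial_kfold_rf(idw_example_grid,  # e.g. output grid of an IDW run
                                  latlon_dict, Cvar_dict,
                                  'study_area.shp',  # placeholder paths
                                  'elev_lookup.csv',
                                  None, 1,           # elev_array is rebuilt internally; assumed column index
                                  block_num=25,
                                  blocking_type='block',
                                  return_error=False)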
Example #3
def cross_validate_rf(latlon_dict, Cvar_dict, shapefile, file_path_elev,
                      elev_array, idx_list, pass_to_plot):
    '''Leave-one-out cross-validation procedure for RF

    Parameters
    ----------
    
         latlon_dict : dictionary
              the latitude and longitudes of the stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile, including its name
         file_path_elev : string
              path to the elevation lookup file
         elev_array : ndarray
              array for elevation, created using IDEW interpolation (passed in to avoid recomputing it)
         idx_list : int
              position of the elevation column in the lookup file
         pass_to_plot : bool
              whether the errors will be plotted, in which case a version without the absolute value is also returned (e.g. for fire season days)
              
    Returns
    ----------
         dictionary
              - a dictionary of the absolute error at each station when it was left out
         dictionary
              - if pass_to_plot = True, returns a dictionary without the absolute value of the error, for example for plotting fire season error
     '''
    x_origin_list = []
    y_origin_list = []

    absolute_error_dictionary = {}  # for plotting
    no_absolute_value_dict = {}  # to see whether under or over estimation
    station_name_list = []
    projected_lat_lon = {}

    for station_name in Cvar_dict.keys():
        if station_name in latlon_dict.keys():
            station_name_list.append(station_name)

            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
            Plat = float(Plat)
            Plon = float(Plon)
            projected_lat_lon[station_name] = [Plat, Plon]

    for station_name_hold_back in station_name_list:

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in latlon_dict.keys():
                if station_name != station_name_hold_back:
                    loc = latlon_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]
                    lat.append(float(latitude))
                    lon.append(float(longitude))
                    Cvar.append(cvar_val)
                else:
                    pass

        y = np.array(lat)
        x = np.array(lon)
        z = np.array(Cvar)

        na_map = gpd.read_file(shapefile)
        bounds = na_map.bounds
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']
        pixelHeight = 10000
        pixelWidth = 10000

        num_col = int((xmax - xmin) / pixelHeight)
        num_row = int((ymax - ymin) / pixelWidth)

        # Project to a planar coordinate system before computing the distance matrix
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        maxmin = [
            np.min(yProj_extent),
            np.max(yProj_extent),
            np.max(xProj_extent),
            np.min(xProj_extent)
        ]

        # Elevation
        # Preparing the coordinates to send to the function that will get the elevation grid
        concat = np.array((Xi.flatten(), Yi.flatten())).T
        send_to_list = concat.tolist()
        # The elevation function takes a tuple
        send_to_tuple = [tuple(x) for x in send_to_list]

        Xi1_grd = []
        Yi1_grd = []
        elev_grd = []
        # Get the elevations from the lookup file
        elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple,
                                                   file_path_elev, idx_list)

        for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
            x = keys[0]
            y = keys[1]
            Xi1_grd.append(x)
            Yi1_grd.append(y)
            # Append the elevation data to the empty list
            elev_grd.append(elev_grd_dict[keys])

        elev_array = np.array(elev_grd)  # make an elevation array

        elev_dict = GD.finding_data_frm_lookup(
            zip(xProj, yProj), file_path_elev,
            idx_list)  # Get the elevations for the stations

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(
                xProj, yProj
        ):  # Repeat process for just the stations not the whole grid
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        Xi1_grd = np.array(Xi1_grd)
        Yi1_grd = np.array(Yi1_grd)

        df_trainX = pd.DataFrame({
            'xProj': xProj,
            'yProj': yProj,
            'elevS': source_elev,
            'var': z
        })

        df_testX = pd.DataFrame({
            'Xi': Xi1_grd,
            'Yi': Yi1_grd,
            'elev': elev_array
        })

        reg = RandomForestRegressor(n_estimators=100,
                                    max_features='sqrt',
                                    random_state=1)

        y = np.array(df_trainX['var']).reshape(-1, 1)
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

        reg.fit(X_train, y)

        Zi = reg.predict(X_test)

        rf_grid = Zi.reshape(num_row, num_col)

        # Compute the absolute error at the withheld station's pixel
        coord_pair = projected_lat_lon[station_name_hold_back]

        x_orig = int(
            (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
        y_orig = int(
            (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
        x_origin_list.append(x_orig)
        y_origin_list.append(y_orig)

        interpolated_val = rf_grid[y_orig][x_orig]

        original_val = Cvar_dict[station_name_hold_back]
        absolute_error = abs(interpolated_val - original_val)
        absolute_error_dictionary[station_name_hold_back] = absolute_error
        no_absolute_value_dict[
            station_name_hold_back] = interpolated_val - original_val
    if pass_to_plot:
        return absolute_error_dictionary, no_absolute_value_dict
    else:
        return absolute_error_dictionary
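
A minimal leave-one-out sketch (placeholder inputs); the returned dictionary is keyed by the withheld station:

abs_err = cross_validate_rf(latlon_dict, Cvar_dict,
                            'study_area.shp',   # placeholder paths
                            'elev_lookup.csv',
                            None, 1,            # elev_array is rebuilt internally; assumed column index
                            pass_to_plot=False)
worst = max(abs_err, key=abs_err.get)
print('Largest LOO error:', worst, abs_err[worst])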
Example #4
def random_forest_interpolator(latlon_dict, Cvar_dict, input_date, var_name,
                               shapefile, show, file_path_elev, idx_list,
                               expand_area, res=10000):
    '''Random forest interpolation

    Parameters
    ----------
         latlon_dict : dictionary
              the latitude and longitudes of the stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         input_date : string
              the date you want to interpolate for
         var_name : string
              the name of the variable you are interpolating
         shapefile : string
              path to the study area shapefile, including its name
         show : bool
              whether you want to plot a map
         file_path_elev : string
              path to the elevation lookup file
         idx_list : int
              position of the elevation column in the lookup file
         expand_area : bool
              whether to expand the study area by 200 km so that more stations are taken into account
         res : int
              pixel size of the interpolation grid in metres (default 10000)
              
    Returns
    ----------
         ndarray
              - the array of values for the interpolated surface
         list
              - the bounds of the array surface, for use in other functions
     '''
    lat = []
    lon = []
    Cvar = []

    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds
    if expand_area:
        xmax = bounds['maxx'] + 200000
        xmin = bounds['minx'] - 200000
        ymax = bounds['maxy'] + 200000
        ymin = bounds['miny'] - 200000
    else:
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']

    for station_name in Cvar_dict.keys():
        if station_name in latlon_dict.keys():

            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            # Filter out stations outside of grid
            proj_coord = pyproj.Proj('esri:102001')(longitude, latitude)
            if (proj_coord[1] <= float(ymax[0])
                    and proj_coord[1] >= float(ymin[0])
                    and proj_coord[0] <= float(xmax[0])
                    and proj_coord[0] >= float(xmin[0])):
                cvar_val = Cvar_dict[station_name]
                lat.append(float(latitude))
                lon.append(float(longitude))
                Cvar.append(cvar_val)

    y = np.array(lat)
    x = np.array(lon)
    z = np.array(Cvar)

    pixelHeight = res
    pixelWidth = res

    num_col = int((xmax - xmin) / pixelHeight)
    num_row = int((ymax - ymin) / pixelWidth)

    # Project to a planar coordinate system before computing the distance matrix
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProj, yProj = pyproj.Proj('esri:102001')(x, y)

    df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

    if expand_area:

        yProj_extent = np.append(
            yProj, [bounds['maxy'] + 200000, bounds['miny'] - 200000])
        xProj_extent = np.append(
            xProj, [bounds['maxx'] + 200000, bounds['minx'] - 200000])

    else:
        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

    Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row + 1)
    Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col + 1)

    Xi, Yi = np.meshgrid(Xi, Yi)
    Xi, Yi = Xi.flatten(), Yi.flatten()

    maxmin = [
        np.min(yProj_extent),
        np.max(yProj_extent),
        np.max(xProj_extent),
        np.min(xProj_extent)
    ]

    # Elevation
    # Preparing the coordinates to send to the function that will get the elevation grid
    concat = np.array((Xi.flatten(), Yi.flatten())).T
    send_to_list = concat.tolist()
    # The elevation function takes a tuple
    send_to_tuple = [tuple(x) for x in send_to_list]

    Xi1_grd = []
    Yi1_grd = []
    elev_grd = []
    # Get the elevations from the lookup file
    elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple, file_path_elev,
                                               idx_list)

    for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
        x = keys[0]
        y = keys[1]
        Xi1_grd.append(x)
        Yi1_grd.append(y)
        # Append the elevation data to the empty list
        elev_grd.append(elev_grd_dict[keys])

    elev_array = np.array(elev_grd)  # make an elevation array

    elev_dict = GD.finding_data_frm_lookup(
        zip(xProj, yProj), file_path_elev,
        idx_list)  # Get the elevations for the stations

    xProj_input = []
    yProj_input = []
    e_input = []

    for keys in zip(
            xProj,
            yProj):  # Repeat process for just the stations not the whole grid
        x = keys[0]
        y = keys[1]
        xProj_input.append(x)
        yProj_input.append(y)
        e_input.append(elev_dict[keys])

    source_elev = np.array(e_input)

    Xi1_grd = np.array(Xi1_grd)
    Yi1_grd = np.array(Yi1_grd)

    df_trainX = pd.DataFrame({
        'xProj': xProj,
        'yProj': yProj,
        'elevS': source_elev,
        'var': z
    })

    df_testX = pd.DataFrame({'Xi': Xi1_grd, 'Yi': Yi1_grd, 'elev': elev_array})

    reg = RandomForestRegressor(n_estimators=100,
                                max_features='sqrt',
                                random_state=1)

    y = np.array(df_trainX['var']).reshape(-1, 1)
    X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
    X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

    reg.fit(X_train, y)

    Zi = reg.predict(X_test)

    rf_grid = Zi.reshape(num_row + 1, num_col + 1)

    if show:
        fig, ax = plt.subplots(figsize=(15, 15))
        crs = {'init': 'esri:102001'}

        na_map = gpd.read_file(shapefile)

        plt.imshow(rf_grid,
                   extent=(xProj_extent.min() - 1, xProj_extent.max() + 1,
                           yProj_extent.max() - 1, yProj_extent.min() + 1))
        na_map.plot(ax=ax,
                    color='white',
                    edgecolor='k',
                    linewidth=2,
                    zorder=10,
                    alpha=0.1)

        plt.scatter(xProj, yProj, c=z, edgecolors='k')

        plt.gca().invert_yaxis()
        cbar = plt.colorbar()
        cbar.set_label(var_name)

        title = 'RF Interpolation for %s on %s' % (var_name, input_date)
        fig.suptitle(title, fontsize=14)
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')

        plt.show()

    return rf_grid, maxmin
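
A minimal sketch of generating one surface (all inputs below are placeholders):

surface, surface_bounds = random_forest_interpolator(
    latlon_dict, Cvar_dict,
    '2018-07-01',        # hypothetical date string
    'Temperature',       # hypothetical variable name
    'study_area.shp',    # placeholder paths
    show=True,
    file_path_elev='elev_lookup.csv',
    idx_list=1,          # assumed column index
    expand_area=False)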
Example #5
def spatial_cluster(loc_dict, Cvar_dict, shapefile, cluster_num, file_path_elev, idx_list,
                    plot_2D, plot_3D, return_all):
    '''Spatial clustering based on scikit learn's agglomerative clustering

    Parameters
    ----------
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         cluster_num : int
              number of clusters
         file_path_elev : string
              path to the elevation lookup file
         idx_list : int
              position of the elevation column in the lookup file
         plot_2D : bool
              whether to plot maps of the clusters in 2d
         plot_3D : bool
              whether to plot maps of the clusters in 3d             
         return_all : bool
            whether or not to return all the outputs (needed for selecting cluster size)
            
    Returns
    ----------
         dictionary
             - a dictionary of the cluster that each station is in
    '''

    x = []
    y = []

    proj_stations = {}
    for station in Cvar_dict.keys():
        if station in loc_dict.keys():
            coord = loc_dict[station]
            Plon1, Plat1 = pyproj.Proj('esri:102001')(
                coord[1], coord[0])  # longitude,lat
            Plat = float(Plat1)
            Plon = float(Plon1)
            x.append([Plon])
            y.append([Plat])
            proj_stations[station] = [Plat, Plon]
    X = [val+y[i] for i, val in enumerate(x)]
    X = np.array(X)
    # print(X)
    # Make the longitudinal transect of distance (lon, elev)

    Xi1_grd = []
    Yi1_grd = []
    elev_grd = []
    # Preparing the coordinates to send to the function that will get the elevation grid
    concat = np.array((x, y)).T
    send_to_list = concat[0].tolist()
    send_to_tuple = [tuple(x) for x in send_to_list]
    # Get the elevations from the lookup file
    elev_grd_dict = GD.finding_data_frm_lookup(
        send_to_tuple, file_path_elev, idx_list)

    for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
        x = keys[0]
        y = keys[1]
        Xi1_grd.append(x)
        Yi1_grd.append(y)
        # Append the elevation data to the empty list
        elev_grd.append(elev_grd_dict[keys])

    lon_list = [[i] for i in Xi1_grd]
    lat_list = [[i] for i in Yi1_grd]
    elev = [[i] for i in elev_grd]  # wrap in sublists so we can build (lon, lat, elev) triples
    Xelev = [val+lat_list[i]+elev[i] for i, val in enumerate(lon_list)]
    Xelev = np.array(Xelev)

    # This is where we make the connectivity graph based on elevation

    knn_graph = kneighbors_graph(Xelev, 10, include_self=False)
    connectivity = knn_graph
    n_clusters = cluster_num

    linkage = 'ward'

    model = AgglomerativeClustering(
        linkage=linkage, connectivity=connectivity, n_clusters=n_clusters)

    model.fit(Xelev)  # fit with lat lon elev
    label = model.labels_

    if plot_3D:
        fig = plt.figure()
        ax = p3.Axes3D(fig)
        ax.view_init(7, -80)
        for l in np.unique(label):
            ax.scatter(Xelev[label == l, 0], Xelev[label == l, 1], Xelev[label == l, 2],
                       color=plt.cm.jet(float(l) / np.max(label + 1)),
                       s=20, edgecolor='k')
        plt.title('With connectivity constraints, Elevation inc.')
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
        ax.set_zlabel('Elevation (m)')

        plt.show()

    # This is where we make the connectivity graph where we can see on the map
    if plot_2D:

        fig, ax = plt.subplots(figsize=(15, 15))
        crs = {'init': 'esri:102001'}
        na_map = gpd.read_file(shapefile)

        na_map.plot(ax=ax, color='white', edgecolor='k', linewidth=1, alpha=1)

        plt.scatter(Xelev[:, 0], Xelev[:, 1], c=model.labels_,
                    cmap=plt.cm.tab20b, s=20, edgecolor='k')

        ax.tick_params(axis='both', which='both', bottom=False, top=False,
                       labelbottom=False, right=False, left=False, labelleft=False)
        ax.ticklabel_format(useOffset=False, style='plain')

        # plt.subplots_adjust(bottom=0, top=.83, wspace=0,
        # left=0, right=1)
        # plt.suptitle('n_cluster=%i, connectivity=%r' %
        # (n_clusters, connectivity is not None), size=17)

        plt.show()

    # Make a dictionary with each class
    station_class = {}

    count = 0
    for val in Xelev:
        key = [k for k, v in proj_stations.items()
               if v == [val[1], val[0]]]
        if 1 <= len(key) <= 3:
            # We add 1, because for the random selection the groups start at 1
            for station in key:
                station_class[station] = label[count] + 1
        else:
            print('Too many stations have the same lat lon.')
        count += 1

    if count != label.shape[0]:
        print('The groups and label matrix do not match')

    if return_all:
        return label, Xelev, station_class
    else:

        return station_class
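
The core of spatial_cluster is scikit-learn's connectivity-constrained agglomerative clustering. A self-contained toy demo of that idea on synthetic (x, y, elevation) triples, independent of the lookup-file machinery above:

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph

rng = np.random.default_rng(0)
pts = rng.uniform(0, 1000, size=(60, 3))  # synthetic (x, y, elevation) triples
conn = kneighbors_graph(pts, 10, include_self=False)  # 10-nearest-neighbour graph
model = AgglomerativeClustering(linkage='ward', connectivity=conn, n_clusters=4)
labels = model.fit_predict(pts)
print(np.bincount(labels))  # number of points per cluster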
Example #6
def spatial_groups_rf(idw_example_grid, loc_dict, Cvar_dict, shapefile,
                      blocknum, nfolds, replacement, dictionary_Groups,
                      file_path_elev, idx_list, expand_area):
    '''Stratified shuffle-split cross-validation procedure

    Parameters
    ----------
         idw_example_grid  : ndarray
              used for reference of study area grid size
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         blocknum : int
              number of blocks/clusters
         nfolds : int
              number of folds to create (essentially repetitions)
         replacement : bool
              whether or not to use replacement between folds, should usually be true
         dictionary_Groups : dictionary
              dictionary of what groups (clusters) the stations belong to
         file_path_elev : string
              path to the elevation lookup file
         idx_list : int
              position of the elevation column in the lookup file
         expand_area : bool
              whether to expand the study area by 200 km so that more stations are taken into account
              
    Returns
    ----------
         dictionary
              - a dictionary of the absolute error at each fold when it was left out
    '''
    # If not using replacement, keep a record of which stations have been used
    station_list_used = []
    count = 1
    error_dictionary = {}

    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds
    if expand_area:
        xmax = bounds['maxx'] + 200000
        xmin = bounds['minx'] - 200000
        ymax = bounds['maxy'] + 200000
        ymin = bounds['miny'] - 200000
    else:
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']

    while count <= nfolds:
        x_origin_list = []
        y_origin_list = []

        absolute_error_dictionary = {}
        projected_lat_lon = {}

        station_list = Eval.select_random_station(dictionary_Groups, blocknum,
                                                  replacement,
                                                  station_list_used).values()

        if not replacement:
            station_list_used.append(list(station_list))
        # print(station_list_used)

        for station_name in Cvar_dict.keys():

            if station_name in loc_dict.keys():

                loc = loc_dict[station_name]
                latitude = loc[0]
                longitude = loc[1]
                Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
                Plat = float(Plat)
                Plon = float(Plon)
                # Filter out stations outside the grid (Plat, Plon already hold the projected x, y)
                proj_coord = (Plat, Plon)
                if (proj_coord[1] <= float(ymax[0])
                        and proj_coord[1] >= float(ymin[0])
                        and proj_coord[0] <= float(xmax[0])
                        and proj_coord[0] >= float(xmin[0])):
                    projected_lat_lon[station_name] = [Plat, Plon]

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in loc_dict.keys():
                if station_name not in station_list:  # This is the step where we hold back the fold
                    loc = loc_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]

                    # Filter out stations outside of grid
                    proj_coord = pyproj.Proj('esri:102001')(longitude,
                                                            latitude)
                    if (proj_coord[1] <= float(ymax[0])
                            and proj_coord[1] >= float(ymin[0])
                            and proj_coord[0] <= float(xmax[0])
                            and proj_coord[0] >= float(xmin[0])):
                        lat.append(float(latitude))
                        lon.append(float(longitude))
                        Cvar.append(cvar_val)
                else:
                    pass  # Skip the station

        y = np.array(lat)
        x = np.array(lon)
        z = np.array(Cvar)

        pixelHeight = 10000
        pixelWidth = 10000
        num_col = int((xmax - xmin) / pixelHeight) + 1
        num_row = int((ymax - ymin) / pixelWidth) + 1

        # Project to a planar coordinate system before computing the distance matrix
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

        if expand_area:

            yProj_extent = np.append(
                yProj, [bounds['maxy'] + 200000, bounds['miny'] - 200000])
            xProj_extent = np.append(
                xProj, [bounds['maxx'] + 200000, bounds['minx'] - 200000])
        else:
            yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
            xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent),
                         num_row + 1)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent),
                         num_col + 1)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        maxmin = [
            np.min(yProj_extent),
            np.max(yProj_extent),
            np.max(xProj_extent),
            np.min(xProj_extent)
        ]

        # Elevation
        # Preparing the coordinates to send to the function that will get the elevation grid
        concat = np.array((Xi.flatten(), Yi.flatten())).T
        send_to_list = concat.tolist()
        # The elevation function takes a tuple
        send_to_tuple = [tuple(x) for x in send_to_list]

        Xi1_grd = []
        Yi1_grd = []
        elev_grd = []
        # Get the elevations from the lookup file
        elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple,
                                                   file_path_elev, idx_list)

        for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
            x = keys[0]
            y = keys[1]
            Xi1_grd.append(x)
            Yi1_grd.append(y)
            # Append the elevation data to the empty list
            elev_grd.append(elev_grd_dict[keys])

        elev_array = np.array(elev_grd)  # make an elevation array

        elev_dict = GD.finding_data_frm_lookup(
            zip(xProj, yProj), file_path_elev,
            idx_list)  # Get the elevations for the stations

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(
                xProj, yProj
        ):  # Repeat process for just the stations not the whole grid
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        Xi1_grd = np.array(Xi1_grd)
        Yi1_grd = np.array(Yi1_grd)

        df_trainX = pd.DataFrame({
            'xProj': xProj,
            'yProj': yProj,
            'elevS': source_elev,
            'var': z
        })

        df_testX = pd.DataFrame({
            'Xi': Xi1_grd,
            'Yi': Yi1_grd,
            'elev': elev_array
        })

        reg = RandomForestRegressor(n_estimators=100,
                                    max_features='sqrt',
                                    random_state=1)

        y = np.array(df_trainX['var']).reshape(-1, 1)
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

        reg.fit(X_train, y)

        Zi = reg.predict(X_test)

        rf_grid = Zi.reshape(num_row + 1, num_col + 1)

        # Compare at a certain point
        for statLoc in station_list:

            coord_pair = projected_lat_lon[statLoc]

            x_orig = int((coord_pair[0] - float(xmin)) / pixelHeight)  # lon
            y_orig = int((coord_pair[1] - float(ymin)) / pixelWidth)  # lat
            x_origin_list.append(x_orig)
            y_origin_list.append(y_orig)

            interpolated_val = rf_grid[y_orig][x_orig]

            original_val = Cvar_dict[statLoc]
            absolute_error = abs(interpolated_val - original_val)
            absolute_error_dictionary[statLoc] = absolute_error

        error_dictionary[count] = sum(
            absolute_error_dictionary.values()) / len(
                absolute_error_dictionary.values(
                ))  # average of all the withheld stations
        # print(absolute_error_dictionary)
        count += 1
    overall_error = sum(error_dictionary.values()) / \
        nfolds  # average of all the runs
    # print(overall_error)
    return overall_error
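
A minimal sketch chaining the clustering and the grouped cross-validation (placeholder inputs; spatial_cluster is shown in Example #5):

groups = spatial_cluster(loc_dict, Cvar_dict, 'study_area.shp',
                         10,                  # number of clusters
                         'elev_lookup.csv', 1, False, False, False)
mae = spatial_groups_rf(idw_example_grid, loc_dict, Cvar_dict,
                        'study_area.shp',
                        blocknum=10, nfolds=30,
                        replacement=True,
                        dictionary_Groups=groups,
                        file_path_elev='elev_lookup.csv',
                        idx_list=1,
                        expand_area=False)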
Example #7
def shuffle_split_IDEW(latlon_dict,
                       Cvar_dict,
                       shapefile,
                       file_path_elev,
                       elev_array,
                       idx_list,
                       d,
                       rep,
                       res=10000):
    '''Shuffle-split cross-validation with a 50/50 train/test split

    Parameters
    ----------
    latlon_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        array for elevation, created using IDEW interpolation (passed in to avoid recomputing it)
    idx_list : int
        position of the elevation column in the lookup file
    d : int
        the weighting for IDW interpolation
    rep : int
        number of replications
    res : int
        pixel size of the interpolation grid in metres (default 10000)

    Returns
    ----------
    float
        - MAE estimate for entire surface (average of replications)
    '''
    count = 1
    error_dictionary = {}
    while count <= rep:
        x_origin_list = []
        y_origin_list = []

        absolute_error_dictionary = {}
        station_name_list = []
        projected_lat_lon = {}

        # We can't just use Cvar_dict.keys() because some stations lack a valid lat/lon
        stations_input = []
        for station_code in Cvar_dict.keys():
            if station_code in latlon_dict.keys():
                stations_input.append(station_code)
        # Split the stations in two
        stations = np.array(stations_input)
        # The split won't be exactly 50/50 if the number of stations is odd
        splits = ShuffleSplit(n_splits=1, train_size=.5)

        for train_index, test_index in splits.split(stations):

            train_stations = stations[train_index]
            # print(train_stations)
            test_stations = stations[test_index]
            # print(test_stations)

        # Sanity check: the train and test sets must not overlap

        for val in train_stations:
            if val in test_stations:
                print('Error, the train and test sets overlap!')
                sys.exit()

        for station_name in Cvar_dict.keys():
            if station_name in latlon_dict.keys():
                station_name_list.append(station_name)

                loc = latlon_dict[station_name]
                latitude = loc[0]
                longitude = loc[1]
                Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
                Plat = float(Plat)
                Plon = float(Plon)
                projected_lat_lon[station_name] = [Plat, Plon]

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in latlon_dict.keys():
                if station_name not in test_stations:
                    loc = latlon_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]
                    lat.append(float(latitude))
                    lon.append(float(longitude))
                    Cvar.append(cvar_val)
                else:
                    pass

        y = np.array(lat)
        x = np.array(lon)
        # Note: adding the bounding locations to the array would be extrapolation, not interpolation
        z = np.array(Cvar)

        na_map = gpd.read_file(shapefile)
        bounds = na_map.bounds
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']
        pixelHeight = res
        pixelWidth = res

        num_col = int((xmax - xmin) / pixelHeight)
        num_row = int((ymax - ymin) / pixelWidth)

        # Project to a planar coordinate system before computing the distance matrix.
        # The source CRS is not recorded in the data; assume NAD83 geographic coordinates
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()
        maxmin = [
            np.min(yProj_extent),
            np.max(yProj_extent),
            np.max(xProj_extent),
            np.min(xProj_extent)
        ]

        vals = np.vstack((xProj, yProj)).T

        interpol = np.vstack((Xi, Yi)).T
        # Length of the triangle side from the cell to the point with data
        dist_not = np.subtract.outer(vals[:, 0], interpol[:, 0])
        # Length of the triangle side from the cell to the point with data
        dist_one = np.subtract.outer(vals[:, 1], interpol[:, 1])
        # euclidean distance, getting the hypotenuse
        distance_matrix = np.hypot(dist_not, dist_one)

        weights = 1 / (distance_matrix**d)
        # A zero distance yields an infinite weight; cap it so the station's
        # value is assigned directly to the pixel underneath it
        weights[np.where(np.isinf(weights))] = 1 / (1.0E-50)
        weights /= weights.sum(axis=0)

        Zi = np.dot(weights.T, z)
        idw_grid = Zi.reshape(num_row, num_col)

        elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj),
                                               file_path_elev, idx_list)

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(
                xProj,
                yProj):  # in case there are two stations at the same lat\lon
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        vals2 = np.vstack(source_elev).T

        interpol2 = np.vstack(elev_array).T

        dist_not2 = np.subtract.outer(vals2[0], interpol2[0])
        dist_not2 = np.absolute(dist_not2)
        weights2 = 1 / (dist_not2**d)

        weights2[np.where(np.isinf(weights2))] = 1
        weights2 /= weights2.sum(axis=0)

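        # Blend the two surfaces: 80% spatial IDW, 20% elevation-difference
        # weighting (the elevation term is what makes this IDEW rather than IDW)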
        fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)

        fin = fin.reshape(num_row, num_col)

        # Compute the absolute error at each withheld station's pixel
        for statLoc in test_stations:
            coord_pair = projected_lat_lon[statLoc]

            x_orig = int(
                (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
            y_orig = int(
                (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
            x_origin_list.append(x_orig)
            y_origin_list.append(y_orig)

            interpolated_val = fin[y_orig][x_orig]

            original_val = Cvar_dict[statLoc]
            absolute_error = abs(interpolated_val - original_val)
            absolute_error_dictionary[statLoc] = absolute_error

        # Average absolute error over all withheld stations for this replication
        error_dictionary[count] = (sum(absolute_error_dictionary.values()) /
                                   len(absolute_error_dictionary))
        count += 1

    overall_error = sum(error_dictionary.values()) / rep

    return overall_error
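
The inverse-distance weighting at the heart of this function, reduced to a self-contained toy example (three stations, four target cells, power d = 2):

import numpy as np

stations = np.array([[0.0, 0.0], [10.0, 0.0], [0.0, 10.0]])  # station (x, y)
z = np.array([1.0, 2.0, 3.0])  # observed values at the stations
cells = np.array([[1.0, 1.0], [5.0, 5.0], [9.0, 1.0], [1.0, 9.0]])  # grid cells
d = 2
dx = np.subtract.outer(stations[:, 0], cells[:, 0])
dy = np.subtract.outer(stations[:, 1], cells[:, 1])
dist = np.hypot(dx, dy)  # stations x cells distance matrix
w = 1 / dist**d
w /= w.sum(axis=0)  # normalize the weights for each cell
print(np.dot(w.T, z))  # interpolated value at each cell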
Example #8
def spatial_kfold_IDEW(idw_example_grid, loc_dict, Cvar_dict, shapefile,
                       file_path_elev, elev_array, idx_list, d, block_num,
                       blocking_type):
    '''Spatially blocked k-folds cross-validation procedure for IDEW

    Parameters
    ----------
         idw_example_grid  : ndarray
              used for reference of study area grid size
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         d : int
              the weighting for IDW interpolation
         file_path_elev : string
              path to the elevation lookup file
         elev_array : ndarray
              array for elevation, created using IDEW interpolation (passed in to avoid recomputing it)
         idx_list : int
              position of the elevation column in the lookup file
         block_num : int
              number of blocks/clusters
         blocking_type : string
              whether to use clusters or blocks
              
    Returns
    ----------
         float
              - MAE estimate for entire surface
         int
              - the block number, returned so it can later be written to file for tracking
    '''
    # If not using replacement, keep a record of which groups have been done
    groups_complete = []
    error_dictionary = {}

    x_origin_list = []
    y_origin_list = []

    absolute_error_dictionary = {}
    projected_lat_lon = {}

    if blocking_type == 'cluster':
        cluster = c3d.spatial_cluster(loc_dict, Cvar_dict, shapefile,
                                      block_num, file_path_elev, idx_list,
                                      False, False, False)
    elif blocking_type == 'block':
        # Get the numpy array that delineates the blocks
        np_array_blocks = mbk.make_block(idw_example_grid, block_num)
        cluster = mbk.sorting_stations(np_array_blocks, shapefile, loc_dict,
                                       Cvar_dict)  # Now get the dictionary
    else:
        print('That is not a valid blocking method')
        sys.exit()

    for group in cluster.values():
        if group not in groups_complete:
            station_list = [k for k, v in cluster.items() if v == group]
            groups_complete.append(group)

    for station_name in Cvar_dict.keys():
        if station_name in loc_dict.keys():

            loc = loc_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
            Plat = float(Plat)
            Plon = float(Plon)
            projected_lat_lon[station_name] = [Plat, Plon]

    lat = []
    lon = []
    Cvar = []
    for station_name in sorted(Cvar_dict.keys()):
        if station_name in loc_dict.keys():
            if station_name not in station_list:
                loc = loc_dict[station_name]
                latitude = loc[0]
                longitude = loc[1]
                cvar_val = Cvar_dict[station_name]
                lat.append(float(latitude))
                lon.append(float(longitude))
                Cvar.append(cvar_val)
            else:
                pass

    y = np.array(lat)
    x = np.array(lon)
    # Note: adding the bounding locations to the array would be extrapolation, not interpolation
    z = np.array(Cvar)

    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds
    xmax = bounds['maxx']
    xmin = bounds['minx']
    ymax = bounds['maxy']
    ymin = bounds['miny']
    pixelHeight = 10000
    pixelWidth = 10000

    num_col = int((xmax - xmin) / pixelHeight)
    num_row = int((ymax - ymin) / pixelWidth)

    # Project to a planar coordinate system before computing the distance matrix.
    # The source CRS is not recorded in the data; assume NAD83 geographic coordinates
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProj, yProj = pyproj.Proj('esri:102001')(x, y)

    yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
    xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

    Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
    Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

    Xi, Yi = np.meshgrid(Xi, Yi)
    Xi, Yi = Xi.flatten(), Yi.flatten()
    maxmin = [
        np.min(yProj_extent),
        np.max(yProj_extent),
        np.max(xProj_extent),
        np.min(xProj_extent)
    ]

    vals = np.vstack((xProj, yProj)).T

    interpol = np.vstack((Xi, Yi)).T
    # Length of the triangle side from the cell to the point with data
    dist_not = np.subtract.outer(vals[:, 0], interpol[:, 0])
    # Length of the triangle side from the cell to the point with data
    dist_one = np.subtract.outer(vals[:, 1], interpol[:, 1])
    # euclidean distance, getting the hypotenuse
    distance_matrix = np.hypot(dist_not, dist_one)

    weights = 1 / (distance_matrix**d)
    # A zero distance yields an infinite weight; cap it so the station's
    # value is assigned directly to the pixel underneath it
    weights[np.where(np.isinf(weights))] = 1 / (1.0E-50)
    weights /= weights.sum(axis=0)

    Zi = np.dot(weights.T, z)
    idw_grid = Zi.reshape(num_row, num_col)

    elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj), file_path_elev,
                                           idx_list)

    xProj_input = []
    yProj_input = []
    e_input = []

    for keys in zip(
            xProj,
            yProj):  # in case there are two stations at the same lat/lon
        x = keys[0]
        y = keys[1]
        xProj_input.append(x)
        yProj_input.append(y)
        e_input.append(elev_dict[keys])

    source_elev = np.array(e_input)

    vals2 = np.vstack(source_elev).T

    interpol2 = np.vstack(elev_array).T

    dist_not2 = np.subtract.outer(vals2[0], interpol2[0])
    dist_not2 = np.absolute(dist_not2)
    weights2 = 1 / (dist_not2**d)

    weights2[np.where(np.isinf(weights2))] = 1
    weights2 /= weights2.sum(axis=0)

    fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)

    fin = fin.reshape(num_row, num_col)

    # Calculate the absolute error at the pixel location of each withheld station
    for statLoc in station_list:
        coord_pair = projected_lat_lon[statLoc]

        x_orig = int(
            (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
        y_orig = int(
            (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
        x_origin_list.append(x_orig)
        y_origin_list.append(y_orig)

        interpolated_val = fin[y_orig][x_orig]

        original_val = Cvar_dict[statLoc]
        absolute_error = abs(interpolated_val - original_val)
        absolute_error_dictionary[statLoc] = absolute_error

    # average of all the withheld stations
    MAE = sum(absolute_error_dictionary.values()) / \
        len(absolute_error_dictionary.values())

    return block_num, MAE
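
# A minimal, self-contained sketch (not part of the original module) of the
# inverse-distance weighting step used above, so the weights/Zi lines can be
# sanity-checked in isolation; the function and array names here are
# illustrative only.
import numpy as np

def idw_sketch(station_xy, station_vals, grid_xy, d=2):
    """Plain IDW: weight each grid cell by 1/distance**d to every station."""
    dx = np.subtract.outer(station_xy[:, 0], grid_xy[:, 0])
    dy = np.subtract.outer(station_xy[:, 1], grid_xy[:, 1])
    dist = np.hypot(dx, dy)            # Euclidean distance, station x cell
    w = 1.0 / (dist ** d)
    w[np.isinf(w)] = 1.0 / 1.0e-50     # a station sitting exactly on a cell
    w /= w.sum(axis=0)                 # normalize so each cell's weights sum to 1
    return w.T @ station_vals          # one interpolated value per grid cell
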
Example No. 9
def IDEW(latlon_dict,
         Cvar_dict,
         input_date,
         var_name,
         shapefile,
         show,
         file_path_elev,
         idx_list,
         d,
         expand_area,
         res=10000):
    '''Inverse distance elevation weighting

    Parameters
    ----------
         latlon_dict : dictionary
              the latitude and longitudes of the stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         input_date : string
              the date you want to interpolate for
         var_name : string
              the name of the variable you are interpolating
         shapefile : string
              path to the study area shapefile, including its name
         show : bool
              whether you want to plot a map
         file_path_elev : string
              path to the elevation lookup file
         idx_list : int
              position of the elevation column in the lookup file
         d : int
              the weighting for IDW interpolation
         expand_area : bool
              whether to expand the study area by 200 km per side so that more stations are taken into account
         res : int
              resolution of the grid in metres (default 10000)

    Returns
    ----------
         ndarray
              - the array of values for the interpolated surface
         list
              - the bounds of the array surface, for use in other functions
         ndarray
              - elevation array (for use in the random forest module)
    '''

    # Input: lat lon of station, variable (start day, rainfall, etc), date of interest,variable name (for plotting), show (bool true/false), file path to elevation lookup file
    # idx_list (for the column containing the elevation data), d is the power applied to get the weight
    lat = []  # Initialize empty lists to store data
    lon = []
    Cvar = []
    # lat/lon/Cvar are filled below, once the bounding box is known, so that
    # stations outside of the (possibly expanded) grid can be filtered out
    # and no station is appended twice

    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds  # Get the bounding box of the shapefile
    if expand_area:
        xmax = bounds['maxx'] + 200000
        xmin = bounds['minx'] - 200000
        ymax = bounds['maxy'] + 200000
        ymin = bounds['miny'] - 200000
    else:
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']

    for station_name in Cvar_dict.keys():

        if station_name in latlon_dict.keys():

            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            proj_coord = pyproj.Proj('esri:102001')(
                longitude, latitude)  # Filter out stations outside of grid
            if (proj_coord[1] <= float(ymax[0])
                    and proj_coord[1] >= float(ymin[0])
                    and proj_coord[0] <= float(xmax[0])
                    and proj_coord[0] >= float(xmin[0])):
                cvar_val = Cvar_dict[station_name]
                lat.append(float(latitude))
                lon.append(float(longitude))
                Cvar.append(cvar_val)
    y = np.array(lat)
    x = np.array(lon)
    z = np.array(Cvar)

    pixelHeight = res
    pixelWidth = res

    num_col = int((xmax - xmin) / pixelHeight) + 1
    num_row = int((ymax - ymin) / pixelWidth) + 1

    # We need to project to a projected system before making distance matrix
    # We don't know the source datum, so assume NAD83
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProj, yProj = pyproj.Proj('esri:102001')(
        x, y)  # Convert to Canada Albers Equal Area

    # Add the bounding box coords to the dataset so we can extrapolate the interpolation to cover whole area
    if expand_area:

        yProj_extent = np.append(
            yProj, [bounds['maxy'] + 200000, bounds['miny'] - 200000])
        xProj_extent = np.append(
            xProj, [bounds['maxx'] + 200000, bounds['minx'] - 200000])
    else:
        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

    # Get the value for lat lon in each cell we just made
    Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
    Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

    # Make a rectangular grid (because eventually we will map the values)
    Xi, Yi = np.meshgrid(Xi, Yi)
    # Then we flatten the arrays for easier processing
    Xi, Yi = Xi.flatten(), Yi.flatten()
    maxmin = [
        np.min(yProj_extent),
        np.max(yProj_extent),
        np.max(xProj_extent),
        np.min(xProj_extent)
    ]  # We will later return this for use in other functions

    # vertically stack station x and y vals and then transpose them so they are in pairs
    vals = np.vstack((xProj, yProj)).T

    # Do the same thing for the grid x and y vals
    interpol = np.vstack((Xi, Yi)).T
    # Length of the triangle side from the cell to the point with data
    dist_not = np.subtract.outer(vals[:, 0], interpol[:, 0])
    # Length of the triangle side from the cell to the point with data
    dist_one = np.subtract.outer(vals[:, 1], interpol[:, 1])
    # Euclidean distance, getting the hypotenuse
    distance_matrix = np.hypot(dist_not, dist_one)

    # what if distance is 0 --> np.inf? have to account for the pixel underneath
    weights = 1 / (distance_matrix**d)
    # Making sure to assign the value of the weather station above the pixel directly to the pixel underneath
    weights[np.where(np.isinf(weights))] = 1 / (1.0E-50)
    weights /= weights.sum(axis=0)  # The weights must add up to 1

    # Take the dot product of the weights and the values, in this case the dot product is the sum product over the last axis of Weights.T and z
    Zi = np.dot(weights.T, z)

    # reshape the array into the proper format for the map
    idw_grid = Zi.reshape(num_row, num_col)

    # Elevation weights
    # Lon (X) goes in first deliberately: it matches the coordinate order in the lookup file.
    # Preparing the coordinates to send to the function that will get the elevation grid
    concat = np.array((Xi.flatten(), Yi.flatten())).T
    send_to_list = concat.tolist()
    # The elevation function takes a tuple
    send_to_tuple = [tuple(x) for x in send_to_list]

    Xi1_grd = []
    Yi1_grd = []
    elev_grd = []
    # Get the elevations from the lookup file
    elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple, file_path_elev,
                                               idx_list)

    for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
        x = keys[0]
        y = keys[1]
        Xi1_grd.append(x)
        Yi1_grd.append(y)
        # Append the elevation data to the empty list
        elev_grd.append(elev_grd_dict[keys])

    elev_array = np.array(elev_grd)  # make an elevation array

    elev_dict = GD.finding_data_frm_lookup(
        zip(xProj, yProj), file_path_elev,
        idx_list)  # Get the elevations for the stations

    xProj_input = []
    yProj_input = []
    e_input = []

    for keys in zip(
            xProj,
            yProj):  # Repeat process for just the stations not the whole grid
        x = keys[0]
        y = keys[1]
        xProj_input.append(x)
        yProj_input.append(y)
        e_input.append(elev_dict[keys])

    source_elev = np.array(e_input)

    vals2 = np.vstack(source_elev).T

    interpol2 = np.vstack(elev_array).T

    # Get distance in terms of the elevation (vertical distance) from the station to the point to be interpolated
    dist_not2 = np.subtract.outer(vals2[0], interpol2[0])
    # Take the absolute value, we just care about what is the difference
    dist_not2 = np.absolute(dist_not2)
    weights2 = 1 / (dist_not2**d)  # Get the inverse distance weight
    # In the case of no elevation change
    weights2[np.where(np.isinf(weights2))] = 1
    weights2 /= weights2.sum(axis=0)  # Make weights add up to 1

    # Weight distance as 0.8 and elevation as 0.2
    fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)

    idew_grid = fin.reshape(num_row, num_col)  # Reshape the final array

    if show:  # Plot if show == True
        fig, ax = plt.subplots(figsize=(15, 15))
        crs = {'init': 'esri:102001'}

        na_map = gpd.read_file(shapefile)

        plt.imshow(elev_array.reshape(num_row, num_col),
                   extent=(xProj_extent.min() - 1, xProj_extent.max() + 1,
                           yProj_extent.max() - 1, yProj_extent.min() + 1))
        na_map.plot(ax=ax,
                    color='white',
                    edgecolor='k',
                    linewidth=2,
                    zorder=10,
                    alpha=0.1)

        plt.scatter(xProj, yProj, c=z, edgecolors='k')

        plt.gca().invert_yaxis()
        cbar = plt.colorbar()
        cbar.set_label(var_name)

        title = 'IDEW Interpolation for %s on %s' % (var_name, input_date)
        fig.suptitle(title, fontsize=14)
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')

        plt.show()

    return idew_grid, maxmin, elev_array
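
# Hedged usage sketch for IDEW: the dictionaries, file names, and column index
# below are hypothetical placeholders, shown only to illustrate the argument
# order; real inputs come from the project's station-loading utilities.
#
# latlon = {'STATION_A': [49.1, -84.2], 'STATION_B': [50.3, -86.7]}
# temps = {'STATION_A': 18.5, 'STATION_B': 16.2}
# idew_grid, maxmin, elev_array = IDEW(latlon, temps, '2018-07-01',
#                                      'Temperature', 'study_area.shp',
#                                      show=False,
#                                      file_path_elev='lookup_elev.csv',
#                                      idx_list=9, d=2, expand_area=False)
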
Example No. 10
def cross_validate_IDEW(latlon_dict, Cvar_dict, shapefile, file_path_elev,
                        elev_array, idx_list, d):
    '''Leave-one-out cross-validation procedure for IDEW

    Parameters
    ----------
         latlon_dict : dictionary
              the latitude and longitudes of the stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile, including its name
         file_path_elev : string
              path to the elevation lookup file
         elev_array : ndarray
              array for elevation, created using IDEW interpolation (this is a trick to speed up the code)
         idx_list : int
              position of the elevation column in the lookup file
         d : int
              the weighting for IDW interpolation
              
    Returns
    ----------
         dictionary
              - a dictionary of the absolute error at each station when it was left out
    '''
    x_origin_list = []
    y_origin_list = []

    absolute_error_dictionary = {}  # for plotting
    station_name_list = []
    projected_lat_lon = {}

    for station_name in Cvar_dict.keys():
        if station_name in latlon_dict.keys():
            station_name_list.append(station_name)

            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
            Plat = float(Plat)
            Plon = float(Plon)
            projected_lat_lon[station_name] = [Plat, Plon]

    # Pre-make the elev_dict to speed up code

    latO = []
    lonO = []
    for station_name in sorted(Cvar_dict.keys()):
        if station_name in latlon_dict.keys():
            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            cvar_val = Cvar_dict[station_name]
            latO.append(float(latitude))
            lonO.append(float(longitude))
        else:
            pass

    yO = np.array(latO)
    xO = np.array(lonO)

    # We need to project to a projected system before making distance matrix
    # We don't know the source datum, so assume NAD83
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProjO, yProjO = pyproj.Proj('esri:102001')(xO, yO)
    elev_dict = GD.finding_data_frm_lookup(zip(xProjO, yProjO), file_path_elev,
                                           idx_list)

    for station_name_hold_back in station_name_list:

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in latlon_dict.keys():
                if station_name != station_name_hold_back:
                    loc = latlon_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]
                    lat.append(float(latitude))
                    lon.append(float(longitude))
                    Cvar.append(cvar_val)
                else:

                    pass

        y = np.array(lat)
        x = np.array(lon)
        # what if we add the bounding locations to the array??? ==> that would be extrapolation not interpolation?
        z = np.array(Cvar)

        na_map = gpd.read_file(shapefile)
        bounds = na_map.bounds
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']
        pixelHeight = 10000
        pixelWidth = 10000

        num_col = int((xmax - xmin) / pixelHeight)
        num_row = int((ymax - ymin) / pixelWidth)

        # We need to project to a projected system before making distance matrix
        # We don't know the source datum, so assume NAD83
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()
        maxmin = [
            np.min(yProj_extent),
            np.max(yProj_extent),
            np.max(xProj_extent),
            np.min(xProj_extent)
        ]

        vals = np.vstack((xProj, yProj)).T

        interpol = np.vstack((Xi, Yi)).T
        # Length of the triangle side from the cell to the point with data
        dist_not = np.subtract.outer(vals[:, 0], interpol[:, 0])
        # Length of the triangle side from the cell to the point with data
        dist_one = np.subtract.outer(vals[:, 1], interpol[:, 1])
        # euclidean distance, getting the hypotenuse
        distance_matrix = np.hypot(dist_not, dist_one)

        # what if distance is 0 --> np.inf? have to account for the pixel underneath
        weights = 1 / (distance_matrix**d)
        # Making sure to assign the value of the weather station above the pixel directly to the pixel underneath
        weights[np.where(np.isinf(weights))] = 1 / (1.0E-50)
        weights /= weights.sum(axis=0)

        Zi = np.dot(weights.T, z)
        idw_grid = Zi.reshape(num_row, num_col)

        #elev_dict= GD.finding_data_frm_lookup(zip(xProj, yProj),file_path_elev,idx_list)

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(
                xProj,
                yProj):  # in case there are two stations at the same lat/lon
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        vals2 = np.vstack(source_elev).T

        interpol2 = np.vstack(elev_array).T

        dist_not2 = np.subtract.outer(vals2[0], interpol2[0])
        dist_not2 = np.absolute(dist_not2)
        weights2 = 1 / (dist_not2**d)

        weights2[np.where(np.isinf(weights2))] = 1
        weights2 /= weights2.sum(axis=0)

        fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)

        fin = fin.reshape(num_row, num_col)

        # Calculate the absolute error at the pixel location of the withheld station
        coord_pair = projected_lat_lon[station_name_hold_back]

        x_orig = int(
            (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
        y_orig = int(
            (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
        x_origin_list.append(x_orig)
        y_origin_list.append(y_orig)

        interpolated_val = fin[y_orig][x_orig]

        # Get the original value
        original_val = Cvar_dict[station_name_hold_back]
        # Calc the difference
        absolute_error = abs(interpolated_val - original_val)
        absolute_error_dictionary[station_name_hold_back] = absolute_error

    return absolute_error_dictionary
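
# Hedged usage sketch: reducing the per-station absolute errors returned by
# cross_validate_IDEW to a single MAE (all inputs are placeholders).
#
# errors = cross_validate_IDEW(latlon, temps, 'study_area.shp',
#                              'lookup_elev.csv', elev_array, idx_list=9, d=2)
# mae = sum(errors.values()) / len(errors)
# worst_station = max(errors, key=errors.get)  # largest leave-one-out error
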
Example No. 11
def spatial_groups_IDEW(idw_example_grid, loc_dict, Cvar_dict, shapefile, d,
                        blocknum, nfolds, replacement, dictionary_Groups,
                        file_path_elev, idx_list, elev_array):
    '''Stratified shuffle-split cross-validation procedure

    Parameters
    ----------
         idw_example_grid  : ndarray
              used for reference of study area grid size
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         d : int
              the weighting for IDW interpolation
         blocknum : int
              number of blocks/clusters
         nfolds : int
              number of folds to create (essentially repetitions)
         replacement : bool
              whether or not to use replacement between folds, should usually be true
         dictionary_Groups : dictionary
              dictionary of what groups (clusters) the stations belong to
         file_path_elev : string
              path to the elevation lookup file
         idx_list : int
              position of the elevation column in the lookup file
         elev_array : ndarray
              array for elevation, created using IDEW interpolation (this is a trick to speed up the code)

    Returns
    ----------
         dictionary
              - a dictionary of the absolute error at each fold when it was left out
    '''
    # If not using replacement, keep a record of the folds we have already used
    station_list_used = []
    count = 1
    error_dictionary = {}

    # Premake elevation dictionary to speed up code

    latO = []
    lonO = []
    for station_name in sorted(Cvar_dict.keys()):
        if station_name in loc_dict.keys():
            loc = loc_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            cvar_val = Cvar_dict[station_name]
            latO.append(float(latitude))
            lonO.append(float(longitude))
        else:
            pass

    yO = np.array(latO)
    xO = np.array(lonO)

    # We need to project to a projected system before making distance matrix
    # We don't know the source datum, so assume NAD83
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProjO, yProjO = pyproj.Proj('esri:102001')(xO, yO)
    elev_dict = GD.finding_data_frm_lookup(zip(xProjO, yProjO), file_path_elev,
                                           idx_list)
    while count <= nfolds:
        x_origin_list = []
        y_origin_list = []

        absolute_error_dictionary = {}
        projected_lat_lon = {}

        station_list = Eval.select_random_station(dictionary_Groups, blocknum,
                                                  replacement,
                                                  station_list_used).values()
        if not replacement:
            station_list_used.append(list(station_list))
        # print(station_list_used)

        for station_name in Cvar_dict.keys():

            if station_name in loc_dict.keys():

                loc = loc_dict[station_name]
                latitude = loc[0]
                longitude = loc[1]
                Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
                Plat = float(Plat)
                Plon = float(Plon)
                projected_lat_lon[station_name] = [Plat, Plon]

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in loc_dict.keys():
                if station_name not in station_list:
                    loc = loc_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]
                    lat.append(float(latitude))
                    lon.append(float(longitude))
                    Cvar.append(cvar_val)
                else:

                    pass

        y = np.array(lat)
        x = np.array(lon)
        # what if we add the bounding locations to the array??? ==> that would be extrapolation not interpolation?
        z = np.array(Cvar)

        na_map = gpd.read_file(shapefile)
        bounds = na_map.bounds
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']
        pixelHeight = 10000
        pixelWidth = 10000

        num_col = int((xmax - xmin) / pixelHeight)
        num_row = int((ymax - ymin) / pixelWidth)

        # We need to project to a projected system before making distance matrix
        # We don't know the source datum, so assume NAD83
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()
        maxmin = [
            np.min(yProj_extent),
            np.max(yProj_extent),
            np.max(xProj_extent),
            np.min(xProj_extent)
        ]

        vals = np.vstack((xProj, yProj)).T

        interpol = np.vstack((Xi, Yi)).T
        # Length of the triangle side from the cell to the point with data
        dist_not = np.subtract.outer(vals[:, 0], interpol[:, 0])
        # Length of the triangle side from the cell to the point with data
        dist_one = np.subtract.outer(vals[:, 1], interpol[:, 1])
        # euclidean distance, getting the hypotenuse
        distance_matrix = np.hypot(dist_not, dist_one)

        # what if distance is 0 --> np.inf? have to account for the pixel underneath
        weights = 1 / (distance_matrix**d)
        # Making sure to assign the value of the weather station above the pixel directly to the pixel underneath
        weights[np.where(np.isinf(weights))] = 1 / (1.0E-50)
        weights /= weights.sum(axis=0)

        Zi = np.dot(weights.T, z)
        idw_grid = Zi.reshape(num_row, num_col)

        elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj),
                                               file_path_elev, idx_list)

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(
                xProj,
                yProj):  # in case there are two stations at the same lat/lon
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        vals2 = np.vstack(source_elev).T

        interpol2 = np.vstack(elev_array).T

        dist_not2 = np.subtract.outer(vals2[0], interpol2[0])
        dist_not2 = np.absolute(dist_not2)
        weights2 = 1 / (dist_not2**d)

        weights2[np.where(np.isinf(weights2))] = 1
        weights2 /= weights2.sum(axis=0)

        fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)

        fin = fin.reshape(num_row, num_col)

        # Compare at a certain point
        for statLoc in station_list:

            coord_pair = projected_lat_lon[statLoc]

            x_orig = int(
                (coord_pair[0] - float(bounds['minx'])) / pixelHeight)  # lon
            y_orig = int(
                (coord_pair[1] - float(bounds['miny'])) / pixelWidth)  # lat
            x_origin_list.append(x_orig)
            y_origin_list.append(y_orig)

            interpolated_val = fin[y_orig][x_orig]

            original_val = Cvar_dict[statLoc]
            absolute_error = abs(interpolated_val - original_val)
            absolute_error_dictionary[statLoc] = absolute_error

        # Average of all the withheld stations in this fold
        error_dictionary[count] = sum(absolute_error_dictionary.values()) / \
            len(absolute_error_dictionary.values())
        # print(absolute_error_dictionary)
        count += 1
    overall_error = sum(error_dictionary.values()) / \
        nfolds  # average of all the runs
    # print(overall_error)
    return overall_error
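
# Hedged usage sketch for spatial_groups_IDEW (placeholder inputs):
# dictionary_Groups would come from the clustering step that assigns each
# station to a spatial block, and idw_grid from a prior IDW/IDEW run.
#
# overall_mae = spatial_groups_IDEW(idw_grid, latlon, temps, 'study_area.shp',
#                                   d=2, blocknum=25, nfolds=10,
#                                   replacement=True,
#                                   dictionary_Groups=groups,
#                                   file_path_elev='lookup_elev.csv',
#                                   idx_list=9, elev_array=elev_array)
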
Example No. 12
def GPR_interpolator(latlon_dict, Cvar_dict, input_date, var_name, shapefile, show,
                     file_path_elev, idx_list, expand_area, kernel_object, restarts, \
                     report_params, optimizer, param_initiate=None, cov_type='RBF',res=10000):
    '''Base interpolator function for gaussian process regression

    Parameters
    ----------
    latlon_dict : dictionary
        the latitude and longitudes of the stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    input_date : string
        the date you want to interpolate for
    var_name : string
        the name of the variable you are interpolating
    shapefile : string
        path to the study area shapefile, including its name
    show : bool
        whether you want to plot a map    
    file_path_elev : string
        file path to the elevation lookup file 
    idx_list : list
        the index of the elevation data column in the lookup file 
    expand_area : bool
        function will expand the study area so that more stations are taken into account (200 km)   
    kernel_object : list
        list containing a string describing the kernel you want to use; if optimizing a set of parameters instead, you can pass an empty list
    restarts : int
        number of times to restart to avoid local optima
    report_params : bool
        if True, outputs optimized values for kernel hyperparameters
    optimizer : bool
        if False, fix parameters of covariance function
    param_initiate : list
        starting parameters for the optimization; these control the extent of the spatial autocorrelation modelled by the process.
        Whether the spatial autocorrelation is the same in all directions depends on the inputs: the kernel parameters must be
        given as a vector, not a scalar. Since we are working in 3D (latitude, longitude, elevation), an anisotropic kernel needs
        a length-3 vector corresponding to [x, y, z]. For an isotropic kernel use [1] (or, with 2 parameters, [[1], [1]]); for an
        anisotropic kernel use [1, 1, 1] (or [[1, 1], [1, 1], [1, 1]])
    cov_type : str
        type of covariance function to use if you have not specified a kernel object
    res : int
        resolution of the grid in metres (default 10000)

    Returns
    ----------
    ndarray
        - an array of the interpolated values
    list
        - the bounds of the array surface, for use in other functions
        (if report_params is True, the fitted kernel parameters are returned instead)
    '''
    lat = []
    lon = []
    Cvar = []

    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds
    if expand_area:
        xmax = bounds['maxx']+200000
        xmin = bounds['minx']-200000
        ymax = bounds['maxy']+200000
        ymin = bounds['miny']-200000
    else:
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']

    for station_name in Cvar_dict.keys():
        if station_name in latlon_dict.keys():

            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            # Filter out stations outside of grid
            proj_coord = pyproj.Proj('esri:102001')(longitude, latitude)
            if (proj_coord[1] <= float(ymax[0]) and proj_coord[1] >=
                float(ymin[0]) and proj_coord[0] <= float(xmax[0]) and
                    proj_coord[0] >= float(xmin[0])):
                cvar_val = Cvar_dict[station_name]
                lat.append(float(latitude))
                lon.append(float(longitude))
                Cvar.append(cvar_val)

    y = np.array(lat)
    x = np.array(lon)
    z = np.array(Cvar)

    pixelHeight = res
    pixelWidth = res

    num_col = int((xmax - xmin) / pixelHeight)
    num_row = int((ymax - ymin) / pixelWidth)

    # We need to project to a projected system before making distance matrix
    source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
    xProj, yProj = pyproj.Proj('esri:102001')(x, y)

    df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

    if expand_area:

        yProj_extent = np.append(
            yProj, [bounds['maxy']+200000, bounds['miny']-200000])
        xProj_extent = np.append(
            xProj, [bounds['maxx']+200000, bounds['minx']-200000])

    else:
        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

    Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row+1)
    Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col+1)

    Xi, Yi = np.meshgrid(Xi, Yi)
    Xi, Yi = Xi.flatten(), Yi.flatten()

    maxmin = [np.min(yProj_extent), np.max(yProj_extent),
              np.max(xProj_extent), np.min(xProj_extent)]

    # Elevation
    # Preparing the coordinates to send to the function that will get the elevation grid
    concat = np.array((Xi.flatten(), Yi.flatten())).T
    send_to_list = concat.tolist()
    # The elevation function takes a tuple
    send_to_tuple = [tuple(x) for x in send_to_list]

    Xi1_grd = []
    Yi1_grd = []
    elev_grd = []
    # Get the elevations from the lookup file
    elev_grd_dict = GD.finding_data_frm_lookup(
        send_to_tuple, file_path_elev, idx_list)

    for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
        x = keys[0]
        y = keys[1]
        Xi1_grd.append(x)
        Yi1_grd.append(y)
        # Append the elevation data to the empty list
        elev_grd.append(elev_grd_dict[keys])

    elev_array = np.array(elev_grd)  # make an elevation array

    elev_dict = GD.finding_data_frm_lookup(zip(
        xProj, yProj), file_path_elev, idx_list)  # Get the elevations for the stations

    xProj_input = []
    yProj_input = []
    e_input = []

    for keys in zip(xProj, yProj):  # Repeat process for just the stations not the whole grid
        x = keys[0]
        y = keys[1]
        xProj_input.append(x)
        yProj_input.append(y)
        e_input.append(elev_dict[keys])

    source_elev = np.array(e_input)

    Xi1_grd = np.array(Xi1_grd)
    Yi1_grd = np.array(Yi1_grd)

    df_trainX = pd.DataFrame(
        {'xProj': xProj, 'yProj': yProj, 'elevS': source_elev, 'var': z})

    df_testX = pd.DataFrame({'Xi': Xi1_grd, 'Yi': Yi1_grd, 'elev': elev_array})

    if param_initiate is not None:

        if len(param_initiate) > 1:

            kernels = [1.0 * RBF(length_scale=param_initiate[0]), 1.0 * RationalQuadratic(length_scale=param_initiate[0][0], alpha=param_initiate[0][1]),
                       1.0 * Matern(length_scale=param_initiate[0], nu=param_initiate[1], length_scale_bounds=(1000, 500000))]  # Temp =(100,500000) #RH = (1000,500000)
        # Optimizer: the 'L-BFGS-B' algorithm
        else:

            kernels = [1.0 * RBF(length_scale=param_initiate[0])]

        if cov_type == 'RationalQuadratic':
            if optimizer:
                # Updated Nov 23 for fire season manuscript to make 3 restarts, Dec 9 = 5
                reg = GaussianProcessRegressor(
                    kernel=kernels[1], normalize_y=True, n_restarts_optimizer=restarts)
            else:
                reg = GaussianProcessRegressor(
                    kernel=kernels[1], normalize_y=True, n_restarts_optimizer=restarts, optimizer=None)
        elif cov_type == 'RBF':
            if optimizer:
                # Updated Nov 23 for fire season manuscript to make 3 restarts, Dec 9 = 5
                reg = GaussianProcessRegressor(
                    kernel=kernels[0], normalize_y=True, n_restarts_optimizer=restarts)
            else:
                reg = GaussianProcessRegressor(
                    kernel=kernels[0], normalize_y=True, n_restarts_optimizer=restarts, optimizer=None)
        elif cov_type == 'Matern':

            if optimizer:
                # Updated Nov 23 for fire season manuscript to make 3 restarts, Dec 9 = 5
                reg = GaussianProcessRegressor(
                    kernel=kernels[2], normalize_y=True, n_restarts_optimizer=restarts)
            else:
                #kernels = [307**2 * Matern(length_scale=[5e+05, 6.62e+04, 1.07e+04], nu=0.5)]
                #kernels = [316**2 * Matern(length_scale=[5e+05, 5e+05, 6.01e+03], nu=0.5)]
                #kernels = [316**2 * Matern(length_scale=[5e+05, 5e+05, 4.67e+05], nu=0.5)]
                # Use the Matern kernel built above (index 2), not the RBF at index 0
                reg = GaussianProcessRegressor(
                    kernel=kernels[2], normalize_y=True, n_restarts_optimizer=restarts, optimizer=None)
    else:
        kernels = [eval(kernel_object[0])]
        reg = GaussianProcessRegressor(
            kernel=kernels[0], normalize_y=True, n_restarts_optimizer=0, optimizer=None)

    y = np.array(df_trainX['var']).reshape(-1, 1)
    X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
    X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

    reg.fit(X_train, y)
    fitted_params = reg.kernel_
    score = reg.score(X_train, y)
    print(fitted_params)
    print(score)

    Zi = reg.predict(X_test)

    gpr_grid = Zi.reshape(num_row+1, num_col+1)

    if show:
        fig, ax = plt.subplots(figsize=(15, 15))
        crs = {'init': 'esri:102001'}

        na_map = gpd.read_file(shapefile)

        plt.imshow(gpr_grid, extent=(xProj_extent.min(
        )-1, xProj_extent.max()+1, yProj_extent.max()-1, yProj_extent.min()+1))
        na_map.plot(ax=ax, color='white', edgecolor='k',
                    linewidth=2, zorder=10, alpha=0.1)

        plt.scatter(xProj, yProj, c=z, edgecolors='k')

        plt.gca().invert_yaxis()
        cbar = plt.colorbar()
        cbar.set_label(var_name)

        title = 'GPR Interpolation for %s on %s' % (var_name, input_date)
        fig.suptitle(title, fontsize=14)
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')

        plt.show()

    if report_params:
        return fitted_params

    else:
        return gpr_grid, maxmin
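
# Hedged sketch of two ways to call GPR_interpolator (all inputs are
# placeholders). An anisotropic Matern kernel needs one length scale per input
# dimension (x, y, elevation), matching the param_initiate note in the
# docstring.
#
# (1) Let the optimizer fit an anisotropic Matern from a starting guess:
# grid, maxmin = GPR_interpolator(latlon, temps, '2018-07-01', 'Temperature',
#                                 'study_area.shp', show=False,
#                                 file_path_elev='lookup_elev.csv', idx_list=9,
#                                 expand_area=False, kernel_object=[],
#                                 restarts=5, report_params=False,
#                                 optimizer=True,
#                                 param_initiate=[[1e5, 1e5, 1e3], 0.5],
#                                 cov_type='Matern')
#
# (2) Fix a previously fitted kernel, passed as a string to be eval'd:
# grid, maxmin = GPR_interpolator(latlon, temps, '2018-07-01', 'Temperature',
#                                 'study_area.shp', show=False,
#                                 file_path_elev='lookup_elev.csv', idx_list=9,
#                                 expand_area=False,
#                                 kernel_object=['316**2 * Matern(length_scale=[5e+05, 5e+05, 6.01e+03], nu=0.5)'],
#                                 restarts=0, report_params=False,
#                                 optimizer=False)
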
Example No. 13
def cross_validate_gpr(latlon_dict, Cvar_dict, shapefile, file_path_elev, elev_array, idx_list, cov_function):
    '''Leave-one-out cross-validation procedure for GPR

    Parameters
    ----------

    latlon_dict : dictionary
        the latitude and longitudes of the stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile, including its name
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        array for elevation, created using IDEW interpolation (this is a trick to speed up the code)
    idx_list : int
        position of the elevation column in the lookup file
    cov_function : list
        list containing a string that describes the input covariance function, similar to: ['316**2 * Matern(length_scale=[5e+05, 5e+05, 6.01e+03], nu=0.5)']

    Returns
    ----------
    dictionary
        - a dictionary of the absolute error at each station when it was left out
    '''
    x_origin_list = []
    y_origin_list = []

    absolute_error_dictionary = {}  # for plotting
    station_name_list = []
    projected_lat_lon = {}

    for station_name in Cvar_dict.keys():
        if station_name in latlon_dict.keys():
            station_name_list.append(station_name)

            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            Plat, Plon = pyproj.Proj('esri:102001')(longitude, latitude)
            Plat = float(Plat)
            Plon = float(Plon)
            projected_lat_lon[station_name] = [Plat, Plon]

    for station_name_hold_back in station_name_list:

        lat = []
        lon = []
        Cvar = []
        for station_name in sorted(Cvar_dict.keys()):
            if station_name in latlon_dict.keys():
                if station_name != station_name_hold_back:
                    loc = latlon_dict[station_name]
                    latitude = loc[0]
                    longitude = loc[1]
                    cvar_val = Cvar_dict[station_name]
                    lat.append(float(latitude))
                    lon.append(float(longitude))
                    Cvar.append(cvar_val)
                else:

                    pass

        y = np.array(lat)
        x = np.array(lon)
        z = np.array(Cvar)

        na_map = gpd.read_file(shapefile)
        bounds = na_map.bounds
        xmax = bounds['maxx']
        xmin = bounds['minx']
        ymax = bounds['maxy']
        ymin = bounds['miny']
        pixelHeight = 10000
        pixelWidth = 10000

        num_col = int((xmax - xmin) / pixelHeight)+1
        num_row = int((ymax - ymin) / pixelWidth)+1

        # We need to project to a projected system before making distance matrix
        source_proj = pyproj.Proj(proj='latlong', datum='NAD83')
        xProj, yProj = pyproj.Proj('esri:102001')(x, y)

        df_trainX = pd.DataFrame({'xProj': xProj, 'yProj': yProj, 'var': z})

        yProj_extent = np.append(yProj, [bounds['maxy'], bounds['miny']])
        xProj_extent = np.append(xProj, [bounds['maxx'], bounds['minx']])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)

        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        maxmin = [np.min(yProj_extent), np.max(yProj_extent),
                  np.max(xProj_extent), np.min(xProj_extent)]

        # Elevation
        # Preparing the coordinates to send to the function that will get the elevation grid
        concat = np.array((Xi.flatten(), Yi.flatten())).T
        send_to_list = concat.tolist()
        # The elevation function takes a tuple
        send_to_tuple = [tuple(x) for x in send_to_list]

        Xi1_grd = []
        Yi1_grd = []
        elev_grd = []
        # Get the elevations from the lookup file
        elev_grd_dict = GD.finding_data_frm_lookup(
            send_to_tuple, file_path_elev, idx_list)

        for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
            x = keys[0]
            y = keys[1]
            Xi1_grd.append(x)
            Yi1_grd.append(y)
            # Append the elevation data to the empty list
            elev_grd.append(elev_grd_dict[keys])

        elev_array = np.array(elev_grd)  # make an elevation array

        elev_dict = GD.finding_data_frm_lookup(zip(
            xProj, yProj), file_path_elev, idx_list)  # Get the elevations for the stations

        xProj_input = []
        yProj_input = []
        e_input = []

        for keys in zip(xProj, yProj):  # Repeat process for just the stations not the whole grid
            x = keys[0]
            y = keys[1]
            xProj_input.append(x)
            yProj_input.append(y)
            e_input.append(elev_dict[keys])

        source_elev = np.array(e_input)

        Xi1_grd = np.array(Xi1_grd)
        Yi1_grd = np.array(Yi1_grd)

        df_trainX = pd.DataFrame(
            {'xProj': xProj, 'yProj': yProj, 'elevS': source_elev, 'var': z})

        df_testX = pd.DataFrame(
            {'Xi': Xi1_grd, 'Yi': Yi1_grd, 'elev': elev_array})

        #kernels = [1.0 * RationalQuadratic(length_scale=1.0, alpha=alpha_input)]
        #kernels = [multiplier**exponent * Matern(length_scale=length_scale_list,nu=param_initiate[1],length_scale_bounds='fixed')]
        #kernels = [params]

        # Temperature
        #kernels = [316**2 * Matern(length_scale=[5e+05, 5e+05, 6.01e+03], nu=0.5)]

        # RH
        #kernels = [307**2 * Matern(length_scale=[9.51e+04, 9.58e+04, 3.8e+05], nu=0.5)]

        # Wind

        #kernels = [316**2 * Matern(length_scale=[5e+05, 6.62e+04, 1.07e+04], nu=0.5)]
        kernels = [eval(cov_function[0])]
        reg = GaussianProcessRegressor(
            kernel=kernels[0], normalize_y=True, n_restarts_optimizer=0, optimizer=None)

        y = np.array(df_trainX['var']).reshape(-1, 1)
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

        reg.fit(X_train, y)

        Zi = reg.predict(X_test)

        gpr_grid = Zi.reshape(num_row, num_col)

        # Calculate the absolute error at the pixel location of the withheld station
        coord_pair = projected_lat_lon[station_name_hold_back]

        x_orig = int(
            (coord_pair[0] - float(bounds['minx']))/pixelHeight)  # lon
        y_orig = int((coord_pair[1] - float(bounds['miny']))/pixelWidth)  # lat
        x_origin_list.append(x_orig)
        y_origin_list.append(y_orig)

        interpolated_val = gpr_grid[y_orig][x_orig]

        original_val = Cvar_dict[station_name_hold_back]
        absolute_error = abs(interpolated_val-original_val)
        absolute_error_dictionary[station_name_hold_back] = absolute_error

    return absolute_error_dictionary
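
# Hedged usage sketch: leave-one-out GPR validation with a fixed covariance
# function, reduced to an MAE (all inputs are placeholders).
#
# cov = ['316**2 * Matern(length_scale=[5e+05, 5e+05, 6.01e+03], nu=0.5)']
# errors = cross_validate_gpr(latlon, temps, 'study_area.shp',
#                             'lookup_elev.csv', elev_array, idx_list=9,
#                             cov_function=cov)
# mae = sum(errors.values()) / len(errors)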