Exemplo n.º 1
0
def plotPunctualDataFromObs(ds, title,
                            output_folder="/home/olmozavala/Desktop/DELETE",
                            extent=(-98, -70.40002, 18.09165, 31.9267)):
    """Plot the locations of the punctual observations of a dataset on a map.

    Only the locations are drawn; no profile plot is produced by this function.

    :param ds: dataset-like object exposing ``ob_grp_present``, ``lon`` and
        ``lat``. ``lon`` and ``lat`` are indexed as ``values[:, 0, 0]``.
    :param title: title given to the map
    :param output_folder: folder handed to the visualizer (defaults to the
        path that used to be hard-coded here)
    :param extent: four numbers forwarded unchanged to ``scatter_coords_map``
        (defaults to the values that used to be hard-coded here)
    """
    # Kept from the original implementation: shows which groups are present
    print(ds.ob_grp_present.values)

    # One coordinate per observation, taken from the first element of the
    # two trailing axes
    lons = ds.lon.values[:, 0, 0]
    lats = ds.lat.values[:, 0, 0]

    img_viz = EOAImageVisualizer(output_folder=output_folder,
                                 disp_images=True)
    img_viz.scatter_coords_map(lons, lats, list(extent), title=title)
Exemplo n.º 2
0
def main():
    """Run a trained 1D model on a CSV file and evaluate its predictions.

    The function reads its settings from ``get_makeprediction_config()``,
    loads the input CSV, builds the normalized inputs/targets with
    ``normalizeAndFilterData``, loads the model weights, predicts, optionally
    plots random windows of ground truth vs. prediction, de-normalizes both
    and hands them to ``compute_metrics``.

    :raises ValueError: if the configured weights file or input file is empty
    """
    config = get_makeprediction_config()

    # *********** Reads the parameters ***********
    input_file = config[ClassificationParams.input_file]
    splits_file = config[ClassificationParams.split_file]
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    output_file_name = config[ClassificationParams.output_file_name]
    run_name = config[TrainingParams.config_name]
    model_weights_file = config[ClassificationParams.model_weights_file]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]
    disp_images = config[ClassificationParams.show_imgs]
    generate_images = config[ClassificationParams.generate_images]
    metrics_user = config[ClassificationParams.metrics]
    filter_stations = config[LocalTrainingParams.stations]

    # Explicit checks instead of assert, which is stripped under `python -O`
    if len(model_weights_file) == 0:
        raise ValueError("No model weights file was configured")
    if len(input_file) == 0:
        raise ValueError("No input file was configured")

    print(F"Working with: {model_weights_file} \n and \n {input_file}")

    data = pd.read_csv(input_file, index_col=0, parse_dates=True)

    # Columns are classified by name: date columns contain week/hour/year,
    # station columns contain no 'h', everything else is treated as meteo
    all_data_cols = data.columns
    date_columns = [
        x for x in all_data_cols
        if ('week' in x) or ('hour' in x) or ('year' in x)
    ]
    stations_columns = [
        x for x in all_data_cols
        if ('h' not in x) and (x not in date_columns)
    ]
    meteo_columns = [
        x for x in all_data_cols
        if ('h' in x) and (x not in date_columns) and (
            x not in stations_columns)
    ]
    desired_columns = meteo_columns + filter_stations + date_columns

    print("Appending date hot vector...")
    date_hv = generate_date_hot_vector(data.index)
    data = pd.concat([data[desired_columns], date_hv], axis=1)
    print("Done!")

    # Only the hours from 9:00 to 20:00 are kept
    filtered_data = data.between_time("9:00", "20:00")
    datetimes_str = filtered_data.index.values

    print(F'Normalizing and filtering data....')
    parameters_folder = join(dirname(output_folder), 'Training', 'Parameters')
    data_norm_df_final, accepted_times_idx, y_times_idx, stations_columns, meteo_columns = \
        normalizeAndFilterData(filtered_data, datetimes_str, forecasted_hours, output_folder=parameters_folder,
                               run_name=run_name, read_from_file=True)

    # ********* Builds the inputs (X) and the targets (Y) ********
    X_df = data_norm_df_final.loc[datetimes_str[accepted_times_idx]]
    Y_df = data_norm_df_final.loc[datetimes_str[y_times_idx]][stations_columns]

    # The station columns are the targets, so they are removed from the input
    X_df = X_df.drop(columns=stations_columns)
    X = X_df.values
    Y = Y_df.values

    config[ModelParams.INPUT_SIZE] = len(X_df.columns)
    print(F'X shape: {X.shape} Y shape: {Y.shape}')

    # *********** Chooses the proper model ***********
    print('Reading model ....')
    config[ModelParams.NUMBER_OF_OUTPUT_CLASSES] = Y.shape[1]
    model = select_1d_model(config)

    # *********** Reads the splits information ***********
    print('Reading splits info....')
    if splits_file != '':  # In this case we do read the information
        split_info = pd.read_csv(splits_file, dtype=np.int16)
    else:
        # Without a splits file every example is treated as a training example
        split_info = pd.DataFrame({
            'train_ids': [],
            'validation_ids': [],
            'test_id': []
        })
        split_info['train_ids'] = range(Y.shape[0])

    # *********** Reads the weights***********
    print('Reading weights ....')
    model.load_weights(model_weights_file)

    # ************ Makes NN Prediction ********
    print('Making prediction ....')
    output_nn_all = model.predict(X, verbose=1)

    # ************ Plots random windows of GT vs NN ********
    number_of_examples = 10
    hours_to_plot = 24 * 3  # How many points to plot
    if generate_images:
        # The folders are created once, before anything is drawn
        create_folder(output_folder)
        create_folder(output_imgs_folder)
        img_viz = EOAImageVisualizer(output_folder=output_imgs_folder,
                                     disp_images=disp_images)

        # Hide the -1 values in the plots. A separate array is used because
        # Y may share memory with Y_df, which is de-normalized further down;
        # editing Y in place would make the metrics depend on this flag.
        Y_plot = np.where(Y == -1, np.nan, Y)
        # Largest start index that leaves a full window available
        max_start_idx = X.shape[0] - hours_to_plot - forecasted_hours
        for c_example in range(number_of_examples):
            # With too little data randint would raise; plot from the start
            if max_start_idx > 0:
                start_idx = np.random.randint(0, max_start_idx)
            else:
                start_idx = 0
            end_idx = start_idx + hours_to_plot
            for idx_station, cur_station in enumerate(filter_stations):
                img_viz.plot_1d_data_np(
                    datetimes_str[y_times_idx][start_idx:end_idx], [
                        Y_plot[start_idx:end_idx, idx_station],
                        output_nn_all[start_idx:end_idx, idx_station]
                    ],
                    title=F'{cur_station}',
                    labels=['GT', 'NN'],
                    file_name_prefix=F'{cur_station}_{c_example}')

    # ************ Recovering original units********
    print('Recovering original units....')
    nn_df = pd.DataFrame(output_nn_all,
                         columns=stations_columns,
                         index=filtered_data.index[y_times_idx])
    nn_original_units = deNormalize(nn_df)
    Y_original = deNormalize(Y_df)

    # ************ Computing metrics********
    print('Computing metrics and saving predictions....')
    compute_metrics(Y_original, nn_original_units, metrics_user, split_info,
                    output_file_name, stations_columns)
import numpy as np
import pandas as pd
import seaborn as sns
from img_viz.eoa_viz import EOAImageVisualizer
import matplotlib.pyplot as plt
from os.path import join

import xarray as xr

# Module-level visualizer built with its default arguments; access_data below plots through it
viz_obj = EOAImageVisualizer()


def data_summary(ds):
    """Print a banner, the head of *ds* and the statistics of its dataframe.

    :param ds: object providing ``head()`` and ``to_dataframe()``; the result
        of ``to_dataframe()`` must provide ``describe()``
    """
    print("------------- Data summary ---------------------")
    preview = ds.head()
    print(preview)
    # The statistics are computed on the tabular form of the dataset
    frame = ds.to_dataframe()
    stats = frame.describe()
    print(stats)


def access_data(ds):
    """Show two ways of reading a single variable from *ds*.

    The example assumes two variables (tmin, tmax) with two dimensions each
    (time:731, location:3).

    :param ds: dataset providing ``ds["time"]`` and ``ds["tmin"]``
    """
    # Indexing reference: http://xarray.pydata.org/en/stable/indexing.html
    time_steps = range(len(ds["time"]))

    # Positional access: all the times of index 0 in the second dimension
    tmin_by_index = ds["tmin"][:, 0]
    viz_obj.plot_1d_data_np(time_steps, [tmin_by_index],
                            title="Single var and dim")

    # Label-based access: all the times of the entry named "IA".
    # The value is read but not used further in this function.
    tmin_by_name = ds["tmin"].loc[:, "IA"]
Exemplo n.º 4
0
def test_model(config):
    """Run a trained 2D network tile by tile over the domain and plot results.

    For every selected ``model_<year>_<day>.nc`` file the matching increment
    and observation files are read, the domain is swept in tiles of
    ``rows`` x ``cols``, the network predicts each tile, inputs/outputs are
    de-normalized, per-tile figures are written and the tiles are stitched
    into a whole-domain prediction whose RMSE is accumulated.

    :param config: dictionary with the PredictionParams, ProjTrainingParams
        and TrainingParams entries read at the top of the function
    :raises ValueError: if the configured network type is not supported
    """
    input_folder = config[PredictionParams.input_folder]
    output_folder = config[PredictionParams.output_folder]
    output_fields = config[ProjTrainingParams.output_fields]
    model_weights_file = config[PredictionParams.model_weights_file]
    output_imgs_folder = config[PredictionParams.output_imgs_folder]
    field_names_model = config[ProjTrainingParams.fields_names]
    field_names_obs = config[ProjTrainingParams.fields_names_obs]
    rows = config[ProjTrainingParams.rows]
    cols = config[ProjTrainingParams.cols]
    run_name = config[TrainingParams.config_name]
    norm_type = config[ProjTrainingParams.norm_type]

    output_imgs_folder = join(output_imgs_folder, run_name)
    create_folder(output_imgs_folder)

    # *********** Chooses the proper model ***********
    print('Reading model ....')
    net_type = config[ProjTrainingParams.network_type]
    model = None
    if net_type == NetworkTypes.UNET or net_type == NetworkTypes.UNET_MultiStream:
        model = select_2d_model(config, last_activation=None)
    if net_type == NetworkTypes.SimpleCNN_2:
        model = simpleCNN(config, nn_type="2d", hid_lay=2, out_lay=2)
    if net_type == NetworkTypes.SimpleCNN_4:
        model = simpleCNN(config, nn_type="2d", hid_lay=4, out_lay=2)
    if net_type == NetworkTypes.SimpleCNN_8:
        model = simpleCNN(config, nn_type="2d", hid_lay=8, out_lay=2)
    if net_type == NetworkTypes.SimpleCNN_16:
        model = simpleCNN(config, nn_type="2d", hid_lay=16, out_lay=2)
    if model is None:
        # Fail with a clear message instead of an unbound-variable error below
        raise ValueError(F"Unsupported network type: {net_type}")

    plot_model(model,
               to_file=join(output_folder, F'running.png'),
               show_shapes=True)

    # *********** Reads the weights***********
    print('Reading weights ....')
    model.load_weights(model_weights_file)

    # *********** Read files to predict***********
    all_files = os.listdir(input_folder)
    all_files.sort()
    model_files = np.array([x for x in all_files if x.startswith('model')])

    z_layers = [0]
    var_file = join(input_folder, "cov_mat", "tops_ias_std.nc")
    field_names_std = config[ProjTrainingParams.fields_names_var]
    if len(field_names_std) > 0:
        input_fields_std = read_netcdf(var_file, field_names_std, z_layers)
    else:
        input_fields_std = []

    cmap_out = chooseCMAP(output_fields)
    cmap_model = chooseCMAP(field_names_model)
    cmap_obs = chooseCMAP(field_names_obs)
    cmap_std = chooseCMAP(field_names_std)

    # Size of the whole domain; used for the sweep and for the stitched arrays
    tot_rows = 891
    tot_cols = 1401

    all_whole_mean_times = []
    all_whole_sum_times = []
    all_whole_rmse = []

    # np.random.shuffle(model_files)  # TODO this is only for testing
    for c_file in model_files:
        # File names follow model_<year>_<day of year>.nc
        year = int(c_file.split('_')[1])
        day_of_year = int(c_file.split('_')[2].split('.')[0])

        # NOTE(review): only day 5 of each year is processed, which makes the
        # `day_of_year == 353` test further down unreachable. Looks like a
        # leftover debugging filter -- confirm.
        if day_of_year != 5:
            continue

        model_file = join(input_folder, F'model_{year}_{day_of_year:03d}.nc')
        inc_file = join(input_folder, F'increment_{year}_{day_of_year:03d}.nc')
        obs_file = join(input_folder, F'obs_{year}_{day_of_year:03d}.nc')

        # *********************** Reading files **************************
        input_fields_model = read_netcdf(model_file, field_names_model,
                                         z_layers)
        input_fields_obs = read_netcdf(obs_file, field_names_obs, z_layers)
        output_field_increment = read_netcdf(inc_file, output_fields, z_layers)

        # ******************* Normalizing and Cropping Data *******************
        # Stitched prediction and target for the whole domain
        whole_cnn = np.zeros((tot_rows, tot_cols))
        whole_y = np.zeros((tot_rows, tot_cols))

        this_file_times = []

        start_row = 0
        donerow = False
        while not donerow:
            donecol = False
            start_col = 0
            while not donecol:
                # Generate the proper inputs for the NN
                try:
                    perc_ocean = .05
                    input_data, y_data = generateXandY(input_fields_model,
                                                       input_fields_obs,
                                                       input_fields_std,
                                                       output_field_increment,
                                                       field_names_model,
                                                       field_names_obs,
                                                       field_names_std,
                                                       output_fields,
                                                       start_row,
                                                       start_col,
                                                       rows,
                                                       cols,
                                                       norm_type=norm_type,
                                                       perc_ocean=perc_ocean)
                except Exception:
                    # NOTE(review): every failure of generateXandY is treated
                    # as a land-only tile and skipped; presumably it raises
                    # when the ocean fraction is below perc_ocean -- confirm.
                    print(F"Land for {c_file} row:{start_row} col:{start_col}")
                    start_col, donecol = verifyBoundaries(
                        start_col, cols, tot_cols)
                    continue

                # ******************* Replacing nan values *********
                # NaNs become 0 in the input and -0.5 in the target; the -0.5
                # marker is used below to find the land cells again
                input_data_nans = np.isnan(input_data)
                input_data = np.nan_to_num(input_data, nan=0)
                y_data = np.nan_to_num(y_data, nan=-0.5)

                X = np.expand_dims(input_data, axis=0)
                Y = np.expand_dims(y_data, axis=0)

                # Make the prediction of the network (and time it)
                start = time.time()
                output_nn_original = model.predict(X, verbose=1)
                toc = time.time() - start
                this_file_times.append(toc)

                # Make nan all values inside the land
                land_indexes = Y == -0.5
                output_nn_original[land_indexes] = np.nan

                # ==== Denormalizing all inputs and outputs
                denorm_cnn_output = denormalizeData(output_nn_original,
                                                    output_fields,
                                                    PreprocParams.type_inc,
                                                    norm_type)
                denorm_y = denormalizeData(Y, output_fields,
                                           PreprocParams.type_inc, norm_type)
                input_types = [
                    PreprocParams.type_model for _ in input_fields_model
                ] + [PreprocParams.type_obs for _ in input_fields_obs
                     ] + [PreprocParams.type_std for _ in input_fields_std]
                denorm_input = denormalizeData(
                    input_data,
                    field_names_model + field_names_obs + field_names_std,
                    input_types, norm_type)

                # Recover the original land areas, they are lost after denormalization
                denorm_input[input_data_nans] = np.nan
                denorm_y[land_indexes] = np.nan

                # Remove the 'extra dimension'
                denorm_cnn_output = np.squeeze(denorm_cnn_output)
                denorm_y = np.squeeze(denorm_y)
                # Add the tile to the 'whole prediction' and 'whole target'
                whole_cnn[start_row:start_row + rows,
                          start_col:start_col + cols] = denorm_cnn_output
                whole_y[start_row:start_row + rows,
                        start_col:start_col + cols] = denorm_y

                # ---------- Per-tile figures (currently made for every tile) ----------
                if len(denorm_cnn_output.shape) == 2:
                    # With a single output field a trailing axis is needed to plot
                    denorm_cnn_output = np.expand_dims(denorm_cnn_output,
                                                       axis=2)
                    denorm_y = np.expand_dims(denorm_y, axis=2)

                # Compute the RMSE of each output field over the ocean cells
                rmse_cnn = np.zeros(len(output_fields))
                for i in range(len(output_fields)):
                    ocean_indexes = np.logical_not(
                        np.isnan(denorm_y[:, :, i]))
                    rmse_cnn[i] = np.sqrt(
                        mean_squared_error(
                            denorm_cnn_output[:, :, i][ocean_indexes],
                            denorm_y[:, :, i][ocean_indexes]))

                viz_obj = EOAImageVisualizer(
                    output_folder=output_imgs_folder, disp_images=False)

                # ================== DISPLAYS ALL INPUTS AND OUTPUTS DENORMALIZED ===================
                viz_obj.plot_2d_data_np_raw(
                    np.concatenate(
                        (denorm_input.swapaxes(0, 2),
                         denorm_y.swapaxes(0, 2),
                         denorm_cnn_output.swapaxes(0, 2))),
                    var_names=[F"in_model_{x}" for x in field_names_model] +
                    [F"in_obs_{x}" for x in field_names_obs] +
                    [F"in_var_{x}" for x in field_names_std] +
                    [F"out_inc_{x}" for x in output_fields] +
                    [F"cnn_{x}" for x in output_fields],
                    file_name=
                    F"Input_and_CNN_{c_file}_{start_row:03d}_{start_col:03d}",
                    cmap=cmap_model + cmap_obs + cmap_std + cmap_out +
                    cmap_out,
                    rot_90=True,
                    cols_per_row=len(field_names_model),
                    title=
                    F"Input data: {field_names_model} and obs {field_names_obs}, increment {output_fields}, cnn {output_fields}"
                )

                # =========== Making the same color bar for desired output and the NN =====================
                mincbar = [
                    np.nanmin(denorm_y[:, :, x])
                    for x in range(denorm_cnn_output.shape[-1])
                ]
                maxcbar = [
                    np.nanmax(denorm_y[:, :, x])
                    for x in range(denorm_cnn_output.shape[-1])
                ]
                error = (denorm_y - denorm_cnn_output).swapaxes(0, 2)
                mincbarerror = [
                    np.nanmin(error[i, :, :])
                    for i in range(len(output_fields))
                ]
                maxcbarerror = [
                    np.nanmax(error[i, :, :])
                    for i in range(len(output_fields))
                ]
                viz_obj = EOAImageVisualizer(
                    output_folder=output_imgs_folder,
                    disp_images=False,
                    mincbar=mincbar + mincbar + mincbarerror,
                    maxcbar=maxcbar + maxcbar + maxcbarerror)

                # ================== Displays CNN and TSIS with RMSE ================
                viz_obj.output_folder = join(output_imgs_folder,
                                             'JoinedErrrorCNN')
                cmap = chooseCMAP(output_fields)
                error_cmap = cmocean.cm.diff
                viz_obj.plot_2d_data_np_raw(
                    np.concatenate((denorm_cnn_output.swapaxes(0, 2),
                                    denorm_y.swapaxes(0, 2), error),
                                   axis=0),
                    var_names=[F"CNN INC {x}" for x in output_fields] +
                    [F"TSIS INC {x}" for x in output_fields] +
                    [F'RMSE {c_rmse_cnn:0.4f}' for c_rmse_cnn in rmse_cnn],
                    file_name=
                    F"AllError_{c_file}_{start_row:03d}_{start_col:03d}",
                    rot_90=True,
                    # One error colormap per output field, matching var_names
                    cmap=cmap + cmap + [error_cmap] * len(output_fields),
                    cols_per_row=len(output_fields),
                    title=F"{output_fields} RMSE: {np.mean(rmse_cnn):0.5f}"
                )

                # Move to the next tile of this row
                start_col, donecol = verifyBoundaries(start_col, cols,
                                                      tot_cols)
            # Move to the next row of tiles
            start_row, donerow = verifyBoundaries(start_row, rows, tot_rows)

        # ======= Plots whole output with RMSE
        mincbar = np.nanmin(whole_y) / 2
        maxcbar = np.nanmax(whole_y) / 2
        error = whole_y - whole_cnn
        mincbarerror = np.nanmin(error) / 2
        maxcbarerror = np.nanmax(error) / 2

        # Cells that received a prediction and are not NaN. NaN cells must be
        # left out of the count because nansum leaves them out of the sum
        # (np.count_nonzero alone would count NaN as non-zero).
        valid_cells = np.logical_and(np.logical_not(np.isnan(error)),
                                     whole_cnn != 0)
        tot_valid_cells = np.count_nonzero(valid_cells)
        if tot_valid_cells > 0:
            rmse_cnn = np.sqrt(np.nansum(error**2) / tot_valid_cells)
        else:
            rmse_cnn = np.nan
        all_whole_rmse.append(rmse_cnn)
        all_whole_mean_times.append(np.mean(np.array(this_file_times)))
        all_whole_sum_times.append(np.sum(np.array(this_file_times)))

        if np.random.random(
        ) > .9 or day_of_year == 353:  # Plot 10% of the times
            # One limit per plotted panel (CNN, TSIS, error), as in the
            # per-tile call above; adding the scalars would collapse them
            # into a single number
            viz_obj = EOAImageVisualizer(
                output_folder=output_imgs_folder,
                disp_images=False,
                mincbar=[mincbar, mincbar, mincbarerror],
                maxcbar=[maxcbar, maxcbar, maxcbarerror])

            # ================== Displays CNN and TSIS with RMSE ================
            viz_obj.output_folder = join(output_imgs_folder,
                                         'WholeOutput_CNN_TSIS')
            viz_obj.plot_2d_data_np_raw(
                [
                    np.flip(whole_cnn, axis=0),
                    np.flip(whole_y, axis=0),
                    np.flip(error, axis=0)
                ],
                var_names=[F"CNN INC {x}" for x in output_fields] +
                [F"TSIS INC {x}"
                 for x in output_fields] + [F'RMSE {rmse_cnn:0.4f}'],
                file_name=F"WholeOutput_CNN_TSIS_{c_file}",
                rot_90=False,
                cols_per_row=3,
                cmap=cmocean.cm.algae,
                title=F"{output_fields} RMSE: {np.mean(rmse_cnn):0.5f}")
def trainModel(config, cur_pollutant, cur_station):
    """Train a 1D model that forecasts one pollutant at one station.

    Kept as a separate function so that tf 'cleans' the memory between runs.
    The function reads ``<pollutant>_<station>.csv``, min-max normalizes it,
    pairs every row with the row found ``forecasted_hours`` later, splits the
    examples, saves the split and normalization information, plots a sample
    of the input/target series and fits the model.

    :param config: dictionary with the TrainingParams, LocalTrainingParams
        and ModelParams entries read below; ``ModelParams.INPUT_SIZE`` is
        written into it
    :param cur_pollutant: name of the pollutant; also the target column
    :param cur_station: name of the station, used to build file names
    """
    input_folder = config[TrainingParams.input_folder]
    output_folder = config[TrainingParams.output_folder]

    val_perc = config[TrainingParams.validation_percentage]
    test_perc = config[TrainingParams.test_percentage]
    eval_metrics = config[TrainingParams.evaluation_metrics]
    loss_func = config[TrainingParams.loss_function]
    batch_size = config[TrainingParams.batch_size]
    epochs = config[TrainingParams.epochs]
    model_name_user = config[TrainingParams.config_name]
    optimizer = config[TrainingParams.optimizer]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]

    split_info_folder = join(output_folder, 'Splits')
    parameters_folder = join(output_folder, 'Parameters')
    weights_folder = join(output_folder, 'models')
    logs_folder = join(output_folder, 'logs')
    imgs_folder = join(output_folder, 'imgs')
    create_folder(split_info_folder)
    create_folder(parameters_folder)
    create_folder(weights_folder)
    create_folder(logs_folder)

    viz_obj = EOAImageVisualizer(output_folder=imgs_folder, disp_images=False)

    print(
        F"============ Reading data for: {cur_pollutant} -- {cur_station} =========================="
    )
    db_file_name = join(input_folder, constants.merge_output_folder.value,
                        F"{cur_pollutant}_{cur_station}.csv")
    data = pd.read_csv(db_file_name, index_col=0)

    config[ModelParams.INPUT_SIZE] = len(data.columns)
    print(F'Data shape: {data.shape} Data axes {data.axes}')
    print("Done!")

    # Predicting for the next value after 24hrs (only one)
    print("Normalizing data....")
    datetimes_str = data.index.values
    datetimes = np.array([
        datetime.strptime(x, constants.datetime_format.value)
        for x in datetimes_str
    ])

    scaler = preprocessing.MinMaxScaler()
    scaler = scaler.fit(data)
    data_norm_np = scaler.transform(data)
    data_norm_df = DataFrame(data_norm_np,
                             columns=data.columns,
                             index=data.index)
    print(F'Done!')

    # Filtering only dates where there is data "forecasted hours after" (24 hrs after)
    print(F"\tBuilding X and Y ....")
    # First position of every datetime, so each lookup is O(1) instead of a
    # scan over the whole array
    first_idx_by_datetime = {}
    for i, c_datetime in enumerate(datetimes):
        first_idx_by_datetime.setdefault(c_datetime, i)

    accepted_times_idx = []
    y_times_idx = []
    forecast_delta = timedelta(hours=forecasted_hours)
    for i, c_datetime in enumerate(datetimes):
        forecasted_idx = first_idx_by_datetime.get(c_datetime + forecast_delta)
        if forecasted_idx is not None:
            accepted_times_idx.append(i)
            y_times_idx.append(forecasted_idx)

    X_df = data_norm_df.loc[datetimes_str[accepted_times_idx]]
    Y_df = data_norm_df.loc[datetimes_str[y_times_idx]][cur_pollutant]
    X = X_df.values
    Y = Y_df.values

    print(F'X shape: {X.shape} Y shape: {Y.shape}')

    tot_examples = X.shape[0]
    rows_to_read = np.arange(tot_examples)

    # ================ Split definition =================
    [train_ids, val_ids, test_ids
     ] = utilsNN.split_train_validation_and_test(tot_examples,
                                                 val_percentage=val_perc,
                                                 test_percentage=test_perc)

    print("Train examples (total:{}) :{}".format(len(train_ids),
                                                 rows_to_read[train_ids]))
    print("Validation examples (total:{}) :{}:".format(len(val_ids),
                                                       rows_to_read[val_ids]))
    print("Test examples (total:{}) :{}".format(len(test_ids),
                                                rows_to_read[test_ids]))

    print("Selecting and generating the model....")
    now = datetime.utcnow().strftime("%Y_%m_%d_%H_%M")
    model_name = F'{model_name_user}_{now}_{cur_pollutant}_{cur_station}'

    # ******************* Selecting the model **********************
    model = select_1d_model(config)
    plot_model(model,
               to_file=join(output_folder, F'{model_name}.png'),
               show_shapes=True)

    print("Saving split information...")
    file_name_splits = join(split_info_folder, F'{model_name}.csv')
    info_splits = DataFrame({F'Train({len(train_ids)})': train_ids})
    # Validation and test ids are stored in columns as long as the train
    # column, padded with 0. Each column is built completely before being
    # assigned: chained `df[col][a:b] = ...` may write to a temporary copy.
    for split_name, split_ids in (('Validation', val_ids), ('Test',
                                                            test_ids)):
        padded_ids = np.zeros(len(info_splits), dtype=np.int64)
        padded_ids[0:len(split_ids)] = split_ids
        info_splits[F'{split_name}({len(split_ids)})'] = padded_ids
    info_splits.to_csv(file_name_splits, index=None)

    print(F"Norm params: {scaler.get_params()}")
    file_name_normparams = join(parameters_folder, F'{model_name}.txt')
    utilsNN.save_norm_params(file_name_normparams, NormParams.min_max, scaler)

    print("Getting callbacks ...")

    [logger, save_callback, stop_callback] = utilsNN.get_all_callbacks(
        model_name=model_name,
        early_stopping_func=F'val_{eval_metrics[0].__name__}',
        weights_folder=weights_folder,
        logs_folder=logs_folder)

    print("Compiling model ...")
    model.compile(loss=loss_func, optimizer=optimizer, metrics=eval_metrics)

    print("Training ...")
    # This part should be somehow separated, it will change for every project
    x_train = X[train_ids, :]
    y_train = Y[train_ids]
    x_val = X[val_ids, :]
    y_val = Y[val_ids]

    # Plotting some intermediate results
    import matplotlib.pyplot as plt
    size = 24 * 60  # Number of rows shown in the sample plot
    # With fewer rows than `size` randint would raise; plot from the start
    max_start = len(data) - size
    start = np.random.randint(0, max_start) if max_start > 0 else 0
    end = start + size
    plt.figure(figsize=[64, 8])
    x_plot = range(len(X_df.iloc[start:end].index.values))
    y_plot = X_df.iloc[start:end][cur_pollutant].values
    yy_plot = Y_df.iloc[start:end].values
    viz_obj.plot_1d_data_np(x_plot, [y_plot, yy_plot],
                            title=F"{cur_pollutant}_{cur_station}",
                            labels=['Current', 'Desired'],
                            wide_ratio=4,
                            file_name_prefix=F"{cur_pollutant}_{cur_station}")

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_val, y_val),
              shuffle=True,
              callbacks=[logger, save_callback, stop_callback])
Exemplo n.º 6
0
def img_generation_hycom(proc_id):
    """
    Makes images of the HYCOM model fields for every configured day.

    For each year/month of the preprocessing configuration, it looks for the
    HYCOM archive file of each day, reads the configured fields and layers and
    plots every field with EOAImageVisualizer.

    The days are split between processes: a day is only processed when
    ``day_index % NUM_PROC == proc_id`` (day_index is the 0-based position of
    the day inside its month).

    :param proc_id: id of the current process, used to select its share of days
    :return: None
    """
    config = get_preproc_config()
    input_folder_forecast = config[PreprocParams.input_folder_hycom]
    output_folder = config[PreprocParams.imgs_output_folder]
    YEARS = config[PreprocParams.YEARS]
    MONTHS = config[PreprocParams.MONTHS]
    fields = config[PreprocParams.fields_names]
    plot_modes = config[PreprocParams.plot_modes_per_field]
    layers = config[PreprocParams.layers_to_plot]

    img_viz = EOAImageVisualizer(output_folder=output_folder,
                                 disp_images=False)

    for c_year in YEARS:
        for c_month in MONTHS:
            try:
                _, days_of_year = get_days_from_month(c_month)
                hycom_files, hycom_paths = get_hycom_file_name(
                    input_folder_forecast, c_year, c_month)
            except Exception as e:
                # The month is skipped, but the cause is reported
                print(
                    F"Failed to find any file for date {c_year}-{c_month}: {e}"
                )
                continue

            # The days are distributed by index so the function can be
            # executed by several processes at the same time
            for day_idx, c_day_of_year in enumerate(days_of_year):
                if (day_idx % NUM_PROC) != proc_id:
                    continue

                # Calendar day (1-based). It is used on titles, messages and
                # file names so that all of them show the same date.
                c_day_of_month = day_idx + 1

                # Raw string: '\S' must reach the regex engine untouched
                re_hycom = re.compile(
                    rF'archv.{c_year}_{c_day_of_year:03d}\S*.a')
                hycom_file_idx = next(
                    (i for i, file in enumerate(hycom_files)
                     if re_hycom.search(file) is not None), None)
                if hycom_file_idx is None:
                    print(
                        F"ERROR: The file for date {c_year} - {c_month} - {c_day_of_month} doesn't exist"
                    )
                    continue

                hycom_path = hycom_paths[hycom_file_idx]
                print(
                    F" =============== Working with: {hycom_files[hycom_file_idx]} ============= "
                )
                print(F"Available fields: {read_field_names(hycom_path)}")
                model_state_np_fields = read_hycom_fields(hycom_path,
                                                          fields,
                                                          layers=layers)

                c_date = F"{c_year}_{c_month:02d}_{c_day_of_month:02d}"
                # plot_modes is indexed with the position of the field
                for idx_field, c_field_name in enumerate(fields):
                    # ======================= Only HYCOM ==================
                    img_viz.plot_3d_data_np(
                        [model_state_np_fields[c_field_name]],
                        var_names=['HYCOM'],
                        title=F"{c_field_name} {c_date}",
                        file_name_prefix=F'HYCOM_{c_field_name}_{c_date}',
                        z_lavels_names=layers,
                        flip_data=True,
                        plot_mode=plot_modes[idx_field])
Exemplo n.º 7
0
def compute_consecutive_days_difference():
    """
    Computes the difference between consecutive days on the hycom files.

    For every configured year it visits each day of each configured month.
    When the file of the day and the file of the previous day are both found,
    it computes, for each field, ``abs(nanmean(previous_day - current_day))``.
    At the end of the year it plots the values of all the fields together and
    one additional plot per field.

    NOTE(review): the files are listed month by month; assuming
    get_hycom_file_name only returns files of the requested month, the first
    day of each month never finds its previous day and is skipped. Confirm.

    NOTE(review): the file name prefixes of the plots do not include the year;
    verify that plots of different years are not overwritten.

    :return: None
    """
    config = get_preproc_config()
    input_folder_forecast = config[PreprocParams.input_folder_hycom]
    output_folder = config[PreprocParams.imgs_output_folder]
    YEARS = config[PreprocParams.YEARS]
    MONTHS = config[PreprocParams.MONTHS]
    fields = config[PreprocParams.fields_names]
    layers = config[PreprocParams.layers_to_plot]

    img_viz = EOAImageVisualizer(output_folder=output_folder,
                                 disp_images=False)

    def first_match_idx(pattern, file_names):
        """Index of the first file name matching pattern (None if none)."""
        return next((idx for idx, name in enumerate(file_names)
                     if re.search(pattern, name) is not None), None)

    for c_year in YEARS:
        # Differences of the whole year, one list per field
        diff_per_field = {field: [] for field in fields}
        days_with_data = []
        for c_month in MONTHS:
            try:
                _, days_of_year = get_days_from_month(c_month)
                hycom_files, hycom_paths = get_hycom_file_name(
                    input_folder_forecast, c_year, c_month)
            except Exception as e:
                # The month is skipped, but the cause is reported
                print(
                    F"Failed to find any file for date {c_year}-{c_month}: {e}"
                )
                continue

            for day_idx, c_day_of_year in enumerate(days_of_year):
                print(
                    F"---------- Year {c_year} day: {c_day_of_year} --------------"
                )
                # Raw strings: '\S' must reach the regex engine untouched
                re_hycom = rF'archv.{c_year}_{c_day_of_year:03d}\S*.a'
                re_hycom_prev = rF'archv.{c_year}_{(c_day_of_year-1):03d}\S*.a'

                hycom_file_idx = first_match_idx(re_hycom, hycom_files)
                hycom_file_idx_prev = first_match_idx(re_hycom_prev,
                                                      hycom_files)
                if hycom_file_idx is None or hycom_file_idx_prev is None:
                    # day_idx is 0-based, the message shows the calendar day
                    print(
                        F"ERROR: The file for date {c_year} - {c_month} - {day_idx + 1} (and prev day) don't exist"
                    )
                    continue

                days_with_data.append(c_day_of_year)
                model_state_np_fields = read_hycom_fields(
                    hycom_paths[hycom_file_idx], fields, layers=layers)
                model_state_np_fields_prev = read_hycom_fields(
                    hycom_paths[hycom_file_idx_prev], fields, layers=layers)

                # One value per field and day: absolute value of the mean
                # (ignoring NaNs) of the signed difference
                for c_field_name in fields:
                    c_diff = np.abs(
                        np.nanmean(model_state_np_fields_prev[c_field_name] -
                                   model_state_np_fields[c_field_name]))
                    diff_per_field[c_field_name].append(c_diff)

        if not days_with_data:
            print(F"No consecutive days found for year {c_year}, nothing to plot")
            continue

        # Plots the differences between consecutive days. For all the fields together.
        img_viz.plot_1d_data_np(
            days_with_data, list(diff_per_field.values()),
            title='Difference between days',
            labels=fields,
            file_name_prefix='HYCOM_Diff_Between_Days',
            wide_ratio=4)
        # Plots the differences between consecutive days. Separated by fields
        for field, field_diffs in diff_per_field.items():
            img_viz.plot_1d_data_np(
                days_with_data, [field_diffs],
                title=F'Difference between days {field}',
                labels=[field],
                file_name_prefix=F'HYCOM_Diff_Between_Days_{field}',
                wide_ratio=4)
Exemplo n.º 8
0
def plot_raw_data_new(proc_id):
    """
    Plots, for every configured day, the model state together with the increment.

    A day is only processed when its increment (TSIS), model (HYCOM) and
    observations files are all found. The observations file is only opened to
    print the names of its variables; the current code does not plot the
    observations.

    The plotted "increment" is the increment field itself for ``srfhgt`` and
    ``model - increment`` for any other field.

    The days are split between processes: a day is only processed when
    ``day_index % NUM_PROC == proc_id`` (day_index is the 0-based position of
    the day inside its month).

    :param proc_id: id of the current process, used to select its share of days
    :return: None
    """
    config = get_preproc_config()
    input_folder_tsis = config[PreprocParams.input_folder_tsis]
    input_folder_forecast = config[PreprocParams.input_folder_hycom]
    input_folder_obs = config[PreprocParams.input_folder_obs]
    output_folder = config[PreprocParams.imgs_output_folder]
    YEARS = config[PreprocParams.YEARS]
    MONTHS = config[PreprocParams.MONTHS]
    fields = config[PreprocParams.fields_names]
    plot_modes = config[PreprocParams.plot_modes_per_field]
    layers = config[PreprocParams.layers_to_plot]

    img_viz = EOAImageVisualizer(output_folder=output_folder,
                                 disp_images=False)

    def first_match_idx(pattern, file_names):
        """Index of the first file name matching pattern (None if none)."""
        return next((idx for idx, name in enumerate(file_names)
                     if re.search(pattern, name) is not None), None)

    for c_year in YEARS:
        for c_month in MONTHS:
            try:
                _, days_of_year = get_days_from_month(c_month)
                # Lists the files (increment, model and observations)
                increment_files, increment_paths = get_hycom_file_name(
                    input_folder_tsis, c_year, c_month)
                hycom_files, hycom_paths = get_hycom_file_name(
                    input_folder_forecast, c_year, c_month, day_idx=2)
                obs_files, obs_paths = get_obs_file_names(
                    input_folder_obs, c_year, c_month)
            except Exception as e:
                # The month is skipped, but the cause is reported
                print(
                    F"Failed to find any file for date {c_year}-{c_month}: {e}"
                )
                continue

            # The days are distributed by index so the function can be
            # executed by several processes at the same time
            for day_idx, c_day_of_year in enumerate(days_of_year):
                if (day_idx % NUM_PROC) != proc_id:
                    continue

                c_day_of_month = day_idx + 1  # Calendar day (1-based)

                # Raw strings: '\S' must reach the regex engine untouched
                re_tsis = rF'incupd.{c_year}_{c_day_of_year:03d}\S*.a'
                re_hycom = rF'020_archv.{c_year}_{c_day_of_year:03d}\S*.a'
                re_obs = rF'tsis_obs_gomb4_{c_year}{c_month:02d}{c_day_of_month:02d}\S*.nc'

                increment_file_idx = first_match_idx(re_tsis, increment_files)
                hycom_file_idx = first_match_idx(re_hycom, hycom_files)
                obs_file_idx = first_match_idx(re_obs, obs_files)
                if None in (increment_file_idx, hycom_file_idx, obs_file_idx):
                    print(
                        F"ERROR: The file for date {c_year} - {c_month} - {c_day_of_month} doesn't exist"
                    )
                    continue

                increment_path = increment_paths[increment_file_idx]
                hycom_path = hycom_paths[hycom_file_idx]

                print(
                    F" =============== Working with: {increment_files[increment_file_idx]} ============= "
                )
                print(
                    F"Available fields on increment: {read_field_names(increment_path)}"
                )
                print(
                    F"Available fields on model: {read_field_names(hycom_path)}"
                )
                # The dataset is only needed to list its variables, the
                # context manager closes the file right after
                with xr.open_dataset(obs_paths[obs_file_idx]) as ds:
                    print(
                        F"Available fields on observations: {list(ds.keys())}"
                    )

                model_state_np_fields = read_hycom_fields(hycom_path,
                                                          fields,
                                                          layers=layers)
                increment_np_fields = read_hycom_fields(increment_path,
                                                        fields,
                                                        layers=layers)

                c_date = F"{c_year}_{c_month:02d}_{c_day_of_month:02d}"
                # Iterate over the fields defined in PreprocConfig and plot them
                for idx_field, c_field_name in enumerate(fields):
                    increment_np_c_field = increment_np_fields[c_field_name]
                    # Zeros of the increment are replaced by NaN (in place)
                    increment_np_c_field[increment_np_c_field == 0] = np.nan
                    model_state_np_c_field = model_state_np_fields[
                        c_field_name]

                    if c_field_name == "thknss":
                        # NOTE(review): presumably converts the thickness from
                        # pressure units to meters -- confirm the constant
                        divide = 9806
                        model_state_np_c_field = model_state_np_c_field / divide
                        increment_np_c_field = increment_np_c_field / divide

                    if c_field_name == "srfhgt":
                        inc = increment_np_c_field
                    else:
                        inc = model_state_np_c_field - increment_np_c_field

                    # ======================= Only Background state and TSIS increment ==================
                    try:
                        img_viz.plot_3d_data_np(
                            [model_state_np_c_field, inc],
                            var_names=['HYCOM', 'Increment (TSIS)'],
                            title=F"{c_field_name} {c_date}",
                            file_name_prefix=
                            F'ModelAndIncrement_{c_field_name}_{c_date}',
                            z_lavels_names=layers,
                            flip_data=True,
                            plot_mode=plot_modes[idx_field])
                    except Exception as e:
                        # A field that fails to plot must not stop the others
                        print(F"Failed for field: {c_field_name}: {e}")
Exemplo n.º 9
0
def plot_raw_data(proc_id):
    """
    Makes images of the available data (observations, HYCOM model and TSIS increment).

    A day is only processed when its increment, model and observations files
    are all found. For each configured field a single figure is requested with
    the three sources: observations (read with ``layers=[0]`` and expanded with
    a leading axis), the model state and the increment.

    The days are split between processes: a day is only processed when
    ``day_index % NUM_PROC == proc_id`` (day_index is the 0-based position of
    the day inside its month).

    :param proc_id: id of the current process, used to select its share of days
    :return: None
    """
    config = get_preproc_config()
    input_folder_tsis = config[PreprocParams.input_folder_tsis]
    input_folder_forecast = config[PreprocParams.input_folder_hycom]
    input_folder_obs = config[PreprocParams.input_folder_obs]
    output_folder = config[PreprocParams.imgs_output_folder]
    YEARS = config[PreprocParams.YEARS]
    MONTHS = config[PreprocParams.MONTHS]
    fields = config[PreprocParams.fields_names]
    fields_obs = config[PreprocParams.fields_names_obs]
    plot_modes = config[PreprocParams.plot_modes_per_field]
    layers = config[PreprocParams.layers_to_plot]

    img_viz = EOAImageVisualizer(output_folder=output_folder,
                                 disp_images=False)

    def first_match_idx(pattern, file_names):
        """Index of the first file name matching pattern (None if none)."""
        return next((idx for idx, name in enumerate(file_names)
                     if re.search(pattern, name) is not None), None)

    for c_year in YEARS:
        for c_month in MONTHS:
            try:
                _, days_of_year = get_days_from_month(c_month)
                # Lists the files (increment, model and observations)
                increment_files, increment_paths = get_hycom_file_name(
                    input_folder_tsis, c_year, c_month)
                hycom_files, hycom_paths = get_hycom_file_name(
                    input_folder_forecast, c_year, c_month)
                obs_files, obs_paths = get_obs_file_names(
                    input_folder_obs, c_year, c_month)
            except Exception as e:
                # The month is skipped, but the cause is reported
                print(
                    F"Failed to find any file for date {c_year}-{c_month}: {e}"
                )
                continue

            # The days are distributed by index so the function can be
            # executed by several processes at the same time
            for day_idx, c_day_of_year in enumerate(days_of_year):
                if (day_idx % NUM_PROC) != proc_id:
                    continue

                c_day_of_month = day_idx + 1  # Calendar day (1-based)

                # Raw strings: '\S' must reach the regex engine untouched
                re_tsis = rF'incupd.{c_year}_{c_day_of_year:03d}\S*.a'
                re_hycom = rF'archv.{c_year}_{c_day_of_year:03d}\S*.a'
                re_obs = rF'tsis_obs_ias_{c_year}{c_month:02d}{c_day_of_month:02d}\S*.nc'

                increment_file_idx = first_match_idx(re_tsis, increment_files)
                hycom_file_idx = first_match_idx(re_hycom, hycom_files)
                obs_file_idx = first_match_idx(re_obs, obs_files)
                if None in (increment_file_idx, hycom_file_idx, obs_file_idx):
                    print(
                        F"ERROR: The file for date {c_year} - {c_month} - {c_day_of_month} doesn't exist"
                    )
                    continue

                increment_path = increment_paths[increment_file_idx]
                print(
                    F" =============== Working with: {increment_files[increment_file_idx]} ============= "
                )
                print(
                    F"Available fields on increment: {read_field_names(increment_path)}"
                )
                increment_np_fields = read_hycom_fields(increment_path,
                                                        fields,
                                                        layers=layers)
                model_state_np_fields = read_hycom_fields(
                    hycom_paths[hycom_file_idx], fields, layers=layers)
                # The observation fields are renamed to the model field names,
                # so the three dictionaries share the same keys
                obs_np_fields = read_netcdf(obs_paths[obs_file_idx],
                                            fields_obs,
                                            layers=[0],
                                            rename_fields=fields)

                c_date = F"{c_year}_{c_month:02d}_{c_day_of_month:02d}"
                for idx_field, c_field_name in enumerate(fields):
                    increment_np_c_field = increment_np_fields[c_field_name]
                    # Zeros of the increment are replaced by NaN (in place)
                    increment_np_c_field[increment_np_c_field == 0] = np.nan
                    model_state_np_c_field = model_state_np_fields[
                        c_field_name]
                    obs_np_c_field = obs_np_fields[c_field_name]

                    # ======================= Observations, HYCOM and TSIS increment ==================
                    img_viz.plot_3d_data_np(
                        [
                            # Leading axis added so that the observations have
                            # the same number of dimensions as the other inputs
                            np.expand_dims(obs_np_c_field, 0),
                            model_state_np_c_field,
                            increment_np_c_field
                        ],
                        var_names=[
                            'Observations', 'HYCOM', 'Increment (TSIS)'
                        ],
                        title=F"{c_field_name} {c_date}",
                        file_name_prefix=F'Summary_{c_field_name}_{c_date}',
                        z_lavels_names=layers,
                        flip_data=True,
                        plot_mode=plot_modes[idx_field])
Exemplo n.º 10
0
def _match_forecast_indexes(datetimes, forecasted_hours):
    """
    Pairs each date with the date located `forecasted_hours` later.

    :param datetimes: sequence of datetime objects
    :param forecasted_hours: number of hours between an input and its target
    :return: (input_idx, target_idx), two lists of the same size with the
        positions of the dates that have a target and of those targets
    """
    # A dictionary lookup avoids scanning all the dates for every date.
    # setdefault keeps the first position when a date is repeated.
    position_by_datetime = {}
    for idx, c_datetime in enumerate(datetimes):
        position_by_datetime.setdefault(c_datetime, idx)

    offset = timedelta(hours=forecasted_hours)
    input_idx = []
    target_idx = []
    for idx, c_datetime in enumerate(datetimes):
        c_target_idx = position_by_datetime.get(c_datetime + offset)
        if c_target_idx is not None:
            input_idx.append(idx)
            target_idx.append(c_target_idx)
    return input_idx, target_idx


def main(models_folder='/data/UNAM/Air_Pollution_Forecast/Data/Training/models',
         data_folder='/data/UNAM/Air_Pollution_Forecast/Data/MergedDataCSV'):
    """
    Makes a forecast for every station and plots it against the original values.

    For each station it selects the first model file and the first data file
    whose name contains the station code, normalizes the data with a
    MinMaxScaler fitted on that same data, builds X (all the columns) and Y
    (the configured pollutant `forecasted_hours` later), runs the model and
    plots the first 24 * 60 values of Y and of the prediction.

    A failure on one station is printed and the next station is processed.

    :param models_folder: folder with the model weights of all the stations
    :param data_folder: folder with the merged CSV data of all the stations
    :return: None
    """
    config = get_makeprediction_config()
    # *********** Reads the parameters ***********
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]
    pollutant = config[LocalTrainingParams.pollutant]

    # ********** Reading and preprocessing data *******
    _all_stations = [
        "ACO", "AJM", "AJU", "ARA", "ATI", "AZC", "BJU", "CAM", "CCA", "CES",
        "CFE", "CHO", "COR", "COY", "CUA", "CUI", "CUT", "DIC", "EAJ", "EDL",
        "FAC", "FAN", "GAM", "HAN", "HGM", "IBM", "IMP", "INN", "IZT", "LAA",
        "LAG", "LLA", "LOM", "LPR", "LVI", "MCM", "MER", "MGH", "MIN", "MON",
        "MPA", "NET", "NEZ", "PED", "PER", "PLA", "POT", "SAG", "SFE", "SHA",
        "SJA", "SNT", "SUR", "TAC", "TAH", "TAX", "TEC", "TLA", "TLI", "TPN",
        "UAX", "UIZ", "UNM", "VAL", "VIF", "XAL", "XCH"
    ]

    # The folders and the visualizer are the same for all the stations
    create_folder(output_folder)
    create_folder(output_imgs_folder)
    viz_obj = EOAImageVisualizer(output_folder=output_imgs_folder,
                                 disp_images=False)
    plot_this_many = 24 * 60  # Two months of hourly data

    # Iterate over the stations
    for c_station in _all_stations:
        try:
            # Selects the proper model and data files for the current station
            model_weights_files = [
                join(models_folder, x) for x in listdir(models_folder)
                if c_station in x
            ]
            input_files = [
                join(data_folder, x) for x in listdir(data_folder)
                if c_station in x
            ]
            # Explicit errors instead of assert (asserts are removed with -O)
            if not model_weights_files:
                raise FileNotFoundError(
                    F"No model file for {c_station} in {models_folder}")
            if not input_files:
                raise FileNotFoundError(
                    F"No data file for {c_station} in {data_folder}")

            print(F"Working with: {model_weights_files} and {input_files}")
            model_weights_file = model_weights_files[0]
            input_file = input_files[0]

            data = pd.read_csv(input_file, index_col=0)

            config[ModelParams.INPUT_SIZE] = len(data.columns)
            print(F'Data shape: {data.shape} Data axes {data.axes}')
            print("Done!")

            # Predicting for the next value after 24hrs (only one)
            print("Normalizing data....")
            datetimes = [
                datetime.strptime(x, constants.datetime_format.value)
                for x in data.index.values
            ]

            scaler = preprocessing.MinMaxScaler()
            scaler = scaler.fit(data)
            data_norm_df = DataFrame(scaler.transform(data),
                                     columns=data.columns,
                                     index=data.index)
            print(F'Done!')

            # Filtering only dates where there is data "forecasted hours after" (24 hrs after)
            print(F"\tBuilding X and Y ....")
            accepted_times_idx, y_times_idx = _match_forecast_indexes(
                datetimes, forecasted_hours)

            # The indexes are positions, so the rows are selected by position
            X_df = data_norm_df.iloc[accepted_times_idx]
            Y_df = data_norm_df.iloc[y_times_idx][pollutant]
            X = X_df.values
            Y = Y_df.values

            print(F'X shape: {X.shape} Y shape: {Y.shape}')

            # *********** Chooses the proper model ***********
            print('Reading model ....')
            model = select_1d_model(config)

            # *********** Reads the weights***********
            print('Reading weights ....')
            model.load_weights(model_weights_file)

            # *********** Makes the prediction *********
            t0 = time.time()
            output_nn_all = model.predict(X, verbose=1)

            # Plotting some intermediate results
            import matplotlib.pyplot as plt
            # NOTE(review): a new figure is opened for every station and it is
            # not closed here -- confirm that plot_1d_data_np uses/closes it
            plt.figure(figsize=[64, 8])
            x_plot = range(len(Y))
            viz_obj.plot_1d_data_np(
                x_plot[0:plot_this_many],
                [Y[0:plot_this_many], output_nn_all[0:plot_this_many, 0]],
                title=F"{c_station} {pollutant}",
                labels=['Original', 'Forecasted'],
                wide_ratio=4,
                file_name_prefix=F"{pollutant}_{c_station}")

            print(F'\t Done! Elapsed time {time.time() - t0:0.2f} seg')

        except Exception as e:
            # Best effort: a station that fails must not stop the others
            print(
                F"---------------------------- Failed {c_station} error: {e} ----------------"
            )
Exemplo n.º 11
0
def test_model(config):
    """
    Runs a trained 2D network over every 'model_*' file of the input folder and
    summarizes its error against the stored increment.

    For each ``model_<year>_<day_of_year>.nc`` file the matching ``increment_*`` and
    ``obs_*`` files are read, the network input is built with ``generateXandY``,
    a prediction is made, inputs and outputs are denormalized and plotted, and the
    RMSE and the prediction time are recorded. The summary is saved as
    ``Global_RMSE_and_times.csv`` inside the image folder of the run.

    Files whose input cannot be generated are reported and skipped, so the summary
    only lists the files that were actually evaluated.

    :param config: dictionary indexed by PredictionParams, ProjTrainingParams and
        TrainingParams members
    :return: None
    :raises ValueError: if the configured network type is not supported
    """
    input_folder = config[PredictionParams.input_folder]
    output_folder = config[PredictionParams.output_folder]
    output_fields = config[ProjTrainingParams.output_fields]
    model_weights_file = config[PredictionParams.model_weights_file]
    field_names_model = config[ProjTrainingParams.fields_names]
    field_names_obs = config[ProjTrainingParams.fields_names_obs]
    field_names_std = config[ProjTrainingParams.fields_names_var]
    run_name = config[TrainingParams.config_name]
    norm_type = config[ProjTrainingParams.norm_type]
    net_type = config[ProjTrainingParams.network_type]

    output_imgs_folder = join(config[PredictionParams.output_imgs_folder],
                              run_name)
    create_folder(output_imgs_folder)
    # plot_model writes 'running.png' here, so the folder must exist too
    create_folder(output_folder)

    # *********** Chooses the proper model ***********
    print('Reading model ....')
    simple_cnn_hidden_layers = {
        NetworkTypes.SimpleCNN_2: 2,
        NetworkTypes.SimpleCNN_4: 4,
        NetworkTypes.SimpleCNN_8: 8,
        NetworkTypes.SimpleCNN_16: 16,
    }
    if net_type in (NetworkTypes.UNET, NetworkTypes.UNET_MultiStream):
        model = select_2d_model(config, last_activation=None)
    elif net_type in simple_cnn_hidden_layers:
        model = simpleCNN(config,
                          nn_type="2d",
                          hid_lay=simple_cnn_hidden_layers[net_type],
                          out_lay=2)
    else:
        # Fail here with a clear message instead of a NameError further down
        raise ValueError(F"Unsupported network type: {net_type}")

    plot_model(model,
               to_file=join(output_folder, 'running.png'),
               show_shapes=True)

    # *********** Reads the weights***********
    print('Reading weights ....')
    model.load_weights(model_weights_file)

    # *********** Read files to predict***********
    model_files = sorted(x for x in os.listdir(input_folder)
                         if x.startswith('model'))

    z_layers = [0]
    var_file = join(input_folder, "cov_mat", "tops_ias_std.nc")
    if len(field_names_std) > 0:
        input_fields_std = read_netcdf(var_file, field_names_std, z_layers)
    else:
        input_fields_std = []

    cmap_out = chooseCMAP(output_fields)
    cmap_model = chooseCMAP(field_names_model)
    cmap_obs = chooseCMAP(field_names_obs)
    cmap_std = chooseCMAP(field_names_std)

    # One entry per file that was successfully evaluated
    processed_files = []
    all_whole_mean_times = []
    all_whole_sum_times = []
    all_whole_rmse = []

    for c_file in model_files:
        # File names follow model_<year>_<day_of_year>.nc
        name_parts = c_file.split('_')
        year = int(name_parts[1])
        day_of_year = int(name_parts[2].split('.')[0])

        model_file = join(input_folder, F'model_{year}_{day_of_year:03d}.nc')
        inc_file = join(input_folder, F'increment_{year}_{day_of_year:03d}.nc')
        obs_file = join(input_folder, F'obs_{year}_{day_of_year:03d}.nc')

        # *********************** Reading files **************************
        input_fields_model = read_netcdf(model_file, field_names_model,
                                         z_layers)
        input_fields_obs = read_netcdf(obs_file, field_names_obs, z_layers)
        output_field_increment = read_netcdf(inc_file, output_fields, z_layers)

        # ******************* Normalizing and Cropping Data *******************
        # NOTE(review): grows/gcols are not defined inside this function;
        # presumably module-level crop sizes -- confirm.
        try:
            input_data, y_data = generateXandY(input_fields_model,
                                               input_fields_obs,
                                               input_fields_std,
                                               output_field_increment,
                                               field_names_model,
                                               field_names_obs,
                                               field_names_std,
                                               output_fields,
                                               0,
                                               0,
                                               grows,
                                               gcols,
                                               norm_type=norm_type,
                                               perc_ocean=.01)
        except Exception as e:
            # Without fresh inputs the code below would crash or silently reuse
            # the data of the previous file, so this file is skipped.
            print(F"Exception {e} (skipping {c_file})")
            continue

        # ******************* Replacing nan values *********
        # Inputs get 0 and the target gets -0.5 where there is no data; the -0.5
        # is used below as the marker of the land cells.
        input_data_nans = np.isnan(input_data)
        input_data = np.nan_to_num(input_data, nan=0)
        y_data = np.nan_to_num(y_data, nan=-0.5)

        X = np.expand_dims(input_data, axis=0)
        Y = np.expand_dims(y_data, axis=0)

        # Make the prediction of the network
        start = time.time()
        output_nn_original = model.predict(X, verbose=1)
        elapsed = time.time() - start

        # Make nan all values inside the land
        land_indexes = Y == -0.5
        output_nn_original[land_indexes] = np.nan

        # ==== Denormalizing all inputs and outputs
        denorm_cnn_output = denormalizeData(output_nn_original, output_fields,
                                            PreprocParams.type_inc, norm_type)
        denorm_y = denormalizeData(Y, output_fields, PreprocParams.type_inc,
                                   norm_type)
        input_types = ([PreprocParams.type_model for _ in input_fields_model] +
                       [PreprocParams.type_obs for _ in input_fields_obs] +
                       [PreprocParams.type_std for _ in input_fields_std])
        denorm_input = denormalizeData(
            input_data, field_names_model + field_names_obs + field_names_std,
            input_types, norm_type)

        # Recover the original land areas, they are lost after denormalization
        denorm_y[land_indexes] = np.nan

        # Remove the 'extra dimension'
        denorm_cnn_output = np.squeeze(denorm_cnn_output)
        denorm_y = np.squeeze(denorm_y)
        whole_cnn = denorm_cnn_output
        whole_y = denorm_y

        # With a single output field the arrays are 2D; add a trailing axis so
        # they can be concatenated with the inputs for plotting
        if len(denorm_cnn_output.shape) == 2:
            denorm_cnn_output = np.expand_dims(denorm_cnn_output, axis=2)
            denorm_y = np.expand_dims(denorm_y, axis=2)

        # Adding back mask to all the input variables
        denorm_input[input_data_nans] = np.nan

        # This should only be for SSH to adjust the units. New arrays are created
        # on purpose: an in-place division would also rescale denorm_cnn_output
        # (same buffer) but not denorm_y in the first figure.
        if output_fields[0] == 'srfhgt':
            whole_cnn = whole_cnn / 9.81
            whole_y = np.array(whole_y) / 9.81

        # Error in the same units as the two fields plotted next to it
        error = whole_y - whole_cnn
        # nanmean only averages the cells with data; counting the non-zero cells
        # would also count the NaN (land) cells as valid.
        rmse_cnn = np.sqrt(np.nanmean(error**2))

        processed_files.append(c_file)
        all_whole_rmse.append(rmse_cnn)
        # A single prediction is made per file, so mean and sum are the same
        all_whole_mean_times.append(elapsed)
        all_whole_sum_times.append(elapsed)

        # ================== Displays all inputs and outputs denormalized ===================
        viz_obj = EOAImageVisualizer(output_folder=output_imgs_folder,
                                     disp_images=False)
        viz_obj.plot_2d_data_np_raw(
            np.concatenate(
                (denorm_input.swapaxes(0, 2), denorm_y.swapaxes(0, 2),
                 denorm_cnn_output.swapaxes(0, 2))),
            var_names=[F"in_model_{x}" for x in field_names_model] +
            [F"in_obs_{x}" for x in field_names_obs] +
            [F"in_var_{x}" for x in field_names_std] +
            [F"out_inc_{x}" for x in output_fields] +
            [F"cnn_{x}" for x in output_fields],
            file_name=F"Global_Input_and_CNN_{c_file}",
            rot_90=True,
            cmap=cmap_model + cmap_obs + cmap_std + cmap_out + cmap_out,
            cols_per_row=len(field_names_model),
            title=
            F"Input data: {field_names_model} and obs {field_names_obs}, increment {output_fields}, cnn {output_fields}"
        )

        # ================== Displays CNN and TSIS with RMSE ================
        minmax = getMinMaxPlot(output_fields)[0]
        # Same color range for CNN and TSIS, fixed [-1, 1] for the error panel
        viz_obj = EOAImageVisualizer(output_folder=output_imgs_folder,
                                     disp_images=False,
                                     mincbar=[minmax[0], minmax[0], -1],
                                     maxcbar=[minmax[1], minmax[1], 1])
        error_cmap = cmocean.cm.diff
        viz_obj.output_folder = join(output_imgs_folder,
                                     'WholeOutput_CNN_TSIS')
        viz_obj.plot_2d_data_np_raw(
            [
                np.flip(whole_cnn, axis=0),
                np.flip(whole_y, axis=0),
                np.flip(error, axis=0)
            ],
            var_names=["CNN increment SSH"] * len(output_fields) +
            ["TSIS increment SSH"] * len(output_fields) +
            [F'TSIS - CNN \n (Mean RMSE {rmse_cnn:0.4f} m)'],
            file_name=F"Global_WholeOutput_CNN_TSIS_{c_file}",
            rot_90=False,
            cmap=cmap_out + cmap_out + [error_cmap],
            cols_per_row=3,
            title=F"SSH RMSE: {rmse_cnn:0.5f} m.")

    print("DONE ALL FILES!!!!!!!!!!!!!")

    dic_summary = {
        "File": processed_files,
        "rmse": all_whole_rmse,
        "times mean": all_whole_mean_times,
        "times sum": all_whole_sum_times,
    }
    df = pd.DataFrame.from_dict(dic_summary)
    df.to_csv(join(output_imgs_folder, "Global_RMSE_and_times.csv"))
Exemplo n.º 12
0
import numpy as np
from datetime import date, datetime, timedelta
from inout.io_hycom import read_hycom_output
from inout.io_netcdf import read_netcdf
from os.path import join, exists
from preproc.UtilsDates import get_month_and_day_of_month_from_day_of_year, get_day_of_year_from_month_and_day

# This code is just for debugging purposes (plot intermediate steps)
from img_viz.eoa_viz import EOAImageVisualizer
from img_viz.constants import PlotMode

# Module-level visualizer; images are written to a fixed folder and not displayed.
# It exists only to plot intermediate steps while debugging.
img_viz = EOAImageVisualizer(
    output_folder='/data/HYCOM/DA_HYCOM_TSIS/images/inputNN',
    disp_images=False)

# Maximum / minimum value for each model field name.
# NOTE(review): presumably the bounds used to normalize the fields; the code
# that consumes them is not visible here -- confirm.
MAX_DA = {'temp': 40, 'srfhgt': 20, 'salin': 70, 'u-vel.': 4, 'v-vel.': 4}
MIN_DA = {'temp': 0, 'srfhgt': -20, 'salin': 0, 'u-vel.': -4, 'v-vel.': -4}

# Maximum / minimum value for each observation field name (same caveat as above).
MAX_OBS = {'sst': 40, 'ssh': 0.9, 'sss': 40}
MIN_OBS = {'sst': 0, 'ssh': -0.9, 'sss': 15}


def data_gen_hycomtsis(paths,
                       file_names,
                       obs_path,
                       field_names,
                       obs_field_names,
                       output_field,
                       days_separation=1,
                       z_layers=[0]):
    """
Exemplo n.º 13
0
def _first_match_index(pattern, file_names):
    """Returns the index of the first name matching the regex (IndexError if none does)."""
    return [
        i for i, c_name in enumerate(file_names)
        if re.search(pattern, c_name) is not None
    ][0]


def preproc_data(proc_id):
    """
    This function preprocess the desired data. It does the following:
        1) Looks for dates where there is 'increment', model, and observations data.
        2) Saves the files on the same folder with only the 'desired' fields in netcdf format
           (increment_<year>_<day_of_year>.nc, model_<year>_<day_of_year>.nc and
           obs_<year>_<day_of_year>.nc). The observations file also receives two
           gridded fields, 'sst_p' and 'sss_p', filled from the profiles.
    Days without an increment file are skipped completely; problems with the model
    or the observations file are only reported.
    :param proc_id: id of this process. Only the days whose (zero based) position
        in the month satisfies position % NUM_PROC == proc_id are processed.
    :return: None
    """
    print("Preprocessing data....")
    config = get_preproc_config()
    input_folder_increment = config[PreprocParams.input_folder_tsis]
    input_folder_model = config[PreprocParams.input_folder_hycom]
    input_folder_obs = config[PreprocParams.input_folder_obs]
    output_folder = config[PreprocParams.output_folder]
    YEARS = config[PreprocParams.YEARS]
    MONTHS = config[PreprocParams.MONTHS]
    fields = config[PreprocParams.fields_names]
    obs_fields = config[PreprocParams.fields_names_obs]
    layers = config[PreprocParams.layers_to_plot]
    # NOTE(review): not used below; kept in case the constructor has side
    # effects on the output folder -- confirm and remove if it does not.
    img_viz = EOAImageVisualizer(output_folder=output_folder,
                                 disp_images=False)

    # These are the data assimilated files
    for c_year in YEARS:
        for c_month in MONTHS:
            print(
                F"=============== Year: {c_year}  Month: {c_month} ==========="
            )
            _, days_of_year = get_days_from_month(c_month)
            # Reads all the files for this month
            da_files, da_paths = get_hycom_file_name(input_folder_increment,
                                                     c_year, c_month)
            hycom_files, hycom_paths = get_hycom_file_name(
                input_folder_model, c_year, c_month)
            obs_files, obs_paths = get_obs_file_names(input_folder_obs, c_year,
                                                      c_month)

            # This for is fixed to be able to run in parallel
            for c_day_of_month, c_day_of_year in enumerate(days_of_year):
                if (c_day_of_month % NUM_PROC) == proc_id:
                    # c_day_of_month is a zero based position, calendar days start at 1
                    day_number = c_day_of_month + 1
                    date_str = F"{c_year}-{c_month}-{day_number}"
                    # Raw strings so that \S reaches the regex engine untouched
                    re_increment = rF'incupd.{c_year}_{c_day_of_year:03d}\S*.a'
                    re_model = rF'archv.{c_year}_{c_day_of_year:03d}\S*.a'
                    re_obs = rF'tsis_obs_ias_{c_year}{c_month:02d}{day_number:02d}\S*.nc'

                    try:
                        da_file_idx = _first_match_index(re_increment, da_files)
                        print(
                            F" =============== Working with: {da_files[da_file_idx]} Proc_id={proc_id} ============= "
                        )
                        da_np_fields = read_hycom_fields(da_paths[da_file_idx],
                                                         fields,
                                                         layers=layers)

                        hycom_file_idx = _first_match_index(re_model, hycom_files)
                        hycom_np_fields = read_hycom_fields(
                            hycom_paths[hycom_file_idx], fields, layers=layers)

                        # --------- Preprocessing Increment (TSIS) -------------
                        proc_increment_data(
                            da_np_fields, hycom_np_fields, fields,
                            join(output_folder,
                                 F"increment_{c_year}_{c_day_of_year:03d}.nc"))
                    except Exception as e:
                        print(
                            F"Warning: Increment file for date {date_str} ({re_increment}) doesn't exist: {e}"
                        )
                        # Only when the increment file is not found we go to the next day.
                        continue

                    try:
                        # hycom_file_idx was already found above (otherwise this day was skipped)
                        print(
                            F" --------------- Working with: {hycom_files[hycom_file_idx]} ------------- "
                        )
                        # The fields are read again as in the original flow.
                        # NOTE(review): only needed if proc_increment_data modifies
                        # its inputs in place -- confirm.
                        hycom_np_fields = read_hycom_fields(
                            hycom_paths[hycom_file_idx], fields, layers=layers)
                        # --------- Preprocessing HYCOM data -------------
                        proc_model_data(
                            hycom_np_fields, fields,
                            join(output_folder,
                                 F"model_{c_year}_{c_day_of_year:03d}.nc"))
                    except Exception as e:
                        print(
                            F"Warning: HYCOM file for date {date_str} ({re_model}) doesn't exist: {e}"
                        )

                    try:
                        obs_file_idx = _first_match_index(re_obs, obs_files)
                        # --------- Preprocessing observed data -------------
                        print(
                            F" --------------- Working with: {obs_files[obs_file_idx]} ------------- "
                        )
                        obs_ds = xr.load_dataset(obs_paths[obs_file_idx])
                        # Keeps only the desired observation fields
                        preproc_obs_ds = None
                        for c_obs_field in obs_fields:
                            c_field_ds = obs_ds[c_obs_field].to_dataset()
                            if preproc_obs_ds is None:
                                preproc_obs_ds = c_field_ds
                            else:
                                preproc_obs_ds = preproc_obs_ds.merge(c_field_ds)

                        # --------------- Here we add the fields from the profiles as gridded data -----------
                        temp_group = 0
                        saln_group = 1
                        # The new grids have the shape of the last observation field
                        sst_p = np.zeros(
                            preproc_obs_ds[obs_fields[-1]].values.shape)
                        sss_p = np.zeros(sst_p.shape)
                        profiles = obs_ds.val
                        tot_profiles = profiles.shape[0]
                        obs_groups = obs_ds.ob_grp_present

                        # NOTE(review): plotPunctualDataFromObs in this file reads grdj
                        # as the lon index and grdi as the lat index -- confirm which
                        # of the two conventions is the right one.
                        lons_i = obs_ds.grdi.values[:, 0, 0]
                        lats_i = obs_ds.grdj.values[:, 0, 0]
                        for i_group, c_type in enumerate(obs_groups):
                            if c_type == saln_group:
                                c_grid = sss_p
                            elif c_type == temp_group:
                                c_grid = sst_p
                            else:
                                continue
                            # The last level of each profile is written at its grid cell
                            for c_profile_i in range(tot_profiles):
                                c_grid[int(lats_i[c_profile_i]),
                                       int(lons_i[c_profile_i])] = profiles[
                                           c_profile_i, -1, i_group]
                        print(F"Max value: {np.amax(sst_p)}")
                        print(F"Max value s: {np.amax(sss_p)}")
                        preproc_obs_ds['sst_p'] = xr.DataArray(
                            sst_p, dims=['yc', 'xc'])
                        preproc_obs_ds['sss_p'] = xr.DataArray(
                            sss_p, dims=['yc', 'xc'])
                        preproc_obs_ds.to_netcdf(
                            join(output_folder,
                                 F"obs_{c_year}_{c_day_of_year:03d}.nc"))
                    except Exception as e:
                        print(
                            F"Warning: OBS file for date {date_str} doesn't exist: {e}"
                        )