def plotPunctualDataFromObs(ds, title,
                            output_folder="/home/olmozavala/Desktop/DELETE",
                            extent=(-98, -70.40002, 18.09165, 31.9267)):
    """Plot the locations of the punctual observations stored in ``ds``.

    Only the locations are drawn (a scatter map); no profile plot is produced
    by this function.

    :param ds: dataset object exposing ``ob_grp_present``, ``lon`` and ``lat``
        attributes. ``lon``/``lat`` are indexed as ``values[:, 0, 0]``, so they
        must have at least three dimensions.
    :param title: title forwarded to the scatter map.
    :param output_folder: folder handed to ``EOAImageVisualizer``. Defaults to
        the path that used to be hard-coded here.
    :param extent: four map limits forwarded to ``scatter_coords_map``.
        Defaults to the values that used to be hard-coded here.
        NOTE(review): presumably [min_lon, max_lon, min_lat, max_lat] - confirm
        against ``scatter_coords_map``.
    """
    # Kept from the original: prints which observation groups are present
    obs_groups_present = ds.ob_grp_present
    print(obs_groups_present.values)

    # One coordinate per observation: first element of the two trailing axes
    lons = ds.lon.values[:, 0, 0]
    lats = ds.lat.values[:, 0, 0]

    img_viz = EOAImageVisualizer(output_folder=output_folder,
                                 disp_images=True)
    # The visualizer originally received a list, so keep passing one
    img_viz.scatter_coords_map(lons, lats, list(extent), title=title)
def main():
    """Run a trained 1D model over a CSV file, plot samples and compute metrics.

    Steps, all driven by the dictionary returned by
    ``get_makeprediction_config()``:

    1. Read the CSV, keep the meteorological, requested-station and date
       columns, and append the date hot vector.
    2. Keep only rows between 9:00 and 20:00 and normalize them with
       ``normalizeAndFilterData`` (parameters are read from file).
    3. Build ``X`` (all columns except the stations) and ``Y`` (stations at the
       forecasted time), build the model and load its weights.
    4. Predict, optionally plot random windows of ground truth vs prediction,
       de-normalize and call ``compute_metrics``.
    """
    config = get_makeprediction_config()

    # *********** Reads the parameters ***********
    input_file = config[ClassificationParams.input_file]
    splits_file = config[ClassificationParams.split_file]
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    output_file_name = config[ClassificationParams.output_file_name]
    run_name = config[TrainingParams.config_name]
    model_weights_file = config[ClassificationParams.model_weights_file]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]
    disp_images = config[ClassificationParams.show_imgs]
    generate_images = config[ClassificationParams.generate_images]
    metrics_user = config[ClassificationParams.metrics]
    filter_stations = config[LocalTrainingParams.stations]

    assert len(model_weights_file) > 0
    assert len(input_file) > 0

    print(F"Working with: {model_weights_file} \n and \n {input_file}")

    data = pd.read_csv(input_file, index_col=0, parse_dates=True)

    # Columns are classified by substring: date columns contain
    # 'week'/'hour'/'year'; of the rest, those without an 'h' are stations and
    # those with an 'h' are meteorological.
    all_data_cols = data.columns
    date_columns = [x for x in all_data_cols
                    if 'week' in x or 'hour' in x or 'year' in x]
    stations_columns = [x for x in all_data_cols
                        if 'h' not in x and x not in date_columns]
    meteo_columns = [x for x in all_data_cols
                     if 'h' in x and x not in date_columns
                     and x not in stations_columns]
    desired_columns = meteo_columns + filter_stations + date_columns

    print("Appending date hot vector...")
    date_hv = generate_date_hot_vector(data.index)
    data = pd.concat([data[desired_columns], date_hv], axis=1)
    print("Done!")

    # Only the hours from 9:00 to 20:00 are used
    filtered_data = data.between_time("9:00", "20:00")
    datetimes_str = filtered_data.index.values

    print(F'Normalizing and filtering data....')
    parameters_folder = join(dirname(output_folder), 'Training', 'Parameters')
    # Note: stations_columns and meteo_columns are replaced by the ones
    # returned from the normalization step.
    data_norm_df_final, accepted_times_idx, y_times_idx, stations_columns, meteo_columns = \
        normalizeAndFilterData(filtered_data, datetimes_str, forecasted_hours,
                               output_folder=parameters_folder,
                               run_name=run_name, read_from_file=True)

    X_df = data_norm_df_final.loc[datetimes_str[accepted_times_idx]]
    Y_df = data_norm_df_final.loc[datetimes_str[y_times_idx]][stations_columns]

    # The stations are the targets, so they are removed from the inputs
    X_df = X_df.drop(columns=stations_columns)
    X = X_df.values
    Y = Y_df.values

    config[ModelParams.INPUT_SIZE] = len(X_df.columns)
    print(F'X shape: {X.shape} Y shape: {Y.shape}')

    # *********** Chooses the proper model ***********
    print('Reading model ....')
    config[ModelParams.NUMBER_OF_OUTPUT_CLASSES] = Y.shape[1]
    model = select_1d_model(config)

    # *********** Reads the splits information ***********
    print('Reading splits info....')
    if splits_file != '':
        split_info = pd.read_csv(splits_file, dtype=np.int16)
    else:
        # Without a splits file every example is treated as a training example
        split_info = pd.DataFrame({
            'train_ids': [],
            'validation_ids': [],
            'test_id': []
        })
        split_info['train_ids'] = range(Y.shape[0])

    # *********** Reads the weights ***********
    print('Reading weights ....')
    model.load_weights(model_weights_file)

    # ************ Makes NN Prediction ********
    print('Making prediction ....')
    output_nn_all = model.predict(X, verbose=1)

    # ************ Plots some random windows ********
    number_of_examples = 10
    if generate_images:
        img_viz = EOAImageVisualizer(output_folder=output_imgs_folder,
                                     disp_images=disp_images)
        # The folders do not change between examples: create them once
        create_folder(output_folder)
        create_folder(output_imgs_folder)

        Y[Y == -1] = np.nan  # So that we do not show the -1

        hours_to_plot = 24 * 3  # How many points to plot
        # Highest (exclusive) start index of a random window
        max_start = X.shape[0] - hours_to_plot - forecasted_hours
        for c_example in range(number_of_examples):
            # np.random.randint raises when high <= low; with too little data
            # plot from the beginning instead (slices are simply shorter).
            start_idx = np.random.randint(0, max_start) if max_start > 0 else 0
            end_idx = start_idx + hours_to_plot
            for idx_station, cur_station in enumerate(filter_stations):
                img_viz.plot_1d_data_np(
                    datetimes_str[y_times_idx][start_idx:end_idx],
                    [
                        Y[start_idx:end_idx, idx_station],
                        output_nn_all[start_idx:end_idx, idx_station]
                    ],
                    title=F'{cur_station}',
                    labels=['GT', 'NN'],
                    file_name_prefix=F'{cur_station}_{c_example}')

    # ************ Recovering original units ********
    print('Recovering original units....')
    nn_df = pd.DataFrame(output_nn_all, columns=stations_columns,
                         index=filtered_data.index[y_times_idx])
    nn_original_units = deNormalize(nn_df)
    Y_original = deNormalize(Y_df)

    # ************ Computing metrics ********
    print('Computing metrics and saving predictions....')
    compute_metrics(Y_original, nn_original_units, metrics_user, split_info,
                    output_file_name, stations_columns)
import numpy as np
import pandas as pd
import seaborn as sns
from img_viz.eoa_viz import EOAImageVisualizer
import matplotlib.pyplot as plt
from os.path import join
import xarray as xr

# Module-level visualizer shared by the examples below
viz_obj = EOAImageVisualizer()


def data_summary(ds):
    """Print the head of ``ds`` and the ``describe()`` table of its dataframe form.

    :param ds: object providing ``head()`` and ``to_dataframe()``.
    """
    print("------------- Data summary ---------------------")
    print(ds.head())
    print(ds.to_dataframe().describe())


def access_data(ds):
    """Show two ways of selecting data from ``ds``: by position and by label.

    ``ds`` must contain a ``"time"`` entry and a two-dimensional ``"tmin"``
    entry whose second dimension has an ``"IA"`` label.
    See http://xarray.pydata.org/en/stable/indexing.html
    """
    # The x axis is just the position of each time step
    time_axis = range(len(ds["time"]))

    # --- access by index (single var, all times) ----
    tmin_by_index = ds["tmin"][:, 0]
    viz_obj.plot_1d_data_np(time_axis, [tmin_by_index],
                            title="Single var and dim")

    # --- access by name (single var, all times) ----
    # The selection is evaluated but not used afterwards in this function
    tmin_by_name = ds["tmin"].loc[:, "IA"]
def test_model(config):
    """Run a trained 2D network tile by tile over each day and plot the results.

    For every ``model_*.nc`` file in the input folder (currently restricted to
    day of year 5) the matching model, increment and observation files are
    read. The domain is swept in ``rows`` x ``cols`` tiles; each tile is
    normalized, predicted, de-normalized, compared against the increment (RMSE
    on the cells where the target is not NaN) and plotted. The tiles are also
    assembled into a whole-domain prediction that is occasionally plotted.

    :param config: dictionary indexed by ``PredictionParams``,
        ``ProjTrainingParams`` and ``TrainingParams`` keys.
    :raises ValueError: if the configured network type is not supported.
    """
    input_folder = config[PredictionParams.input_folder]
    output_folder = config[PredictionParams.output_folder]
    output_fields = config[ProjTrainingParams.output_fields]
    model_weights_file = config[PredictionParams.model_weights_file]
    output_imgs_folder = config[PredictionParams.output_imgs_folder]
    field_names_model = config[ProjTrainingParams.fields_names]
    field_names_obs = config[ProjTrainingParams.fields_names_obs]
    rows = config[ProjTrainingParams.rows]
    cols = config[ProjTrainingParams.cols]
    run_name = config[TrainingParams.config_name]
    norm_type = config[ProjTrainingParams.norm_type]

    output_imgs_folder = join(output_imgs_folder, run_name)
    create_folder(output_imgs_folder)

    # *********** Chooses the proper model ***********
    print('Reading model ....')
    net_type = config[ProjTrainingParams.network_type]
    if net_type == NetworkTypes.UNET or net_type == NetworkTypes.UNET_MultiStream:
        model = select_2d_model(config, last_activation=None)
    elif net_type == NetworkTypes.SimpleCNN_2:
        model = simpleCNN(config, nn_type="2d", hid_lay=2, out_lay=2)
    elif net_type == NetworkTypes.SimpleCNN_4:
        model = simpleCNN(config, nn_type="2d", hid_lay=4, out_lay=2)
    elif net_type == NetworkTypes.SimpleCNN_8:
        model = simpleCNN(config, nn_type="2d", hid_lay=8, out_lay=2)
    elif net_type == NetworkTypes.SimpleCNN_16:
        model = simpleCNN(config, nn_type="2d", hid_lay=16, out_lay=2)
    else:
        # Previously this fell through and failed later with a NameError
        raise ValueError(F"Unsupported network type: {net_type}")

    plot_model(model, to_file=join(output_folder, F'running.png'),
               show_shapes=True)

    # *********** Reads the weights ***********
    print('Reading weights ....')
    model.load_weights(model_weights_file)

    # *********** Read files to predict ***********
    all_files = sorted(os.listdir(input_folder))
    model_files = np.array([x for x in all_files if x.startswith('model')])

    z_layers = [0]
    var_file = join(input_folder, "cov_mat", "tops_ias_std.nc")
    field_names_std = config[ProjTrainingParams.fields_names_var]
    if len(field_names_std) > 0:
        input_fields_std = read_netcdf(var_file, field_names_std, z_layers)
    else:
        input_fields_std = []

    cmap_out = chooseCMAP(output_fields)
    cmap_model = chooseCMAP(field_names_model)
    cmap_obs = chooseCMAP(field_names_obs)
    cmap_std = chooseCMAP(field_names_std)

    # Size of the whole domain
    tot_rows = 891
    tot_cols = 1401

    all_whole_mean_times = []
    all_whole_sum_times = []
    all_whole_rmse = []

    for c_file in model_files:
        # File names look like model_<year>_<day of year>.nc
        year = int(c_file.split('_')[1])
        day_of_year = int(c_file.split('_')[2].split('.')[0])

        # Only day 5 of each year is processed
        if day_of_year != 5:
            continue

        model_file = join(input_folder, F'model_{year}_{day_of_year:03d}.nc')
        inc_file = join(input_folder, F'increment_{year}_{day_of_year:03d}.nc')
        obs_file = join(input_folder, F'obs_{year}_{day_of_year:03d}.nc')

        # *********************** Reading files **************************
        input_fields_model = read_netcdf(model_file, field_names_model, z_layers)
        input_fields_obs = read_netcdf(obs_file, field_names_obs, z_layers)
        output_field_increment = read_netcdf(inc_file, output_fields, z_layers)

        # ******************* Normalizing and Cropping Data *******************
        # Tiles that are skipped keep the initial value of 0 in both arrays
        whole_cnn = np.zeros((tot_rows, tot_cols))
        whole_y = np.zeros((tot_rows, tot_cols))

        this_file_times = []

        start_row = 0
        donerow = False
        while not donerow:
            donecol = False
            start_col = 0
            while not donecol:
                # Generate the proper inputs for the NN
                try:
                    perc_ocean = .05
                    input_data, y_data = generateXandY(
                        input_fields_model, input_fields_obs,
                        input_fields_std, output_field_increment,
                        field_names_model, field_names_obs, field_names_std,
                        output_fields, start_row, start_col, rows, cols,
                        norm_type=norm_type, perc_ocean=perc_ocean)
                except Exception:
                    # Any failure here is reported as a land tile and skipped
                    print(F"Land for {c_file} row:{start_row} col:{start_col}")
                    start_col, donecol = verifyBoundaries(start_col, cols,
                                                          tot_cols)
                    continue

                # ******************* Replacing nan values *********
                # NaNs become 0 in the input and -0.5 in the target; -0.5 is
                # then used below as the marker of the cells to blank out.
                input_data_nans = np.isnan(input_data)
                input_data = np.nan_to_num(input_data, nan=0)
                y_data = np.nan_to_num(y_data, nan=-0.5)

                X = np.expand_dims(input_data, axis=0)
                Y = np.expand_dims(y_data, axis=0)

                # Make the prediction of the network
                start = time.time()
                output_nn_original = model.predict(X, verbose=1)
                toc = time.time() - start
                this_file_times.append(toc)

                # Make nan all values inside the land
                land_indexes = Y == -0.5
                output_nn_original[land_indexes] = np.nan

                # ==== Denormalizing all inputs and outputs
                denorm_cnn_output = denormalizeData(
                    output_nn_original, output_fields,
                    PreprocParams.type_inc, norm_type)
                denorm_y = denormalizeData(Y, output_fields,
                                           PreprocParams.type_inc, norm_type)
                input_types = (
                    [PreprocParams.type_model for _ in input_fields_model] +
                    [PreprocParams.type_obs for _ in input_fields_obs] +
                    [PreprocParams.type_std for _ in input_fields_std])
                denorm_input = denormalizeData(
                    input_data,
                    field_names_model + field_names_obs + field_names_std,
                    input_types, norm_type)

                # Recover the original land areas, they are lost after denormalization
                denorm_input[input_data_nans] = np.nan
                denorm_y[land_indexes] = np.nan

                # Remove the 'extra dimension'
                denorm_cnn_output = np.squeeze(denorm_cnn_output)
                denorm_y = np.squeeze(denorm_y)

                # Add the tile to the 'whole prediction'
                whole_cnn[start_row:start_row + rows,
                          start_col:start_col + cols] = denorm_cnn_output
                whole_y[start_row:start_row + rows,
                        start_col:start_col + cols] = denorm_y

                # Every tile is plotted
                if len(denorm_cnn_output.shape) == 2:
                    # With a single output field add the field axis back
                    denorm_cnn_output = np.expand_dims(denorm_cnn_output, axis=2)
                    denorm_y = np.expand_dims(denorm_y, axis=2)

                # Compute RMSE per output field, only where the target is not NaN
                rmse_cnn = np.zeros(len(output_fields))
                for i in range(len(output_fields)):
                    ocean_indexes = np.logical_not(np.isnan(denorm_y[:, :, i]))
                    rmse_cnn[i] = np.sqrt(
                        mean_squared_error(
                            denorm_cnn_output[:, :, i][ocean_indexes],
                            denorm_y[:, :, i][ocean_indexes]))

                viz_obj = EOAImageVisualizer(output_folder=output_imgs_folder,
                                             disp_images=False)

                # ========== DISPLAYS ALL INPUTS AND OUTPUTS DENORMALIZED ==========
                viz_obj.plot_2d_data_np_raw(
                    np.concatenate((denorm_input.swapaxes(0, 2),
                                    denorm_y.swapaxes(0, 2),
                                    denorm_cnn_output.swapaxes(0, 2))),
                    var_names=[F"in_model_{x}" for x in field_names_model] +
                              [F"in_obs_{x}" for x in field_names_obs] +
                              [F"in_var_{x}" for x in field_names_std] +
                              [F"out_inc_{x}" for x in output_fields] +
                              [F"cnn_{x}" for x in output_fields],
                    file_name=F"Input_and_CNN_{c_file}_{start_row:03d}_{start_col:03d}",
                    cmap=cmap_model + cmap_obs + cmap_std + cmap_out + cmap_out,
                    rot_90=True,
                    cols_per_row=len(field_names_model),
                    title=F"Input data: {field_names_model} and obs {field_names_obs}, increment {output_fields}, cnn {output_fields}"
                )

                # ===== Making the same color bar for desired output and the NN =====
                n_out = denorm_cnn_output.shape[-1]
                mincbar = [np.nanmin(denorm_y[:, :, x]) for x in range(n_out)]
                maxcbar = [np.nanmax(denorm_y[:, :, x]) for x in range(n_out)]
                error = (denorm_y - denorm_cnn_output).swapaxes(0, 2)
                mincbarerror = [np.nanmin(error[i, :, :])
                                for i in range(len(output_fields))]
                maxcbarerror = [np.nanmax(error[i, :, :])
                                for i in range(len(output_fields))]
                viz_obj = EOAImageVisualizer(
                    output_folder=output_imgs_folder,
                    disp_images=False,
                    mincbar=mincbar + mincbar + mincbarerror,
                    maxcbar=maxcbar + maxcbar + maxcbarerror)

                # ================== Displays CNN and TSIS with RMSE ================
                viz_obj.output_folder = join(output_imgs_folder, 'JoinedErrrorCNN')
                error_cmap = cmocean.cm.diff
                viz_obj.plot_2d_data_np_raw(
                    np.concatenate((denorm_cnn_output.swapaxes(0, 2),
                                    denorm_y.swapaxes(0, 2),
                                    error), axis=0),
                    var_names=[F"CNN INC {x}" for x in output_fields] +
                              [F"TSIS INC {x}" for x in output_fields] +
                              [F'RMSE {c_rmse_cnn:0.4f}' for c_rmse_cnn in rmse_cnn],
                    file_name=F"AllError_{c_file}_{start_row:03d}_{start_col:03d}",
                    rot_90=True,
                    cmap=cmap_out + cmap_out + [error_cmap],
                    cols_per_row=len(output_fields),
                    title=F"{output_fields} RMSE: {np.mean(rmse_cnn):0.5f}"
                )

                # Move to the next column tile
                start_col, donecol = verifyBoundaries(start_col, cols, tot_cols)
            # Move to the next row of tiles
            start_row, donerow = verifyBoundaries(start_row, rows, tot_rows)

        # ======= Plots whole output with RMSE
        whole_min = np.nanmin(whole_y) / 2
        whole_max = np.nanmax(whole_y) / 2
        error = whole_y - whole_cnn
        whole_min_error = np.nanmin(error) / 2
        whole_max_error = np.nanmax(error) / 2

        # Only cells with a finite error and a non-zero prediction are counted.
        # np.count_nonzero on the raw prediction also counted the NaN (land)
        # cells, which nansum ignores, so the RMSE was underestimated.
        valid_cells = np.count_nonzero(np.isfinite(error) & (whole_cnn != 0))
        if valid_cells > 0:
            whole_rmse = np.sqrt(np.nansum(error ** 2) / valid_cells)
        else:
            whole_rmse = np.nan
        all_whole_rmse.append(whole_rmse)
        all_whole_mean_times.append(np.mean(np.array(this_file_times)))
        all_whole_sum_times.append(np.sum(np.array(this_file_times)))

        # Plot 10% of the times
        if np.random.random() > .9 or day_of_year == 353:
            # One limit per panel (CNN, TSIS, error). The scalars used to be
            # added together, producing a single meaningless number.
            viz_obj = EOAImageVisualizer(
                output_folder=output_imgs_folder,
                disp_images=False,
                mincbar=[whole_min, whole_min, whole_min_error],
                maxcbar=[whole_max, whole_max, whole_max_error])

            # ================== Displays CNN and TSIS with RMSE ================
            viz_obj.output_folder = join(output_imgs_folder,
                                         'WholeOutput_CNN_TSIS')
            viz_obj.plot_2d_data_np_raw(
                [
                    np.flip(whole_cnn, axis=0),
                    np.flip(whole_y, axis=0),
                    np.flip(error, axis=0)
                ],
                var_names=[F"CNN INC {x}" for x in output_fields] +
                          [F"TSIS INC {x}" for x in output_fields] +
                          [F'RMSE {whole_rmse:0.4f}'],
                file_name=F"WholeOutput_CNN_TSIS_{c_file}",
                rot_90=False,
                cols_per_row=3,
                cmap=cmocean.cm.algae,
                title=F"{output_fields} RMSE: {np.mean(whole_rmse):0.5f}")
def trainModel(config, cur_pollutant, cur_station):
    """Train a 1D model for one pollutant at one station.

    Kept as a separate function so that tf 'cleans' the memory between runs.

    The CSV ``<cur_pollutant>_<cur_station>.csv`` is read and min-max
    normalized. Each row is paired with the row ``forecasted_hours`` later
    (rows without such a partner are dropped). The split, the normalization
    parameters and a sample plot are saved, and then the model is fitted.

    :param config: dictionary indexed by ``TrainingParams``,
        ``LocalTrainingParams`` and ``ModelParams`` keys.
    :param cur_pollutant: pollutant name; also the name of the target column.
    :param cur_station: station name, used to locate the CSV and name outputs.
    """
    input_folder = config[TrainingParams.input_folder]
    output_folder = config[TrainingParams.output_folder]

    val_perc = config[TrainingParams.validation_percentage]
    test_perc = config[TrainingParams.test_percentage]
    eval_metrics = config[TrainingParams.evaluation_metrics]
    loss_func = config[TrainingParams.loss_function]
    batch_size = config[TrainingParams.batch_size]
    epochs = config[TrainingParams.epochs]
    model_name_user = config[TrainingParams.config_name]
    optimizer = config[TrainingParams.optimizer]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]

    split_info_folder = join(output_folder, 'Splits')
    parameters_folder = join(output_folder, 'Parameters')
    weights_folder = join(output_folder, 'models')
    logs_folder = join(output_folder, 'logs')
    imgs_folder = join(output_folder, 'imgs')
    create_folder(split_info_folder)
    create_folder(parameters_folder)
    create_folder(weights_folder)
    create_folder(logs_folder)

    viz_obj = EOAImageVisualizer(output_folder=imgs_folder, disp_images=False)

    print(
        F"============ Reading data for: {cur_pollutant} -- {cur_station} =========================="
    )
    db_file_name = join(input_folder, constants.merge_output_folder.value,
                        F"{cur_pollutant}_{cur_station}.csv")
    data = pd.read_csv(db_file_name, index_col=0)

    config[ModelParams.INPUT_SIZE] = len(data.columns)
    print(F'Data shape: {data.shape} Data axes {data.axes}')
    print("Done!")

    # Predicting for the next value after 24hrs (only one)
    print("Normalizing data....")
    datetimes_str = data.index.values
    datetimes = [
        datetime.strptime(x, constants.datetime_format.value)
        for x in datetimes_str
    ]

    scaler = preprocessing.MinMaxScaler()
    scaler = scaler.fit(data)
    data_norm_np = scaler.transform(data)
    data_norm_df = DataFrame(data_norm_np, columns=data.columns,
                             index=data.index)
    print(F'Done!')

    # Filtering only dates where there is data "forecasted hours after" (24 hrs after)
    print(F"\tBuilding X and Y ....")
    # Position of the first occurrence of every datetime: one dictionary
    # lookup per row instead of scanning the whole array each time.
    first_idx_by_datetime = {}
    for i, c_datetime in enumerate(datetimes):
        first_idx_by_datetime.setdefault(c_datetime, i)

    accepted_times_idx = []
    y_times_idx = []
    for i, c_datetime in enumerate(datetimes):
        forecasted_datetime = c_datetime + timedelta(hours=forecasted_hours)
        y_idx = first_idx_by_datetime.get(forecasted_datetime)
        if y_idx is not None:
            accepted_times_idx.append(i)
            y_times_idx.append(y_idx)

    X_df = data_norm_df.loc[datetimes_str[accepted_times_idx]]
    Y_df = data_norm_df.loc[datetimes_str[y_times_idx]][cur_pollutant]
    X = X_df.values
    Y = Y_df.values

    print(F'X shape: {X.shape} Y shape: {Y.shape}')

    tot_examples = X.shape[0]
    rows_to_read = np.arange(tot_examples)

    # ================ Split definition =================
    [train_ids, val_ids, test_ids] = utilsNN.split_train_validation_and_test(
        tot_examples, val_percentage=val_perc, test_percentage=test_perc)

    print("Train examples (total:{}) :{}".format(len(train_ids),
                                                 rows_to_read[train_ids]))
    print("Validation examples (total:{}) :{}:".format(len(val_ids),
                                                       rows_to_read[val_ids]))
    print("Test examples (total:{}) :{}".format(len(test_ids),
                                                rows_to_read[test_ids]))

    print("Selecting and generating the model....")
    now = datetime.utcnow().strftime("%Y_%m_%d_%H_%M")
    model_name = F'{model_name_user}_{now}_{cur_pollutant}_{cur_station}'

    # ******************* Selecting the model **********************
    model = select_1d_model(config)
    plot_model(model, to_file=join(output_folder, F'{model_name}.png'),
               show_shapes=True)

    print("Saving split information...")
    file_name_splits = join(split_info_folder, F'{model_name}.csv')
    info_splits = DataFrame({F'Train({len(train_ids)})': train_ids})
    # The validation and test columns are zero padded to the training length.
    # Whole columns are assigned because the chained form
    # df[col][0:n] = ids is not guaranteed to modify the DataFrame.
    for split_name, split_ids in (('Validation', val_ids), ('Test', test_ids)):
        padded_ids = np.zeros(len(info_splits), dtype=int)
        padded_ids[:len(split_ids)] = split_ids
        info_splits[F'{split_name}({len(split_ids)})'] = padded_ids
    info_splits.to_csv(file_name_splits, index=None)

    print(F"Norm params: {scaler.get_params()}")
    file_name_normparams = join(parameters_folder, F'{model_name}.txt')
    utilsNN.save_norm_params(file_name_normparams, NormParams.min_max, scaler)

    print("Getting callbacks ...")
    [logger, save_callback, stop_callback] = utilsNN.get_all_callbacks(
        model_name=model_name,
        early_stopping_func=F'val_{eval_metrics[0].__name__}',
        weights_folder=weights_folder,
        logs_folder=logs_folder)

    print("Compiling model ...")
    model.compile(loss=loss_func, optimizer=optimizer, metrics=eval_metrics)

    print("Training ...")
    # This part should be somehow separated, it will change for every project
    x_train = X[train_ids, :]
    y_train = Y[train_ids]
    x_val = X[val_ids, :]
    y_val = Y[val_ids]

    # Plotting some intermediate results
    import matplotlib.pyplot as plt
    size = 24 * 60  # Two months of data
    # The window is taken from X_df/Y_df, so its start is bounded by their
    # length; with fewer rows than the window, start at 0 rather than failing.
    max_start = len(X_df) - size
    start = np.random.randint(0, max_start) if max_start > 0 else 0
    end = start + size
    # NOTE(review): unclear whether plot_1d_data_np draws on this figure - confirm
    plt.figure(figsize=[64, 8])
    x_plot = range(len(X_df.iloc[start:end].index.values))
    y_plot = X_df.iloc[start:end][cur_pollutant].values
    yy_plot = Y_df.iloc[start:end].values
    viz_obj.plot_1d_data_np(x_plot, [y_plot, yy_plot],
                            title=F"{cur_pollutant}_{cur_station}",
                            labels=['Current', 'Desired'],
                            wide_ratio=4,
                            file_name_prefix=F"{cur_pollutant}_{cur_station}")

    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_val, y_val),
              shuffle=True,
              callbacks=[logger, save_callback, stop_callback])
def img_generation_hycom(proc_id):
    """Plot the configured HYCOM fields for every configured year/month/day.

    The days are distributed among processes: a day is handled only when its
    0-based position within the month modulo ``NUM_PROC`` equals ``proc_id``.

    :param proc_id: index of the current process
    :return: None
    """
    config = get_preproc_config()
    input_folder_forecast = config[PreprocParams.input_folder_hycom]
    output_folder = config[PreprocParams.imgs_output_folder]
    YEARS = config[PreprocParams.YEARS]
    MONTHS = config[PreprocParams.MONTHS]
    fields = config[PreprocParams.fields_names]
    plot_modes = config[PreprocParams.plot_modes_per_field]
    layers = config[PreprocParams.layers_to_plot]

    img_viz = EOAImageVisualizer(output_folder=output_folder,
                                 disp_images=False)

    for c_year in YEARS:
        for c_month in MONTHS:
            try:
                _, days_of_year = get_days_from_month(c_month)
                hycom_files, hycom_paths = get_hycom_file_name(
                    input_folder_forecast, c_year, c_month)
            except Exception:
                print(F"Failed to find any file for date {c_year}-{c_month}")
                continue

            # This for is fixed to be able to run in parallel
            for c_day_of_month, c_day_of_year in enumerate(days_of_year):
                if (c_day_of_month % NUM_PROC) != proc_id:
                    continue

                # c_day_of_month is 0-based; printed and saved dates are 1-based
                day_number = c_day_of_month + 1

                # Regular expression of the current desired file. Raw string so
                # that \S reaches the regex engine as written.
                re_hycom = rF'archv.{c_year}_{c_day_of_year:03d}\S*.a'
                try:
                    # Index of the first file matching the expression
                    hycom_file_idx = [
                        i for i, c_file in enumerate(hycom_files)
                        if re.search(re_hycom, c_file) is not None
                    ][0]
                except IndexError as e:
                    print(
                        F"ERROR: The file for date {c_year} - {c_month} - {day_number} doesn't exist: {e}"
                    )
                    continue

                print(
                    F" =============== Working with: {hycom_files[hycom_file_idx]} ============= "
                )
                print(
                    F"Available fields: {read_field_names(hycom_paths[hycom_file_idx])}"
                )
                model_state_np_fields = read_hycom_fields(
                    hycom_paths[hycom_file_idx], fields, layers=layers)

                for idx_field, c_field_name in enumerate(fields):
                    model_state_np_c_field = model_state_np_fields[c_field_name]
                    title = F"{c_field_name} {c_year}_{c_month:02d}_{day_number:02d}"
                    # The file name uses the same 1-based day as the title
                    # (it used to be 0-based, so the first day was saved as 00)
                    img_viz.plot_3d_data_np(
                        [model_state_np_c_field],
                        var_names=[F'HYCOM'],
                        title=title,
                        file_name_prefix=F'HYCOM_{c_field_name}_{c_year}_{c_month:02d}_{day_number:02d}',
                        z_lavels_names=layers,
                        flip_data=True,
                        plot_mode=plot_modes[idx_field])
def compute_consecutive_days_difference():
    """Plot, per year, how much the HYCOM fields change between consecutive days.

    For every day that has both its own file and the previous day's file in
    the same month's file list, the absolute value of the NaN-ignoring mean of
    (previous - current) is stored per field. For each year the series are
    plotted together and then one field at a time.

    Note: the plot file names do not include the year.

    :return: None
    """
    config = get_preproc_config()
    input_folder_forecast = config[PreprocParams.input_folder_hycom]
    output_folder = config[PreprocParams.imgs_output_folder]
    YEARS = config[PreprocParams.YEARS]
    MONTHS = config[PreprocParams.MONTHS]
    fields = config[PreprocParams.fields_names]
    layers = config[PreprocParams.layers_to_plot]

    img_viz = EOAImageVisualizer(output_folder=output_folder,
                                 disp_images=False)

    def first_match(pattern, file_names):
        """Index of the first name matching pattern (IndexError if none)."""
        return [i for i, c_file in enumerate(file_names)
                if re.search(pattern, c_file) is not None][0]

    for c_year in YEARS:
        # Accumulators restart every year
        diff_per_field = {field: [] for field in fields}
        days_with_data = []

        for c_month in MONTHS:
            # Reading the data
            try:
                _, days_of_year = get_days_from_month(c_month)
                hycom_files, hycom_paths = get_hycom_file_name(
                    input_folder_forecast, c_year, c_month)
            except Exception:
                print(F"Failed to find any file for date {c_year}-{c_month}")
                continue

            for c_day_of_month, c_day_of_year in enumerate(days_of_year):
                print(
                    F"---------- Year {c_year} day: {c_day_of_year} --------------"
                )
                # Regular expressions of the current and previous day files.
                # Raw strings so that \S reaches the regex engine as written.
                re_hycom = rF'archv.{c_year}_{c_day_of_year:03d}\S*.a'
                re_hycom_prev = rF'archv.{c_year}_{(c_day_of_year-1):03d}\S*.a'
                try:
                    # Both files are looked up in the current month's list
                    hycom_file_idx = first_match(re_hycom, hycom_files)
                    hycom_file_idx_prev = first_match(re_hycom_prev,
                                                      hycom_files)
                except IndexError as e:
                    # c_day_of_month is 0-based, the printed day is 1-based
                    print(
                        F"ERROR: The file for date {c_year} - {c_month} - {c_day_of_month + 1} (and prev day) don't exist: {e}"
                    )
                    continue

                days_with_data.append(c_day_of_year)
                model_state_np_fields = read_hycom_fields(
                    hycom_paths[hycom_file_idx], fields, layers=layers)
                model_state_np_fields_prev = read_hycom_fields(
                    hycom_paths[hycom_file_idx_prev], fields, layers=layers)

                # Difference between consecutive days for the desired fields
                for c_field_name in fields:
                    c_field = model_state_np_fields[c_field_name]
                    c_field_prev = model_state_np_fields_prev[c_field_name]
                    c_diff = np.abs(np.nanmean(c_field_prev - c_field))
                    diff_per_field[c_field_name].append(c_diff)

        # Plots the differences between consecutive days. For all the fields together.
        img_viz.plot_1d_data_np(
            days_with_data,
            list(diff_per_field.values()),
            title='Difference between days',
            labels=fields,
            file_name_prefix='HYCOM_Diff_Between_Days',
            wide_ratio=4)

        # Plots the differences between consecutive days. Separated by fields
        for field, field_diffs in diff_per_field.items():
            img_viz.plot_1d_data_np(
                days_with_data,
                [field_diffs],
                title=F'Difference between days {field}',
                labels=[field],
                file_name_prefix=F'HYCOM_Diff_Between_Days_{field}',
                wide_ratio=4)
def plot_raw_data_new(proc_id):
    """Plot the model state next to the increment for every configured day.

    For each day handled by this process the increment (``incupd``), model
    (``020_archv``) and observation (``tsis_obs_gomb4``) files are located;
    days missing any of the three are skipped. The observation file is only
    opened to print its field names. For every configured field, zeros in the
    increment are replaced by NaN and the model state is plotted next to
    either the increment itself (``srfhgt``) or model minus increment (every
    other field). ``thknss`` values are divided by 9806 first.

    The days are distributed among processes: a day is handled only when its
    0-based position within the month modulo ``NUM_PROC`` equals ``proc_id``.

    :param proc_id: index of the current process
    :return: None
    """
    config = get_preproc_config()
    input_folder_tsis = config[PreprocParams.input_folder_tsis]
    input_folder_forecast = config[PreprocParams.input_folder_hycom]
    input_folder_obs = config[PreprocParams.input_folder_obs]
    output_folder = config[PreprocParams.imgs_output_folder]
    YEARS = config[PreprocParams.YEARS]
    MONTHS = config[PreprocParams.MONTHS]
    fields = config[PreprocParams.fields_names]
    plot_modes = config[PreprocParams.plot_modes_per_field]
    layers = config[PreprocParams.layers_to_plot]

    img_viz = EOAImageVisualizer(output_folder=output_folder,
                                 disp_images=False)

    def first_match(pattern, file_names):
        """Index of the first name matching pattern (IndexError if none)."""
        return [i for i, c_file in enumerate(file_names)
                if re.search(pattern, c_file) is not None][0]

    for c_year in YEARS:
        for c_month in MONTHS:
            try:
                _, days_of_year = get_days_from_month(c_month)
                # Reads the data (DA, Free run, and observations)
                increment_files, increment_paths = get_hycom_file_name(
                    input_folder_tsis, c_year, c_month)
                hycom_files, hycom_paths = get_hycom_file_name(
                    input_folder_forecast, c_year, c_month, day_idx=2)
                obs_files, obs_paths = get_obs_file_names(
                    input_folder_obs, c_year, c_month)
            except Exception:
                print(F"Failed to find any file for date {c_year}-{c_month}")
                continue

            # This for is fixed to be able to run in parallel
            for c_day_of_month, c_day_of_year in enumerate(days_of_year):
                if (c_day_of_month % NUM_PROC) != proc_id:
                    continue

                # Regular expressions of the desired files. Raw strings so
                # that \S reaches the regex engine as written.
                re_tsis = rF'incupd.{c_year}_{c_day_of_year:03d}\S*.a'
                re_hycom = rF'020_archv.{c_year}_{c_day_of_year:03d}\S*.a'
                re_obs = rF'tsis_obs_gomb4_{c_year}{c_month:02d}{c_day_of_month+1:02d}\S*.nc'

                try:
                    # Index of the matching file for each of the three cases
                    increment_file_idx = first_match(re_tsis, increment_files)
                    hycom_file_idx = first_match(re_hycom, hycom_files)
                    obs_file_idx = first_match(re_obs, obs_files)
                except IndexError as e:
                    print(
                        F"ERROR: The file for date {c_year} - {c_month} - {(c_day_of_month+1)} doesn't exist: {e}"
                    )
                    continue

                print(
                    F" =============== Working with: {increment_files[increment_file_idx]} ============= "
                )
                print(
                    F"Available fields on increment: {read_field_names(increment_paths[increment_file_idx])}"
                )
                print(
                    F"Available fields on model: {read_field_names(hycom_paths[hycom_file_idx])}"
                )
                # The dataset is only needed for its field names: close it
                # right away. The names are formatted directly; a print()
                # nested in the f-string printed them on a separate line and
                # then reported "None".
                with xr.open_dataset(obs_paths[obs_file_idx]) as ds:
                    print(
                        F"Available fields on observations: {list(ds.keys())}"
                    )

                model_state_np_fields = read_hycom_fields(
                    hycom_paths[hycom_file_idx], fields, layers=layers)
                increment_np_fields = read_hycom_fields(
                    increment_paths[increment_file_idx], fields, layers=layers)

                # Iterate over the fields defined in PreprocConfig and plot them
                for idx_field, c_field_name in enumerate(fields):
                    increment_np_c_field = increment_np_fields[c_field_name]
                    # Zeros in the increment are replaced by NaN (in place)
                    increment_np_c_field[increment_np_c_field == 0] = np.nan
                    model_state_np_c_field = model_state_np_fields[c_field_name]

                    if c_field_name == "thknss":
                        # NOTE(review): 9806 looks like a unit conversion
                        # factor - confirm
                        divide = 9806
                        model_state_np_c_field = model_state_np_c_field / divide
                        increment_np_c_field = increment_np_c_field / divide

                    if c_field_name == "srfhgt":
                        inc = increment_np_c_field
                    else:
                        inc = model_state_np_c_field - increment_np_c_field

                    # ============ Only Background state and TSIS increment ============
                    try:
                        title = F"{c_field_name} {c_year}_{c_month:02d}_{(c_day_of_month+1):02d}"
                        img_viz.plot_3d_data_np(
                            [model_state_np_c_field, inc],
                            var_names=['HYCOM', 'Increment (TSIS)'],
                            title=title,
                            file_name_prefix=F'ModelAndIncrement_{c_field_name}_{c_year}_{c_month:02d}_{(c_day_of_month+1):02d}',
                            z_lavels_names=layers,
                            flip_data=True,
                            plot_mode=plot_modes[idx_field])
                    except Exception as e:
                        # Best effort: a failing field must not stop the others
                        print(F"Failed for field: {c_field_name}: {e}")
def plot_raw_data(proc_id):
    """
    Makes images of the available data (Free run, DA and Observations).

    For every year/month in the preprocessing configuration, the function
    looks up the increment (TSIS), HYCOM and observation files of each day,
    reads the configured fields and saves one 'Summary_*' figure per field
    through ``EOAImageVisualizer.plot_3d_data_np``.

    Days are split among processes: a day is handled by this call only when
    ``day_index % NUM_PROC == proc_id`` (``day_index`` is 0-based).

    :param proc_id: index of the current process, compared against
        ``day_index % NUM_PROC``
    :return: None (figures are written by the visualizer)
    """
    config = get_preproc_config()
    input_folder_tsis = config[PreprocParams.input_folder_tsis]
    input_folder_forecast = config[PreprocParams.input_folder_hycom]
    input_folder_obs = config[PreprocParams.input_folder_obs]
    output_folder = config[PreprocParams.imgs_output_folder]
    YEARS = config[PreprocParams.YEARS]
    MONTHS = config[PreprocParams.MONTHS]
    fields = config[PreprocParams.fields_names]
    fields_obs = config[PreprocParams.fields_names_obs]
    plot_modes = config[PreprocParams.plot_modes_per_field]
    layers = config[PreprocParams.layers_to_plot]

    img_viz = EOAImageVisualizer(output_folder=output_folder,
                                 disp_images=False)

    def _first_match_idx(pattern, file_names):
        """Return the index of the first name matching ``pattern``."""
        for idx, file_name in enumerate(file_names):
            if re.search(pattern, file_name) is not None:
                return idx
        raise FileNotFoundError(F"no file name matches '{pattern}'")

    # Iterate current year
    for c_year in YEARS:
        # Iterate current month
        for c_month in MONTHS:
            try:
                _days_of_month, days_of_year = get_days_from_month(c_month)
                # Reads the data (DA, Free run, and observations)
                increment_files, increment_paths = get_hycom_file_name(
                    input_folder_tsis, c_year, c_month)
                hycom_files, hycom_paths = get_hycom_file_name(
                    input_folder_forecast, c_year, c_month)
                obs_files, obs_paths = get_obs_file_names(
                    input_folder_obs, c_year, c_month)
            except Exception as e:
                # Report the cause instead of silently dropping it
                print(
                    F"Failed to find any file for date {c_year}-{c_month}: {e}"
                )
                continue

            # This for is fixed to be able to run in parallel
            for c_day_of_month, c_day_of_year in enumerate(days_of_year):
                if (c_day_of_month % NUM_PROC) != proc_id:
                    continue

                # enumerate() is 0-based, file names use the 1-based day
                day = c_day_of_month + 1

                # Regular expressions of the current desired files. Raw
                # strings keep '\S' as a regex token (same pattern as before,
                # without the invalid-escape warning).
                re_tsis = rF'incupd.{c_year}_{c_day_of_year:03d}\S*.a'
                re_hycom = rF'archv.{c_year}_{c_day_of_year:03d}\S*.a'
                re_obs = rF'tsis_obs_ias_{c_year}{c_month:02d}{day:02d}\S*.nc'

                try:
                    # Gets the proper index of the file for the three cases
                    increment_file_idx = _first_match_idx(
                        re_tsis, increment_files)
                    hycom_file_idx = _first_match_idx(re_hycom, hycom_files)
                    obs_file_idx = _first_match_idx(re_obs, obs_files)
                except Exception as e:
                    print(
                        F"ERROR: The file for date {c_year} - {c_month} - {day} doesn't exist: {e}"
                    )
                    continue

                print(
                    F" =============== Working with: {increment_files[increment_file_idx]} ============= "
                )
                print(
                    F"Available fields on increment: {read_field_names(increment_paths[increment_file_idx])}"
                )
                increment_np_fields = read_hycom_fields(
                    increment_paths[increment_file_idx], fields, layers=layers)
                model_state_np_fields = read_hycom_fields(
                    hycom_paths[hycom_file_idx], fields, layers=layers)
                # Observations are read only at layer 0 and renamed to the
                # model field names so they can be indexed with the same key
                obs_np_fields = read_netcdf(obs_paths[obs_file_idx],
                                            fields_obs,
                                            layers=[0],
                                            rename_fields=fields)

                for idx_field, c_field_name in enumerate(fields):
                    increment_np_c_field = increment_np_fields[c_field_name]
                    # Exact zeros in the increment are replaced by NaN
                    # (in place); presumably zeros mark cells without data
                    # -- TODO confirm
                    increment_np_c_field[increment_np_c_field == 0] = np.nan
                    model_state_np_c_field = model_state_np_fields[
                        c_field_name]
                    obs_np_c_field = obs_np_fields[c_field_name]

                    title = F"{c_field_name} {c_year}_{c_month:02d}_{day:02d}"
                    # ============ Observations, HYCOM and TSIS increment ====
                    img_viz.plot_3d_data_np(
                        [
                            # Observations get a leading axis of size one so
                            # they are passed like the layered model fields
                            np.expand_dims(obs_np_c_field, 0),
                            model_state_np_c_field,
                            increment_np_c_field
                        ],
                        var_names=[
                            F'Observations', 'HYCOM', 'Increment (TSIS)'
                        ],
                        title=title,
                        file_name_prefix=
                        F'Summary_{c_field_name}_{c_year}_{c_month:02d}_{day:02d}',
                        z_lavels_names=layers,
                        flip_data=True,
                        plot_mode=plot_modes[idx_field])
def main():
    """
    Runs a trained 1D model on every station and plots forecast vs original.

    For each station code in the hard-coded list the function:
      1) picks the first model-weights file and the first merged CSV whose
         names contain the station code,
      2) min-max normalizes the CSV,
      3) builds X (rows at time t) and Y (``pollutant`` at
         t + ``forecasted_hours``) using only the dates for which both exist,
      4) loads the weights, predicts, and saves a 1D plot with the first
         24 * 60 samples.

    A failure on one station is printed and the loop goes on with the next.

    :return: None
    """
    # Kept as a function-level import, as in the original code
    import matplotlib.pyplot as plt

    config = get_makeprediction_config()
    # *********** Reads the parameters ***********
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]
    pollutant = config[LocalTrainingParams.pollutant]

    # ********** Reading and preprocessing data *******
    _all_stations = [
        "ACO", "AJM", "AJU", "ARA", "ATI", "AZC", "BJU", "CAM", "CCA", "CES",
        "CFE", "CHO", "COR", "COY", "CUA", "CUI", "CUT", "DIC", "EAJ", "EDL",
        "FAC", "FAN", "GAM", "HAN", "HGM", "IBM", "IMP", "INN", "IZT", "LAA",
        "LAG", "LLA", "LOM", "LPR", "LVI", "MCM", "MER", "MGH", "MIN", "MON",
        "MPA", "NET", "NEZ", "PED", "PER", "PLA", "POT", "SAG", "SFE", "SHA",
        "SJA", "SNT", "SUR", "TAC", "TAH", "TAX", "TEC", "TLA", "TLI", "TPN",
        "UAX", "UIZ", "UNM", "VAL", "VIF", "XAL", "XCH"
    ]

    # NOTE: the per-station files come from these hard-coded folders, not
    # from the input/model file entries of the configuration.
    models_folder = '/data/UNAM/Air_Pollution_Forecast/Data/Training/models'
    data_folder = '/data/UNAM/Air_Pollution_Forecast/Data/MergedDataCSV'
    plot_this_many = 24 * 60  # Two months of hourly data

    # Iterate over the stations
    for c_station in _all_stations:
        try:
            # Selects the proper model and data file for the current station
            station_model_files = [
                join(models_folder, x) for x in listdir(models_folder)
                if c_station in x
            ]
            station_input_files = [
                join(data_folder, x) for x in listdir(data_folder)
                if c_station in x
            ]
            # Explicit errors instead of assert (asserts vanish under -O)
            if not station_model_files:
                raise FileNotFoundError(
                    F"no model weights for {c_station} in {models_folder}")
            if not station_input_files:
                raise FileNotFoundError(
                    F"no input data for {c_station} in {data_folder}")
            print(
                F"Working with: {station_model_files} and {station_input_files}"
            )
            model_weights_file = station_model_files[0]
            input_file = station_input_files[0]

            data = pd.read_csv(input_file, index_col=0)
            # The network input size is the number of CSV columns
            config[ModelParams.INPUT_SIZE] = len(data.columns)
            print(F'Data shape: {data.shape} Data axes {data.axes}')
            print("Done!")

            # Predicting for the next value after 24hrs (only one)
            print("Normalizing data....")
            datetimes_str = data.index.values
            datetimes = [
                datetime.strptime(x, constants.datetime_format.value)
                for x in datetimes_str
            ]

            scaler = preprocessing.MinMaxScaler()
            scaler = scaler.fit(data)
            data_norm_np = scaler.transform(data)
            data_norm_df = DataFrame(data_norm_np,
                                     columns=data.columns,
                                     index=data.index)
            print(F'Done!')

            # Filtering only dates where there is data "forecasted hours
            # after" (24 hrs after)
            print(F"\tBuilding X and Y ....")
            # First row index of every datetime: O(1) lookups instead of a
            # full scan per row. setdefault keeps the FIRST occurrence, the
            # same row the previous argwhere(...)[0][0] selected.
            first_idx_by_datetime = {}
            for i, c_datetime in enumerate(datetimes):
                first_idx_by_datetime.setdefault(c_datetime, i)

            forecast_delta = timedelta(hours=forecasted_hours)
            accepted_times_idx = []
            y_times_idx = []
            for i, c_datetime in enumerate(datetimes):
                y_idx = first_idx_by_datetime.get(c_datetime + forecast_delta)
                if y_idx is not None:
                    accepted_times_idx.append(i)
                    y_times_idx.append(y_idx)

            X_df = data_norm_df.loc[datetimes_str[accepted_times_idx]]
            Y_df = data_norm_df.loc[datetimes_str[y_times_idx]][pollutant]
            X = X_df.values
            Y = Y_df.values

            print(F'X shape: {X.shape} Y shape: {Y.shape}')

            # *********** Chooses the proper model ***********
            print('Reading model ....')
            model = select_1d_model(config)

            # *********** Reads the weights***********
            print('Reading weights ....')
            model.load_weights(model_weights_file)

            create_folder(output_folder)
            create_folder(output_imgs_folder)

            # *********** Makes the prediction *********
            t0 = time.time()
            output_nn_all = model.predict(X, verbose=1)

            # Plotting some intermediate results.
            # NOTE: the previous random start/end window was never used and
            # its randint() raised for series with <= 1440 rows, making short
            # stations fail; it has been removed.
            plt.figure(figsize=[64, 8])
            try:
                x_plot = range(len(Y))
                viz_obj = EOAImageVisualizer(output_folder=output_imgs_folder,
                                             disp_images=False)
                viz_obj.plot_1d_data_np(
                    x_plot[0:plot_this_many],
                    [Y[0:plot_this_many], output_nn_all[0:plot_this_many, 0]],
                    title=F"{c_station} {pollutant}",
                    labels=['Original', 'Forecasted'],
                    wide_ratio=4,
                    file_name_prefix=F"{pollutant}_{c_station}")
            finally:
                # One figure is opened per station; release it so figures do
                # not pile up over the whole station list
                plt.close('all')

            print(F'\t Done! Elapsed time {time.time() - t0:0.2f} seg')

        except Exception as e:
            # Best effort: report the station and continue with the next one
            print(
                F"---------------------------- Failed {c_station} error: {e} ----------------"
            )
def test_model(config):
    """
    Evaluates a trained 2D network on every preprocessed 'model_*' file.

    For each file ``model_<year>_<day>.nc`` found in the input folder, the
    matching ``increment_*`` and ``obs_*`` files are read, X and Y are built
    with ``generateXandY``, the network output is denormalized and compared
    against the increment, and two figures are saved (all inputs/outputs and
    CNN vs TSIS with the error). A CSV with the RMSE and prediction time of
    every processed file is written at the end.

    Files for which ``generateXandY`` fails are reported and skipped; they do
    not appear in the summary CSV.

    :param config: dictionary indexed by PredictionParams,
        ProjTrainingParams and TrainingParams entries
    :return: None
    :raises ValueError: if the configured network type is not supported
    """
    input_folder = config[PredictionParams.input_folder]
    output_folder = config[PredictionParams.output_folder]
    output_fields = config[ProjTrainingParams.output_fields]
    model_weights_file = config[PredictionParams.model_weights_file]
    output_imgs_folder = config[PredictionParams.output_imgs_folder]
    field_names_model = config[ProjTrainingParams.fields_names]
    field_names_obs = config[ProjTrainingParams.fields_names_obs]
    run_name = config[TrainingParams.config_name]
    norm_type = config[ProjTrainingParams.norm_type]

    output_imgs_folder = join(output_imgs_folder, run_name)
    create_folder(output_imgs_folder)

    # *********** Chooses the proper model ***********
    print('Reading model ....')
    net_type = config[ProjTrainingParams.network_type]
    if net_type == NetworkTypes.UNET or net_type == NetworkTypes.UNET_MultiStream:
        model = select_2d_model(config, last_activation=None)
    elif net_type == NetworkTypes.SimpleCNN_2:
        model = simpleCNN(config, nn_type="2d", hid_lay=2, out_lay=2)
    elif net_type == NetworkTypes.SimpleCNN_4:
        model = simpleCNN(config, nn_type="2d", hid_lay=4, out_lay=2)
    elif net_type == NetworkTypes.SimpleCNN_8:
        model = simpleCNN(config, nn_type="2d", hid_lay=8, out_lay=2)
    elif net_type == NetworkTypes.SimpleCNN_16:
        model = simpleCNN(config, nn_type="2d", hid_lay=16, out_lay=2)
    else:
        # Previously this case surfaced later as an unbound 'model' variable
        raise ValueError(F"Unsupported network type: {net_type}")

    plot_model(model,
               to_file=join(output_folder, F'running.png'),
               show_shapes=True)

    # *********** Reads the weights***********
    print('Reading weights ....')
    model.load_weights(model_weights_file)

    # *********** Read files to predict***********
    all_files = sorted(os.listdir(input_folder))
    model_files = [x for x in all_files if x.startswith('model')]

    z_layers = [0]
    var_file = join(input_folder, "cov_mat", "tops_ias_std.nc")
    field_names_std = config[ProjTrainingParams.fields_names_var]
    if len(field_names_std) > 0:
        input_fields_std = read_netcdf(var_file, field_names_std, z_layers)
    else:
        input_fields_std = []

    cmap_out = chooseCMAP(output_fields)
    cmap_model = chooseCMAP(field_names_model)
    cmap_obs = chooseCMAP(field_names_obs)
    cmap_std = chooseCMAP(field_names_std)

    # Summary columns; one entry per file that was actually processed
    processed_files = []
    all_whole_mean_times = []
    all_whole_sum_times = []
    all_whole_rmse = []

    for c_file in model_files:
        # File names look like model_<year>_<day_of_year>.nc
        year = int(c_file.split('_')[1])
        day_of_year = int(c_file.split('_')[2].split('.')[0])

        model_file = join(input_folder,
                          F'model_{year}_{day_of_year:03d}.nc')
        inc_file = join(input_folder,
                        F'increment_{year}_{day_of_year:03d}.nc')
        obs_file = join(input_folder, F'obs_{year}_{day_of_year:03d}.nc')

        # *********************** Reading files **************************
        input_fields_model = read_netcdf(model_file, field_names_model,
                                         z_layers)
        input_fields_obs = read_netcdf(obs_file, field_names_obs, z_layers)
        output_field_increment = read_netcdf(inc_file, output_fields,
                                             z_layers)

        # ******************* Normalizing and Cropping Data *******************
        try:
            perc_ocean = .01
            # NOTE(review): grows/gcols are not defined inside this function;
            # they are expected to exist at module level -- confirm.
            input_data, y_data = generateXandY(input_fields_model,
                                               input_fields_obs,
                                               input_fields_std,
                                               output_field_increment,
                                               field_names_model,
                                               field_names_obs,
                                               field_names_std,
                                               output_fields,
                                               0,
                                               0,
                                               grows,
                                               gcols,
                                               norm_type=norm_type,
                                               perc_ocean=perc_ocean)
        except Exception as e:
            # Without this 'continue' the code below ran with undefined data
            # (first file) or with the data of the previous file.
            print(F"Exception {e}. Skipping file {c_file}")
            continue

        # ******************* Replacing nan values *********
        # NaNs of the input become 0 and NaNs of the target become -0.5;
        # the -0.5 value is used below as the land marker.
        input_data_nans = np.isnan(input_data)
        input_data = np.nan_to_num(input_data, nan=0)
        y_data = np.nan_to_num(y_data, nan=-0.5)

        # Add the leading (batch) dimension expected by predict
        X = np.expand_dims(input_data, axis=0)
        Y = np.expand_dims(y_data, axis=0)

        # Make the prediction of the network
        start = time.time()
        output_nn_original = model.predict(X, verbose=1)
        elapsed = time.time() - start

        # Make nan all values inside the land
        land_indexes = Y == -0.5
        output_nn_original[land_indexes] = np.nan

        # ==== Denormalizing all inputs and outputs
        denorm_cnn_output = denormalizeData(output_nn_original, output_fields,
                                            PreprocParams.type_inc, norm_type)
        denorm_y = denormalizeData(Y, output_fields, PreprocParams.type_inc,
                                   norm_type)
        input_types = (
            [PreprocParams.type_model] * len(input_fields_model) +
            [PreprocParams.type_obs] * len(input_fields_obs) +
            [PreprocParams.type_std] * len(input_fields_std))
        denorm_input = denormalizeData(
            input_data, field_names_model + field_names_obs + field_names_std,
            input_types, norm_type)

        # Recover the original land areas, they are lost after denormalization
        denorm_y[land_indexes] = np.nan

        # Remove the 'extra dimension'
        denorm_cnn_output = np.squeeze(denorm_cnn_output)
        denorm_y = np.squeeze(denorm_y)
        whole_cnn = denorm_cnn_output  # The 'whole prediction'
        whole_y = denorm_y  # The 'whole target'

        if len(denorm_cnn_output.shape) == 2:
            # Only one output field: add a trailing axis so it can be
            # concatenated with the (multi-field) inputs for plotting
            denorm_cnn_output = np.expand_dims(denorm_cnn_output, axis=2)
            denorm_y = np.expand_dims(denorm_y, axis=2)

        # Adding back mask to all the input variables
        denorm_input[input_data_nans] = np.nan

        # ======= Whole output with RMSE
        if output_fields[0] == 'srfhgt':
            # This should only be for SSH to adjust the units.
            # New arrays are created on purpose: an in-place division would
            # also rescale denorm_cnn_output (it shares memory with
            # whole_cnn) but not denorm_y, mixing units in the first figure.
            whole_cnn = whole_cnn / 9.81
            whole_y = whole_y / 9.81

        # Error in the same units as the two fields plotted next to it
        error = whole_y - whole_cnn
        # Mean over valid cells only. np.count_nonzero(whole_cnn) counted the
        # NaN land cells as well, which diluted the RMSE.
        rmse_cnn = np.sqrt(np.nanmean(error**2))

        processed_files.append(c_file)
        all_whole_rmse.append(rmse_cnn)
        # A single prediction is timed per file, so mean and sum coincide;
        # both columns are kept for compatibility with the existing CSV
        all_whole_mean_times.append(elapsed)
        all_whole_sum_times.append(elapsed)

        # ================== DISPLAYS ALL INPUTS AND OUTPUTS DENORMALIZED ====
        viz_obj = EOAImageVisualizer(output_folder=output_imgs_folder,
                                     disp_images=False)
        viz_obj.plot_2d_data_np_raw(
            np.concatenate(
                (denorm_input.swapaxes(0, 2), denorm_y.swapaxes(0, 2),
                 denorm_cnn_output.swapaxes(0, 2))),
            var_names=[F"in_model_{x}" for x in field_names_model] +
            [F"in_obs_{x}" for x in field_names_obs] +
            [F"in_var_{x}" for x in field_names_std] +
            [F"out_inc_{x}" for x in output_fields] +
            [F"cnn_{x}" for x in output_fields],
            file_name=F"Global_Input_and_CNN_{c_file}",
            rot_90=True,
            cmap=cmap_model + cmap_obs + cmap_std + cmap_out + cmap_out,
            cols_per_row=len(field_names_model),
            title=
            F"Input data: {field_names_model} and obs {field_names_obs}, increment {output_fields}, cnn {output_fields}"
        )

        # ================== Displays CNN and TSIS with RMSE ================
        # CNN and TSIS panels share the limits of the output field; the
        # error panel is fixed to [-1, 1]
        minmax = getMinMaxPlot(output_fields)[0]
        viz_obj = EOAImageVisualizer(
            output_folder=output_imgs_folder,
            disp_images=False,
            mincbar=[minmax[0], minmax[0], -1],
            maxcbar=[minmax[1], minmax[1], 1])

        error_cmap = cmocean.cm.diff
        viz_obj.output_folder = join(output_imgs_folder,
                                     'WholeOutput_CNN_TSIS')
        viz_obj.plot_2d_data_np_raw(
            [
                np.flip(whole_cnn, axis=0),
                np.flip(whole_y, axis=0),
                np.flip(error, axis=0)
            ],
            var_names=[F"CNN increment SSH"] * len(output_fields) +
            [F"TSIS increment SSH"] * len(output_fields) +
            [F'TSIS - CNN \n (Mean RMSE {rmse_cnn:0.4f} m)'],
            file_name=F"Global_WholeOutput_CNN_TSIS_{c_file}",
            rot_90=False,
            cmap=cmap_out + cmap_out + [error_cmap],
            cols_per_row=3,
            title=F"SSH RMSE: {np.mean(rmse_cnn):0.5f} m.")

    print("DONE ALL FILES!!!!!!!!!!!!!")
    # Every column has one entry per processed file, so their lengths match
    # even when some files were skipped
    dic_summary = {
        "File": processed_files,
        "rmse": all_whole_rmse,
        "times mean": all_whole_mean_times,
        "times sum": all_whole_sum_times,
    }
    df = pd.DataFrame.from_dict(dic_summary)
    df.to_csv(join(output_imgs_folder, "Global_RMSE_and_times.csv"))
import numpy as np from datetime import date, datetime, timedelta from inout.io_hycom import read_hycom_output from inout.io_netcdf import read_netcdf from os.path import join, exists from preproc.UtilsDates import get_month_and_day_of_month_from_day_of_year, get_day_of_year_from_month_and_day # This code is just for debugging purposes (plot intermediate steps) from img_viz.eoa_viz import EOAImageVisualizer from img_viz.constants import PlotMode img_viz = EOAImageVisualizer( output_folder='/data/HYCOM/DA_HYCOM_TSIS/images/inputNN', disp_images=False) MAX_DA = {'temp': 40, 'srfhgt': 20, 'salin': 70, 'u-vel.': 4, 'v-vel.': 4} MIN_DA = {'temp': 0, 'srfhgt': -20, 'salin': 0, 'u-vel.': -4, 'v-vel.': -4} MAX_OBS = {'sst': 40, 'ssh': 0.9, 'sss': 40} MIN_OBS = {'sst': 0, 'ssh': -0.9, 'sss': 15} def data_gen_hycomtsis(paths, file_names, obs_path, field_names, obs_field_names, output_field, days_separation=1, z_layers=[0]): """
def preproc_data(proc_id):
    """
    This function preprocess the desired data. It does the following:
        1) Looks for dates where there is 'increment', model, and
           observations data.
        2) Saves the files on the same folder with only the 'desired' fields
           in netcdf format (increment_*, model_* and obs_* files).

    Days are split among processes: a day is handled by this call only when
    ``day_index % NUM_PROC == proc_id`` (``day_index`` is 0-based).

    If the increment step of a day fails, the whole day is skipped. A failure
    in the model or in the observations step is only reported.

    :param proc_id: index of the current process, compared against
        ``day_index % NUM_PROC``
    :return: None
    """
    print("Preprocessing data....")
    config = get_preproc_config()
    input_folder_increment = config[PreprocParams.input_folder_tsis]
    input_folder_model = config[PreprocParams.input_folder_hycom]
    input_folder_obs = config[PreprocParams.input_folder_obs]
    output_folder = config[PreprocParams.output_folder]
    YEARS = config[PreprocParams.YEARS]
    MONTHS = config[PreprocParams.MONTHS]
    fields = config[PreprocParams.fields_names]
    obs_fields = config[PreprocParams.fields_names_obs]
    layers = config[PreprocParams.layers_to_plot]

    # NOTE(review): not used below; kept in case the constructor has side
    # effects on output_folder (e.g. creating it) -- confirm and remove.
    img_viz = EOAImageVisualizer(output_folder=output_folder,
                                 disp_images=False)

    def _first_match_idx(pattern, file_names):
        """Return the index of the first name matching ``pattern``."""
        for idx, file_name in enumerate(file_names):
            if re.search(pattern, file_name) is not None:
                return idx
        raise FileNotFoundError(F"no file name matches '{pattern}'")

    # These are the data assimilated files
    for c_year in YEARS:
        for c_month in MONTHS:
            print(
                F"=============== Year: {c_year} Month: {c_month} ==========="
            )
            _days_of_month, days_of_year = get_days_from_month(c_month)
            # Reads all the files for this month
            da_files, da_paths = get_hycom_file_name(input_folder_increment,
                                                     c_year, c_month)
            hycom_files, hycom_paths = get_hycom_file_name(
                input_folder_model, c_year, c_month)
            obs_files, obs_paths = get_obs_file_names(input_folder_obs,
                                                      c_year, c_month)

            # This for is fixed to be able to run in parallel
            for c_day_of_month, c_day_of_year in enumerate(days_of_year):
                if (c_day_of_month % NUM_PROC) != proc_id:
                    continue

                # enumerate() is 0-based; file names and messages use the
                # 1-based day of the month
                day = c_day_of_month + 1

                # Raw strings keep '\S' as a regex token (same patterns as
                # before, without the invalid-escape warning)
                re_increment = rF'incupd.{c_year}_{c_day_of_year:03d}\S*.a'
                re_model = rF'archv.{c_year}_{c_day_of_year:03d}\S*.a'
                re_obs = rF'tsis_obs_ias_{c_year}{c_month:02d}{day:02d}\S*.nc'

                try:
                    da_file_idx = _first_match_idx(re_increment, da_files)
                    print(
                        F" =============== Working with: {da_files[da_file_idx]} Proc_id={proc_id} ============= "
                    )
                    da_np_fields = read_hycom_fields(da_paths[da_file_idx],
                                                     fields,
                                                     layers=layers)
                    hycom_file_idx = _first_match_idx(re_model, hycom_files)
                    hycom_np_fields = read_hycom_fields(
                        hycom_paths[hycom_file_idx], fields, layers=layers)
                    # --------- Preprocessing Increment (TSIS) -------------
                    proc_increment_data(
                        da_np_fields, hycom_np_fields, fields,
                        join(output_folder,
                             F"increment_{c_year}_{c_day_of_year:03d}.nc"))
                except Exception as e:
                    print(
                        F"Warning: Increment file for date {c_year}-{c_month}-{day} ({re_increment}) doesn't exist: {e}"
                    )
                    # Only when the increment file is not found we go to the
                    # next day.
                    continue

                try:
                    # hycom_file_idx is valid here: the increment step above
                    # succeeded, otherwise the day was skipped
                    print(
                        F" --------------- Working with: {hycom_files[hycom_file_idx]} ------------- "
                    )
                    # Read again instead of reusing the arrays handed to
                    # proc_increment_data, which may have modified them
                    # (cannot be verified from here)
                    hycom_np_fields = read_hycom_fields(
                        hycom_paths[hycom_file_idx], fields, layers=layers)
                    # --------- Preprocessing HYCOM data -------------
                    proc_model_data(
                        hycom_np_fields, fields,
                        join(output_folder,
                             F"model_{c_year}_{c_day_of_year:03d}.nc"))
                except Exception as e:
                    print(
                        F"Warning: HYCOM file for date {c_year}-{c_month}-{day} ({re_model}) doesn't exist: {e}"
                    )

                try:
                    obs_file_idx = _first_match_idx(re_obs, obs_files)
                    # --------- Preprocessing observed data -------------
                    # The message now names the observation file (it used to
                    # print the HYCOM file name)
                    print(
                        F" --------------- Working with: {obs_files[obs_file_idx]} ------------- "
                    )
                    obs_ds = xr.load_dataset(obs_paths[obs_file_idx])
                    # Keep only the desired observation fields
                    for id_field, c_obs_field in enumerate(obs_fields):
                        if id_field == 0:
                            preproc_obs_ds = obs_ds[c_obs_field].to_dataset()
                        else:
                            preproc_obs_ds = preproc_obs_ds.merge(
                                obs_ds[c_obs_field].to_dataset())

                    # ------ Here we add the fields from the profiles as
                    # gridded data ------
                    temp_group = 0
                    saln_group = 1
                    # Grids shaped like the last desired observation field,
                    # zero where there is no profile
                    sst_p = np.zeros(
                        preproc_obs_ds[c_obs_field].values.shape)
                    sss_p = np.zeros(sst_p.shape)

                    profiles = obs_ds.val
                    tot_profiles = profiles.shape[0]
                    obs_groups = obs_ds.ob_grp_present
                    # Grid indexes of every profile; grdj is used as the
                    # first (row) index and grdi as the second (column) one
                    lons_i = obs_ds.grdi.values[:, 0, 0]
                    lats_i = obs_ds.grdj.values[:, 0, 0]
                    for i_group, c_type in enumerate(obs_groups):
                        if c_type == saln_group:
                            target_grid = sss_p
                        elif c_type == temp_group:
                            target_grid = sst_p
                        else:
                            continue
                        for c_profile_i in range(tot_profiles):
                            # Only the last level (index -1) of each profile
                            # is copied into the grid
                            row = int(lats_i[c_profile_i])
                            col = int(lons_i[c_profile_i])
                            target_grid[row, col] = profiles[c_profile_i, -1,
                                                             i_group]

                    print(F"Max value: {np.amax(sst_p)}")
                    print(F"Max value s: {np.amax(sss_p)}")
                    preproc_obs_ds['sst_p'] = xr.DataArray(
                        sst_p, dims=['yc', 'xc'])
                    preproc_obs_ds['sss_p'] = xr.DataArray(
                        sss_p, dims=['yc', 'xc'])
                    preproc_obs_ds.to_netcdf(
                        join(output_folder,
                             F"obs_{c_year}_{c_day_of_year:03d}.nc"))
                except Exception as e:
                    print(
                        F"Warning: OBS file for date {c_year}-{c_month}-{day} doesn't exist: {e}"
                    )