def preprocess_imgs(input_folder, output_folder, resampling, img_names, out_img_names,
                    normalize_imgs, bias_corrections, fix_adc, options, save_imgs: bool):
    """
    Generates preprocessed images only; useful when classifying the test dataset of the NN.
    :param input_folder: Path to the DICOM files
    :param output_folder: Path to output nrrd files
    :param resampling: Array with 3 values indicating the resolution in x, y, z
    :param img_names: Array with the names of the images we want to transform
    :param out_img_names: Output img names
    :param normalize_imgs: Bool array indicating if we need to perform percentile normalization on each image
    :param bias_corrections: Bool array indicating which images get N4 bias correction
    :param fix_adc: Bool array indicating which images need their ADC 'black' values corrected
    :param options: Dict of PreprocParams controlling which preprocessing steps run
    :param save_imgs: Bool indicating whether to save the output images
    :return:
    """
    viz_obj = MedicalImageVisualizer()
    create_folder(output_folder)

    # ******************** READS DATA *******************
    print('\tReading data....')
    [orig_imgs, final_img_names] = read_dicom_mri_series(input_folder, img_names, out_img_names)
    # Saves original images without bias correction
    if save_imgs:
        write_itk_imgs(output_folder, 'img', orig_imgs, final_img_names)

    # ************** Correcting ADC intensities *************
    if len(fix_adc) == len(orig_imgs):
        for idx_fix_adc, c_fix_adc in enumerate(fix_adc):
            if c_fix_adc:
                print('\tFixing ADC, changing "black" values on original images ....')
                orig_imgs[idx_fix_adc] = correct_adc_itk(orig_imgs[idx_fix_adc])
    else:
        print('\tNo ADC img is being fixed')

    # ************** Normalize images (N4 bias correction) *************
    if options[PreprocParams.bias_correction]:
        print("\tBias correction.....")
        pretxt = 'img_n4k'
        for ii in range(len(orig_imgs)):
            if bias_corrections[ii]:
                orig_imgs[ii] = n4itk(orig_imgs[ii])
        # Saving bias corrected images
        if save_imgs:
            write_itk_imgs(output_folder, pretxt, orig_imgs, final_img_names)

    norm_perc = options[PreprocParams.normalize_percentiles]
    for idx_img in range(len(orig_imgs)):
        if normalize_imgs[idx_img]:
            print(F'\tNormalizing intensities ... {img_names[idx_img]}')
            orig_imgs[idx_img] = normalize_to_percentiles([orig_imgs[idx_img]], norm_perc[0], norm_perc[1])[0]

    # Initialized here so the return below does not raise a NameError when the
    # resampling/cropping steps are disabled in the options
    resampled_imgs = []
    roi_imgs = []

    # *********** Resample to [.5,.5,.5] and interpolate with optical flow ******************
    if options[PreprocParams.resample]:
        print("\tResampling .....")
        # viz_obj.plot_img_and_ctrs_itk(orig_imgs[0], slices=SliceMode.MIDDLE, title='Before resampling')
        resampled_imgs, _ = reample_imgs_and_ctrs(orig_imgs, [], resampling)
        # viz_obj.plot_img_and_ctrs_itk(resampled_imgs[0], slices=SliceMode.MIDDLE, title='RESAMPLED')
        if save_imgs:
            write_itk_imgs(output_folder, 'hr', resampled_imgs, final_img_names)

    # *********** Crop and normalize to 0 and 1 ************
    if options[PreprocParams.compute_roi_from_intersection]:
        print("\tCropping.....")
        roi_imgs, _, startROI_final, sizeROI_final = getCroppedIsotropicImgsOZ(resampled_imgs, [])
        # Saves the size and start position of the ROI, used when running the model
        np.savetxt(join(output_folder, 'start_ROI.csv'), startROI_final)
        np.savetxt(join(output_folder, 'size_ROI.csv'), sizeROI_final)
        # Save the roi images
        if save_imgs:
            write_itk_imgs(output_folder, 'roi', roi_imgs, final_img_names)

    return orig_imgs, resampled_imgs, roi_imgs
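# Hedged usage sketch (illustrative only): a minimal call to preprocess_imgs.
# The paths and series names are hypothetical; the PreprocParams keys are the
# ones this module reads, but their real values live in the project's configs.
def _example_preprocess_call():
    example_options = {
        PreprocParams.bias_correction: True,
        PreprocParams.normalize_percentiles: [1, 99],  # assumed low/high percentiles
        PreprocParams.resample: True,
        PreprocParams.compute_roi_from_intersection: True,
    }
    preprocess_imgs(input_folder='/data/dicom/Case-0001',     # hypothetical path
                    output_folder='/data/preproc/Case-0001',  # hypothetical path
                    resampling=[.5, .5, .5],
                    img_names=['tra', 'adc'],                 # hypothetical series names
                    out_img_names=['tra', 'adc'],
                    normalize_imgs=[True, True],
                    bias_corrections=[True, False],
                    fix_adc=[False, True],
                    options=example_options,
                    save_imgs=True)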
def preprocess_imgs_and_ctrs(input_folder, output_folder, resampling, img_names, ctr_names_orig,
                             out_img_names, out_ctr_names_orig, match_whole_w_ctr, normalize_imgs,
                             ctr_folder_names, bias_corrections, fix_adc, options):
    """
    Generates nrrd files from dicom files, for both contours and series.
    :param input_folder: Path to the DICOM files
    :param output_folder: Path to output nrrd files
    :param resampling: Array with 3 values indicating the resolution in x, y, z
    :param img_names: Array with the names of the images we want to transform
    :param ctr_names_orig: Array with the names of the contours we want to read
    :param out_img_names: Output img names
    :param out_ctr_names_orig: Output ctr names
    :param match_whole_w_ctr: Bool array indicating if the contour names must match exactly or as a RegEx
    :param normalize_imgs: Bool array indicating if we need to perform percentile normalization on each image
    :param ctr_folder_names: Str array with the 'folder' names to search for contours
    :param bias_corrections: Bool array indicating which images get N4 bias correction
    :param fix_adc: Bool array indicating which images need their ADC 'black' values corrected
    :param options: Dict of PreprocParams controlling which preprocessing steps run
    :return:
    """
    viz_obj = MedicalImageVisualizer()
    ctr_names = ctr_names_orig.copy()  # Patch to avoid problems with global variable
    out_ctr_names = out_ctr_names_orig.copy()  # Patch to avoid problems with global variable
    create_folder(output_folder)

    # ******************** READS DATA *******************
    print('\tReading data....')
    [orig_imgs, final_img_names] = read_dicom_mri_series(input_folder, img_names, out_img_names)
    [orig_ctrs, final_ctr_names] = read_rtstruct_mri_series(input_folder,
                                                            ctr_folder_names=ctr_folder_names,
                                                            in_ctr_names=ctr_names,
                                                            out_ctr_names=out_ctr_names,
                                                            ref_img_itk=orig_imgs[0],
                                                            match_whole_word=match_whole_w_ctr)
    # Saves original images without bias correction
    write_itk_imgs(output_folder, 'img', orig_imgs, final_img_names)
    write_itk_imgs(output_folder, 'ctr', orig_ctrs, final_ctr_names)

    # ************** Correcting ADC intensities *************
    for idx_fix_adc, c_fix_adc in enumerate(fix_adc):
        if c_fix_adc:
            print('\tFixing ADC, changing "black" values on original images ....')
            orig_imgs[idx_fix_adc] = correct_adc_itk(orig_imgs[idx_fix_adc])

    # ************** Normalize images (N4 bias correction) *************
    if options[PreprocParams.bias_correction]:
        print("\tBias correction.....")
        pretxt = 'img_n4k'
        for ii in range(len(orig_imgs)):
            if bias_corrections[ii]:
                # First try to read an existing file, if not, compute it
                file_name = join(output_folder, '{}_{}.nrrd'.format(pretxt, final_img_names[ii]))
                if exists(file_name):
                    print('\t\tReading previous n4k file...')
                    orig_imgs[ii] = sitk.ReadImage(file_name)
                else:
                    orig_imgs[ii] = n4itk(orig_imgs[ii])
        # Saving bias corrected images
        write_itk_imgs(output_folder, pretxt, orig_imgs, final_img_names)

    norm_perc = options[PreprocParams.normalize_percentiles]
    for idx_img in range(len(orig_imgs)):
        if normalize_imgs[idx_img]:
            print(F'\tNormalizing intensities ... {img_names[idx_img]}')
            orig_imgs[idx_img] = normalize_to_percentiles([orig_imgs[idx_img]], norm_perc[0], norm_perc[1])[0]

    # *********** Resample to [.5,.5,.5] and interpolate with optical flow ******************
    if options[PreprocParams.resample]:
        print("\tResampling .....")
        # viz_obj.plot_img_and_ctrs_itk(orig_imgs[0], orig_ctrs, slices=SliceMode.MIDDLE, title='Before resampling')
        resampled_imgs, resampled_ctrs = reample_imgs_and_ctrs(orig_imgs, orig_ctrs, resampling)
        # viz_obj.plot_img_and_ctrs_itk(resampled_imgs[0], resampled_ctrs, slices=SliceMode.MIDDLE, title='RESAMPLED')
        if options[PreprocParams.optical_flow_ctr_interpolation]:
            print('\t\tOptical flow ....')
            resampled_ctrs = optical_flow_interpolation(resampled_ctrs)
        write_itk_imgs(output_folder, 'hr', resampled_imgs, final_img_names)
        write_itk_imgs(output_folder, 'hr_ctr', resampled_ctrs, final_ctr_names)

    # *********** Crop and normalize to 0 and 1 ************
    if options[PreprocParams.compute_roi_from_intersection]:
        print("\tCropping.....")
        roi_imgs, roi_ctrs, startROI_final, sizeROI_final = getCroppedIsotropicImgsOZ(resampled_imgs, resampled_ctrs)
        if options[PreprocParams.smooth_ctrs]:
            print("\t\tSmoothing ctrs.....")
            # viz_obj.plot_img_and_ctrs_itk(roi_ctrs[0], slices=SliceMode.MIDDLE, title='Before smoothing')
            roi_ctrs = smoothContours(roi_ctrs)
            # viz_obj.plot_img_and_ctrs_itk(roi_ctrs[0], slices=SliceMode.MIDDLE, title='After smoothing')
        # Saves the size and start position of the ROI, used when running the model
        np.savetxt(join(output_folder, 'start_ROI.csv'), startROI_final)
        np.savetxt(join(output_folder, 'size_ROI.csv'), sizeROI_final)
        # Save the roi images
        write_itk_imgs(output_folder, 'roi', roi_imgs, final_img_names)
        write_itk_imgs(output_folder, 'roi_ctr', roi_ctrs, final_ctr_names)
        # viz_obj.plot_imgs_and_ctrs_itk(roi_imgs, roi_ctrs, slices=SliceMode.MIDDLE, title='Final ROIs')

    print("DONE!!!!...")
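# Hedged sketch: `normalize_to_percentiles` is defined elsewhere. Percentile
# normalization typically clips intensities to the [low, high] percentiles and
# rescales to [0, 1]; a SimpleITK/numpy version might look like this (assumed,
# not the repo's exact code):
def normalize_to_percentiles_sketch(imgs_itk, low_perc, high_perc):
    out = []
    for img in imgs_itk:
        arr = sitk.GetArrayFromImage(img).astype(np.float32)
        lo, hi = np.percentile(arr, [low_perc, high_perc])
        arr = (np.clip(arr, lo, hi) - lo) / (hi - lo)
        norm = sitk.GetImageFromArray(arr)
        norm.CopyInformation(img)  # keep spacing/origin/direction
        out.append(norm)
    return out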
loss_func = config[TrainingParams.loss_function]
batch_size = config[TrainingParams.batch_size]
epochs = config[TrainingParams.epochs]
img_names = config[TrainingParams.image_file_names]
model_name_user = config[TrainingParams.config_name]
class_label_file_name = config[TrainingParams.class_label_file_name]
optimizer = config[TrainingParams.optimizer]
nn_input_size = config[ModelParams.INPUT_SIZE]
model_type = config[ModelParams.MODEL]

split_info_folder = join(output_folder, 'Splits')
parameters_folder = join(output_folder, 'Parameters')
weights_folder = join(output_folder, 'models')
logs_folder = join(output_folder, 'logs')
create_folder(split_info_folder)
create_folder(parameters_folder)
create_folder(weights_folder)
create_folder(logs_folder)

folders_to_read = select_cases_from_folder(input_folder, config[TrainingParams.cases])
tot_examples = len(folders_to_read)

# ================ Split definition =================
[train_ids, val_ids, test_ids] = utilsNN.split_train_validation_and_test(tot_examples,
                                                                         val_percentage=val_perc,
                                                                         test_percentage=test_perc)
print("Train examples (total:{}) :{}".format(len(train_ids), train_ids))
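# Hedged sketch: `utilsNN.split_train_validation_and_test` (used above) is
# presumably a random permutation split along these lines (assumed, not the
# repo's exact code):
def split_train_validation_and_test_sketch(tot_examples, val_percentage, test_percentage):
    ids = np.random.permutation(tot_examples)
    n_test = int(np.ceil(tot_examples * test_percentage))
    n_val = int(np.ceil(tot_examples * val_percentage))
    test_ids = ids[:n_test]
    val_ids = ids[n_test:n_test + n_val]
    train_ids = ids[n_test + n_val:]
    return train_ids, val_ids, test_ids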
def main():
    config = get_makeprediction_config()
    # *********** Reads the parameters ***********
    input_file = config[ClassificationParams.input_file]
    splits_file = config[ClassificationParams.split_file]
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    output_file_name = config[ClassificationParams.output_file_name]
    run_name = config[TrainingParams.config_name]
    model_weights_file = config[ClassificationParams.model_weights_file]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]
    disp_images = config[ClassificationParams.show_imgs]
    generate_images = config[ClassificationParams.generate_images]
    metrics_user = config[ClassificationParams.metrics]
    filter_stations = config[LocalTrainingParams.stations]

    assert len(model_weights_file) > 0
    assert len(input_file) > 0
    print(F"Working with: {model_weights_file} \n and \n {input_file}")

    data = pd.read_csv(input_file, index_col=0, parse_dates=True)

    all_data_cols = data.columns
    date_columns = [x for x in all_data_cols
                    if (x.find('week') != -1) or (x.find('hour') != -1) or (x.find('year') != -1)]
    stations_columns = [x for x in all_data_cols
                        if (x.find('h') == -1) and (x not in date_columns)]
    meteo_columns = [x for x in all_data_cols
                     if (x.find('h') != -1) and (x not in date_columns) and (x not in stations_columns)]
    desired_columns = meteo_columns + filter_stations + date_columns

    print("Appending date hot vector...")
    date_hv = generate_date_hot_vector(data.index)
    data = pd.concat([data[desired_columns], date_hv], axis=1)
    print("Done!")

    # Restricting data to the hours between 9 and 20
    filtered_data = data.between_time("9:00", "20:00")
    # filtered_data = data
    datetimes_str = filtered_data.index.values

    print(F'Normalizing and filtering data....')
    parameters_folder = join(dirname(output_folder), 'Training', 'Parameters')
    data_norm_df_final, accepted_times_idx, y_times_idx, stations_columns, meteo_columns = \
        normalizeAndFilterData(filtered_data, datetimes_str, forecasted_hours,
                               output_folder=parameters_folder, run_name=run_name, read_from_file=True)

    X_df = data_norm_df_final.loc[datetimes_str[accepted_times_idx]]
    Y_df = data_norm_df_final.loc[datetimes_str[y_times_idx]][stations_columns]

    # ********* Filling nan values in the stations with the mean values of all the 'available' stations ********
    # for cur_station in stations_columns:
    #     X_df[cur_station] = X_df[cur_station].fillna(X_df['MEAN'])
    #     Y_df[cur_station] = Y_df[cur_station].fillna(data_norm_df_final.loc[datetimes_str[y_times_idx]]['MEAN'])
    # X = data_norm_df_final.loc[datetimes_str[accepted_times_idx]].values
    # X_df = X_df.drop(columns=['MEAN'])

    X_df = X_df.drop(columns=stations_columns)
    X = X_df.values
    # Y = data_norm_df_final.loc[datetimes_str[y_times_idx]][stations_columns].values
    Y = Y_df.values
    config[ModelParams.INPUT_SIZE] = len(X_df.columns)
    print(F'X shape: {X.shape} Y shape: {Y.shape}')

    # *********** Chooses the proper model ***********
    print('Reading model ....')
    config[ModelParams.NUMBER_OF_OUTPUT_CLASSES] = Y.shape[1]
    model = select_1d_model(config)

    # *********** Reads the splits info ***********
    print('Reading splits info....')
    if splits_file != '':
        # In this case we do read the information
        split_info = pd.read_csv(splits_file, dtype=np.int16)
    else:
        split_info = pd.DataFrame({'train_ids': [], 'validation_ids': [], 'test_id': []})
        split_info['train_ids'] = range(Y.shape[0])

    # *********** Reads the weights ***********
    print('Reading weights ....')
    model.load_weights(model_weights_file)

    # ************ Makes NN Prediction ********
    print('Making prediction ....')
    output_nn_all = model.predict(X, verbose=1)

    # ************ Saves raw results ********
    number_of_examples = 10
    if generate_images:
        img_viz = EOAImageVisualizer(output_folder=output_imgs_folder, disp_images=disp_images)
        Y[Y == -1] = np.nan  # So that we do not show the -1
        for c_example in range(number_of_examples):
            hours_to_plot = 24 * 3  # How many points to plot
            start_idx = np.random.randint(0, X.shape[0] - hours_to_plot - forecasted_hours)
            end_idx = start_idx + hours_to_plot
            create_folder(output_folder)
            create_folder(output_imgs_folder)
            for idx_station, cur_station in enumerate(filter_stations):
                img_viz.plot_1d_data_np(datetimes_str[y_times_idx][start_idx:end_idx],
                                        [Y[start_idx:end_idx, idx_station],
                                         output_nn_all[start_idx:end_idx, idx_station]],
                                        title=F'{cur_station}',
                                        labels=['GT', 'NN'],
                                        file_name_prefix=F'{cur_station}_{c_example}')

    # ************ Recovering original units ********
    print('Recovering original units....')
    nn_df = pd.DataFrame(output_nn_all, columns=stations_columns,
                         index=filtered_data.index[y_times_idx])
    nn_original_units = deNormalize(nn_df)
    Y_original = deNormalize(Y_df)

    # ************ Computing metrics ********
    print('Computing metrics and saving predictions....')
    compute_metrics(Y_original, nn_original_units, metrics_user, split_info,
                    output_file_name, stations_columns)
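# Hedged sketch: `deNormalize` is defined elsewhere in the repo. Given that the
# station columns are normalized with the global ozone min/max constants in
# normalizeAndFilterData (see below), the inverse mapping is presumably the
# following (the constant names come from that function; this is an assumption,
# not the repo's exact code, and it ignores the -1 nan fill values):
def deNormalize_sketch(df_norm):
    return df_norm * (_max_value_ozone - _min_value_ozone) + _min_value_ozone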
def make_3d_segmentation(config):
    """
    Performs 3D segmentation for all the selected cases, optionally recovering
    the original resolution, saving contours/images, and computing metrics.
    :param config: Dict of ClassificationParams with the run configuration
    :return:
    """
    # *********** Reads the parameters ***********
    cases = config[ClassificationParams.cases]
    save_segmented_ctrs = config[ClassificationParams.save_segmented_ctrs]
    input_folder = config[ClassificationParams.input_folder]
    input_img_names = config[ClassificationParams.input_img_file_names]
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    output_file_name = config[ClassificationParams.output_file_name]
    model_weights_file = config[ClassificationParams.model_weights_file]
    compute_metrics = config[ClassificationParams.compute_metrics]
    compute_original_resolution = config[ClassificationParams.compute_original_resolution]
    save_imgs = config[ClassificationParams.save_imgs]
    if save_imgs:
        save_imgs_planes = config[ClassificationParams.save_img_planes]
        save_imgs_slices = config[ClassificationParams.save_img_slices]

    # Builds the visualization object
    viz_obj = MedicalImageVisualizer(disp_images=config[ClassificationParams.show_imgs],
                                     output_folder=output_imgs_folder)
    if compute_metrics:
        output_ctr_file_names = config[ClassificationParams.output_ctr_file_names]
    else:
        output_ctr_file_names = []

    # *********** Chooses the proper model ***********
    print('Reading model ....')
    model = select_3d_model(config)

    # *********** Reads the weights ***********
    print('Reading weights ....')
    model.load_weights(model_weights_file)

    examples = select_cases_from_folder(input_folder, cases)
    create_folder(output_imgs_folder)

    # *********** Makes a dataframe to contain the DSC information **********
    metrics_params = config[ClassificationParams.metrics]
    metrics_dict = {met.name: met.value for met in metrics_params}

    # Check if the output file already exists; in that case read the df from it.
    if os.path.exists(join(output_imgs_folder, output_file_name)):
        data = pd.read_csv(join(output_imgs_folder, output_file_name), index_col=0)
        data_columns = list(data.columns)  # Keep the column names available for the plots below
    else:
        data_columns = list(metrics_dict.values())
        if compute_original_resolution:
            # In this case we add all the desired metrics again, prefixed with 'original'.
            # List concatenation (rather than a set) keeps the column order stable.
            data_columns = data_columns + [F'{ORIGINAL_TXT}_{col}' for col in data_columns]
        data = DataFrame(index=examples, columns=data_columns)

    # *********** Iterates over each case *********
    segmentation_type = config[ClassificationParams.segmentation_type]
    for id_folder, current_folder in enumerate(examples):
        print(F'******* Computing folder {current_folder} ************')
        t0 = time.time()
        try:
            # -------------------- Reading data -------------
            print('\t Reading data....')
            # All these names are predefined; any other 3d segmentation will need a different configuration
            imgs_itk, ctrs_itk, size_roi, start_roi, _ = read_preproc_imgs_and_ctrs_itk(
                input_folder, folders_to_read=[current_folder],
                img_names=input_img_names, ctr_names=output_ctr_file_names)
            imgs_np = [sitk.GetArrayFromImage(c_img) for c_img in imgs_itk[0]]  # The 0 is because we read a single fold
            ctrs_np = [sitk.GetArrayFromImage(c_img) for c_img in ctrs_itk[0]]
            # If we want to visualize the input images
            # viz_obj.plot_imgs_and_ctrs_itk(imgs_itk[0], ctrs_itk=ctrs_itk[0])

            # ------------------- Making prediction -----------
            print('\t Making prediction....')
            input_array = format_for_nn_classification(imgs_np)
            output_nn_all = model.predict(input_array, verbose=1)
            output_nn_np = output_nn_all[0, :, :, :, 0]
            # For visualizing the output of the network
            # viz_obj.plot_img_and_ctrs_np(img_np=output_nn_np)

            # ------------------- Postprocessing -----------
            print('\t Postprocessing prediction....')
            threshold = .5
            output_nn_itk = copyItkImage(imgs_itk[0][0], output_nn_np)
            print(F'\t\t Threshold NN output to {threshold} ....')
            output_nn_itk = binaryThresholdImage(output_nn_itk, threshold)
            if segmentation_type == SegmentationTypes.PROSTATE or segmentation_type == SegmentationTypes.PZ:
                print(F'\t\t Restricting to largest connected component only ....')
                output_nn_itk = getLargestConnectedComponents(output_nn_itk)
            output_nn_np = sitk.GetArrayViewFromImage(output_nn_itk)

            if compute_original_resolution:
                print('\t Recovering original resolution...')
                print('\t\t Reading original resolution images....')
                img_names = [config[ClassificationParams.resampled_resolution_image_name],
                             config[ClassificationParams.original_resolution_image_name]]
                ctr_name = config[ClassificationParams.original_resolution_ctr_name]
                imgs_itk_original_temp, ctrs_itk_original_temp, _, _, _ = read_preproc_imgs_and_ctrs_itk(
                    input_folder, folders_to_read=[current_folder],
                    img_names=img_names, ctr_names=[ctr_name])
                gt_ctr_original_itk = ctrs_itk_original_temp[0][0]  # Retrieves the gt ctr at the original resolution
                img_original_resampled_itk = imgs_itk_original_temp[0][0]
                img_original_itk = imgs_itk_original_temp[0][1]
                print('\t\t Resampling to original....')
                output_nn_original_itk = recover_original_resolution(roi_np=output_nn_np,
                                                                     resampled_itk=img_original_resampled_itk,
                                                                     original_itk=img_original_itk,
                                                                     start_positions=start_roi[0],
                                                                     size_roi=size_roi[0])
                output_nn_original_itk = binaryThresholdImage(output_nn_original_itk, threshold)
                if segmentation_type == SegmentationTypes.PROSTATE or segmentation_type == SegmentationTypes.PZ:
                    print(F'\t\t\t Restricting to largest connected component only ....')
                    output_nn_original_itk = getLargestConnectedComponents(output_nn_original_itk)
                output_nn_original_np = sitk.GetArrayViewFromImage(output_nn_original_itk)

            if save_segmented_ctrs:
                print('\t Saving Prediction...')
                create_folder(join(output_folder, current_folder))
                # TODO at some point we will need to see if we can output more than one ctr
                sitk.WriteImage(output_nn_itk,
                                join(output_folder, current_folder, output_ctr_file_names[0]))
                if compute_original_resolution:
                    sitk.WriteImage(output_nn_original_itk,
                                    join(output_folder, current_folder,
                                         F'{ORIGINAL_TXT}_{output_ctr_file_names[0]}'))

            if compute_metrics:
                # Compute metrics
                print('\t Computing metrics....')
                for c_metric in metrics_params:  # Here we can add more metrics
                    if c_metric == ClassificationMetrics.DSC_3D:
                        metric_value = numpy_dice(output_nn_np, ctrs_np[0])
                        data.loc[current_folder][c_metric.value] = metric_value
                        print(F'\t\t ----- DSC: {metric_value:.3f} -----')
                        if compute_original_resolution:
                            metric_value = numpy_dice(output_nn_original_np,
                                                      sitk.GetArrayViewFromImage(gt_ctr_original_itk))
                            data.loc[current_folder][F'{ORIGINAL_TXT}_{c_metric.value}'] = metric_value
                            print(F'\t\t ----- DSC: {metric_value:.3f} -----')
                # Saving the results every 10 steps
                if id_folder % 10 == 0:
                    save_metrics_images(data, metric_names=list(metrics_dict.values()), viz_obj=viz_obj)
                    data.to_csv(join(output_folder, output_file_name))

            if save_imgs:
                print('\t Plotting images....')
                plot_intermediate_results(current_folder, data_columns, imgs_itk=imgs_itk[0],
                                          gt_ctr_itk=ctrs_itk[0][0], nn_ctr_itk=output_nn_itk,
                                          data=data, viz_obj=viz_obj, slices=save_imgs_slices,
                                          compute_metrics=compute_metrics)
                if compute_original_resolution:
                    plot_intermediate_results(current_folder, data_columns, imgs_itk=[img_original_itk],
                                              gt_ctr_itk=gt_ctr_original_itk,
                                              nn_ctr_itk=output_nn_original_itk,
                                              data=data, viz_obj=viz_obj, slices=save_imgs_slices,
                                              compute_metrics=compute_metrics, prefix_name=ORIGINAL_TXT)
        except Exception as e:
            print("---------------------------- Failed {} error: {} ----------------".format(current_folder, e))
        print(F'\t Done! Elapsed time {time.time()-t0:0.2f} sec')

    if compute_metrics:
        save_metrics_images(data, metric_names=list(metrics_dict.values()), viz_obj=viz_obj)
        data.to_csv(join(output_folder, output_file_name))
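# Hedged sketch: `numpy_dice` (used above for the DSC metric) is defined
# elsewhere. A common implementation of the Dice similarity coefficient,
# 2*|A∩B| / (|A| + |B|), over binary numpy masks is (assumed, not the repo's
# exact code):
def numpy_dice_sketch(pred, target, smooth=1e-7):
    pred = (np.asarray(pred) > 0).astype(np.float32)
    target = (np.asarray(target) > 0).astype(np.float32)
    intersection = np.sum(pred * target)
    return (2.0 * intersection + smooth) / (np.sum(pred) + np.sum(target) + smooth)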
def main():
    config = getTrainingParams()
    # =============== Read data and merge meteorological variables ===============
    print("Reading data")
    pollutant = config[LocalTrainingParams.pollutant]
    input_folder = config[TrainingParams.input_folder]
    output_folder = config[TrainingParams.output_folder]
    val_perc = config[TrainingParams.validation_percentage]
    test_perc = config[TrainingParams.test_percentage]
    eval_metrics = config[TrainingParams.evaluation_metrics]
    loss_func = config[TrainingParams.loss_function]
    batch_size = config[TrainingParams.batch_size]
    epochs = config[TrainingParams.epochs]
    model_name_user = config[TrainingParams.config_name]
    optimizer = config[TrainingParams.optimizer]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]
    years = config[LocalTrainingParams.years]
    debugging = config[LocalTrainingParams.debug]
    filter_stations = config[LocalTrainingParams.stations]
    filter_dates = config[LocalTrainingParams.filter_dates]

    split_info_folder = join(output_folder, 'Splits')
    parameters_folder = join(output_folder, 'Parameters')
    weights_folder = join(output_folder, 'models')
    logs_folder = join(output_folder, 'logs')
    create_folder(split_info_folder)
    create_folder(parameters_folder)
    create_folder(weights_folder)
    create_folder(logs_folder)

    data = None
    for year in years:
        print(F"============ Reading data for {year}: {pollutant} -- AllStations ==========================")
        if debugging:
            db_file_name = join(input_folder, F"{year}_{pollutant}_AllStationsDebug.csv")
        else:
            db_file_name = join(input_folder, F"{year}_{pollutant}_AllStations.csv")
        temp = pd.read_csv(db_file_name, index_col=0, parse_dates=True)
        if data is None:
            all_data_cols = temp.columns
            date_columns = [x for x in all_data_cols
                            if (x.find('week') != -1) or (x.find('hour') != -1) or (x.find('year') != -1)]
            stations_columns = [x for x in all_data_cols
                                if (x.find('h') == -1) and (x not in date_columns)]
            meteo_columns = [x for x in all_data_cols
                             if (x.find('h') != -1) and (x not in date_columns) and (x not in stations_columns)]
            desired_columns = meteo_columns + filter_stations + date_columns
            data = temp[desired_columns]
        else:
            # DataFrame.append was removed in recent pandas versions; concat is equivalent here
            data = pd.concat([data, temp[desired_columns]])

    print("Appending date hot vector...")
    date_hv = generate_date_hot_vector(data.index)
    data = pd.concat([data, date_hv], axis=1)
    print("Done!")

    # ********** Restricting only data between the hours of 9 to 20  TODO hardcoded *****
    if filter_dates:
        filtered_data = data.between_time("9:00", "20:00")
    else:
        filtered_data = data
    datetimes_str = filtered_data.index.values

    data_norm_df_final, accepted_times_idx, y_times_idx, stations_columns, meteo_columns = \
        normalizeAndFilterData(filtered_data, datetimes_str, forecasted_hours,
                               output_folder=parameters_folder, run_name=model_name_user,
                               read_from_file=False)

    X_df = data_norm_df_final.loc[datetimes_str[accepted_times_idx]]
    Y_df = data_norm_df_final.loc[datetimes_str[y_times_idx]][stations_columns]

    # ********* Filling nan values in the stations with the mean values of all the 'available' stations ********
    # for cur_station in stations_columns:
    #     X_df[cur_station] = X_df[cur_station].fillna(X_df['MEAN'])
    #     Y_df[cur_station] = Y_df[cur_station].fillna(data_norm_df_final.loc[datetimes_str[y_times_idx]]['MEAN'])
    # X = data_norm_df_final.loc[datetimes_str[accepted_times_idx]].values
    # X_df = X_df.drop(columns=['MEAN'])

    X_df = X_df.drop(columns=stations_columns)
    X = X_df.values
    # Y = data_norm_df_final.loc[datetimes_str[y_times_idx]][stations_columns].values
    Y = Y_df.values

    # ****** Bootstrap everything above 60 ppb (0.24 in normalized units)  TODO hardcoded ******
    idx_by_col = Y_df > 0.24
    idx_above = idx_by_col.any(axis=1)
    # bootstrap_size = 5  # How many times are we repeating the 'high' values
    # for i in range(bootstrap_size):
    Y = np.append(Y, Y[idx_above, :], axis=0)
    X = np.append(X, X[idx_above, :], axis=0)

    config[ModelParams.INPUT_SIZE] = len(X_df.columns)
    print(F'Data shape: {filtered_data.shape} Data axes {filtered_data.axes}')
    print(F'X shape: {X.shape} Y shape: {Y.shape}')

    tot_examples = X.shape[0]
    rows_to_read = np.arange(tot_examples)

    # ================ Split definition =================
    [train_ids, val_ids, test_ids] = utilsNN.split_train_validation_and_test(tot_examples,
                                                                             val_percentage=val_perc,
                                                                             test_percentage=test_perc)
    print("Train examples (total:{}) :{}".format(len(train_ids), rows_to_read[train_ids]))
    print("Validation examples (total:{}) :{}:".format(len(val_ids), rows_to_read[val_ids]))
    print("Test examples (total:{}) :{}".format(len(test_ids), rows_to_read[test_ids]))

    print("Selecting and generating the model....")
    now = datetime.utcnow().strftime("%Y_%m_%d_%H_%M")
    model_name = F'{model_name_user}_{now}_{pollutant}_AllStations'

    # ******************* Selecting the model **********************
    config[ModelParams.NUMBER_OF_OUTPUT_CLASSES] = Y.shape[1]
    print(F"Number of output variables {Y.shape[1]}")
    model = select_1d_model(config)
    plot_model(model, to_file=join(output_folder, F'{model_name}.png'), show_shapes=True)

    file_name_splits = join(split_info_folder, F'{model_name}.csv')
    utilsNN.save_splits(file_name_splits, train_ids, val_ids, test_ids)

    print("Getting callbacks ...")
    [logger, save_callback, stop_callback] = utilsNN.get_all_callbacks(
        model_name=model_name,
        early_stopping_func=F'val_{eval_metrics[0].__name__}',
        weights_folder=weights_folder,
        logs_folder=logs_folder)

    print("Compiling model ...")
    model.compile(loss=loss_func, optimizer=optimizer, metrics=eval_metrics)

    print("Training ...")
    # This part should be somehow separated, it will change for every project
    x_train = X[train_ids, :]
    y_train = Y[train_ids, :]
    x_val = X[val_ids, :]
    y_val = Y[val_ids, :]
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_val, y_val),
              shuffle=True,
              callbacks=[logger, save_callback, stop_callback])
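# Hedged sketch: `generate_date_hot_vector` is defined elsewhere. One plausible
# implementation one-hot encodes hour of day and day of week from the
# DatetimeIndex, matching the 'hour'/'week' column-name filters used above
# ('year' columns would be handled analogously). Assumed, not the repo's code:
def generate_date_hot_vector_sketch(index):
    hours = pd.get_dummies(index.hour, prefix='hour').set_index(index)
    weekdays = pd.get_dummies(index.dayofweek, prefix='week').set_index(index)
    return pd.concat([hours, weekdays], axis=1)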
def trainModel(config, cur_pollutant, cur_station):
    """Trains a model for a single pollutant/station pair. Kept as a separate
    function so that tf 'cleans' the memory between runs."""
    input_folder = config[TrainingParams.input_folder]
    output_folder = config[TrainingParams.output_folder]
    val_perc = config[TrainingParams.validation_percentage]
    test_perc = config[TrainingParams.test_percentage]
    eval_metrics = config[TrainingParams.evaluation_metrics]
    loss_func = config[TrainingParams.loss_function]
    batch_size = config[TrainingParams.batch_size]
    epochs = config[TrainingParams.epochs]
    model_name_user = config[TrainingParams.config_name]
    optimizer = config[TrainingParams.optimizer]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]

    split_info_folder = join(output_folder, 'Splits')
    parameters_folder = join(output_folder, 'Parameters')
    weights_folder = join(output_folder, 'models')
    logs_folder = join(output_folder, 'logs')
    imgs_folder = join(output_folder, 'imgs')
    create_folder(split_info_folder)
    create_folder(parameters_folder)
    create_folder(weights_folder)
    create_folder(logs_folder)

    viz_obj = EOAImageVisualizer(output_folder=imgs_folder, disp_images=False)

    print(F"============ Reading data for: {cur_pollutant} -- {cur_station} ==========================")
    db_file_name = join(input_folder, constants.merge_output_folder.value,
                        F"{cur_pollutant}_{cur_station}.csv")
    data = pd.read_csv(db_file_name, index_col=0)
    config[ModelParams.INPUT_SIZE] = len(data.columns)
    print(F'Data shape: {data.shape} Data axes {data.axes}')
    print("Done!")

    # Predicting for the next value after 24hrs (only one)
    print("Normalizing data....")
    datetimes_str = data.index.values
    datetimes = np.array([datetime.strptime(x, constants.datetime_format.value) for x in datetimes_str])

    scaler = preprocessing.MinMaxScaler()
    scaler = scaler.fit(data)
    data_norm_np = scaler.transform(data)
    data_norm_df = DataFrame(data_norm_np, columns=data.columns, index=data.index)
    print(F'Done!')

    # Filtering only dates where there is data "forecasted hours after" (24 hrs after)
    print(F"\tBuilding X and Y ....")
    accepted_times_idx = []
    y_times_idx = []
    for i, c_datetime in enumerate(datetimes):
        forecasted_datetime = (c_datetime + timedelta(hours=forecasted_hours))
        if forecasted_datetime in datetimes:
            accepted_times_idx.append(i)
            y_times_idx.append(np.argwhere(forecasted_datetime == datetimes)[0][0])

    X_df = data_norm_df.loc[datetimes_str[accepted_times_idx]]
    Y_df = data_norm_df.loc[datetimes_str[y_times_idx]][cur_pollutant]
    X = X_df.values
    Y = Y_df.values
    print(F'X shape: {X.shape} Y shape: {Y.shape}')

    tot_examples = X.shape[0]
    rows_to_read = np.arange(tot_examples)

    # ================ Split definition =================
    [train_ids, val_ids, test_ids] = utilsNN.split_train_validation_and_test(tot_examples,
                                                                             val_percentage=val_perc,
                                                                             test_percentage=test_perc)
    print("Train examples (total:{}) :{}".format(len(train_ids), rows_to_read[train_ids]))
    print("Validation examples (total:{}) :{}:".format(len(val_ids), rows_to_read[val_ids]))
    print("Test examples (total:{}) :{}".format(len(test_ids), rows_to_read[test_ids]))

    print("Selecting and generating the model....")
    now = datetime.utcnow().strftime("%Y_%m_%d_%H_%M")
    model_name = F'{model_name_user}_{now}_{cur_pollutant}_{cur_station}'

    # ******************* Selecting the model **********************
    model = select_1d_model(config)
    plot_model(model, to_file=join(output_folder, F'{model_name}.png'), show_shapes=True)

    print("Saving split information...")
    file_name_splits = join(split_info_folder, F'{model_name}.csv')
    info_splits = DataFrame({F'Train({len(train_ids)})': train_ids})
    # .loc avoids the chained-assignment pitfall of info_splits[col][0:n] = ...
    info_splits[F'Validation({len(val_ids)})'] = 0
    info_splits.loc[0:len(val_ids) - 1, F'Validation({len(val_ids)})'] = val_ids
    info_splits[F'Test({len(test_ids)})'] = 0
    info_splits.loc[0:len(test_ids) - 1, F'Test({len(test_ids)})'] = test_ids
    info_splits.to_csv(file_name_splits, index=None)

    print(F"Norm params: {scaler.get_params()}")
    file_name_normparams = join(parameters_folder, F'{model_name}.txt')
    utilsNN.save_norm_params(file_name_normparams, NormParams.min_max, scaler)

    print("Getting callbacks ...")
    [logger, save_callback, stop_callback] = utilsNN.get_all_callbacks(
        model_name=model_name,
        early_stopping_func=F'val_{eval_metrics[0].__name__}',
        weights_folder=weights_folder,
        logs_folder=logs_folder)

    print("Compiling model ...")
    model.compile(loss=loss_func, optimizer=optimizer, metrics=eval_metrics)

    print("Training ...")
    # This part should be somehow separated, it will change for every project
    x_train = X[train_ids, :]
    y_train = Y[train_ids]
    x_val = X[val_ids, :]
    y_val = Y[val_ids]
    x_test = X[test_ids, :]
    y_test = Y[test_ids]

    # Plotting some intermediate results
    import matplotlib.pyplot as plt
    size = 24 * 60  # Two months of hourly data
    start = np.random.randint(0, len(data) - size)
    end = start + size
    plt.figure(figsize=[64, 8])
    x_plot = range(len(X_df.iloc[start:end].index.values))
    y_plot = X_df.iloc[start:end][cur_pollutant].values
    yy_plot = Y_df.iloc[start:end].values
    viz_obj.plot_1d_data_np(x_plot, [y_plot, yy_plot],
                            title=F"{cur_pollutant}_{cur_station}",
                            labels=['Current', 'Desired'],
                            wide_ratio=4,
                            file_name_prefix=F"{cur_pollutant}_{cur_station}")

    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_val, y_val),
              shuffle=True,
              callbacks=[logger, save_callback, stop_callback])
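# Hedged sketch: `utilsNN.get_all_callbacks` is defined elsewhere. With Keras it
# is typically assembled from the three standard callbacks below; the file-name
# pattern and patience value are assumptions, not the repo's exact settings.
from os.path import join
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

def get_all_callbacks_sketch(model_name, early_stopping_func, weights_folder, logs_folder):
    logger = TensorBoard(log_dir=join(logs_folder, model_name))
    save_callback = ModelCheckpoint(join(weights_folder, model_name + '-{epoch:02d}.hdf5'),
                                    monitor=early_stopping_func, save_best_only=True)
    stop_callback = EarlyStopping(monitor=early_stopping_func, patience=10)
    return [logger, save_callback, stop_callback]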
def organize_folders(self):
    '''
    This is the main function that organizes the folders.
    :return:
    '''
    LUT = {}
    original_date_folders = os.listdir(self._src_folder)
    original_date_folders.sort()
    last_idx = self._start_idx
    prev_file_found = False
    create_folder(self._dst_folder)
    if self._search_previous_names:
        LUT, prev_csv_file, last_idx, prev_file_found = self.read_prev_lut_file()

    # Iterate over all the source folders (dates of MIM)
    for c_date_folder in original_date_folders:
        print('******************************* {} *************************************'.format(c_date_folder))
        all_patients_in_date = os.listdir(join(self._src_folder, c_date_folder))
        only_date = c_date_folder.replace('__Studies', '')
        # Iterate over the folders we are searching for (a list for each image type)
        for idx_kf, keep_folder in enumerate(self._orig_folder_names):
            isNotDCE = self._new_folder_names[idx_kf] != 'DCE'
            # Iterate the hierarchy of this folder (when we are looking for multiple folders)
            for c_keep_folder in keep_folder:
                # Search the folders that match by hierarchy
                matched_folders = [x for x in all_patients_in_date if not (re.search(c_keep_folder, x) is None)]
                # Iterate over matched folders
                for c_folder in matched_folders:
                    pid = '{}_{}'.format(c_folder[0:self._replace_chars], only_date)  # Get patient id
                    if pid in LUT:  # Verify we haven't 'used' this patient
                        cidx = LUT[pid]['id']
                        if self._keep_original_names:
                            curr_dst_folder = c_folder[0:self._replace_chars]
                        else:
                            curr_dst_folder = join('%s-%04d' % (self._prefix_name, cidx))
                        # Patch for DCE
                        if isNotDCE:
                            # Assure this folder is NOT already there
                            if os.path.exists(join(self._dst_folder, curr_dst_folder)):
                                check_folders = os.listdir(join(self._dst_folder, curr_dst_folder))
                                if np.any([x.find(self._new_folder_names[idx_kf]) != -1 for x in check_folders]):
                                    continue  # In this case we matched a LOWER level folder (in the hierarchy)
                    else:  # pid is not in LUT
                        if self._keep_original_names:
                            curr_dst_folder = c_folder[0:self._replace_chars]
                        else:
                            curr_dst_folder = join('%s-%04d' % (self._prefix_name, last_idx))
                        LUT[pid] = {'Folder': join('%s-%04d' % (self._prefix_name, last_idx)), 'id': last_idx}
                        last_idx += 1

                    curr_patient = c_folder[self._replace_chars:]
                    # Take into account only the folders in 'self._orig_folder_names'
                    src = join(self._src_folder, c_date_folder, c_folder)
                    dst = join(self._dst_folder, curr_dst_folder,
                               '{}_{}'.format(self._new_folder_names[idx_kf], curr_patient))
                    if (len(dst) > 150) and (system() == 'Windows'):
                        curr_patient = self.make_windows_path(dst, curr_patient)
                        dst = join(self._dst_folder, curr_dst_folder,
                                   '{}_{}'.format(self._new_folder_names[idx_kf], curr_patient))
                    print(F" -------------- \n {src} \n {dst}")
                    if os.path.exists(dst):
                        shutil.rmtree(dst)
                    shutil.copytree(src, dst)

    # Remove the previous LUT file and save the new one
    if prev_file_found:
        # os.remove(prev_csv_file)
        _lut_file_name = join(self._dst_folder,
                              '{}_from_{}_to_{}.csv'.format(self._lut_file_name, self._start_idx, last_idx - 1))
    self.saveLUT(LUT)
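# Hedged sketch: `make_windows_path` shortens the patient part of the
# destination path so the full path stays under the 150-character limit checked
# above (Windows has historic path-length restrictions). The repo's exact logic
# is unknown; this only illustrates the idea:
def make_windows_path_sketch(dst, curr_patient):
    excess = len(dst) - 150
    return curr_patient[:-excess] if excess > 0 else curr_patient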
def main():
    config = get_makeprediction_config()
    # *********** Reads the parameters ***********
    input_file = config[ClassificationParams.input_file]
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    output_file_name = config[ClassificationParams.output_file_name]
    model_weights_file = config[ClassificationParams.model_weights_file]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]
    pollutant = config[LocalTrainingParams.pollutant]

    # ********** Reading and preprocessing data *******
    _all_stations = ["ACO", "AJM", "AJU", "ARA", "ATI", "AZC", "BJU", "CAM", "CCA", "CES", "CFE",
                     "CHO", "COR", "COY", "CUA", "CUI", "CUT", "DIC", "EAJ", "EDL", "FAC", "FAN",
                     "GAM", "HAN", "HGM", "IBM", "IMP", "INN", "IZT", "LAA", "LAG", "LLA", "LOM",
                     "LPR", "LVI", "MCM", "MER", "MGH", "MIN", "MON", "MPA", "NET", "NEZ", "PED",
                     "PER", "PLA", "POT", "SAG", "SFE", "SHA", "SJA", "SNT", "SUR", "TAC", "TAH",
                     "TAX", "TEC", "TLA", "TLI", "TPN", "UAX", "UIZ", "UNM", "VAL", "VIF", "XAL", "XCH"]

    # Iterate over the stations
    models_folder = '/data/UNAM/Air_Pollution_Forecast/Data/Training/models'
    data_folder = '/data/UNAM/Air_Pollution_Forecast/Data/MergedDataCSV'
    for c_station in _all_stations:
        try:
            # Selects the proper model file for the current station
            model_weights_file = [join(models_folder, x) for x in listdir(models_folder) if x.find(c_station) != -1]
            input_file = [join(data_folder, x) for x in listdir(data_folder) if x.find(c_station) != -1]
            assert len(model_weights_file) > 0
            assert len(input_file) > 0
            print(F"Working with: {model_weights_file} and {input_file}")
            model_weights_file = model_weights_file[0]
            input_file = input_file[0]

            data = pd.read_csv(input_file, index_col=0)
            config[ModelParams.INPUT_SIZE] = len(data.columns)
            print(F'Data shape: {data.shape} Data axes {data.axes}')
            print("Done!")

            # Predicting for the next value after 24hrs (only one)
            print("Normalizing data....")
            datetimes_str = data.index.values
            datetimes = np.array([datetime.strptime(x, constants.datetime_format.value) for x in datetimes_str])

            scaler = preprocessing.MinMaxScaler()
            scaler = scaler.fit(data)
            data_norm_np = scaler.transform(data)
            data_norm_df = DataFrame(data_norm_np, columns=data.columns, index=data.index)
            print(F'Done!')

            # Filtering only dates where there is data "forecasted hours after" (24 hrs after)
            print(F"\tBuilding X and Y ....")
            accepted_times_idx = []
            y_times_idx = []
            for i, c_datetime in enumerate(datetimes):
                forecasted_datetime = (c_datetime + timedelta(hours=forecasted_hours))
                if forecasted_datetime in datetimes:
                    accepted_times_idx.append(i)
                    y_times_idx.append(np.argwhere(forecasted_datetime == datetimes)[0][0])

            X_df = data_norm_df.loc[datetimes_str[accepted_times_idx]]
            Y_df = data_norm_df.loc[datetimes_str[y_times_idx]][pollutant]
            X = X_df.values
            Y = Y_df.values
            print(F'X shape: {X.shape} Y shape: {Y.shape}')

            # *********** Chooses the proper model ***********
            print('Reading model ....')
            model = select_1d_model(config)

            # *********** Reads the weights ***********
            print('Reading weights ....')
            model.load_weights(model_weights_file)

            create_folder(output_folder)
            create_folder(output_imgs_folder)

            # *********** Makes a dataframe to contain the DSC information **********
            metrics_params = config[ClassificationParams.metrics]
            metrics_dict = {met.name: met.value for met in metrics_params}

            # -------------------- Making prediction -------------
            t0 = time.time()
            output_nn_all = model.predict(X, verbose=1)

            # Plotting some intermediate results
            import matplotlib.pyplot as plt
            size = 24 * 60  # Two months of hourly data
            start = np.random.randint(0, len(data) - size)
            end = start + size
            plt.figure(figsize=[64, 8])
            x_plot = range(len(Y))
            y_plot = Y
            yy_plot = Y_df.iloc[start:end].values  # currently unused in the plot below
            viz_obj = EOAImageVisualizer(output_folder=output_imgs_folder, disp_images=False)
            plot_this_many = 24 * 60
            viz_obj.plot_1d_data_np(x_plot[0:plot_this_many],
                                    [y_plot[0:plot_this_many], output_nn_all[0:plot_this_many, 0]],
                                    title=F"{c_station} {pollutant}",
                                    labels=['Original', 'Forecasted'],
                                    wide_ratio=4,
                                    file_name_prefix=F"{pollutant}_{c_station}")
            print(F'\t Done! Elapsed time {time.time() - t0:0.2f} sec')
        except Exception as e:
            print(F"---------------------------- Failed {c_station} error: {e} ----------------")
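# Note (hedged): the curves plotted above are in MinMaxScaler units. If raw
# concentrations are needed, the fitted sklearn scaler can invert the transform:
#
#     data_original_np = scaler.inverse_transform(data_norm_np)
#
# This assumes the array has the same column layout the scaler was fitted on.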
def main():
    config = get_segmentation_2d_config()
    cases = config[ClassificationParams.cases]
    save_segmented_ctrs = config[ClassificationParams.save_segmented_ctrs]
    input_folder = config[ClassificationParams.input_folder]
    input_img_names = config[ClassificationParams.input_img_file_names]
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    output_file_name = config[ClassificationParams.output_file_name]
    model_weights_file = config[ClassificationParams.model_weights_file]
    save_imgs = config[ClassificationParams.save_imgs]

    # Builds the visualization object
    viz_obj = MedicalImageVisualizer(disp_images=config[ClassificationParams.show_imgs],
                                     output_folder=output_imgs_folder)
    output_ctr_file_names = config[ClassificationParams.output_ctr_file_names]

    # *********** Chooses the proper model ***********
    print('Reading model ....')
    model = select_2d_model(config)

    # *********** Reads the weights ***********
    print('Reading weights ....')
    model.load_weights(model_weights_file)

    examples = select_cases_from_folder(input_folder, cases)
    create_folder(output_imgs_folder)

    # *********** Makes a dataframe to contain the DSC information **********
    metrics_params = config[ClassificationParams.metrics]
    metrics_dict = {met.name: met.value for met in metrics_params}

    # Check if the output file already exists; in that case read the df from it.
    if os.path.exists(join(output_imgs_folder, output_file_name)):
        data = pd.read_csv(join(output_imgs_folder, output_file_name), index_col=0)
        data_columns = list(data.columns)  # Keep the column names available for the plots below
    else:
        data_columns = list(metrics_dict.values())
        data = DataFrame(index=examples, columns=data_columns)

    # *********** Iterates over each case *********
    for id_folder, current_folder in enumerate(examples):
        print(F'******* Computing folder {current_folder} ************')
        t0 = time.time()
        try:
            # -------------------- Reading data -------------
            print('\t Reading data....')
            # All these names are predefined; any other 2d segmentation will need a different configuration
            all_imgs, all_ctrs, _, _ = read_preproc_imgs_and_ctrs_png(input_folder,
                                                                      folders_to_read=[current_folder],
                                                                      img_names=input_img_names,
                                                                      ctr_names=output_ctr_file_names)
            imgs_np = all_imgs[0]
            ctrs_lungs_np = all_ctrs[0][0].copy()   # VERIFY THE ORDER IS THE SAME IN THE CONFIG FILE
            ctrs_lesion_np = all_ctrs[0][1].copy()  # VERIFY THE ORDER IS THE SAME IN THE CONFIG FILE
            # If we want to visualize the input images
            # viz_obj.plot_imgs_and_ctrs_itk(img_np[0], ctrs_itk=ctrs_itk[0])

            # ------------------- Making prediction -----------
            print('\t Making prediction....')
            input_array = format_for_nn_classification(imgs_np)
            output_nn_all = model.predict(input_array, verbose=1)
            output_nn_np = output_nn_all[0, :, :, 0]
            output_nn_np[ctrs_lungs_np == 0] = 0  # Making the prediction 0 outside the lungs
            # For visualizing the output of the network
            # viz_obj.plot_img_and_ctrs_np_2d(output_nn_np, np_ctrs=[], file_name_prefix=id_folder)

            # ------------------- Postprocessing -----------
            print('\t Postprocessing prediction....')
            threshold = .5
            print(F'\t\t Threshold NN output to {threshold} ....')
            output_nn_np[output_nn_np <= threshold] = 0  # Values at or below the threshold become background
            output_nn_np[output_nn_np > threshold] = 1   # Values above the threshold become foreground

            if save_segmented_ctrs:
                print('\t Saving Prediction...')
                create_folder(join(output_folder, current_folder))
                cv2.imwrite(join(output_folder, current_folder, output_ctr_file_names[0]),
                            cv2.convertScaleAbs(output_nn_np, alpha=255.0))

            # Compute metrics
            print('\t Computing metrics....')
            for c_metric in metrics_params:  # Here we can add more metrics
                if c_metric == ClassificationMetrics.DSC_2D:
                    metric_value = numpy_dice(output_nn_np, ctrs_lesion_np)
                    data.loc[current_folder][c_metric.value] = metric_value
                    print(F'\t\t ----- DSC: {metric_value:.3f} -----')

            # Saving the results every 10 steps
            if id_folder % 10 == 0:
                save_metrics_images(data, metric_names=list(metrics_dict.values()), viz_obj=viz_obj)
                data.to_csv(join(output_folder, output_file_name))

            if save_imgs:
                print('\t Plotting images....')
                plot_intermediate_results(current_folder, data_columns, img_np=imgs_np[0],
                                          gt_ctr_np=ctrs_lesion_np, nn_ctr_np=output_nn_np,
                                          data=data, viz_obj=viz_obj)
        except Exception as e:
            print("---------------------------- Failed {} error: {} ----------------".format(current_folder, e))
        print(F'\t Done! Elapsed time {time.time()-t0:0.2f} sec')

    save_metrics_images(data, metric_names=list(metrics_dict.values()), viz_obj=viz_obj)
    data.to_csv(join(output_folder, output_file_name))
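# Hedged sketch: `format_for_nn_classification` is defined elsewhere. For these
# Keras models it presumably stacks the input images into a channels-last batch
# of one, i.e. (1, H, W, n_imgs) here and (1, D, H, W, n_imgs) for the 3D case.
# Assumed shape logic, not the repo's exact code:
def format_for_nn_classification_sketch(imgs_np):
    arr = np.stack(imgs_np, axis=-1)    # (H, W, n_imgs), channels last
    return np.expand_dims(arr, axis=0)  # add the batch dimension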
def normalizeAndFilterData(data, datetimes_orig, forecasted_hours, output_folder='', run_name='', read_from_file=False):
    """
    This function normalizes the data and keeps only the cases where we have the
    appropriate forecasted times. It also obtains the 'y' indexes.
    :param data: All the data
    :param datetimes_orig: An array of datetimes which correspond to the index
    :param forecasted_hours: an integer representing the number of hours in advance we want to predict
    :param output_folder: where to save/read the normalization parameters
    :param run_name: prefix for the normalization parameter files
    :param read_from_file: if True, read previously saved normalization values instead of computing them
    :return:
    """
    # Predicting for the next value after 24hrs (only one)
    print("Normalizing data....")
    datetimes = np.array(datetimes_orig)

    all_data_cols = data.columns.values
    date_columns = [x for x in all_data_cols
                    if (x.find('week') != -1) or (x.find('hour') != -1) or (x.find('year') != -1)]
    stations_columns = [x for x in all_data_cols
                        if (x.find('h') == -1) and (x not in date_columns)]
    meteo_columns = [x for x in all_data_cols
                     if (x.find('h') != -1) and (x not in date_columns) and (x not in stations_columns)]

    # Normalizing meteorological variables
    # In this case we obtain the normalization values directly from the data
    # meteo_names = ['U10', 'V10', 'RAINC', 'T2', 'RAINNC', 'PBLH', 'SWDOWN', 'GLW']
    meteo_names = ['U10', 'V10', 'RAINC', 'T2', 'RAINNC', 'SWDOWN', 'GLW']
    if not read_from_file:
        min_data = {}
        max_data = {}
        for cur_meteo in meteo_names:
            cur_meteo_cols = [x for x in meteo_columns if x.find(cur_meteo) != -1]
            min_data[cur_meteo] = data[cur_meteo_cols].min().min()
            max_data[cur_meteo] = data[cur_meteo_cols].max().max()
        # ********* Saving normalization values for each variable ******
        create_folder(output_folder)
        pd.DataFrame(min_data, index=[1]).to_csv(join(output_folder, F'{run_name}_min_values.csv'))
        pd.DataFrame(max_data, index=[1]).to_csv(join(output_folder, F'{run_name}_max_values.csv'))
    else:
        # In this case we obtain the normalization values from the provided file
        min_data = pd.read_csv(join(output_folder, F'{run_name}_min_values.csv'), index_col=0)
        max_data = pd.read_csv(join(output_folder, F'{run_name}_max_values.csv'), index_col=0)

    data_norm_df = data.copy()
    # Normalizing the meteorological variables
    for cur_meteo in meteo_names:
        cur_meteo_cols = [x for x in meteo_columns if x.find(cur_meteo) != -1]
        # The data structure is a little bit different when reading from the file
        if not read_from_file:
            min_val = min_data[cur_meteo]
            max_val = max_data[cur_meteo]
        else:
            min_val = min_data[cur_meteo].values[0]
            max_val = max_data[cur_meteo].values[0]
        data_norm_df[cur_meteo_cols] = (data_norm_df[cur_meteo_cols] - min_val) / (max_val - min_val)

    # Normalizing the pollution variables
    data_norm_df[stations_columns] = (data_norm_df[stations_columns] - _min_value_ozone) / (_max_value_ozone - _min_value_ozone)
    print(F'Done!')

    # Filtering only dates where there is data "forecasted hours after" (24 hrs after)
    print(F"Building X and Y ....")
    accepted_times_idx = []
    y_times_idx = []
    for i, c_datetime in enumerate(datetimes):
        forecasted_datetime = c_datetime + np.timedelta64(forecasted_hours, 'h')
        if forecasted_datetime in datetimes:
            accepted_times_idx.append(i)
            y_times_idx.append(np.argwhere(forecasted_datetime == datetimes)[0][0])

    # ****************** Replacing nan columns with the mean value of all the other columns ****************
    mean_values = data_norm_df[stations_columns].mean(axis=1)

    # Replace nan values with -1 and add an additional MEAN column
    print(F"Filling nan values....")
    data_norm_df_final = data_norm_df.copy()
    for cur_station in stations_columns:
        data_norm_df_final[cur_station] = data_norm_df[cur_station].fillna(-1)
    data_norm_df_final['MEAN'] = mean_values

    # print(F"Norm params: {scaler.get_params()}")
    # file_name_normparams = join(parameters_folder, F'{model_name}.txt')
    # utilsNN.save_norm_params(file_name_normparams, NormParams.min_max, scaler)
    print("Done!")
    return data_norm_df_final, accepted_times_idx, y_times_idx, stations_columns, meteo_columns
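# Worked example of the min-max rule used above (illustrative numbers only):
# a T2 reading of 6.0 with min_val = 2.0 and max_val = 10.0 maps to
# (6.0 - 2.0) / (10.0 - 2.0) = 0.5. The station (ozone) columns use the global
# _min_value_ozone/_max_value_ozone constants instead of per-variable bounds,
# so every station shares one scale and a single inverse recovers raw units.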