def process_outlier_and_stack(interim_path, file_name, phase_str, processed_path):
    data_nc = load_pkl(interim_path, file_name)
    # Outlier processing and normalization, per variable
    for v in obs_var:
        data_nc['input_obs'][v] = process_outlier_and_normalize(
            data_nc['input_obs'][v], obs_range_dic[v])
        data_nc['ground_truth'][v] = process_outlier_and_normalize(
            data_nc['ground_truth'][v], obs_range_dic[v])
    for v in ruitu_var:
        data_nc['input_ruitu'][v] = process_outlier_and_normalize(
            data_nc['input_ruitu'][v], ruitu_range_dic[v])

    # Stack the per-variable arrays into one tensor, features last
    stacked_data = [data_nc['input_obs'][v] for v in obs_var]
    stacked_input_obs = np.stack(stacked_data, axis=-1)
    stacked_data = [data_nc['input_ruitu'][v] for v in ruitu_var]
    stacked_input_ruitu = np.stack(stacked_data, axis=-1)
    stacked_data = [data_nc['ground_truth'][v] for v in target_var]
    stacked_ground_truth = np.stack(stacked_data, axis=-1)

    print(stacked_input_obs.shape)  # (sample_ind, timestep, station_id, features)
    print(stacked_input_ruitu.shape)
    print(stacked_ground_truth.shape)

    # Save the normalized, stacked arrays
    data_dic = {
        'input_obs': stacked_input_obs,
        'input_ruitu': stacked_input_ruitu,
        'ground_truth': stacked_ground_truth
    }
    save_pkl(data_dic, processed_path, '{}_norm.dict'.format(phase_str))
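
# A minimal sketch of what `process_outlier_and_normalize` (defined elsewhere
# in this module) presumably does, given the (min, max) range dictionaries it
# is passed: clip outliers to the variable's physical range, then min-max
# scale. The body below is an assumption for illustration, not the repo's
# actual implementation.
def _process_outlier_and_normalize_sketch(arr, value_range):
    lo, hi = value_range
    arr = np.clip(arr, lo, hi)      # out-of-range readings -> range edge
    return (arr - lo) / (hi - lo)   # scale to [0, 1]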
def main(): """ main """ global VERBOSE VECTORIZER = TfidfVectorizer() MODEL = {} apar = argparse.ArgumentParser(description="Learn Incident Classifier") apar.add_argument("-f", "--file", required=True) apar.add_argument("-o", "--out") apar.add_argument("-v", "--verbose", action="store_true") apar.add_argument("-b", "--best", action="store_true") args = apar.parse_args() VERBOSE = args.verbose csv_filename = args.file pkl_filename = args.out if args.out else "model.pkl" data = helper.load_csv(csv_filename) t0 = time() # data[NORMTEXTCOL] = data[TEXTCOL].apply(helper.normalize_str) data = helper.normalize_multiproc(data) print_verbose("normalization done:\t{:0.3f}s".format((time() - t0))) t0 = time() X_learn = VECTORIZER.fit_transform(data[config.NORMTEXTCOL]) print_verbose("fit done\t{:0.3f}s".format((time() - t0))) MODEL[config.VECTORIZERNAME] = VECTORIZER for classifier_name, classifier in CLASSIFIERS: MODEL[classifier_name] = {} print("=" * 40) print(classifier_name) print(classifier) for classcol in config.CLASSCOLS: print("training on\t{:s}".format(classcol)) t0 = time() y_learn = numpy.array(data[classcol]) classifier.fit(X_learn, y_learn) print_verbose("training done\t{:0.3f}s".format((time() - t0))) MODEL[classifier_name][classcol] = copy.copy(classifier) helper.save_pkl(MODEL, pkl_filename)
def train(args):
    # create real images
    reals = create_real_images(args)

    # save real images
    for i, real in enumerate(reals):
        image_path = os.path.join(args.logdir, 'real_%d.png' % i)
        imwrite(denormalize(np.transpose(real, [0, 2, 3, 1])[0]), image_path)

    # nnabla monitor
    monitor = Monitor(args.logdir)
    # use cv2 backend at MonitorImage
    set_backend('cv2')

    prev_models = []
    Zs = []
    noise_amps = []
    for scale_num in range(len(reals)):
        fs = min(args.fs_init * (2 ** (scale_num // 4)), 128)
        min_fs = min(args.min_fs_init * (2 ** (scale_num // 4)), 128)
        model = Model(real=reals[scale_num],
                      num_layer=args.num_layer,
                      fs=fs,
                      min_fs=min_fs,
                      kernel=args.kernel,
                      pad=args.pad,
                      lam_grad=args.lam_grad,
                      alpha_recon=args.alpha_recon,
                      d_lr=args.d_lr,
                      g_lr=args.g_lr,
                      beta1=args.beta1,
                      gamma=args.gamma,
                      lr_milestone=args.lr_milestone,
                      scope=str(scale_num))
        z_curr = train_single_scale(args, scale_num, model, reals,
                                    prev_models, Zs, noise_amps, monitor)
        prev_models.append(model)
        Zs.append(z_curr)
        noise_amps.append(args.noise_amp)

    # save data
    nn.save_parameters(os.path.join(args.logdir, 'models.h5'))
    save_pkl(Zs, os.path.join(args.logdir, 'Zs.pkl'))
    save_pkl(reals, os.path.join(args.logdir, 'reals.pkl'))
    save_pkl(noise_amps, os.path.join(args.logdir, 'noise_amps.pkl'))

    return Zs, reals, noise_amps
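
# Hedged counterpart for restoring a finished run before sampling.
# nn.load_parameters is the standard NNabla pair of nn.save_parameters;
# a load_pkl helper mirroring this repo's save_pkl is an assumption.
def restore(logdir):
    nn.load_parameters(os.path.join(logdir, 'models.h5'))
    Zs = load_pkl(os.path.join(logdir, 'Zs.pkl'))
    reals = load_pkl(os.path.join(logdir, 'reals.pkl'))
    noise_amps = load_pkl(os.path.join(logdir, 'noise_amps.pkl'))
    return Zs, reals, noise_amps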
def netCDF_filter_nan(data_file, phase_str, interim_path):
    '''
    phase_str: train, val or test
    '''
    data_dic = {'input_obs': None, 'input_ruitu': None, 'ground_truth': None}
    print('processing...:', data_file)
    ori_data = nc.Dataset(data_file)  # read the netCDF file
    ori_dimensions, ori_variables = ori_data.dimensions, ori_data.variables  # dimensions and variables of the file
    date_index, fortime_index, station_index = 1, 2, 3  # each variable is indexed by date, forecast time and station

    var_obs = []  # var name lists
    var_all = []
    var_ruitu = []
    for v in ori_variables:
        var_all.append(v)
        if v.find("_obs") != -1:
            var_obs.append(v)
        elif v.find('_M') != -1:
            var_ruitu.append(v)

    sta_id = ori_variables['station'][:].data
    print('sta_id:', sta_id)
    hour_index = ori_variables['foretimes'][:].data
    print('hour_index:', hour_index)
    day_index = ori_variables['date'][:].data
    print('day_index:', day_index)
    print(str(list(day_index)[-1]).split('.')[0])

    # build a map from station id to its index
    station_dic = {}
    for i, s in enumerate(sta_id):
        station_dic[s] = i
    print(station_dic)

    NUMS = ori_dimensions['date'].size  # 1188 for train

    # Find days whose t2m field is entirely missing (-9999.)
    input_ruitu_nan_list = []
    input_obs_nan_list = []
    #output_obs_nan_list=[]
    for i in range(NUMS - 1):
        input_ruitu_nan = (ori_variables['t2m_M'][i, :, :].data == -9999.).all()
        input_obs_nan = (ori_variables['t2m_obs'][i, :, :].data == -9999.).all()
        if input_ruitu_nan:
            input_ruitu_nan_list.append(i)
        if input_obs_nan:
            input_obs_nan_list.append(i)

    input_ruitu_nan_list_minus1 = [i - 1 for i in input_ruitu_nan_list]
    print('input_ruitu_nan_list_minus1:', input_ruitu_nan_list_minus1)
    print('input_obs_nan_list:', input_obs_nan_list)

    deleted_obs_days = input_ruitu_nan_list_minus1 + input_obs_nan_list  # bad days
    print('deleted_obs_days:', deleted_obs_days)
    print('deleted_obs_days_nums:', len(deleted_obs_days))

    input_obs_dic = dict.fromkeys(var_obs, None)
    input_ruitu_dic = dict.fromkeys(var_ruitu, None)
    ground_truth_dic = dict.fromkeys(var_obs, None)

    good_obs_days = [i for i in range(NUMS - 1) if i not in deleted_obs_days]
    print('The number of not seriously NaN days:', len(good_obs_days))
    good_groundtruth_days = [i + 1 for i in good_obs_days]

    for v in var_obs:
        input_obs_dic[v] = ori_variables[v][good_obs_days, :, :].data
        ground_truth_dic[v] = ori_variables[v][good_groundtruth_days, :, :].data
    for v in var_ruitu:
        input_ruitu_dic[v] = ori_variables[v][good_groundtruth_days, :, :].data

    for v in var_obs:
        # Fill missing values with the variable mean (computed excluding NaN)
        np.place(input_obs_dic[v], input_obs_dic[v] == -9999., np.nan)
        mean_ = np.nanmean(input_obs_dic[v])
        where_are_NaNs = np.isnan(input_obs_dic[v])
        input_obs_dic[v][where_are_NaNs] = mean_

        np.place(ground_truth_dic[v], ground_truth_dic[v] == -9999., np.nan)
        mean_ = np.nanmean(ground_truth_dic[v])
        where_are_NaNs = np.isnan(ground_truth_dic[v])
        ground_truth_dic[v][where_are_NaNs] = mean_

    data_dic['input_obs'] = input_obs_dic
    data_dic['ground_truth'] = ground_truth_dic

    for v in var_ruitu:
        np.place(input_ruitu_dic[v], input_ruitu_dic[v] == -9999., np.nan)
        mean_ = np.nanmean(input_ruitu_dic[v])
        where_are_NaNs = np.isnan(input_ruitu_dic[v])
        input_ruitu_dic[v][where_are_NaNs] = mean_
    data_dic['input_ruitu'] = input_ruitu_dic

    save_pkl(data_dic, interim_path, '{}_non_NaN.dict'.format(phase_str))
    return '{}_non_NaN.dict'.format(phase_str)
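
# The sentinel -> NaN -> mean-imputation idiom used above, shown in isolation
# on a toy array (standalone; only numpy is required):
_demo = np.array([1.0, -9999., 3.0])
np.place(_demo, _demo == -9999., np.nan)    # replace the -9999. sentinel with NaN
_demo[np.isnan(_demo)] = np.nanmean(_demo)  # fill NaN with the mean of the rest -> 2.0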
                                           transform=val_transform)
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                           batch_size=args.batch_size,
                                           pin_memory=True)
criterion = nn.CrossEntropyLoss().cuda()

model = vgg16(pretrained=True).to(args.device)
show_summary(model)

# save apoz pkl
if not os.path.exists(args.apoz_path):
    apoz = APoZ(model).get_apoz(valid_loader, criterion)
    save_pkl(apoz, args.apoz_path)
else:
    apoz = load_pkl(args.apoz_path)

# info apoz
print("Average Percentage Of Zero Mean")
for n, p in zip(module_name, apoz):
    print(f"{n} : {p.mean() * 100 : .2f}%")

# Masking
mask = []
for i, p in enumerate(apoz[-3:-1]):
    sorted_arg = np.argsort(p)
    mask.append(sorted_arg < select_rate[i])
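
# APoZ ("Average Percentage of Zeros", Hu et al. 2016) is the fraction of zero
# post-ReLU activations a unit emits over the validation set; high-APoZ units
# carry little information and are pruned first. The APoZ class above is this
# repo's implementation; the statistic itself reduces to this toy, standalone
# illustration:
_acts = np.maximum(np.random.randn(1000, 512), 0)  # fake post-ReLU activations
_apoz_per_unit = (_acts == 0).mean(axis=0)         # fraction of zeros per unit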
def netCDF2TheLastDay(data_file, phase_str, interim_filepath, datetime):
    '''
    phase_str: testA, testB or OnlineEveryDay
    '''
    data_dic = {'input_obs': None, 'input_ruitu': None, 'ground_truth': None}
    print('processing...:', data_file)
    ori_data = nc.Dataset(data_file)  # read the netCDF file
    ori_dimensions, ori_variables = ori_data.dimensions, ori_data.variables  # dimensions and variables of the file
    date_index, fortime_index, station_index = 1, 2, 3  # each variable is indexed by date, forecast time and station

    var_obs = []  # var name lists
    var_all = []
    var_ruitu = []
    for v in ori_variables:
        var_all.append(v)
        if v.find("_obs") != -1:
            var_obs.append(v)
        elif v.find('_M') != -1:
            var_ruitu.append(v)

    sta_id = ori_variables['station'][:].data
    print('sta_id:', sta_id)
    hour_index = ori_variables['foretimes'][:].data
    print('hour_index:', hour_index)
    day_index = ori_variables['date'][:].data
    print('day_index:', day_index)
    print(str(list(day_index)[-1]).split('.')[0])

    # build a map from station id to its index
    station_dic = {}
    for i, s in enumerate(sta_id):
        station_dic[s] = i
    print(station_dic)

    NUMS = ori_dimensions['date'].size
    print("The number of days:", NUMS)

    input_obs_dic = dict.fromkeys(var_obs, None)
    input_ruitu_dic = dict.fromkeys(var_ruitu, None)

    for v in var_obs:
        # Take the second-to-last day and keep only the first 28 forecast
        # hours; the last 9 are all NaN
        input_obs_dic[v] = ori_variables[v][-2, :, :].data[:-9]
        if (input_obs_dic[v] == -9999.).any():
            # Fill remaining sentinels by interpolating along each column,
            # then back-/forward-filling the edges
            temp_df = pd.DataFrame(data=input_obs_dic[v])
            temp_df.replace(-9999., np.nan, inplace=True)
            temp_df.interpolate(inplace=True)
            temp_df.bfill(inplace=True)
            temp_df.ffill(inplace=True)
            input_obs_dic[v] = temp_df.values
        assert not (input_obs_dic[v] == -9999.).any(), \
            'Error. -9999 happens in Obs for the predictive day!'
        assert not np.isnan(input_obs_dic[v]).any(), \
            'Error. NaN happens in Obs for the predictive day!'

    for v in var_ruitu:
        input_ruitu_dic[v] = ori_variables[v][-1, :, :].data
        if (input_ruitu_dic[v] == -9999.).any():
            temp_df = pd.DataFrame(data=input_ruitu_dic[v])
            temp_df.replace(-9999., np.nan, inplace=True)
            temp_df.interpolate(inplace=True)
            temp_df.bfill(inplace=True)
            temp_df.ffill(inplace=True)
            input_ruitu_dic[v] = temp_df.values
        assert not (input_ruitu_dic[v] == -9999.).any(), \
            'Error. -9999 happens in Ruitu for the predictive day!'
        assert not np.isnan(input_ruitu_dic[v]).any(), \
            'Error. NaN happens in Ruitu for the predictive day!'

    data_dic['input_obs'] = input_obs_dic
    data_dic['input_ruitu'] = input_ruitu_dic

    save_pkl(data_dic, interim_filepath,
             '{}_one_predict_day_{}.dict'.format(phase_str, datetime))
    return '{}_one_predict_day_{}.dict'.format(phase_str, datetime)
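
# The interpolate/bfill/ffill fallback above, demonstrated on a toy column
# (standalone; only pandas and numpy are required):
_df = pd.DataFrame({'t2m': [-9999., 2.0, -9999., 4.0]})
_df.replace(-9999., np.nan, inplace=True)
_df.interpolate(inplace=True)  # fills the interior gap: NaN, 2.0, 3.0, 4.0
_df.bfill(inplace=True)        # back-fills the leading NaN: 2.0, 2.0, 3.0, 4.0
_df.ffill(inplace=True)        # would fill any trailing NaN (no-op here)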