def process_outlier_and_stack(interim_path, file_name, phase_str,
                              processed_path):
    data_nc = load_pkl(interim_path, file_name)
    # Clip outliers and normalize each variable to its valid range
    for v in obs_var:
        data_nc['input_obs'][v] = process_outlier_and_normalize(
            data_nc['input_obs'][v], obs_range_dic[v])
        data_nc['ground_truth'][v] = process_outlier_and_normalize(
            data_nc['ground_truth'][v], obs_range_dic[v])
    for v in ruitu_var:
        data_nc['input_ruitu'][v] = process_outlier_and_normalize(
            data_nc['input_ruitu'][v], ruitu_range_dic[v])

    stacked_data = [data_nc['input_obs'][v] for v in obs_var]
    stacked_input_obs = np.stack(stacked_data, axis=-1)

    stacked_data = [data_nc['input_ruitu'][v] for v in ruitu_var]
    stacked_input_ruitu = np.stack(stacked_data, axis=-1)

    stacked_data = [data_nc['ground_truth'][v] for v in target_var]
    stacked_ground_truth = np.stack(stacked_data, axis=-1)

    # Shapes: (sample_ind, timestep, station_id, features)
    print(stacked_input_obs.shape)
    print(stacked_input_ruitu.shape)
    print(stacked_ground_truth.shape)

    data_dic = {
        'input_obs': stacked_input_obs,
        'input_ruitu': stacked_input_ruitu,
        'ground_truth': stacked_ground_truth
    }

    save_pkl(data_dic, processed_path, '{}_norm.dict'.format(phase_str))
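# `process_outlier_and_normalize` is not shown in this listing. A minimal
# sketch of what it plausibly does, assuming each range dict maps a variable
# name to its valid (min, max): clip outliers into the range, then min-max
# scale to [0, 1]. The behavior here is an assumption, not the confirmed
# implementation.
import numpy as np

def process_outlier_and_normalize_sketch(arr, value_range):
    vmin, vmax = value_range
    arr = np.clip(arr, vmin, vmax)       # clamp outliers to the valid range
    return (arr - vmin) / (vmax - vmin)  # min-max normalize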
Example #2
def main():
    """ main """
    global VERBOSE
    VECTORIZER = TfidfVectorizer()
    MODEL = {}
    apar = argparse.ArgumentParser(description="Learn Incident Classifier")
    apar.add_argument("-f", "--file", required=True)
    apar.add_argument("-o", "--out")
    apar.add_argument("-v", "--verbose", action="store_true")
    apar.add_argument("-b", "--best", action="store_true")
    args = apar.parse_args()
    VERBOSE = args.verbose
    csv_filename = args.file
    pkl_filename = args.out if args.out else "model.pkl"
    data = helper.load_csv(csv_filename)
    t0 = time()
    # Single-process alternative:
    # data[NORMTEXTCOL] = data[TEXTCOL].apply(helper.normalize_str)
    data = helper.normalize_multiproc(data)
    print_verbose("normalization done:\t{:0.3f}s".format((time() - t0)))
    t0 = time()
    X_learn = VECTORIZER.fit_transform(data[config.NORMTEXTCOL])
    print_verbose("fit done\t{:0.3f}s".format((time() - t0)))
    MODEL[config.VECTORIZERNAME] = VECTORIZER
    for classifier_name, classifier in CLASSIFIERS:
        MODEL[classifier_name] = {}
        print("=" * 40)
        print(classifier_name)
        print(classifier)
        for classcol in config.CLASSCOLS:
            print("training on\t{:s}".format(classcol))
            t0 = time()
            y_learn = numpy.array(data[classcol])
            classifier.fit(X_learn, y_learn)
            print_verbose("training done\t{:0.3f}s".format((time() - t0)))
            # Snapshot the fitted classifier before it is re-fitted on the next column
            MODEL[classifier_name][classcol] = copy.copy(classifier)
    helper.save_pkl(MODEL, pkl_filename)
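# A hedged usage sketch (not from the source): reload the pickled MODEL and
# classify one new incident. The classifier name "SGDClassifier" and class
# column "category" are hypothetical placeholders for entries of CLASSIFIERS
# and config.CLASSCOLS.
import pickle

def classify(text, pkl_filename="model.pkl",
             classifier_name="SGDClassifier", classcol="category"):
    with open(pkl_filename, "rb") as fh:
        model = pickle.load(fh)                # the MODEL dict saved above
    vectorizer = model[config.VECTORIZERNAME]  # fitted TfidfVectorizer
    clf = model[classifier_name][classcol]     # one fitted clf per class column
    features = vectorizer.transform([helper.normalize_str(text)])
    return clf.predict(features)[0]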
Example #3
def train(args):
    # create real images
    reals = create_real_images(args)
    # save real images
    for i, real in enumerate(reals):
        image_path = os.path.join(args.logdir, 'real_%d.png' % i)
        imwrite(denormalize(np.transpose(real, [0, 2, 3, 1])[0]), image_path)

    # nnabla monitor
    monitor = Monitor(args.logdir)
    # use the cv2 backend for MonitorImage
    set_backend('cv2')

    prev_models = []
    Zs = []
    noise_amps = []

    for scale_num in range(len(reals)):
        # Double the feature-map width every 4 scales, capped at 128.
        fs = min(args.fs_init * (2**(scale_num // 4)), 128)
        min_fs = min(args.min_fs_init * (2**(scale_num // 4)), 128)

        model = Model(real=reals[scale_num],
                      num_layer=args.num_layer,
                      fs=fs,
                      min_fs=min_fs,
                      kernel=args.kernel,
                      pad=args.pad,
                      lam_grad=args.lam_grad,
                      alpha_recon=args.alpha_recon,
                      d_lr=args.d_lr,
                      g_lr=args.g_lr,
                      beta1=args.beta1,
                      gamma=args.gamma,
                      lr_milestone=args.lr_milestone,
                      scope=str(scale_num))

        z_curr = train_single_scale(args, scale_num, model, reals, prev_models,
                                    Zs, noise_amps, monitor)

        prev_models.append(model)
        Zs.append(z_curr)
        noise_amps.append(args.noise_amp)

    # save data
    nn.save_parameters(os.path.join(args.logdir, 'models.h5'))
    save_pkl(Zs, os.path.join(args.logdir, 'Zs.pkl'))
    save_pkl(reals, os.path.join(args.logdir, 'reals.pkl'))
    save_pkl(noise_amps, os.path.join(args.logdir, 'noise_amps.pkl'))

    return Zs, reals, noise_amps
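# A minimal reload sketch for later sampling (assumptions: a `load_pkl(path)`
# counterpart to the `save_pkl(obj, path)` used above, and nnabla's standard
# `nn.load_parameters` restore call).
def load_artifacts(logdir):
    nn.load_parameters(os.path.join(logdir, 'models.h5'))          # generator weights
    Zs = load_pkl(os.path.join(logdir, 'Zs.pkl'))                  # per-scale recon noise
    reals = load_pkl(os.path.join(logdir, 'reals.pkl'))            # real-image pyramid
    noise_amps = load_pkl(os.path.join(logdir, 'noise_amps.pkl'))  # per-scale noise scales
    return Zs, reals, noise_amps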
Example #4
def netCDF_filter_nan(data_file, phase_str, interim_path):
    '''
    phase_str: train, val or test
    '''
    data_dic = {'input_obs': None, 'input_ruitu': None, 'ground_truth': None}

    print('processing...:', data_file)
    ori_data = nc.Dataset(data_file)  # open the NetCDF file
    ori_dimensions, ori_variables = ori_data.dimensions, ori_data.variables  # dimensions and variables in the file
    date_index, fortime_index, station_index = 1, 2, 3  # axis order: read a value for a given date, forecast time, and station
    var_obs = []  # var name list
    var_all = []
    var_ruitu = []
    for v in ori_variables:
        var_all.append(v)
        if v.find("_obs") != -1:
            var_obs.append(v)
        elif v.find('_M') != -1:
            var_ruitu.append(v)

    sta_id = ori_variables['station'][:].data
    print('sta_id:', sta_id)
    hour_index = ori_variables['foretimes'][:].data
    print('hour_index:', hour_index)
    day_index = ori_variables['date'][:].data
    print('day_index:', day_index)
    print(str(list(day_index)[-1]).split('.')[0])
    # build a map from station id to its index
    station_dic = {}
    for i, s in enumerate(sta_id):
        station_dic[s] = i
    print(station_dic)

    NUMS = ori_dimensions['date'].size  # 1188 for train
    input_ruitu_nan_list = []
    input_obs_nan_list = []
    #output_obs_nan_list=[]
    for i in range(NUMS - 1):
        input_ruitu_nan = (
            ori_variables['t2m_M'][i, :, :].data == -9999.).all()
        input_obs_nan = (
            ori_variables['t2m_obs'][i, :, :].data == -9999.).all()

        if input_ruitu_nan:
            input_ruitu_nan_list.append(i)

        if input_obs_nan:
            input_obs_nan_list.append(i)

    # A sample pairs obs at day i with Ruitu at day i + 1, so an all-NaN
    # Ruitu day i invalidates the sample for day i - 1.
    input_ruitu_nan_list_minus1 = [i - 1 for i in input_ruitu_nan_list]
    print('input_ruitu_nan_list_minus1:', input_ruitu_nan_list_minus1)
    print('input_obs_nan_list:', input_obs_nan_list)
    deleted_obs_days = input_ruitu_nan_list_minus1 + input_obs_nan_list  # bad days
    print('deleted_obs_days:', deleted_obs_days)
    print('deleted_obs_days_nums:', len(deleted_obs_days))
    input_obs_dic = dict.fromkeys(var_obs, None)
    input_ruitu_dic = dict.fromkeys(var_ruitu, None)
    ground_truth_dic = dict.fromkeys(var_obs, None)
    good_obs_days = [i for i in range(NUMS - 1) if i not in deleted_obs_days]
    print('Number of usable (non-all-NaN) days:', len(good_obs_days))

    # Ground truth (and Ruitu input) come from the day after each obs day.
    good_groundtruth_days = [i + 1 for i in good_obs_days]

    for v in var_obs:
        input_obs_dic[v] = ori_variables[v][good_obs_days, :, :].data
        ground_truth_dic[v] = ori_variables[v][
            good_groundtruth_days, :, :].data
    for v in var_ruitu:
        input_ruitu_dic[v] = ori_variables[v][good_groundtruth_days, :, :].data

    for v in var_obs:
        np.place(input_obs_dic[v], input_obs_dic[v] == -9999., np.nan)
        # Fill missing values with the variable's mean over non-NaN entries
        mean_ = np.nanmean(input_obs_dic[v])
        where_are_NaNs = np.isnan(input_obs_dic[v])
        input_obs_dic[v][where_are_NaNs] = mean_

        np.place(ground_truth_dic[v], ground_truth_dic[v] == -9999., np.nan)
        mean_ = np.nanmean(ground_truth_dic[v])  # mean over non-NaN entries
        where_are_NaNs = np.isnan(ground_truth_dic[v])
        ground_truth_dic[v][where_are_NaNs] = mean_

    data_dic['input_obs'] = input_obs_dic
    data_dic['ground_truth'] = ground_truth_dic

    for v in var_ruitu:
        np.place(input_ruitu_dic[v], input_ruitu_dic[v] == -9999., np.nan)
        mean_ = np.nanmean(input_ruitu_dic[v])  # mean over non-NaN entries
        where_are_NaNs = np.isnan(input_ruitu_dic[v])
        input_ruitu_dic[v][where_are_NaNs] = mean_

    data_dic['input_ruitu'] = input_ruitu_dic
    save_pkl(data_dic, interim_path, '{}_non_NaN.dict'.format(phase_str))

    return '{}_non_NaN.dict'.format(phase_str)
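# Illustrative wiring (paths and file name are hypothetical): the pickle name
# returned by netCDF_filter_nan is exactly the `file_name` that
# process_outlier_and_stack (first example above) expects.
interim_path, processed_path = 'data/interim/', 'data/processed/'
fname = netCDF_filter_nan('train.nc', 'train', interim_path)
process_outlier_and_stack(interim_path, fname, 'train', processed_path)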
Example #5
                                     transform=val_transform)

valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                           batch_size=args.batch_size,
                                           pin_memory=True)

criterion = nn.CrossEntropyLoss().cuda()

model = vgg16(pretrained=True).to(args.device)

show_summary(model)

# Compute APoZ once and cache it as a pickle; reuse it on later runs.
if not os.path.exists(args.apoz_path):
    apoz = APoZ(model).get_apoz(valid_loader, criterion)
    save_pkl(apoz, args.apoz_path)
else:
    apoz = load_pkl(args.apoz_path)

# Report the mean APoZ per module
print("Mean Average Percentage of Zeros (APoZ) per module")
for n, p in zip(module_name, apoz):
    print(f"{n} : {p.mean() * 100 : .2f}%")

# Masking: build boolean masks from the APoZ of the two layers before the last
mask = []

for i, p in enumerate(apoz[-3:-1]):
    sorted_arg = np.argsort(p)  # unit indices ordered by ascending APoZ
    mask.append(sorted_arg < select_rate[i])
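# For intuition, a tiny hypothetical numpy example of rank-based selection:
# keep the k units with the lowest APoZ. The double argsort turns scores into
# ranks, giving a per-unit boolean keep-mask.
p = np.array([0.9, 0.1, 0.5, 0.3])
k = 2
rank = np.argsort(np.argsort(p))  # [3, 0, 2, 1]
keep = rank < k                   # [False, True, False, True]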
Example #6
def netCDF2TheLastDay(data_file, phase_str, interim_filepath, datetime):
    '''
    phase_str: testA, testB or OnlineEveryDay
    '''
    data_dic = {'input_obs': None, 'input_ruitu': None, 'ground_truth': None}

    print('processing...:', data_file)
    ori_data = nc.Dataset(data_file)  # open the NetCDF file
    ori_dimensions, ori_variables = ori_data.dimensions, ori_data.variables  # dimensions and variables in the file
    date_index, fortime_index, station_index = 1, 2, 3  # axis order: read a value for a given date, forecast time, and station
    var_obs = []  # var name list
    var_all = []
    var_ruitu = []
    for v in ori_variables:
        var_all.append(v)
        if v.find("_obs") != -1:
            var_obs.append(v)
        elif v.find('_M') != -1:
            var_ruitu.append(v)

    sta_id = ori_variables['station'][:].data
    print('sta_id:', sta_id)
    hour_index = ori_variables['foretimes'][:].data
    print('hour_index:', hour_index)
    day_index = ori_variables['date'][:].data
    print('day_index:', day_index)
    print(str(list(day_index)[-1]).split('.')[0])
    # build a map from station id to its index
    station_dic = {}
    for i, s in enumerate(sta_id):
        station_dic[s] = i
    print(station_dic)

    NUMS = ori_dimensions['date'].size
    print("The number of days:", NUMS)

    input_obs_dic = dict.fromkeys(var_obs, None)
    input_ruitu_dic = dict.fromkeys(var_ruitu, None)

    for v in var_obs:
        # Second-to-last day only; drop the trailing 9 all-NaN forecast
        # time steps, keeping the first 28.
        input_obs_dic[v] = ori_variables[v][-2, :, :].data[:-9]
        if (input_obs_dic[v] == -9999.).any():
            temp_df = pd.DataFrame(data=input_obs_dic[v])
            temp_df.replace(-9999., np.nan, inplace=True)
            temp_df.interpolate(inplace=True)
            temp_df.bfill(inplace=True)
            temp_df.ffill(inplace=True)

            input_obs_dic[v] = temp_df.values

        assert not (input_obs_dic[v] == -9999.).any(), \
            'Error: -9999 remains in Obs for the predictive day!'
        assert not np.isnan(input_obs_dic[v]).any(), \
            'Error: NaN remains in Obs for the predictive day!'

    for v in var_ruitu:
        input_ruitu_dic[v] = ori_variables[v][-1, :, :].data

        if (input_ruitu_dic[v] == -9999.).any():
            temp_df = pd.DataFrame(data=input_ruitu_dic[v])
            temp_df.replace(-9999., np.nan, inplace=True)
            temp_df.interpolate(inplace=True)
            temp_df.bfill(inplace=True)
            temp_df.ffill(inplace=True)

            input_ruitu_dic[v] = temp_df.values

        assert not (input_ruitu_dic[v] == -9999.).any(), \
            'Error: -9999 remains in Ruitu for the predictive day!'
        assert not np.isnan(input_ruitu_dic[v]).any(), \
            'Error: NaN remains in Ruitu for the predictive day!'

    data_dic['input_obs'] = input_obs_dic
    data_dic['input_ruitu'] = input_ruitu_dic

    save_pkl(data_dic, interim_filepath,
             '{}_one_predict_day_{}.dict'.format(phase_str, datetime))

    return '{}_one_predict_day_{}.dict'.format(phase_str, datetime)
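# A toy illustration (hypothetical values) of the -9999 repair used above:
# replace the sentinel with NaN, interpolate along each column, then
# back-/forward-fill any NaNs left at the edges.
import numpy as np
import pandas as pd

df = pd.DataFrame([[1.0, -9999.], [-9999., 4.0], [5.0, 6.0]])
df.replace(-9999., np.nan, inplace=True)
df.interpolate(inplace=True)  # linear fill in the interior
df.bfill(inplace=True)        # fill a leading NaN
df.ffill(inplace=True)        # fill a trailing NaN
print(df.values)              # [[1. 4.] [3. 4.] [5. 6.]]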