def error_stats_one_day(year, month, day, runs, base_folder):
    """Collect whole-day error statistics of the 'ci' field for each run.

    The truth field is read from
    ``<base_folder>/data/YYYY/MM/DD/data.nc`` and passed through
    ``letkf_io.add_crop_attributes`` and ``return_error_domain``.  Each
    forecast run is prepared the same way, reduced with
    ``return_ens_mean`` and compared against the truth.

    Parameters
    ----------
    year, month, day : int
        Date to evaluate; used to build the truth path and forwarded to
        ``letkf_io.return_day``.
    runs : iterable of str
        Run names.  The special name ``'persistence'`` is delegated to
        ``return_persistence_dict_one_day`` with horizons 15, 30, 45, 60.
    base_folder : str
        Root folder holding the ``data`` tree.

    Returns
    -------
    list of dict
        One dictionary per run, in the order of ``runs``.  Forecast runs
        carry the keys ``name``, ``rmse``, ``forecast_sd``, ``truth_sd``,
        ``bias`` and ``correlation``.
    """
    truth_file = os.path.join(
        base_folder, f'data/{year:04}/{month:02}/{day:02}/data.nc')
    truth = xr.open_dataset(truth_file)['ci']
    truth = return_error_domain(letkf_io.add_crop_attributes(truth))

    all_stats = []
    for run in runs:
        print(run)
        run_stats = {'name': run}
        if run == 'persistence':
            run_stats = return_persistence_dict_one_day(
                run_stats, truth, [15, 30, 45, 60])
        else:
            forecast = letkf_io.return_day(
                year, month, day, run, base_folder)
            forecast = letkf_io.add_crop_attributes(forecast)
            forecast = return_error_domain(forecast)
            ens_mean = return_ens_mean(forecast['ci'])

            # NOTE(review): error_stats_many_days in this file calls these
            # *_one_day helpers with an extra `horizons` argument and
            # unpacks two values from return_rmse_one_day -- confirm the
            # two-argument calls below still match the helper signatures.
            rmse = return_rmse_one_day(truth, ens_mean)
            forecast_sd, truth_sd = return_sd_one_day(truth, ens_mean)
            bias = return_bias_one_day(truth, ens_mean)
            corr = return_correlation_one_day(truth, ens_mean)

            run_stats.update(
                rmse=rmse,
                forecast_sd=forecast_sd,
                truth_sd=truth_sd,
                bias=bias,
                correlation=corr,
            )
        all_stats.append(run_stats)
    return all_stats
def prob_analysis_runs(month_day, runs, horizons,
                       base_folder='/a2/uaren/travis', year=2014):
    """Compute per-time CRPS for every run and horizon and store it on disk.

    For each day the truth 'ci' field is loaded, and for each run the
    ensemble forecast is scored against it with ``ps.crps_ensemble``.  The
    score is averaged over the two trailing (spatial) axes and written as
    ``crps.h5`` (key ``'crps'``) into the folder returned by
    ``letkf_io.find_latest_run`` for that run.

    Parameters
    ----------
    month_day : iterable
        Items whose element 0 is the month and element 1 is the day.
    runs : iterable of str
        Run names passed to ``letkf_io.return_day``; also the final
        component of the results path.
    horizons : sequence
        Forecast horizons passed to ``return_horizon``; also used as the
        columns of the stored DataFrame.
    base_folder : str, optional
        Root folder holding the ``data`` and ``results`` trees.
    year : int, optional
        Year of the analysed days.  Defaults to 2014, the value that was
        previously hard-coded.

    Returns
    -------
    None
    """
    for this_month_day in month_day:
        print(this_month_day)
        month = this_month_day[0]
        day = this_month_day[1]
        truth_path = os.path.join(
            base_folder,
            f'data/{year:04}/{month:02}/{day:02}/data.nc')
        # The field is loaded into memory inside the context manager so the
        # netCDF handle is released for every day instead of piling up.
        with xr.open_dataset(truth_path) as truth_dataset:
            truth = truth_dataset['ci']
            truth = letkf_io.add_crop_attributes(truth)
            truth = return_error_domain(truth)
            truth = truth.load()
        full_index = truth.time.to_pandas().index

        for run in runs:
            print(run)
            # Times without a matching forecast stay NaN in the output.
            crps_df = pd.DataFrame(index=full_index, columns=horizons)
            full_day = letkf_io.return_day(
                year, month, day, run, base_folder)
            full_day = letkf_io.add_crop_attributes(full_day)
            full_day = return_error_domain(full_day)
            full_day = full_day['ci']
            full_day = full_day.load()

            for horizon in horizons:
                this_full_day = return_horizon(full_day, horizon)
                # Score only times present in both truth and forecast.
                these_error_times = np.intersect1d(
                    full_index, this_full_day.time.to_pandas().index)
                this_full_day = this_full_day.sel(time=these_error_times)
                this_truth = truth.sel(time=these_error_times)
                # Axis 1 is moved last before scoring -- assumes the
                # forecast is laid out (time, ensemble, y, x); TODO confirm.
                this_crps = ps.crps_ensemble(
                    this_truth.values,
                    this_full_day.values.transpose([0, 2, 3, 1]))
                crps_df[horizon] = pd.Series(
                    this_crps.mean(axis=(1, 2)),
                    index=these_error_times)

            file_path = os.path.join(
                base_folder,
                'results',
                f'{year:04}',
                f'{month:02}',
                f'{day:02}',
                run)
            file_path = letkf_io.find_latest_run(file_path)
            file_path = os.path.join(file_path, 'crps.h5')
            # `key` is passed by keyword: positional use is deprecated in
            # newer pandas releases.
            crps_df.to_hdf(file_path, key='crps')
def error_stats(year, month, day, runs, base_folder, optimize_folder=None):
    """Gather time-resolved error statistics for every run of one day.

    The truth 'ci' field is read from
    ``<base_folder>/data/YYYY/MM/DD/data.nc``.  Its spatial standard
    deviation (over ``south_north`` and ``west_east``) is attached to every
    result dictionary.  For forecast runs, spread statistics are computed
    on the full ensemble for ``U``, ``V`` and ``ci``; the remaining
    statistics are computed on the output of ``return_ens_mean`` for
    ``ci``.  All statistics go through ``return_stat_df``.

    Parameters
    ----------
    year, month, day : int
        Date to evaluate.
    runs : iterable of str
        Run names.  ``'persistence'`` is delegated to
        ``return_persistence_dict`` with horizons 15, 30, 45, 60.
    base_folder : str
        Root folder holding the ``data`` tree.
    optimize_folder : optional
        Forwarded unchanged to ``letkf_io.return_day``.

    Returns
    -------
    list of dict
        One dictionary per run, in the order of ``runs``.
    """
    truth_file = os.path.join(
        base_folder, f'data/{year:04}/{month:02}/{day:02}/data.nc')
    truth = xr.open_dataset(truth_file)['ci']
    truth = return_error_domain(letkf_io.add_crop_attributes(truth))

    spatial_variance = truth.var(dim=['south_north', 'west_east'])
    truth_sd = np.sqrt(spatial_variance).to_pandas()

    collected = []
    for run in runs:
        print(run)
        run_stats = {'name': run, 'truth_sd': truth_sd}

        if run == 'persistence':
            collected.append(
                return_persistence_dict(run_stats, truth, [15, 30, 45, 60]))
            continue

        forecast = letkf_io.return_day(
            year, month, day, run, base_folder, optimize_folder)
        forecast = return_error_domain(
            letkf_io.add_crop_attributes(forecast))

        # Spread is evaluated on the full ensemble, before averaging.
        spread_fields = (('u_spread', 'U'),
                         ('v_spread', 'V'),
                         ('spread_ci', 'ci'))
        for stat_name, variable in spread_fields:
            run_stats[stat_name] = return_stat_df(
                truth, forecast[variable], return_spread)

        # The remaining statistics use the ensemble mean of 'ci'.
        ens_mean = return_ens_mean(forecast['ci'])
        mean_stats = (('rmse', return_rmse),
                      ('forecast_sd', return_sd),
                      ('bias', return_bias),
                      ('correlation', return_correlation))
        for stat_name, stat_func in mean_stats:
            run_stats[stat_name] = return_stat_df(
                truth, ens_mean, stat_func)

        collected.append(run_stats)
    return collected
def fraction_of_positives_runs(month_day, runs, horizons, bounds_dict, N_bins,
                               base_folder='/a2/uaren/travis', year=2014):
    """Compute Brier scores and reliability-diagram inputs and store them.

    For every day, run and entry of ``bounds_dict`` the truth and forecast
    'ci' fields are converted into binary events ("value lies inside the
    bounds").  The forecast probability is the mean of the binary forecast
    over ``ensemble_number``.  For every horizon the following are derived
    per time step and written as HDF files into
    ``<latest run folder>/<bound_name>_<lower>_<upper>/``:

    * ``brier_score.h5`` -- spatially averaged ``ps.brier_score``
    * ``truth_hist.h5`` -- histogram of the binary truth
    * ``forecast_hist.h5`` -- histogram of the forecast probabilities
    * ``fraction_of_positives.h5`` and ``mean_predicted_prob.h5`` -- output
      of ``calibration.calibration_curve``, zero-padded to ``N_bins``

    Parameters
    ----------
    month_day : iterable
        Items whose element 0 is the month and element 1 is the day.
    runs : iterable of str
        Run names passed to ``letkf_io.return_day``.
    horizons : sequence
        Forecast horizons passed to ``return_horizon``.
    bounds_dict : dict
        Maps a bound name to a ``(lower, upper)`` pair.  A lower bound of 0
        makes the event ``value < upper``; an upper bound of 1 makes it
        ``value >= lower``; otherwise it is ``lower <= value < upper``.
    N_bins : int
        Number of probability bins on [0, 1].
    base_folder : str, optional
        Root folder holding the ``data`` and ``results`` trees.
    year : int, optional
        Year of the analysed days.  Defaults to 2014, the value that was
        previously hard-coded.

    Returns
    -------
    None
    """
    bins = np.arange(N_bins)
    # Column layout (horizon, bin): every horizon repeated over all bins.
    multi_column = [np.repeat(horizons, bins.size),
                    np.tile(bins, len(horizons))]
    multi_column = list(zip(*multi_column))
    multi_column = pd.MultiIndex.from_tuples(
        multi_column, names=['horizon', 'bin'])

    for this_month_day in month_day:
        print(this_month_day)
        month = this_month_day[0]
        day = this_month_day[1]
        truth_path = os.path.join(
            base_folder,
            f'data/{year:04}/{month:02}/{day:02}/data.nc')
        # The field is loaded into memory inside the context manager so the
        # netCDF handle is released for every day instead of piling up.
        with xr.open_dataset(truth_path) as truth_dataset:
            truth = truth_dataset['ci']
            truth = letkf_io.add_crop_attributes(truth)
            truth = return_error_domain(truth)
            truth = truth.load()
        full_index = truth.time.to_pandas().index

        for run in runs:
            print(run)
            full_day = letkf_io.return_day(
                year, month, day, run, base_folder)
            full_day = letkf_io.add_crop_attributes(full_day)
            full_day = return_error_domain(full_day)
            full_day = full_day['ci']
            full_day = full_day.load()

            for bound_name, bounds in bounds_dict.items():
                print(bound_name)
                # Turn both fields into 0/1 event indicators.
                if bounds[0] == 0:
                    truth_bounded = (truth < bounds[1]).astype('float')
                    full_day_bounded = (
                        full_day < bounds[1]).astype('float')
                elif bounds[1] == 1:
                    truth_bounded = (truth >= bounds[0]).astype('float')
                    full_day_bounded = (
                        full_day >= bounds[0]).astype('float')
                else:
                    truth_bounded = np.logical_and(
                        truth >= bounds[0],
                        truth < bounds[1]).astype('float')
                    full_day_bounded = np.logical_and(
                        full_day >= bounds[0],
                        full_day < bounds[1]).astype('float')

                brier_score = pd.DataFrame(
                    index=full_index, columns=horizons)
                fraction_of_positives = pd.DataFrame(
                    index=full_index, columns=multi_column)
                mean_predicted_prob = fraction_of_positives.copy()
                forecast_hist = fraction_of_positives.copy()
                truth_hist = pd.DataFrame(
                    index=full_index, columns=bins)

                truth_bounded_values = truth_bounded.values
                for tt in range(truth_bounded.shape[0]):
                    hist, _ = np.histogram(
                        truth_bounded_values[tt],
                        bins=N_bins, range=(0, 1))
                    truth_hist.iloc[tt] = hist

                for horizon in horizons:
                    this_full_day = return_horizon(
                        full_day_bounded, horizon)
                    # Score only times present in both truth and forecast.
                    these_error_times = np.intersect1d(
                        full_index,
                        this_full_day.time.to_pandas().index)
                    this_full_day = this_full_day.sel(
                        time=these_error_times)
                    # Ensemble fraction predicting the event.
                    this_full_day = this_full_day.mean(
                        dim='ensemble_number')
                    # account for boundary cases
                    # (a probability of exactly 1 is nudged just below 1;
                    # presumably so floor(p * N_bins) further down stays a
                    # valid bin number -- TODO confirm)
                    this_full_day = (this_full_day - 1e-8).clip(0, 1)
                    this_truth = truth_bounded.sel(time=these_error_times)

                    truth_values = this_truth.values
                    forecast_values = this_full_day.values
                    n_times = this_truth.shape[0]

                    this_brier_score = ps.brier_score(
                        truth_values.ravel(),
                        forecast_values.ravel())
                    this_brier_score = this_brier_score.reshape(
                        this_truth.shape).mean(axis=(1, 2))
                    brier_score[horizon] = pd.Series(
                        this_brier_score, index=these_error_times)

                    this_fraction_of_positives = np.full(
                        [n_times, N_bins], np.nan)
                    this_mean_predicted_prob = np.full(
                        [n_times, N_bins], np.nan)
                    this_forecast_hist = np.full(
                        [n_times, N_bins], np.nan)
                    for tt in range(n_times):
                        this_forecast_hist[tt], _ = np.histogram(
                            forecast_values[tt],
                            bins=N_bins, range=(0, 1))
                        fop, mpp = calibration.calibration_curve(
                            truth_values[tt].ravel(),
                            forecast_values[tt].ravel(),
                            n_bins=N_bins)
                        if fop.size < N_bins:
                            # Fewer than N_bins values came back: work out
                            # which bins are missing and pad them with 0.
                            correct_bins = np.floor(
                                mpp * N_bins).astype('int')
                            indexes = np.setdiff1d(bins, correct_bins)
                            # np.insert positions refer to the compact
                            # array, so shift each missing bin number by
                            # the count of missing bins before it.
                            indexes -= np.arange(indexes.size)
                            fop = np.insert(fop, indexes, 0)
                            mpp = np.insert(mpp, indexes, 0)
                        this_fraction_of_positives[tt] = fop
                        this_mean_predicted_prob[tt] = mpp

                    forecast_hist[horizon] = pd.DataFrame(
                        this_forecast_hist,
                        index=these_error_times,
                        columns=bins)
                    fraction_of_positives[horizon] = pd.DataFrame(
                        this_fraction_of_positives,
                        index=these_error_times,
                        columns=bins)
                    mean_predicted_prob[horizon] = pd.DataFrame(
                        this_mean_predicted_prob,
                        index=these_error_times,
                        columns=bins)

                file_path = os.path.join(
                    base_folder,
                    'results',
                    f'{year:04}',
                    f'{month:02}',
                    f'{day:02}',
                    run)
                file_path = letkf_io.find_latest_run(file_path)
                this_folder = (bound_name
                               + '_' + str(bounds[0]).replace('.', 'p')
                               + '_' + str(bounds[1]).replace('.', 'p'))
                file_path = os.path.join(file_path, this_folder)
                # exist_ok avoids the check-then-create race of
                # os.path.exists + os.mkdir.
                os.makedirs(file_path, exist_ok=True)

                # File stem and HDF key are identical for every output.
                # `key` is passed by keyword: positional use is deprecated
                # in newer pandas releases.
                outputs = (
                    ('brier_score', brier_score),
                    ('truth_hist', truth_hist),
                    ('forecast_hist', forecast_hist),
                    ('fraction_of_positives', fraction_of_positives),
                    ('mean_predicted_prob', mean_predicted_prob),
                )
                for output_name, frame in outputs:
                    this_file_path = os.path.join(
                        file_path, output_name + '.h5')
                    frame.to_hdf(this_file_path, key=output_name)
def _expand_to_grid(array, grid_shape):
    """Tile a (members,) or (time, members) array to (time, y, x, members)."""
    n_time, n_y, n_x = grid_shape
    if array.ndim == 1:
        return np.tile(array, (n_time, n_y, n_x, 1))
    return np.tile(array[:, None, None, :], (1, n_y, n_x, 1))


def prob_analysis_baselines(month_day, horizons, file_path,
                            base_folder='/a2/uaren/travis', year=2014):
    """Compute CRPS of four baseline forecasts and store them on disk.

    For every day and horizon the truth 'ci' field is scored with
    ``ps.crps_ensemble`` against:

    * ``persistence`` -- the truth field shifted forward by the horizon,
      passed as a forecast with the same shape as the observations;
    * ``persistent_dist`` -- a 50-member "ensemble" of histogram bin centres
      on [0, 1], weighted per time step by the histogram of the persisted
      field;
    * ``climatology`` -- the same bin centres weighted by the histogram of
      the values read from ``file_path``;
    * ``dates_climatology`` -- the same bin centres weighted by the
      histogram of the truth values of the scored times.

    Each score is averaged over the two trailing (spatial) axes and written
    to ``<base_folder>/results/YYYY/MM/DD/<baseline>_000/crps.h5`` under
    the key ``'crps'``.

    Parameters
    ----------
    month_day : iterable
        Items whose element 0 is the month and element 1 is the day.
    horizons : sequence
        Forecast horizons in minutes (converted with ``pd.Timedelta``).
    file_path : str
        HDF file read with ``pd.read_hdf``; its non-NaN values, clipped to
        [0, 1], form the climatology sample.
    base_folder : str, optional
        Root folder holding the ``data`` and ``results`` trees.  Results
        are now written below this folder; previously the output location
        was hard-coded to the default value regardless of this argument.
    year : int, optional
        Year of the analysed days.  Defaults to 2014, the value that was
        previously hard-coded.

    Returns
    -------
    None
    """
    ens_members = 50
    climatology = pd.read_hdf(file_path)
    climatology = climatology.values.ravel()
    climatology = climatology[~np.isnan(climatology)]
    climatology = climatology.clip(min=0, max=1)

    # The climatology histogram does not depend on day or horizon.
    # `density=True` replaces `normed=True`, which newer NumPy no longer
    # accepts; for equal-width bins the two give the same weights.
    clim_weights, bin_edges = np.histogram(
        climatology, bins=ens_members, range=(0, 1), density=True)
    # Every histogram below uses the same bins, so the bin centres are
    # shared by all three distribution-based baselines.
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    for this_month_day in month_day:
        print(this_month_day)
        month = this_month_day[0]
        day = this_month_day[1]
        truth_path = os.path.join(
            base_folder,
            f'data/{year:04}/{month:02}/{day:02}/data.nc')
        truth = xr.open_dataset(truth_path)
        truth = truth['ci']
        truth = letkf_io.add_crop_attributes(truth)
        truth = return_error_domain(truth)
        full_index = truth.time.to_pandas().index

        crps_persistence = pd.DataFrame(
            index=full_index, columns=horizons)
        crps_persistent_dist = pd.DataFrame(
            index=full_index, columns=horizons)
        crps_climatology = pd.DataFrame(
            index=full_index, columns=horizons)
        crps_dates_climatology = pd.DataFrame(
            index=full_index, columns=horizons)

        for horizon in horizons:
            # Persistence: relabel the truth with times shifted forward.
            persistence = truth.copy()
            time_step = pd.Timedelta(str(horizon) + 'min')
            persistence['time'] = persistence.time + time_step
            dates_error_times = np.intersect1d(
                truth.time.to_pandas(), persistence.time.to_pandas())
            this_truth = truth.sel(time=dates_error_times)
            this_index = this_truth.time.to_pandas().index
            persistence = persistence.sel(time=dates_error_times)

            # Read each field once; they are reused by several baselines.
            truth_values = this_truth.values
            persis_array = persistence.values
            persis_shape = persis_array.shape

            # for persistence
            this_crps = ps.crps_ensemble(truth_values, persis_array)
            crps_persistence[horizon] = pd.Series(
                this_crps.mean(axis=(1, 2)), index=this_index)

            # for persistent distribution
            flat_persistence = persis_array.reshape(
                persis_shape[0], persis_shape[1] * persis_shape[2])
            persis_weights = np.ones(
                [dates_error_times.size, ens_members])
            for ii in range(dates_error_times.size):
                persis_weights[ii], _ = np.histogram(
                    flat_persistence[ii], bins=ens_members,
                    range=(0, 1), density=True)
            ens_on_grid = _expand_to_grid(bin_centers, persis_shape)
            this_crps = ps.crps_ensemble(
                truth_values, ens_on_grid,
                weights=_expand_to_grid(persis_weights, persis_shape))
            crps_persistent_dist[horizon] = pd.Series(
                this_crps.mean(axis=(1, 2)), index=this_index)

            # for climatology
            this_crps = ps.crps_ensemble(
                truth_values, ens_on_grid,
                weights=_expand_to_grid(clim_weights, persis_shape))
            crps_climatology[horizon] = pd.Series(
                this_crps.mean(axis=(1, 2)), index=this_index)

            # for dates climatology
            climatology_dates = truth_values.ravel()
            climatology_dates = climatology_dates[
                ~np.isnan(climatology_dates)]
            climatology_dates = climatology_dates.clip(min=0, max=1)
            clim_dates_weights, _ = np.histogram(
                climatology_dates, bins=ens_members,
                range=(0, 1), density=True)
            this_crps = ps.crps_ensemble(
                truth_values, ens_on_grid,
                weights=_expand_to_grid(clim_dates_weights, persis_shape))
            crps_dates_climatology[horizon] = pd.Series(
                this_crps.mean(axis=(1, 2)), index=this_index)

        baselines = {'persistence': crps_persistence,
                     'persistent_dist': crps_persistent_dist,
                     'climatology': crps_climatology,
                     'dates_climatology': crps_dates_climatology}
        for key, value in baselines.items():
            save_directory = os.path.join(
                base_folder,
                f'results/{year:04}/{month:02}/{day:02}/{key}_000')
            os.makedirs(save_directory, exist_ok=True)
            save_file = os.path.join(save_directory, 'crps.h5')
            # `key` is passed by keyword: positional use is deprecated in
            # newer pandas releases.
            value.to_hdf(save_file, key='crps')
def error_stats_many_days(dates, runs, horizons, base_folder,
                          only_cloudy=False, only_of_times=True,
                          mean_win_size=None, one_km_err=False):
    """Collect error statistics of 'ci' over several days for each run.

    Parameters
    ----------
    dates : iterable
        Days to evaluate; forwarded to ``letkf_io.return_many_truths`` and
        ``letkf_io.return_many_days``.
    runs : iterable
        Each entry is either the string ``'persistence'``, a run name, or
        a pair ``('ensemble', name)`` / ``('anly_fore', name)``.  With
        ``'ensemble'`` the forecast is not reduced by ``return_ens_mean``;
        with ``'anly_fore'`` ``analysis_fore_flag=True`` is passed to
        ``letkf_io.return_many_days``.
    horizons : sequence
        Forecast horizons forwarded to the ``*_one_day`` helpers.
    base_folder : str
        Root folder of the data.
    only_cloudy : bool, optional
        If true, restrict the RMSE/persistence evaluation to times whose
        spatial maximum of truth exceeds 0.2 or whose spatial mean exceeds
        0.1.
    only_of_times : bool, optional
        If true, the first truth time of every calendar day is dropped;
        the flag is also forwarded to ``letkf_io.return_many_days``.
    mean_win_size : optional
        Forwarded to ``letkf_io.return_many_days``.  Only when it is
        ``None`` is the ``'ci'`` variable selected from the result.
    one_km_err : bool, optional
        If true, truth and forecasts are sub-selected on the coordinates
        returned by ``down_sample_coord(coord, 1)``.

    Returns
    -------
    list of dict
        One dictionary per entry of ``runs``.  Forecast runs carry the keys
        ``name`` (the entry as given), ``rmse``, ``forecast_sd``,
        ``truth_sd``, ``bias``, ``correlation`` and ``stat_times``.
    """
    truth = letkf_io.return_many_truths(dates, base_folder)
    truth = truth['ci']
    truth = letkf_io.add_crop_attributes(truth)
    truth = return_error_domain(truth)

    if only_of_times:
        truth_times = truth.time.to_pandas()
        these_dates = np.unique(truth_times.index.date)
        # Drop the first time of every day.  The pieces are concatenated
        # once instead of growing an untyped empty Series in a loop, which
        # relied on deprecated pandas behaviour for empty entries.
        daily_times = [truth_times.loc[str(this_date)].iloc[1:]
                       for this_date in these_dates]
        if daily_times:
            keep_times = pd.concat(daily_times)
        else:
            keep_times = truth_times.iloc[:0]
        truth = truth.sel(time=keep_times.index)

    if only_cloudy:
        print('only_cloudy')
        truth_max = truth.max(dim=['south_north', 'west_east'])
        truth_mean = truth.mean(dim=['south_north', 'west_east'])
        bool_max = truth_max > 0.2
        bool_mean = truth_mean > 0.1
        # Both operands are boolean, so `|` is an element-wise logical OR;
        # it replaces the deprecated xr.ufuncs.logical_or.
        cloudy_bool = bool_max | bool_mean
        cloudy_times = truth.time[cloudy_bool]
    else:
        cloudy_times = None

    if one_km_err:
        west_east_err = down_sample_coord(truth.west_east, 1)
        south_north_err = down_sample_coord(truth.south_north, 1)
        truth = truth.sel(west_east=west_east_err,
                          south_north=south_north_err)

    to_return = []
    for run in runs:
        print(run)
        # 'name' keeps the entry exactly as given (possibly a pair).
        adict = {'name': run}
        if run == 'persistence':
            adict = return_persistence_dict_one_day(
                adict, truth, horizons, cloudy_times=cloudy_times)
            to_return.append(adict)
            continue

        ens_flag = False
        analysis_fore_flag = False
        if run[0] == 'ensemble':
            ens_flag = True
            run = run[1]
        elif run[0] == 'anly_fore':
            run = run[1]
            analysis_fore_flag = True

        all_days = letkf_io.return_many_days(
            dates, run, base_folder,
            only_of_times=only_of_times,
            mean_win_size=mean_win_size,
            analysis_fore_flag=analysis_fore_flag)
        if mean_win_size is None:
            all_days = all_days['ci']
        if one_km_err:
            all_days = all_days.sel(west_east=west_east_err,
                                    south_north=south_north_err)
        if not ens_flag:
            all_days = return_ens_mean(all_days)

        # The times used for the RMSE are reused for the other statistics.
        rmse, total_error_times = return_rmse_one_day(
            truth, all_days, horizons, cloudy_times=cloudy_times)
        forecast_sd, truth_sd = return_sd_one_day(
            truth, all_days, horizons,
            total_error_times=total_error_times)
        bias = return_bias_one_day(
            truth, all_days, horizons,
            total_error_times=total_error_times)
        corr = return_correlation_one_day(
            truth, all_days, horizons,
            total_error_times=total_error_times)

        adict['rmse'] = rmse
        adict['forecast_sd'] = forecast_sd
        adict['truth_sd'] = truth_sd
        adict['bias'] = bias
        adict['correlation'] = corr
        adict['stat_times'] = total_error_times
        to_return.append(adict)
    return to_return