def _setup(dataset1: np.ndarray, dataset2: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    dataset1 = dataset1.flatten()
    dataset2 = dataset2.flatten()
    assert dataset1.size == dataset2.size
    afactor = dataset1.size / (dataset1.size + 1)
    first_ecdf = empirical_distribution.ECDF(dataset1)(dataset1) * afactor
    second_ecdf = empirical_distribution.ECDF(dataset2)(dataset2) * afactor
    return first_ecdf, second_ecdf
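# A minimal usage sketch for _setup (illustrative; not part of the original module).
# It assumes numpy is available as np, as the function above already requires.
def _example_setup_usage():
    import numpy as np
    rng = np.random.default_rng(0)
    a = rng.normal(size=200)            # first sample
    b = rng.normal(loc=0.3, size=200)   # second sample, same size
    ecdf_a, ecdf_b = _setup(a, b)
    # each output holds the sample's own ECDF values scaled by n / (n + 1),
    # so they lie in (0, n / (n + 1)]
    return ecdf_a.min(), ecdf_a.max(), ecdf_b.min(), ecdf_b.max()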
def gof_ks(x, Rho, P):
    """
    The function performs the Kolmogorov-Smirnov goodness-of-fit test for the
    Vasicek distribution.
    Parameters:
      x   : A numeric vector in the interval of (0, 1) to test
      Rho : The Rho parameter in the Vasicek distribution
      P   : The P parameter in the Vasicek distribution
    Returns:
      A dictionary with the KS statistic and p-value
    Example:
      import py_vsk
      x = py_vsk.vsk_rvs(100, Rho = 0.2, P = 0.1)
      gof_ks(x, Rho = 0.2, P = 0.1)
      # {'ks': 0.09, 'pvalue': 0.8154147124661313}
    """
    _x = sorted([_ for _ in x if _ > 0 and _ < 1 and not numpy.isnan(_)])
    ocdf = empirical_distribution.ECDF(_x)(_x)
    ecdf = [_["cdf"] for _ in vsk_cdf(_x, Rho=Rho, P=P)]
    _rst = ks_2samp(ecdf, ocdf)
    return {"ks": _rst.statistic, "pvalue": _rst.pvalue}
def Covariance_valuation(data):
    # data is the data matrix at all time steps; its dimension is X*Y.
    # A second data set is only required when calculating dissimilarity.
    x, y = np.shape(data)

    # Step 1: Transform each column of the data into its empirical CDF
    P = np.zeros((x, y))
    for i in range(0, y):
        ECDF = edis.ECDF(data[:, i])
        P[:, i] = ECDF(data[:, i])

    # Step 2: Rescale the ECDF values to the interval [-1, 1]
    Y = 2 * (P - 0.5)

    # Calculate the different indices
    # M is the surfeit index: the mean state of the system
    M = 1 / y * np.sum(Y, axis=1)
    # S is the severity index: how extreme the states are system-wide
    S = 1 / y * np.sum(np.abs(Y), axis=1)
    # D measures how dissimilar the states of all sites are with respect to each other
    D1 = np.zeros((x, y - 1))
    D2 = np.zeros((x, y))
    for i in range(0, y - 1):
        for j in range(i + 1, y):
            D2[:, j] = np.abs(Y[:, i] - Y[:, j])
        D1[:, i] = np.sum(D2, axis=1)
    D = 1 / y**2 * np.sum(D1, axis=1)
    return M, S, D
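# A small synthetic-data sketch for Covariance_valuation (illustrative only). It
# assumes the module-level alias used above, i.e. statsmodels'
# empirical_distribution imported as edis, and numpy as np.
def _example_covariance_valuation():
    import numpy as np
    rng = np.random.default_rng(1)
    data = rng.gamma(shape=2.0, scale=1.0, size=(365, 5))  # 365 time steps, 5 sites
    M, S, D = Covariance_valuation(data)
    # M: mean state in (-1, 1], S: severity in [0, 1], D: pairwise dissimilarity >= 0
    return M.mean(), S.mean(), D.mean()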
def omega_empirical(returns, target_rtn=0, log=True, plot=False, steps=1000):
    """
    Omega Ratio based on empirical distribution.
    """
    # validate_return_type(return_type)
    if not log:
        returns = pct_to_log_return(returns)

    # TODO
    ecdf = sde.ECDF(returns)

    # Generate computation space
    x = np.linspace(start=returns.min(), stop=returns.max(), num=steps)
    y = ecdf(x)

    norm_cdf = ss.norm.cdf(x, loc=returns.mean(), scale=returns.std(ddof=1))

    # Plot empirical distribution CDF versus Normal CDF with same mean and stdev
    if plot:
        fig, ax = plt.subplots()
        fig.set_size_inches((12, 6))
        ax.plot(x, y, c="r", ls="--", lw=1.5, alpha=0.8, label="ECDF")
        ax.plot(x, norm_cdf, alpha=0.3, ls="-", c="b", lw=5, label="Normal CDF")
        ax.legend(loc="best")
        plt.show(fig)
        plt.close(fig)
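# The TODO above leaves the ratio itself uncomputed. A hedged numerical sketch of one
# common Omega-ratio definition built from an ECDF: the integral of (1 - F) above the
# target divided by the integral of F below it. This is a standalone illustration, not
# the omega_empirical implementation.
def _example_omega_from_ecdf(returns, target_rtn=0.0, steps=1000):
    import numpy as np
    from statsmodels.distributions.empirical_distribution import ECDF
    returns = np.asarray(returns, dtype=float)
    x = np.linspace(returns.min(), returns.max(), steps)
    cdf = ECDF(returns)(x)
    above = x >= target_rtn
    gains = np.trapz(1.0 - cdf[above], x[above])   # expected gain above target
    losses = np.trapz(cdf[~above], x[~above])      # expected shortfall below target
    return gains / losses if losses > 0 else np.inf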
def ecdfer(df: pd.DataFrame,
           ascending: bool = True,
           prediction_column: str = "prediction",
           ecdf_column: str = "prediction_ecdf",
           max_range: int = 1000) -> LearnerReturnType:
    """
    Learns an Empirical Cumulative Distribution Function from the specified
    column in the input DataFrame. It is usually used in the prediction column
    to convert a predicted probability into a score from 0 to 1000.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain a `prediction_column` column.
    ascending : bool
        Whether to compute an ascending ECDF or a descending one.
    prediction_column : str
        The name of the column in `df` to learn the ECDF from.
    ecdf_column : str
        The name of the new ECDF column added by this function.
    max_range : int
        The maximum value for the ECDF. It will go from 0 to max_range.
    """
    if ascending:
        base = 0
        sign = 1
    else:
        base = max_range
        sign = -1

    values = df[prediction_column]
    ecdf = ed.ECDF(values)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(
            **{ecdf_column: (base + sign * max_range * ecdf(new_df[prediction_column]))})

    p.__doc__ = learner_pred_fn_docstring("ecdfer")

    log = {
        'ecdfer': {
            'nobs': len(values),
            'prediction_column': prediction_column,
            'ascending': ascending,
            'transformed_column': [ecdf_column]
        }
    }

    return p, p(df), log
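# A tiny usage sketch for ecdfer (illustrative only; assumes the fklearn-style imports
# the function above already relies on, i.e. pandas as pd and statsmodels'
# empirical_distribution as ed).
def _example_ecdfer_usage():
    import pandas as pd
    train = pd.DataFrame({"prediction": [0.1, 0.2, 0.4, 0.4, 0.9]})
    p, train_scored, log = ecdfer(train, ascending=True, max_range=1000)
    new = pd.DataFrame({"prediction": [0.15, 0.5]})
    # predicted probabilities are mapped to scores in [0, 1000] via the learned ECDF
    return p(new)["prediction_ecdf"].tolist()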
def interp_ecfd(sample):
    # https://stackoverflow.com/a/44163082
    sample_edf = edf.ECDF(sample)
    slope_changes = sorted(set(sample))
    sample_edf_values_at_slope_changes = [
        sample_edf(item) for item in slope_changes
    ]
    inverted_edf = interp1d(sample_edf_values_at_slope_changes, slope_changes)
    return inverted_edf
def linear_interpolation(array, sample_size):
    """
    Sampling from a 1D array using a linearly interpolated inverse CDF.

    :param array: input 1D array of numbers
    :param sample_size: number of samples
    :return: array of simulated data
    """
    sample_edf = edf.ECDF(array)
    slope_changes = sorted(set(array))
    sample_edf_values_at_slope_changes = [
        sample_edf(item) for item in slope_changes
    ]
    inverted_edf = interp1d(sample_edf_values_at_slope_changes, slope_changes)
    return inverted_edf(np.random.uniform(0, 1, sample_size))
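# A short usage sketch for the two inverse-ECDF helpers above (illustrative only;
# relies on the same imports they already need: statsmodels' empirical_distribution as
# edf, scipy's interp1d and numpy as np). Note that probabilities below 1/n fall
# outside the interpolation range of the inverted ECDF, which also applies to the
# uniform(0, 1) draws used by linear_interpolation.
def _example_inverse_ecdf():
    import numpy as np
    rng = np.random.default_rng(2)
    observed = rng.exponential(scale=2.0, size=1_000)
    inv = interp_ecfd(observed)             # inverse ECDF as a callable
    quartiles = inv([0.25, 0.5, 0.75])      # empirical quartile estimates
    # linear_interpolation(observed, 500) would draw new samples the same way,
    # feeding uniform(0, 1) probabilities into the inverted ECDF
    return np.asarray(quartiles)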
def gof_chisq(x, Rho, P, n=10):
    """
    The function performs the chi-square goodness-of-fit test for the Vasicek
    distribution.
    Parameters:
      x   : A numeric vector in the interval of (0, 1) to test
      Rho : The Rho parameter in the Vasicek distribution
      P   : The P parameter in the Vasicek distribution
      n   : The number of groups for the chi-square test. The value should be
            picked such that all observed and expected frequencies are at least 5.
    Returns:
      A dictionary with the chi-square statistic, the p-value, and the table used
      to calculate the chi-square statistic.
    Example:
      import py_vsk
      x = py_vsk.vsk_rvs(100, Rho = 0.2, P = 0.1)
      gof_chisq(x, Rho = 0.2, P = 0.1)['stat']
      # {'chisq': 11.0, 'pvalue': 0.27570893677222197}
    """
    _x = sorted([_ for _ in x if _ > 0 and _ < 1 and not numpy.isnan(_)])
    ocdf = empirical_distribution.ECDF(_x)(_x)
    ecdf = [_["cdf"] for _ in vsk_cdf(_x, Rho=Rho, P=P)]
    _cut = [_ for _ in sorted(qcut(ecdf, n) + [0, 1])]
    ogrp = numpy.searchsorted(_cut, ocdf).tolist()
    egrp = numpy.searchsorted(_cut, ecdf).tolist()
    _tbl = [
        dict(zip(["group", "observed", "expected"],
                 [g,
                  len([_ for _ in ogrp if _ == g]),
                  len([_ for _ in egrp if _ == g])]))
        for g in sorted(set(egrp))
    ]
    _rst = chisquare([_["observed"] for _ in _tbl],
                     [_["expected"] for _ in _tbl])
    return {"stat": {"chisq": _rst.statistic, "pvalue": _rst.pvalue}, "tbl": _tbl}
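# A self-contained sketch of the binning idea behind gof_chisq (illustrative only):
# cut the fitted CDF values into roughly equal-probability groups and compare observed
# versus expected counts with scipy.stats.chisquare. The `qcut` helper used above is
# assumed to come from the surrounding package; numpy.quantile stands in for it here.
def _example_chisq_binning(observed_cdf, expected_cdf, n=10):
    import numpy as np
    from scipy.stats import chisquare
    inner = np.quantile(expected_cdf, np.linspace(0, 1, n + 1)[1:-1])
    cuts = np.unique(np.concatenate(([0.0], inner, [1.0])))
    obs = np.histogram(observed_cdf, bins=cuts)[0]
    exp = np.histogram(expected_cdf, bins=cuts)[0]
    return chisquare(obs, exp)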
def pbo(M, S, metric_func, threshold, n_jobs=1, verbose=False, plot=False, hist=False): ''' Based on http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2326253 Features: * training and test sets are of equal size, providing comparable accuracy to both IS and OOS Sharpe ratios. * CSCV is symmetric, decline in performance can only result from overfitting, not arbitrary discrepancies between the training and test sets. * CSCV respects the time-dependence and other season-dependent features present in the data. * Results are deterministic, can be replicated. * Dispersion in the distribution of logits conveys relevant info regarding the robustness of the strategy selection process. * Model-free, non-parametric. Logits distribution resembles the cumulative Normal distribution if w_bar are close to uniform distribution (i.e. the backtest appears to be information-less). Therefore, for good backtesting, the distribution of logits will be centered in a significantly positive value, and its tail will marginally cover the region of negative logit values. Limitations: * CSCV is symmetric, for some strategies, K-fold CV might be better. * Not suitable for time series with strong auto-correlation, especially when S is large. * Assumes all the sample statistics carry the same weight. * Entirely possible that all the N strategy configs have high but similar Sharpe ratios. Therefore, PBO may appear high, however, 'overfitting' here is among many 'skilful' strategies. Parameters: M: returns data, numpy or dataframe format. S: chuncks to devided M into, must be even number. Paper suggests setting S = 16. See paper for details of choice of S. metric_func: evaluation function for returns data threshold: used as prob. of OOS Loss calculation cutoff. For Sharpe ratio, this should be 0 to indicate probabilty of loss. n_jobs: if greater than 1 then enable parallel mode hist: Default False, whether to plot histogram for rank of logits. Some problems exist when S >= 10. Need to look at why numpy / matplotlib does it. Returns: PBO result in namedtuple, instance of PBO. ''' if S % 2 == 1: raise ValueError( 'S must be an even integer, {:.1f} was given'.format(S)) n_jobs = int(n_jobs) if n_jobs < 1: n_jobs = 1 if isinstance(M, pd.DataFrame): # conver to numpy values if verbose: print('Convert from DataFrame to numpy array.') M = M.values # Paper suggests T should be 2x the no. of observations used by investor # to choose a model config, due to the fact that CSCV compares combinations # of T/2 observations with their complements. T, N = M.shape residual = T % S if residual != 0: M = M[residual:] T, N = M.shape sub_T = T // S if verbose: print('Total sample size: {:,d}, chunck size: {:,d}'.format(T, sub_T)) # generate subsets, each of length sub_T Ms = [] Ms_values = [] for i in range(S): start, end = i * sub_T, (i + 1) * sub_T Ms.append((i, M[start:end, :])) Ms_values.append(M[start:end, :]) Ms_values = np.array(Ms_values) if verbose: print('No. of Chuncks: {:,d}'.format(len(Ms))) # generate combinations Cs = [x for x in itr.combinations(Ms, S // 2)] if verbose: print('No. 
of combinations = {:,d}'.format(len(Cs))) # Ms_index used to find J_bar (complementary OOS part) Ms_index = set([x for x in range(len(Ms))]) # create J and J_bar if n_jobs < 2: J = [] J_bar = [] for i in range(len(Cs)): # make sure chucks are concatenated in their original order order = [x for x, _ in Cs[i]] sort_ind = np.argsort(order) Cs_values = np.array([v for _, v in Cs[i]]) # if verbose: # print('Cs index = {}, '.format(order), end='') joined = np.concatenate(Cs_values[sort_ind, :]) J.append(joined) # find Cs_bar Cs_bar_index = list(sorted(Ms_index - set(order))) # if verbose: # print('Cs_bar_index = {}'.format(Cs_bar_index)) J_bar.append(np.concatenate(Ms_values[Cs_bar_index, :])) # compute matrices for J and J_bar, e.g. Sharpe ratio R = [metric_func(j) for j in J] R_bar = [metric_func(j) for j in J_bar] # compute ranks of metrics R_rank = [ss.rankdata(x) for x in R] R_bar_rank = [ss.rankdata(x) for x in R_bar] # find highest metric, rn contains the index position of max value # in each set of R (IS) rn = [np.argmax(r) for r in R_rank] # use above index to find R_bar (OOS) in same index position # i.e. the same config / setting rn_bar = [R_bar_rank[i][rn[i]] for i in range(len(R_bar_rank))] # formula in paper used N+1 as the denominator for w_bar. w_bar = [float(r) / N for r in rn_bar] # logit(.5) gives 0 so if w_bar value is equal to median logits is 0 logits = [spec.logit(w) for w in w_bar] else: # use joblib for parallel calc # print('Run in parallel mode.') cores = job.Parallel(n_jobs=n_jobs)(job.delayed(pbo_core_calc)( Cs_x, Ms, Ms_values, Ms_index, metric_func, verbose) for Cs_x in Cs) # core_df = pd.DataFrame(cores, columns=PBOCore._fields) # convert to values needed. # # core_df = pd.DataFrame.from_records(cores) # J = core_df.J.values # J_bar = core_df.J_bar.values # R = core_df.R.values # R_bar = core_df.R_bar.values # R_rank = core_df.R_rank.values # R_bar_rank = core_df.R_bar_rank.values # rn = core_df.rn.values # rn_bar = core_df.rn_bar.values # w_bar = core_df.w_bar.values # logits = core_df.logits.values J = [c.J for c in cores] J_bar = [c.J_bar for c in cores] R = [c.R for c in cores] R_bar = [c.R_bar for c in cores] R_rank = [c.R_rank for c in cores] R_bar_rank = [c.R_bar_rank for c in cores] rn = [c.rn for c in cores] rn_bar = [c.rn_bar for c in cores] w_bar = [c.w_bar for c in cores] logits = [c.logits for c in cores] # prob of overfitting phi = np.array([1.0 if lam <= 0 else 0.0 for lam in logits]) / len(Cs) pbo_test = np.sum(phi) # performance degradation R_n_star = np.array([R[i][rn[i]] for i in range(len(R))]) R_bar_n_star = np.array([R_bar[i][rn[i]] for i in range(len(R_bar))]) lm = ss.linregress(x=R_n_star, y=R_bar_n_star) prob_oos_loss = np.sum( [1.0 if r < threshold else 0.0 for r in R_bar_n_star]) / len(R_bar_n_star) # Stochastic dominance y = np.linspace(min(R_bar_n_star), max(R_bar_n_star), endpoint=True, num=1000) R_bar_n_star_cdf = smd.ECDF(R_bar_n_star) optimized = R_bar_n_star_cdf(y) R_bar_cdf = smd.ECDF(np.concatenate(R_bar)) non_optimized = R_bar_cdf(y) dom_df = pd.DataFrame( dict(optimized_IS=optimized, non_optimized_OOS=non_optimized)) dom_df.index = y # visually, non_optimized curve above optimized curve indicates good # backtest with low overfitting. dom_df['SD2'] = dom_df.non_optimized_OOS - dom_df.optimized_IS result = PBO(pbo_test, prob_oos_loss, lm, dom_df, Cs, J, J_bar, R, R_bar, R_rank, R_bar_rank, rn, rn_bar, w_bar, logits, R_n_star, R_bar_n_star) if plot: plot_pbo(result, hist=hist) return result
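# A hedged usage sketch for pbo (illustrative only). It assumes the module-level
# dependencies the function above already uses (numpy as np, pandas as pd, scipy.stats
# as ss, scipy.special as spec, itertools as itr, statsmodels' distribution module as
# smd, and the PBO namedtuple), plus a simple Sharpe-style metric applied column-wise
# to each strategy's returns.
def _example_pbo_run():
    import numpy as np

    def sharpe(returns):
        # annualized Sharpe ratio per strategy (column), assuming daily data
        return np.sqrt(252) * returns.mean(axis=0) / returns.std(axis=0, ddof=1)

    rng = np.random.default_rng(7)
    M = rng.normal(loc=0.0002, scale=0.01, size=(960, 20))  # T x N strategy returns
    # S = 8 gives C(8, 4) = 70 train/test splits of equal size
    result = pbo(M, S=8, metric_func=sharpe, threshold=0, n_jobs=1)
    # result is the PBO namedtuple described in the docstring: probability of
    # overfitting, probability of OOS loss, the IS/OOS regression, the CDFs, etc.
    return result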
def discrete_ecdfer(df: pd.DataFrame,
                    ascending: bool = True,
                    prediction_column: str = "prediction",
                    ecdf_column: str = "prediction_ecdf",
                    max_range: int = 1000,
                    round_method: Callable = int) -> LearnerReturnType:
    """
    Learns an Empirical Cumulative Distribution Function from the specified
    column in the input DataFrame. It is usually used in the prediction column
    to convert a predicted probability into a score from 0 to 1000.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain a `prediction_column` column.
    ascending : bool
        Whether to compute an ascending ECDF or a descending one.
    prediction_column : str
        The name of the column in `df` to learn the ECDF from.
    ecdf_column : str
        The name of the new ECDF column added by this function.
    max_range : int
        The maximum value for the ECDF. It will go from 0 to max_range.
    round_method : Callable
        A function that rounds the transformed values, e.g. int, ceil, floor or round.
    """
    if ascending:
        base = 0
        sign = 1
    else:
        base = max_range
        sign = -1

    values = df[prediction_column]
    ecdf = ed.ECDF(values)

    df_ecdf = pd.DataFrame()
    df_ecdf['x'] = ecdf.x
    df_ecdf['y'] = pd.Series(base + sign * max_range * ecdf.y).apply(round_method)

    boundaries = df_ecdf.groupby("y").agg((min, max))["x"]["min"].reset_index()

    y = boundaries["y"]
    x = boundaries["min"]
    side = ecdf.side

    log = {
        'discrete_ecdfer': {
            'map': dict(zip(x, y)),
            'round_method': round_method,
            'nobs': len(values),
            'prediction_column': prediction_column,
            'ascending': ascending,
            'transformed_column': [ecdf_column]
        }
    }

    del ecdf
    del values
    del df_ecdf

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        if not ascending:
            tind = np.searchsorted(-x, -new_df[prediction_column])
        else:
            tind = np.searchsorted(x, new_df[prediction_column], side) - 1

        return new_df.assign(**{ecdf_column: y[tind].values})

    return p, p(df), log
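# A tiny usage sketch comparing discrete_ecdfer with ecdfer (illustrative only;
# assumes the same module-level imports as above: pandas as pd, numpy as np and
# statsmodels' empirical_distribution as ed).
def _example_discrete_ecdfer_usage():
    import pandas as pd
    train = pd.DataFrame({"prediction": [0.05, 0.2, 0.35, 0.35, 0.8, 0.95]})
    p_cont, _, _ = ecdfer(train, max_range=1000)
    p_disc, _, log = discrete_ecdfer(train, max_range=1000, round_method=int)
    new = pd.DataFrame({"prediction": [0.1, 0.5, 0.9]})
    # the discrete version approximates the continuous scores through a lookup table,
    # whose value-to-score map is also exposed in log['discrete_ecdfer']['map']
    return (p_cont(new)["prediction_ecdf"].tolist(),
            p_disc(new)["prediction_ecdf"].tolist())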
A = edis.ECDF(Q[:, 0])
B = edis.ECDF(Q_stronger[:, 0])
C = edis.ECDF(Q_weaker[:, 0])

import matplotlib.pyplot as plt

# Temperature
fig, ax = plt.subplots()
ax.hist(Q, bins=300, histtype='step', label='Reconstruct')
ax.hist(Q_stronger, bins=300, histtype='step', label='Stronger')
ax.hist(Q_weaker, bins=300, histtype='step', label='Weaker')
fig.legend()
ax.set_title('Histogram')
fig.savefig('Netload_PDF.png')
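# The ECDF objects A, B and C above are built but only histograms are drawn. A small
# hedged sketch of the corresponding ECDF comparison plot for the same three series
# (assumes the same Q, Q_stronger and Q_weaker arrays used above).
def _example_plot_ecdfs(Q, Q_stronger, Q_weaker):
    import matplotlib.pyplot as plt
    from statsmodels.distributions.empirical_distribution import ECDF
    fig, ax = plt.subplots()
    for series, label in ((Q[:, 0], 'Reconstruct'),
                          (Q_stronger[:, 0], 'Stronger'),
                          (Q_weaker[:, 0], 'Weaker')):
        ecdf = ECDF(series)
        # drop the leading -inf point statsmodels prepends before plotting
        ax.step(ecdf.x[1:], ecdf.y[1:], where='post', label=label)
    ax.set_title('Empirical CDF')
    ax.legend()
    fig.savefig('Netload_CDF.png')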
def bc_a_bootstrap_ratio_cookie_buckets(treatment_cookie_buckets,
                                        control_cookie_buckets,
                                        bins_boundaries,
                                        num_of_boot_samples,
                                        ci_level,
                                        estimator_type='quantile',
                                        quantile=0.5,
                                        paired=False,
                                        return_bootstrap_est=False,
                                        return_interval=False):
    """
    Function that computes a BCa bootstrap CI.

    Parameters
    ----------
    treatment_cookie_buckets: pd.DataFrame
        dataframe for the treatment group in which each row corresponds to a
        cookie bucket and the number of columns corresponds to the number of
        bins in a histogram
    control_cookie_buckets: pd.DataFrame
        dataframe for the control group in which each row corresponds to a
        cookie bucket and the number of columns corresponds to the number of
        bins in a histogram
    bins_boundaries: list
        list of bin boundaries. Example: [1, 3, 5, 9, 16] corresponds to 4 bins
    num_of_boot_samples: int
        number of bootstrap samples to use
    ci_level: float in (0, 1)
        level at which to construct the confidence interval
    estimator_type: 'quantile' or 'mean'
        whether to perform a test when the effect size is in terms of the mean
        or a quantile
    quantile: float in (0, 1)
        quantile for which the test is to be performed
    paired: bool
        whether to resample treatment and control buckets with the same indices
    return_bootstrap_est: bool
        whether to return estimators corresponding to bootstrap samples
    return_interval: bool
        whether to also return the confidence interval bounds

    Returns
    ----------
    test_result: bool
        whether the test rejects (True) the null hypothesis.
        If return_bootstrap_est is True, the estimators computed on bootstrap
        samples are returned as well (it might be worth also returning the
        interval itself, for length and shape comparison).
    """
    # convert to arrays for treatment and control groups
    # !!! some notebooks have to be updated to pass arrays
    # data_treat = treatment_cookie_buckets.values
    # data_control = control_cookie_buckets.values

    # stacking bootstrap estimators
    bootstrap_est = list()
    # number of histogram bins
    num_of_bins = len(bins_boundaries) - 1
    # obtain bins given boundaries
    bins_tuples = [(bins_boundaries[i - 1], bins_boundaries[i])
                   for i in range(1, num_of_bins + 1)]
    # get the number of cookie buckets for treatment and check that it matches control
    num_of_cookie_buckets = treatment_cookie_buckets.shape[0]
    assert num_of_cookie_buckets == control_cookie_buckets.shape[0]
    # possible indices to perform resampling
    indices = np.arange(num_of_cookie_buckets)

    for i in range(num_of_boot_samples):
        if paired:
            # get bootstrap indices shared by treatment and control
            ind_resampled = np.random.choice(indices, size=num_of_cookie_buckets)
            # get a bootstrap array for treatment and control
            # and compute the resulting histogram
            boot_treat = treatment_cookie_buckets[ind_resampled, :].sum(axis=0)
            boot_control = control_cookie_buckets[ind_resampled, :].sum(axis=0)
            # compute estimator
        else:
            # get bootstrap indices for treatment and control separately
            ind_treat = np.random.choice(indices, size=num_of_cookie_buckets)
            ind_control = np.random.choice(indices, size=num_of_cookie_buckets)
            # get a bootstrap array for treatment and control separately
            # and compute the resulting histogram
            boot_treat = treatment_cookie_buckets[ind_treat, :].sum(axis=0)
            boot_control = control_cookie_buckets[ind_control, :].sum(axis=0)
            # compute estimator
        if estimator_type == 'quantile':
            quant_treat = compute_quantile_hist_data(
                boot_treat, bins_tuples, quantile)
            quant_control = compute_quantile_hist_data(
                boot_control, bins_tuples, quantile)
            assert quant_control > 0
            bootstrap_est += [100 * (quant_treat / quant_control - 1)]

    # convert list to array for simplicity
    bootstrap_est = np.array(bootstrap_est)

    # compute estimator on original sample
    quant_treat = compute_quantile_hist_data(
        treatment_cookie_buckets.sum(axis=0), bins_tuples, quantile)
    quant_control = compute_quantile_hist_data(
        control_cookie_buckets.sum(axis=0), bins_tuples, quantile)
    est_ratio = 100 * (quant_treat / quant_control - 1)

    # compute bias correction
    pre_z = (bootstrap_est <= est_ratio).mean()
    z_0 = norm.ppf(pre_z)

    # compute leave-one-bucket-out estimators
    leave_one_out_est = list()
    for i in range(num_of_cookie_buckets):
        # leave one cookie bucket out at a time
        ind_treat = np.delete(indices, i)
        ind_control = np.delete(indices, i)
        # get corresponding histograms
        current_data_treat = treatment_cookie_buckets[ind_treat, :].sum(axis=0)
        current_data_control = control_cookie_buckets[ind_control, :].sum(axis=0)
        # compute estimator
        if estimator_type == 'quantile':
            quant_treat = compute_quantile_hist_data(
                current_data_treat, bins_tuples, quantile)
            quant_control = compute_quantile_hist_data(
                current_data_control, bins_tuples, quantile)
            leave_one_out_est += [100 * (quant_treat / quant_control - 1)]

    # convert list to array for simplicity
    leave_one_out_est = np.array(leave_one_out_est)
    # take the mean for further computation of the influence functions
    est_mean = leave_one_out_est.mean()
    # compute influence functions
    infl_fns = (num_of_cookie_buckets - 1) * (est_mean - leave_one_out_est)
    # compute acceleration factor
    num = sum(infl_fns ** 3)
    den = sum(infl_fns ** 2) ** (3 / 2)
    accel_factor = num / (6 * den)

    # compute left and right quantiles of the standard normal
    left_q, right_q = norm.ppf([(1 - ci_level) / 2, (1 + ci_level) / 2])
    # transform using bias correction and acceleration
    left_bound = z_0 + (z_0 + left_q) / (1 - accel_factor * (z_0 + left_q))
    right_bound = z_0 + (z_0 + right_q) / (1 - accel_factor * (z_0 + right_q))
    # apply gaussian transform
    left_bound, right_bound = norm.cdf([left_bound, right_bound])

    # apply inverse transform using the empirical cdf of the bootstrap samples
    sample_edf = edf.ECDF(bootstrap_est)
    slope_changes = sorted(set(bootstrap_est))
    sample_edf_values_at_slope_changes = [
        sample_edf(item) for item in slope_changes]
    inverted_edf = interp1d(sample_edf_values_at_slope_changes, slope_changes)
    left_bound, right_bound = inverted_edf([left_bound, right_bound])

    if return_interval:
        if return_bootstrap_est:
            # in case we want to return the estimators for analyses
            if 0 < left_bound or right_bound < 0:
                return True, [left_bound, right_bound], bootstrap_est
            else:
                return False, [left_bound, right_bound], bootstrap_est
        else:
            # in case we just need to perform the test
            if 0 < left_bound or right_bound < 0:
                return True, [left_bound, right_bound]
            else:
                return False, [left_bound, right_bound]
    else:
        if return_bootstrap_est:
            # in case we want to return the estimators for analyses
            if 0 < left_bound or right_bound < 0:
                return True, bootstrap_est
            else:
                return False, bootstrap_est
        else:
            # in case we just need to perform the test
            if 0 < left_bound or right_bound < 0:
                return True
            else:
                return False
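# A compact, self-contained BCa sketch for a simple statistic (the mean), following the
# same steps as the function above: bias correction from the bootstrap distribution,
# acceleration from jackknife influence values, then a percentile lookup. Illustrative
# only; it uses numpy percentiles instead of the inverted ECDF for the final step.
def _example_bca_mean(sample, num_boot=2000, ci_level=0.95, seed=0):
    import numpy as np
    from scipy.stats import norm
    rng = np.random.default_rng(seed)
    sample = np.asarray(sample, dtype=float)
    n = sample.size
    est = sample.mean()
    boot = np.array([rng.choice(sample, size=n).mean() for _ in range(num_boot)])
    z_0 = norm.ppf((boot <= est).mean())                        # bias correction
    jack = np.array([np.delete(sample, i).mean() for i in range(n)])
    infl = (n - 1) * (jack.mean() - jack)                       # influence values
    accel = (infl ** 3).sum() / (6 * ((infl ** 2).sum()) ** 1.5)
    left_q, right_q = norm.ppf([(1 - ci_level) / 2, (1 + ci_level) / 2])

    def adjusted(q):
        # BCa-adjusted percentile of the bootstrap distribution
        return norm.cdf(z_0 + (z_0 + q) / (1 - accel * (z_0 + q)))

    return np.percentile(boot, [100 * adjusted(left_q), 100 * adjusted(right_q)])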
def run(output_suffix): # Import historical tmeperature data df_temp = pd.read_excel( 'Synthetic_streamflows/input/hist_temps_1953_2007.xlsx') his_temp_matrix = df_temp.values # Import calender calender = pd.read_excel( 'Synthetic_streamflows/input/BPA_hist_streamflow.xlsx', sheet_name='Calender', header=None) calender = calender.values julian = calender[:, 2] ############################### # Synthetic HDD CDD calculation # Simulation data sim_weather = pd.read_csv( 'Synthetic_weather/output/synthetic_weather_data' + output_suffix + '.csv', header=0) # Load temperature data only cities = [ 'SALEM_T', 'EUGENE_T', 'SEATTLE_T', 'BOISE_T', 'PORTLAND_T', 'SPOKANE_T', 'FRESNO_T', 'LOS ANGELES_T', 'SAN DIEGO_T', 'SACRAMENTO_T', 'SAN JOSE_T', 'SAN FRANCISCO_T', 'TUCSON_T', 'PHOENIX_T', 'LAS VEGAS_T' ] sim_temperature = sim_weather[cities] # Convert temperatures to Fahrenheit sim_temperature = (sim_temperature * (9 / 5)) + 32 sim_temperature = sim_temperature.values num_cities = len(cities) num_sim_days = len(sim_temperature) HDD_sim = np.zeros((num_sim_days, num_cities)) CDD_sim = np.zeros((num_sim_days, num_cities)) # calculate daily records of heating (HDD) and cooling (CDD) degree days for i in range(0, num_sim_days): for j in range(0, num_cities): HDD_sim[i, j] = np.max((0, 65 - sim_temperature[i, j])) CDD_sim[i, j] = np.max((0, sim_temperature[i, j] - 65)) # calculate annual totals of heating and cooling degree days for each city annual_HDD_sim = np.zeros((int(len(HDD_sim) / 365), num_cities)) annual_CDD_sim = np.zeros((int(len(CDD_sim) / 365), num_cities)) for i in range(0, int(len(HDD_sim) / 365)): for j in range(0, num_cities): annual_HDD_sim[i, j] = np.sum(HDD_sim[0 + (i * 365):365 + (i * 365), j]) annual_CDD_sim[i, j] = np.sum(CDD_sim[0 + (i * 365):365 + (i * 365), j]) ######################################################################## #Calculate HDD and CDD for historical temperature data num_cities = len(cities) num_days = len(his_temp_matrix) # daily records HDD = np.zeros((num_days, num_cities)) CDD = np.zeros((num_days, num_cities)) for i in range(0, num_days): for j in range(0, num_cities): HDD[i, j] = np.max((0, 65 - his_temp_matrix[i, j + 1])) CDD[i, j] = np.max((0, his_temp_matrix[i, j + 1] - 65)) # annual sums annual_HDD = np.zeros((int(len(HDD) / 365), num_cities)) annual_CDD = np.zeros((int(len(CDD) / 365), num_cities)) for i in range(0, int(len(HDD) / 365)): for j in range(0, num_cities): annual_HDD[i, j] = np.sum(HDD[0 + (i * 365):365 + (i * 365), j]) annual_CDD[i, j] = np.sum(CDD[0 + (i * 365):365 + (i * 365), j]) ########################################################################################### #This section is used for calculating total hydro # Load relevant streamflow data (1953-2007) BPA_streamflow = pd.read_excel( 'Synthetic_streamflows/input/BPA_hist_streamflow.xlsx', sheet_name='Inflows', header=0) Hoover_streamflow = pd.read_csv( 'Synthetic_streamflows/input/Hoover_hist_streamflow.csv', header=0) CA_streamflow = pd.read_excel( 'Synthetic_streamflows/input/CA_hist_streamflow.xlsx', header=0) Willamette_streamflow = pd.read_csv( 'Synthetic_streamflows/input/Willamette_hist_streamflow.csv', header=0) # headings name_Will = list(Willamette_streamflow.loc[:, 'Albany':]) name_CA = list(CA_streamflow.loc[:, 'ORO_fnf':]) name_BPA = list(BPA_streamflow.loc[:, '1M':]) # number of streamflow gages considered num_BPA = len(name_BPA) num_CA = len(name_CA) num_Will = len(name_Will) num_gages = num_BPA + num_CA + num_Will + 1 # Calculate historical totals for 
1953-2007 years = range(1953, 2008) for y in years: y_index = years.index(y) BPA = BPA_streamflow.loc[BPA_streamflow['year'] == y, '1M':] CA = CA_streamflow.loc[CA_streamflow['year'] == y, 'ORO_fnf':] WB = Willamette_streamflow.loc[Willamette_streamflow['year'] == y, 'Albany':] HO = Hoover_streamflow.loc[Hoover_streamflow['year'] == y, 'Discharge'] BPA_sums = np.reshape(np.sum(BPA, axis=0).values, (1, num_BPA)) CA_sums = np.reshape(np.sum(CA, axis=0).values, (1, num_CA)) WB_sums = np.reshape(np.sum(WB, axis=0).values, (1, num_Will)) HO_sums = np.reshape(np.sum(HO, axis=0), (1, 1)) # matrix of annual flows for each stream gage joined = np.column_stack((BPA_sums, CA_sums, WB_sums, HO_sums)) if y_index < 1: hist_totals = joined else: hist_totals = np.vstack((hist_totals, joined)) BPA_headers = np.reshape(list(BPA_streamflow.loc[:, '1M':]), (1, num_BPA)) CA_headers = np.reshape(list(CA_streamflow.loc[:, 'ORO_fnf':]), (1, num_CA)) WB_headers = np.reshape(list(Willamette_streamflow.loc[:, 'Albany':]), (1, num_Will)) HO_headers = np.reshape(['Hoover'], (1, 1)) headers = np.column_stack( (BPA_headers, CA_headers, WB_headers, HO_headers)) # annual streamflow totals for 1953-2007 df_hist_totals = pd.DataFrame(hist_totals) df_hist_totals.columns = headers[0, :] df_hist_totals.loc[38, '83L'] = df_hist_totals.loc[36, '83L'] added_value = abs(np.min((df_hist_totals))) + 5 log_hist_total = np.log(df_hist_totals + abs(added_value)) A = df_hist_totals.values B = np.column_stack((A, annual_HDD, annual_CDD)) x, y = np.shape(B) #data is the data matrix at all time step. The dimention would be X*Y #data 2 is required if calculating disimilarity #Step 1: Transform the data into emperical CDF P = np.zeros((x, y)) for i in range(0, y): ECDF = edis.ECDF(B[:, i]) P[:, i] = ECDF(B[:, i]) Y = 2 * (P - 0.5) new_cols = ['Name'] + ['type_' + str(i) for i in range(0, 141)] #remove constant zeros columns need_to_remove = [1, 17, 22, 24, 27, 32, 34, 36, 37, 38, 44, 107, 108, 109] Y2 = np.delete(Y, need_to_remove, axis=1) Y[:, 107] = 1 mean = np.mean(Y, axis=0) cov = np.cov(Y, rowvar=0) runs = int(num_sim_days / 365) * 5 sim_years = int(num_sim_days / 365) N = np.random.multivariate_normal(mean, cov, runs) T = (N / 2) + 0.5 T_all = np.zeros((runs, y)) for i in range(0, y): for j in range(0, runs): if T[j, i] < 0: T_all[j, i] = (np.percentile(B[:, i], q=0 * 100)) * (1 + T[j, i]) elif T[j, i] <= 1 and T[j, i] >= 0: T_all[j, i] = np.percentile(B[:, i], q=T[j, i] * 100) else: T_all[j, i] = (np.percentile(B[:, i], q=1 * 100)) * T[j, i] Sim_total = T_all[:, :112] Sim_HDD_CDD = T_all[:, 112:] Sim_CDD = Sim_HDD_CDD[:, 15:] Sim_HDD = Sim_HDD_CDD[:, :15] ###################################### #sns.kdeplot(annual_CDD[:,0],label='His') #sns.kdeplot(annual_CDD_sim[:,0],label='Syn') #sns.kdeplot(Sim_HDD_CDD[:,15],label='Capula') #plt.legend() # #sns.kdeplot(annual_HDD[:,0],label='His') #sns.kdeplot(annual_HDD_sim[:,0],label='Syn') #sns.kdeplot(Sim_HDD_CDD[:,0],label='Capula') #plt.legend() ######################################### HDD_CDD = np.column_stack((annual_HDD_sim, annual_CDD_sim)) year_list = np.zeros(int(num_sim_days / 365)) Best_RMSE = 9999999999 CHECK = np.zeros((sim_years, runs)) for i in range(0, sim_years): for j in range(0, runs): RMSE = (np.sum(np.abs(HDD_CDD[i, :] - Sim_HDD_CDD[j, :]))) CHECK[i, j] = RMSE if RMSE <= Best_RMSE: year_list[i] = j Best_RMSE = RMSE else: pass Best_RMSE = 9999999999 sim_totals = np.zeros((sim_years, num_gages)) for i in range(0, sim_years): sim_totals[i, :] = Sim_total[int(year_list[i]), :] 
################################################################################### #C_1=np.corrcoef(sim_totals,rowvar=0) #C_his=np.corrcoef(A,rowvar=0) #import seaborn as sns; sns.set() # #grid_kws = {"height_ratios": (.9, .05), "hspace": .3} #fig,ax=plt.subplots() #plt.rcParams["font.weight"] = "bold" #plt.rcParams["axes.labelweight"] = "bold" #ax1=plt.subplot(121) #sns.heatmap(C_1,vmin=0,vmax=1,cbar=False) #plt.axis('off') #ax.set_title('Syn') # # # #ax2=plt.subplot(122) #cbar_ax = fig.add_axes([.92, .15, .03, .7]) # <-- Create a colorbar axes # #fig2=sns.heatmap(C_his,ax=ax2,cbar_ax=cbar_ax,vmin=0,vmax=1) #cbar=ax2.collections[0].colorbar #cbar.ax.tick_params(labelsize='large') # #fig2.axis('off') # # # ################################################################################## #plt.figure() #sns.kdeplot(A[:,0],label='His') #sns.kdeplot(sim_totals[:,0],label='Syn') #sns.kdeplot(Sim_total[:,0],label='Capula') #plt.legend() # #plt.figure() #sns.kdeplot(A[:,5],label='His') #sns.kdeplot(sim_totals[:,5],label='Syn') #sns.kdeplot(Sim_total[:,5],label='Capula') #plt.legend() # #plt.figure() #sns.kdeplot(A[:,52],label='His') #sns.kdeplot(sim_totals[:,52],label='Syn') #sns.kdeplot(Sim_total[:,52],label='Capula') #plt.legend() # #plt.figure() #sns.kdeplot(A[:,55],label='His') #sns.kdeplot(sim_totals[:,55],label='Syn') #sns.kdeplot(Sim_total[:,55],label='Capula') #plt.legend() # #plt.figure() #sns.kdeplot(A[:,56],label='His') #sns.kdeplot(sim_totals[:,56],label='Syn') #sns.kdeplot(Sim_total[:,56],label='Capula') #plt.legend() # #plt.figure() #sns.kdeplot(A[:,66],label='His') #sns.kdeplot(sim_totals[:,66],label='Syn') #sns.kdeplot(Sim_total[:,66],label='Capula') #plt.legend() ################################################################################## # impose logical constraints mins = np.min(df_hist_totals.loc[:, :'Hoover'], axis=0) for i in range(0, num_gages): lower_bound = mins[i] for j in range(0, sim_years): if sim_totals[j, i] < lower_bound: sim_totals[j, i] = lower_bound * np.random.uniform(0, 1) df_sim_totals = pd.DataFrame(sim_totals) H = list(headers) df_sim_totals.columns = H #A1=[] #A2=[] #for h in H: # a1=np.average(df_hist_totals.loc[:,h]) # a2=np.average(df_sim_totals.loc[:,h]) # A1.append(a1) # A2.append(a2) # #plt.plot(A1) #plt.plot(A2) ##################################################################################### # This section selects daily fractions which are paired with # annual totals to arrive at daily streamflows # 4 cities are nearest to all 109 stream gage sites Fraction_calculation_cities = ['Spokane', 'Boise', 'Sacramento', 'Fresno'] # Each is weighted by average annual flow at nearby gage sites Temperature_weights = pd.read_excel( 'Synthetic_streamflows/input/city_weights.xlsx', header=0) # historical temperatures for those 4 cities fraction_hist_temp = df_temp[Fraction_calculation_cities] fraction_hist_temp_matrix = fraction_hist_temp.values # calculate daily record of weighted temperatures across 4 cities weighted_T = np.zeros(len(fraction_hist_temp_matrix)) for i in range(0, len(fraction_hist_temp_matrix)): weighted_T[i] = fraction_hist_temp_matrix[ i, 0] * Temperature_weights['Spokane'] + fraction_hist_temp_matrix[ i, 1] * Temperature_weights['Boise'] + fraction_hist_temp_matrix[ i, 2] * Temperature_weights[ 'Sacramento'] + fraction_hist_temp_matrix[ i, 3] * Temperature_weights['Fresno'] # synthetic temperatures for each of the cities fcc = list(['SPOKANE_T', 'BOISE_T', 'SACRAMENTO_T', 'FRESNO_T']) fraction_sim = sim_weather[fcc] 
fraction_sim_matrix = fraction_sim.values weighted_T_sim = np.zeros(len(fraction_sim_matrix)) # calculate synthetic weighted temperature (in Fahrenheit) for i in range(0, len(fraction_sim_matrix)): weighted_T_sim[i] = fraction_sim_matrix[i, 0] * Temperature_weights[ 'Spokane'] + fraction_sim_matrix[i, 1] * Temperature_weights[ 'Boise'] + fraction_sim_matrix[i, 2] * Temperature_weights[ 'Sacramento'] + fraction_sim_matrix[ i, 3] * Temperature_weights['Fresno'] weighted_T_sim = (weighted_T_sim * (9 / 5)) + 32 #Sample synthetic fractions, then combine with totals sim_years = int(len(fraction_sim_matrix) / 365) sim_T = np.zeros((365, sim_years)) hist_years = int(len(fraction_hist_temp) / 365) hist_T = np.zeros((365, hist_years)) # reshape historical and simulated weighted temperatures in new variables for i in range(0, hist_years): hist_T[:, i] = weighted_T[i * 365:365 + (i * 365)] for i in range(0, sim_years): sim_T[:, i] = weighted_T_sim[i * 365:365 + (i * 365)] # aggregate weighted temperatures into monthly values Normal_Starting = datetime(1900, 1, 1) datelist = pd.date_range(Normal_Starting, periods=365) count = 0 m = np.zeros(365) for i in range(0, 365): m[i] = int(datelist[count].month) count = count + 1 if count > 364: count = 0 hist_T_monthly = np.column_stack((hist_T, m)) monthly_hist_T = np.zeros((12, hist_years)) for i in range(0, sim_years): for j in range(1, 13): d1 = hist_T_monthly[hist_T_monthly[:, hist_years] == j] d2 = d1[:, :hist_years] monthly_hist_T[j - 1, :] = np.sum(d2, axis=0) Normal_Starting = datetime(1900, 1, 1) datelist = pd.date_range(Normal_Starting, periods=365) count = 0 m = np.zeros(365) for i in range(0, 365): m[i] = int(datelist[count].month) count = count + 1 if count > 364: count = 0 sim_T_monthly = np.column_stack((sim_T, m)) monthly_sim_T = np.zeros((12, sim_years)) for i in range(0, sim_years): for j in range(1, 13): d1 = sim_T_monthly[sim_T_monthly[:, sim_years] == j] d2 = d1[:, :sim_years] monthly_sim_T[j - 1, :] = np.sum(d2, axis=0) # select historical year with most similar spring and summer temperatures # to new simulated years year_list = np.zeros(sim_years) Best_RMSE = 9999999999 CHECK = np.zeros((sim_years, hist_years)) for i in range(0, sim_years): for j in range(0, hist_years): RMSE = (np.sum( np.abs(monthly_sim_T[3:8, i] - monthly_hist_T[3:8, j]))) CHECK[i, j] = RMSE if RMSE <= Best_RMSE: year_list[i] = j Best_RMSE = RMSE else: pass Best_RMSE = 9999999999 ################################################################################ #Generate streamflow TDA = np.zeros((int(365 * sim_years), 2)) totals_hist = np.zeros((num_gages, hist_years)) fractions_hist = np.zeros((hist_years, 365, num_gages)) totals_hist_hoover = np.zeros((1, hist_years)) output_BPA = np.zeros((sim_years * 365, num_BPA)) output_Hoover = np.zeros((sim_years * 365, 1)) output_CA = np.zeros((sim_years * 365, num_CA)) output_WI = np.zeros((sim_years * 365, num_Will)) # historical daily flows x_Hoover = Hoover_streamflow.loc[:, 'Discharge'].values x_BPA = BPA_streamflow.loc[:, '1M':].values x_CA = CA_streamflow.loc[:, 'ORO_fnf':].values x_WI = Willamette_streamflow.loc[:, 'Albany':'COT5A'].values x = np.column_stack((x_BPA, x_CA, x_WI, x_Hoover)) x = np.reshape(x, (hist_years, 365, num_gages)) # historical daily fractions for i in range(0, hist_years): for j in range(0, num_gages): totals_hist[j, i] = np.sum(np.abs(x[i, :, j])) if totals_hist[j, i] == 0: fractions_hist[i, :, j] = 0 else: fractions_hist[i, :, j] = x[i, :, j] / totals_hist[j, i] # sample simulated daily 
fractions for i in range(0, sim_years): for j in range(0, num_gages): if j <= num_BPA - 1: output_BPA[(i * 365):(i * 365) + 365, j] = fractions_hist[int(year_list[i]), :, j] * sim_totals[i, j] elif j == num_gages - 1: output_Hoover[(i * 365):(i * 365) + 365, 0] = fractions_hist[int(year_list[i]), :, j] * sim_totals[i, j] elif j > num_BPA - 1 and j <= num_BPA + num_CA - 1: output_CA[(i * 365):(i * 365) + 365, j - num_BPA] = fractions_hist[int(year_list[i]), :, j] * sim_totals[i, j] else: output_WI[(i * 365):(i * 365) + 365, j - num_BPA - num_CA] = fractions_hist[int(year_list[i]), :, j] * sim_totals[i, j] TDA[(i * 365):(i * 365) + 365, 0] = range(1, 366) # assign flows to the Dalles, OR TDA[:, 1] = output_BPA[:, 47] ############################################################################### # # Output # np.savetxt('Synthetic_streamflows/output/synthetic_streamflows_FCRPS.csv',output_BPA,delimiter=',') # np.savetxt('Synthetic_streamflows/output/synthetic_streamflows_TDA.csv',TDA[:,1],delimiter=',') # np.savetxt('Synthetic_streamflows/output/synthetic_discharge_Hoover.csv',output_Hoover,delimiter=',') # CA=pd.DataFrame(output_CA,columns=name_CA) # CA.to_csv('Synthetic_streamflows/output/synthetic_streamflows_CA.csv') # Willamatte_Syn=pd.DataFrame(output_WI,columns=name_Will) # Willamatte_Syn.to_csv('Synthetic_streamflows/output/synthetic_streamflows_Willamette.csv') #write CA synthetic flows to ORCA file leap_cycles = int(sim_years // 4) r = np.shape(output_CA) for i in range(0, leap_cycles): if i < 1: C = output_CA[0:1154, :] B = np.empty((1, int(r[1]))) B[:] = np.nan D = output_CA[i * 1460 + 1154:i * 1460 + 1154 + 1460] F = np.vstack((C, B, D)) else: D = output_CA[i * 1460 + 1154:i * 1460 + 1154 + 1460] F = np.vstack((F, B, D)) df_leap = pd.DataFrame(F, columns=name_CA) df_leap.to_csv('Synthetic_streamflows/output/ORCA_forecast_flows' + output_suffix + '.csv')
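# The annual-total simulation above follows a copula-like recipe: map each column of
# historical totals through its ECDF, fit a multivariate normal to the transformed
# data, sample from it, and map the samples back through the historical percentiles.
# A condensed, self-contained sketch of that recipe (illustrative only; it uses a
# standard Gaussian copula via norm.ppf/cdf rather than the [-1, 1] rescaling used
# in run()).
def _example_copula_style_sampling(hist, n_samples=1000, seed=0):
    import numpy as np
    from scipy.stats import norm
    from statsmodels.distributions.empirical_distribution import ECDF
    rng = np.random.default_rng(seed)
    hist = np.asarray(hist, dtype=float)              # shape: (n_years, n_variables)
    n, k = hist.shape
    # step 1: ECDF-transform each column into (0, 1), then into standard normal scores
    u = np.column_stack([ECDF(hist[:, j])(hist[:, j]) * n / (n + 1) for j in range(k)])
    z = norm.ppf(u)
    # step 2: sample correlated normals, then map back through historical percentiles
    draws = rng.multivariate_normal(z.mean(axis=0), np.cov(z, rowvar=False), n_samples)
    p = norm.cdf(draws)
    sim = np.column_stack([np.percentile(hist[:, j], 100 * p[:, j]) for j in range(k)])
    return sim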