def optimize_stationary_for_period_and_all_cells(
        data_file = 'data/streamflows/hydrosheds_euler9/aex_discharge_1970_01_01_00_00.nc',
        paramfile = 'gev_params_stationary',
        high_flow = True,
        start_month = 1,
        end_month = 12,
        start_date = datetime(1970,1,1,0,0),
        end_date = datetime(1999,12, 31,0,0),
        event_duration = timedelta(days = 1)):
    """
    Fit stationary parameters for every cell of a streamflow file, with caching.

    If ``paramfile`` already exists it is unpickled and returned without
    any further work; delete the file to force a new optimization.
    Otherwise the period extremes (maxima when ``high_flow`` is true,
    minima otherwise) are extracted for every column of the streamflow
    array, the optimization is run on them, and the result is pickled
    to ``paramfile``.

    data_file - path of the streamflow file read with data_select
    paramfile - path of the pickle file used as the cache
    high_flow - select period maxima (True) or period minima (False)
    start_month, end_month, start_date, end_date, event_duration -
        passed unchanged to the data_select extreme selection functions

    returns the parameter set produced by
    optimize_stationary_for_period_and_all_cells_using_data
    """
    print(paramfile)

    #check whether optimization is required
    if os.path.isfile(paramfile):
        print('already optimized, if you want to reoptimize delete %s' % paramfile)
        # pickle data is binary: the file has to be opened in 'rb' mode
        with open(paramfile, 'rb') as cache:
            return pickle.load(cache)

    #get streamflow data
    streamflow, times, xs, ys = data_select.get_data_from_file(path = data_file)

    if high_flow:
        select_extremes = data_select.get_period_maxima
    else:
        select_extremes = data_select.get_period_minima

    data = []
    for pos in range(streamflow.shape[1]):
        data1 = select_extremes(streamflow[:, pos], times,
                                start_date = start_date,
                                end_date = end_date,
                                start_month = start_month,
                                end_month = end_month,
                                event_duration = event_duration)
        data.append(list(data1.values()))

    # rows were collected per cell, the optimizer gets the transposed array
    data = np.array(data).transpose()
    pars_set = optimize_stationary_for_period_and_all_cells_using_data(data = data,
                                                                       high_flow = high_flow)

    # binary mode for the same reason as above; 'with' closes the file on errors too
    with open(paramfile, 'wb') as cache:
        pickle.dump(pars_set, cache)
    return pars_set
def get_extremes_list(data_path = "", member_ids = None, high_flow = True,
                      start_date = None, end_date = None,
                      event_duration = timedelta(days = 1),
                      period_start_month = 1, period_end_month = 12):
    """
    Collect the period extremes of every selected member file in a folder.

    A file in data_path is selected when the part of its name before the
    first underscore is contained in member_ids.

    returns (list of 2d arrays of extremes, i_indices, j_indices), where
    each 2d array has the shape = (time, cell_index) and the indices are
    those read from the last processed file (None when no file matched)
    """
    selected_paths = [
        os.path.join(data_path, file_name)
        for file_name in os.listdir(data_path)
        if file_name.split('_')[0] in member_ids
    ]

    #merge extreme data
    extremes_per_file = []
    i_indices = j_indices = None
    for member_path in selected_paths:
        streamflow, times, i_indices, j_indices = data_select.get_data_from_file(member_path)

        # maxima for high flows, minima for low flows
        if high_flow:
            pick_extremes = data_select.get_period_maxima
        else:
            pick_extremes = data_select.get_period_minima

        cell_rows = []
        for cell in range(len(i_indices)):
            selected = pick_extremes(streamflows = streamflow[:, cell],
                                     times = times,
                                     start_date = start_date,
                                     end_date = end_date,
                                     event_duration = event_duration,
                                     start_month = period_start_month,
                                     end_month = period_end_month)
            cell_rows.append(list(selected.values()))

        # collected as (cell_index, time), stored as (time, cell_index)
        extremes_per_file.append(np.transpose(np.array(cell_rows)))

    return extremes_per_file, i_indices, j_indices
def kw_test_for_means(current_climate = True,
                      data_folder = 'data/streamflows/hydrosheds_euler9',
                      months = list(range(1,13))):
    """
    returns p-values resulting from kruskal - wallis test on annual means

    Members are taken from members.all_current when current_climate is
    true and from members.all_future otherwise. One p-value is computed
    per position (second axis of the per-member matrices of means), the
    samples being the columns of the different members at that position.
    """
    if current_climate:
        member_ids = members.all_current
    else:
        member_ids = members.all_future

    # a file belongs to a member when its name starts with "<member id>_"
    member_files = [
        os.path.join(data_folder, file_name)
        for file_name in os.listdir(data_folder)
        if file_name.split("_")[0] in member_ids
    ]

    means_per_member = []
    for member_file in member_files:
        streamflow, times, _i_indices, _j_indices = data_select.get_data_from_file(member_file)

        #for each year and for each gridcell get mean value for the period
        means_by_key = data_select.get_means_over_months_for_each_year(times, streamflow,
                                                                       months = months)
        # order the rows by the dictionary key (presumably the year)
        ordered_rows = [means_by_key[key] for key in sorted(means_by_key)]
        means_per_member.append(np.array(ordered_rows))

    n_positions = means_per_member[0].shape[1]
    p_values = np.zeros((n_positions,))
    for pos in range(n_positions):
        columns = [member_means[:, pos] for member_means in means_per_member]
        # only the p-value of the (statistic, p-value) pair is kept
        _statistic, p_values[pos] = kruskalwallis(*columns)
    return p_values
def apply_bootstrap(data_path = '', member_name = 'aex',
                    period_start_month = 1, period_end_month = 12,
                    start_date = datetime(1970,1,1,0,0),
                    end_date = datetime(1999,12,31,0,0),
                    event_duration_days = timedelta(days = 1),
                    n_samples = 1,
                    high_flow = True,
                    return_periods = [],
                    process_pool = None):
    """
    Applying bootstrap to the given file at data_path

    The results go to the file "<member_name>_<high|low>_std_dev"; when
    that file already exists nothing is done. All other arguments are
    forwarded unchanged to apply_bootstrap_to_data.
    """
    flow_kind = 'high' if high_flow else 'low'
    out_file = '_'.join((member_name, flow_kind, 'std_dev'))

    # never recompute existing results
    if os.path.isfile(out_file):
        print('%s exists already ' % out_file)
        return

    #get streamflow data
    assert os.path.isfile(data_path)
    streamflow, times, xs, ys = data_select.get_data_from_file(data_path)

    apply_bootstrap_to_data(
        streamflow,
        times = times,
        period_start_month = period_start_month,
        period_end_month = period_end_month,
        start_date = start_date,
        end_date = end_date,
        event_duration_days = event_duration_days,
        n_samples = n_samples,
        high_flow = high_flow,
        return_periods = return_periods,
        process_pool = process_pool,
        out_file = out_file,
    )
def __init__(self, data_path=""):
    """
    Load the data stored at data_path and set the default query settings.

    data_path - path of the file read with data_select.get_data_from_file;
                the part of its base name before the first underscore is
                kept as the id of the object
    """
    data, times, i_indices, j_indices = data_select.get_data_from_file(data_path)

    # "<id>_<anything>" -> id; a base name without "_" raises ValueError
    member_id, _remainder = os.path.basename(data_path).split("_", 1)
    self._id = member_id

    # data as read from the file
    self._data = data
    self._times = times
    self._i_indices = i_indices
    self._j_indices = j_indices

    # defaults of the selection of extremes
    self.high_flow = True  # low_flows are calculated if False
    self.return_period_years = 2
    self.event_duration = timedelta(days=1)
    self.start_date = None
    self.end_date = None
    self.start_month = 1
    self.end_month = 12

    # fields that are not computed at construction time
    self.data_extremes = None
    self.median_field = None
    self.ret_level_2yr = None

    # coordinates of the points described by (i_indices, j_indices)
    self.longitudes = polar_stereographic.lons[i_indices, j_indices]
    self.latitudes = polar_stereographic.lats[i_indices, j_indices]
def apply_bootstrap_to_all_members_merged(file_paths = None, high_flow = True, n_samples = 10,
                                          out_file = '', process_pool = None,
                                          start_date = None, end_date = None,
                                          start_month = None, end_month = None,
                                          duration_days = None,
                                          return_periods = None):
    """
    Apply bootstrap to the extremes of several members merged together.

    The extremes (period maxima when high_flow is true, minima otherwise)
    of all files in file_paths are concatenated per position and passed
    to bootstrap.apply_bootstrap_to_extremes, which is asked to restrict
    indices to a member. Nothing is done when out_file already exists.

    file_paths - paths of the member files, at least one is required
    duration_days - timedelta object
    out_file, n_samples, process_pool, return_periods - forwarded to
        bootstrap.apply_bootstrap_to_extremes

    raises ValueError when file_paths is None or empty
    """
    if os.path.isfile(out_file):
        print("{0} already exists, skipping ".format(out_file))
        return

    # without input files there is nothing to merge (and nothing to divide by below)
    if file_paths is None or len(file_paths) == 0:
        raise ValueError("file_paths must contain at least one data file path")

    #select data
    all_extremes = []
    streamflow = None
    for the_path in file_paths:
        print(the_path)
        streamflow, times, i_indices, j_indices = data_select.get_data_from_file(the_path)

        # one list per position, created when the first file is read
        if not len(all_extremes):
            all_extremes = [[] for i in range(streamflow.shape[1])]

        if high_flow:
            select_extremes = data_select.get_period_maxima
        else:
            select_extremes = data_select.get_period_minima

        for pos in range(streamflow.shape[1]):
            data1 = select_extremes(streamflow[:, pos], times,
                                    start_date = start_date,
                                    end_date = end_date,
                                    start_month = start_month,
                                    end_month = end_month,
                                    event_duration = duration_days)
            all_extremes[pos].extend(list(data1.values()))

    #axes order: (time, position)
    all_extremes = np.array(all_extremes).transpose()

    # a count of values: floor division keeps it an integer ("/" gives a float in Python 3)
    n_values_per_member = all_extremes.shape[0] // len(file_paths)

    bootstrap.apply_bootstrap_to_extremes(all_extremes,
                                          n_samples = n_samples,
                                          out_file = out_file,
                                          process_pool = process_pool,
                                          return_periods = return_periods,
                                          positions = range(streamflow.shape[1]),
                                          high_flow = high_flow,
                                          restrict_indices_to_member = True,
                                          n_values_per_member = n_values_per_member)
    print("n_indices per member = ", n_values_per_member)
def gev_fit_all_members(
    high_flow=True,
    member_ids=[],
    data_folder="",
    file_name_pattern="",
    start_date=None,
    end_date=None,
    start_month=1,
    end_month=12,
    duration_days=timedelta(days=1),
):
    """
    gev fit using data from all members

    The result is cached in a pickle file in the current directory named
    "high" or "low" followed by "_<id>" for every member id. When that
    file exists it is loaded and returned without refitting.

    member_ids - ids of the members, inserted into file_name_pattern
    data_folder - path to the folder with input data (streamflow)
    file_name_pattern - format string receiving the member id
    start_date, end_date, start_month, end_month, duration_days -
        passed unchanged to the data_select extreme selection functions

    returns the parameter set produced by
    gevfit.optimize_stationary_for_period_and_all_cells_using_data
    """
    param_file = "high" if high_flow else "low"
    for member_id in member_ids:
        param_file += "_" + member_id

    if os.path.isfile(param_file):
        print("delete {0}, to reoptimize".format(param_file))
        # the cache is written in binary mode below, so it is read in binary mode
        with open(param_file, "rb") as cache:
            return pickle.load(cache)

    # select data
    path_pattern = os.path.join(data_folder, file_name_pattern)
    all_extremes = []
    for member_id in member_ids:
        print(member_id)
        the_path = path_pattern.format(member_id)
        streamflow, times, i_indices, j_indices = data_select.get_data_from_file(the_path)

        # one list per position, created when the first member is read
        if not len(all_extremes):
            for i in range(streamflow.shape[1]):
                all_extremes.append([])

        if high_flow:
            select_extremes = data_select.get_period_maxima
        else:
            select_extremes = data_select.get_period_minima

        for pos in range(streamflow.shape[1]):
            data1 = select_extremes(
                streamflow[:, pos],
                times,
                start_date=start_date,
                end_date=end_date,
                start_month=start_month,
                end_month=end_month,
                event_duration=duration_days,
            )
            all_extremes[pos].extend(list(data1.values()))

    # axes order: (time, position)
    all_extremes = np.array(all_extremes).transpose()

    # `all_extremes is None` can never be true for the array created above;
    # missing values can only show up as None elements of an object array
    if all_extremes.dtype == object and any(value is None for value in all_extremes.flat):
        raise AssertionError("all_extremes = " + str(all_extremes))

    # optimize
    print(all_extremes.shape)
    # NOTE(review): hard-coded number of positions expected by this analysis
    assert all_extremes.shape[1] == 547, "all_extremes.shape[1] != 547"

    param_set = gevfit.optimize_stationary_for_period_and_all_cells_using_data(
        data=all_extremes, high_flow=high_flow
    )
    with open(param_file, "wb") as cache:
        pickle.dump(param_set, cache)
    return param_set
def calculate_and_plot(return_period = 10,
                       return_level_function = ret_level_getters[0],
                       ax = None):
    """
    Plot the relative change (future vs current, in %) of return levels.

    For every pair of current/future members the return levels are
    computed from the fitted parameters, the change is marked as
    significant where |future - current| > 1.96 * (std_current + std_future)
    (and the standard deviations and the current level are valid), and
    the change is plotted. Afterwards either the significance count or
    the ensemble mean change is plotted in the last panel.

    return_period - return period passed to return_level_function and
                    used to select the standard deviations
    return_level_function - called as f(pars, return_period); when it is
                    gevfit.get_high_ret_level_stationary the levels are
                    treated as 'high', otherwise as 'low'
    ax - axes for the member plots; when None a subplot is created and
         the figure is saved to '<return_period>_<level_type>_change_rl.png'

    Uses xs, ys, basemap, plot and the get_*_for_member_and_type helpers,
    which are not defined in this function (presumably module level).
    """
    save_fig_to_file = (ax is None)

    if return_level_function == gevfit.get_high_ret_level_stationary:
        level_type = 'high'
    else:
        level_type = 'low'

    fig = plt.figure()
    assert isinstance(fig, Figure)

    save_to_txt = False
    current_ids = ["ccsm-crcm-current"] #members.current_ids
    future_ids = ["ccsm-crcm-future"] #members.future_ids
    current2future = dict(list(zip(current_ids, future_ids)))

    #folder_path = 'data/streamflows/hydrosheds_euler9/'
    folder_path = "data/streamflows/narccap_ccsm-crcm"
    coord_file = os.path.join(folder_path, '{0}_discharge_1970_01_01_00_00.nc'.format(current_ids[0]))
    i_indices, j_indices = data_select.get_indices_from_file(coord_file)
    significance_counter = None
    #plt.subplots_adjust(left = 0., hspace = 0.2, wspace = 0.2)

    labels = ["", "(b)", "(c)", "(d)", "(e)", "(f)"]

    ##for querying high flow data for saving to text file
    current_query = None
    future_query = None
    current_highs = None
    future_highs = None
    if level_type == 'high' and save_to_txt:
        high_period_start_month = 3
        high_period_end_month = 7
        current_start_date = datetime(1970,1,1,0,0)
        current_end_date = datetime(1999,12,31,0,0)
        future_start_date = datetime(2041,1,1,0,0)
        future_end_date = datetime(2070,12,31,0,0)

        future_query = QueryObject()
        future_query.start_date = future_start_date
        future_query.end_date = future_end_date
        future_query.event_duration = timedelta(days = 1)
        future_query.start_month = high_period_start_month
        future_query.end_month = high_period_end_month

        current_query = QueryObject()
        current_query.start_date = current_start_date
        current_query.end_date = current_end_date
        current_query.event_duration = timedelta(days = 1)
        current_query.start_month = high_period_start_month
        current_query.end_month = high_period_end_month

    gs = gridspec.GridSpec(3,2)
    current_id_to_changes = {}
    all_current = []
    all_future = []
    all_stds_current = []
    all_stds_future = []
    for k, current_id in enumerate(current_ids):
        if level_type == 'high' and save_to_txt:
            # os.path.join: folder_path has no trailing separator, plain
            # string concatenation would glue the folder and file names together
            current_path = os.path.join(folder_path, '{0}_discharge_1970_01_01_00_00.nc'.format(current_id))
            future_path = os.path.join(folder_path, '{0}_discharge_2041_01_01_00_00.nc'.format(current2future[current_id]))
            current_data, times_current, x_indices, y_indices = data_select.get_data_from_file(current_path)
            future_data, times_future, x_indices, y_indices = data_select.get_data_from_file(future_path)
            current_highs = data_select.get_period_maxima_query(current_data, times_current, current_query)
            future_highs = data_select.get_period_maxima_query(future_data, times_future, future_query)

        #get current return levels
        pars_list = get_pars_for_member_and_type(current_id, level_type)
        return_levels_current = np.zeros(len(pars_list))
        for pos, pars in enumerate(pars_list):
            return_levels_current[pos] = return_level_function(pars, return_period)
        stdevs_current = get_stdevs_for_member_and_type(current_id, level_type)[return_period]

        #get future return levels
        future_id = current2future[current_id]
        pars_list = get_pars_for_member_and_type(future_id, level_type)
        return_levels_future = np.zeros(len(pars_list))
        for pos, pars in enumerate(pars_list):
            return_levels_future[pos] = return_level_function(pars, return_period)
        stdevs_future = get_stdevs_for_member_and_type(future_id, level_type)[return_period]

        change = return_levels_future - return_levels_current
        if significance_counter is None:
            significance_counter = np.zeros( change.shape )

        print('minmax(std_current)')
        print(np.min(stdevs_current), np.max(stdevs_current))
        print('minmax(std_future)')
        print(np.min(stdevs_future), np.max(stdevs_future))
        print('min max min abs(rl_current - rl_future)')
        the_delta = np.abs(return_levels_future - return_levels_current)
        print(np.min(the_delta), np.max(the_delta), np.mean(the_delta))

        #stdev = -1 - stands for undefined value
        condition = np.logical_and(np.abs(change) > 1.96 * ( stdevs_current + stdevs_future ),
                                   (stdevs_current >= 0) & (stdevs_future >= 0) & (return_levels_current > 0)
                                   )
        sign_index = np.where(condition)
        significance_counter[sign_index] += 1
        print(len(sign_index[0]))

        all_current.append(return_levels_current)
        all_future.append(return_levels_future)
        all_stds_current.append(stdevs_current)
        all_stds_future.append(stdevs_future)

        # absolute change -> change relative to the current level, in percent
        change /= return_levels_current
        change *= 100.0

        min_change = np.min(change)
        print(return_levels_current[change == min_change], return_levels_future[change == min_change], min_change)

        if not level_type == "high":
            delta = 100
            lower_limit = 0 if min_change >= 0 else np.floor(min_change / 10.0) * 10
        else:
            delta = 50
            lower_limit = np.floor(min_change / 10.0 ) * 10

        # zeros everywhere, masked where the change is significant
        not_significant = np.zeros(change.shape)
        not_significant = np.ma.masked_where(condition, not_significant)

        if level_type == 'high':
            assert np.all(return_levels_current > 0)
            assert np.all(stdevs_current >= 0)
            assert np.all(stdevs_future >= 0)

        #temp change to condition
        #change = np.ma.masked_where(np.logical_not(condition), change)
        print('Plotting: current %s, future %s' % (current_id, future_id))
        current_id_to_changes[current_id] = change

        # NOTE(review): ax is only created for the first member, later
        # members are drawn into the same axes
        if ax is None:
            ax = fig.add_subplot(gs[k // 2, k % 2])
        plot(change , i_indices, j_indices, xs, ys,
             title = "{0}-year {1} flow".format(return_period, level_type),
             label = labels[k],
             color_map = mycolors.get_red_blue_colormap(ncolors = 20),
             units = '%',
             basemap = basemap,
             minmax = (-delta, delta),
             colorbar_label_format = '%d',
             upper_limited = True,
             colorbar_tick_locator = LinearLocator(numticks = 11),
             not_significant_mask = not_significant,
             impose_lower_limit=lower_limit,
             ax= ax
             )
        if return_period == 10 and level_type == 'high' and save_to_txt:
            txt_saver.save_to_file_rls_and_sign(current_id, return_period,
                                                return_levels_current, return_levels_future,
                                                stdevs_current, stdevs_future,
                                                condition, current_highs, future_highs)

    plt.subplot(gs[2,1])
    plot_sign_count = False
    plot_significance = True
    if plot_sign_count:
        #if plotting significance count
        significance_counter = np.ma.masked_where(significance_counter == 0, significance_counter)
        plot(significance_counter, i_indices, j_indices, xs, ys,
             title = 'Significance Count',
             label = labels[5],
             minmax = (1,6),
             color_map = mycolors.get_sign_count_cmap(ncolors = 5),
             basemap = basemap,
             colorbar_tick_locator = MaxNLocator(nbins = 5),
             colorbar_label_format = '%d'
             )

        #TODO plot +/-
        # points where all the members agree on the sign of the change
        plus_change = None
        minus_change = None
        for current_id, the_change in current_id_to_changes.items():
            if plus_change is None:
                plus_change = (the_change > 0)
                minus_change = (the_change < 0)
            else:
                plus_change = np.logical_and(the_change > 0, plus_change)
                minus_change = np.logical_and(the_change < 0, minus_change)

        #should be at least one member with significant changes
        plus_change = np.logical_and(plus_change, significance_counter > 0)
        minus_change = np.logical_and(minus_change, significance_counter > 0)

        x_interest = xs[i_indices, j_indices]
        y_interest = ys[i_indices, j_indices]
        x_plus = x_interest[plus_change]
        y_plus = y_interest[plus_change]
        x_minus = x_interest[minus_change]
        y_minus = y_interest[minus_change]

        basemap.scatter(x_plus, y_plus, marker = "+", color = "m", s = 15, zorder = 5, linewidth = 1)
        if len(x_minus) > 0:
            basemap.scatter(x_minus, y_minus, marker = "d", zorder = 6)
    else:
        #plot ensemble mean
        all_current = np.array( all_current )
        all_future = np.array( all_future )
        all_stds_current = np.array( all_stds_current )
        all_stds_future = np.array( all_stds_future )

        mean_current = np.mean(all_current, axis = 0)
        mean_future = np.mean(all_future, axis = 0)
        mean_stds_current = np.mean( all_stds_current, axis = 0 )
        mean_stds_future = np.mean( all_stds_future, axis = 0 )

        min_change = np.min((mean_future - mean_current)/mean_current * 100.0)
        if not level_type == "high":
            delta = 100
            lower_limit = 0 if min_change >= 0 else np.floor(min_change / 10.0) * 10
        else:
            delta = 100
            lower_limit = np.floor(min_change / 10.0 ) * 10

        not_significant = np.absolute(mean_future - mean_current) <= 1.96 * (mean_stds_current + mean_stds_future)
        not_significant = not_significant.astype(int)
        print(" sum(not_significant) = ", np.sum(not_significant))
        not_significant = np.ma.masked_where(~(not_significant == 1), not_significant)
        # integer zero: an in-place multiplication of the integer array by
        # the float 0.0 is rejected by the numpy casting rules
        not_significant *= 0

        if not plot_significance:
            not_significant = None

        plot((mean_future - mean_current) / mean_current * 100.0, i_indices, j_indices, xs, ys,
             title = "",
             label = labels[-1],
             color_map = mycolors.get_red_blue_colormap(ncolors = 20),
             units = '%',
             basemap = basemap,
             minmax = (-delta, delta),
             colorbar_label_format = '%d',
             upper_limited = True,
             colorbar_tick_locator = LinearLocator(numticks = 11),
             not_significant_mask = not_significant,
             impose_lower_limit = lower_limit
             )

    if save_fig_to_file:
        plt.tight_layout()
        plt.savefig('%d_%s_change_rl.png' % (return_period, level_type))