示例#1
0
文件: gevfit.py 项目: guziy/GevFit
def optimize_stationary_for_period_and_all_cells(
                data_file = 'data/streamflows/hydrosheds_euler9/aex_discharge_1970_01_01_00_00.nc',
                paramfile = 'gev_params_stationary',
                high_flow = True,
                start_month = 1, end_month = 12,
                start_date = datetime(1970,1,1,0,0),
                end_date = datetime(1999,12, 31,0,0),
                event_duration = timedelta(days = 1)):
    """
    Fit stationary parameters for every position of a streamflow file and
    cache the result in ``paramfile``.

    data_file - path handed to data_select.get_data_from_file
    paramfile - pickle cache; when it already exists its content is returned
                and nothing is recomputed (delete the file to reoptimize)
    high_flow - True: use period maxima, False: use period minima
    start_month, end_month, start_date, end_date, event_duration -
                passed unchanged to data_select.get_period_maxima / minima

    Returns whatever optimize_stationary_for_period_and_all_cells_using_data
    returns for the selected extremes (or the previously pickled object).
    """

    print(paramfile)

    #check whether optimization is required
    if os.path.isfile(paramfile):
        print('already optimized, if you want to reoptimize delete %s' % paramfile)
        # pickle streams are bytes: the file has to be opened in binary mode.
        # NOTE: pickle.load executes arbitrary code, only point paramfile at
        # files produced by this function.
        with open(paramfile, 'rb') as f:
            return pickle.load(f)

    #get streamflow data
    streamflow, times, xs, ys = data_select.get_data_from_file(path = data_file)

    # both selectors are called with the same arguments, pick one up front
    if high_flow:
        select_extremes = data_select.get_period_maxima
    else:
        select_extremes = data_select.get_period_minima

    data = []
    for pos in range(streamflow.shape[1]):
        data1 = select_extremes(streamflow[:, pos], times,
                        start_date = start_date,
                        end_date = end_date,
                        start_month = start_month,
                        end_month = end_month,
                        event_duration = event_duration
                        )
        data.append(list(data1.values()))

    # one row per position so far, transpose to get one column per position
    data = np.array(data).transpose()
    pars_set = optimize_stationary_for_period_and_all_cells_using_data(data = data,
                                                    high_flow = high_flow)
    with open(paramfile, 'wb') as f:
        pickle.dump(pars_set, f)
    return pars_set
示例#2
0
def get_extremes_list(data_path = "", member_ids = None, high_flow = True,
                        start_date = None, end_date = None,
                        event_duration = timedelta(days = 1),
                        period_start_month = 1, period_end_month = 12
                        ):
    """
    Collect the period extremes of every matching file found in data_path.

    A file matches when the part of its name before the first '_' is
    contained in member_ids. For each matching file one 2d array is produced,
    shape = (time, cell_index).

    Returns (list of 2d arrays, i_indices, j_indices); the indices are those
    read from the last processed file, or None when no file matched.
    """
    member_files = [
        os.path.join(data_path, entry)
        for entry in os.listdir(data_path)
        if entry.split('_')[0] in member_ids
    ]

    extremes_per_file = []
    i_indices = j_indices = None
    for member_file in member_files:
        streamflow, times, i_indices, j_indices = data_select.get_data_from_file(member_file)

        # one list of extreme values per cell of the current file
        per_cell = []
        for cell in range(len(i_indices)):
            if high_flow:
                selector = data_select.get_period_maxima
            else:
                selector = data_select.get_period_minima
            selected = selector(streamflows=streamflow[:, cell], times = times,
                                start_date = start_date, end_date = end_date,
                                event_duration = event_duration,
                                start_month = period_start_month,
                                end_month = period_end_month)
            per_cell.append(list(selected.values()))

        # (cell_index, time) -> (time, cell_index)
        extremes_per_file.append(np.array(per_cell).transpose())

    return extremes_per_file, i_indices, j_indices
示例#3
0
def kw_test_for_means(current_climate = True, data_folder = 'data/streamflows/hydrosheds_euler9', months = list(range(1,13))):
    """
    Run the Kruskal-Wallis test on the per-year means of the members and
    return one p-value per grid position.

    current_climate - use members.all_current when True, members.all_future otherwise
    data_folder - folder scanned for files whose name prefix (text before the
                  first '_') is one of the selected member ids
    months - forwarded to data_select.get_means_over_months_for_each_year
    """
    if current_climate:
        the_ids = members.all_current
    else:
        the_ids = members.all_future

    member_files = [
        os.path.join(data_folder, name)
        for name in os.listdir(data_folder)
        if name.split("_")[0] in the_ids
    ]

    # one (year, position) matrix per member
    yearly_means = []
    for member_file in member_files:
        streamflow, times, i_indices, j_indices = data_select.get_data_from_file(member_file)

        # mean over the selected months, for each year and each grid cell
        means_by_year = data_select.get_means_over_months_for_each_year(times, streamflow, months = months)

        # order the per-year rows by their dictionary key
        pairs = list(means_by_year.items())
        pairs.sort(key=lambda pair: pair[0])
        yearly_means.append(np.array([values for _, values in pairs]))

    n_positions = yearly_means[0].shape[1]
    p_values = np.zeros((n_positions,))
    for pos in range(n_positions):
        # one sample per member: the time series of means at this position
        samples = [member_means[:, pos] for member_means in yearly_means]
        h, p_values[pos] = kruskalwallis(*samples)
    return p_values
示例#4
0
文件: bootstrap.py 项目: guziy/GevFit
def apply_bootstrap(data_path = '',
                    member_name = 'aex',
                    period_start_month = 1, period_end_month = 12,
                    start_date = datetime(1970,1,1,0,0),
                    end_date = datetime(1999,12,31,0,0),
                    event_duration_days = timedelta(days = 1),
                    n_samples = 1, high_flow = True,
                    return_periods = [], process_pool = None
                    ):
    """
    Run the bootstrap on the streamflow file at data_path.

    The output file name is '<member_name>_<high|low>_std_dev'; when that file
    already exists the function prints a notice and returns without doing
    anything. Otherwise data_path must be an existing file (asserted), its
    data is read and handed, together with all remaining arguments, to
    apply_bootstrap_to_data.
    """
    flow_kind = 'high' if high_flow else 'low'
    out_file = '_'.join([member_name, flow_kind, 'std_dev'])

    if os.path.isfile(out_file):
        print('%s exists already ' % out_file)
        return

    # read the streamflow data of the member
    assert os.path.isfile(data_path)
    streamflow, times, xs, ys = data_select.get_data_from_file(data_path)

    bootstrap_options = dict(
        times = times,
        period_start_month = period_start_month,
        period_end_month = period_end_month,
        start_date = start_date,
        end_date = end_date,
        event_duration_days = event_duration_days,
        n_samples = n_samples,
        high_flow = high_flow,
        return_periods = return_periods,
        process_pool = process_pool,
        out_file = out_file,
    )
    apply_bootstrap_to_data(streamflow, **bootstrap_options)
示例#5
0
    def __init__(self, data_path=""):
        """
        Load the data stored at data_path and set the default query settings.

        The member id is the part of the file name before the first "_"
        (the file name therefore has to contain an underscore).
        """
        flow, time_axis, rows, cols = data_select.get_data_from_file(data_path)
        file_name = os.path.basename(data_path)
        self._id, _ = file_name.split("_", 1)

        # raw content of the file
        self._data, self._times = flow, time_axis
        self._i_indices, self._j_indices = rows, cols

        # results, filled in later
        self.data_extremes = None
        self.median_field = None
        self.ret_level_2yr = None

        # selection settings
        self.return_period_years = 2
        self.high_flow = True  # low flows are calculated if False
        self.start_date = None
        self.end_date = None
        self.start_month, self.end_month = 1, 12
        self.event_duration = timedelta(days=1)

        # coordinates of the cells, looked up in the polar stereographic grid
        self.longitudes = polar_stereographic.lons[rows, cols]
        self.latitudes = polar_stereographic.lats[rows, cols]
def apply_bootstrap_to_all_members_merged(file_paths = None,
                                high_flow = True,
                                n_samples = 10, out_file = '',
                                process_pool = None,
                                start_date = None,
                                end_date = None,
                                start_month = None,
                                end_month = None,
                                duration_days = None,
                                return_periods = None
                                ):
    """
    Merge the period extremes of all given files and bootstrap the merged sample.

    file_paths - paths of the member files, at least one is required
    high_flow - True: period maxima are used, False: period minima
    out_file - passed to bootstrap.apply_bootstrap_to_extremes; when the file
               already exists nothing is done
    duration_days - timedelta object, passed as event_duration to the selectors
    start_date, end_date, start_month, end_month - passed to the selectors
    n_samples, process_pool, return_periods - passed to
               bootstrap.apply_bootstrap_to_extremes

    Raises ValueError when file_paths is None or empty.
    """

    if os.path.isfile(out_file):
        print("{0} already exists, skipping ".format(out_file))
        return

    # without input files there is nothing to merge (and nothing to divide by below)
    if file_paths is None or len(file_paths) == 0:
        raise ValueError("file_paths must contain at least one data file")

    # both selectors are called with the same arguments, pick one up front
    if high_flow:
        select_extremes = data_select.get_period_maxima
    else:
        select_extremes = data_select.get_period_minima

    #select data
    all_extremes = []
    streamflow = None
    for the_path in file_paths:
        print(the_path)
        streamflow, times, i_indices, j_indices = data_select.get_data_from_file(the_path)

        # one list per position, created when the first file is read
        if not len(all_extremes):
            all_extremes = [[] for _ in range(streamflow.shape[1])]

        for pos in range(streamflow.shape[1]):
            data1 = select_extremes(streamflow[:, pos], times,
                            start_date = start_date,
                            end_date = end_date,
                            start_month = start_month,
                            end_month = end_month,
                            event_duration = duration_days
                            )
            all_extremes[pos].extend(list(data1.values()))

    #axes order: (time, position)
    all_extremes = np.array(all_extremes).transpose()

    # integer division: this is a count of values, "/" would give a float on Python 3
    n_values_per_member = all_extremes.shape[0] // len(file_paths)

    bootstrap.apply_bootstrap_to_extremes(all_extremes,
                                        n_samples = n_samples,
                                        out_file = out_file,
                                        process_pool = process_pool,
                                        return_periods = return_periods,
                                        positions = range(streamflow.shape[1]),
                                        high_flow = high_flow,
                                        restrict_indices_to_member=True,
                                        n_values_per_member= n_values_per_member
                                        )
    print("n_indices per member = ", n_values_per_member)
示例#7
0
def gev_fit_all_members(
    high_flow=True,
    member_ids=[],
    data_folder="",
    file_name_pattern="",
    start_date=None,
    end_date=None,
    start_month=1,
    end_month=12,
    duration_days=timedelta(days=1),
):
    """
    gev fit using the extremes of all members merged into one sample

    high_flow - True: period maxima are fitted, False: period minima
    member_ids - ids of the members, each one is substituted into file_name_pattern
    data_folder - path to the folder with input data (streamflow)
    file_name_pattern - format string with one placeholder for the member id
    start_date, end_date, start_month, end_month - passed to the selectors
    duration_days - timedelta, passed as event_duration to the selectors

    The result is cached in the current directory in a pickle file named
    "high" or "low" followed by "_<id>" for every member; when that file
    exists its content is returned without refitting.
    """

    param_file = "_".join(["high" if high_flow else "low"] + list(member_ids))
    if os.path.isfile(param_file):
        print("delete {0}, to reoptimize".format(param_file))
        # the cache is written in binary mode below, so read it in binary mode.
        # NOTE: pickle.load executes arbitrary code, only load files written here.
        with open(param_file, "rb") as cache:
            return pickle.load(cache)

    # select data
    path_pattern = os.path.join(data_folder, file_name_pattern)
    all_extremes = []
    for member_id in member_ids:
        print(member_id)
        the_path = path_pattern.format(member_id)
        streamflow, times, i_indices, j_indices = data_select.get_data_from_file(the_path)

        # one list per position, created when the first member is read
        if not len(all_extremes):
            all_extremes = [[] for _ in range(streamflow.shape[1])]

        # both selectors are called with the same arguments
        if high_flow:
            select_extremes = data_select.get_period_maxima
        else:
            select_extremes = data_select.get_period_minima

        for pos in range(streamflow.shape[1]):
            data1 = select_extremes(
                streamflow[:, pos],
                times,
                start_date=start_date,
                end_date=end_date,
                start_month=start_month,
                end_month=end_month,
                event_duration=duration_days,
            )
            all_extremes[pos].extend(list(data1.values()))

    # axes order: (time, position)
    all_extremes = np.array(all_extremes).transpose()

    # a None among the values turns the array into an object array;
    # check the elements ("all_extremes is None" can never be true here)
    if all_extremes.dtype == object and any(value is None for value in all_extremes.ravel()):
        raise AssertionError("all_extremes = " + str(all_extremes))

    # optimize
    print(all_extremes.shape)
    # NOTE(review): 547 is a hard-coded number of positions, presumably the
    # size of the study domain - confirm before using other data
    assert all_extremes.shape[1] == 547, "all_extremes.shape[1] != 547"
    param_set = gevfit.optimize_stationary_for_period_and_all_cells_using_data(data=all_extremes, high_flow=high_flow)
    with open(param_file, "wb") as cache:
        pickle.dump(param_set, cache)
    return param_set
def calculate_and_plot(return_period = 10,
                       return_level_function = ret_level_getters[0], ax = None):
    """
    Plot the relative change (in %) of return levels between current and
    future members.

    For every current member the fitted parameters and the standard deviations
    of the member and of its future counterpart are looked up, the return
    levels are evaluated with return_level_function and the change
    (future - current) / current * 100 is plotted. A change counts as
    significant where |future - current| > 1.96 * (std_current + std_future),
    both standard deviations are non-negative and the current return level is
    positive. Afterwards either the significance count or, with the flags
    hard-coded below, the change of the ensemble means is drawn in the last
    cell of the 3x2 grid.

    return_period - passed to return_level_function and used as the key into
                    the standard deviations of a member
    return_level_function - callable(pars, return_period); the plot is treated
                    as 'high' flow when it is gevfit.get_high_ret_level_stationary
                    and as 'low' flow otherwise
    ax - axes for the member panel; when None an axes is created and the figure
         is saved to '<return_period>_<level_type>_change_rl.png'

    NOTE: xs, ys, basemap, plot and the get_*_for_member_and_type helpers are
    not defined in this function, they have to exist at module level.
    """

    save_fig_to_file = (ax is None)
    if return_level_function == gevfit.get_high_ret_level_stationary:
        level_type = 'high'
    else:
        level_type = 'low'

    fig = plt.figure()
    assert isinstance(fig, Figure)


    save_to_txt = False
    current_ids = ["ccsm-crcm-current"] #members.current_ids
    future_ids = ["ccsm-crcm-future"] #members.future_ids
    current2future = dict(list(zip(current_ids, future_ids)))

    #folder_path = 'data/streamflows/hydrosheds_euler9/'
    folder_path = "data/streamflows/narccap_ccsm-crcm"
    coord_file = os.path.join(folder_path, '{0}_discharge_1970_01_01_00_00.nc'.format(current_ids[0]))
    i_indices, j_indices = data_select.get_indices_from_file(coord_file)
    significance_counter = None
    #plt.subplots_adjust(left = 0., hspace = 0.2, wspace = 0.2)

    labels = ["", "(b)", "(c)", "(d)", "(e)", "(f)"]

    ##for querying high flow data for saving to text file
    current_query = None
    future_query = None
    current_highs = None
    future_highs = None
    if level_type == 'high' and save_to_txt:
        high_period_start_month = 3
        high_period_end_month = 7

        current_start_date = datetime(1970,1,1,0,0)
        current_end_date = datetime(1999,12,31,0,0)

        future_start_date = datetime(2041,1,1,0,0)
        future_end_date = datetime(2070,12,31,0,0)

        future_query = QueryObject()
        future_query.start_date = future_start_date
        future_query.end_date = future_end_date
        future_query.event_duration = timedelta(days = 1)
        future_query.start_month = high_period_start_month
        future_query.end_month = high_period_end_month

        current_query = QueryObject()
        current_query.start_date = current_start_date
        current_query.end_date = current_end_date
        current_query.event_duration = timedelta(days = 1)
        current_query.start_month = high_period_start_month
        current_query.end_month = high_period_end_month


    gs = gridspec.GridSpec(3,2)
    current_id_to_changes = {}
    all_current = []
    all_future = []
    all_stds_current = []
    all_stds_future = []
    for k, current_id in enumerate(current_ids):
        if level_type == 'high' and save_to_txt:
            # folder_path has no trailing separator, so the paths are joined
            # the same way as coord_file above
            current_path = os.path.join(folder_path,
                                        '{0}_discharge_1970_01_01_00_00.nc'.format(current_id))
            future_path = os.path.join(folder_path,
                                       '{0}_discharge_2041_01_01_00_00.nc'.format(current2future[current_id]))
            current_data, times_current, x_indices, y_indices = data_select.get_data_from_file(current_path)
            future_data, times_future, x_indices, y_indices = data_select.get_data_from_file(future_path)

            current_highs = data_select.get_period_maxima_query(current_data, times_current, current_query)
            future_highs = data_select.get_period_maxima_query(future_data, times_future, future_query)

        #get current return levels
        pars_list = get_pars_for_member_and_type(current_id, level_type)
        return_levels_current = np.zeros(len(pars_list))
        for pos, pars in enumerate(pars_list):
            return_levels_current[pos] = return_level_function(pars, return_period)
        stdevs_current = get_stdevs_for_member_and_type(current_id, level_type)[return_period]


        #get future return levels
        future_id = current2future[current_id]
        pars_list = get_pars_for_member_and_type(future_id, level_type)
        return_levels_future = np.zeros(len(pars_list))
        for pos, pars in enumerate(pars_list):
            return_levels_future[pos] = return_level_function(pars, return_period)
        stdevs_future = get_stdevs_for_member_and_type(future_id, level_type)[return_period]


        change = return_levels_future - return_levels_current
        if significance_counter is None:
            significance_counter = np.zeros( change.shape )


        print('minmax(std_current)')
        print(np.min(stdevs_current), np.max(stdevs_current))
        print('minmax(std_future)')
        print(np.min(stdevs_future), np.max(stdevs_future))

        # the three printed numbers are min, max and mean of the absolute difference
        print('min max mean abs(rl_current - rl_future)')
        the_delta = np.abs(return_levels_future - return_levels_current)
        print(np.min(the_delta), np.max(the_delta), np.mean(the_delta))

        #stdev = -1 - stands for undefined value
        condition = np.logical_and(np.abs(change) > 1.96 * ( stdevs_current + stdevs_future ),
                                   (stdevs_current >= 0) & (stdevs_future >= 0)
                                   & (return_levels_current > 0)
                                )

        sign_index = np.where(condition)
        significance_counter[sign_index] += 1

        print(len(sign_index[0]))

        all_current.append(return_levels_current)
        all_future.append(return_levels_future)

        all_stds_current.append(stdevs_current)
        all_stds_future.append(stdevs_future)

        # absolute change -> change in % of the current return level (in place)
        change /= return_levels_current
        change *= 100.0


        min_change = np.min(change)
        print(return_levels_current[change == min_change], return_levels_future[change == min_change], min_change)

        if level_type != "high":
            delta = 100
            lower_limit = 0 if min_change >= 0 else np.floor(min_change / 10.0) * 10
        else:
            delta = 50
            lower_limit = np.floor(min_change / 10.0 ) * 10


        # zeros, masked where the change is significant
        not_significant = np.zeros(change.shape)
        not_significant = np.ma.masked_where(condition, not_significant)


        if level_type == 'high':
            assert np.all(return_levels_current > 0)
            assert np.all(stdevs_current >= 0)
            assert np.all(stdevs_future >= 0)

        #temp change to condition
        #change = np.ma.masked_where(np.logical_not(condition), change)
        print('Plotting: current %s, future %s' % (current_id, future_id))

        current_id_to_changes[current_id] = change
        # NOTE(review): ax is only created on the first pass, with more than
        # one member all panels would be drawn on the same axes
        if ax is None:
            ax = fig.add_subplot(gs[k // 2, k % 2])
        plot(change , i_indices, j_indices, xs, ys,
                    title = "{0}-year {1} flow".format(return_period, level_type), label = labels[k],
                    color_map = mycolors.get_red_blue_colormap(ncolors = 20), units = '%',
                    basemap = basemap, minmax = (-delta, delta),
                    colorbar_label_format = '%d',
                    upper_limited = True, colorbar_tick_locator = LinearLocator(numticks = 11),
                    not_significant_mask = not_significant
                    , impose_lower_limit=lower_limit, ax= ax

                    )
        if return_period == 10 and level_type == 'high' and save_to_txt:
            txt_saver.save_to_file_rls_and_sign(current_id, return_period,
                                      return_levels_current, return_levels_future,
                                      stdevs_current, stdevs_future,
                                      condition, current_highs, future_highs)


    plt.subplot(gs[2,1])

    plot_sign_count = False
    plot_significance = True
    if plot_sign_count: #if plotting significance count
        significance_counter = np.ma.masked_where(significance_counter == 0, significance_counter)
        plot(significance_counter, i_indices, j_indices, xs, ys,
             title = 'Significance Count', label = labels[5], minmax = (1,6),
             color_map = mycolors.get_sign_count_cmap(ncolors = 5), basemap = basemap,
             colorbar_tick_locator = MaxNLocator(nbins = 5),
             colorbar_label_format = '%d'
             )

        #TODO plot +/-
        plus_change = None
        minus_change = None

        # positions where all members agree on the sign of the change
        for current_id, the_change in current_id_to_changes.items():
            if plus_change is None:
                plus_change = (the_change > 0)
                minus_change = (the_change < 0)
            else:
                plus_change = np.logical_and(the_change > 0, plus_change)
                minus_change = np.logical_and(the_change < 0, minus_change)

        #should be at least one member with significant changes
        plus_change = np.logical_and(plus_change, significance_counter > 0)
        minus_change = np.logical_and(minus_change, significance_counter > 0)

        x_interest = xs[i_indices, j_indices]
        y_interest = ys[i_indices, j_indices]


        x_plus = x_interest[plus_change]
        y_plus = y_interest[plus_change]
        x_minus = x_interest[minus_change]
        y_minus = y_interest[minus_change]

        basemap.scatter(x_plus, y_plus, marker = "+", color = "m", s = 15, zorder = 5, linewidth = 1)

        if len(x_minus) > 0:
            basemap.scatter(x_minus, y_minus, marker = "d", zorder = 6)
    else:
        #plot ensemble mean
        all_current = np.array( all_current )
        all_future = np.array( all_future )
        all_stds_current = np.array( all_stds_current )
        all_stds_future = np.array( all_stds_future )


        mean_current = np.mean(all_current, axis = 0)
        mean_future = np.mean(all_future, axis = 0)
        mean_stds_current = np.mean( all_stds_current, axis = 0 )
        mean_stds_future = np.mean( all_stds_future, axis = 0 )

        min_change = np.min((mean_future - mean_current)/mean_current * 100.0)
        delta = 100
        if level_type != "high":
            lower_limit = 0 if min_change >= 0 else np.floor(min_change / 10.0) * 10
        else:
            lower_limit = np.floor(min_change / 10.0 ) * 10

        is_not_significant = np.absolute(mean_future - mean_current) <= 1.96 * (mean_stds_current + mean_stds_future)
        print(" sum(not_significant) = ", np.sum(is_not_significant))

        if plot_significance:
            # zeros, masked where the change is significant; built from a float
            # array like in the member loop (multiplying an integer masked
            # array in place by 0.0 fails with a casting error on current numpy)
            not_significant = np.ma.masked_where(~is_not_significant,
                                                 np.zeros(is_not_significant.shape))
        else:
            not_significant = None

        plot((mean_future - mean_current) / mean_current * 100.0, i_indices, j_indices, xs, ys,
                    title = "", label = labels[-1],
                    color_map = mycolors.get_red_blue_colormap(ncolors = 20), units = '%',
                    basemap = basemap, minmax = (-delta, delta),
                    colorbar_label_format = '%d',
                    upper_limited = True, colorbar_tick_locator = LinearLocator(numticks = 11),
                    not_significant_mask = not_significant,
                    impose_lower_limit = lower_limit
                    )


    if save_fig_to_file:
        plt.tight_layout()
        plt.savefig('%d_%s_change_rl.png' % (return_period, level_type))