def fit_incs_pd(ascs, ascs_month, months, flag=0):
    # This function takes a list of increasing periods, the month label of each,
    # and one or more target months, and returns the fitted parameters of the
    # distribution of increments for those months.
    
    ascs_select = list()
    for m in range(len(ascs)):
        #if ascs_month[m] == months:
        if np.in1d(ascs_month[m], months):
            ascs_select.append(ascs[m])
            
    L = len(ascs_select)
    incs = list()
    
    for k in range(L):
        asc_temp = ascs_select[k]
        if hasattr(asc_temp, "__len__"):
            asc1 = asc_temp[1:]
            asc2 = asc_temp[:-1]
            incs.extend(np.subtract(asc1, asc2))
        else:
            pass
    
    incs = list(filter(lambda a: a > 0, incs))
    if flag == 0:
        optparms = fitweibull(incs)
    elif flag == 1:
        optparms = fitlognorm(incs)
    elif flag == 2:
        optparms = gumbel_r.fit(incs)
    elif flag == 3:
        optparms = powerlaw.fit(incs)
    elif flag == 4:
        optparms = genextreme.fit(incs)
    return optparms
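A minimal usage sketch (not from the original source), assuming numpy and scipy.stats.gumbel_r are imported in the module that defines fit_incs_pd, and using hypothetical toy data; flag=2 selects the Gumbel branch, so the custom fitweibull/fitlognorm helpers are not needed:

import numpy as np
from scipy.stats import gumbel_r

# Hypothetical toy data: three increasing sequences and the month label of each.
ascs = [np.array([1.0, 2.5, 4.0]), np.array([0.5, 1.5]), np.array([2.0, 3.0, 5.5])]
ascs_month = [1, 2, 1]

# Fit the increment distribution for January events using the Gumbel branch (flag=2).
loc, scale = fit_incs_pd(ascs, ascs_month, months=[1], flag=2)
print(loc, scale)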
Example #2
 def fetch_duration_params(self, start_id):
     d = dict()
     start_time = time.time()
     journey_df = self.df_from_sql(
         f"""
         SELECT
             "EndStation Id"
             ,Duration / 60 AS Duration
         FROM
             journeys
         WHERE
             "StartStation Id" = {start_id}
             AND year >= {self.min_year}
             AND weekday_ind = 1
             -- Query plan more efficient if we specify these rather than ESid > 0
             AND "EndStation Id" != -1
             AND "EndStation Id" NOT NULL
             {self.additional_filters}
         """
     )
     print(f"fetched {len(journey_df)} journeys for station {start_id} in {(time.time()-start_time)} seconds")
     start_time = time.time()
     journey_df.dropna(subset=['Duration'], inplace=True)
     for end_id in journey_df["EndStation Id"].unique():
         durations = journey_df.loc[journey_df["EndStation Id"] == end_id]['Duration'].values
         # creates a tuple of scipy.stats.gumbel_r parameters
         params = gumbel_r.fit(durations)
         d[end_id] = params
     print(f"\tfitted {len(journey_df)} journeys in {(time.time()-start_time)/60} minutes")
     return d
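A hedged follow-up sketch (names and numbers are illustrative, not from the source): the (loc, scale) tuples returned above can be turned into journey-duration quantiles with scipy's gumbel_r:

from scipy.stats import gumbel_r

# params_by_end stands in for the dict returned by fetch_duration_params (illustrative).
params_by_end = {101: (12.3, 3.1)}  # hypothetical (loc, scale) in minutes
loc, scale = params_by_end[101]
median_min = gumbel_r.ppf(0.5, loc=loc, scale=scale)
p90_min = gumbel_r.ppf(0.9, loc=loc, scale=scale)
print(f"median ~ {median_min:.1f} min, 90th percentile ~ {p90_min:.1f} min")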
Example #3
    def __init__(self, data, block_size=0):
        if len(data) < 10:
            raise Exception("Not enough data to make predictions")

        self.__data = data
        histogram = {}

        if block_size:
            self.__block_size = block_size
        else:
            # Block sizes between 20 and 50 are ok if
            # all the dataset can be represented in less
            # than 20 blocks
            self.__block_size = len(data) // 20

        if self.__block_size < 1:
            warnings.warn("Invalid block size, set it to 1")
            self.__block_size = 1

        block_maxima = []

        current_block_size = 0
        current_block_maximum = float("-inf")

        # Compute block maxima
        for value in data:
            if value in histogram:
                histogram[value] += 1
            else:
                histogram[value] = 1
            current_block_size += 1
            if value > current_block_maximum:
                current_block_maximum = value
            if current_block_size == self.__block_size:
                block_maxima.append(current_block_maximum)
                current_block_maximum = float("-inf")
                current_block_size = 0

        # Build original 1-cdf histogram
        self.__values = sorted(histogram.keys())
        self.__values.pop()
        self.__frequencies = []
        previous = 0.0
        for value in self.__values:
            previous = histogram[value]/float(len(self.__data)) + previous
            self.__frequencies.append(1-previous)

        # Fit Gumbel distribution to block maxima
        params = gumbel.fit(block_maxima)
        self.__shape = 0
        self.__location = params[0]
        self.__scale = params[1]
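A minimal sketch of how the fitted location and scale might be used afterwards; the exceedance_probability helper below is hypothetical, not part of the class:

from scipy.stats import gumbel_r

def exceedance_probability(value, location, scale):
    # P(block maximum > value) under the fitted Gumbel distribution.
    return gumbel_r.sf(value, loc=location, scale=scale)

# e.g. with loc=10.0, scale=2.0 obtained from the block maxima:
print(exceedance_probability(15.0, 10.0, 2.0))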
Example #4
    def EstimateParameter(data, ObjFunc, threshold):
        """
        There are two likelihood functions (L1 and L2), one for values above some
        threshold (x >= C) and one for values below (x < C); the likeliest parameters
        are those that maximize the product of the two functions, max(L1*L2).

        In this case L1 is still the product of the probability density function
        values at the xi, while L2 is the probability that the threshold value C
        will be exceeded, (1 - F(C)).

        Parameters
        ----------
        data : [array-like]
            sample of extreme values to fit.
        ObjFunc : [callable]
            objective function ObjFunc(p, x) to minimize, where p = [threshold, loc, scale].
        threshold : [float]
            truncation value C.

        Returns
        -------
        Param : [list]
            optimized parameters [threshold, loc, scale] returned by the minimizer.

        Example:
            from Hapi.statisticaltools import StatisticalTools as ST
            Param_dist = Gumbel.EstimateParameter(data, Gumbel.ObjectiveFn, threshold)

        """
        # obj_func = lambda p, x: (-np.log(Gumbel.Pdf(x, p[0], p[1]))).sum()
        # #first we make a simple Gumbel fit
        # Par1 = so.fmin(obj_func, [0.5,0.5], args=(np.array(data),))
        Par1 = gumbel_r.fit(data)
        # then use the result as the starting value for the truncated Gumbel fit
        Param = so.fmin(ObjFunc, [threshold, Par1[0], Par1[1]],
                        args=(np.array(data), ),
                        maxiter=500,
                        maxfun=500)
        # Param_dist = [Param[1], Param[2]]

        return Param
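The objective function ObjFunc is passed in but not shown here. Below is a minimal sketch of one possible censored negative log-likelihood with the same (p, x) signature; it treats values below the threshold as censored through F(C), which is a common convention and may differ from the actual Gumbel.ObjectiveFn in Hapi:

import numpy as np
from scipy.stats import gumbel_r

def truncated_gumbel_nll(p, x):
    # p = [threshold, loc, scale]; a sketch only, not the Hapi implementation.
    threshold, loc, scale = p
    above = x[x >= threshold]
    n_below = np.sum(x < threshold)
    # L1: density contribution of the values above the threshold
    logL1 = np.sum(gumbel_r.logpdf(above, loc=loc, scale=scale))
    # L2: censored contribution of the values below the threshold, F(C) each
    logL2 = n_below * gumbel_r.logcdf(threshold, loc=loc, scale=scale)
    return -(logL1 + logL2)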
Example #5
File: hminputs.py Project: nguyetlm/Hapi
    def StatisticalProperties(self,
                              PathNodes,
                              PathTS,
                              StartDate,
                              WarmUpPeriod,
                              SavePlots,
                              SavePath,
                              SeparateFiles=False,
                              Filter=False,
                              Distibution="GEV",
                              EstimateParameters=False,
                              Quartile=0,
                              RIMResults=False,
                              SignificanceLevel=0.1):
        """
        =============================================================================
          StatisticalProperties(PathNodes, PathTS, StartDate, WarmUpPeriod, SavePlots, SavePath,
                              SeparateFiles = False, Filter = False, RIMResults = False)
        =============================================================================

        The StatisticalProperties method reads the SWIM output file (.dat file) that
        contains the discharge time series of some computational nodes
        and calculates some statistical properties.

        The code assumes that the time series have a daily temporal resolution and
        that the hydrological year is 1-Nov/31-Oct (Petrow and Merz, 2009, JoH).

        Parameters
        ----------
            1-PathNodes : [String]
                the name of the file which contains the ID of the computational
                nodes you want to do the statistical analysis for, the ObservedFile
                should contain the discharge time series of these nodes in order.
            2-PathTS : [String]
                the name of the SWIM result file (the .dat file).
            3-StartDate : [string]
                the beginning date of the time series.
            4-WarmUpPeriod : [integer]
                the number of days to neglect at the beginning of the
                simulation (warm-up period).
            5-SavePlots : [Bool]
                whether to save the fitted-distribution plots for each node.
            6-SavePath : [String]
                the path where you want to save the statistical properties.
            7-SeparateFiles: [Bool]
                if the discharge data are stored in separate files not all in one file
                SeparateFiles should be True, default [False].
            8-Filter: [Bool/Value]
                observed or RIM-result data may contain gap periods (days when the
                model did not run or the observations are missing) filled with a
                specific value; to ignore those days, set Filter to that value.
                default [False].
            9-RIMResults: [Bool]
                whether the files are RIM results or observed data, as the format
                differs between the two. default [False].

        Returns
        -------
            1-Statistical Properties.csv:
                file containing some statistical properties like mean, std, min, 5%, 25%,
                median, 75%, 95%, max, t_beg, t_end, nyr, q1.5, q2, q5, q10, q25, q50,
                q100, q200, q500, q1000.
        """

        ComputationalNodes = np.loadtxt(PathNodes, dtype=np.uint16)
        # hydrographs
        if SeparateFiles:
            TS = pd.DataFrame()
            if RIMResults:
                for i in range(len(ComputationalNodes)):
                    TS.loc[:, int(ComputationalNodes[i])] = self.ReadRIMResult(
                        PathTS + "/" + str(int(ComputationalNodes[i])) +
                        '.txt')
            else:
                for i in range(len(ComputationalNodes)):
                    TS.loc[:, int(ComputationalNodes[i])] = np.loadtxt(
                        PathTS + "/" + str(int(ComputationalNodes[i])) +
                        '.txt')  #,skiprows = 0

            StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
            EndDate = StartDate + dt.timedelta(days=TS.shape[0] - 1)
            ind = pd.date_range(StartDate, EndDate)
            TS.index = ind
        else:
            TS = pd.read_csv(PathTS, delimiter=r'\s+', header=None)
            StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
            EndDate = StartDate + dt.timedelta(days=TS.shape[0] - 1)
            TS.index = pd.date_range(StartDate, EndDate, freq="D")
            # delete the first two columns
            del TS[0], TS[1]
            TS.columns = ComputationalNodes

        # neglect the first year (warmup year) in the time series
        TS = TS.loc[StartDate + dt.timedelta(days=WarmUpPeriod):EndDate, :]

        # List of the table output, including some general data and the return periods.
        col_csv = [
            'mean', 'std', 'min', '5%', '25%', 'median', '75%', '95%', 'max',
            't_beg', 't_end', 'nyr'
        ]
        rp_name = [
            'q1.5', 'q2', 'q5', 'q10', 'q25', 'q50', 'q100', 'q200', 'q500',
            'q1000'
        ]
        col_csv = col_csv + rp_name

        # In a table where duplicates are removed (np.unique), find the number of
        # gauges contained in the .csv file.
        # no_gauge = len(ComputationalNodes)
        # Declare a dataframe for the output file, with the gauge numbers as index
        # and all the output names as columns.
        StatisticalPr = pd.DataFrame(np.nan,
                                     index=ComputationalNodes,
                                     columns=col_csv)
        StatisticalPr.index.name = 'ID'
        DistributionPr = pd.DataFrame(np.nan,
                                      index=ComputationalNodes,
                                      columns=['loc', 'scale'])
        DistributionPr.index.name = 'ID'
        # required return periods
        T = [1.5, 2, 5, 10, 25, 50, 100, 200, 500, 1000]
        T = np.array(T)
        # these values are the Non Exceedance probability (F) of the chosen
        # return periods F = 1 - (1/T)
        # Non-exceedance probabilities
        #F = [1/3, 0.5, 0.8, 0.9, 0.96, 0.98, 0.99, 0.995, 0.998]
        F = 1 - (1 / T)
        # Iteration over all the gauge numbers.
        for i in ComputationalNodes:
            QTS = TS.loc[:, i]
            # The time series is resampled to the annual maxima, and turned into a
            # numpy array.
            # The hydrological year is 1-Nov/31-Oct (from Petrow and Merz, 2009, JoH).
            amax = QTS.resample('A-OCT').max().values

            if type(Filter) != bool:
                amax = amax[amax != Filter]
            if EstimateParameters:
                # estimate the parameters through an optimization
                # alpha = (np.sqrt(6) / np.pi) * amax.std()
                # beta = amax.mean() - 0.5772 * alpha
                # param_dist = [beta, alpha]
                threshold = np.quantile(amax, Quartile)
                if Distibution == "GEV":
                    print("Still to be finished later")
                else:
                    param = Gumbel.EstimateParameter(amax, Gumbel.ObjectiveFn,
                                                     threshold)
                    param_dist = [param[1], param[2]]

            else:
                # estimate the parameters through the maximum likelihood method
                if Distibution == "GEV":
                    param_dist = genextreme.fit(amax)
                else:
                    # A gumbel distribution is fitted to the annual maxima
                    param_dist = gumbel_r.fit(amax)

            if Distibution == "GEV":
                DistributionPr.loc[i, 'c'] = param_dist[0]
                DistributionPr.loc[i, 'loc'] = param_dist[1]
                DistributionPr.loc[i, 'scale'] = param_dist[2]
            else:
                DistributionPr.loc[i, 'loc'] = param_dist[0]
                DistributionPr.loc[i, 'scale'] = param_dist[1]

            # Return periods from the fitted distribution are stored.
            # get the Discharge coresponding to the return periods
            if Distibution == "GEV":
                Qrp = genextreme.ppf(F,
                                     param_dist[0],
                                     loc=param_dist[1],
                                     scale=param_dist[2])
            else:
                Qrp = gumbel_r.ppf(F, loc=param_dist[0], scale=param_dist[1])
            # to get the Non Exceedance probability for a specific Value
            # sort the amax
            amax.sort()
            # calculate F using the Weibull plotting position
            cdf_Weibul = ST.Weibul(amax)
            # The ProbapilityPlot method calculates the theoretical values based on the fitted
            # distribution parameters and the empirical cdf (Weibull plotting position), and computes the confidence interval
            if Distibution == "GEV":
                Qth, Qupper, Qlower = GEV.ProbapilityPlot(
                    param_dist, cdf_Weibul, amax, SignificanceLevel)
                # to calculate the F theoretical
                Qx = np.linspace(0, 1.5 * float(amax.max()), 10000)
                pdf_fitted = genextreme.pdf(Qx,
                                            param_dist[0],
                                            loc=param_dist[1],
                                            scale=param_dist[2])
                cdf_fitted = genextreme.cdf(Qx,
                                            param_dist[0],
                                            loc=param_dist[1],
                                            scale=param_dist[2])
            else:
                Qth, Qupper, Qlower = Gumbel.ProbapilityPlot(
                    param_dist, cdf_Weibul, amax, SignificanceLevel)
                # gumbel_r.interval(SignificanceLevel)
                # to calculate the F theoretical
                Qx = np.linspace(0, 1.5 * float(amax.max()), 10000)
                pdf_fitted = gumbel_r.pdf(Qx,
                                          loc=param_dist[0],
                                          scale=param_dist[1])
                cdf_fitted = gumbel_r.cdf(Qx,
                                          loc=param_dist[0],
                                          scale=param_dist[1])
            # then calculate T (the return period): T = 1/(1-F)
            if SavePlots:
                fig = plt.figure(60, figsize=(20, 10))
                gs = gridspec.GridSpec(nrows=1, ncols=2, figure=fig)
                # Plot the histogram and the fitted distribution, save it for each gauge.
                ax1 = fig.add_subplot(gs[0, 0])
                ax1.plot(Qx, pdf_fitted, 'r-')
                ax1.hist(amax, density=True)
                ax1.set_xlabel('Annual Discharge(m3/s)', fontsize=15)
                ax1.set_ylabel('pdf', fontsize=15)

                ax2 = fig.add_subplot(gs[0, 1])
                ax2.plot(Qx, cdf_fitted, 'r-')
                ax2.plot(amax, cdf_Weibul, '.-')
                ax2.set_xlabel('Annual Discharge(m3/s)', fontsize=15)
                ax2.set_ylabel('cdf', fontsize=15)

                plt.savefig(SavePath + "/" + "Figures/" + str(i) + '.png',
                            format='png')
                plt.close()

                fig = plt.figure(70, figsize=(10, 8))
                plt.plot(Qth,
                         amax,
                         'd',
                         color='#606060',
                         markersize=12,
                         label='Gumbel Distribution')
                plt.plot(Qth,
                         Qth,
                         '^-.',
                         color="#3D59AB",
                         label="Weibul plotting position")
                if Distibution != "GEV":
                    plt.plot(Qth,
                             Qlower,
                             '*--',
                             color="#DC143C",
                             markersize=12,
                             label='Lower limit (' +
                             str(int(
                                 (1 - SignificanceLevel) * 100)) + " % CI)")
                    plt.plot(Qth,
                             Qupper,
                             '*--',
                             color="#DC143C",
                             markersize=12,
                             label='Upper limit (' +
                             str(int(
                                 (1 - SignificanceLevel) * 100)) + " % CI)")

                plt.legend(fontsize=15, framealpha=1)
                plt.xlabel('Theoretical Annual Discharge(m3/s)', fontsize=15)
                plt.ylabel('Annual Discharge(m3/s)', fontsize=15)
                plt.savefig(SavePath + "/" + "Figures/F-" + str(i) + '.png',
                            format='png')
                plt.close()

            StatisticalPr.loc[i, 'mean'] = QTS.mean()
            StatisticalPr.loc[i, 'std'] = QTS.std()
            StatisticalPr.loc[i, 'min'] = QTS.min()
            StatisticalPr.loc[i, '5%'] = QTS.quantile(0.05)
            StatisticalPr.loc[i, '25%'] = QTS.quantile(0.25)
            StatisticalPr.loc[i, 'median'] = QTS.quantile(0.50)
            StatisticalPr.loc[i, '75%'] = QTS.quantile(0.75)
            StatisticalPr.loc[i, '95%'] = QTS.quantile(0.95)
            StatisticalPr.loc[i, 'max'] = QTS.max()
            StatisticalPr.loc[i, 't_beg'] = QTS.index.min()
            StatisticalPr.loc[i, 't_end'] = QTS.index.max()
            StatisticalPr.loc[
                i, 'nyr'] = (StatisticalPr.loc[i, 't_end'] -
                             StatisticalPr.loc[i, 't_beg']).days / 365.25
            for irp, irp_name in zip(Qrp, rp_name):
                StatisticalPr.loc[i, irp_name] = irp

            # Print for prompt and check progress.
            print("Gauge", i, "done.")
        #
        # Output file
        StatisticalPr.to_csv(SavePath + "/" + "Statistical Properties.csv")
        self.StatisticalPr = StatisticalPr
        DistributionPr.to_csv(SavePath + "/" + "DistributionProperties.csv")
        self.DistributionPr = DistributionPr
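A short standalone sketch of the return-period relation used above (F = 1 - 1/T, then Qrp = ppf(F)); the Gumbel parameters below are illustrative, not fitted values:

import numpy as np
from scipy.stats import gumbel_r

loc, scale = 250.0, 80.0  # illustrative (loc, scale) of annual maxima in m3/s
T = np.array([1.5, 2, 5, 10, 25, 50, 100, 200, 500, 1000])
F = 1 - 1 / T                                # non-exceedance probability
Qrp = gumbel_r.ppf(F, loc=loc, scale=scale)  # discharge for each return period
for t, q in zip(T, Qrp):
    print(f"T = {t:>6} yr -> Q = {q:.1f} m3/s")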
Example #6
	fbest, pbest = my_per.get_best_frequencies() # Return best n_local_optima frequencies
	
	bestperiod = 1./fbest[0]
	bestperiod2 = 1./fbest[1]
	bestperiod3 = 1./fbest[2]
	bestperiod4 = 1./fbest[3]
	
	pbest_bootstrap = np.zeros(shape=(100, 2))
	for index in range(pbest_bootstrap.shape[0]):
		P = np.random.permutation(len(mjd))
		my_per.set_data(mjd, mag[P], err[P])
		my_per.frequency_grid_evaluation(fmin=0.0, fmax=4.0, fresolution=1e-3)
		my_per.finetune_best_frequencies(fresolution=1e-4, n_local_optima=pbest_bootstrap.shape[1])
		_, pbest_bootstrap[index, :] = my_per.get_best_frequencies()
	                                
	param = gumbel_r.fit(pbest_bootstrap.ravel())
	rv = gumbel_r(loc=param[0], scale=param[1])
	x = np.linspace(rv.ppf(0.001), rv.ppf(0.999), 100)
	                                
	p_vals = [0.01, 0.05, 0.08]
	sig1 = rv.ppf(1.-p_vals[0])
	sig5 = rv.ppf(1.-p_vals[1])
	sig8 = rv.ppf(1.-p_vals[2])
	
	bestpower = pbest[0]
	bestpower2 = pbest[1]
	bestpower3 = pbest[2]
	bestpower4 = pbest[3]
	
	
	
Example #7
    exit(0)

print("Calculating distribution of scores for {} scrambled alignments.".format(N))


sscores = []
# Calculate the N random alignments
for i in tqdm(range(N)):
    seqB = "".join(sample(seqB, len(seqB)))
    s, a, ma, ta = alignFunction(
        seqA, seqB, matScore, gapOpen, gapExtend, ScoreOnly=True)

    sscores.append(s)

# Fit extreme value distribution to the scramble alignment data
miu, beta = gumbel_r.fit(sscores)
print("Length of sscores: ", len(sscores))
print("Computing histogram for {} scramble scores".format(N))
print("Max scrambled score:", max(sscores))
print("Min scrambled score:", min(sscores))
print("Median of scrambled scores:", np.median(sscores))
print("Gumbel miu:", miu)
print("Gumbel beta:", beta)
print("Probability of unscrambled score in a random alignment: ",
      1-gumbel_r.cdf(uscore, miu, beta))
print()

# Generate the basename for save files
basename = "smith" if args.alignment_method == "local" else "needle"
basename += "_{}_{}_{}_{:3.1f}_{:3.1f}".format(
    N, len(seqA), smatrix, abs(gapOpen), abs(gapExtend))
Example #8
fig, ax = plt.subplots()
ericeira_wts.wave_data['Tp'].hist(density=True, bins=np.arange(22))
ax.set_title('Tp')

#%% Extremes

ericeira_wts.plot_timeseries()
hmax = ericeira_wts.maxima()
hmax.plot()

#%%
from scipy.stats import gumbel_r
from scipy.stats import probplot
import statsmodels.distributions

loc, scale = gumbel_r.fit(hmax)
fig, ax = plt.subplots()
x = np.linspace(gumbel_r.ppf(0.01, loc=loc, scale=scale),
                gumbel_r.ppf(0.99, loc=loc, scale=scale), 100)
ax.plot(x,
        gumbel_r.pdf(x, loc=loc, scale=scale),
        'r-',
        lw=5,
        alpha=0.6,
        label='gumbel_r pdf')
ax.hist(hmax, density=True)

fig, ax = plt.subplots()
ax.plot(x,
        gumbel_r.cdf(x, loc=loc, scale=scale),
        'r-',
        lw=5,
        alpha=0.6,
        label='gumbel_r cdf')
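Since probplot is imported above but not used in this excerpt, here is a hedged sketch of a Gumbel probability plot for hmax (assuming hmax, loc and scale as defined above):

fig, ax = plt.subplots()
# Quantile-quantile check of the Gumbel fit against the observed maxima.
probplot(hmax, sparams=(loc, scale), dist=gumbel_r, plot=ax)
ax.set_title('Gumbel probability plot')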
Example #9
    def StatisticalProperties(self,
                              PathNodes,
                              PathTS,
                              StartDate,
                              WarmUpPeriod,
                              SavePlots,
                              SavePath,
                              SeparateFiles=False,
                              Filter=False):
        """
        =============================================================================
          StatisticalProperties(PathNodes, PathTS, StartDate, WarmUpPeriod, SavePlots, saveto)
        =============================================================================

        The StatisticalProperties method reads the SWIM output file (.dat file) that
        contains the discharge time series of some computational nodes
        and calculates some statistical properties.

        The code assumes that the time series have a daily temporal resolution and
        that the hydrological year is 1-Nov/31-Oct (Petrow and Merz, 2009, JoH).

        Parameters
        ----------
            1-PathNodes : [String]
                the name of the file which contains the ID of the computational
                nodes you want to do the statistical analysis for, the ObservedFile
                should contain the discharge time series of these nodes in order.
            2-PathTS : [String]
                the name of the SWIM result file (the .dat file).
            3-StartDate : [string]
                the beginning date of the time series.
            4-WarmUpPeriod : [integer]
                the number of days to neglect at the beginning of the
                simulation (warm-up period).
            5-SavePlots : [Bool]
                whether to save the fitted-distribution plot for each node.
            6-SavePath : [String]
                the path where you want to save the statistical properties.

        Returns
        -------
            1-Statistical Properties.csv:
                file containing some statistical properties like mean, std, min, 5%, 25%,
                median, 75%, 95%, max, t_beg, t_end, nyr, q1.5, q2, q5, q10, q25, q50,
                q100, q200, q500.
        """

        ComputationalNodes = np.loadtxt(PathNodes, dtype=np.uint16)
        # hydrographs
        if SeparateFiles:
            ObservedTS = pd.DataFrame()

            for i in range(len(ComputationalNodes)):
                ObservedTS.loc[:, int(ComputationalNodes[i])] = np.loadtxt(
                    PathTS + "/" + str(int(ComputationalNodes[i])) +
                    '.txt')  #,skiprows = 0

            StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
            EndDate = StartDate + dt.timedelta(days=ObservedTS.shape[0] - 1)
            ind = pd.date_range(StartDate, EndDate)
            ObservedTS.index = ind
        else:
            ObservedTS = pd.read_csv(PathTS, delimiter=r'\s+', header=None)
            StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
            EndDate = StartDate + dt.timedelta(days=ObservedTS.shape[0] - 1)
            ObservedTS.index = pd.date_range(StartDate, EndDate, freq="D")
            # delete the first two columns
            del ObservedTS[0], ObservedTS[1]
            ObservedTS.columns = ComputationalNodes

        # neglect the first year (warmup year) in the time series
        ObservedTS = ObservedTS.loc[StartDate +
                                    dt.timedelta(days=WarmUpPeriod):EndDate, :]

        # List of the table output, including some general data and the return periods.
        col_csv = [
            'mean', 'std', 'min', '5%', '25%', 'median', '75%', '95%', 'max',
            't_beg', 't_end', 'nyr'
        ]
        rp_name = [
            'q1.5', 'q2', 'q5', 'q10', 'q25', 'q50', 'q100', 'q200', 'q500'
        ]
        col_csv = col_csv + rp_name

        # In a table where duplicates are removed (np.unique), find the number of
        # gauges contained in the .csv file.
        # no_gauge = len(ComputationalNodes)
        # Declare a dataframe for the output file, with the gauge numbers as index
        # and all the output names as columns.
        StatisticalPr = pd.DataFrame(np.nan,
                                     index=ComputationalNodes,
                                     columns=col_csv)
        StatisticalPr.index.name = 'ID'
        DistributionPr = pd.DataFrame(np.nan,
                                      index=ComputationalNodes,
                                      columns=['loc', 'scale'])
        DistributionPr.index.name = 'ID'
        # required return periods
        T = [1.5, 2, 5, 10, 25, 50, 100, 200, 500]
        T = np.array(T)
        # these values are the Non Exceedance probability (F) of the chosen
        # return periods F = 1 - (1/T)
        # Non-exceedance probabilities
        #F = [1/3, 0.5, 0.8, 0.9, 0.96, 0.98, 0.99, 0.995, 0.998]
        F = 1 - (1 / T)
        # Iteration over all the gauge numbers.
        for i in ComputationalNodes:
            QTS = ObservedTS.loc[:, i]
            # The time series is resampled to the annual maxima, and turned into a
            # numpy array.
            # The hydrological year is 1-Nov/31-Oct (from Petrow and Merz, 2009, JoH).
            amax = QTS.resample('A-OCT').max().values
            if type(Filter) != bool:
                amax = amax[amax != Filter]
            # A gumbel distribution is fitted to the annual maxima
            param_dist = gumbel_r.fit(amax)
            DistributionPr.loc[i, 'loc'] = param_dist[0]
            DistributionPr.loc[i, 'scale'] = param_dist[1]
            # Return periods from the fitted distribution are stored.
            # get the Discharge coresponding to the return periods
            Qrp = gumbel_r.ppf(F, loc=param_dist[0], scale=param_dist[1])
            # to get the Non Exceedance probability for a specific Value
            #gumbel_r.cdf(Qrp, loc=param_dist[0], scale=param_dist[1])
            # then calculate T (the return period): T = 1/(1-F)

            # Plot the histogram and the fitted distribution, save it for each gauge.
            Qx = np.linspace(0, 1.5 * float(amax.max()), 10000)
            pdf_fitted = gumbel_r.pdf(Qx,
                                      loc=param_dist[0],
                                      scale=param_dist[1])
            if SavePlots:
                plt.plot(Qx, pdf_fitted, 'r-')
                plt.hist(amax, density=True)
                plt.savefig(SavePath + "/" + "Figures/" + str(i) + '.png',
                            format='png')
                plt.close()

            StatisticalPr.loc[i, 'mean'] = QTS.mean()
            StatisticalPr.loc[i, 'std'] = QTS.std()
            StatisticalPr.loc[i, 'min'] = QTS.min()
            StatisticalPr.loc[i, '5%'] = QTS.quantile(0.05)
            StatisticalPr.loc[i, '25%'] = QTS.quantile(0.25)
            StatisticalPr.loc[i, 'median'] = QTS.quantile(0.50)
            StatisticalPr.loc[i, '75%'] = QTS.quantile(0.75)
            StatisticalPr.loc[i, '95%'] = QTS.quantile(0.95)
            StatisticalPr.loc[i, 'max'] = QTS.max()
            StatisticalPr.loc[i, 't_beg'] = QTS.index.min()
            StatisticalPr.loc[i, 't_end'] = QTS.index.max()
            StatisticalPr.loc[
                i, 'nyr'] = (StatisticalPr.loc[i, 't_end'] -
                             StatisticalPr.loc[i, 't_beg']).days / 365.25
            for irp, irp_name in zip(Qrp, rp_name):
                StatisticalPr.loc[i, irp_name] = irp

            # Print for prompt and check progress.
            print("Gauge", i, "done.")
        #
        # Output file
        StatisticalPr.to_csv(SavePath + "/" + "Statistical Properties.csv")
        self.StatisticalPr = StatisticalPr
        DistributionPr.to_csv(SavePath + "/" + "DistributionProperties.csv")
        self.DistributionPr = DistributionPr
Example #10
number_value_smaller = 0
# calculate the number of samples smaller than the reference score
for score in permutation_scores:
    if score < reference_score:
        number_value_smaller +=1
# the total number of samples is the number of permutation scores + 1
number_samples = len(permutation_scores) + 1
# calculate the p-value
p_value = 1-(float(number_value_smaller) / float(number_samples))
print(p_value)

# Task 5: Compute the associated p-value using an estimated Gumbel distribution

# estimate the parameter loc and scale using the fit function
loc, scale = gumbel_r.fit(permutation_scores)

# calculate the p-value
p_value = 1-gumbel_r.cdf(reference_score, loc=loc, scale=scale )
print(p_value)


# Task 6: Plot the histogram and the fitted probability density function with the reference score as vertical line

fig, ax = plt.subplots(1, 1)
x = np.linspace(gumbel_r.ppf(0.01,loc=loc, scale=scale),gumbel_r.ppf(0.99,loc=loc, scale=scale), 1000)

ax.plot(x, gumbel_r.pdf(x, loc=loc, scale=scale), 'k-', lw=2, label='frozen pdf')
ax.hist(permutation_scores, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
ax.axvline(reference_score)
Example #11
def compare_to_blank(blank_model_size, p_val=0.05, sparse_rounds=False,
                     interactome_interface_instance=None):
    """
    Recovers the statistics on the circulation nodes and shows a visualization of the circulation system.
    There is no issue with using the same interactome interface instance, because they are forked when
    threads are generated and will not interfere.

    :param blank_model_size: the number of uniprots in the blank model
    :param p_val: desired p_value for the returned terms
    :param sparse_rounds: if set to a number, the sparse computation technique will be used,
     with the number of rounds equal to the integer value of that argument
    :param interactome_interface_instance:
    :return: None if no significant nodes, the node and group characteristic
     dictionaries otherwise
    """
    def get_max_for_each_degree(sample_sub_array):
        degrees = np.unique(sample_sub_array[1, :])
        max_array = []

        for degree in degrees:
            filter = sample_sub_array[1, :] == degree
            max_array.append([sample_sub_array[0, filter].max(), degree])

        m_arr = np.array(max_array)
        return m_arr.T

    if interactome_interface_instance is None:
        interactome_interface_instance = InteractomeInterface(True, True)
        interactome_interface_instance.fast_load()

    md5_hash = interactome_interface_instance.md5_hash()

    background_sub_array_list = []
    max_sub_array_list = []
    count = 0

    log.info("looking to test against:"
             "\t size: %s \t sys_hash: %s \t sparse_rounds: %s" %
             (blank_model_size, md5_hash, sparse_rounds))

    log.info("samples found to test against:\t %s" %
             interactome_rand_samp_db.find({'size': blank_model_size,
                                            'sys_hash': md5_hash,
                                            'sparse_rounds': sparse_rounds}
                                            ).count())

    for i, sample in enumerate(interactome_rand_samp_db.find(
                                                            {'size': blank_model_size,
                                                             'sys_hash': md5_hash,
                                                             'sparse_rounds': sparse_rounds})):

        _, node_currents = pickle.loads(sample['currents'])

        dictionary_system = interactome_interface_instance.format_node_props(node_currents, limit=0)
        background_sub_array = list(dictionary_system.values())
        background_sub_array_list.append(np.array(background_sub_array).T)
        max_arr = get_max_for_each_degree(np.array(background_sub_array).T)
        max_sub_array_list.append(max_arr)
        count = i

    # This part declares the pre-operators required for the verification of a
    # real sample

    background_array = np.concatenate(tuple(background_sub_array_list), axis=1)
    max_array = np.concatenate(tuple(max_sub_array_list), axis=1)


    node_currents = interactome_interface_instance.node_current
    dictionary_system = interactome_interface_instance.format_node_props(node_currents)
    curr_inf_conf_tot = np.array(
        [[int(key)] + list(val) for key, val in dictionary_system.items()]).T

    node_ids, query_array = (curr_inf_conf_tot[0, :], curr_inf_conf_tot[(1, 2), :])

    log.info("stats on  %s samples" % count)

    background_density = kde_compute(background_array[(1, 0), :], 50, count)
    base_bi_corr = background_array[(0, 1), :]

    r_rels = []
    r_std_nodes = []

    # TODO: idea for the improved statistics, cluster a test node of degree k with 100 nodes with
    #  closest degrees

    samples_scatter_and_hist(background_array, query_array)

    degrees = np.unique(query_array[1, :])

    combined_p_vals = np.ones_like(query_array[1, :])

    for degree in degrees.tolist():
        filter = query_array[1, :] == degree

        entry = query_array[:, filter]
        background_set = background_array[:, background_array[1, :] == degree]
        max_set = max_array[:, max_array[1, :] == degree]

        params = gumbel_r.fit(max_set[0, :])
        arg = params[:-2]
        mu = params[-2]
        beta = params[-1]

        frozen_gumbel = gumbel_r(loc=mu, scale=beta)

        p_vals = 1 - frozen_gumbel.cdf(entry[0, :])

        combined_p_vals[filter] = p_vals

        # TODO: insert into appropriate locations => we will assume that the order is preserved

        # samples_scatter_and_hist(max_set, entry)

    r_nodes = background_density(query_array[(1, 0), :])  # this is currently used as a p-value, which is problematic.
    r_nodes = combined_p_vals

    for point in query_array.T:
        selector = np.logical_and(base_bi_corr[1, :] > point[1]*0.9, base_bi_corr[1, :] < point[1]*1.1)
        r_rels.append(point[0]/np.mean(base_bi_corr[0, selector]))
        r_std_nodes.append((point[0]-np.mean(base_bi_corr[0, selector]))/np.std(base_bi_corr[0, selector]))

    r_rels = np.array(r_rels)
    r_std_nodes = np.array(r_std_nodes)

    not_random_nodes = [node_id for node_id in node_ids[r_nodes < p_val].tolist()]

    # Basically, the second element below is the set of nodes that contribute to the
    # information flow through the node that is considered non-random

    log.debug('debug, not random nodes: %s', not_random_nodes)
    log.debug('debug bulbs_id_disp_name: %s',
              list(interactome_interface_instance.neo4j_id_2_display_name.items())[:10])

    node_char_list = [
        [int(nr_node_id), interactome_interface_instance.neo4j_id_2_display_name[nr_node_id]] +
        dictionary_system[nr_node_id] + r_nodes[node_ids == float(nr_node_id)].tolist()
        for nr_node_id in not_random_nodes]

    nodes_dict = np.hstack((node_ids[:, np.newaxis], r_nodes[:, np.newaxis], r_rels[:, np.newaxis], r_std_nodes[:, np.newaxis]))
    nodes_dict = dict((node[0], (node[1], node[2], node[3])) for node in nodes_dict.tolist())
    nodes_dict = defaultdict(lambda: (1., 0., 0.), nodes_dict)  # corresponds to the cases of super low flow - never significant

    # TODO: pull the groups corresponding to non-random associations.

    return sorted(node_char_list, key=lambda x: x[4]), nodes_dict
Example #12
import numpy as np
from matplotlib import pyplot as plt

from scipy.stats import gumbel_r
from scipy.stats import gumbel_l
from scipy.stats import genextreme

dataN = np.loadtxt("../data/Qdaily.txt")

x_pdf = np.linspace(np.min(dataN), np.max(dataN), num=100)
param = gumbel_r.fit(dataN)
cdf1 = gumbel_r.cdf(x_pdf, *param[:-2], loc=param[-2], scale=param[-1])
plt.plot(x_pdf, -np.log(-np.log(cdf1)), 'o')

print(param)

num_bins = 200
counts, bin_edges = np.histogram(dataN, bins=num_bins)
cdf2 = np.cumsum(counts) / np.sum(counts)
plt.plot(bin_edges[1:], -np.log(-np.log(cdf2)), 'x')

plt.show()
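A small follow-up sketch (reuses x_pdf, cdf1 and param from the script above): on the reduced-variate scale, -ln(-ln F(x)) = (x - loc)/scale for a Gumbel distribution, so the fit can be checked against a straight line:

plt.figure()
plt.plot(x_pdf, -np.log(-np.log(cdf1)), 'o', label='fitted cdf (reduced variate)')
plt.plot(x_pdf, (x_pdf - param[-2]) / param[-1], '-', label='(x - loc) / scale')
plt.legend()
plt.show()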
Example #13
def lake_dike_system(h_0, forcings, parameters_cc, lake_par, dike_par,
                     wind_par, policy):
    """Simulate the entire lake + dike system

    Args:
        h_0 (float): Initial condition, water level in the lake at t = 0
        forcings (pd.Dataframe): External forcing historically observed
        parameters_cc (dict): parameters that set the change applied to the historically observed forcings
        lake_par (dict): dictionary of parameters for the model of the lake
            K:
            A: lake surface #TODO modify
        dike_par (dict): dictionary of parameters for the model of the dike
            slope:
            crown_height:
            gamma_b:
            gamma_beta:
            gamma_f:
            q_critical:
        wind_par (dict): dictionary of parameters for the model of the wind effects on the water level

        policy (dict):
            pumping_capacity
            h_target

    Returns:
        F (float): Frequency of dike failure
        supply_relative_deficit (float): Relative water supply deficit over the simulation horizon
    """

    # implement policy structural actions
    lake_par['K'] = lake_par['K'] * policy['sluices widening']
    dike_par['height'] = dike_par['height'] + policy['raise dikes']

    # model of water demand
    # oversimplified model: water demand proportional to potential evaporation
    forcings['water demand'] = 0
    k_demand = 500 / 0.005  # m^3/s / ??? # max demand (estimated) / max potential evaporation (under stat conditions)
    forcings['water demand'] = forcings['potential evaporation'] * k_demand

    # model of rainfall-runoff
    S_lat = 1419 * 1000000  # km^2 to m^2
    alpha = 0.8
    forcings['inflow lateral'] = forcings[
        'precipitation'] * S_lat * alpha / lake_par[
            'Delta_t']  # rational formula

    # Variate Forcings (bottom-up climate change analysis)
    forcings_cc = variate_forcings(forcings, parameters_cc)

    # model of lake
    model_output = lake_sim(h_0, forcings_cc, lake_par,
                            wind_par['Afsluitdijk'], policy)

    # supply deficit
    water_demand_daily = forcings_cc['water demand'].resample('D').mean()
    supply_relative_deficit = sum(water_demand_daily - model_output['water supply']) / \
                              sum(water_demand_daily)
    # ((forcings.index[-1] - forcings.index[0]).days / 365.25)  # simulation horizon, in years

    # Dike boundary conditions
    h_year_max = yearly_max_wl(model_output['average water level'],
                               forcings.resample('D').mean(),
                               wind_par['Roggebotsluizen'])

    mu_wl, sigma_wl = gumbel_r.fit(h_year_max.values)
    water_level_pdf = gumbel_r.freeze(loc=mu_wl, scale=sigma_wl)  # TEST THIS

    # Dike failure
    F = frequency_failure(water_level_pdf,
                          dike_par,
                          base_year=1000,
                          N=1000,
                          constant_waves=True)

    return (F, supply_relative_deficit)
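A hedged aside on the frozen distribution returned by gumbel_r.freeze: design water levels for a given return period can be read directly from it. The parameters below are illustrative, not model output:

from scipy.stats import gumbel_r

water_level_pdf = gumbel_r.freeze(loc=0.4, scale=0.15)  # illustrative yearly-maximum water level (m)
for T in (10, 100, 1000):
    print(f"{T}-year water level ~ {water_level_pdf.ppf(1 - 1 / T):.2f} m")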