import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt


def plot_gev(shape, loc, scale):
    """Draw a sample from a frozen GEV and plot its empirical CDF."""
    dist = ss.genextreme(c=shape, loc=loc, scale=scale)
    xs = dist.rvs(size=100)
    ys = dist.cdf(xs)
    # The CDF is monotonic, so sorting xs and ys independently keeps the
    # pairs aligned.
    plt.plot(np.sort(xs), np.sort(ys))
    return dist
from scipy import stats


def extremeDistribution_blockMaximaGEV(x, t, t_st):
    """Approximate the short-term extreme distribution using the block
    maxima method and the Generalized Extreme Value distribution.

    Parameters
    ----------
    x : np.array
        Independent random variable (global peaks)
    t : np.array
        Time vector corresponding to x
    t_st : float
        Short-term period

    Returns
    -------
    stextreme_dist : scipy.stats rv_frozen
        Probability distribution of the short-term extreme.
    ste_params : np.array length 3
        Parameters of the short-term extreme distribution (Generalized
        Extreme Value) [shape_c, loc, scale].
    block_maxima : np.array
        Block maxima (i.e. largest peak in each block).
    """
    block_maxima = blockMaxima(x, t, t_st)
    ste_parameters = stats.genextreme.fit(block_maxima)
    stextreme_dist = stats.genextreme(c=ste_parameters[0],
                                      loc=ste_parameters[1],
                                      scale=ste_parameters[2])
    return stextreme_dist, ste_parameters, block_maxima
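# The blockMaxima helper used above is not shown in this snippet. Below is a
# minimal sketch of what it might do, plus a usage example on synthetic
# peaks; the block-splitting logic and all names here are illustrative
# assumptions, not the original API.
import numpy as np
from scipy import stats


def blockMaxima(x, t, t_st):
    # Split the record into consecutive blocks of length t_st and keep the
    # largest peak in each block (assumed behavior).
    n_blocks = int((t[-1] - t[0]) // t_st)
    maxima = []
    for i in range(n_blocks):
        mask = (t >= t[0] + i * t_st) & (t < t[0] + (i + 1) * t_st)
        if mask.any():
            maxima.append(x[mask].max())
    return np.array(maxima)


# Usage: 1 Hz peaks over ten short-term periods of 1000 s each.
t = np.arange(0.0, 10000.0, 1.0)
x = stats.rayleigh(scale=2.0).rvs(size=t.size, random_state=0)
dist, params, bm = extremeDistribution_blockMaximaGEV(x, t, t_st=1000.0)
print(params)          # (shape c, loc, scale)
print(dist.ppf(0.99))  # 99th-percentile short-term extreme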
def parse_epistatic_data(self):
    # Assumes: import os, numpy as np, from scipy.stats import genextreme,
    # and the project's exp_data models module.
    if self.software.name != 'bagpipe':
        data_file = (self.base_dir + os.sep + self.data_prefix + '.' +
                     str(self.sweep_size / 1000000) + '.dat')
    else:
        data_file = (self.base_dir + os.sep + self.data_prefix + '_add.' +
                     str(self.sweep_size / 1000000) + '.dat')
    run_number = 1
    np_extreme_values = np.genfromtxt(data_file, skip_header=1,
                                      usecols=(1, 2, 3))
    frozen_gev = genextreme(self.gev_model_params.shape,
                            loc=self.gev_model_params.location,
                            scale=self.gev_model_params.scale)
    epistatic_data_list = []
    for data in np_extreme_values:
        adj_pvalues = self.get_adjusted_pvalue_scipy(data, frozen_gev)
        # Runs 1..1000 get the first SNP id, later runs the second.
        snp_id = 'fa0' if run_number <= 1000 else 'fa1'
        epistatic_data_list.append(exp_data.EpistaticModel(
            parameter=self.params,
            software=self.software,
            run_number=run_number % 1000,
            locus_span=self.sweep_size,
            snp_id=snp_id,
            locus_pvalue=data[0],
            adj_locus_pvalue=adj_pvalues[0],
            non_locus_pvalue=data[1],
            adj_non_locus_pvalue=adj_pvalues[1],
            non_chrm_pvalue=data[2],
            adj_non_chrm_pvalue=adj_pvalues[2]))
        run_number += 1
    exp_data.EpistaticModel.objects.bulk_create(epistatic_data_list)
    return 0
import numpy as np
from scipy import special, stats


def generate_gev_noise(c, N, Y, target_R2=0.005):
    """Draw N observations from a generalized extreme value distribution.

    The GEV distribution is described at
    https://docs.scipy.org/doc/scipy/reference/tutorial/stats/continuous_genextreme.html.

    Parameters
    ----------
    c : real
        The shape parameter for the distribution. The distribution is
        skewed left if c > 0 and skewed right if c < 0. The larger the
        magnitude of c, the higher the kurtosis of the distribution.
    N : integer
        The number of observations to draw.
    Y : vector of reals
        The true output of the data, i.e. f(X).
    target_R2 : real in [0, 1]
        The target R^2, if errors were Gaussian. For example, 0.5% would
        be 0.005. Default is 0.5%.
    """
    coef = (1 - target_R2) / target_R2
    scale = np.sqrt(coef * np.var(Y))
    # Center the distribution to have mean 0. The mean of a GEV with scipy
    # shape c is loc - scale * (gamma(1 + c) - 1) / c, so the shift must be
    # proportional to the scale (the original omitted the `scale` factor).
    center = -scale / c * (1 - special.gamma(1 + c))
    noise = stats.genextreme(c=c, loc=center, scale=scale).rvs(size=N)
    return noise
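# Sanity check of the target-R^2 scaling in generate_gev_noise (a sketch):
# the noise scale is sqrt((1 - R2)/R2 * var(Y)), so var(noise)/var(Y) is
# roughly (1 - R2)/R2 times the variance of a unit-scale GEV with shape c,
# and the mean should be ~0 after centering.
import numpy as np

Y = np.linspace(0.0, 1.0, 10000)  # stand-in for the true outputs f(X)
noise = generate_gev_noise(c=0.3, N=Y.size, Y=Y, target_R2=0.005)
print(noise.mean())               # approximately 0
print(np.var(noise) / np.var(Y))  # approximately (1 - R2)/R2 * Var[GEV(c)]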
def testBijector(self):
    loc = 0.3
    scale = 5.
    concentration = np.array([[[-5.5], [-20], [0.], [1.]]], dtype=np.float32)
    bijector = tfb.GeneralizedExtremeValueCDF(loc=loc,
                                              scale=scale,
                                              concentration=concentration,
                                              validate_args=True)
    self.assertStartsWith(bijector.name, "generalizedextremevalue")
    x = np.array([[[0.], [-3.], [0.], [4.2]]], dtype=np.float32)

    # scipy's genextreme uses the opposite sign convention for the shape,
    # so pass -concentration to get the same GeneralizedExtremeValue law.
    gev_dist = stats.genextreme(-concentration, loc=loc, scale=scale)
    y = gev_dist.cdf(x).astype(np.float32)

    self.assertAllClose(y, self.evaluate(bijector.forward(x)))
    self.assertAllClose(x, self.evaluate(bijector.inverse(y)))
    self.assertAllClose(
        np.squeeze(gev_dist.logpdf(x), axis=-1),
        self.evaluate(bijector.forward_log_det_jacobian(x, event_ndims=1)))
    self.assertAllClose(
        self.evaluate(-bijector.inverse_log_det_jacobian(y, event_ndims=1)),
        self.evaluate(bijector.forward_log_det_jacobian(x, event_ndims=1)),
        rtol=1e-4,
        atol=0.)
def parse_additive_env_sweep(self):
    # Same layout as parse_epistatic_data above, for the additive
    # environmental sweep results.
    data_file = (self.base_dir + os.sep + self.data_prefix + '.' +
                 str(self.sweep_size / 1000000) + '.dat')
    run_number = 1
    np_extreme_values = np.genfromtxt(data_file, skip_header=1,
                                      usecols=(1, 2, 3))
    frozen_gev = genextreme(self.gev_model_params.shape,
                            loc=self.gev_model_params.location,
                            scale=self.gev_model_params.scale)
    additive_models_list = []
    for data in np_extreme_values:
        adj_pvalues = self.get_adjusted_pvalue_scipy(data, frozen_gev)
        additive_models_list.append(exp_data.AdditiveEnvironmentalSweepModel(
            parameter=self.params,
            software=self.software,
            run_number=run_number,
            locus_span=self.sweep_size,
            locus_pvalue=data[0],
            adj_locus_pvalue=adj_pvalues[0],
            non_locus_pvalue=data[1],
            adj_non_locus_pvalue=adj_pvalues[1],
            non_chrm_pvalue=data[2],
            adj_non_chrm_pvalue=adj_pvalues[2]))
        run_number += 1
    exp_data.AdditiveEnvironmentalSweepModel.objects.bulk_create(
        additive_models_list)
    return 0
def _fit(self):
    # The fit can be made with Maximum Likelihood Estimation (MLE) or with
    # L-moments. L-moments is fast and accurate most of the time for the
    # GEV distribution.
    #
    # MLE FIT
    # MLE sometimes gives unstable values if we don't provide an initial
    # guess of the parameters. Loc and scale are more or less stable, but
    # the shape can be quite unstable depending on the input data. This is
    # why we use L-moments to obtain starting values for the MLE
    # optimization. For MLE we use fmin_bfgs, as it is faster than the
    # alternatives and, given a good first guess, gives accurate results.
    if self.fit_method == 'mle':
        # Initial guess via L-moments to make the GEV fit more stable.
        _params0 = _lmdistr.gev.lmom_fit(self.data)
        # The MLE fit starts from the L-moments estimators obtained above.
        _params = _st.genextreme.fit(self.data, _params0['c'],
                                     loc=_params0['loc'],
                                     scale=_params0['scale'],
                                     optimizer=_op.fmin_bfgs)
        self.params = OrderedDict()
        # Note: scipy defines the shape parameter with the opposite sign
        # to that used by R packages, some textbooks, Wikipedia, ...
        self.params["shape"] = _params[0]
        self.params["location"] = _params[1]
        self.params["scale"] = _params[2]

    # L-MOMENTS FIT
    if self.fit_method == 'lmoments':
        _params = _lmdistr.gev.lmom_fit(self.data)
        self.params = OrderedDict()
        # lmoments3 uses the same sign convention for the shape as scipy
        # (opposite to R packages, some textbooks, Wikipedia, ...).
        self.params["shape"] = _params['c']
        self.params["location"] = _params['loc']
        self.params["scale"] = _params['scale']

    # METHOD OF MOMENTS FIT
    if self.fit_method == 'mom':
        _params = _gev_momfit(self.data)
        self.params = OrderedDict()
        self.params["shape"] = _params[0]
        self.params["location"] = _params[1]
        self.params["scale"] = _params[2]

    # Estimators and a frozen distribution built from them.
    self.c = self.params['shape']        # shape
    self.loc = self.params['location']   # location
    self.scale = self.params['scale']    # scale
    self.distr = _st.genextreme(self.c, loc=self.loc, scale=self.scale)
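# The L-moments-seeded MLE pattern above, stripped to its essentials. A
# sketch assuming the lmoments3 package, whose gev.lmom_fit returns an
# OrderedDict with keys 'c', 'loc', and 'scale'.
import scipy.optimize as op
import scipy.stats as st
from lmoments3 import distr as lmdistr

data = st.genextreme(-0.1, loc=10.0, scale=2.0).rvs(500, random_state=1)
p0 = lmdistr.gev.lmom_fit(data)  # fast, stable starting values
mle = st.genextreme.fit(data, p0['c'], loc=p0['loc'], scale=p0['scale'],
                        optimizer=op.fmin_bfgs)
print(dict(zip(('shape', 'location', 'scale'), mle)))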
def mml(self):
    """Fit the GEV by L-moments."""
    if self.data is None:
        raise e.DataNotExist("Data is None", 25)
    mml = gev.lmom_fit(self.data)
    self.estimador = 'MML'
    self.shape = mml['c']
    self.loc = mml['loc']
    self.scale = mml['scale']
    self.dist = genextreme(c=self.shape, loc=self.loc, scale=self.scale)
    return self.shape, self.loc, self.scale
def mvs(self):
    """Fit the GEV by maximum likelihood."""
    if self.data is None:
        raise e.DataNotExist("Data is None", 35)
    mvs = genextreme.fit(data=self.data)
    self.estimador = 'MVS'
    self.shape = mvs[0]
    self.loc = mvs[1]
    self.scale = mvs[2]
    self.dist = genextreme(c=self.shape, loc=self.loc, scale=self.scale)
    return self.shape, self.loc, self.scale
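# Once a frozen distribution is stored in self.dist, return levels follow
# directly from its quantile function. A sketch; interpreting the data as
# annual maxima (so T is a return period in years) is an assumption.
from scipy.stats import genextreme

dist = genextreme(c=-0.1, loc=10.0, scale=2.0)  # e.g. self.dist after a fit
for T in (10, 50, 100):
    # Level exceeded on average once every T years.
    print(T, dist.ppf(1.0 - 1.0 / T))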
import numpy as np
from scipy.stats import genextreme, trim_mean


def extreme_value_prob(NPM, perc):
    """Fit a GEV to per-row trimmed means of the top `perc` fraction of
    absolute values and return each row's CDF value under the fit."""
    n = NPM.shape[0]
    t = NPM.shape[1]
    n_perc = int(round(t * perc))
    m = np.zeros(n)
    for i in range(n):
        temp = np.abs(NPM[i, :])
        temp = np.sort(temp)
        temp = temp[t - n_perc:]  # keep the largest n_perc values
        m[i] = trim_mean(temp, 0.05)
    params = genextreme.fit(m)
    # Freeze all three fitted parameters (the original froze only the
    # shape, silently using loc=0, scale=1).
    ev = genextreme(*params)
    probs = ev.cdf(m)
    return probs
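# Usage sketch for extreme_value_prob on a random matrix; the shapes and
# the 1% tail fraction are illustrative.
import numpy as np

rng = np.random.default_rng(0)
NPM = rng.standard_normal((20, 500))        # e.g. 20 subjects x 500 voxels
probs = extreme_value_prob(NPM, perc=0.01)
print(probs.shape)                          # one CDF value per row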
def calc_match_statistics(self, oligo, charges, modifications, ms, ppm_error,
                          score_to_test, random_oligo_to_test=1000):
    # Assumes: from random import choice, randint; import pandas as pd;
    # from scipy.stats import genextreme.
    fr = Fragmentor()
    matcher = Matcher()
    column_headers = ['Sequence', 'Score']
    min_char = len(oligo)
    max_char = len(oligo)
    allchar = ['A', 'G', 'C', 'T', 'U']
    data_to_save = []
    for i in range(random_oligo_to_test):
        random_oligo = "".join(
            choice(allchar) for _ in range(randint(min_char, max_char)))
        fragments = fr.fragment_oligo(random_oligo)
        df_search_space = matcher.create_search_space(
            fragments, charges, modifications)
        df_results = matcher.match_oligo_fragments_pandas(
            df_search_space, ms, ppm_error)
        score = self.simple_score(df_results)
        print('Oligo: {0:<30} Score: {1:7.3f}'.format(random_oligo, score))
        data_to_save.append([random_oligo, score])
    dist_df = pd.DataFrame(data_to_save, columns=column_headers)
    extreme_fit = genextreme.fit(dist_df.Score)
    c, loc, scale = extreme_fit
    print("Extreme value fits c = {0}, loc = {1}, scale = {2}".format(
        c, loc, scale))
    extreme_to_plot = genextreme(c, loc, scale)
    # Note: this is the density at the tested score, not a tail
    # probability; extreme_to_plot.sf(score_to_test) would give
    # P(score >= observed).
    p_value = extreme_to_plot.pdf(score_to_test)
    print("p value of score {0} = {1}".format(score_to_test, p_value))
    return dist_df, p_value, score_to_test, extreme_to_plot
def testGEVLogPdf(self):
    batch_size = 6
    loc = np.array([0.] * batch_size, dtype=self._dtype)
    scale = np.array([3.] * batch_size, dtype=self._dtype)
    conc = np.array([2.] * batch_size, dtype=self._dtype)
    gev_dist = stats.genextreme(-conc, loc=loc, scale=scale)
    x = np.array([2., 3., 4., 5., 6., 7.], dtype=self._dtype)

    gev = tfd.GeneralizedExtremeValue(loc=self.make_tensor(loc),
                                      scale=self.make_tensor(scale),
                                      concentration=self.make_tensor(conc),
                                      validate_args=True)
    log_pdf = gev.log_prob(self.make_tensor(x))
    self.assertAllClose(gev_dist.logpdf(x), self.evaluate(log_pdf))

    pdf = gev.prob(x)
    self.assertAllClose(gev_dist.pdf(x), self.evaluate(pdf))
def testGEVLogPdfMultidimensional(self):
    batch_size = 6
    loc = np.array([[-2.0, -4.0, -5.0]] * batch_size, dtype=self._dtype)
    scale = np.array([1.0], dtype=self._dtype)
    conc = np.array([[0.0, 1.0, 2.0]] * batch_size, dtype=self._dtype)
    gev_dist = stats.genextreme(-conc, loc=loc, scale=scale)
    x = np.array([[2., 3., 4., 5., 6., 7.]], dtype=self._dtype).T

    gev = tfd.GeneralizedExtremeValue(loc=self.make_tensor(loc),
                                      scale=self.make_tensor(scale),
                                      concentration=self.make_tensor(conc),
                                      validate_args=True)
    log_pdf = gev.log_prob(self.make_tensor(x))
    self.assertAllClose(self.evaluate(log_pdf), gev_dist.logpdf(x))

    pdf = gev.prob(self.make_tensor(x))
    self.assertAllClose(self.evaluate(pdf), gev_dist.pdf(x))
def testGEVSample(self):
    loc = self._dtype(4.0)
    scale = self._dtype(1.0)
    conc = self._dtype(0.2)
    n = int(1e6)
    gev_dist = stats.genextreme(-conc, loc=loc, scale=scale)

    gev = tfd.GeneralizedExtremeValue(loc=self.make_tensor(loc),
                                      scale=self.make_tensor(scale),
                                      concentration=self.make_tensor(conc),
                                      validate_args=True)
    samples = gev.sample(n, seed=test_util.test_seed())
    sample_values = self.evaluate(samples)
    self.assertEqual((n,), sample_values.shape)
    self.assertAllClose(gev_dist.mean(), sample_values.mean(), rtol=.01)
    self.assertAllClose(gev_dist.var(), sample_values.var(), rtol=.01)
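# The tests above all pass -concentration as scipy's shape parameter. A
# standalone check of that sign convention against the textbook GEV CDF
# exp(-(1 + xi*(x - mu)/sigma)**(-1/xi)); a sketch, not part of the suite.
import numpy as np
from scipy import stats

xi, mu, sigma = 0.2, 4.0, 1.0
x = np.linspace(2.0, 10.0, 7)
textbook = np.exp(-(1.0 + xi * (x - mu) / sigma) ** (-1.0 / xi))
scipy_cdf = stats.genextreme(-xi, loc=mu, scale=sigma).cdf(x)
print(np.allclose(textbook, scipy_cdf))  # True: scipy's c is -xi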
def testGEVSampleMultidimensionalVar(self):
    loc = np.array([2.0, 4.0, 5.0], dtype=self._dtype)
    scale = np.array([1.0, 0.8, 0.5], dtype=self._dtype)
    conc = np.array([0.2], dtype=self._dtype)
    gev_dist = stats.genextreme(-conc, loc=loc, scale=scale)
    n = int(1e6)

    gev = tfd.GeneralizedExtremeValue(loc=self.make_tensor(loc),
                                      scale=self.make_tensor(scale),
                                      concentration=self.make_tensor(conc),
                                      validate_args=True)
    samples = gev.sample(n, seed=test_util.test_seed())
    sample_values = self.evaluate(samples)
    self.assertAllClose(gev_dist.var(), sample_values.var(axis=0),
                        rtol=.03, atol=0)
def plot_ks_gev_gauss(data_sample, alg_name):
    data_min = min(data_sample)
    data_max = max(data_sample)
    n_points = 100
    plot_points = [data_min + (i / n_points) * (data_max - data_min)
                   for i in range(0, n_points + 1)]

    # Estimate the Gaussian:
    nrm_fit = norm.fit(data_sample)
    # Gaussian parameters from the fit:
    (mu, sigma) = nrm_fit
    rv_nrm = norm(loc=mu, scale=sigma)
    # Create data from the estimated Gaussian to plot:
    nrm_pdf = rv_nrm.pdf(plot_points)

    # Estimate the GEV:
    gev_fit = genextreme.fit(data_sample)
    # GEV parameters from the fit:
    c, loc, scale = gev_fit
    rv_gev = genextreme(c, loc=loc, scale=scale)
    # Create data from the estimated GEV to plot:
    gev_pdf = rv_gev.pdf(plot_points)

    # Use kernel density estimation for comparison:
    sns.set(color_codes=True)
    plt.figure()
    ax = sns.kdeplot(data_sample, kernel='gau', label='Kernel Density')
    # ax.plot(plot_points, gev_pdf, label='Estimated GEV')
    ax.plot(plot_points, nrm_pdf, label='Estimated Gaussian')
    ax.legend()

    # Use the title to report the fitted parameters:
    plot_title = "PDF estimated from data created for " + alg_name + "\n"
    # plot_title += ("Estimated parameters for GEV: location={:.2f} "
    #                "scale={:.2f} c={:.2f}\n").format(loc, scale, c)
    plot_title += ("Estimated parameters for Gaussian: location={:.2f} "
                   "scale={:.2f}\n").format(mu, sigma)
    plt.title(plot_title)
    plt.xlabel("Independent Variable")
    plt.ylabel("Probability Density from " + str(len(data_sample)) + " points")
    plt.tight_layout()
    plt.draw()
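# plot_ks_gev_gauss promises a KS comparison in its name but never computes
# one. A sketch of how the two fits could be scored with scipy.stats.kstest
# (which accepts a distribution name plus fitted args); note that p-values
# are optimistic when the parameters were fitted to the same data.
from scipy import stats

data_sample = stats.genextreme(-0.1, loc=5.0, scale=1.5).rvs(
    400, random_state=2)
gev_fit = stats.genextreme.fit(data_sample)
nrm_fit = stats.norm.fit(data_sample)
print('GEV  KS:', stats.kstest(data_sample, 'genextreme', args=gev_fit))
print('Norm KS:', stats.kstest(data_sample, 'norm', args=nrm_fit))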
def testGEVMean(self):
    loc = np.array([2.0], dtype=self._dtype)
    scale = np.array([1.5], dtype=self._dtype)
    conc = np.array([-0.9, 0.0], dtype=self._dtype)
    gev_dist = stats.genextreme(-conc, loc=loc, scale=scale)

    gev = tfd.GeneralizedExtremeValue(loc=self.make_tensor(loc),
                                      scale=self.make_tensor(scale),
                                      concentration=self.make_tensor(conc),
                                      validate_args=True)
    self.assertAllClose(self.evaluate(gev.mean()), gev_dist.mean())

    # The GEV mean is infinite when the concentration is >= 1.
    conc_with_inf_mean = np.array([2.], dtype=self._dtype)
    gev_with_inf_mean = tfd.GeneralizedExtremeValue(
        loc=self.make_tensor(loc),
        scale=self.make_tensor(scale),
        concentration=self.make_tensor(conc_with_inf_mean),
        validate_args=True)
    self.assertAllClose(self.evaluate(gev_with_inf_mean.mean()), [np.inf])
def evdplot(df, outfile):
    """Plot the distribution of escores with a fitted extreme value
    distribution (EVD) and simple goodness-of-fit statistics."""
    matplotlib.use('pdf')
    f, (ax1, ax2, ax3) = subplots(1, 3, figsize=[10, 5])

    # Escore distribution:
    df.escore.hist(ax=ax1, density=True, bins=50, histtype='stepfilled',
                   alpha=0.2, label='EScore')
    c, loc, scale = stats.genextreme.fit(df.escore)
    rv = stats.genextreme(c, loc=loc, scale=scale)
    x = numpy.linspace(0, df.escore.max(), 50)
    ax1.plot(x, rv.pdf(x), 'k-', lw=1, label='EVD')
    ax1.legend(loc='best')
    ax1.set_xlabel('Enrichment score')
    ax1.set_ylabel('Probability')

    # Cumulative distribution:
    df.escore.hist(ax=ax2, cumulative=True, density=True, bins=50,
                   histtype='stepfilled', alpha=0.2, label='EScore')
    ax2.plot(x, rv.cdf(x), 'k-', lw=1, label='EVD')
    ax2.legend(loc='best')
    ax2.set_xlabel('Enrichment score')
    ax2.set_ylabel('Probability')

    # Goodness-of-fit tests:
    nbin = int(round(1 + numpy.log2(df.escore.size)))
    x = numpy.linspace(0, df.escore.max(), nbin + 1)
    y = rv.cdf(x)
    counts, bin_edges = numpy.histogram(df.escore, bins=nbin)
    # Prepend the count of zero escores (Series.nonzero() was removed in
    # pandas; counting zeros directly is equivalent to the original).
    counts = [int((df.escore == 0).sum())] + list(counts)
    cdf = numpy.cumsum(counts)
    cdf = cdf / float(max(cdf))
    kst, ksp = stats.ks_2samp(y, cdf)
    chit, chip = stats.chisquare(cdf, y)
    ax3.plot(bin_edges, y, 'r-', label='EVD')
    ax3.plot(bin_edges, cdf, 'b-', label='EScore')
    ax3.legend(loc='best')
    ax3.text(df.escore.max() / 4, 0.3,
             "KS test:\nstat={0:.2f},p={1:.2e}\nChiSquare test:\n"
             "stat={2:.2f},p={3:.2e}".format(kst, ksp, chit, chip))
    ax3.set_xlabel('Enrichment score')
    ax3.set_ylabel('Probability')
    savefig(outfile, format='pdf')
    return rv
def project_runtimes(workload, row, fig, ax):
    # Reconstructed: the original header read
    # `def compute_ehalf(distribution, a, b, intertions, ranks, gev):`, but
    # the body uses `row`, `iterations`, and the module-level `odist` and
    # `emma` helpers, and the loop below calls `project_runtimes`, so the
    # header did not match the body. `iterations` and `ranks` presumably
    # come from the experiment row.
    gevparam = row['actual_gev']
    gevdist = stats.genextreme(*gevparam)
    iterations = row['iterations']
    ranks = row['ranks']
    # Sample-projected runtime on 1 block is the expected value of the
    # fitted GEV distribution (moment(1); the original called moment(4),
    # which contradicts its own comment).
    runtime_sample_emma = [gevdist.moment(1) * iterations / 1000000]
    # Distribution-projected runtime based on either the original
    # distribution or its EMMA, projected to the initial number of ranks.
    rank = ranks
    if rank == 0:
        runtime_original_emma = [odist.moment(1) * iterations / 1000000]
    else:
        runtime_original_emma = [emma(odist, rank) * iterations / 1000000]
    # Project everything to more ranks.
    rank_list = []
    for i in range(1, 5):
        rank = ranks * (2 ** i)
        rank_list.append(rank)
        runtime_sample_emma.append(emma(gevdist, 2 ** i) * iterations / 1000000)
        runtime_original_emma.append(emma(odist, rank) * iterations / 1000000)
    # (In the full script these projections are presumably plotted on ax.)


# For each set of distributions and parameters, project runtimes from the
# smallest experiment and plot the larger experiments against the projection.
for frame in expdata.groupby(['workload', 'a', 'b', 'iterations']):
    exps = frame[1].sort_values(['ranks']).reset_index(drop=True)
    print('Projecting runtimes for experiment {0}'.format(frame[0]))
    fig, ax = plt.subplots()
    runtimes = []
    sizes = []
    for it, row in exps.iterrows():
        sizes.append(row['ranks'])
        runtimes.append(row['runtime'] / 1000000)
        if it == 0:
            project_runtimes(frame[0], row, fig, ax)
    ax.scatter(sizes, runtimes, label='Actual Runtimes')
    ax.grid()
    plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
    plt.show()
def test_gev_genextreme(case):
    gev = stats.genextreme(0)
    # Check EV copulas, cdf and transform against the R `evt` package.
    ev_tr, v1, v2, args, res0, res1, res2 = case
    y = [v1, v2]
    u = gev.cdf(y)
    res = copula_bv_ev(u, ev_tr, args=args)
    assert_allclose(res, res1, rtol=1e-13)

    ev = ExtremeValueCopula(ev_tr)
    # Evaluated at u = y:
    cdf1 = ev.cdf(y, args)
    assert_allclose(cdf1, res0, rtol=1e-13)
    # Evaluated at the transformed u = F(y):
    cdf1 = ev.cdf(u, args)
    assert_allclose(cdf1, res1, rtol=1e-13)

    cev = CopulaDistribution([gev, gev], ev, copargs=args)
    cdfd = cev.cdf(np.array(y), args=args)
    assert_allclose(cdfd, res1, rtol=1e-13)
    pdfd = cev.pdf(np.array(y), args=args)
    assert_allclose(pdfd, res2, rtol=1e-13)
def main():
    # Get a DataFrame of all experiments.
    df = getAllExperiments()

    # Keep only runs from experiment 15.
    df_filtered = df[df['Experiment'] == 15]

    # Get a specific run: no rabbit workload, no stencil, 8 ppn on 2 nodes.
    df_filtered = df_filtered[df_filtered['cores'] == 8]
    df_filtered = df_filtered[df_filtered['processors'] == 16]  # 2 nodes
    df_filtered = df_filtered[df_filtered['rabbit_workload'] == 0]
    df_filtered = df_filtered[df_filtered['stencil_size'] == 0]
    print(df_filtered)

    # Get the data from the specific run.
    data = getData(df_filtered)
    print(data.head())

    # Keep only the data from rank 0.
    data_rank0 = data[data['rank'] == 0]

    # Find the shape, location, and scale of the max data.
    shape, loc, scale = gevfit.fit(data_rank0['workload_max_usec'])
    print("Shape: ", shape, "\tLocation: ", loc, "\tScale: ", scale)

    # Get the overall runtime.
    runtime0 = data_rank0['interval_max_usec'].sum()
    dist = stats.genextreme(shape, loc, scale)
    print("Runtime: ", runtime0, "Microseconds at Initial ",
          data['comm_size'].iloc[0], " Ranks")

    # Projected runtime at k = 8 -> 128 ranks (16 nodes).
    projected = emma(dist, 8) * data['iterations'].iloc[0]
    print("K = 8: Projected Runtime: ", projected,
          "\tProjected Efficiency: ", runtime0 / projected)

    # Projected scale at which efficiency reaches 50%.
    eh = ehalf(runtime0, data['comm_size'].iloc[0], dist,
               data['iterations'].iloc[0])
    print("Expect 50% Efficiency at: ", eh, " Ranks")
def makeGraphs(rawdata, title, filename, pmodel=False, chaosnoise=False):
    for i in range(len(rawdata)):
        # Plot and fit the histogram of the time series. Note: as in the
        # original, the histogram and fits below always use rawdata[0][2]
        # (the first series), while the panels further down use rawdata[i].
        (mu, sigma) = norm.fit(rawdata[0][2])
        # rv_nrm = norm(loc=mu, scale=sigma)

        # Estimate the GEV:
        n = 8192
        ypoints = [min(rawdata[0][2]) +
                   (j / n) * (max(rawdata[0][2]) - min(rawdata[0][2]))
                   for j in range(0, n + 1)]
        gev_fit = genextreme.fit(rawdata[0][2])
        # GEV parameters from the fit:
        c, loc, scale = gev_fit
        mean, var, skew, kurt = genextreme.stats(c, moments='mvsk')
        rv_gev = genextreme(c, loc=loc, scale=scale)
        # Create data from the estimated GEV to plot:
        gev_pdf = rv_gev.pdf(ypoints)

        plt.title((title + "\nMu= {1:.3}, Sigma={2:.3}.").format(
            rawdata[i][0], mu, sigma))
        n, bins, patches = plt.hist(rawdata[0][2], 60, density=1,
                                    facecolor='powderblue', alpha=0.75,
                                    label="Normalized data")
        plt.plot(np.arange(min(bins), max(bins),
                           (max(bins) - min(bins)) / len(rawdata[0][2])),
                 gev_pdf[:len(rawdata[0][2])],
                 'r-', lw=5, alpha=0.6, label='genextreme pdf')
        plt.ylabel("Probability Density")
        plt.xlabel("Value")
        plt.legend()
        plt.savefig("PDF" + filename.format(i))
        plt.show()

        plt.figure(figsize=(20, 12))

        # Time series plot:
        ax1 = plt.subplot(211)
        ax1.set_title(title.format(rawdata[i][0]), fontsize=18)
        if pmodel:
            ax1.plot(rawdata[i][2], color="firebrick", linestyle='-',
                     label="Data")
        elif chaosnoise:
            ax1.plot(rawdata[i][1], rawdata[i][2], color="firebrick",
                     marker='o', linestyle='', label="Data")
        else:
            ax1.plot(rawdata[i][1], rawdata[i][2], color="firebrick",
                     linestyle='-', label="Data")

        # Detrended fluctuation analysis (DFA):
        ax2 = plt.subplot(223)
        ax2.set_title(r"Detrended Fluctuation Analysis $\alpha$={0:.3}".format(
            rawdata[i][3]), fontsize=15)
        ax2.plot(rawdata[i][4], rawdata[i][5], marker='o', linestyle='',
                 color="#12355B", label="{0:.3}".format(rawdata[i][3]))
        ax2.plot(rawdata[i][4], rawdata[i][6], color="#9DACB2")

        # Power spectral density (PSD):
        ax3 = plt.subplot(224)
        ax3.set_title(r"Power Spectrum Density $\beta$={0:.3}".format(
            rawdata[i][12]), fontsize=15)
        ax3.set_yscale('log')
        ax3.set_xscale('log')
        ax3.plot(rawdata[i][7], rawdata[i][8], '-', color='deepskyblue',
                 alpha=0.7)
        ax3.plot(rawdata[i][9], rawdata[i][10], color="darkblue", alpha=0.8)
        ax3.axvline(rawdata[i][7][rawdata[i][14]], color="darkblue",
                    linestyle='--')
        ax3.axvline(rawdata[i][7][rawdata[i][15]], color="darkblue",
                    linestyle='--')
        ax3.plot(rawdata[i][9],
                 rawdata[i][13](rawdata[i][9], rawdata[i][11], rawdata[i][12]),
                 color="#D65108", linestyle='-', linewidth=3,
                 label='{0:.3}'.format(rawdata[i][12]))
        ax2.set_xlabel("log(s)")
        ax2.set_ylabel("log F(s)")
        ax3.set_xlabel("Frequency (Hz)")
        ax3.set_ylabel("Power")
        ax3.legend()
        plt.savefig(filename.format(i))
        plt.show()
# Excerpt from a larger plotting routine: `trace`, `bins`, `t`, `X`, and
# `plot_idx` are defined earlier; `sp` is scipy.stats.
        dist = sp.gumbel_l(X[0], X[1])
        x = np.array(bins)
        y = dist.pdf(x)
        print(y)
        plt.plot(x, y, 'k--', linewidth=2)

        X = sp.norm.fit(np.array(trace))
        print(X)
        dist = sp.norm(X[0], X[1])
        x = np.array(bins)
        y = dist.pdf(x)
        plt.plot(x, y, 'r--', linewidth=2)

        X = sp.genextreme.fit(np.array(trace))
        print(X)
        dist = sp.genextreme(X[0], X[1], X[2])
        x = np.array(bins)
        y = dist.pdf(x)
        plt.plot(x, y, 'b--', linewidth=2)
        plt.title("%s" % (t), fontsize='small')
    elif plot_idx == 3:
        n, bins, patches = plt.hist(np.array(trace), 50, density=True,
                                    facecolor='green', alpha=0.75)
        X = sp.expon.fit(np.array(trace), floc=0)
        print(X)
# Compare a scipy GEV fit against parameters previously obtained in MATLAB.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import genextreme

data = pd.read_csv("times.csv")
data_list = data['data_queue_vessels_waiting_lock'].tolist()
params = genextreme.fit(data_list)

fig, ax = plt.subplots(1, 1)
c1 = -0.7776
scale1 = 15.08
loc1 = 13.16
rv1 = genextreme(c=c1, scale=scale1, loc=loc1)
# genextreme.fit returns (c, loc, scale); the original swapped loc and
# scale here.
rv2 = genextreme(c=params[0], loc=params[1], scale=params[2])
x1 = np.linspace(rv1.ppf(0.00001), rv1.ppf(0.99999), 100)
x2 = np.linspace(rv2.ppf(0.00001), rv2.ppf(0.99999), 100)
ax.plot(x2, rv2.pdf(x2), 'r-', lw=5, label='scipy')
ax.plot(x1, rv1.pdf(x1), 'k-', lw=2, label='matlab')
plt.show()
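# Round-trip check of the argument-order pitfall fixed above:
# genextreme.fit returns (c, loc, scale), in that order. A sketch with
# made-up parameters.
from scipy.stats import genextreme

true_c, true_loc, true_scale = -0.25, 13.0, 15.0
sample = genextreme(true_c, loc=true_loc, scale=true_scale).rvs(
    5000, random_state=3)
c, loc, scale = genextreme.fit(sample)
print(c, loc, scale)  # approximately -0.25, 13, 15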
def all_dists():
    # The distribution parameters were taken from the scipy.stats official
    # documentation examples. Total: 89 distributions.
    return {
        "alpha": stats.alpha(a=3.57, loc=0.0, scale=1.0),
        "anglit": stats.anglit(loc=0.0, scale=1.0),
        "arcsine": stats.arcsine(loc=0.0, scale=1.0),
        "beta": stats.beta(a=2.31, b=0.627, loc=0.0, scale=1.0),
        "betaprime": stats.betaprime(a=5, b=6, loc=0.0, scale=1.0),
        "bradford": stats.bradford(c=0.299, loc=0.0, scale=1.0),
        "burr": stats.burr(c=10.5, d=4.3, loc=0.0, scale=1.0),
        "cauchy": stats.cauchy(loc=0.0, scale=1.0),
        "chi": stats.chi(df=78, loc=0.0, scale=1.0),
        "chi2": stats.chi2(df=55, loc=0.0, scale=1.0),
        "cosine": stats.cosine(loc=0.0, scale=1.0),
        "dgamma": stats.dgamma(a=1.1, loc=0.0, scale=1.0),
        "dweibull": stats.dweibull(c=2.07, loc=0.0, scale=1.0),
        "erlang": stats.erlang(a=2, loc=0.0, scale=1.0),
        "expon": stats.expon(loc=0.0, scale=1.0),
        "exponnorm": stats.exponnorm(K=1.5, loc=0.0, scale=1.0),
        "exponweib": stats.exponweib(a=2.89, c=1.95, loc=0.0, scale=1.0),
        "exponpow": stats.exponpow(b=2.7, loc=0.0, scale=1.0),
        "f": stats.f(dfn=29, dfd=18, loc=0.0, scale=1.0),
        "fatiguelife": stats.fatiguelife(c=29, loc=0.0, scale=1.0),
        "fisk": stats.fisk(c=3.09, loc=0.0, scale=1.0),
        "foldcauchy": stats.foldcauchy(c=4.72, loc=0.0, scale=1.0),
        "foldnorm": stats.foldnorm(c=1.95, loc=0.0, scale=1.0),
        # "frechet_r": stats.frechet_r(c=1.89, loc=0.0, scale=1.0),
        # "frechet_l": stats.frechet_l(c=3.63, loc=0.0, scale=1.0),
        "genlogistic": stats.genlogistic(c=0.412, loc=0.0, scale=1.0),
        "genpareto": stats.genpareto(c=0.1, loc=0.0, scale=1.0),
        "gennorm": stats.gennorm(beta=1.3, loc=0.0, scale=1.0),
        "genexpon": stats.genexpon(a=9.13, b=16.2, c=3.28, loc=0.0, scale=1.0),
        "genextreme": stats.genextreme(c=-0.1, loc=0.0, scale=1.0),
        "gausshyper": stats.gausshyper(a=13.8, b=3.12, c=2.51, z=5.18,
                                       loc=0.0, scale=1.0),
        "gamma": stats.gamma(a=1.99, loc=0.0, scale=1.0),
        "gengamma": stats.gengamma(a=4.42, c=-3.12, loc=0.0, scale=1.0),
        "genhalflogistic": stats.genhalflogistic(c=0.773, loc=0.0, scale=1.0),
        "gilbrat": stats.gilbrat(loc=0.0, scale=1.0),
        "gompertz": stats.gompertz(c=0.947, loc=0.0, scale=1.0),
        "gumbel_r": stats.gumbel_r(loc=0.0, scale=1.0),
        "gumbel_l": stats.gumbel_l(loc=0.0, scale=1.0),
        "halfcauchy": stats.halfcauchy(loc=0.0, scale=1.0),
        "halflogistic": stats.halflogistic(loc=0.0, scale=1.0),
        "halfnorm": stats.halfnorm(loc=0.0, scale=1.0),
        "halfgennorm": stats.halfgennorm(beta=0.675, loc=0.0, scale=1.0),
        "hypsecant": stats.hypsecant(loc=0.0, scale=1.0),
        "invgamma": stats.invgamma(a=4.07, loc=0.0, scale=1.0),
        "invgauss": stats.invgauss(mu=0.145, loc=0.0, scale=1.0),
        "invweibull": stats.invweibull(c=10.6, loc=0.0, scale=1.0),
        "johnsonsb": stats.johnsonsb(a=4.32, b=3.18, loc=0.0, scale=1.0),
        "johnsonsu": stats.johnsonsu(a=2.55, b=2.25, loc=0.0, scale=1.0),
        "ksone": stats.ksone(n=1e03, loc=0.0, scale=1.0),
        "kstwobign": stats.kstwobign(loc=0.0, scale=1.0),
        "laplace": stats.laplace(loc=0.0, scale=1.0),
        "levy": stats.levy(loc=0.0, scale=1.0),
        "levy_l": stats.levy_l(loc=0.0, scale=1.0),
        "levy_stable": stats.levy_stable(alpha=0.357, beta=-0.675,
                                         loc=0.0, scale=1.0),
        "logistic": stats.logistic(loc=0.0, scale=1.0),
        "loggamma": stats.loggamma(c=0.414, loc=0.0, scale=1.0),
        "loglaplace": stats.loglaplace(c=3.25, loc=0.0, scale=1.0),
        "lognorm": stats.lognorm(s=0.954, loc=0.0, scale=1.0),
        "lomax": stats.lomax(c=1.88, loc=0.0, scale=1.0),
        "maxwell": stats.maxwell(loc=0.0, scale=1.0),
        "mielke": stats.mielke(k=10.4, s=3.6, loc=0.0, scale=1.0),
        "nakagami": stats.nakagami(nu=4.97, loc=0.0, scale=1.0),
        "ncx2": stats.ncx2(df=21, nc=1.06, loc=0.0, scale=1.0),
        "ncf": stats.ncf(dfn=27, dfd=27, nc=0.416, loc=0.0, scale=1.0),
        "nct": stats.nct(df=14, nc=0.24, loc=0.0, scale=1.0),
        "norm": stats.norm(loc=0.0, scale=1.0),
        "pareto": stats.pareto(b=2.62, loc=0.0, scale=1.0),
        "pearson3": stats.pearson3(skew=0.1, loc=0.0, scale=1.0),
        "powerlaw": stats.powerlaw(a=1.66, loc=0.0, scale=1.0),
        "powerlognorm": stats.powerlognorm(c=2.14, s=0.446, loc=0.0, scale=1.0),
        "powernorm": stats.powernorm(c=4.45, loc=0.0, scale=1.0),
        "rdist": stats.rdist(c=0.9, loc=0.0, scale=1.0),
        "reciprocal": stats.reciprocal(a=0.00623, b=1.01, loc=0.0, scale=1.0),
        "rayleigh": stats.rayleigh(loc=0.0, scale=1.0),
        "rice": stats.rice(b=0.775, loc=0.0, scale=1.0),
        "recipinvgauss": stats.recipinvgauss(mu=0.63, loc=0.0, scale=1.0),
        "semicircular": stats.semicircular(loc=0.0, scale=1.0),
        "t": stats.t(df=2.74, loc=0.0, scale=1.0),
        "triang": stats.triang(c=0.158, loc=0.0, scale=1.0),
        "truncexpon": stats.truncexpon(b=4.69, loc=0.0, scale=1.0),
        "truncnorm": stats.truncnorm(a=0.1, b=2, loc=0.0, scale=1.0),
        "tukeylambda": stats.tukeylambda(lam=3.13, loc=0.0, scale=1.0),
        "uniform": stats.uniform(loc=0.0, scale=1.0),
        "vonmises": stats.vonmises(kappa=3.99, loc=0.0, scale=1.0),
        "vonmises_line": stats.vonmises_line(kappa=3.99, loc=0.0, scale=1.0),
        "wald": stats.wald(loc=0.0, scale=1.0),
        "weibull_min": stats.weibull_min(c=1.79, loc=0.0, scale=1.0),
        "weibull_max": stats.weibull_max(c=2.87, loc=0.0, scale=1.0),
        "wrapcauchy": stats.wrapcauchy(c=0.0311, loc=0.0, scale=1.0),
    }
def key_distribution(num_samples):
    # Positional arguments freeze (c, loc, scale) =
    # (30.7984, 8.20449, 0.078688).
    dist = genextreme(30.7984, 8.20449, 0.078688)
    return dist.rvs(num_samples)
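# A reproducible variant of key_distribution (a sketch): frozen rvs accepts
# a random_state, so repeated calls with the same seed return the same keys.
from scipy.stats import genextreme


def key_distribution_seeded(num_samples, seed=None):
    dist = genextreme(30.7984, 8.20449, 0.078688)
    return dist.rvs(num_samples, random_state=seed)


print(key_distribution_seeded(5, seed=42))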
"""
@author: rarossi
"""
from math import log, exp

from matplotlib import pyplot as plt
from numpy import linspace
from scipy import stats as ss

sz = 1000
mydistro = ss.gumbel_r
myparams = (0, 1)

# Plotting-position transform for Gumbel probability paper.
myfunc = lambda x: -log(-log(x))
# myfunc = lambda x: -log(x)
# myfunc = lambda x: x
# myfunc = lambda x: exp(x)

sample = [mydistro.rvs(*myparams) for _ in range(sz)]
sample.sort()
emp = [(i + 0.6) / (sz + 0.4) for i in range(sz)]
dist_emp = list(map(myfunc, emp))
ge = ss.genextreme(*ss.genextreme.fit(sample))
x = linspace(min(sample), max(sample), 100)

plt.subplot(211)
plt.hist(sample, density=True, bins=20)
plt.plot(x, ge.pdf(x))
plt.subplot(212)
plt.plot(sample, dist_emp, '.')
plt.plot(sample, list(map(myfunc, ge.cdf(sample))))
# Example following the scipy.stats.genextreme documentation; the figure
# setup and imports are assumed from the full example.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import genextreme

fig, ax = plt.subplots(1, 1)

# Calculate the first four moments:
c = -0.1
mean, var, skew, kurt = genextreme.stats(c, moments='mvsk')

# Display the probability density function (``pdf``):
x = np.linspace(genextreme.ppf(0.01, c), genextreme.ppf(0.99, c), 100)
ax.plot(x, genextreme.pdf(x, c), 'r-', lw=5, alpha=0.6,
        label='genextreme pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:
rv = genextreme(c)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check the accuracy of ``cdf`` and ``ppf``:
vals = genextreme.ppf([0.001, 0.5, 0.999], c)
np.allclose([0.001, 0.5, 0.999], genextreme.cdf(vals, c))
# True

# Generate random numbers:
r = genextreme.rvs(c, size=1000)

# And compare the histogram:
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
""" from scipy import stats as ss from numpy import linspace from matplotlib import pyplot as plt from math import log, exp sz = 1000 mydistro = ss.gumbel_r myparams = (0, 1) myfunc = lambda x: -log(-log(x)) # myfunc = lambda x: -log(x) # myfunc = lambda x: x # myfunc = lambda x: exp(x) sample = [mydistro.rvs(*myparams) for _ in range(sz)] sample.sort() emp = [(i+0.6)/(sz+0.4) for i in range(sz)] dist_emp = list(map(myfunc, emp)) ge = ss.genextreme(*ss.genextreme.fit(sample)) x = linspace(min(sample), max(sample), 100) plt.subplot(211) plt.hist(sample, normed=True, bins=20) plt.plot(x, ge.pdf(x)) plt.subplot(212) plt.plot(sample, dist_emp, '.') plt.plot(sample, list(map(myfunc, ge.cdf(sample))))
def fitting(dataset, frequency=200):
    # Collect (structure context, reactivity, sequence id) records.
    reactivities = []
    for seq_id in dataset.keys():
        structure_contexts, reactivity = dataset[seq_id]
        L = len(reactivity)
        assert len(structure_contexts) == L, \
            "Structure context and reactivities have different lengths"
        for i in range(L):
            reactivities.append((structure_contexts[i], reactivity[i], seq_id))
    reactivities = pd.DataFrame.from_records(reactivities)
    reactivities.columns = ["structure-context", "reactivity", "sequence-id"]

    # Only use reactivities larger than zero.
    reactivities = reactivities[reactivities["reactivity"] > 0]

    # Count the instances of each structure context.
    structure_contexts = reactivities["structure-context"].unique()
    n_context = structure_contexts.shape[0]
    n_instances = reactivities.groupby("structure-context").apply(
        lambda x: x.shape[0])
    statistics = pd.DataFrame(index=structure_contexts,
                              columns=["instances", "assignment"])
    statistics.loc[n_instances.index, "instances"] = n_instances.values
    print("5 mer should have 32 structure contexts")
    frequent_set = set(n_instances[n_instances > frequency].index)
    print("{} present in the input dataset".format(n_context))
    print("{} meet the frequency cutoff".format(len(frequent_set)))

    # Split structure contexts into frequent and infrequent ones.
    frequent_reactivities = reactivities[
        reactivities["structure-context"].isin(frequent_set)]
    not_frequent_reactivities = reactivities[
        ~reactivities["structure-context"].isin(frequent_set)]

    # Fit an individual generalized extreme value distribution for each
    # frequent 5-mer instance. Each rare instance is assigned to the most
    # similar fitted instance (the fitted model with the highest likelihood).
    modelDict = {}          # structure context -> fitted model
    likelihoodsDict = {}    # log likelihood of rare contexts under each model
    dubious_instances = []  # contexts that do not fit a GEV well
    for structure_context in frequent_reactivities["structure-context"].unique():
        react = reactivities[reactivities["structure-context"] ==
                             structure_context]["reactivity"].values
        shape, location, scale = genextreme.fit(react)
        if shape > 0:
            dubious_instances.append(structure_context)
            continue
        model = genextreme(shape, location, scale)
        modelDict[structure_context] = model
        likelihoodsDict[structure_context] = not_frequent_reactivities.groupby(
            "structure-context").apply(
                lambda x: np.log(model.pdf(x["reactivity"].values)).sum())

    # Summarize the likelihoods of the rare structure contexts.
    likelihoods = pd.DataFrame(likelihoodsDict)
    # Map each rare structure context to its most similar frequent one.
    assignment0 = dict(likelihoods.idxmax(axis=1))
    # Map each frequent structure context to its list of rare contexts.
    assignment = defaultdict(list)
    for less, more in assignment0.items():
        statistics.loc[less, "assignment"] = more
        assignment[more].append(less)

    # Refit the genextreme model for each frequent structure context on the
    # pooled data of that context and its assigned rare contexts.
    for more, less in assignment.items():
        contexts = set(less)
        contexts.add(more)
        react = reactivities[reactivities["structure-context"].isin(
            contexts)]["reactivity"].values
        shape, location, scale = genextreme.fit(react)
        model = genextreme(shape, location, scale)
        for structure_context in contexts:
            modelDict[structure_context] = model

    if len(dubious_instances) > 0:
        print("Fitting for the following instances gives a positive shape "
              "value, which is dubious")
        print(",".join(dubious_instances))
        for structure_context in dubious_instances:
            # Reset per context (the original initialized max_ll only once,
            # letting the maximum leak across contexts).
            max_ll = np.nan
            for fitted_context, model in modelDict.items():
                x = reactivities[reactivities["structure-context"] ==
                                 fitted_context]["reactivity"].values
                current_ll = np.log(model.pdf(x)).sum()
                if np.isnan(max_ll) or max_ll < current_ll:
                    max_ll = current_ll
                    max_instance = fitted_context
            print("{} is assigned to {}".format(structure_context,
                                                max_instance))
            statistics.loc[structure_context, "assignment"] = max_instance
            modelDict[structure_context] = modelDict[max_instance]

    # reactivities.to_csv("reactivity-table.txt", index=False, sep="\t")
    return modelDict, statistics
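# What fitting() stores per structure context, in miniature: a frozen GEV
# whose survival function gives tail probabilities and whose summed log-pdf
# is the likelihood used for assigning rare contexts. The gamma-distributed
# stand-in reactivities are an assumption for illustration only.
import numpy as np
from scipy.stats import genextreme

rng = np.random.default_rng(0)
react = rng.gamma(2.0, 0.5, size=500)       # stand-in reactivities
shape, location, scale = genextreme.fit(react)
model = genextreme(shape, location, scale)  # as stored in modelDict
print(model.sf(react.max()))                # tail prob. of the largest value
print(np.log(model.pdf(react)).sum())       # log-likelihood for assignment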
# Excerpt: `y`, `alfa`, `ypoints`, `country`, and `filename` are defined
# earlier in the script; statsfuncs and mfdfa are project modules.
freqs, power, xdata, ydata, amp, index, powerlaw, inicio, fim = statsfuncs.psd(y)
psi, amax, amin, a0 = mfdfa.makemfdfa(y, True)
beta = 2. * alfa - 1.
print("Beta=2*Alpha-1={}".format(beta))

# Plot and fit the histogram of the time series.
mu, sigma = norm.fit(y)
rv_nrm = norm(loc=mu, scale=sigma)

# Estimate the GEV:
gev_fit = genextreme.fit(y)
# GEV parameters from the fit:
c, loc, scale = gev_fit
mean, var, skew, kurt = genextreme.stats(c, moments='mvsk')
rv_gev = genextreme(c, loc=loc, scale=scale)
# Create data from the estimated GEV to plot:
gev_pdf = rv_gev.pdf(ypoints)

plt.title("PDF with data from " + country +
          "\nmu={0:3.5}, sigma={1:3.5}".format(mu, sigma))
n, bins, patches = plt.hist(y, 60, density=1, facecolor='powderblue',
                            alpha=0.75, label="Normalized data")
plt.plot(np.arange(min(bins), max(bins) + 1,
                   (max(bins) - min(bins)) / len(y)),
         gev_pdf, 'r-', lw=5, alpha=0.6, label='genextreme pdf')
plt.ylabel("Probability Density")
plt.xlabel("Value")
plt.legend()
plt.savefig("PDF" + filename)
plt.show()

plt.figure(figsize=(20, 14))
# Excerpt: mu, sigma, z_score, the p_value variables, df, and score_to_test
# are defined earlier in the script.
print(("Mu:{0:5.2f} Sigma:{1:5.2f} z_score:{2:5.2f} "
       "p_value_one_side: {3:15.13f} p_value_two_side: {4:15.13f} "
       "p_values {5:15.13f}").format(mu, sigma, z_score, p_values,
                                     p_values_2side, p_values3))

# Instead, use an extreme value distribution fitted to the data.
extreme_fit = genextreme.fit(df[1])
c, loc, scale = extreme_fit
print("Extreme value fits c = {0}, loc = {1}, scale = {2}".format(
    c, loc, scale))
ax1 = sns.distplot(df[1], fit=genextreme, kde=False)
x = np.linspace(-10, 16, 1000)
extreme_to_plot = genextreme(c, loc, scale)
ax1.plot(x, extreme_to_plot.pdf(x), 'r-', lw=2, label='pdf')
# Density at the tested score (labelled a p value here; the survival
# function sf() would give a tail probability instead).
p_value = extreme_to_plot.pdf(score_to_test)
ax1.axvline(score_to_test)
print("p value of score {0} = {1}".format(score_to_test, p_value))
plt.show()
def cipe(src_ra, src_dec, counterpart_separation, region_radius=0.1,
         numpoints=10000):
    # Estimate how likely a chance alignment within counterpart_separation
    # is, by matching random positions in the region to Gaia sources.
    counterpart_separation = counterpart_separation * u.arcsec
    region_radius = region_radius * u.degree
    tap_cap = 100000
    tap_server = TapPlus(url='https://gea.esac.esa.int/tap-server/tap',
                         verbose=False)
    catalog = 'gaiaedr3.gaia_source'
    query = ("SELECT TOP " + str(tap_cap) +
             " * FROM " + catalog + " WHERE ra BETWEEN " +
             str(src_ra.value - region_radius.value) +
             " AND " + str(src_ra.value + region_radius.value) +
             " AND dec BETWEEN " + str(src_dec.value - region_radius.value) +
             " AND " + str(src_dec.value + region_radius.value))
    search = tap_server.launch_job(query)
    results = search.get_results()
    print('Number of Gaia sources: ' + str(len(results)))
    if len(results) == tap_cap:
        print('WARNING: Gaia contains too many sources in the region (>' +
              str(tap_cap) + '). Region may be too large.')
    gaia_srclist = SkyCoord(ra=results['ra'], dec=results['dec'])
    fake_srclist = SkyCoord(
        ra=src_ra + (np.random.rand(numpoints) - 0.5) * 2 * region_radius,
        dec=src_dec + (np.random.rand(numpoints) - 0.5) * 2 * region_radius)
    sep_dist = fake_srclist.match_to_catalog_sky(gaia_srclist)[1].to(
        u.arcsec).value

    fig1 = plt.figure(figsize=(6, 4))
    ax1 = fig1.add_subplot(1, 1, 1)
    ax1.hist(sep_dist, bins=50, color='#034BCA', edgecolor='w', density=True,
             label='Simulations')
    model_x = np.linspace(0, 8, 1000)
    params = st.genextreme.fit(sep_dist)
    model_y = st.genextreme(*params).pdf(model_x)
    p_less = len(sep_dist[sep_dist <= counterpart_separation.value]) / len(
        sep_dist) * 100
    ax1.plot(model_x, model_y, color='#EB24F4', label='GEV fit')
    # Marks the counterpart separation (hard-coded to 0.51'' in the
    # original).
    ax1.axvline(0.51, color='k', linestyle='--')
    ax1.set_title(f"$P(d<{counterpart_separation.value}'')={p_less:.3}\\%$",
                  fontsize=12)
    ax1.legend(fontsize=12)
    ax1.set_xlabel(r'Distance to closest random Gaia source (arcsec)',
                   fontsize=14)
    ax1.set_ylabel(r'Probability density (arcsec$^{-1}$)', fontsize=14)
    ax1.set_xlim(0, 8)
    ax1.minorticks_on()
    ax1.tick_params(axis='both', which='major', labelsize=14)
    ax1.tick_params(axis='both', which='major', length=9)
    ax1.tick_params(axis='both', which='minor', length=4.5)
    ax1.tick_params(axis='both', which='both', direction='in', right=True,
                    top=True)
    return fig1
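# The percentage in cipe's title is the empirical fraction of simulated
# separations below the threshold; the fitted GEV gives the same quantity
# analytically through its CDF. A sketch with synthetic separations.
import scipy.stats as st

sep_dist = st.genextreme(-0.1, loc=2.0, scale=0.8).rvs(10000, random_state=4)
params = st.genextreme.fit(sep_dist)
sep = 0.51  # counterpart separation in arcsec
p_empirical = (sep_dist <= sep).mean() * 100
p_model = st.genextreme(*params).cdf(sep) * 100
print('empirical {:.3}%  model {:.3}%'.format(p_empirical, p_model))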