def test_1():
    """Save a QQ plot of HMC softmax samples against NumPy Dirichlet draws.

    Checks by eye that ``ldirichlet_softmax`` yields a Dirichlet
    distribution: samples drawn with ``hmc`` are pushed through a softmax
    and their first component is compared, quantile by quantile, with
    ``np.random.dirichlet`` draws. Nothing is asserted; the figure is
    written to ``bayes_ratematrix-test-1.png``.

    Raises:
        SkipTest: if matplotlib cannot be imported.
    """
    try:
        from matplotlib import pyplot as pp
    except ImportError:
        raise SkipTest('Not making QQ plot')

    alpha = np.array([1, 2, 3], dtype=float)

    def _logp_and_grad(x, alpha):
        # The gradient buffer is handed to ldirichlet_softmax by keyword and
        # returned next to the log-probability (presumably filled in place).
        gradient = np.zeros_like(x)
        log_density = ldirichlet_softmax(x, alpha, grad=gradient)
        return log_density, gradient

    start = np.random.normal(size=(3,))
    samples, _ = hmc(
        _logp_and_grad,
        x0=start,
        n_samples=1000,
        args=(alpha,),
        n_steps=10,
        return_diagnostics=True,
    )

    # Softmax: map the unconstrained samples back onto the simplex.
    unnormalised = np.exp(samples)
    hmc_simplex = unnormalised / unnormalised.sum(axis=1, keepdims=True)
    reference = np.random.dirichlet(alpha=alpha, size=1000)

    sm.qqplot_2samples(hmc_simplex[:, 0], reference[:, 0], line='45')
    pp.savefig('bayes_ratematrix-test-1.png')
def run(self, data = 'normal'):
    """Print summary statistics and plot the distribution of sample means.

    The mean distribution comes from ``self.generate_mean_distribution(data)``.
    A normal sample with the same mean and standard deviation is drawn
    (``self.sample_size`` values) and shown next to the mean distribution
    and ``self.data``; a two-sample QQ plot of the mean distribution against
    the normal sample is drawn last, then everything is shown.

    Args:
        data: forwarded unchanged to ``generate_mean_distribution``.
    """
    mean_dist = self.generate_mean_distribution(data)
    centre = np.mean(mean_dist)
    spread = np.std(mean_dist)

    print(f'Mean of Mean Distribution: {centre}')
    print(f'Mean of Data Distribution: {np.mean(self.data)}')
    print(f'S.D. of Mean Distribution: {spread}')
    print(f'S.D. of Data Distribution: {np.std(self.data)}')

    reference = np.random.normal(centre, spread, self.sample_size)

    plt.figure(figsize=(14, 10))
    # (grid position, extra subplot2grid kwargs, values, title)
    panels = (
        ((0, 0), {}, reference, 'Normal Distribution'),
        ((0, 1), {}, mean_dist, 'Mean Distribution'),
        ((1, 0), {'colspan': 2}, self.data, 'Data Distribution'),
    )
    for location, extra, values, title in panels:
        plt.subplot2grid((2, 2), location, **extra)
        sns.distplot(values)
        plt.title(title)

    qqplot_2samples(np.array(mean_dist), reference, line='45')
    plt.show()
def run_dtwf_coalescent_comparison(self, test_name, **kwargs):
    """Compare the "hudson" and "dtwf" models with QQ plots.

    ``msprime.simulate`` is run once per model with the given keyword
    arguments. For every replicate the mean root time over its trees and
    the number of trees are recorded. One QQ plot per statistic (hudson
    against dtwf) is written to ``tmp__NOBACKUP__/<test_name>/<stat>.png``.

    Args:
        test_name: name of the output sub-directory.
        **kwargs: forwarded to ``msprime.simulate``; the ``model`` entry is
            overwritten for each run.
    """
    frames = []
    for model in ["hudson", "dtwf"]:
        kwargs["model"] = model
        print("Running: ", kwargs)
        replicates = msprime.simulate(**kwargs)
        data = collections.defaultdict(list)
        for ts in replicates:
            t_mrca = np.zeros(ts.num_trees)
            for tree in ts.trees():
                t_mrca[tree.index] = tree.time(tree.root)
            data["tmrca_mean"].append(np.mean(t_mrca))
            data["num_trees"].append(ts.num_trees)
            data["model"].append(model)
        frames.append(pd.DataFrame(data))
    # DataFrame.append was removed in pandas 2.0; concat stacks the
    # per-model frames the same way.
    df = pd.concat(frames)

    basedir = os.path.join("tmp__NOBACKUP__", test_name)
    # Creates missing parents too and tolerates an existing directory.
    os.makedirs(basedir, exist_ok=True)

    df_hudson = df[df.model == "hudson"]
    df_dtwf = df[df.model == "dtwf"]
    for stat in ["tmrca_mean", "num_trees"]:
        v1 = df_hudson[stat]
        v2 = df_dtwf[stat]
        sm.graphics.qqplot(v1)
        sm.qqplot_2samples(v1, v2, line="45")
        f = os.path.join(basedir, "{}.png".format(stat))
        pyplot.savefig(f, dpi=72)
        pyplot.close('all')
def _plot_stats(self, key, stats_type, df_msp, df_ms):
    """Write one ms-versus-msp QQ plot per statistic column.

    Both frames must have the same set of columns. Each plot is saved under
    the name returned by ``self._build_filename(key, stats_type, stat)``.
    """
    stat_names = df_ms.columns.values
    assert set(stat_names) == set(df_msp.columns.values)
    for stat in stat_names:
        ms_values, msp_values = df_ms[stat], df_msp[stat]
        sm.graphics.qqplot(ms_values)
        sm.qqplot_2samples(ms_values, msp_values, line="45")
        target = self._build_filename(key, stats_type, stat)
        pyplot.savefig(target, dpi=72)
        pyplot.close('all')
def test_qqplot_unequal():
    """qqplot_2samples draws the same plot whichever way round unequal samples are passed."""
    rng = np.random.RandomState(0)
    short_sample = rng.standard_normal(100)
    long_sample = rng.standard_normal(200)

    forward = sm.qqplot_2samples(short_sample, long_sample)
    swapped = sm.qqplot_2samples(long_sample, short_sample)

    forward_children = forward.get_axes()[0].get_children()
    swapped_children = swapped.get_axes()[0].get_children()

    # The first child of each axes carries the plotted points.
    x_forward, y_forward = forward_children[0].get_data()
    x_swapped, y_swapped = swapped_children[0].get_data()
    np.testing.assert_allclose(x_forward, x_swapped)
    np.testing.assert_allclose(y_forward, y_swapped)

    assert len(forward_children) == len(swapped_children)
def test_qqplot_2samples_ProbPlotObjects(self):
    """Smoke test: qqplot_2samples accepts ``ProbPlot`` instances for every ``line`` option."""
    for line_kind in ('r', 'q', '45', 's'):
        sm.qqplot_2samples(self.prbplt, self.other_prbplot, line=line_kind)
        plt.close('all')
def test_qqplot_2samples_ProbPlotObjects(self):
    """Smoke test: qqplot_2samples accepts ``ProbPlot`` instances for every ``line`` option."""
    for line_kind in ('r', 'q', '45', 's'):
        sm.qqplot_2samples(self.prbplt, self.other_prbplot, line=line_kind)
def main():
    """Save a QQ plot of observed against expected -log10 p-values.

    Observed values come from the ``lrt-pvalue`` column of the tab-separated
    file ``options.table``; expected values are -log10 of as many uniform
    random draws. The figure is written to ``options.output``.
    """
    options = get_options()

    # Imports stay after option parsing, as in the rest of this script.
    import sys
    import numpy as np
    import pandas as pd
    import statsmodels.api as sm
    import matplotlib.pyplot as plt

    pvalues = pd.read_csv(options.table,
                          usecols=['lrt-pvalue'],
                          sep='\t')['lrt-pvalue']

    plt.figure(figsize=(4, 3.75))
    axis = plt.subplot(111)

    observed = -np.log10(pvalues)
    expected = -np.log10(np.random.uniform(0, 1, pvalues.shape[0]))

    figure = sm.qqplot_2samples(observed,
                                expected,
                                xlabel='Expected $-log_{10}(pvalue)$',
                                ylabel='Observed $-log_{10}(pvalue)$',
                                line='45',
                                ax=axis)
    axis = figure.axes[0]

    # First line artist on the axes: restyle it as translucent black.
    points = axis.lines[0]
    points.set_color('k')
    points.set_alpha(0.3)

    axis.set_xlim(-0.5, expected.max() + 0.5)
    axis.set_ylim(-0.5, observed.max() + 0.5)
    plt.tight_layout()
    plt.savefig(options.output, dpi=150)
def validate(replicates):
    """
    Validate that we are simulating the same things in the simulators
    by running some replicates and plotting the distributions of the
    number of output trees.

    Prints the mean count per simulator and writes a QQ plot of ARGON
    against msprime to ``figures/verify_argon_v_msprime.png``.
    """
    # NOTE: we seem to consistently get more trees from ARGON. Looking
    # at the qqplots, the distributions look about the same, but there's
    # consistently more from ARGON. We've checked the parameters as closely
    # as we can here, so I'm not sure there's much we can do.
    # However, see the discussion here:
    # https://github.com/tskit-dev/msprime-1.0-paper/pull/109
    # When we export to a tree sequence and squash the trees down properly,
    # we get the same distributions. So, this is fine.
    L = 1  # Megabases
    sample_size = 10

    argon_counts = np.zeros(replicates)
    hybrid_counts = np.zeros(replicates)
    msprime_counts = np.zeros(replicates)

    with click.progressbar(range(replicates)) as progress:
        for rep in progress:
            argon_counts[rep] = sim_argon(sample_size, L, count_trees=True)
            hybrid_counts[rep] = sim_msprime_hybrid(sample_size, L)
            msprime_counts[rep] = sim_msprime(sample_size, L)

    print(
        "mean number of trees:",
        "argon=",
        np.mean(argon_counts),
        "msprime breakpoints=",
        np.mean(msprime_counts),
        "hybrid breakpoints=",
        np.mean(hybrid_counts),
    )

    sm.graphics.qqplot(argon_counts)
    sm.qqplot_2samples(argon_counts, msprime_counts, line="45")
    plt.xlabel("argon")
    plt.ylabel("msprime")
    plt.savefig("figures/verify_argon_v_msprime.png")
    plt.close("all")
def run_tbl_analytical_check(self):
    """
    Runs the check for the total branch length.

    For every sample size n in 2..14 the total-branch-length distributions
    from ms and mspms are compared in a QQ plot (``qqplot_<n>.png``) and in
    a side-by-side histogram overlaid with the values of
    ``self.get_analytical_tbl`` (``hist_<n>.png``). Both files go to
    ``tmp__NOBACKUP__/analytical_tbl``.
    """
    R = 10000
    basedir = "tmp__NOBACKUP__/analytical_tbl"
    # The path is nested: makedirs creates the parent as well and does not
    # fail when the directory is already there.
    os.makedirs(basedir, exist_ok=True)
    bar_width = 0.15
    for n in range(2, 15):
        tbl_ms = self.get_tbl_distribution(n, R, self._ms_executable)
        tbl_msp = self.get_tbl_distribution(n, R, self._mspms_executable)

        sm.graphics.qqplot(tbl_ms)
        sm.qqplot_2samples(tbl_ms, tbl_msp, line="45")
        filename = os.path.join(basedir, "qqplot_{}.png".format(n))
        pyplot.savefig(filename, dpi=72)
        pyplot.close('all')

        # Reuse the ms bin edges so both histograms share the same bins.
        hist_ms, bin_edges = np.histogram(tbl_ms, 20, density=True)
        hist_msp, _ = np.histogram(tbl_msp, bin_edges, density=True)
        index = bin_edges[:-1]
        # We don't seem to have the analytical value quite right here,
        # but since the value is so very close to ms's, there doesn't
        # seem to be much point in trying to fix it.
        analytical = [self.get_analytical_tbl(n, x * 2) for x in index]

        pyplot.subplots()
        pyplot.bar(index, hist_ms, bar_width, color='b', label="ms")
        pyplot.bar(index + bar_width, hist_msp, bar_width, color='r',
                   label="msp")
        pyplot.plot(index + bar_width, analytical, "o", color='k')
        pyplot.legend()
        pyplot.tight_layout()
        filename = os.path.join(basedir, "hist_{}.png".format(n))
        pyplot.savefig(filename)
        # Close the histogram figure as well so figures do not pile up
        # across iterations.
        pyplot.close('all')
def run_pairwise_island_model(self):
    """
    Runs the check for the pairwise coalescence times for within
    and between populations.

    For d = 2..5 demes, ms and mspms are each run for a pair sampled within
    one deme and for a pair sampled from two different demes. The means are
    printed tab-separated next to ``d / 2`` and ``(d + (d - 1) / M) / 2``
    (presumably the analytical expectations), and QQ plots are written to
    ``tmp__NOBACKUP__/analytical_pairwise_island`` as ``within_<d>.png`` and
    ``between_<d>.png``.
    """
    R = 10000
    M = 0.2
    basedir = "tmp__NOBACKUP__/analytical_pairwise_island"
    # The path is nested: makedirs creates the parent as well and does not
    # fail when the directory is already there.
    os.makedirs(basedir, exist_ok=True)

    for d in range(2, 6):
        # Both samples in the first deme.
        cmd = "2 {} -T -I {} 2 {} {}".format(R, d, "0 " * (d - 1), M)
        T_w_ms = self.get_pairwise_coalescence_time(
            self._ms_executable + cmd.split() + self.get_ms_seeds(), R)
        T_w_msp = self.get_pairwise_coalescence_time(
            self._mspms_executable + cmd.split() + self.get_ms_seeds(), R)

        # One sample in each of the first two demes.
        cmd = "2 {} -T -I {} 1 1 {} {}".format(R, d, "0 " * (d - 2), M)
        T_b_ms = self.get_pairwise_coalescence_time(
            self._ms_executable + cmd.split() + self.get_ms_seeds(), R)
        T_b_msp = self.get_pairwise_coalescence_time(
            self._mspms_executable + cmd.split() + self.get_ms_seeds(), R)

        print(
            d, np.mean(T_w_ms), np.mean(T_w_msp), d / 2,
            np.mean(T_b_ms), np.mean(T_b_msp), (d + (d - 1) / M) / 2,
            sep="\t")

        comparisons = (
            ("within", T_w_ms, T_w_msp),
            ("between", T_b_ms, T_b_msp),
        )
        for label, t_ms, t_msp in comparisons:
            sm.graphics.qqplot(t_ms)
            sm.qqplot_2samples(t_ms, t_msp, line="45")
            f = os.path.join(basedir, "{}_{}.png".format(label, d))
            pyplot.savefig(f, dpi=72)
            pyplot.close('all')
def run_verify_coalescent(n, m, Ne, r, models, num_replicates, output_prefix):
    """
    Runs ms and msprime on the specified parameters and outputs qqplots
    of the coalescent simulation summary statistics with the specified
    prefix.

    One file ``<output_prefix>_<stat>.png`` is written for each of the
    statistics "t", "num_trees", "re_events" and "ca_events".
    """
    ms = MsCoalescentStatisticsSimulator(n, m, r, Ne, models)
    df_ms = ms.run(num_replicates)
    msp = MsprimeCoalescentStatisticsSimulator(n, m, r, Ne, models)
    df_msp = msp.run(num_replicates)
    for stat in ["t", "num_trees", "re_events", "ca_events"]:
        v1 = df_ms[stat]
        v2 = df_msp[stat]
        sm.graphics.qqplot(v1)
        sm.qqplot_2samples(v1, v2, line="45")
        f = "{0}_{1}.png".format(output_prefix, stat)
        pyplot.savefig(f, dpi=72)
        # Each qqplot call without ``ax`` opens a new figure; clf() would
        # only clear the current one and leave the others open.
        pyplot.close('all')
def run_verify_mutations(n, m, Ne, r, models, num_replicates, mutation_rate,
                         output_prefix):
    """
    Runs ms and msprime for the specified parameters, and filters the
    results through Hudson's sample_stats program to get distributions of
    the haplotype statistics.

    One file ``<output_prefix>_<stat>.png`` is written for each of the
    statistics "pi", "ss", "D", "thetaH" and "H".
    """
    ms = MsMutationStatisticsSimulator(n, m, r, Ne, models, mutation_rate)
    df_ms = ms.run(num_replicates)
    msp = MsprimeMutationStatisticsSimulator(n, m, r, Ne, models,
                                             mutation_rate)
    df_msp = msp.run(num_replicates)
    for stat in ["pi", "ss", "D", "thetaH", "H"]:
        v1 = df_ms[stat]
        v2 = df_msp[stat]
        sm.graphics.qqplot(v1)
        sm.qqplot_2samples(v1, v2, line="45")
        f = "{0}_{1}.png".format(output_prefix, stat)
        pyplot.savefig(f, dpi=72)
        # Each qqplot call without ``ax`` opens a new figure; clf() would
        # only clear the current one and leave the others open.
        pyplot.close('all')
def run_tbl_analytical_check(self):
    """
    Runs the check for the total branch length.

    For every sample size n in 2..14 the total-branch-length distributions
    from ms and mspms are compared in a QQ plot (``qqplot_<n>.png``) and in
    a side-by-side histogram overlaid with the values of
    ``self.get_analytical_tbl`` (``hist_<n>.png``). Both files go to
    ``tmp__NOBACKUP__/analytical_tbl``.
    """
    R = 10000
    basedir = "tmp__NOBACKUP__/analytical_tbl"
    # The path is nested: makedirs creates the parent as well and does not
    # fail when the directory is already there.
    os.makedirs(basedir, exist_ok=True)
    bar_width = 0.15
    for n in range(2, 15):
        tbl_ms = self.get_tbl_distribution(n, R, self._ms_executable)
        tbl_msp = self.get_tbl_distribution(n, R, self._mspms_executable)

        sm.graphics.qqplot(tbl_ms)
        sm.qqplot_2samples(tbl_ms, tbl_msp, line="45")
        filename = os.path.join(basedir, "qqplot_{}.png".format(n))
        pyplot.savefig(filename, dpi=72)
        pyplot.close('all')

        # Reuse the ms bin edges so both histograms share the same bins.
        hist_ms, bin_edges = np.histogram(tbl_ms, 20, density=True)
        hist_msp, _ = np.histogram(tbl_msp, bin_edges, density=True)
        index = bin_edges[:-1]
        # We don't seem to have the analytical value quite right here,
        # but since the value is so very close to ms's, there doesn't
        # seem to be much point in trying to fix it.
        analytical = [self.get_analytical_tbl(n, x * 2) for x in index]

        pyplot.subplots()
        pyplot.bar(
            index, hist_ms, bar_width, color='b', label="ms")
        pyplot.bar(
            index + bar_width, hist_msp, bar_width, color='r', label="msp")
        pyplot.plot(index + bar_width, analytical, "o", color='k')
        pyplot.legend()
        pyplot.tight_layout()
        filename = os.path.join(basedir, "hist_{}.png".format(n))
        pyplot.savefig(filename)
        # Close the histogram figure as well so figures do not pile up
        # across iterations.
        pyplot.close('all')
def test_qqplot_2samples_arrays():
    """Smoke test: qqplot_2samples runs on raw arrays for every ``line`` option."""
    normal_kwargs = dict(loc=8.25, scale=3.25, size=37)
    x = np.random.normal(**normal_kwargs)
    y = np.random.normal(**normal_kwargs)

    # ProbPlot construction is exercised too; the instances are not plotted.
    sm.ProbPlot(x)
    sm.ProbPlot(y)

    for line_kind in ('r', 'q', '45', 's'):
        sm.qqplot_2samples(x, y, line=line_kind)
        plt.close('all')
def test_1():
    """Save a QQ plot of HMC softmax samples against NumPy Dirichlet draws.

    Checks by eye that ``ldirichlet_softmax`` yields a Dirichlet
    distribution: samples drawn with ``hmc`` are pushed through a softmax
    and their first component is compared, quantile by quantile, with
    ``np.random.dirichlet`` draws. Nothing is asserted; the figure is
    written to ``bayes_ratematrix-test-1.png``.

    Raises:
        SkipTest: if matplotlib cannot be imported.
    """
    try:
        from matplotlib import pyplot as pp
    except ImportError:
        raise SkipTest('Not making QQ plot')

    alpha = np.array([1, 2, 3], dtype=float)

    def _logp_and_grad(x, alpha):
        # The gradient buffer is handed to ldirichlet_softmax by keyword and
        # returned next to the log-probability (presumably filled in place).
        gradient = np.zeros_like(x)
        log_density = ldirichlet_softmax(x, alpha, grad=gradient)
        return log_density, gradient

    start = np.random.normal(size=(3,))
    samples, _ = hmc(
        _logp_and_grad,
        x0=start,
        n_samples=1000,
        args=(alpha,),
        n_steps=10,
        return_diagnostics=True,
    )

    # Softmax: map the unconstrained samples back onto the simplex.
    unnormalised = np.exp(samples)
    hmc_simplex = unnormalised / unnormalised.sum(axis=1, keepdims=True)
    reference = np.random.dirichlet(alpha=alpha, size=1000)

    sm.qqplot_2samples(hmc_simplex[:, 0], reference[:, 0], line='45')
    pp.savefig('bayes_ratematrix-test-1.png')
def run_verify(args):
    """
    Checks that the distribution of events we get is the same as msprime.

    For each replicate the local ``Simulator`` and ``msprime.TreeSimulator``
    are both seeded with the replicate index and their recombination event
    counts recorded. A QQ plot of the two count distributions is saved to
    ``args.outfile``.
    """
    num_replicates = args.num_replicates
    sample_size = args.sample_size
    num_loci = args.num_loci
    rho = args.recombination_rate

    msp_events = np.zeros(num_replicates)
    local_events = np.zeros(num_replicates)
    for replicate in range(num_replicates):
        # Local simulator: seeded through the global ``random`` module.
        random.seed(replicate)
        local_sim = Simulator(sample_size, num_loci, rho, 10000)
        local_sim.simulate()
        local_events[replicate] = local_sim.num_re_events

        # msprime with the same seed.
        msp_sim = msprime.TreeSimulator(sample_size)
        msp_sim.set_num_loci(num_loci)
        msp_sim.set_scaled_recombination_rate(rho)
        msp_sim.set_random_seed(replicate)
        msp_sim.run()
        msp_events[replicate] = msp_sim.get_num_recombination_events()

    sm.graphics.qqplot(local_events)
    sm.qqplot_2samples(local_events, msp_events, line="45")
    pyplot.savefig(args.outfile, dpi=72)
verbose_eval=2000) preds = gbm.predict(X_val, num_iteration=gbm.best_iteration) print('validation smape: ', smape(y_val, preds)) print('validation mae: ', mean_absolute_error(y_val, preds)) # investigating the distribution of the error error = y_val.values - preds plt.figure(figsize=(15, 5)) plt.subplot(1, 2, 1) plt.hist(error, EDGECOLOR='black', color='y') # comparing the distribution of the predictin and the actual sm.qqplot_2samples(y_val.values, preds, line='45', ax=plt.subplot(1, 2, 2)) plt.show() # exploring the feature importance lgb.plot_importance(gbm, height=0.6) plt.show() # predicting sale values for year 2018 X_train = training_df.loc[:, [ col for col in training_df.columns if col not in ['sales'] ]].values y_train = training_df['sales'].values X_test = testing_df.loc[:, [ col for col in testing_df.columns if col not in ['sales'] ]] lgb_train = lgb.Dataset(X_train, y_train)
k_gam #from scipy.special import gamma as Gamma # #def f1(x): # return Gamma(x) #eqn3 = Eq( med/(np.log(2)**(1/k))*f1((1+1/k))-esp ) k_wei = float(0.4758754) lambda_wei = med/(np.log(2))**(1/k_wei) lambda_wei vec=np.sort(vec) # Log(vec) -> Normal norm = np.array([np.random.normal(loc=esp,scale=sigma) for x in range(len(vec))]) count, bins, _ = plt.hist(norm, 30, normed=True) sm.qqplot_2samples(norm,np.log(vec),line='r').suptitle('Log(echantillon) -> Normal (KS_dist = 0.7)', fontsize=20) np.corrcoef(np.sort(norm),np.log(vec)) stats.ks_2samp(np.sort(norm),np.log(vec)) #Ks_2sampResult(statistic=0.69090909090909092, pvalue=1.8946637774700268e-12) # vec -> LogNormal lognorm = np.random.lognormal(mean=param_logn_u,sigma=param_logn_sigma**(1/2),size=len(vec)) count,bins,_ = plt.hist(lognorm,30,normed=True) sm.qqplot_2samples(lognorm,vec,line='r').suptitle('echantillon -> LogNorm (KS_dist = 0.14)', fontsize=20) np.corrcoef(np.sort(lognorm),vec) stats.ks_2samp(np.sort(lognorm),vec) #Ks_2sampResult(statistic=0.145, pvalue=0.00784630338162055) # vec -> exp exp = np.random.exponential(scale=esp,size=len(vec))
plt.xscale('log') #plt.title('power {} and scale {}'.format(power_final,scale_final)) plt.xlabel('Returnperiod [years]') plt.ylabel('Storm Severity Index []') #plot cdfs to get ks test statistic cdf_gev_i = scipy.stats.genextreme.cdf(ssi_quantiles_prob, params_final_gev[0], loc=params_final_gev[1], scale=params_final_gev[2]) plt.figure() plt.plot(ssi_quantiles_prob, cdf_gev_i, '.k') # plt.plot(ssi_quantiles_distribution,(1-1/return_periods)**5,'.b') cdf_wisc_prob_i = (1 - 1 / return_period_prob)**block_size_years plt.plot(ssi_quantiles_prob, cdf_wisc_prob_i, '.b') plt.xlim(xmin=10 ^ 10) #plt.title('power {} and scale {}'.format(power_final,scale_final)) plt.ylabel('cummulative distribution function') plt.xlabel('Storm Severity Index []') # show qq plot of historic and probabilistic ssis sm.qqplot_2samples( wisc_hist.ssi, wisc_prob_CH.ssi_full_area, line='45', ) # xlabel='"WISC historic" pan-European SSI', # ylabel='"WISC probabilistic extension" pan-European SSI' plt.xlabel('"WISC probabilistic extension" pan-European SSI') plt.ylabel('"WISC historic" pan-European SSI')
def test_qqplot_2samples_arrays(self):
    """Smoke test: qqplot_2samples accepts raw arrays for every ``line`` option."""
    for line_kind in ('r', 'q', '45', 's'):
        sm.qqplot_2samples(self.res, self.other_array, line=line_kind)
        plt.close('all')
print('Number of non-NA/null observations for:', labels[i], data.count()[i]) print('Maximum value for:', labels[i], data.max()[i]) print('Minimum value for:', labels[i], '', data.max()[i]) print('Mean for:', labels[i], data.mean()[i]) print('Standard deviation for', labels[i], data.std()[i]) print('Kurtosis for:', labels[i], data.kurt()[i]) print('IQR for:', labels[i], data.quantile(0.75)[i] - data.quantile(0.25)[i]) print('') print('Covariance matrix') print(data.cov()) # 2.3 Scatter plot of 2 variables (SL & SW) # Scatter-plot, SL vs SW sns.catplot(x="SL", y="SW", data=data, hue='CLASS') # 2.4 q-q plot of 2 variables (SL & SW), we need statsmodels # Same sample sizes, sorted data, 'PW' vs 'SL', the result is a qqplot, with same sample size sm.qqplot_2samples(data['SL'], data['PW']) # 2.5, 2.6 # Scatter plot matrix sns.pairplot(data, hue='CLASS') # 2.7 Apply multidimensional scaling (MDS) to project the d-dimensional data in 2-d data X = data.iloc[:, :4] embedding = MDS(n_components=2) x_transformes = embedding.fit_transform(X[:100]) x_transformes.shape plt.scatter(x_transformes[:, 0], x_transformes[:, 1])
# For every dataset: score the attributes with reliefAlgorithm, pick the two
# highest-scoring ones, and save a scatter plot plus a QQ plot of them
# (both min-max scaled).
# `i` walks through `shifts` in step with `fnames`.
i = 0
for fname in fnames:
    finalData = pd.read_csv(fname + ".csv")
    scores = reliefAlgorithm(finalData, shifts[i], 1000)
    scores = scores.sort_values()
    #print "\nFor", fname, "\n-------------------------------\n"
    first = scores.idxmax()
    #print "Most Important Attribute =>", scores.idxmax()
    # Overwrite the top score with a very small value so the next idxmax()
    # returns the runner-up.
    # NOTE(review): sys.maxint exists only in Python 2.
    scores[scores.idxmax()] = -sys.maxint - 1
    #print "Second Most Important Attribute =>", scores.idxmax()
    second = scores.idxmax()

    # Min-max scale both columns to [0, 1] (a constant column would divide
    # by zero here).
    X = finalData[first]
    Y = finalData[second]
    X = (X - X.min()) / (X.max() - X.min())
    Y = (Y - Y.min()) / (Y.max() - Y.min())

    plt.xlabel(first)
    plt.ylabel(second)
    plt.scatter(X, Y)
    plt.savefig(fname + '.png', bbox_inches='tight')
    plt.clf()

    # for quantiles
    pp_x = sm.ProbPlot(X)
    pp_y = sm.ProbPlot(Y)
    ppp = sm.qqplot_2samples(pp_x, pp_y)
    plt.savefig(fname + '_quantiles.png', bbox_inches='tight')
    plt.clf()
    #plt.show()
    i = i + 1
Used in QQ plot, normalize all the data first. """ max_num = max(x) min_num = min(x) inter = max_num - min_num return [(data - min_num) / inter for data in x] import statsmodels.api as sm age_norm = normalize(age) fat_norm = normalize(fat) sm.qqplot_2samples(np.asarray(age_norm), np.asarray(fat_norm), xlabel='age', ylabel='fat', line='45') plt.show() # ------------------------------- # For Q6 :) def cosine_similarity(x, y): x = np.asarray(x) y = np.asarray(y) numerator = np.dot(x, y) sqrt_x = np.sqrt(sum(x**2)) sqrt_y = np.sqrt(sum(y**2))
pass intact_more_dict ={} for key, item in ID_dict.items(): if item >= MIN_PUBLICATION_NUM: intact_more_dict[key] = intact_dict[key] #plots to see how ERC value changes with interacting proteins list_values = [ v for v in intact_more_dict.values() ] int_choices = np.random.choice(list_values, 1000) plt.hist(df_choices) plt.title("All ERC\n Mean = %.4f" % np.mean(df_choices)) plt.show() print("ERC mean for all: %.4f" % np.mean(df_choices)) plt.hist(int_choices) plt.title("Interacting ERC\n Mean = %.4f" % np.mean(int_choices)) plt.show() print("ERC mean for interacting: %.4f" % np.mean(int_choices)) sm.qqplot_2samples(np.asarray(df_choices), np.asarray(int_choices),xlabel="All ERC", ylabel="Interacting ERC") plt.plot() plt.xlim(-1, 1) plt.ylim(-1, 1) plt.plot( [-1,1],[-1,1] , 'r') plt.gca().set_aspect('equal', adjustable='box') plt.draw() df_biogrid_acms.to_csv("ACMS_BIOGRID-ORGANISM-Saccharomyces_cerevisiae_S288c-3.4.146.tab2.txt", sep = '\t')
def test_qqplot_2samples_arrays(self, close_figures):
    """Smoke test: qqplot_2samples accepts raw arrays for every ``line`` option.

    ``close_figures`` is requested by name only and not used in the body
    (presumably a fixture that closes figures - confirm).
    """
    for line_kind in ('r', 'q', '45', 's'):
        sm.qqplot_2samples(self.res, self.other_array, line=line_kind)
def predict(GP,multitestfiles,RAFOLD,OUTDIR):
    """Score the fitted model and the RA baseline against MC test data.

    Visible steps:

    1. Read the single test CSV (no header). All columns but the last two
       become the inputs ``X``; the second-to-last column is taken as the MC
       value and the last as its uncertainty.
    2. Predict mean and standard deviation with ``GP`` (homoscedastic when
       the stored ``buildtype`` is ``"gp"``, heteroscedastic otherwise).
    3. Compute mean-squared-error and chi2-style metrics, plus the
       distribution comparisons in ``dc_fns``, for the model and for the RA
       baseline.
    4. Write the predictions as ``<obsname>.csv``, the metrics as
       ``<obsname>_bestmetrics.json`` and one QQ plot per test point under
       ``plots/QQplot``, all inside ``OUTDIR``.

    Args:
        GP: fitted model object; this function uses its ``bestparamfile``,
            ``obsname``, ``printRAmetrics``, ``predictHeteroscedastic``,
            ``predictHomoscedastic``, ``approxmeancountval`` and
            ``errapproxmeancountval``.
        multitestfiles: sequence of test CSV paths; only one is supported.
        RAFOLD: passed to ``GP.printRAmetrics``.
        OUTDIR: output directory, created if missing.

    Raises:
        Exception: if more than one test file is given.

    The process exits with status 1 when ``buildtype`` is missing from the
    parameter file or when an RA approximation cannot be loaded.
    """
    Nsample = 30
    seed = 326323
    allMC = []
    allDeltaMC = []
    X = None
    # Keys and functions are paired by position (dc_fns[kno] for dc_keys[kno]).
    dc_keys = ['ks2','ks','ad2','ad','kl']
    dc_fns = [computeKS2Sample,computeKS,computeAD2Sample,computeAD,computeKLdivergence]
    distrCompare = {}
    for key in dc_keys:
        distrCompare[key] = {}
    if len(multitestfiles) >1 :
        raise Exception("Multiple test files not compatible")
    for fno, file in enumerate(multitestfiles):
        dataperfile = pd.read_csv(file, header=None)
        Dperfile = dataperfile.values
        if fno == 0:
            X = Dperfile[:, :-2]
        allMC.append(Dperfile[:, -2].tolist())
        allDeltaMC.append(Dperfile[:, -1].tolist())

    # Returns the RA parameter file path, or None (see the branch below).
    bestparamfileForRA = GP.printRAmetrics(RAFOLD)
    print("\n\n\n\n")

    # RESULTS
    with open(GP.bestparamfile, 'r') as f:
        ds = json.load(f)
    if 'buildtype' in ds:
        buildtype = ds['buildtype']
    else:
        print("Buildtype not in ds not implemented")
        sys.exit(1)

    if buildtype != "gp":
        Ymean, Ysd = GP.predictHeteroscedastic(X)
    else:
        Ymean, Ysd = GP.predictHomoscedastic(X)

    allchi2metric = []
    allmeanmsemetric = []
    allsdmsemetric = []
    # Earlier multi-file variant, kept for reference:
    # for j, (mu, sd) in enumerate(zip(Ymean, Ysd)):
    #     MCatp = [allMC[i][j] for i in range(len(allMC))]
    #     allchi2metric.append(((mu - np.mean(MCatp)) / sd) ** 2)
    #     allmeanmsemetric.append((mu - np.mean(MCatp)) ** 2)
    #     allsdmsemetric.append((sd - np.std(MCatp)) ** 2)
    # Per-point errors of the model against the single test file.
    for j, (mu, sd) in enumerate(zip(Ymean, Ysd)):
        MCatp = allMC[0][j]
        DeltaMCatp = allDeltaMC[0][j]
        allchi2metric.append(((mu - MCatp) / sd) ** 2)
        allmeanmsemetric.append((mu - MCatp) ** 2)
        allsdmsemetric.append((sd - DeltaMCatp) ** 2)
    chi2metric = np.mean(allchi2metric)
    meanmsemetric = np.mean(allmeanmsemetric)
    sdmsemetric = np.mean(allsdmsemetric)

    print("#########################")
    for kno,key in enumerate(dc_keys):
        # (the list is initialised twice; the second assignment is redundant)
        distrCompare[key]['MCvs{}'.format(buildtype)] = []
        distrCompare[key]['MCvs{}'.format(buildtype)] = []
        for j,(mu,sd) in enumerate(zip(Ymean,Ysd)):
            MCatp = allMC[0][j]
            DeltaMCatp = allDeltaMC[0][j]
            # Nsample draws from N(MC value, MC uncertainty), compared with
            # the predicted (mu, sd).
            data = np.random.normal(MCatp, DeltaMCatp, Nsample)
            distrCompare[key]['MCvs{}'.format(buildtype)].append( dc_fns[kno](data,mu,sd,seed) )
    print("#########################\n\n")

    print("################ RESULTS START HERE")
    with open(GP.bestparamfile, 'r') as f:
        ds = json.load(f)
    bestkernel = ds['kernel']
    print("Best Kernel is {}".format(bestkernel))
    print("with meanmsemetric %.2E" % (meanmsemetric))
    print("with sdmsemetric %.2E" % (sdmsemetric))
    print("with chi2metric %.2E" % (chi2metric))

    ############################################
    # print(X)
    # print(Ymean)
    # print(Ysd)
    os.makedirs(OUTDIR, exist_ok=True)
    datatdump = np.column_stack((X, Ymean, Ysd))
    np.savetxt(os.path.join(OUTDIR, "{}.csv".format(ds["obsname"])), datatdump, delimiter=',')
    ############################################

    if bestparamfileForRA is not None:
        import apprentice
        OUTDIRRA = os.path.dirname(bestparamfileForRA)
        # NOTE(review): from here on `ds` and `seed` refer to the RA
        # parameter file, not GP.bestparamfile; `ds["obsname"]` is read
        # again further down - confirm the RA file carries that key.
        with open(bestparamfileForRA, 'r') as f:
            ds = json.load(f)
        seed = ds['seed']
        Moutfile = os.path.join(OUTDIRRA, 'RA', "{}_MCRA_S{}.json".format(GP.obsname.replace('/', '_'), seed))
        DeltaMoutfile = os.path.join(OUTDIRRA, 'RA', "{}_DeltaMCRA_S{}.json".format(GP.obsname.replace('/', '_'), seed))

        meanappset = apprentice.appset.AppSet(Moutfile, binids=[GP.obsname])
        if len(meanappset._binids) != 1 or \
                meanappset._binids[0] != GP.obsname:
            print("Something went wrong.\n"
                  "RA Fold Mean function could not be created.")
            exit(1)
        meanerrappset = apprentice.appset.AppSet(DeltaMoutfile, binids=[GP.obsname])
        if len(meanerrappset._binids) != 1 or \
                meanerrappset._binids[0] != GP.obsname:
            print("Something went wrong.\n"
                  "RA Fold Error mean function could not be created.")
            exit(1)
        Mte = np.array([meanappset.vals(x)[0] for x in X])
        DeltaMte = np.array([meanerrappset.vals(x)[0] for x in X])
    else:
        # No separate RA file: use the approximations stored on GP.
        Mte = np.array([GP.approxmeancountval(x) for x in X])
        DeltaMte = np.array([GP.errapproxmeancountval(x) for x in X])

    allchi2metricRA = []
    allmeanmsemetricRA = []
    allsdmsemetricRA = []
    # Earlier multi-file variant, kept for reference:
    # for j, (mu, sd) in enumerate(zip(Mte, DeltaMte)):
    #     MCatp = [allMC[i][j] for i in range(len(allMC))]
    #     allchi2metricRA.append(((mu - np.mean(MCatp)) / sd) ** 2)
    #     allmeanmsemetricRA.append((mu - np.mean(MCatp)) ** 2)
    #     allsdmsemetricRA.append((sd - np.std(MCatp)) ** 2)
    # Same per-point errors as above, now for the RA baseline.
    for j, (mu, sd) in enumerate(zip(Mte, DeltaMte)):
        MCatp = allMC[0][j]
        DeltaMCatp = allDeltaMC[0][j]
        allchi2metricRA.append(((mu - MCatp) / sd) ** 2)
        allmeanmsemetricRA.append((mu - MCatp) ** 2)
        allsdmsemetricRA.append((sd - DeltaMCatp) ** 2)
    chi2metricRA = np.mean(allchi2metricRA)
    meanmsemetricRA = np.mean(allmeanmsemetricRA)
    sdmsemetricRA = np.mean(allsdmsemetricRA)
    print("RAMEAN (meanmsemetric_RA) is %.2E" % (meanmsemetricRA))
    print("RAMEAN (sdmsemetric_RA) is %.2E" % (sdmsemetricRA))
    print("RAMEAN (chi2metric_RA) is %.2E" % (chi2metricRA))

    print("\n\n#########################")
    for kno,key in enumerate(dc_keys):
        # (the list is initialised twice; the second assignment is redundant)
        distrCompare[key]['MCvsRA'] = []
        distrCompare[key]['MCvsRA'] = []
        for j, (mu, sd) in enumerate(zip(Mte, DeltaMte)):
            MCatp = allMC[0][j]
            DeltaMCatp = allDeltaMC[0][j]
            data = np.random.normal(MCatp, DeltaMCatp, Nsample)
            distrCompare[key]['MCvsRA'].append( dc_fns[kno](data,mu,sd,seed) )
    print("#########################")

    # Disabled RA-vs-model comparisons, kept for reference:
    # np.random.seed(seed)
    # distrCompare['ks']['RAvs{}'.format(buildtype)] = \
    #     [computeKSstatistic(np.random.normal(mu1, sd1, Nsample),
    #                         np.random.normal(mu2, sd2, Nsample))
    #      for (mu1, mu2, sd1, sd2) in
    #      zip(Mte, Ymean,DeltaMte,Ysd)]
    # np.random.seed(seed)
    # distrCompare['kl']['RAvs{}'.format(buildtype)] = \
    #     [computeKLdivergence(np.random.normal(mu1, sd1, Nsample),
    #                          np.random.normal(mu2, sd2, Nsample))
    #      for (mu1, mu2, sd1, sd2) in
    #      zip(Mte, Ymean, DeltaMte, Ysd)]

    ############################################
    # Print best metrics into a json file
    ############################################
    bestmetricdata = {
        'RA':{
            'meanmsemetric' : meanmsemetricRA,
            'chi2metric': chi2metricRA,
            'sdmsemetric': sdmsemetricRA
        },
        buildtype:{
            'meanmsemetric': meanmsemetric,
            'chi2metric': chi2metric,
            'sdmsemetric': sdmsemetric,
            'bestkernel':bestkernel
        },
        'distrCompare': distrCompare,
        "Nsample":Nsample
    }
    # `ds` is whichever JSON was loaded last (the RA file when one exists).
    bestmetricfile = os.path.join(OUTDIR,"{}_bestmetrics.json".format(ds["obsname"]))
    with open(bestmetricfile, 'w') as f:
        json.dump(bestmetricdata, f, indent=4)

    ############################################
    import scipy.stats as stats
    import statsmodels.api as sm
    import matplotlib.pyplot as plt
    plotoutdir = os.path.join(OUTDIR,'plots','QQplot')
    os.makedirs(plotoutdir,exist_ok=True)
    # One QQ plot per test point: 1000 draws from the MC distribution against
    # 1000 draws from the RA and from the model predictions, on shared axes.
    for j, (gpmu, gpsd, ramu, rasd) in enumerate(zip(Ymean, Ysd, Mte, DeltaMte)):
        MCatp = allMC[0][j]
        DeltaMCatp = allDeltaMC[0][j]
        MCdata = np.random.normal(MCatp, DeltaMCatp, 1000)
        RAdata = np.random.normal(ramu, rasd, 1000)
        GPdata = np.random.normal(gpmu, gpsd, 1000)
        fig = plt.figure()
        plt.style.use('seaborn')
        ax = fig.add_subplot(1, 1, 1)
        sm.qqplot_2samples(MCdata, RAdata, line='45',ax=ax)
        ax.get_lines()[0].set_markerfacecolor('blue')
        ax.get_lines()[0].set_label('RA')
        sm.qqplot_2samples(MCdata, GPdata, line='45',ax=ax)
        # Index 2 assumes the first call added exactly two lines (points and
        # the 45-degree line) - TODO confirm for the installed statsmodels.
        ax.get_lines()[2].set_markerfacecolor('green')
        ax.get_lines()[2].set_label('GP')
        ax.set_xlabel('MC')
        ax.set_ylabel('')
        plt.legend(loc='best')
        fig.tight_layout()
        plotfilename = os.path.join(plotoutdir, "qqplot_{}.pdf".format(j))
        plt.savefig(plotfilename)
        # plt.show()
        plt.close('all')
def validate(replicates, sample_size):
    """
    Validate that we are simulating the same things in the simulators
    by running some replicates and plotting the distributions of the
    number of output trees.

    Prints the mean counts and writes two QQ plots under ``figures/``:
    simbac against the msprime breakpoint counts, and fastsimbac against
    the msprime tree counts.

    Args:
        replicates: number of replicates to run for each simulator.
        sample_size: sample size passed to every simulator.
    """
    # Parameters shared by all three simulators.
    common = dict(
        sample_size=sample_size,
        L=1000,
        gc_rate=0.015,
        gc_tract_length=10,
    )
    nt_simbac = np.zeros(replicates)
    nt_fastsimbac = np.zeros(replicates)
    nt_msprime = np.zeros(replicates)
    nb_msprime = np.zeros(replicates)
    with click.progressbar(range(replicates)) as bar:
        for j in bar:
            nt_simbac[j] = run_simbac(count_trees=True, **common)
            # fastsimbac is the only simulator seeded explicitly here.
            nt_fastsimbac[j] = run_fastsimbac(
                set_seed=j, count_trees=True, **common)
            nt_msprime[j], nb_msprime[j] = run_msprime(
                ret_breakpoints=True, **common)

    print(
        "mean number of trees:",
        "simbac=",
        np.mean(nt_simbac),
        "fastsimbac=",
        np.mean(nt_fastsimbac),
        "msprime trees=",
        np.mean(nt_msprime),
        "msprime breakpoints=",
        np.mean(nb_msprime),
    )

    # simbac is compared with the breakpoint counts, fastsimbac with the
    # tree counts, as in the original comparisons.
    comparisons = (
        ("simbac", nt_simbac, nb_msprime),
        ("fastsimbac", nt_fastsimbac, nt_msprime),
    )
    for name, counts, msprime_counts in comparisons:
        sm.graphics.qqplot(counts)
        sm.qqplot_2samples(counts, msprime_counts, line="45")
        plt.xlabel(name)
        plt.ylabel("msprime")
        plt.savefig("figures/verify_{}_v_msprime.png".format(name))
        # Close after every comparison (the second one used to stay open).
        plt.close("all")
##pareto solving from sympy import Eq, Symbol, solve k = Symbol('k') eqn = Eq( (esp_c**2)+ (2*var_c*k)-(var_c*k**2) ,0) k_root = solve(eqn)[1] xm = Symbol('xm') eqn2 = Eq( esp_c - k_root*xm/(k_root-1) ) xm_root = solve(eqn2)[0] # xm_root = 499.55021 # mode = xm_root*((k_root-1)/k_root)**1/k_root exp_c = np.random.exponential(scale=esp_c,size=len(vec_c)) count, bins, _ = plt.hist(exp_c, 200, normed=True) sm.qqplot_2samples(exp_c,vec_c,line='r').suptitle('echantillon C ~> expo (KS_dist = 0.49)', fontsize=20) stats.ks_2samp(np.sort(exp_c),vec_c) #Ks_2sampResult(statistic=0.49, pvalue=0.0 norm_c = np.random.normal(loc=esp_c,scale=sigma_c,size=len(vec_c)) count, bins, _ = plt.hist(norm_c, 200, normed=True) sm.qqplot_2samples(norm_c,np.log(vec_c),line='r').suptitle('echantillon C ~> normal (KS_dist = 0.07)', fontsize=20) stats.ks_2samp(np.sort(norm_c),vec_c) #Ks_2sampResult(statistic=0.068199999999999927, pvalue=6.3527863052483843e-41) par_c = (np.random.pareto(k_root,len(vec_c))+1) * float(xm_root) count, bins, _ = plt.hist(par_c, 200, normed=True) sm.qqplot_2samples(par_c,vec_c,line='r').suptitle('echantillon C ~> pareto (KS_dist = 0.27)', fontsize=20) stats.ks_2samp(np.sort(par_c),vec_c) #Ks_2sampResult(statistic=0.26915, pvalue=0.0)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.distributions.empirical_distribution import ECDF
import statsmodels.api as sm
from paretochart.pareto import pareto

# Hospital staffing data, semicolon-separated.
# NOTE(review): absolute path on one user's machine - the script will not
# run elsewhere without changing it.
df_ospedali = pd.read_csv( '/home/gabriele/Documenti/Università/Statistica/Dataset/csv_lab/dati-ospedali.csv', sep=';')
#print(df_ospedali)

# The two bare string literals below hold disabled code (an ECDF plot with a
# histogram, and a QQ plot of 'Medici SSN' against 'Farmacisti SSN'); as
# plain expression statements they have no effect.
""" dist = ECDF(df_ospedali['Medici SSN']) #print(dist) plt.plot(dist.x, dist.y) plt.show() df_ospedali['Medici SSN'].plot.hist() plt.show() """
""" sm.qqplot_2samples(df_ospedali['Medici SSN'], df_ospedali['Farmacisti SSN'], line = '45') plt.show() """

grouped = df_ospedali.groupby( 'Regione') # group the rows by the values of the 'Regione' column
# Total 'Medici SSN' per region, drawn as a Pareto chart labelled by region.
temp = grouped['Medici SSN'].sum()
pareto(temp, labels=temp.index)
plt.show()