def _threshold_counts(counts, thresholdLow, thresholdHigh, n_vec):
    y11 = []
    y11_upper_err = []
    y11_lower_err = []
    y10_01 = []
    y10_01_upper_err = []
    y10_01_lower_err = []
    y00 = []
    y00_upper_err = []
    y00_lower_err = []
    for countVec, N in zip(counts, n_vec):
        countVec = countVec[0:int(N)]
        p11 = sum([x >= thresholdHigh for x in countVec]) / N
        p10_01 = sum([((x < thresholdHigh) & (x > thresholdLow)) for x in countVec]) / N
        p00 = sum([x <= thresholdLow for x in countVec]) / N
        y11.append(p11)
        y10_01.append(p10_01)
        y00.append(p00)
        kci11 = binom.interval(0.68, N, p11)
        kci10_01 = binom.interval(0.68, N, p10_01)
        kci00 = binom.interval(0.68, N, p00)
        y11_lower_err.append(p11 - kci11[0] / N)
        y11_upper_err.append(kci11[1] / N - p11)
        y10_01_lower_err.append(p10_01 - kci10_01[0] / N)
        y10_01_upper_err.append(kci10_01[1] / N - p10_01)
        y00_lower_err.append(p00 - kci00[0] / N)
        y00_upper_err.append(kci00[1] / N - p00)
    return (y11, y11_upper_err, y11_lower_err,
            y00, y00_upper_err, y00_lower_err,
            y10_01, y10_01_upper_err, y10_01_lower_err)
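# A minimal usage sketch of _threshold_counts with hypothetical count data (two settings,
# 500 shots each, drawn from Poisson distributions); it assumes numpy and
# scipy.stats.binom are imported in the module that defines the function.
import numpy as np
from scipy.stats import binom

rng = np.random.default_rng(1)
n_vec = [500, 500]
counts = [rng.poisson(lam, size=500) for lam in (3, 20)]   # illustrative data only

result = _threshold_counts(counts, thresholdLow=5, thresholdHigh=12, n_vec=n_vec)
y11, y11_up, y11_lo = result[0], result[1], result[2]
print(y11)   # fraction of shots above the high threshold, per setting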
def error_bars(df):
    """
    Return 68% (1-sigma) binomial confidence intervals.

    pyplot.errorbar expects a 2xN array of unsigned offsets relative to the points.
    """
    # loc=-k shifts the interval so its endpoints are relative to the observed count k;
    # wrapping the returned tuple in np.array makes the division by n valid.
    errs = [np.array(binom.interval(0.68, n, p=k / n, loc=-k)) / n
            for n, k in zip(df.Trials, df.Observations)]
    return np.abs(np.array(errs).T)
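# A usage sketch for error_bars, assuming a hypothetical pandas DataFrame with
# Trials and Observations columns and that numpy and scipy.stats.binom are already
# imported alongside the function above. The 2xN offsets feed straight into plt.errorbar.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({"Trials": [50, 80, 120], "Observations": [12, 30, 90]})  # illustrative data

p_hat = df.Observations / df.Trials
yerr = error_bars(df)                 # 2xN array of unsigned offsets
plt.errorbar(range(len(df)), p_hat, yerr=yerr, fmt="o", capsize=4)
plt.ylabel("Estimated proportion")
plt.show()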
def plot_reliability_diagram(y,x,bins=np.linspace(0,1,21),size_points=False, show_baseline=True, error_bars=True, error_bar_alpha = .05, marker='+',c='red', **kwargs): # if ax is None: # ax = plt.gca() # fig = ax.get_figure() digitized_x = np.digitize(x, bins) mean_count_array = np.array([[np.mean(y[digitized_x == i]),len(y[digitized_x == i]),np.mean(x[digitized_x==i])] for i in np.unique(digitized_x)]) x_pts_to_graph = mean_count_array[:,2] y_pts_to_graph = mean_count_array[:,0] pt_sizes = mean_count_array[:,1] plt.subplot(1,2,1) if show_baseline: plt.plot(np.linspace(0,1,100),(np.linspace(0,1,100)),'k--') # ax.loglog(np.linspace(0,1,100),(np.linspace(0,1,100)),'k--') for i in range(len(y_pts_to_graph)): if size_points: plt.scatter(x_pts_to_graph,y_pts_to_graph,s=pt_sizes,marker=marker,c=c, **kwargs) else: plt.scatter(x_pts_to_graph,y_pts_to_graph, c=c, **kwargs) plt.axis([-0.1,1.1,-0.1,1.1]) if error_bars: yerr_mat = binom.interval(1-error_bar_alpha,pt_sizes,x_pts_to_graph)/pt_sizes - x_pts_to_graph yerr_mat[0,:] = -yerr_mat[0,:] plt.errorbar(x_pts_to_graph, x_pts_to_graph, yerr=yerr_mat, capsize=5) plt.subplot(1,2,2) plt.hist(x,bins=bins) return(x_pts_to_graph,y_pts_to_graph,pt_sizes)
def check(N, p):
    global numfails, numchecks, mu, sigma2
    H = NeuronGroup(1, 'v:1', threshold='False', name='H')
    G = NeuronGroup(N, 'v:1', threshold='False', name='G')
    S = Synapses(H, G, on_pre='v+=w', name='S')
    S.connect(p=p)
    m = len(S)
    low, high = binom.interval(alpha, N, p)
    if p == 0:
        low = high = 0
    elif p == 1:
        low = high = N
    else:
        i = diff(S.j[:])
        i = i[i < isi_max[p]]
        b = bincount(i, minlength=isi_max[p])[:isi_max[p]]
        if b[0]:
            print 'Major error: repeated indices for N=%d, p=%.3f' % (N, p)
            raise ValueError("Repeated indices")
        isi[p] += b
        num_isi[p] += sum(b)
    q = binom.cdf(low - 0.1, N, p) + binom.sf(high + 0.1, N, p)
    mu += q
    sigma2 += q * (1 - q)
    numchecks += 1
    if m < low or m > high:
        numfails += 1
        return True
    else:
        return False
def plot_ranks(self, x=None, p=None, nbins=None, figsize=6, testing_fn=None):
    if testing_fn is None:
        testing_fn = testing_fn_generator(x, p)
    ranks = zeros(p.shape)
    N, n = p.shape
    for i, m in enumerate(self.predict(testing_fn)):
        ranks[i] = sum(m["samples"] < p[i].reshape((1, -1)), axis=0)
    if nbins is None:
        nbins = int(N**.5)
        while self.n_samples % nbins:
            nbins += 1
    f, ax = subplots(n, 1, figsize=(figsize, 2 / 3 * figsize * n),
                     gridspec_kw={"hspace": 0.3})
    if n == 1:
        ax = [ax]
    interval = binom.interval(0.9, N, 1 / nbins)
    edges = linspace(0, self.n_samples, nbins + 1)
    for i in range(n):
        ax[i].hist(ranks[:, i], edges)
        ax[i].axhspan(*interval, color="r", alpha=0.3)
        ax[i].set_title(self.labels[i])
        ax[i].set_xlabel("rank of truth among posterior samples")
def qqplot(data, labels, n_quantiles=100, alpha=0.95, error_type='theoretical', distribution = 'binomial', log10conv=True, color=['k', 'r', 'b'], fill_dens=[0.1, 0.1, 0.1], type = 'uniform', title='title'): ''' Function for plotting Quantile Quantile (QQ) plots with confidence interval (CI) :param data: NumPy 1D array with data :param labels: :param type: type of the plot :param n_quantiles: number of quntiles to plot :param alpha: confidence interval :param log10conv: conversion to -log10(p) for the figure :return: nothing ''' xmax = 0 ymax = 0 if type == 'uniform': # we expect distribution from 0 to 1 for j in range(len(data)): # define quantiles positions: q_pos = np.concatenate([np.arange(99.)/len(data[j]), np.logspace(-np.log10(len(data[j]))+2, 0, n_quantiles)]) # define quantiles in data q_data = mquantiles(data[j], prob=q_pos, alphap=0, betap=1, limit=(0, 1)) # linear interpolation # define theoretical predictions q_th = q_pos.copy() # evaluate errors q_err = np.zeros([len(q_pos),2]) if np.sum(alpha) > 0: for i in range(0, len(q_pos)): if distribution == 'binomial': q_err[i, :] = binom.interval(alpha=alpha, n=len(data[j]), p=q_pos[i]) elif distribution == 'normal': q_err[i, :] = norm.interval(alpha, len(data[j])*q_pos[i], np.sqrt(len(data[j])*q_pos[i]*(1.-q_pos[i]))) q_err[i, q_err[i, :] < 0] = 1e-12 else: print('Distribution is not defined!') q_err /= 1.0*len(data[j]) for i in range(0, 100): q_err[i,:] += 1e-12 # print(q_err[100:, :]) slope, intercept, r_value, p_value, std_err = linregress(q_th, q_data) # print(labels[j], ' -- Slope: ', slope, " R-squared:", r_value**2) plt.plot(-np.log10(q_th[n_quantiles-1:]), -np.log10(q_data[n_quantiles-1:]), '-', color=color[j]) plt.plot(-np.log10(q_th[:n_quantiles]), -np.log10(q_data[:n_quantiles]), '.', color=color[j], label=labels[j]) xmax = np.max([xmax, - np.log10(q_th[1])]) ymax = np.max([ymax, - np.log10(q_data[0])]) # print(- np.log10(q_th[:])) if np.sum(alpha)>0: if error_type=='experimental': plt.fill_between(-np.log10(q_th), -np.log10(q_data/q_th*q_err[:,0]), -np.log10(q_data/q_th*q_err[:,1]), color=color[j], alpha=fill_dens[j], label='%1.3f CI'%alpha) if np.sum(alpha)>0: if error_type=='theoretical': plt.fill_between(-np.log10(q_th), -np.log10(q_err[:,0]), -np.log10(q_err[:,1]), color=color[j], alpha=fill_dens[j], label='%1.3f CI'%alpha) plt.legend(loc=4) plt.xlabel('Theoretical -log10') plt.ylabel('Experimental -log10') plt.plot([0, 100], [0, 100],'--k') # print(xmax,ymax) plt.xlim([0, np.ceil(xmax)]) plt.ylim([0, np.ceil(ymax*1.05)]) plt.title(title) plt.tight_layout()
def isRatioSignificantlyCorrect(variant_users, total_users, division, threshold):
    alpha = threshold  # significance p-value
    lower_bound, upper_bound = binom.interval(1 - alpha, total_users, division)
    isSig = (variant_users > lower_bound and variant_users < upper_bound)
    return isSig, lower_bound, upper_bound
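# A quick usage sketch with hypothetical numbers: 4,950 of 10,000 users landed in the
# variant under an intended 50/50 split, tested at the 5% significance level. Assumes
# scipy.stats.binom is imported where isRatioSignificantlyCorrect is defined.
ok, lo, hi = isRatioSignificantlyCorrect(4950, 10000, 0.5, 0.05)
print(ok, lo, hi)   # ok is True when the observed count lies strictly inside (~4902, ~5098)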
def proportion_ci(p, n, alpha=0.05):
    """
    Confidence interval for a proportion (percentage data), based on the binomial distribution.

    :param p: observed proportion (successes / trials)
    :param n: number of trials
    :param alpha: significance level; the default 0.05 gives a 95% interval
    :return: (lower, upper) bounds on the proportion
    """
    from scipy.stats import binom
    lower, upper = binom.interval(1 - alpha, n, p)
    return lower / n, upper / n
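# Example call: 30 successes out of 120 trials gives a 95% interval on the proportion.
lo, hi = proportion_ci(30 / 120, 120)
print(round(lo, 3), round(hi, 3))   # roughly (0.175, 0.325)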
def get_proper_hets(self):
    # Just get the binomial added in, add filters later.
    # binom.interval returns bounds on the *count* of alt alleles; divide by the number
    # of samples (rather than multiplying, which was a bug) so the bounds are on the
    # same 0-1 scale as the AF (allele frequency) field.
    tmp_interval = binom.interval(0.95, self.__num_samples, 0.5)
    interval = [tmp_interval[0] / self.__num_samples,
                tmp_interval[1] / self.__num_samples]
    proper_variants = []
    for variant in self.__vcf_file:
        if (len(variant.ALT) == 1
                and variant.INFO.get('AF') >= interval[0]
                and variant.INFO.get('AF') <= interval[1]):
            proper_variants.append(variant)
    return proper_variants
def test2():
    count = 1000
    visit_params = [0.3, 0.6]
    ds = generate_visits(count, visit_params)
    assert (len(ds) == count)
    assert (len(ds[0]) == len(visit_params))
    sums = np.sum(ds, axis=0)
    print(sums.shape)
    for i, col in enumerate(ds[0]):
        assert (col == 1 or col == 0)
        pct = visit_params[i]
        (lo, hi) = binom.interval(.954, 1000, pct)
        print("sums", i, lo, sums[i], hi)
        assert (lo <= sums[i] <= hi)
def test_sample_probs(self):
    probs = [0.3, 0.6, 0.1]
    token_sampler = TokenSampler(
        batch_size=1,
        banned_tokens_ids=[],
        non_penalizable_tokens_ids=range(len(probs)),
        repetition_penalization_coefficient=REPETITION_PENALIZE_COEFFICIENT)
    adjusted_confidence_level = _CONFIDENCE_LEVEL / len(probs)  # bonferroni correction
    confidence_intervals = [binom.interval(1 - adjusted_confidence_level, _SAMPLES_NUM, p)
                            for p in probs]
    est_probs_from, est_probs_to = zip(*confidence_intervals)
    samples = np.array([token_sampler.sample(probs, 0) for _ in xrange(_SAMPLES_NUM)])
    counts = {val: np.sum(samples == val) for val in np.unique(samples)}
    for i, _ in enumerate(probs):
        self.assertLessEqual(counts[i], est_probs_to[i])
        self.assertGreaterEqual(counts[i], est_probs_from[i])
def test_sample_probs(self):
    probs = [0.3, 0.6, 0.1]
    token_sampler = TokenSampler(
        batch_size=1,
        banned_tokens_ids=[],
        non_penalizable_tokens_ids=range(len(probs)),
        repetition_penalization_coefficient=REPETITION_PENALIZE_COEFFICIENT)
    adjusted_confidence_level = _CONFIDENCE_LEVEL / len(probs)  # bonferroni correction
    confidence_intervals = [binom.interval(1 - adjusted_confidence_level, _SAMPLES_NUM, p)
                            for p in probs]
    est_probs_from, est_probs_to = zip(*confidence_intervals)
    samples = np.array([token_sampler.sample(probs, 0) for _ in range(_SAMPLES_NUM)])
    counts = {val: np.sum(samples == val) for val in np.unique(samples)}
    for i, _ in enumerate(probs):
        self.assertLessEqual(counts[i], est_probs_to[i])
        self.assertGreaterEqual(counts[i], est_probs_from[i])
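# The same style of check can be exercised without the TokenSampler dependency.
# A minimal standalone sketch (sample count and names are illustrative): a categorical
# sampler is verified against Bonferroni-corrected binomial intervals per category.
import numpy as np
from scipy.stats import binom

rng = np.random.default_rng(0)
probs = [0.3, 0.6, 0.1]
n_samples = 10_000
alpha = 0.05 / len(probs)          # Bonferroni-corrected significance level

samples = rng.choice(len(probs), size=n_samples, p=probs)
for i, p in enumerate(probs):
    lo, hi = binom.interval(1 - alpha, n_samples, p)
    count = np.sum(samples == i)
    assert lo <= count <= hi, "token %d: %d outside [%s, %s]" % (i, count, lo, hi)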
def stat_bursty_tweets(time_window_tweets, expectation_features, features_to_find):
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    N = len(time_window_tweets)
    tokenized_tweets = [(tweet['id'], tokenize_tweet(tweet['full_text']))
                        for tweet in time_window_tweets]
    bag_of_feature = {}
    for id_flist in tokenized_tweets:
        for feature in id_flist[1]:
            bag_of_feature[feature] = bag_of_feature.get(feature, set())
            bag_of_feature[feature].add(id_flist[0])
    for feature in bag_of_feature:
        # Determine whether the feature matches the truth or not
        is_bursty = bl_rt(feature, features_to_find)
        n_feature_appear_in = len(bag_of_feature[feature])
        feature_info = expectation_features.get(feature, [0, 0])
        expected = feature_info[1]
        ra = math.floor(expected * N)  # max of the distribution
        rb = binom.interval(0.999, N, expected)[1]  # point where the distribution is (almost) 0
        q = (rb + ra) / 2
        if (n_feature_appear_in >= q):
            if is_bursty:
                TP += 1
            else:
                FP += 1
        else:
            if is_bursty:
                FN += 1
            else:
                TN += 1
    return TP, FP, TN, FN
def test_repetition_penalization(self):
    probs = [0.5, 0.5]
    actual_num_nonequal_pairs = 0
    for _ in xrange(_SAMPLES_NUM):
        token_sampler = TokenSampler(
            batch_size=1,
            banned_tokens_ids=[],
            non_penalizable_tokens_ids=[],
            repetition_penalization_coefficient=REPETITION_PENALIZE_COEFFICIENT)
        first_token = token_sampler.sample(probs, sample_idx=0)
        second_token = token_sampler.sample(probs, sample_idx=0)
        actual_num_nonequal_pairs += int(first_token != second_token)
    # P(first != second) = P(first=0, second=1) + P(first=1, second=0) =
    #   = 0.5 * 0.5 * r / (0.5 + 0.5 * r) + 0.5 * 0.5 * r / (0.5 + 0.5 * r) = r / (1 + r)
    expected_nonequal_pair_rate = REPETITION_PENALIZE_COEFFICIENT / (1 + REPETITION_PENALIZE_COEFFICIENT)
    expected_nonequal_pair_rate_from, expected_nonequal_pair_rate_to = \
        binom.interval(1 - _CONFIDENCE_LEVEL, _SAMPLES_NUM, expected_nonequal_pair_rate)
    self.assertLessEqual(actual_num_nonequal_pairs, expected_nonequal_pair_rate_to)
    self.assertGreaterEqual(actual_num_nonequal_pairs, expected_nonequal_pair_rate_from)
def test_repetition_penalization(self):
    probs = [0.5, 0.5]
    actual_num_nonequal_pairs = 0
    for _ in range(_SAMPLES_NUM):
        token_sampler = TokenSampler(
            batch_size=1,
            banned_tokens_ids=[],
            non_penalizable_tokens_ids=[],
            repetition_penalization_coefficient=REPETITION_PENALIZE_COEFFICIENT)
        first_token = token_sampler.sample(probs, sample_idx=0)
        second_token = token_sampler.sample(probs, sample_idx=0)
        actual_num_nonequal_pairs += int(first_token != second_token)
    # P(first != second) = P(first=0, second=1) + P(first=1, second=0) =
    #   = 0.5 * 0.5 * r / (0.5 + 0.5 * r) + 0.5 * 0.5 * r / (0.5 + 0.5 * r) = r / (1 + r)
    expected_nonequal_pair_rate = REPETITION_PENALIZE_COEFFICIENT / (1 + REPETITION_PENALIZE_COEFFICIENT)
    expected_nonequal_pair_rate_from, expected_nonequal_pair_rate_to = \
        binom.interval(1 - _CONFIDENCE_LEVEL, _SAMPLES_NUM, expected_nonequal_pair_rate)
    self.assertLessEqual(actual_num_nonequal_pairs, expected_nonequal_pair_rate_to)
    self.assertGreaterEqual(actual_num_nonequal_pairs, expected_nonequal_pair_rate_from)
def data_for_visualization(self):
    df = self.data
    df = df[df.dimension.notnull() & df.value.notnull()]
    output = dict()
    # Set title
    output['title'] = self.title
    # Set levels
    levels = sorted(df.value.unique().tolist())
    level_s = [str(v) for v in levels]
    output['levels'] = level_s
    # Compute overall freq and cumulative frequency
    overall_f = self.freq_from_df(df, levels)
    overall_cum = self.cumulative_frequency(overall_f)
    output['overall_f'] = overall_f
    output['overall_cum'] = overall_cum
    # Compute dimensions
    dimensions = list()
    for dim in df.dimension.unique().tolist():
        df_d = df[df.dimension == dim]
        pop_size = len(df_d)
        freq = self.freq_from_df(df_d, levels)
        cum = self.cumulative_frequency(freq)
        ranges = binom.interval(0.95, pop_size, overall_cum)
        range_low = [x / pop_size for x in ranges[0]]
        range_high = [x / pop_size for x in ranges[1]]
        dimensions.append(
            {'name': dim,
             'freq': freq,
             'cum': cum,
             'overall_range_low': range_low,
             'overall_range_high': range_high,
             })
    output['dimensions'] = dimensions
    return output
def test_nonpenalizable_tokens_2(self):
    probs = [0.5, 0.5]
    actual_num_nonequal_pairs = 0
    samples_generated = 0
    while samples_generated < _SAMPLES_NUM:
        token_sampler = TokenSampler(
            batch_size=1,
            banned_tokens_ids=[],
            non_penalizable_tokens_ids=[1],
            repetition_penalization_coefficient=REPETITION_PENALIZE_COEFFICIENT)
        first_token = token_sampler.sample(probs, sample_idx=0)
        if first_token == 0:
            samples_generated += 1
            second_token = token_sampler.sample(probs, sample_idx=0)
            actual_num_nonequal_pairs += (first_token != second_token)
    # When we penalize for token #0,
    # P(first != second | first=0) = P(second=1 | first=0) = 0.5 * r / (0.5 + 0.5 * r) = r / (1 + r)
    expected_nonequal_pair_rate = REPETITION_PENALIZE_COEFFICIENT / (1 + REPETITION_PENALIZE_COEFFICIENT)
    expected_nonequal_pair_rate_from, expected_nonequal_pair_rate_to = binom.interval(
        1 - _CONFIDENCE_LEVEL, _SAMPLES_NUM, expected_nonequal_pair_rate)
    self.assertLessEqual(actual_num_nonequal_pairs, expected_nonequal_pair_rate_to)
    self.assertGreaterEqual(actual_num_nonequal_pairs, expected_nonequal_pair_rate_from)
def Plot_the_distribution_MDs(GSCs_data, Non_GCSs_data, bins, MDs_lengths, genome_len, path_out): #Parameters ticks1 = [ 0, 500000, 1000000, 1500000, 2000000, 2500000, 3000000, 3500000, 4000000, 4500000 ] xticknames1 = [ '', '500', '1000', '1500', '2000', '2500', '3000', '3500', '4000', '4500' ] colors = [ '#7FCE79', '#BAE85C', '#ff878b', '#8991ff', '#ac5eff', '#50b3ff', '#ffd75e' ] plot_names = [ 'plot1', 'plot2', 'plot3', 'plot4', 'plot5', 'plot6', 'plot7' ] Y_labels = [ 'Cfx GCSs', 'RifCfx GCSs', 'Micro GCSs', 'Oxo GCSs', 'Score', 'GC%', 'Transcription\nlevel' ] yter = 1592477 yori = 3711828 #Prepare GCSs data to bar-compatible format. GCSs_data_bared = {} for ab, ab_ar in GSCs_data.items(): bar_ar = [] for i in range(len(bins) - 1): ab_num = 0 if bins[i] >= 0: for gcs in ab_ar: if bins[i + 1] > gcs >= bins[i]: ab_num += 1 bar_ar.append(ab_num) else: for gcs in ab_ar: if bins[i + 1] > gcs >= 0 or genome_len > gcs >= bins[ i] + genome_len: ab_num += 1 bar_ar.append(ab_num) bar_ar.append(bar_ar[0]) print(ab, bar_ar) GCSs_data_bared[ab] = bar_ar #Compute confident intervals for GCSs number fall into MDs. MDs_confident_intervals = {} for ab, ab_ar in GSCs_data.items(): upper_edge = [] lower_edge = [] print(MDs_lengths) for MD_len in MDs_lengths: upper_edge.append( binom.interval(0.999, len(ab_ar), MD_len / genome_len)[1]) lower_edge.append( binom.interval(0.999, len(ab_ar), MD_len / genome_len)[0]) upper_edge = [upper_edge[0]] + upper_edge + [upper_edge[0] ] + [upper_edge[0]] lower_edge = [lower_edge[0]] + lower_edge + [lower_edge[0] ] + [lower_edge[0]] MDs_confident_intervals[ab] = [lower_edge, upper_edge] print(ab, lower_edge) print(ab, upper_edge) #GCSs data plotting. fig, plot_names = plt.subplots(7, 1, figsize=(11, 15), dpi=100) print(bins) position = [0] + bins[1:] print(position) bin_width = [] position_bw = position + [genome_len] for j in range(len(position_bw) - 1): bin_width.append(position_bw[j + 1] - position_bw[j]) print(bin_width) position_centre = [] for j in range(len(bin_width)): position_centre.append(int(position[j] + (bin_width[j] / 2))) position_centre = [0] + position_centre + [genome_len] i = 0 Histo_comp_dict = { } #Will contain computed histogramm data (bins and values) for key, value in GCSs_data_bared.items(): plot_names[i].set_xlim(0, genome_len) plot_names[i].set_xticks(ticks1, minor=False) plot_names[i].set_xticks([yter, yori], minor=True) plot_names[i].set_xticklabels(xticknames1) plt.setp(plot_names[i].set_xticklabels(xticknames1), rotation=0, fontsize=14) plot_names[i].locator_params(axis='y', nbins=6) plot_names[i].tick_params(axis='x', which='major', labelsize=19) plot_names[i].bar(position, value, bin_width, color=colors[i], linewidth=1, edgecolor='black', align='edge', zorder=1) #Barplot for GCSs number plot_names[i].plot(position_centre, MDs_confident_intervals[key][0], linestyle=":", color="black", linewidth=1, zorder=8) #Lower confidential border plot_names[i].plot(position_centre, MDs_confident_intervals[key][1], linestyle=":", color="black", linewidth=1, zorder=9) #Upper confidential border plot_names[i].plot(position_centre, MDs_confident_intervals[key][0], marker="_", color="black", linewidth=0, markersize=15, zorder=6) #Lower confidential border plot_names[i].plot(position_centre, MDs_confident_intervals[key][1], marker="_", color="black", linewidth=0, markersize=15, zorder=7) #Upper confidential border plot_names[i].fill_between(position_centre, MDs_confident_intervals[key][0], MDs_confident_intervals[key][1], facecolor='grey', alpha=0.3, 
zorder=10) #Fill confident interval plot_names[i].tick_params(axis='y', which='major', pad=7, labelsize=15) plot_names[i].set_ylabel(Y_labels[i], size=22, labelpad=8, rotation=90) i += 1 #Score, GC, Transcription plotting. for key, value in Non_GCSs_data.items(): value.append(value[0]) plot_names[i].set_xlim(0, genome_len) if key == "GC": plot_names[i].set_ylim(45, max(value) + 2) elif key == "Score": plot_names[i].set_ylim(min(value) - 0.2, -1.5) plot_names[i].set_xticks(ticks1, minor=False) plot_names[i].set_xticks([yter, yori], minor=True) plot_names[i].set_xticklabels(xticknames1) plt.setp(plot_names[i].set_xticklabels(xticknames1), rotation=0, fontsize=14) plot_names[i].tick_params(axis='x', which='major', labelsize=19) plot_names[i].locator_params(axis='y', nbins=6) plot_names[i].bar(position, value, bin_width, color=colors[i], linewidth=1, edgecolor='black', align='edge') plot_names[i].tick_params(axis='y', which='major', pad=7, labelsize=15) plot_names[i].set_ylabel(Y_labels[i], size=22, labelpad=8, rotation=90) i += 1 plt.tight_layout() fig.savefig(path_out + "GCSs_num_score_GC133_transcription_distrib_thr_genome.png", figsize=(11, 15), dpi=400) #GCSs data plotting for Cfx, Micro, and Oxo only. GSCs_data_main = { 'Cfx': GCSs_data_bared['Cfx'], 'Micro': GCSs_data_bared['Micro'], 'Oxo': GCSs_data_bared['Oxo'] } Y_labels_main = ['Cfx GCSs', 'Micro GCSs', 'Oxo GCSs'] colors_main = ['#7FCE79', '#ff878b', '#8991ff'] fig, plot_names = plt.subplots(3, 1, figsize=(11, 7), dpi=100) i = 0 for key, value in GSCs_data_main.items(): plot_names[i].set_xlim(0, genome_len) plot_names[i].set_xticks(ticks1, minor=False) plot_names[i].set_xticks([yter, yori], minor=True) plot_names[i].set_xticklabels(xticknames1) plt.setp(plot_names[i].set_xticklabels(xticknames1), rotation=0, fontsize=14) plot_names[i].locator_params(axis='y', nbins=6) plot_names[i].tick_params(axis='x', which='major', labelsize=19) plot_names[i].bar(position, value, bin_width, color=colors_main[i], linewidth=1, edgecolor='black', align='edge') plot_names[i].tick_params(axis='y', which='major', pad=7, labelsize=15) plot_names[i].set_ylabel(Y_labels_main[i], size=22, labelpad=8, rotation=90) i += 1 plt.tight_layout() fig.savefig(path_out + "GCSs_number_Cfx_Micro_Oxo_distrib_thr_genome.png", figsize=(11, 7), dpi=400) return GCSs_data_bared, Non_GCSs_data
def generate_visits(self, count):
    dataset = []
    for _ in range(count):
        obs = []
        for param in self.visitor_params:
            val = 1 if random.random() < param else 0
            obs.append(val)
        dataset.append(obs)
    return dataset


if __name__ == "__main__":
    count = 1000
    visitor_params = [0.3, 0.6]
    vm = VisitorModel(["a", "b"], visitor_params)
    ds = vm.generate_visits(count)
    assert (len(ds) == count)
    assert (len(ds[0]) == len(visitor_params))
    sums = np.sum(ds, axis=0)
    print(sums.shape)
    for i, col in enumerate(ds[0]):
        assert (col == 1 or col == 0)
        pct = visitor_params[i]
        (lo, hi) = binom.interval(.954, 1000, pct)
        print("sums", i, lo, sums[i], hi)
        assert (lo <= sums[i] <= hi)
for N in N_range:
    print 'Starting N =', N
    for p in p_range:
        num_Np_fails = 0
        num_Np_checks = 0
        for _ in xrange(repeats):
            if check(N, p):
                num_Np_fails += 1
            num_Np_checks += 1
        # Work out what the failure probability is (approximately but not exactly 1-alpha
        # because it's a discrete distribution).
        low, high = binom.interval(alpha, N, p)
        if p == 0:
            low = high = 0
        elif p == 1:
            low = high = N
        q = binom.cdf(low - 0.1, N, p) + binom.sf(high + 0.1, N, p)
        low, high = binom.interval(alpha, num_Np_checks, q)
        if q == 0:
            low = high = 0
        if num_Np_fails < low or num_Np_fails > high:
            print 'N=%d, p=%.3f failed %d of %d checks, outside range (%d, %d)' % (N, p, num_Np_fails, num_Np_checks, low, high)
            print

failrate = float(numfails) / numchecks
low, high = norm.interval(alpha, loc=mu, scale=sqrt(sigma2))
print '%d/%d=%.2f%% failed at %d%%' % (numfails, numchecks, numfails * 100.0 / numchecks, 100 * alpha)
def plot_sbc(theta_samples, theta_test, param_names, bins=20, figsize=(15, 5),
             interval=0.99, show=True, filename=None, font_size=12):
    """
    Plots the simulation-based posterior checking histograms as advocated by Talts et al. (2018).
    """

    # Plot settings
    plt.rcParams['font.size'] = font_size
    N = int(theta_test.shape[0])

    # Prepare figure
    if len(param_names) >= 6:
        n_col = int(np.ceil(len(param_names) / 2))
        n_row = 2
    else:
        n_col = int(len(param_names))
        n_row = 1

    # Initialize figure
    f, axarr = plt.subplots(n_row, n_col, figsize=figsize)
    if n_row > 1:
        axarr = axarr.flat

    # Compute ranks (using broadcasting)
    ranks = np.sum(theta_samples < theta_test[:, np.newaxis, :], axis=1)

    # Compute interval
    endpoints = binom.interval(interval, N, 1 / (bins + 1))

    # Plot histograms
    for j in range(len(param_names)):

        # Add interval
        axarr[j].axhspan(endpoints[0], endpoints[1], facecolor='gray', alpha=0.3)
        axarr[j].axhline(np.mean(endpoints), color='gray', zorder=0, alpha=0.5)

        sns.distplot(ranks[:, j], kde=False, ax=axarr[j], color='#a34f4f',
                     hist_kws=dict(edgecolor="k", linewidth=1, alpha=1.), bins=bins)

        axarr[j].set_title(param_names[j])
        axarr[j].spines['right'].set_visible(False)
        axarr[j].spines['top'].set_visible(False)
        if j == 0:
            axarr[j].set_xlabel('Rank statistic')
        axarr[j].get_yaxis().set_ticks([])

    f.tight_layout()

    # Show, if specified
    if show:
        plt.show()

    # Save if specified
    if filename is not None:
        f.savefig("figures/{}_sbc.png".format(filename), dpi=600)
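# A minimal usage sketch for plot_sbc with synthetic, perfectly calibrated data, assuming
# theta_samples has shape (n_test, n_posterior_samples, n_params) and that the module-level
# imports used by the function (numpy, matplotlib, seaborn with distplot, scipy.stats.binom)
# are available. Parameter names are illustrative.
import numpy as np

rng = np.random.default_rng(0)
n_test, n_post, n_params = 200, 100, 2

# Posterior samples drawn from the same distribution as the test parameters,
# so the rank statistics should look uniform and fall inside the gray band.
theta_test = rng.normal(size=(n_test, n_params))
theta_samples = rng.normal(size=(n_test, n_post, n_params))

plot_sbc(theta_samples, theta_test, param_names=['mu', 'sigma'], bins=20)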
def test_issue_11134():
    alpha, n, p = 0.95, 10, 0
    assert_equal(binom.interval(alpha=alpha, n=n, p=p), (0, 0))
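# For context, a quick sketch of the degenerate endpoints this test guards: p=0 should
# collapse the interval to (0, 0) and p=1 to (n, n). This behaviour is only guaranteed in
# SciPy releases that include the issue-11134 fix.
from scipy.stats import binom

print(binom.interval(0.95, 10, 0))   # expected (0, 0)
print(binom.interval(0.95, 10, 1))   # expected (10, 10)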
def plot_reliability_diagram(y,x,bins=np.linspace(0,1,21),size_points=False, show_baseline=True, error_bars=True, error_bar_alpha=.05, show_histogram=False, c='red', **kwargs): """Plots a reliability diagram of predicted vs empirical probabilities. Parameters ---------- y: array-like, length (n_samples). The true outcome values as integers (0 or 1) x: The predicted probabilities, between 0 and 1 inclusize. bins: array-like, the endpoints of the bins used to aggregate and estimate the empirical probabilities. Default is 20 equally sized bins from 0 to 1, i.e. [0,0.05,0.1,...,.95, .1]. size_points: scale the size of the plotted points to reflect the number of data points in the bin. This may not work well if some bins are much larger than others. Default is False. show_baseline: whether or not to print a dotted black line representing y=x (perfect calibration). Default is True error_bars: whether to show error bars reflecting the confidence interval under the assumption that the input probabilities are perfectly calibrated. Default is True. error_bar_alpha: The alpha value to use for the error_bars. Default is .05 (a 95% CI). Confidence intervals are based on the exact binomial distribution, not the normal approximation. show_histogram: Whether or not to show a separate histogram of the number of values in each bin. Default is False c: color of the plotted points. Default is 'red'. **kwargs: additional args to be passed to the plt.scatter matplotlib call. Returns ------- A tuple of the x_values, y_values, and associated bin_counts for each of the points in the plot. """ digitized_x = np.digitize(x, bins) mean_count_array = np.array([[np.mean(y[digitized_x == i]), len(y[digitized_x == i]), np.mean(x[digitized_x==i])] for i in np.unique(digitized_x)]) x_pts_to_graph = mean_count_array[:,2] y_pts_to_graph = mean_count_array[:,0] pt_sizes = mean_count_array[:,1] if show_histogram: plt.subplot(1,2,1) if show_baseline: plt.plot(np.linspace(0,1,100),(np.linspace(0,1,100)),'k--') for i in range(len(y_pts_to_graph)): if size_points: plt.scatter(x_pts_to_graph, y_pts_to_graph, s=pt_sizes, c=c, **kwargs) else: plt.scatter(x_pts_to_graph,y_pts_to_graph, c=c, **kwargs) plt.axis([-0.1,1.1,-0.1,1.1]) plt.xlabel('Predicted') plt.ylabel('Empirical') if error_bars: yerr_mat = binom.interval(1-error_bar_alpha,pt_sizes,x_pts_to_graph)/pt_sizes - x_pts_to_graph yerr_mat[0,:] = -yerr_mat[0,:] plt.errorbar(x_pts_to_graph, x_pts_to_graph, yerr=yerr_mat, capsize=5) if show_histogram: plt.subplot(1,2,2) plt.hist(x,bins=bins) return(x_pts_to_graph,y_pts_to_graph,pt_sizes)
def plot_reliability_diagram(y, x, bins=np.linspace(0, 1, 21), show_baseline=True, error_bars=True, error_bar_alpha=.05, show_histogram=False, scaling='none', scaling_eps=.0001, scaling_base=10, c='red', **kwargs): """Plots a reliability diagram of predicted vs empirical probabilities. Parameters ---------- y: array-like, length (n_samples). The true outcome values as integers (0 or 1) x: The predicted probabilities, between 0 and 1 inclusive. bins: array-like, the endpoints of the bins used to aggregate and estimate the empirical probabilities. Default is 20 equally sized bins from 0 to 1, i.e. [0,0.05,0.1,...,.95, .1]. show_baseline: whether or not to print a dotted black line representing y=x (perfect calibration). Default is True error_bars: whether to show error bars reflecting the confidence interval under the assumption that the input probabilities are perfectly calibrated. Default is True. error_bar_alpha: The alpha value to use for the error_bars. Default is .05 (a 95% CI). Confidence intervals are based on the exact binomial distribution, not the normal approximation. show_histogram: Whether or not to show a separate histogram of the number of values in each bin. Default is False scaling: Default is 'none'. Alternative is 'logit' which is useful for better examination of calibration near 0 and 1. Values shown are on the scale provided and then tick marks are relabeled. scaling_eps: default is .0001. Ignored unless scaling='logit'. This indicates the smallest meaningful positive probability you want to consider. scaling_base: default is 10. Ignored unless scaling='logit'. This indicates the base used when scaling back and forth. Matters only in how it affects the automatic tick marks. c: color of the plotted points. Default is 'red'. **kwargs: additional args to be passed to the plt.scatter matplotlib call. Returns ------- A dictionary containing the x and y points plotted (unscaled) and the count in each bin. 
""" digitized_x = np.digitize(x, bins) mean_count_array = np.array([[ np.mean(y[digitized_x == i]), len(y[digitized_x == i]), np.mean(x[digitized_x == i]) ] for i in np.unique(digitized_x)]) x_pts_to_graph = mean_count_array[:, 2] y_pts_to_graph = mean_count_array[:, 0] bin_counts = mean_count_array[:, 1] if show_histogram: plt.subplot(1, 2, 1) if scaling == 'logit': x_pts_to_graph_scaled = my_logit(x_pts_to_graph, eps=scaling_eps, base=scaling_base) y_pts_to_graph_scaled = my_logit(y_pts_to_graph, eps=scaling_eps, base=scaling_base) prec_int = np.max([ -np.floor(np.min(x_pts_to_graph_scaled)), np.ceil(np.max(x_pts_to_graph_scaled)) ]) prec_int = np.max([prec_int, -np.floor(np.log10(scaling_eps))]) low_mark = -prec_int high_mark = prec_int if show_baseline: plt.plot([low_mark, high_mark], [low_mark, high_mark], 'k--') # for i in range(len(y_pts_to_graph)): plt.scatter(x_pts_to_graph_scaled, y_pts_to_graph_scaled, c=c, **kwargs) locs, labels = plt.xticks() labels = np.round(my_logistic(locs, base=scaling_base), decimals=4) plt.xticks(locs, labels) locs, labels = plt.yticks() labels = np.round(my_logistic(locs, base=scaling_base), decimals=4) plt.yticks(locs, labels) if error_bars: prob_range_mat = binom.interval(1 - error_bar_alpha, bin_counts, x_pts_to_graph) / bin_counts yerr_mat = ( my_logit(prob_range_mat, eps=scaling_eps, base=scaling_base) - my_logit(x_pts_to_graph, eps=scaling_eps, base=scaling_base)) yerr_mat[0, :] = -yerr_mat[0, :] plt.errorbar(x_pts_to_graph_scaled, x_pts_to_graph_scaled, yerr=yerr_mat, capsize=5) plt.axis( [low_mark - .1, high_mark + .1, low_mark - .1, high_mark + .1]) if scaling != 'logit': if show_baseline: plt.plot(np.linspace(0, 1, 100), (np.linspace(0, 1, 100)), 'k--') # for i in range(len(y_pts_to_graph)): plt.scatter(x_pts_to_graph, y_pts_to_graph, c=c, **kwargs) plt.axis([-0.1, 1.1, -0.1, 1.1]) if error_bars: yerr_mat = binom.interval( 1 - error_bar_alpha, bin_counts, x_pts_to_graph) / bin_counts - x_pts_to_graph yerr_mat[0, :] = -yerr_mat[0, :] plt.errorbar(x_pts_to_graph, x_pts_to_graph, yerr=yerr_mat, capsize=5) plt.xlabel('Predicted') plt.ylabel('Empirical') if show_histogram: plt.subplot(1, 2, 2) plt.hist(x, bins=bins) out_dict = {} out_dict['pred_probs'] = x_pts_to_graph out_dict['emp_probs'] = y_pts_to_graph out_dict['bin_counts'] = bin_counts return (out_dict)
def run_benchmark(args): def verbose_print(*a, **kw): if args.verbose: print(*a, **kw) diff_programs = get_diff_programs_for_args(args) all_diff_program_extra_fields = sorted( {k for p in diff_programs for k in p.get("extra_fields", {}).keys()}) print(f"{len(diff_programs)} diff programs") try: with (args.input_dir / "index.json").open("r", encoding="utf-8") as f: benchmark_input_index = json.load(f) except Exception: print('Failed to load benchmark inputs. Did you run "prepare"?') raise shuffled_generation_configs = benchmark_input_index[ "shuffled_generation_configs"] num_regens = benchmark_input_index["num_regens"] assert num_regens >= 1 test_combination_factors = [ len(shuffled_generation_configs), num_regens, len(diff_programs), # don't count number of repetitions, because it is possibly dynamic ] total_test_combinations = np.prod(test_combination_factors) print( f'{" * ".join(str(v) for v in test_combination_factors)} = {total_test_combinations} total test combinations' ) csv_output_file = open(args.output_csv, "w", newline="") csv_output_writer = CSVOutputWriter(csv_output_file) def get_extra_file_path(suffix): name = ".".join(args.output_csv.name.split(".")[:-1]) + suffix return args.output_csv.parent / name failed_file_path = get_extra_file_path("-FAILED.txt") failed_file = open(failed_file_path, "w") if args.auto_repetitions: num_repetitions = args.max_repetitions else: num_repetitions = args.min_repetitions if args.no_progress_bar: progress_bar = NoopProgressBar() else: progress_bar = tqdm(total=total_test_combinations, smoothing=0) if args.skip_estimated_timeouts: # record smallest input length with timeout per diff_program to skip larger test cases smallest_timeout = {} last_flush_time = time.monotonic() break_flag = False some_benchmarks_failed = False for _entry in shuffled_generation_configs: generation_config_i = _entry["i"] generation_config = _entry["config"] for regen_i in range(num_regens): verbose_print("generation_config", generation_config) test_case_dir = (args.input_dir / f"config-{generation_config_i}-regen-{regen_i}") for diff_program in diff_programs: diff_prog_full_name = (diff_program["name"] + "_" + str( diff_program.get("extra_fields", {}).get("mpi_procs", 1))) if (args.skip_estimated_timeouts and diff_prog_full_name in smallest_timeout and generation_config["length_1"] >= smallest_timeout[diff_prog_full_name]): some_benchmarks_failed = True print(diff_prog_full_name + "\t", file=failed_file, end="") print(generation_config, file=failed_file, end="") print( f"\t skipped due to estimated timeout, since length_1 {generation_config['length_1']} >= {smallest_timeout[diff_prog_full_name]}", file=failed_file, ) progress_bar.update() continue # sorted list of measurements micros_until_len_res = [] check_interval = min_repetitions_for_confidence - 1 for repetition_i in range(num_repetitions): if time.monotonic( ) - last_flush_time > flush_every_seconds: csv_output_file.flush() failed_file.flush() last_flush_time = time.monotonic() verbose_print(" diff_program", diff_program["name"]) extra_fields_for_output = { k: diff_program.get("extra_fields", {}).get(k, "") for k in all_diff_program_extra_fields } extra_fields_for_run = deepcopy( diff_program.get("extra_fields", {})) if (args.no_direct_mpi_procs_limit and "mpi_procs" in extra_fields_for_run): extra_fields_for_run["mpi_procs"] = None try: program_result = diff_program["run"]( test_case_dir / "in_1.txt", test_case_dir / "in_2.txt", extra_fields_for_run, ) verbose_print(" micros_until_len", 
program_result.micros_until_len) except KeyboardInterrupt: # exit the benchmark break_flag = True break except TimeoutExpired as te: some_benchmarks_failed = True print(diff_prog_full_name + "\t", file=failed_file, end="") print(generation_config, file=failed_file, end="") print("\t" + repr(te), file=failed_file) if args.auto_repetitions: timeout_micros = te.timeout * 1e6 # seconds to microseconds if (repetition_i >= 5 and micros_until_len_res[0] == timeout_micros): if args.skip_estimated_timeouts: smallest_timeout[ diff_prog_full_name] = generation_config[ "length_1"] break # if five iterations timed out -> assume all will timeout, don't try again micros_until_len_res.append(timeout_micros) else: if args.skip_estimated_timeouts: smallest_timeout[ diff_prog_full_name] = generation_config[ "length_1"] continue except Exception as e: # catch all some_benchmarks_failed = True print(diff_prog_full_name + "\t", file=failed_file, end="") print(generation_config, file=failed_file, end="") print("\t" + repr(e), file=failed_file) break # assumption: will always fail with these exceptions -> no need to run all repetitions output_data = { "generation_config_i": generation_config_i, **{ f"input_{k}": v for k, v in generation_config.items() }, "regen_i": regen_i, "repetition_i": repetition_i, "diff_program": diff_program["name"], **extra_fields_for_output, "mpi_comm_world": getattr(program_result, "mpi_comm_world", 1), "micros_input": program_result.micros_input, "micros_precompute": program_result.micros_precompute, "micros_until_len": program_result.micros_until_len, "micros_edit_script": program_result.micros_edit_script, "min_edit_length": program_result.min_edit_length, } csv_output_writer.write_row(output_data) if args.auto_repetitions: bisect.insort(micros_until_len_res, program_result.micros_until_len) if ( repetition_i >= args.min_repetitions and repetition_i % check_interval == 0 ) or repetition_i == num_repetitions - 1: # reached the last iteration # check if required confidence interval is reached if repetition_i % 2 == 0: # odd number of results current_median = micros_until_len_res[ repetition_i // 2] else: current_median = (micros_until_len_res[ (repetition_i - 1) // 2] + micros_until_len_res[ (repetition_i + 1) // 2]) / 2 # check about every 20 ms = 20'000 microseconds (overhead is about 1 ms) => max 5% overhead check_interval = math.ceil(20000 / current_median) lower_idx, upper_idx = binom.interval( confidence_level, repetition_i + 1, 0.5) # to get correct indices in python (Boudec paper Appendix A - 1) lower_idx -= 1 # sometimes the interval is a little bit wider than in the Boudec paper, but this just means more confidence if (micros_until_len_res[int(lower_idx)] >= (1 - args.max_median_error) * current_median and micros_until_len_res[int(upper_idx)] <= (1 + args.max_median_error) * current_median): break if repetition_i == num_repetitions - 1: # failed to reach required confidence some_benchmarks_failed = True print( diff_prog_full_name + "\t", file=failed_file, end="", ) print(generation_config, file=failed_file, end="") print( "\t" + f"Failed to reach required confidence after {num_repetitions} repetitions; " + f"current median: {current_median}, left end of CI: {micros_until_len_res[int(lower_idx)]}, right end of CI: {micros_until_len_res[int(upper_idx)]}", file=failed_file, ) progress_bar.update() if break_flag: break if break_flag: break if break_flag: break progress_bar.close() csv_output_file.close() failed_file.close() if not some_benchmarks_failed: failed_file_path.unlink()
def plot_sbc(theta_samples, theta_test, param_names, bins=25, dpi=300, figsize=(24, 12), interval=0.99, show=True, font_size=12): """ Plots the simulation-based posterior checking histograms as advocated by Talts et al. (2018). Parameters ---------- theta_samples: np.array Array of sampled parameters theta_test: np.array Array of test parameters param_names: list(str) List of parameter names for plotting. bins: int, default: 25 Bins for histogram plot dpi: int, default: 300 Dots per inch (dpi) for plot figsize: tuple(int, int), default: (24, 12) Figure size interval: float, default: 0.99 Interval to plot show: bool, default: True Controls whether the plot shall be printed font_size: int, default:12 Font size """ # Plot settings plt.rcParams['font.size'] = font_size N = int(theta_test.shape[0]) # Determine n_subplots dynamically n_row = int(np.ceil(len(param_names) / 6)) n_col = int(np.ceil(len(param_names) / n_row)) # Initialize figure f, axarr = plt.subplots(n_row, n_col, figsize=figsize) if n_row > 1: axarr = axarr.flat # Compute ranks (using broadcasting) ranks = np.sum(theta_samples < theta_test, axis=0) # Compute interval endpoints = binom.interval(interval, N, 1 / (bins + 1)) # Plot histograms for j in range(len(param_names)): # Add interval axarr[j].axhspan(endpoints[0], endpoints[1], facecolor='gray', alpha=0.3) axarr[j].axhline(np.mean(endpoints), color='gray', zorder=0, alpha=0.5) sns.histplot(ranks[:, j], kde=False, ax=axarr[j], color='#a34f4f', bins=bins, alpha=0.95) axarr[j].set_title(param_names[j]) axarr[j].spines['right'].set_visible(False) axarr[j].spines['top'].set_visible(False) if j == 0: axarr[j].set_xlabel('Rank statistic') axarr[j].get_yaxis().set_ticks([]) axarr[j].set_ylabel('') f.tight_layout() # Show, if specified if show: plt.show() return f
def get_bursty_tweet(time_window_tweets, expectation_features, train=True, print_stats=False): N = len(time_window_tweets) id_ret = set() # Tokenize all the tweets - reduce them to unigram features tokenized_tweets = [(tweet['id'], tokenize_tweet(tweet['full_text'])) for tweet in time_window_tweets] # Get a list feature - list of tweet_ids where the feature appear in bag_of_feature = {} for id_flist in tokenized_tweets: for feature in id_flist[1]: bag_of_feature[feature] = bag_of_feature.get(feature, set()) bag_of_feature[feature].add(id_flist[0]) #print(bag_of_feature.keys()) for feature in bag_of_feature: # Count feature, number of tweets it appear in and reporpotionate it n_feature_appear_in = len(bag_of_feature[feature]) #Probability f_j appears in the time windows P_o(n_{i,j}) Prob_f_window = n_feature_appear_in / N feature_info = expectation_features.get(feature, [0, 0]) expected = feature_info[1] windows_feature_appeared = feature_info[0] ra = math.floor(expected * N) #max of the distribution rb = binom.interval(0.999, N, expected)[1] #point where distribution appraches 0 q = (rb + ra) / 2 if (n_feature_appear_in >= q): id_ret = id_ret.union(bag_of_feature[feature]) if train: expected = ((expected * windows_feature_appeared) + Prob_f_window) / (windows_feature_appeared + 1) expectation_features[feature] = [ windows_feature_appeared + 1, expected ] if print_stats: print( "------------------------------------------------------------------------" ) print("Feature: ", feature) print("Window size: ", N) print("n_{i,j}: ", n_feature_appear_in) print("Probability in the time window of the feature: " + str("{0:.2f}".format(Prob_f_window))) print("Expectation of the feature: " + str("{0:.2f}".format(expected))) #plot_graph(N, expected,int(N*n_feature_appear_in/N_tweets_window), Prob_f_window) print( "------------------------------------------------------------------------" ) return id_ret
def calcPercentile():
    data_binom = binom.rvs(n=120, p=1 / 120, size=20000)
    CI = binom.interval(0.95, 120, 1 / 120)
    print(CI)
    plt.hist(data_binom)
    plt.show()
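# A small sketch that checks the empirical coverage of the 95% interval used above:
# at least ~95% of simulated draws should land inside it (discreteness makes the
# interval conservative, so the observed coverage is typically a bit higher).
import numpy as np
from scipy.stats import binom

draws = binom.rvs(n=120, p=1 / 120, size=20000)
lo, hi = binom.interval(0.95, 120, 1 / 120)
coverage = np.mean((draws >= lo) & (draws <= hi))
print(lo, hi, coverage)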
def qqplot(data, labels, n_quantiles=100, alpha=0.95, error_type='theoretical', distribution='binomial', log10conv=True, color=['k', 'r', 'b'], fill_dens=[0.1, 0.1, 0.1], type='uniform', title='title'): ''' Function for plotting Quantile Quantile (QQ) plots with confidence interval (CI) :param data: NumPy 1D array with data :param labels: :param type: type of the plot :param n_quantiles: number of quntiles to plot :param alpha: confidence interval :param distribution: beta/normal/binomial -- type of the error estimation. Most common in the literature is 'beta'. :param log10conv: conversion to -log10(p) for the figure :return: nothing ''' xmax = 0 ymax = 0 np.seterr(divide='ignore') if type == 'uniform': # we expect distribution from 0 to 1 for j in range(len(data)): # define quantiles positions: q_pos = np.concatenate([ np.arange(99.) / len(data[j]), np.logspace(-np.log10(len(data[j])) + 2, 0, n_quantiles) ]) # define quantiles in data q_data = mquantiles(data[j], prob=q_pos, alphap=0, betap=1, limit=(0, 1)) # linear interpolation # define theoretical predictions q_th = q_pos.copy() # evaluate errors q_err = np.zeros([len(q_pos), 2]) if np.sum(alpha) > 0: for i in range(0, len(q_pos)): if distribution == 'beta': q_err[i, :] = beta.interval( alpha, len(data[j]) * q_pos[i], len(data[j]) - len(data[j]) * q_pos[i]) elif distribution == 'binomial': q_err[i, :] = binom.interval(alpha=alpha, n=len(data[j]), p=q_pos[i]) elif distribution == 'normal': q_err[i, :] = norm.interval( alpha, len(data[j]) * q_pos[i], np.sqrt(len(data[j]) * q_pos[i] * (1. - q_pos[i]))) else: print('Distribution is not defined!') q_err[i, q_err[i, :] < 0] = 1e-15 if (distribution == 'binomial') | (distribution == 'normal'): q_err /= 1.0 * len(data[j]) for i in range(0, 100): q_err[i, :] += 1e-15 # print(q_err[100:, :]) slope, intercept, r_value, p_value, std_err = linregress( q_th, q_data) # print(labels[j], ' -- Slope: ', slope, " R-squared:", r_value**2) plt.plot(-np.log10(q_th[n_quantiles - 1:]), -np.log10(q_data[n_quantiles - 1:]), '-', color=color[j]) plt.plot(-np.log10(q_th[:n_quantiles]), -np.log10(q_data[:n_quantiles]), '.', color=color[j], label=labels[j]) xmax = np.max([xmax, -np.log10(q_th[1])]) ymax = np.max([ymax, -np.log10(q_data[0])]) # print(- np.log10(q_th[:])) if np.sum(alpha) > 0: if error_type == 'experimental': plt.fill_between(-np.log10(q_th), -np.log10(q_data / q_th * q_err[:, 0]), -np.log10(q_data / q_th * q_err[:, 1]), color=color[j], alpha=fill_dens[j], label='%1.3f CI' % alpha) if np.sum(alpha) > 0: if error_type == 'theoretical': plt.fill_between(-np.log10(q_th), -np.log10(q_err[:, 0]), -np.log10(q_err[:, 1]), color=color[j], alpha=fill_dens[j], label='%1.3f CI' % alpha) plt.legend(loc=4) plt.xlabel('Theoretical -log10(p)') plt.ylabel('Experimental -log10(p)') plt.plot([0, 100], [0, 100], '--k') # print(xmax,ymax) plt.xlim([0, np.ceil(xmax)]) plt.ylim([0, np.ceil(ymax * 1.05)]) plt.title(title) plt.tight_layout() np.seterr(divide='warn')
def model_dict(self, round_state, hole_card): # CHANGE!!! Added 'hole_card' self.this_round_length = 0 action_histories = round_state['action_histories'] for x in self.opponent_model.keys(): self.opponent_model[x]['fold_this_round'] = 0 self.opponent_model[x]['raise_this_round'] = 0 self.opponent_model[x]['call_this_round'] = 0 for game_round in action_histories.keys(): for x in range(len(action_histories[str(game_round)])): if len(action_histories[str(game_round)]) != 0: self.this_round_length += 1 # CHANGE !!! Simple method for adding priors. Probably not the best option. self.opponent_model[action_histories[str( game_round)][x]['uuid']][ 'fold'] = 2.16 # or 4.32 (or any multiple of 2.16) self.opponent_model[action_histories[str( game_round)][x]['uuid']][ 'call'] = .42 # or .84 (or any multiple of .42) self.opponent_model[action_histories[str( game_round)][x]['uuid']][ 'raise'] = .42 # or .84 (or any multiple of .42) if action_histories[str(game_round)][x]['action'] == 'FOLD': self.opponent_model[action_histories[str(game_round)][x] ['uuid']]['fold'] += 1 self.opponent_model[action_histories[str(game_round)][x] ['uuid']]['fold_this_round'] += 1 elif action_histories[str(game_round)][x]['action'] == 'CALL': self.opponent_model[action_histories[str(game_round)][x] ['uuid']]['call'] += 1 self.opponent_model[action_histories[str(game_round)][x] ['uuid']]['call_this_round'] += 1 elif action_histories[str(game_round)][x]['action'] == 'RAISE': self.opponent_model[action_histories[str(game_round)][x] ['uuid']]['raise'] += 1 self.opponent_model[action_histories[str(game_round)][x] ['uuid']]['raise_this_round'] += 1 amt = action_histories[str(game_round)][x]['amount'] self.opponent_model[action_histories[str( game_round)][x]['uuid']]['raise_shares'] += ( amt / self.opponent_model[action_histories[str( game_round)][x]['uuid']]['stack']) output = {} probability_list = [] output['probability_list'] = probability_list for x in self.opponent_model.keys(): try: n = (self.opponent_model[x]['fold'] + self.opponent_model[x]['raise'] + self.opponent_model[x]['call']) fold_freq = self.opponent_model[x]['fold'] / ( self.opponent_model[x]['fold'] + self.opponent_model[x]['raise'] + self.opponent_model[x]['call']) # p_hat conf_int = binom.interval(.05, n, fold_freq) print(conf_int) self.opponent_model[x][ 'fold_freq'] = self.opponent_model[x]['fold'] / ( self.opponent_model[x]['fold'] + # CHANGE!!! 
self.opponent_model[x]['raise'] + self.opponent_model[x]['call']) except: self.opponent_model[x]['fold_freq'] = .72 try: self.opponent_model[x]['raises:calls'] = self.opponent_model[ x]['raise'] / self.opponent_model[x]['call'] except: self.opponent_model[x]['raises:calls'] = 1 community_card = round_state['community_card'] win_rate = estimate_hole_card_win_rate( nb_simulation=NB_SIMULATION, nb_player=self.nb_player, hole_card=gen_cards(hole_card), community_card=gen_cards(community_card)) self.opponent_model[x]['probability'] = (1 - win_rate) / self.nb_player if self.opponent_model[x]['raise_this_round'] > 0: self.opponent_model[x]['probability'] += ( 1 - self.opponent_model[x]['aggressiveness'] * self.aggressiveness_raise_prob_factor ) * self.opponent_model[x]['probability'] if self.opponent_model[x]['call_this_round'] > 0: self.opponent_model[x]['probability'] += ((1 - self.opponent_model[x]['frequency']) * \ self.opponent_model[x][ 'probability']) * self.frequency_call_factor if self.opponent_model[x]['fold_this_round'] > 0: self.opponent_model[x]['probability'] = 0 # I'm skeptical that opponent model dictionary is tracking properly. I think it only tracks while we're playing a hand. # print(self.opponent_model[x]) if self.opponent_model[x]['name'] != self.name: output['probability_list'].append( self.opponent_model[x]['probability']) else: output['stack'] = self.opponent_model[x]['stack'] return (output)
def Plot_the_distribution(GSCs_data, Non_GCSs_data, bins, genome_len, path_out): #Parameters ticks1 = [ 0, 500000, 1000000, 1500000, 2000000, 2500000, 3000000, 3500000, 4000000, 4500000 ] xticknames1 = [ '', '500', '1000', '1500', '2000', '2500', '3000', '3500', '4000', '4500' ] colors = [ '#7FCE79', '#BAE85C', '#ff878b', '#8991ff', '#ac5eff', '#50b3ff', '#ffd75e' ] plot_names = [ 'plot1', 'plot2', 'plot3', 'plot4', 'plot5', 'plot6', 'plot7' ] Y_labels = [ 'Cfx GCSs', 'RifCfx GCSs', 'Micro GCSs', 'Oxo GCSs', 'Score', 'GC%', 'Transcription\nlevel' ] yter = 1592477 yori = 3711828 #GCSs data plotting. fig, plot_names = plt.subplots(7, 1, figsize=(11, 15), dpi=100) i = 0 Histo_comp_dict = { } #Will contain computed histogramm data (bins and values) for key, value in GSCs_data.items(): plot_names[i].set_xlim(0, genome_len) plot_names[i].set_xticks(ticks1, minor=False) plot_names[i].set_xticks([yter, yori], minor=True) plot_names[i].set_xticklabels(xticknames1) plt.setp(plot_names[i].set_xticklabels(xticknames1), rotation=0, fontsize=14) conf_interval = [ binom.interval(0.999, len(value), 1 / 10)[0], binom.interval(0.999, len(value), 1 / 10)[1] ] plot_names[i].set_yticks(conf_interval, minor=True) plot_names[i].yaxis.grid(True, which='minor', linewidth=0.4, linestyle='--', color='black') plot_names[i].fill_between(bins, conf_interval[0], conf_interval[1], facecolor='grey', alpha=0.3) plot_names[i].locator_params(axis='y', nbins=6) plot_names[i].tick_params(axis='x', which='major', labelsize=19) Histo_comp_dict[key] = plot_names[i].hist( value, bins, facecolor=colors[i], alpha=0.7, linewidth=1, edgecolor='black' ) #Plot histo and save computed histogramm data (bins and values) plot_names[i].tick_params(axis='y', which='major', pad=7, labelsize=15) plot_names[i].set_ylabel(Y_labels[i], size=22, labelpad=8, rotation=90) i += 1 #Score, GC, Transcription plotting. bin_width = int(bins[1]) position = bins[:-1] print(len(position)) for key, value in Non_GCSs_data.items(): len(value) plot_names[i].set_xlim(0, genome_len) if key == "GC": plot_names[i].set_ylim(45, max(value) + 2) elif key == "Score": plot_names[i].set_ylim(min(value) - 0.2, -1.5) plot_names[i].set_xticks(ticks1, minor=False) plot_names[i].set_xticks([yter, yori], minor=True) plot_names[i].set_xticklabels(xticknames1) plt.setp(plot_names[i].set_xticklabels(xticknames1), rotation=0, fontsize=14) plot_names[i].tick_params(axis='x', which='major', labelsize=19) plot_names[i].locator_params(axis='y', nbins=6) plot_names[i].bar(position, value, bin_width, color=colors[i], linewidth=1, edgecolor='black', align='edge') plot_names[i].tick_params(axis='y', which='major', pad=7, labelsize=15) plot_names[i].set_ylabel(Y_labels[i], size=22, labelpad=8, rotation=90) i += 1 plt.tight_layout() fig.savefig(path_out + "GCSs_num_score_GC133_transcription_distrib_thr_genome.png", figsize=(11, 15), dpi=400) #GCSs data plotting for Cfx, Micro, and Oxo only. 
GSCs_data_main = { 'Cfx': GSCs_data['Cfx'], 'Micro': GSCs_data['Micro'], 'Oxo': GSCs_data['Oxo'] } Y_labels_main = ['Cfx GCSs', 'Micro GCSs', 'Oxo GCSs'] colors_main = ['#7FCE79', '#ff878b', '#8991ff'] fig, plot_names = plt.subplots(3, 1, figsize=(11, 7), dpi=100) i = 0 for key, value in GSCs_data_main.items(): plot_names[i].set_xlim(0, genome_len) plot_names[i].set_xticks(ticks1, minor=False) plot_names[i].set_xticks([yter, yori], minor=True) plot_names[i].set_xticklabels(xticknames1) plt.setp(plot_names[i].set_xticklabels(xticknames1), rotation=0, fontsize=14) conf_interval = [ binom.interval(0.999, len(value), 1 / 10)[0], binom.interval(0.999, len(value), 1 / 10)[1] ] plot_names[i].set_yticks(conf_interval, minor=True) plot_names[i].yaxis.grid(True, which='minor', linewidth=0.4, linestyle='--', color='black') plot_names[i].fill_between(bins, conf_interval[0], conf_interval[1], facecolor='grey', alpha=0.3) plot_names[i].locator_params(axis='y', nbins=6) plot_names[i].tick_params(axis='x', which='major', labelsize=19) plot_names[i].hist( value, bins, facecolor=colors_main[i], alpha=0.7, linewidth=1, edgecolor='black' ) #Plot histo and save computed histogramm data (bins and values) plot_names[i].tick_params(axis='y', which='major', pad=7, labelsize=15) plot_names[i].set_ylabel(Y_labels_main[i], size=22, labelpad=8, rotation=90) i += 1 plt.tight_layout() fig.savefig(path_out + "GCSs_number_Cfx_Micro_Oxo_distrib_thr_genome.png", figsize=(11, 7), dpi=400) return Histo_comp_dict