def bayesian_log_normal_with_0_test(A_data, B_data, m0, k0, s_sq0, v0, mean_lift=0):
    # modeling zero vs. non-zero
    non_zeros_A = sum(A_data > 0)
    total_A = len(A_data)
    non_zeros_B = sum(B_data > 0)
    total_B = len(B_data)
    alpha = 1  # uniform prior
    beta = 1
    n_samples = 10000  # number of samples to draw
    A_conv_samps = beta_dist(non_zeros_A + alpha, total_A - non_zeros_A + beta, n_samples)
    B_conv_samps = beta_dist(non_zeros_B + alpha, total_B - non_zeros_B + beta, n_samples)

    # modeling the non-zeros with a log-normal
    A_non_zero_data = A_data[A_data > 0]
    B_non_zero_data = B_data[B_data > 0]
    A_order_samps = draw_log_normal_means(A_non_zero_data, m0, k0, s_sq0, v0)
    B_order_samps = draw_log_normal_means(B_non_zero_data, m0, k0, s_sq0, v0)

    # combining the two
    A_rps_samps = A_conv_samps * A_order_samps
    B_rps_samps = B_conv_samps * B_order_samps

    # the result
    print(mean(A_rps_samps > B_rps_samps))

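# draw_log_normal_means is referenced above but not defined in any of these
# snippets. A minimal sketch of what it could look like, assuming a
# normal-inverse-chi-squared prior on the mean and variance of log(data)
# (hyperparameters m0, k0, s_sq0, v0) and numpy available as np:
import numpy as np

def draw_log_normal_means(data, m0, k0, s_sq0, v0, n_samples=10000):
    """Sketch (not the original helper): posterior draws of a log-normal mean."""
    log_data = np.log(data)
    n = len(log_data)
    y_bar = np.mean(log_data)
    s_sq = np.var(log_data, ddof=1) if n > 1 else 0.0
    # conjugate posterior hyperparameters
    kN = k0 + n
    mN = (k0 * m0 + n * y_bar) / kN
    vN = v0 + n
    vN_s_sqN = v0 * s_sq0 + (n - 1) * s_sq + k0 * n * (y_bar - m0) ** 2 / kN
    # sigma^2 | data ~ scaled inverse chi-squared(vN, s_sqN)
    sig_sq_samps = vN_s_sqN / np.random.chisquare(vN, n_samples)
    # mu | sigma^2, data ~ Normal(mN, sigma^2 / kN)
    mu_samps = np.random.normal(mN, np.sqrt(sig_sq_samps / kN))
    # mean of a log-normal distribution with parameters (mu, sigma^2)
    return np.exp(mu_samps + sig_sq_samps / 2.0)
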
def calc_beta_prob(df_grouped, num_samples=100000):
    # perform bayesian test to simulate future data distributions
    clicks_new = df_grouped[df_grouped['optimum'] == 1]['clicked']
    view_new = df_grouped[df_grouped['optimum'] == 1]['counter']
    clicks_old = df_grouped[df_grouped['optimum'] == 0]['clicked']
    view_old = df_grouped[df_grouped['optimum'] == 0]['counter']
    new_samples = beta_dist(1 + clicks_new, 1 + view_new - clicks_new, num_samples)
    old_samples = beta_dist(1 + clicks_old, 1 + view_old - clicks_old, num_samples)
    return np.mean(new_samples - old_samples > .015)

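# Made-up usage example (not from the original code): calc_beta_prob expects
# one row per variant with 'optimum', 'clicked', and 'counter' columns, and
# assumes beta_dist and numpy (np) are imported as in the script later in
# this listing.
import pandas as pd

df_grouped = pd.DataFrame({
    'optimum': [1, 0],          # 1 = new variant, 0 = old variant
    'clicked': [540, 480],      # illustrative click counts
    'counter': [10000, 10000],  # illustrative impression counts
})
print(calc_beta_prob(df_grouped))  # P(new rate beats old rate by > 1.5 points)
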
def calculate_statistics(pool, events, samples_to_draw):
    # the sample-size multiplier and priors are kept as named variables so they
    # can be scaled or swapped for different priors, even though they look like
    # arbitrary abstractions here
    c = 1      # used to vary sample size by a scalar multiplier
    alpha = 1  # 30  # prior
    beta = 1   # 70  # prior
    views = pool * c
    clicks = events * c
    return beta_dist(clicks + alpha, views - clicks + beta, samples_to_draw)

def sample(self, data=None, n=1):
    """Return n samples from distribution"""
    if data is None:
        data = self.data
    successes = count_nonzero(data)
    total = len(data)
    samples = beta_dist(self.alpha + successes, self.beta + total - successes, n)
    return samples

def p_donate_ci(self, a=5, alpha=1, beta=1):
    """ Returns a (100 - a)% credible interval for the donation rate """
    ones = self.counts[1:]
    zeros = self.counts[0]
    dist = beta_dist(ones + alpha, zeros + beta, 10000)
    lower_bound = np.percentile(dist, a / 2.0)
    upper_bound = np.percentile(dist, 100 - a / 2.0)
    mean = np.mean(dist)
    return (lower_bound, self.p_donate, upper_bound)

def calculate_clickthrough_prob(clicks_A, views_A, clicks_B, views_B):
    '''
    INPUT: INT, INT, INT, INT
    OUTPUT: FLOAT

    Calculate and return an estimated probability that SiteA performs
    better (has a higher click-through rate) than SiteB.

    Hint: Use Bayesian A/B Testing (multi-armed-bandit repo)
    '''
    samp = 10000
    Aa = clicks_A + 1
    Ab = clicks_B + 1
    Ba = views_A - clicks_A + 1
    Bb = views_B - clicks_B + 1
    A_prob = beta_dist(Aa, Ba, samp)
    B_prob = beta_dist(Ab, Bb, samp)
    return np.sum(A_prob > B_prob) / float(samp)

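# Made-up usage example (counts are illustrative, not from the original repo);
# assumes beta_dist and numpy (np) are imported as elsewhere in this listing.
p_A_better = calculate_clickthrough_prob(clicks_A=310, views_A=10000,
                                         clicks_B=260, views_B=10000)
print(p_A_better)  # posterior probability that SiteA's CTR exceeds SiteB's
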
def _sampled_based(vuln_function, gmf_set, epsilon_provider, asset):
    """Compute the set of loss ratios when at least one CV
    (Coefficient of Variation) defined in the vulnerability function
    is greater than zero.

    :param vuln_function: the vulnerability function used to
        compute the loss ratios.
    :type vuln_function:
        :py:class:`openquake.shapes.VulnerabilityFunction`
    :param gmf_set: ground motion fields used to compute the loss ratios
    :type gmf_set: :py:class:`dict` with the following keys:
        **IMLs** - tuple of ground motion fields (float)
        **TimeSpan** - time span parameter (float)
        **TSES** - time representative of the Stochastic Event Set (float)
    :param epsilon_provider: service used to get the epsilon when
        using the sampled based algorithm.
    :type epsilon_provider: object that defines an :py:meth:`epsilon` method
    :param asset: the asset used to compute the loss ratios.
    :type asset: an :py:class:`openquake.db.model.ExposureData` instance
    """
    loss_ratios = []

    for ground_motion_field in gmf_set["IMLs"]:
        if ground_motion_field < vuln_function.imls[0]:
            loss_ratios.append(0.0)
        else:
            if ground_motion_field > vuln_function.imls[-1]:
                ground_motion_field = vuln_function.imls[-1]

            mean_ratio = vuln_function.loss_ratio_for(ground_motion_field)
            cov = vuln_function.cov_for(ground_motion_field)

            if vuln_function.is_beta:
                stddev = cov * mean_ratio
                alpha = compute_alpha(mean_ratio, stddev)
                beta = compute_beta(mean_ratio, stddev)
                loss_ratios.append(beta_dist(alpha, beta, size=None))
            else:
                variance = (mean_ratio * cov) ** 2.0
                epsilon = epsilon_provider.epsilon(asset)
                sigma = math.sqrt(
                    math.log((variance / mean_ratio ** 2.0) + 1.0))
                mu = math.log(mean_ratio ** 2.0 / math.sqrt(
                    variance + mean_ratio ** 2.0))
                loss_ratios.append(math.exp(mu + (epsilon * sigma)))

    return array(loss_ratios)

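# compute_alpha and compute_beta are not shown in these snippets. A sketch of
# what they plausibly do, fitting a Beta distribution by the method of moments
# from the mean loss ratio and its standard deviation (the actual openquake
# helpers may differ):
def compute_alpha(mean_loss_ratio, stddev):
    # method-of-moments fit: alpha = mean * (mean*(1-mean)/var - 1)
    return mean_loss_ratio * (
        mean_loss_ratio * (1.0 - mean_loss_ratio) / stddev ** 2.0 - 1.0)

def compute_beta(mean_loss_ratio, stddev):
    # method-of-moments fit: beta = (1-mean) * (mean*(1-mean)/var - 1)
    return (1.0 - mean_loss_ratio) * (
        mean_loss_ratio * (1.0 - mean_loss_ratio) / stddev ** 2.0 - 1.0)
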
def __init__(self, alpha: float, beta: float, labels_1: Set[str], labels_2: Set[str],
             equalities: Mapping, threshold: float = 0.):
    """
    :param alpha: alpha value for beta distribution
    :param beta: beta value for beta distribution
    :param labels_1: source space
    :param labels_2: target space
    :param equalities: labels that are considered equal (1 - beta)
    :param threshold: every similarity below is considered to be zero
    """
    s = ((a, b, beta_dist(alpha, beta)) for a, b in product(labels_1, labels_2))
    s = ((a, b, 1 - w if (a, b) in equalities else w) for a, b, w in s)
    s = ((a, b, w if w > threshold else 0.) for a, b, w in s)
    super().__init__(s)

def sampleSuccessRateForBinomialDataAndBetaPriori(data_n, data_k, alpha=1, beta=1, samples=10000):
    return beta_dist(data_k + alpha, data_n - data_k + beta, samples)

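# Made-up usage example (numbers are illustrative): posterior draws of a
# success rate after 120 successes in 2000 trials, with the default
# uniform Beta(1, 1) prior.
rate_samples = sampleSuccessRateForBinomialDataAndBetaPriori(2000, 120)
print(rate_samples.mean())
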
from numpy.random import beta as beta_dist
from numpy import percentile
import numpy as np

N_samp = 10000000  # number of samples to draw
c = 1  # used to vary sample size by a scalar multiplier

## INSERT YOUR OWN DATA HERE
clicks_A = (44) * c
views_A = (9610) * c
clicks_B = (426) * c
views_B = (83617) * c

alpha = 1  # 30  # prior
beta = 1   # 70  # prior

A_samples = beta_dist(clicks_A + alpha, views_A - clicks_A + beta, N_samp)
B_samples = beta_dist(clicks_B + alpha, views_B - clicks_B + beta, N_samp)

# confidence intervals: e.g. 2.5 = 95, 10 = 80
print([
    round(np.percentile((B_samples - A_samples) / B_samples, 2.5), 4),
    round(np.percentile((B_samples - A_samples) / B_samples, 97.5), 4)
])
print([
    round(np.percentile((B_samples - A_samples) / B_samples, 10), 4),
    round(np.percentile((B_samples - A_samples) / B_samples, 90), 4)
])
# percent lift needed
# base lift 1

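# Not in the original script, but the same posterior draws also give a direct
# probability of superiority:
print(np.mean(B_samples > A_samples))  # P(variant B's rate exceeds A's)
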
# fragment: c, c_conversions, c_visits and draw_mus are defined earlier in the
# original script
v_conversions = 1799 * c
v_visits = 207434 * c
# lift_perc = .03

c_mean = 80.03
c_var = 503950.69
v_mean = 78.64
v_var = 493547.51

N_samp = 75000

clicks_A = c_conversions
views_A = c_visits
clicks_B = v_conversions
views_B = v_visits

alpha = 1
beta = 1

A_conv_samps = beta_dist(clicks_A + alpha, views_A - clicks_A + beta, N_samp)
B_conv_samps = beta_dist(clicks_B + alpha, views_B - clicks_B + beta, N_samp)

A_order_samps = draw_mus(c_conversions, c_mean, c_var, 0, 1, 1, 1, N_samp)
B_order_samps = draw_mus(v_conversions, v_mean, v_var, 0, 1, 1, 1, N_samp)

A_rps_samps = A_conv_samps * A_order_samps
B_rps_samps = B_conv_samps * B_order_samps

# set current winner
if (mean(A_rps_samps) >= mean(B_rps_samps)):
    Current_Winner_rps_samps = A_rps_samps
    Current_Loser_rps_samps = B_rps_samps
    current_winner_str = "CHOOSE CONTROL"
else:
    Current_Winner_rps_samps = B_rps_samps

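# A follow-up step that is not part of the fragment above: with A_rps_samps
# and B_rps_samps already drawn, the probability that the variant beats the
# control on revenue per session is
print((B_rps_samps > A_rps_samps).mean())
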
def get_samples(success, population, alpha, beta, sample_size):
    return beta_dist(success + alpha, population - success + beta, sample_size)

A_log_norm_data = lognormal(mean=4.20, sigma=1.0, size=100)
B_log_norm_data = lognormal(mean=4.00, sigma=1.0, size=100)

# appending many many zeros
A_data = concatenate([A_log_norm_data, zeros((10000))])
B_data = concatenate([B_log_norm_data, zeros((10000))])

# modeling zero vs. non-zero
non_zeros_A = sum(A_data > 0)
total_A = len(A_data)
non_zeros_B = sum(B_data > 0)
total_B = len(B_data)

alpha = 1  # uniform prior
beta = 1
n_samples = 100000  # number of samples to draw
A_conv_samps = beta_dist(non_zeros_A + alpha, total_A - non_zeros_A + beta, n_samples)
B_conv_samps = beta_dist(non_zeros_B + alpha, total_B - non_zeros_B + beta, n_samples)

# modeling the non-zeros with a log-normal
A_non_zero_data = A_data[A_data > 0]
B_non_zero_data = B_data[B_data > 0]

m0 = 4.
k0 = 1.
s_sq0 = 1.
v0 = 1.
A_order_samps = draw_log_normal_means(A_non_zero_data, m0, k0, s_sq0, v0, n_samples)
B_order_samps = draw_log_normal_means(B_non_zero_data, m0, k0, s_sq0, v0, n_samples)

# combining the two
A_rps_samps = A_conv_samps * A_order_samps
B_rps_samps = B_conv_samps * B_order_samps

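# The original excerpt stops at the combining step; mirroring the function at
# the top of this listing, the result would be the posterior probability that
# A beats B on revenue per session:
print((A_rps_samps > B_rps_samps).mean())
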
def init_user(self, graph):
    dist = beta_dist(self.alpha, self.beta, len(graph._workers._nodes))
    for i, (worker, _, _, _, _) in enumerate(graph.workers()):
        worker.p = dist[i]
