def PrintData(self):
    """Print the accumulated run statistics to standard output.

    The report mirrors the one written by ``SaveData``: round counts, average
    rewards, success / collision rates (each followed by a binomial standard
    error), travelled distance, search-tree sizes, timings and bounds.

    Relies on the module-level names ``args``, ``stats``, ``binom`` and
    ``numpy``.  Every ``print`` call passes a single argument, so the output
    is the same under Python 2 and Python 3.

    Raises:
        ZeroDivisionError: if ``num_finishedrounds``, ``total_step`` or
            ``num_rounds`` is zero.
    """
    banner = '==============================================Statistics=============================================='
    finished = float(self.num_finishedrounds)
    steps = float(self.total_step)
    rounds = float(self.num_rounds)
    distance = sum(self.dis_trav)

    def with_binom_err(count, trials):
        # "<rate> <err>": err is the binomial standard deviation for `trials`
        # draws at the observed rate, scaled back to a proportion.
        trials_f = float(trials)
        rate = float(count) / trials_f
        return '%s %s' % (rate, binom.std(trials, rate, loc=0) / trials_f)

    print(banner)
    print(args)
    print(banner)
    print('#Rounds:')
    print(self.num_rounds)
    print('#Finished rounds:')
    print(self.num_finishedrounds)
    print('Ave reward: discountred / non-discounted:')
    print('%.3f (%.3f)/ %.3f (%.3f)' % (
        sum(self.total_reward) / finished,
        stats.sem(self.total_reward, axis=None, ddof=0),
        sum(self.total_nondis_reward) / finished,
        stats.sem(self.total_nondis_reward, axis=None, ddof=0)))
    print('Success %:')
    print(with_binom_err(self.success_count, self.num_finishedrounds))
    print('collision % per round')
    print(with_binom_err(self.collision_count, self.num_finishedrounds))
    print('collision % per step')
    print(with_binom_err(self.collision_count, self.total_step))
    print('collision % per meter:')
    if distance > 0:
        # Guarded: the rate divides by the total distance travelled.
        print(with_binom_err(self.collision_count, distance))
    print('Ave distance travelled per round:')
    print('%.3f %s' % (numpy.mean(numpy.array(self.dis_trav)),
                       stats.sem(self.dis_trav)))
    print('smoothness:')
    print('%s %s' % (float(sum(self.dec_count)) / self.num_finishedrounds,
                     stats.sem(self.dec_count)))
    print('Total steps per round:')
    print(float(self.total_step) / rounds)
    print('Min / Max num of trials:')
    print(str(min(self.num_trial)) + '/' + str(max(self.num_trial))
          + ' (' + str(numpy.std(numpy.array(self.num_trial), ddof=1)) + ')')
    print('Default move count: ')
    # Convert before dividing: the previous float(a / b) floored the ratio
    # for integer counters under Python 2.
    print(float(self.default_count) / rounds)
    print('Ave tree nodes / Ave expanded nodes / Ave policy sizes: ')
    print('%.3f %.3f / %.3f / %.3f ' % (
        float(sum(self.tree_nodes)) / steps,
        stats.sem(self.tree_nodes, axis=None, ddof=0),
        float(self.expansion_count) / steps,
        float(self.policy_size) / steps))
    print('Max expanded nodes: ')
    print('%d' % (max(self.expanded_nodes)))
    print('Ave expansion time: ')
    print('%.3f' % (self.expansion_time / steps))
    print('Ave total time: ')
    print('%.3f' % (self.total_time / steps))
    print('Initial bounds: ')
    print('( %.3f , %.3f )' % (self.init_lb / steps, self.init_ub / steps))
    print('Final bounds: ')
    print('( %.3f , %.3f )' % (self.final_lb / steps, self.final_ub / steps))
def gen(n, p, name):
    """Generate fixture data and write it to a JSON file.

    # Arguments

    * `n`: array of trial counts
    * `p`: array of success probabilities
    * `name::str`: output filename, resolved relative to `DIR`

    The file holds a single object with the keys `n`, `p` and `expected`,
    where `expected` is the binomial standard deviation of each `(n, p)`
    pair.

    # Examples

    ``` python
    python> n = np.round( rand(1000) * 100.0 )
    python> p = rand(1000)
    python> gen(n, p, './data.json')
    ```
    """
    # Walk both arrays in lockstep and evaluate one pair at a time:
    expected = [binom.std(trials, prob) for trials, prob in np.nditer([n, p])]

    fixture = {"n": n.tolist(), "p": p.tolist(), "expected": expected}

    # Resolve the destination against the script directory and dump as JSON:
    target = os.path.join(DIR, name)
    with open(target, "w") as handle:
        json.dump(fixture, handle)
def std(self):
    """
    Standard deviation of the distribution.

    Evaluated with ``binom.std`` from the instance's private trial count
    and success probability.

    Returns:
    --------
    std : float
    """
    trials, prob = self.__n, self.__p
    return binom.std(trials, prob)
def check_row(row, prob):
    """Return truncated binomial means and standard deviations for one row.

    ``row`` is a dataframe row given as a list; its second element is the
    number of trials.  For every probability in ``prob`` the mean ``n * p``
    and ``binom.std(n, p)`` are computed and truncated with ``int``.

    Returns a ``(mean, sdev)`` pair of lists, ordered like ``prob``.
    """
    trials = row[1]
    # Single pass so that a one-shot iterable for ``prob`` still works.
    pairs = [(int(trials * p), int(binom.std(trials, p))) for p in prob]
    mean = [m for m, _ in pairs]
    sdev = [s for _, s in pairs]
    return mean, sdev
def demo13():
    """Print mean, variance and standard deviation of B(100, 0.25) and plot its pmf.

    The three moments are printed one per line, then a bar chart of the
    probability mass function over 0..100 is shown.
    """
    trials, p_success = 100, 0.25
    support = np.array(range(0, trials + 1))
    masses = np.array([binom.pmf(k, trials, p_success) for k in support])

    for moment in (binom.mean, binom.var, binom.std):
        print(moment(trials, p_success))

    plt.xlabel('x')
    plt.ylabel('Possibility')
    plt.bar(support, masses)
    plt.show()
def do_statistics(histogram, latex=False):
    """Print sample statistics of ``histogram`` next to binomial expectations.

    The sample is obtained from ``gen_data(histogram)``.  Its mean, standard
    deviation, variance and kurtosis are compared with those of a binomial
    distribution with ``TEXT_SIZE * 8`` trials and probability 0.5, and the
    relative error of each is printed.  Median and skewness are printed
    without an error column.

    When ``latex`` is true, a LaTeX table header and row with the same
    figures are printed as well, followed by the enumerated histogram.
    """
    def rel_err(expected, observed):
        return abs(expected - observed) / expected

    data = gen_data(histogram)

    trials = TEXT_SIZE * 8
    emean, evar, _, ekurt = binom.stats(trials, 0.5, moments='mvsk')
    estd = binom.std(trials, 0.5)

    mean = s.mean(data)
    median = s.median(data)
    # Sample (not population) estimators: only part of the population is observed.
    stdev = s.stdev(data)
    variance = s.variance(data)
    sk = skew(data)
    ku = kurtosis(data)

    report = [
        "Media:\t\t{}\t\t\t(Err) {}".format(mean, rel_err(emean, mean)),
        "Mediana:\t{}".format(median),
        "Desv. Std.:\t{}\t(Err) {}".format(stdev, rel_err(estd, stdev)),
        "Varianza.:\t{}\t(Err) {}".format(variance, rel_err(evar, variance)),
        "Simetría:\t{}".format(sk),
        "Kurtosis:\t{}\t(Err) {}".format(ku, rel_err(ekurt, ku)),
    ]
    for line in report:
        print(line)

    if not latex:
        return

    headers = [
        "Muestra", "Media", "Mediana", "Desviación", "Variance", "Asimetría",
        "Curtosis"
    ]
    values = [STUDY_SIZE, mean, median, stdev, variance, sk, ku]

    print("\n\nLaTeX\n")
    print(" &".join("{:>11}".format(h) for h in headers), "\\\\ \\hline")
    print(" &".join("{:>11.2f}".format(v) for v in values), "\\\\")
    print("\n\n")
    print(list(enumerate(histogram)))
    print("\n\n")
def _append_binom_stats(self, df, eff):
    '''
    Add binomial metrics to df

    Adds three columns to ``df`` (in place) and returns it:

    * ``binom_mean``     -- ``binom.mean(df['nocut'], eff)``
    * ``binom_stddev``   -- ``binom.std(df['nocut'], eff)``
    * ``cut_binom_pval`` -- p-value of a binomial test of ``cut`` successes
      in ``nocut`` trials against success probability ``eff``

    ``df`` must provide the columns ``cut`` and ``nocut``.
    '''
    # ``scipy.stats.binom_test`` was deprecated and later removed in favour of
    # ``binomtest``; prefer the new API and fall back for older SciPy.  The
    # import is local so the block does not depend on the file header.
    try:
        from scipy.stats import binomtest
    except ImportError:
        from scipy.stats import binom_test

        def pvalue(successes, trials):
            return binom_test(successes, trials, eff)
    else:
        def pvalue(successes, trials):
            # Row-wise ``apply`` upcasts the counts to float once the float
            # columns below exist; ``binomtest`` only accepts true integers.
            return binomtest(int(successes), int(trials), eff).pvalue

    df['binom_mean'] = binom.mean(df['nocut'], eff)
    df['binom_stddev'] = binom.std(df['nocut'], eff)
    # NOTE(review): looks like leftover debug output -- kept so stdout is unchanged.
    print(df['binom_stddev'])

    # NOTE: this is the two-sided test (the SciPy default); the original
    # author questioned whether a two-sided test is the right choice here.
    df['cut_binom_pval'] = df.apply(
        lambda row: pvalue(row['cut'], row['nocut']),
        axis=1,
    )
    return df
def find_statistical_saboteurs(
    groups_data, pvalue_threshold=0.1, effect_threshold=0, max_significant_members=10
):
    """Return statistics on possible bad elements in the data.

    Parameters
    ----------

    groups_data
      Result of ``csv_to_groups_data()``. Each value is read through the
      keys ``"members"``, ``"attempts"`` and ``"failures"``. The input is
      deep-copied, so the caller's object is not modified.

    pvalue_threshold
      Only failure-associated elements with a p-value below this threshold
      will be included in the final statistics. Also passed as ``alpha`` to
      ``SelectFpr``.

    effect_threshold
      Members whose coefficient in the regression fitted on the significant
      members only is below this value are dropped from the result.

    max_significant_members
      Not used by the current code (only referenced in a commented-out line).

    Returns
    -------

    A dict with the keys ``"groups_data"``, ``"conserved_members"``,
    ``"varying_members"`` and ``"significant_members"``. When at least one
    significant member is found it also contains ``"f1_score"``, each
    significant member carries an ``"effect"`` entry, and each group gains
    ``"failure_rate"`` and ``"deviation"`` entries.
    """
    groups_data = deepcopy(groups_data)
    # NOTE(review): only ``twins`` is used below; ``almost_tweens`` and
    # ``has_twins`` are unpacked but never read.
    twins, almost_tweens, has_twins = _find_twins(groups_data)
    members_sets = [set(group["members"]) for group in groups_data.values()]
    all_members = set().union(*members_sets)
    # Members present in every group cannot discriminate between groups.
    conserved_members = members_sets[0].intersection(*members_sets)
    members_with_twins = set().union(*twins.values())
    # Candidates: not conserved and not listed among the twins' values.
    varying_members = sorted(
        all_members.difference(conserved_members).difference(members_with_twins)
    )

    # Build the data
    def build_data_and_observed(selected_members, by_group=False):
        # Returns (data, observed) as arrays. Each row of ``data`` is the
        # boolean membership vector of a group over ``selected_members``.
        #   by_group=False: the row is repeated once per attempt and
        #       ``observed`` holds 0 for each success and 1 for each failure.
        #   by_group=True: one row per group and ``observed`` holds the
        #       group's failure fraction.
        data = []
        observed = []
        for group_name, group_data in groups_data.items():
            attempts = int(group_data["attempts"])
            failures = int(group_data["failures"])
            vector = [[(mb in group_data["members"]) for mb in selected_members]]
            if by_group:
                data += vector
                observed.append(1.0 * failures / attempts)
            else:
                data += attempts * vector
                observed += (attempts - failures) * [0] + failures * [1]
        return np.array(data), np.array(observed)

    # Ridge regression (RidgeCV, not a LASSO): the sign of each coefficient
    # gives the positive / negative impact of a member on failure.
    data, observed = build_data_and_observed(varying_members)
    regression = linear_model.RidgeCV()
    regression.fit(data, observed)

    # ANOVA analysis (for p-values)
    selector = SelectFpr(f_classif, alpha=pvalue_threshold)
    selector.fit(data, observed)

    # select the most interesting parts: p-value below the threshold and a
    # positive coefficient, ordered by ascending p-value.
    data_ = zip(selector.pvalues_, regression.coef_, varying_members)
    significant_members = OrderedDict(
        [
            (name, {"pvalue": pvalue, "twins": twins.get(name, [])})
            for pvalue, coef, name in sorted(data_)
            if (pvalue < pvalue_threshold) and (coef > 0)
        ]
    )
    # Nothing significant: return early, without "f1_score" and without the
    # per-group "failure_rate" / "deviation" entries.
    if len(significant_members) == 0:
        return {
            "groups_data": groups_data,
            "conserved_members": conserved_members,
            "varying_members": varying_members,
            "significant_members": significant_members,
        }

    # Ridge regression again, on the significant parts only; its coefficients
    # are stored as each member's "effect".
    data, observed = build_data_and_observed(significant_members)
    regression.fit(data, observed)
    zipped = zip(regression.coef_, significant_members.items())
    for coef, (name, data_) in zipped:
        data_["effect"] = coef
    # Iterate over a copy of the keys because entries are removed in the loop.
    for member in list(significant_members.keys()):
        if significant_members[member]["effect"] < effect_threshold:
            significant_members.pop(member)
    # print (significant_members)
    # significant_members = significant_members[:max_significant_members]

    # Build a classifier to compute an F1 score (evaluated on the data it was
    # trained on).
    # NOTE(review): ``data`` was built before the low-effect members were
    # popped above, so it still has columns for them -- confirm intended.
    classifier = linear_model.LogisticRegressionCV(penalty="l2")
    classifier.fit(data, observed)
    f1_score = metrics.f1_score(observed, classifier.predict(data))

    # Find constructs which are less explained by the parts:
    data, observed = build_data_and_observed(significant_members, by_group=True)
    regression.fit(data, observed)
    predictions = regression.predict(data)
    zipped = zip(groups_data.values(), observed, predictions)
    # Clamp the baseline failure rate to [0.1, 0.9] before using it as the
    # binomial probability.
    intercept = min(0.9, max(0.1, regression.intercept_))
    for group_data, obs, pred in zipped:
        # Binomial standard deviation expressed as a proportion of attempts.
        # NOTE(review): "attempts" is used here without the int() conversion
        # applied in build_data_and_observed -- confirm it is numeric.
        std = binom.std(group_data["attempts"], intercept) / group_data["attempts"]
        group_data["failure_rate"] = obs
        # Residual of the group's failure rate in units of that std.
        group_data["deviation"] = np.round((obs - pred) / std, decimals=1)
    return {
        "groups_data": groups_data,
        "conserved_members": conserved_members,
        "varying_members": varying_members,
        "significant_members": significant_members,
        "f1_score": f1_score,
    }
# blood? (Moore, David S. The Basic Practice of Statistics. 4th
# ed. New York: W. H. Freeman, 2007, p. 329, example 13.4.)
# P(X = 2) for X ~ Binomial(n = 5, p = 0.25).
p1Prob = binom.pmf(2, 5, 0.25)

# Problem 2: A music distributor inspects an SRS of 10 CDs from a
# shipment of 10,000 music CDs. Suppose that (unknown to the
# distributor) 10% of the CDs in the shipment have defective
# copy-protection schemes that will harm personal computers. The
# number X of CDs with defective copy protection has approximately the
# binomial distribution with n = 10 and p = 0.1. What is the
# probability that the sample contains no more than 1 defective CD?
# (Moore, David S. The Basic Practice of Statistics. 4th ed. New York:
# W. H. Freeman, 2007, pp. 327-28 and 330-331, examples 13.3 and
# 13.5.)
# P(X <= 1), mean and standard deviation for X ~ Binomial(n = 10, p = 0.1).
p2Prob = binom.cdf(1, 10, 0.1)
p2Mean = binom.mean(10, 0.1)
p2StdDev = binom.std(10, 0.1)


def main():
    """Print the answers to both problems, rounded to four decimals."""
    # Single-argument print(...) behaves identically on Python 2 and 3;
    # the former print statements were a syntax error on Python 3.
    print("Problem 1: probability %.4f\n" % p1Prob)
    print("Problem 2: probability %.4f" % p2Prob)
    print(" mean %.4f" % p2Mean)
    print(" standard dev. %.4f" % p2StdDev)


if __name__ == "__main__":
    main()
# probability 0.25 of having blood type O. If these parents have 5
# children, what is the probability that exactly 2 of them have type O
# blood? (Moore, David S. The Basic Practice of Statistics. 4th
# ed. New York: W. H. Freeman, 2007, p. 329, example 13.4.)
# P(X = 2) for X ~ Binomial(n = 5, p = 0.25).
p1Prob = binom.pmf(2, 5, 0.25)

# Problem 2: A music distributor inspects an SRS of 10 CDs from a
# shipment of 10,000 music CDs. Suppose that (unknown to the
# distributor) 10% of the CDs in the shipment have defective
# copy-protection schemes that will harm personal computers. The
# number X of CDs with defective copy protection has approximately the
# binomial distribution with n = 10 and p = 0.1. What is the
# probability that the sample contains no more than 1 defective CD?
# (Moore, David S. The Basic Practice of Statistics. 4th ed. New York:
# W. H. Freeman, 2007, pp. 327-28 and 330-331, examples 13.3 and
# 13.5.)
# P(X <= 1), mean and standard deviation for X ~ Binomial(n = 10, p = 0.1).
p2Prob = binom.cdf(1, 10, 0.1)
p2Mean = binom.mean(10, 0.1)
p2StdDev = binom.std(10, 0.1)


def main():
    """Print the answers to both problems, rounded to four decimals."""
    # Single-argument print(...) behaves identically on Python 2 and 3;
    # the former print statements were a syntax error on Python 3.
    print("Problem 1: probability %.4f\n" % p1Prob)
    print("Problem 2: probability %.4f" % p2Prob)
    print(" mean %.4f" % p2Mean)
    print(" standard dev. %.4f" % p2StdDev)


if __name__ == "__main__":
    main()
a)At least 3 successes b)At most 3 successes c)Exactly 3 failures ''' print("Assignment 1") print("At least 3 successes:", 1 - binom.cdf(k=2, n=5, p=0.3)) print("At most 3 successes:", binom.cdf(k=3, n=5, p=0.3)) print("Excatly 3 failures", binom.pmf(k=5 - 3, n=5, p=0.3)) print("\n") ''' 2.If on an average one vessel in every ten is wrecked, find the probability that out of five vessels expected to arrive, four at least will arrive safely ''' print("Assignment 2") print("Probabilty of atleast 4/5 arrive safely", binom.pmf(k=4, n=5, p=9 / 10) + binom.pmf(k=5, n=5, p=9 / 10)) print("\n") ''' 3.Five coins are tossed 3,200 times. a)Find the Frequencies of the distribution of heads and tabulate the results b)Calculate the mean number of success and standard deviations ''' print("Assignment 3") print("Frequency Distribution of Heads") for k in np.arange(5 + 1): print("Frequency of {} Heads is {}"\ .format(k,3_200*binom.pmf(k=k,n=5,p=1/2))) print("Mean of Success", binom.mean(n=5, p=1 / 2)) print("Standard Deviation of Success", binom.std(n=5, p=1 / 2))
def std(self, dist):
    """Return the binomial standard deviation for ``dist``.

    The positional arguments for ``binom.std`` are whatever
    ``self._get_params(dist)`` returns.
    """
    params = self._get_params(dist)
    return binom.std(*params)
# # I.e. error in the probability scales proportionally with $\displaystyle \propto \frac{1}{\sqrt{N}}$. # Calculately the error in the two-step model cannot be done analytically. # However, if we take a random sub-sample (for example of only 1000 visitor) we can derive a predicted conversion rate from the model. # # Repeating this a large number times for different sub-samples we can can measure the scatter in the model prediciton. This should represent the upper-bounds on the model error. # # We can also calculate the error on the visitor value the same way (for both the statistical and two-step model methods). # + #for sample sizes ranging between 1 and 100000 n_sample = np.logspace(0,5,11).astype(int) #theoretical error from binomial distribution conv_samp_err_theo = binom.std(n_sample, conversion_rate_true) / n_sample.astype(float) #empty arrays for storing results conv_samp_err = [] conv_pred_err = [] value_samp_err = [] value_pred_err = [] random_state = np.random.RandomState(0) for n in n_sample: #for smaller sample sizes perform more Monte Carlo iterations n_mc = int(np.clip(1e5 / n, 100, 10000)) conv_samp_mc = [] conv_pred_mc = [] value_samp_mc = []
[1, 1, 1, 0], [1, 1, 0, 1], [1, 0, 1, 1], [0, 1, 1, 1]])
# Zero-initialised integer arrays: a 60 x 16 count table and 60 deviation counters.
# NOTE(review): presumably one row per sequence and one column per 4-bit
# pattern -- confirm against the code that fills them.
pattern4Cnt = np.zeros((60, 16), int)
pattern4Deviations = np.zeros(60, int)

# Standard deviation of binomial distribution
# NOTE(review): n_k = 201 - k looks like the number of length-k windows in a
# 200-element sequence and p_k = 1/2**k like the chance of one specific
# k-bit pattern -- confirm.
n1 = 200
n2 = 199
n3 = 198
n4 = 197
p1 = 1/2
p2 = 1/4
p3 = 1/8
p4 = 1/16
std1 = binom.std(n1, p1)
std2 = binom.std(n2, p2)
std3 = binom.std(n3, p3)
std4 = binom.std(n4, p4)

# Count number of patterns
# A row sum further than nStd standard deviations from mean1 counts as a deviation.
nStd = 2
for i in range(np.size(D, 0)) :
    if sums[i] < mean1 - nStd*std1 or sums[i] > mean1 + nStd*std1 :
        sumsDeviations[i] += 1
    # Compare every adjacent pair (D[i,j], D[i,j+1]) with each row of pattern2.
    for j in range(np.size(D, 1)-1) :
        for k in range(np.size(pattern2, 0)) :
            if D[i,j] == pattern2[k,0] and D[i,j+1] == pattern2[k,1] :
import numpy as np
from scipy.stats import binom

# Binomial distribution B(n, p): summary statistics, plus the pmf/cdf at k
# and the quantile function at q.
n, k = 8, 4
p = 0.5
q = 1 - p

expect = binom.expect(args=(n, p))
mean = binom.mean(n, p)
var = binom.var(n, p)
sigma = binom.std(n, p)
mode = np.floor(p * (n + 1))

pmf = binom.pmf(k, n, p)
cdf = binom.cdf(k, n, p)
ppf = binom.ppf(q, n, p)

# One "label value" line per statistic, in the order computed above.
for _label, _value in (
    ('expected value = ', expect),
    ('mean = ', mean),
    ('variance = ', var),
    ('std. dev. = ', sigma),
    ('mode = ', mode),
    ('pmf = ', pmf),
    ('cdf = ', cdf),
    ('ppf = ', ppf),
):
    print(_label, _value)
def SaveData(self, args, rand, folder):
    """Append the accumulated run statistics to a text record.

    The report is appended to ``<folder>/Search_record<rand>.txt``.

    Args:
        args: string describing the run; written between two banner lines.
        rand: value converted with ``str`` and embedded in the file name.
        folder: directory (as a string) that receives the record file.

    Relies on the module-level names ``stats`` and ``binom``.  The whole
    report is assembled before the file is opened, so a failing computation
    does not leave a partially written record behind.

    Raises:
        ZeroDivisionError: if ``num_finishedrounds``, ``total_step`` or
            ``num_rounds`` is zero.
    """
    banner = '==============================================Statistics=============================================='
    finished = float(self.num_finishedrounds)
    steps = float(self.total_step)
    rounds = float(self.num_rounds)
    distance = sum(self.dis_trav)
    dec_total = sum(self.dec_count)

    def with_binom_err(count, trials):
        # "<rate> <err>": err is the binomial standard deviation for `trials`
        # draws at the observed rate, scaled back to a proportion.
        trials_f = float(trials)
        rate = float(count) / trials_f
        return '%s %s' % (rate, binom.std(trials, rate, loc=0) / trials_f)

    lines = [
        banner,
        args,
        banner,
        '#Rounds:',
        str(self.num_rounds),
        '#Finished rounds:',
        str(self.num_finishedrounds),
        'Ave reward: discountred / non-discounted:',
        '%.3f (%.3f) / %.3f (%.3f)' % (
            sum(self.total_reward) / finished,
            stats.sem(self.total_reward, axis=None, ddof=0),
            sum(self.total_nondis_reward) / finished,
            stats.sem(self.total_nondis_reward, axis=None, ddof=0)),
        'Success %:',
        with_binom_err(self.success_count, self.num_finishedrounds),
        'collision % per round',
        with_binom_err(self.collision_count, self.num_finishedrounds),
        'collision % per step',
        with_binom_err(self.collision_count, self.total_step),
        'collision % per meter:',
    ]
    if distance > 0:
        # Guarded: the rate divides by the total distance travelled.
        lines.append(with_binom_err(self.collision_count, distance))

    lines.append('Ave distance travelled per round:')
    # float() first: integer distances used to be floor-divided on Python 2.
    lines.append('%.3f %s' % (float(distance) / finished,
                              stats.sem(self.dis_trav)))
    lines.append('smoothness:')
    if dec_total > 0:
        # Guarded: distance per unit of dec_count divides by the dec_count sum.
        lines.append(str(float(distance) / float(dec_total)))

    lines += [
        'dec count:',
        '%s %s' % (float(dec_total) / self.num_finishedrounds,
                   stats.sem(self.dec_count)),
        'Total steps per round:',
        str(float(self.total_step) / rounds),
        'Max search depth per step:',
        str(float(self.total_search_depth) / steps),
        'Min / Max num of trials:',
        str(min(self.num_trial)) + '/' + str(max(self.num_trial)),
        'Default move count: ',
        # Convert before dividing: the previous float(a / b) floored the
        # ratio for integer counters under Python 2.
        str(float(self.default_count) / rounds),
        'Ave tree nodes / Ave expanded nodes / Ave policy sizes: ',
        '%.3f / %.3f / %.3f ' % (
            float(sum(self.tree_nodes)) / steps,
            float(self.expansion_count) / steps,
            float(self.policy_size) / steps),
        'Max expanded nodes: ',
        '%d' % (max(self.expanded_nodes)),
        'Ave expansion time: ',
        '%.3f' % (self.expansion_time / steps),
        'Ave total time: ',
        '%.3f' % (self.total_time / steps),
        'Initial bounds: ',
        '( %.3f , %.3f )' % (self.init_lb / steps, self.init_ub / steps),
        'Final bounds: ',
        '( %.3f , %.3f )' % (self.final_lb / steps, self.final_ub / steps),
    ]

    filename = folder + '/Search_record' + str(rand) + '.txt'
    with open(filename, "a") as output:
        output.write('\n'.join(lines) + '\n')
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import binom

np.random.seed(1)

# Binomial Distribution: mean, variance and standard deviation of B(n, p)
n = 10
p = 0.3
mean, var = binom.stats(n, p, loc=0, moments='mv')
std = binom.std(n, p)

# Probability mass function over the whole support 0..n
x = np.arange(0, n + 1)
pmf = binom(n, p).pmf(x)

fig, ax = plt.subplots(1, 1)
ax.plot(x, pmf, 'b-', lw=3, alpha=0.6, label='bnom')

# Quartiles of the distribution
q1 = binom.ppf(.25, n, p)
median = binom.ppf(.5, n, p)
q3 = binom.ppf(.75, n, p)

# The backslashes of the TeX commands are escaped explicitly: '\m' and '\s'
# are invalid escape sequences in a normal string literal.  The resulting
# title text is unchanged ('\n' is still a real line break).
plt.title(
    'Binomial Distribution \n($\\mu$: {:.2f}, $\\sigma$: {:.2f}, $\\sigma^2$: {:.2f})'
    .format(mean, std, var),
    size='xx-large')
plt.xlabel('X', size='large')
plt.ylabel('P(X)', size='large')
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import binom, skew

# Seed NumPy's global random number generator.
np.random.seed(5)

# Binomial Distribution Parameters
n = 10
p = 0.2

# Descriptive Stats for Original Probability Distribution
pop_mean, pop_var = binom.stats(n, p, loc=0, moments='mv')
pop_std = binom.std(n, p)
print('Population Mean: ', pop_mean)
print('Population Variance: ', pop_var)

# Support 0..n of the distribution and its probability mass function
# (no random sampling happens here yet)
x = np.arange(0, n + 1)
pmf = binom(n, p).pmf(x)

# Sample statistics
mean_of_means = []
variances = []
skews = []
samp_sizes = [5, 10, 50, 100]
# For every sample size, draw 10000 samples of that many binomial variates.
for i, samp_size in enumerate(samp_sizes, start=1):
    means = []
    for sample in range(10000):
        pts = binom.rvs(n, p, size=samp_size)
df = pd.DataFrame(z, columns = ['x', 'y'])
fig = px.line(df, x = 'x', y = ['y'])
fig.show()

#### z is the 0.995 quantile of the standard normal (mean = 0, std = 1), i.e.
#### the critical value of a two-tailed 99% confidence interval;
#### z1 (0.975 quantile) is the two-tailed 95% counterpart
z = norm.ppf(0.995, loc=0, scale=1)
sterror = np.sqrt((p*(1-p)/n))
z1 = norm.ppf(0.975, loc=0, scale=1)
print(f'Z score: {z}')
#### the z score is multiplied with the Standard Error to obtain the Margin of Error
moe = z * sterror
print(f'Margin of Error: {moe}')
print(f'Lower interval: {(p-moe)*n}, {p-moe}')
print(f'Upper interval: {(moe+p)*n}, {p+moe}')
np_up = norm.ppf(0.995, loc = binom.mean(n,p), scale = binom.std(n,p))
np_down = norm.ppf(0.005, loc = binom.mean(n,p), scale = binom.std(n,p))
print(f'norm.ppf CI calculation: {np_down}, {np_up}')
#### directly gives you where the value lies at both the higher/lower end of the normal distribution
#### using normal distribution because we pass the checks for using this
#### the confidence level is passed positionally: the keyword was renamed from
#### `alpha` to `confidence` in SciPy, and the positional form works with both
print(f'binom.interval CI calculation: {stats.binom.interval(0.99, n, p)}')
#### calculates the
# %%