def MakeFigures(pool, firsts, others):
    """Creates several figures for the book.

    Draws the birth-weight PMFs and then the birth-weight CDFs of the
    two groups through myplot, using blue for the first group and orange
    for the second.

    Args:
        pool: accepted for interface compatibility; not read here.
        firsts: object exposing weight_pmf and weight_cdf.
        others: object exposing weight_pmf and weight_cdf.
    """
    colors = ('blue', 'orange')

    # plot PMFs of birth weights for first babies and others
    bar_options = [dict(linewidth=0, color=color) for color in colors]
    myplot.Hists(
        [firsts.weight_pmf, others.weight_pmf],
        root='nsfg_birthwgt_pmf',
        bar_options=bar_options,
        title='Birth weight PMF',
        xlabel='weight (ounces)',
        ylabel='probability',
    )

    # plot CDFs of birth weights for first babies and others
    plot_options = [dict(linewidth=2, color=color) for color in colors]
    myplot.Cdfs(
        [firsts.weight_cdf, others.weight_cdf],
        root='nsfg_birthwgt_cdf',
        plot_options=plot_options,
        title='Birth weight CDF',
        xlabel='weight (ounces)',
        ylabel='probability',
        axis=[0, 200, 0, 1],
    )
def testGeneratePrevalence(self):
    """Checks the hypothesis probabilities after updating with 'ab'.

    Builds a MetaHypo from a prior over 1..3 and the sample 'ab',
    updates it with each taxon, and asserts that the hypothesis with
    k == 2 has probability 0.4 and the one with k == 3 has 0.6.

    The remainder of the loop body is an optional visual check that is
    skipped while the local flag `plot` is False.
    """
    sample = 'ab'
    prior = Pmf.MakePmfFromList(range(1, 4))
    meta = rarefaction.MetaHypo(prior, sample)
    for taxon in sample:
        meta.Update(taxon)

    k = 3
    taxon = 'other'
    iters = 1000
    plot = False  # set to True to compare the two CDFs by eye

    hypos = meta.GetHypos()
    for hypo, prob in hypos.Items():
        # Plain if/elif instead of a conditional expression used as a
        # statement; assertAlmostEqual replaces the deprecated alias.
        if hypo.k == 2:
            self.assertAlmostEqual(prob, 0.4)
        elif hypo.k == 3:
            self.assertAlmostEqual(prob, 0.6)

        if hypo.k != k or not plot:
            continue

        # check the distribution of generated prevalences
        ps = [hypo.GeneratePrevalence().Prob(taxon) for _ in xrange(iters)]
        cdf = Cdf.MakeCdfFromList(ps)

        # compare to what the distribution is supposed to be
        dist = hypo.Get(taxon)
        ps2 = [dist.Random() for _ in xrange(iters)]
        cdf2 = Cdf.MakeCdfFromList(ps2)

        myplot.Cdfs([cdf, cdf2], show=True)
def main():
    """Runs the simulations for Exercise 5.6 and Exercise 5.7.

    Exercise 5.6: for n = 1..5 prints the mean and standard deviation of
    365 get_bread(n) draws, then compares 365 integer-truncated
    get_bread(4) draws against a normal sample with the same mean and
    standard deviation, both as CDFs and as histograms.

    Exercise 5.7: draws 99999 male and female heights from normal
    distributions and prints the percentage of pairs in which the woman
    is taller.
    """
    # Exercise 5.6
    sample_size = 365
    for n in range(1, 6):
        distribution = [get_bread(n) for _ in range(sample_size)]
        print(np.mean(distribution), np.std(distribution))

    found_distribution = [int(get_bread(4)) for _ in range(sample_size)]
    found_mu = np.mean(found_distribution)
    found_sigma = np.std(found_distribution)
    print('picking 4 loaves: mu={} sigma={}'.format(found_mu, found_sigma))

    expected_distribution = np.random.normal(found_mu, found_sigma,
                                             sample_size)
    myplot.Cdfs(cdfs=(Cdf.MakeCdfFromList(found_distribution),
                      Cdf.MakeCdfFromList(expected_distribution)))
    myplot.show()

    # `normed` was removed from Matplotlib's hist(); `density` is the
    # replacement with the same meaning.
    plt.hist(found_distribution, density=True, bins=20, label='found')
    plt.hist(expected_distribution, density=True, bins=20, alpha=.75,
             label='expected')
    plt.ylabel('Probability')
    plt.xlabel('Bread Weight')
    plt.legend()
    plt.show()

    # Exercise 5.7
    # np.random.normal takes a standard deviation as `scale`.
    # NOTE(review): 59.4 and 52.8 are treated here as variances (cm^2),
    # since a 59 cm standard deviation of height is implausible --
    # confirm against the exercise statement.
    men_heights = np.random.normal(loc=178, scale=np.sqrt(59.4), size=99999)
    women_heights = np.random.normal(loc=163, scale=np.sqrt(52.8),
                                     size=99999)
    woman_taller = [w > m for m, w in zip(men_heights, women_heights)]
    print('In {}% of the pairs the woman will be taller than the man'.format(
        sum(woman_taller) / len(woman_taller) * 100.0))
def PlotDiffs(groups, low, high, root): """Plots the CDF of diffs for each group. Args: low, high: range of diffs to include root: string filename root """ diff_list = [] for gender, res in groups.iteritems(): diffs = ComputeDiffs(res, low=low, high=high) diff_list.append((gender, diffs)) print 'PlotDiffs', gender, len(diffs) cdfs = [] for name, diffs in diff_list: cdf = Cdf.MakeCdfFromList(diffs, name=name) cdfs.append(cdf) options = [dict(linewidth=2) for cdf in cdfs] myplot.Cdfs(cdfs, xlabel='time - qualifying time (min)', ylabel='P(difference < x)', plot_options=options, root=root)
def plot_accident_cdfs(): """Plots CDF of accident counts for the control and treatment areas. Before and after the date CAWS was deployed (known to be November 1996). """ cdfs = [] for label in ['control', 'treatment']: print label filename = label + '_data.csv' col_dict = process_merged_file(filename) # November 15, 1996 before, after = split_col_dict(col_dict, 1780) print 'before' cdf = accident_cdf(before, 'accidents') cdf.name = label + ' before' cdfs.append(cdf) print 'after' cdf = accident_cdf(after, 'accidents') cdf.name = label + ' after' cdfs.append(cdf) myplot.Cdfs(cdfs, root='caws.poisson', transform='exponential', title='CCDF of Accident Counts', xlabel='Number of accidents', ylabel='Complementary CDF')
def PlotCdfs(): """Plots distribution of ability for different number of factors. After 100000 people: n max value 50 0.333842852938 10 0.6483317765470 5 0.837633976492 1 0.983619459771 """ cdfs = [] for n in [50, 10, 5, 1]: pmf, data = WorldRecord(m=10000, n=n) cdf = Cdf.MakeCdfFromPmf(pmf, name='n=%d' % n) print n, max(cdf.Values()) cdfs.append(cdf) options = dict(linewidth=2) plot_options = [options] * len(cdfs) myplot.Cdfs(cdfs, root='world_record_cdfs', plot_options=plot_options, title='Distribution of potential', xlabel='potential', ylabel='CDF')
def Main():
    """Runs the updates and plots posterior means split by true label.

    Photos whose code appears in the truth data are split into those
    labelled '1' and the rest; the CDFs of their means are shown.
    """
    truth_map = dict((pcode, label) for pcode, label in ReadTruth())

    labels = ReadLabels()
    photo_map, labeler_map = MakeObjects(labels)
    RunUpdates(photo_map, labeler_map, labels)

    yes = []
    no = []
    for pcode, photo in photo_map.iteritems():
        if pcode not in truth_map:
            continue
        mean = photo.Mean()
        bucket = yes if truth_map[pcode] == '1' else no
        bucket.append(mean)

    myplot.Clf()
    cdf_yes = thinkbayes.MakeCdfFromList(yes, name='yes')
    cdf_no = thinkbayes.MakeCdfFromList(no, name='no')
    myplot.Cdfs([cdf_yes, cdf_no])
    myplot.Show()
    return

    # NOTE(review): everything below is unreachable because of the
    # return above; kept as in the original.
    myplot.Clf()
    PlotPosteriorMeans(photo_map, 'photos')
    PlotPosteriorMeans(labeler_map, 'labelers')
    myplot.Show()
def MakeFigure(): frac1 = 0.8 frac2 = 1 - frac1 xs, ys = RenderPdf(1170, 179) pmf1 = Pmf.MakePmfFromDict(dict(zip(xs, ys)), name='blue') xs, ys = RenderPdf(995, 167) pmf2 = Pmf.MakePmfFromDict(dict(zip(xs, ys)), name='green') myplot.Pmfs( [pmf1, pmf2], root='normal1', xlabel='CLA score', ylabel='PDF', ) pmf1.Normalize(frac1) pmf2.Normalize(frac2) ymax = max(pmf1.MaxLike(), pmf2.MaxLike()) ymax = 0.003 pyplot.clf() threshes = [1200, 1300, 1400, 1500, 1570] for thresh in threshes: myplot.Plot([thresh, thresh], [0, ymax], clf=False, line_options=dict(color='gray', alpha=0.5, linewidth=1)) plot_options = [ dict(color='blue', linewidth=2), dict(color='green', linewidth=2) ] myplot.Pmfs( [pmf1, pmf2], plot_options=plot_options, clf=False, root='normal2', xlabel='CLA score', ylabel='PDF', ) cdf1 = Cdf.MakeCdfFromPmf(pmf1) cdf2 = Cdf.MakeCdfFromPmf(pmf2) for thresh in threshes: p1 = frac1 * (1 - cdf1.Prob(thresh)) p2 = frac2 * (1 - cdf2.Prob(thresh)) den = p1 + p2 rep1 = p1 / den rep2 = p2 / den print thresh, den, rep1, rep2 return myplot.Cdfs([cdf1, cdf2], root='normal2', xlabel='', ylabel='', title='')
def Resample(cdf, n=10000):
    """Plots cdf next to a CDF built from n values sampled from it.

    Args:
        cdf: object providing Sample(n)
        n: number of values to draw
    """
    resampled = Cdf.MakeCdfFromList(cdf.Sample(n), 'resampled')
    myplot.Cdfs([cdf, resampled],
                root='resample_cdf',
                title='CDF',
                xlabel='weight in oz',
                ylabel='CDF(x)')
def main(): all_recs = cyb_records.Stats() all_recs.ReadRecords() print 'Number of total stats', len(all_recs.records) cdf = CdfPerDay(all_recs.records) myplot.Cdfs(cdf) myplot.Show(title="CDF: daily usage of machines at the YMCA", xlabel = 'Distance (in m / day)', ylabel = 'Percentile')
def MakeFigures(pool, firsts, others):
    """Creates several figures for the book.

    Saves, in order: the pooled age CDF, the pooled weight CDF, the age
    CDFs of firsts vs. others, the weight CDFs of firsts vs. others, and
    a hexbin plot of age against weight.
    """
    age_title = "Distribution of mother's age"
    weight_title = "Distribution of birth weight"

    def save_single(cdf, root, title, xlabel):
        # one CDF on a fresh figure, no legend
        myplot.Clf()
        myplot.Cdf(cdf)
        myplot.Save(root=root, title=title, xlabel=xlabel, ylabel='CDF',
                    legend=False)

    def save_pair(cdfs, root, title, xlabel):
        # two CDFs on a fresh figure, default legend
        myplot.Clf()
        myplot.Cdfs(cdfs)
        myplot.Save(root=root, title=title, xlabel=xlabel, ylabel='CDF')

    # CDF of all ages
    save_single(pool.age_cdf, 'agemodel_age_cdf', age_title, 'age (years)')

    # CDF of all weights
    save_single(pool.weight_cdf, 'agemodel_weight_cdf', weight_title,
                'birth weight (oz)')

    # plot CDFs of birth ages for first babies and others
    save_pair([firsts.age_cdf, others.age_cdf], 'agemodel_age_cdfs',
              age_title, 'age (years)')

    # same comparison for birth weights
    save_pair([firsts.weight_cdf, others.weight_cdf],
              'agemodel_weight_cdfs', weight_title, 'birth weight (oz)')

    # make a scatterplot of ages and weights (drawn as a hexbin)
    ages, weights = GetAgeWeight(pool)
    pyplot.clf()
    pyplot.hexbin(ages, weights, cmap=matplotlib.cm.gray_r)
    myplot.Save(root='agemodel_scatter',
                xlabel='Age (years)',
                ylabel='Birth weight (oz)',
                legend=False)
def MakeFigure(): fp = open('babyboom.dat') # skip to the beginning of the data for line in fp: if line.find('START DATA') != -1: break # read a list of times times = [] for line in fp: t = line.split() time = int(t[-1]) times.append(time) # compute interarrival times diffs = [times[0]] for i in range(len(times)-1): diff = times[i+1] - times[i] diffs.append(diff) n = len(diffs) mu = thinkstats.Mean(diffs) print 'mean interarrival time', mu cdf = Cdf.MakeCdfFromList(diffs, 'actual') sample = [random.expovariate(1/mu) for i in range(n)] model = Cdf.MakeCdfFromList(sample, 'model') myplot.Cdf(cdf) myplot.Save(root='interarrivals', title='Time between births', xlabel='minutes', ylabel='CDF', legend=False, formats=['eps', 'png', 'pdf']) myplot.Cdfs([cdf, model], complement=True) myplot.Save(root='interarrivals_model', title='Time between births', xlabel='minutes', ylabel='Complementary CDF', yscale='log', formats=['eps', 'png', 'pdf']) pyplot.subplots_adjust(bottom=0.11) myplot.Cdf(cdf, complement=True) myplot.Save(root='interarrivals_logy', title='Time between births', xlabel='minutes', ylabel='Complementary CDF', yscale='log', legend=False, formats=['eps', 'png', 'pdf'])
def main(): all_recs = cyb_records.Stats() all_recs.ReadRecords() print 'Number of total stats', len(all_recs.records) cdf = CdfPerMachine(all_recs.records) myplot.Cdfs(cdf) myplot.Show(title="CDF of cardio machine average distances", xlabel='Average Distances', ylabel='Probability')
def main():
    """Compares the weight CDF with CDFs of random samples drawn from it.

    Prints a 10-element Sample of the full CDF, then builds CDFs from
    WeightRandomSample draws of size 100 and 1000 and shows all three.
    """
    firsts, others, babies = Babies.PartitionBabies()
    cdf0 = Cdf.MakeCdfFromList(Babies.GetWightList(babies), name='cdf0')
    print("Sample(cdf, 10) : ", Sample(cdf0, 10))

    cdfs = [cdf0]
    for size, name in ((100, 'cdf1'), (1000, 'cdf2')):
        drawn = WeightRandomSample(cdf0, size)
        cdfs.append(Cdf.MakeCdfFromList(drawn, name=name))

    myplot.Cdfs(cdfs, complement=False, transform=None)
    myplot.Show()
def MakeFigure(xmin=100, alpha=1.7, mu=150, sigma=25):
    """Plots the CDF of a scaled Pareto sample next to a normal sample.

    Args:
        xmin: factor applied to each random.paretovariate(alpha) draw
        alpha: shape argument for random.paretovariate
        mu: mean argument for random.normalvariate
        sigma: standard deviation argument for random.normalvariate
    """
    count = 10000

    pareto_values = [xmin * random.paretovariate(alpha)
                     for _ in range(count)]
    pareto_cdf = Cdf.MakeCdfFromList(pareto_values, name='pareto')

    normal_values = [random.normalvariate(mu, sigma) for _ in range(count)]
    normal_cdf = Cdf.MakeCdfFromList(normal_values, name='normal')

    myplot.Cdfs([pareto_cdf, normal_cdf],
                root='pareto_world2',
                title='Pareto World',
                xlabel='height (cm)',
                ylabel='CDF')
def main():
    """Summarizes and plots the height data for groups 1 and 2.

    Prints the trimmed mean, variance, standard deviation and
    coefficient of variation for each group, then shows each group's
    histogram and PMF on separate figures, and finally both CDFs on one
    figure.
    """
    resp = brfss.Respondents()
    resp.ReadRecords(data_dir='res')
    heights = resp.SummarizeHeight()
    man_d, lady_d = heights[1], heights[2]

    # men: mu, var, sigma, coefficient of variation (CV)
    man_mu, man_var = thinkstats.TrimmedMeanVar(man_d)
    man_sigma = math.sqrt(man_var)
    man_cv = man_sigma / man_mu
    print("man: mu = %.3f, var = %.3f, sigma = %.3f, cv = %.3f"
          % (man_mu, man_var, man_sigma, man_cv))

    # women: mu, var, sigma, coefficient of variation (CV)
    lady_mu, lady_var = thinkstats.TrimmedMeanVar(lady_d)
    lady_sigma = math.sqrt(lady_var)
    lady_cv = lady_sigma / lady_mu
    print("lady: mu = %.3f, var = %.3f, sigma = %.3f, cv = %.3f"
          % (lady_mu, lady_var, lady_sigma, lady_cv))

    def show_alone(draw, dist):
        # draw one distribution, display it, then clear the figure
        draw(dist)
        myplot.Show()
        myplot.Clf()

    # histograms for men and women
    man_hist = Pmf.MakeHistFromList(man_d, name='man hist')
    show_alone(myplot.Hist, man_hist)
    lady_hist = Pmf.MakeHistFromList(lady_d, name='lady hist')
    show_alone(myplot.Hist, lady_hist)

    # PMFs for men and women
    man_pmf = Pmf.MakePmfFromHist(man_hist, name='man pmf')
    show_alone(myplot.Pmf, man_pmf)
    lady_pmf = Pmf.MakePmfFromHist(lady_hist, name='lady pmf')
    show_alone(myplot.Pmf, lady_pmf)

    # cumulative distributions (CDFs) for men and women
    man_cdf = Cdf.MakeCdfFromPmf(man_pmf, name='man cdf')
    lady_cdf = Cdf.MakeCdfFromPmf(lady_pmf, name='lady cdf')
    myplot.Cdfs((man_cdf, lady_cdf), complement=False, transform=None)
    myplot.Show()
def main():
    """Prints the percentile rank of `mywt` in three weight CDFs and plots them.

    `mywt` is read from the enclosing module scope.
    """
    firsts, others, babies = Babies.PartitionBabies()

    def weight_cdf(group, name):
        # CDF of the weights of one group
        return Cdf.MakeCdfFromList(Babies.GetWightList(group), name=name)

    cdf_babies = weight_cdf(babies, 'babies')
    cdf_firsts = weight_cdf(firsts, 'firsts')
    cdf_others = weight_cdf(others, 'others')

    print("babies percentile rank: ", 100 * cdf_babies.Prob(mywt))
    print("firsts percentile rank: ", 100 * cdf_firsts.Prob(mywt))
    print("others percentile rank: ", 100 * cdf_others.Prob(mywt))

    myplot.Cdfs([cdf_babies, cdf_firsts, cdf_others])
    myplot.Show()
def MakeFigures(pmf, biased_pmf): """Makes figures showing the CDF of the biased and unbiased PMFs""" cdf = Cdf.MakeCdfFromPmf(pmf, 'unbiased') print 'unbiased median', cdf.Percentile(50) print 'percent < 100', cdf.Prob(100) print 'percent < 1000', cdf.Prob(1000) biased_cdf = Cdf.MakeCdfFromPmf(biased_pmf, 'biased') print 'biased median', biased_cdf.Percentile(50) myplot.Cdfs([cdf, biased_cdf], root='slashdot.logx', xlabel='Number of friends/foes', ylabel='CDF', xscale='log')
def PlotCdfs(samples):
    """Make CDFs showing the distribution of outliers.

    Args:
        samples: map from label to a sequence of values; only values
            below 150 are kept for each label's CDF
    """
    cdfs = [
        Cdf.MakeCdfFromList([x for x in sample if x < 150], label)
        for label, sample in samples.iteritems()
    ]

    myplot.Clf()
    myplot.Cdfs(cdfs)
    myplot.Save(root='bayes_height_cdfs',
                title='CDF of height',
                xlabel='Reported height (cm)',
                ylabel='CDF')
def main():
    """Shows the PMF and the CDF of the running speeds.

    The CDF is displayed twice: once through myplot.Cdf and once
    through myplot.Cdfs.
    """
    results = ReadResults()
    speeds = GetSpeeds(results)

    pmf = Pmf.MakePmfFromList(speeds, 'speeds')
    myplot.Pmf(pmf)
    myplot.Show(title='PMF of running speed',
                xlabel='speed (mph)',
                ylabel='probability')

    import Cdf
    cdf = Cdf.MakeCdfFromList(speeds, 'speeds')
    myplot.Cdf(cdf)
    myplot.Show()

    # Cdfs is given a sequence of CDFs, so a single CDF is wrapped in a
    # list rather than passed bare.
    # NOTE(review): assumes myplot.Cdfs iterates over its first argument
    # -- confirm against myplot.
    myplot.Cdfs([cdf])
    myplot.Show()
def Main(script):
    """Plots the complementary CDF of the gaps between sorted birthdays.

    Args:
        script: not used inside this function.
    """
    # read 'em and sort 'em
    birthdays = ReadBirthdays()
    birthdays.sort()

    # compute the intervals in days
    days = [delta.days for delta in Diff(birthdays)]

    # make and plot the CCDF on a log scale.
    interval_cdf = Cdf.MakeCdfFromList(days, name='intervals')
    myplot.Cdfs([interval_cdf], 'intervals',
                xlabel='days',
                ylabel='ccdf',
                yscale='log',
                complement=True)
def PlotPrevalence(self, root=None, clf=False, n=6): """Looks up the PMFs for a given taxon and plots them.""" if root: clf = True cdfs = [] for taxon in lowercase[:n]: pmf = self.GetPrevalence(taxon) cdf = Cdf.MakeCdfFromPmf(pmf) cdfs.append(cdf) median = cdf.Percentile(50) ci = cdf.Percentile(5), cdf.Percentile(95) print taxon, median, ci myplot.Cdfs(cdfs, root=root, clf=clf, xlabel='prevalence', ylabel='prob')
def MakeFigures(pool, firsts, others):
    """Creates several figures for the book.

    Draws the age CDFs of firsts vs. others with thin lines, then saves
    a hexbin plot of age against weight for the pooled data.
    """
    # plot CDFs of birth ages for first babies and others
    thin_lines = [dict(linewidth=0.5) for _ in range(2)]
    myplot.Cdfs([firsts.age_cdf, others.age_cdf],
                root='nsfg_age_cdf',
                line_options=thin_lines,
                title="Mother's age CDF",
                xlabel='age (years)',
                ylabel='probability')

    # make a scatterplot of ages and weights (drawn as a hexbin)
    ages, weights = GetAgeWeight(pool)
    pyplot.clf()
    pyplot.hexbin(ages, weights, cmap=matplotlib.cm.gray_r)
    myplot.Save(root='age_scatter',
                xlabel='Age (years)',
                ylabel='Birth weight (oz)',
                legend=False)
def MakeFigure(xmin=100, alpha=1.7, mu=150, sigma=25):
    """Makes a figure showing the CDF of height in ParetoWorld.

    Compared to a normal distribution.

    xmin: parameter of the Pareto distribution
    alpha: parameter of the Pareto distribution
    mu: parameter of the Normal distribution
    sigma: parameter of the Normal distribution
    """
    count = 10000

    pareto_values = [xmin * random.paretovariate(alpha)
                     for _ in range(count)]
    pareto_cdf = Cdf.MakeCdfFromList(pareto_values, name='pareto')

    normal_values = [random.normalvariate(mu, sigma) for _ in range(count)]
    normal_cdf = Cdf.MakeCdfFromList(normal_values, name='normal')

    myplot.Clf()
    myplot.Cdfs([pareto_cdf, normal_cdf])
    myplot.Save(root='pareto_world2',
                title='Pareto World',
                xlabel='height (cm)',
                ylabel='CDF')
def PlotDiffs(half_diffs, diffs):
    """Plots the CDFs and then the PMFs of the 'half' and 'full' diffs.

    The CDFs use the values as given; the PMFs use the values truncated
    with int().

    Args:
        half_diffs: sequence of numbers, plotted under the name 'half'
        diffs: sequence of numbers, plotted under the name 'full'
    """
    line = dict(linewidth=2)
    xlabel = 'time - qualifying time (min)'

    cdfs = [Cdf.MakeCdfFromList(half_diffs, 'half'),
            Cdf.MakeCdfFromList(diffs, 'full')]
    myplot.Cdfs(cdfs,
                xlabel=xlabel,
                ylabel='CDF',
                plot_options=[line, line],
                root='marathon_cdf')

    # truncate to integers so that equal minutes fall in the same bin
    whole_diffs = [int(x) for x in diffs]
    whole_half_diffs = [int(x) for x in half_diffs]

    pmf = Pmf.MakePmfFromList(whole_diffs, 'full')
    half_pmf = Pmf.MakePmfFromList(whole_half_diffs, 'half')
    myplot.Pmfs([half_pmf, pmf],
                xlabel=xlabel,
                ylabel='PMF',
                plot_options=[line, line],
                root='marathon_pmf')
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import math

import Babies
import Cdf
import myplot
import thinkstats
import erf

if __name__ == "__main__":
    # Compare the empirical CDF of pregnancy lengths with a normal CDF
    # that has the same mean and standard deviation.
    firsts, others, babies = Babies.PartitionBabies()
    preglengths = Babies.GetPregnacyList(babies)

    mu = thinkstats.Mean(preglengths)
    sigma = math.sqrt(thinkstats.Var(preglengths, mu))
    print("mu = %.3f sigma = %.3f" % (mu, sigma))

    cdf0 = Cdf.MakeCdfFromList(preglengths, name='cdf0')

    # Evaluate the model at the sorted distinct lengths.  Using the raw
    # list gave xs in arbitrary order with repeats, so the resulting
    # "CDF" was not monotone in x (the earlier "TODO wrong").
    xs = sorted(set(preglengths))
    ys = [erf.NormalCdf(x, mu=mu, sigma=sigma) for x in xs]
    cdf1 = Cdf.Cdf(xs, ys, 'cdf1')

    myplot.Cdf(cdf1, complement=False, transform=None)
    myplot.Cdfs([cdf0, cdf1], complement=False, transform=None)
    myplot.Show()
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

import math
import random

import matplotlib.pyplot as pyplot
import Cdf
import myplot
import Pmf

# CDFs to plot: one per simulated group, plus one for all gaps combined
cdfs = []
# every gap from every simulated group
allbday = []

# simulate 10 groups
for i in range(10):
    n = 30
    # n random integers in [0, 365), sorted ascending
    t = [random.randrange(365) for i in range(n)]
    t.sort()

    pmf = Pmf.Pmf()
    # tally the difference between each pair of neighbouring values
    for i in range(len(t) - 1):
        x = t[i + 1] - t[i]
        pmf.Incr(x)
        allbday.append(x)

    cdf = Cdf.MakeCdfFromPmf(pmf)
    cdfs.append(cdf)

# CDF of the gaps pooled across all groups, named 'all'
cdf = Cdf.MakeCdfFromList(allbday, 'all')
cdfs.append(cdf)

myplot.Cdfs(cdfs, root='birthday', transform='exponential')
# Example 3-9
import survey
import Cdf
import myplot


def Sample(cdf, n):
    """Returns a list of n values, each produced by cdf.Random()."""
    # Earlier variants drew cdf.Value(random.random()) per element or
    # used random.sample(Cdf.Values(), n).
    drawn = []
    for _ in range(n):
        drawn.append(cdf.Random())
    return drawn


# Load the pregnancy records and keep those with outcome == 1.
table = survey.Pregnancies()
table.ReadRecords()
births = [rec for rec in table.records if rec.outcome == 1]

# Keep only weights below 97.
weights = [x.birthwgt_lb for x in births if x.birthwgt_lb < 97]
weights_cdf = Cdf.MakeCdfFromList(weights, 'birth weights')

# Resample from the CDF and build a CDF of the resampled values.
sample = Sample(weights_cdf, 10000)
sample_cdf = Cdf.MakeCdfFromList(sample, 'sample weights')

# Plot the original and resampled CDFs together.
myplot.Clf()
myplot.Cdfs((weights_cdf, sample_cdf))
myplot.Show(title='CDF of all birth weights',
            xlabel='weight (lbs)',
            ylabel='cumulative probability')