def testPmf(self):
    """Checks Pmf construction from several inputs, plus Copy and Render.

    Uses assertEqual throughout: the assertEquals alias is deprecated and
    no longer exists in unittest as of Python 3.12.
    """
    pmf = thinkstats2.Pmf('allen')
    # this one might not be a robust test
    self.assertEqual(len(str(pmf)), 45)

    self.assertEqual(len(pmf), 4)
    self.assertEqual(pmf.Prob('l'), 0.4)
    self.assertEqual(pmf['l'], 0.4)
    self.assertEqual(pmf.Percentile(50), 'l')

    # from a Counter
    pmf = thinkstats2.Pmf(Counter('allen'))
    self.assertEqual(len(pmf), 4)
    self.assertEqual(pmf.Prob('l'), 0.4)

    # from another Pmf
    pmf = thinkstats2.Pmf(pmf)
    self.assertEqual(len(pmf), 4)
    self.assertEqual(pmf.Prob('l'), 0.4)

    # from the items of the underlying dictionary
    pmf = thinkstats2.Pmf(pmf.d.items())
    self.assertEqual(len(pmf), 4)
    self.assertEqual(pmf.Prob('l'), 0.4)

    pmf2 = pmf.Copy()
    self.assertEqual(pmf, pmf2)

    # Render is expected to return the values in sorted order
    xs, _ = pmf.Render()
    self.assertEqual(tuple(xs), tuple(sorted(pmf.Values())))
def testPmfFromCdf(self):
    """Checks that a Pmf survives a round trip through a Cdf.

    Uses assertAlmostEqual: the assertAlmostEquals alias is deprecated and
    no longer exists in unittest as of Python 3.12.
    """
    t = [1, 2, 2, 3, 5]
    pmf = thinkstats2.Pmf(t)
    cdf = thinkstats2.Cdf(pmf)
    pmf2 = thinkstats2.Pmf(cdf)

    # every probability in the original must be recovered from the Cdf
    for x in pmf.Values():
        self.assertAlmostEqual(pmf[x], pmf2[x])
def MakeStep(greq, less):
    """Plots the pregnancy-length PMFs of two groups with thinkplot.Pmfs.

    greq: object with a prglngth attribute (labelled 'greater/equal to 30')
    less: object with a prglngth attribute (labelled 'less than 30')
    """
    pmfs = [
        thinkstats2.Pmf(greq.prglngth, label='greater/equal to 30'),
        thinkstats2.Pmf(less.prglngth, label='less than 30'),
    ]
    thinkplot.Pmfs(pmfs)
    thinkplot.Config(xlabel='Pregnancy length(weeks)', axis=[0, 50, 0, 0.6])
    thinkplot.Show()
def testSortedItems(self):
    """Checks that SortedItems returns every item, even for mixed types."""
    pmf = thinkstats2.Pmf('allen')
    self.assertEqual(len(pmf.SortedItems()), 4)

    # values of mixed types (str, nan, int, Pmf): should generate a warning
    mixed = thinkstats2.Pmf(['a', float('nan'), 1, pmf])
    self.assertEqual(len(mixed.SortedItems()), 4)
def MakePmfs(greq, less):
    """Plots the pregnancy-length PMFs of two groups with thinkplot.Hist.

    The 'less than 30' Pmf is drawn left-aligned and the
    'greater/equal to 30' Pmf right-aligned, both with the same bar width.

    greq: object with a prglngth attribute
    less: object with a prglngth attribute
    """
    bar_width = 0.45

    pmf_greq = thinkstats2.Pmf(greq.prglngth, label='greater/equal to 30')
    pmf_less = thinkstats2.Pmf(less.prglngth, label='less than 30')

    thinkplot.Hist(pmf_less, align='left', width=bar_width)
    thinkplot.Hist(pmf_greq, align='right', width=bar_width)
    thinkplot.Config(axis=[0, 50, 0, 0.6])
    thinkplot.Show()
def MakePmfs(greq, less):
    """Plots the birth-weight PMFs of two groups with thinkplot.Pmf.

    greq: object with a totalwgt_lb attribute
    less: object with a totalwgt_lb attribute
    """
    bar_width = 0.4 / 16

    pmf_greq = thinkstats2.Pmf(greq.totalwgt_lb, label='greater/equal to 30')
    pmf_less = thinkstats2.Pmf(less.totalwgt_lb, label='less than 30')

    # 'less' goes on the left, 'greater/equal' on the right
    thinkplot.Pmf(pmf_less, align='left', width=bar_width)
    thinkplot.Pmf(pmf_greq, align='right', width=bar_width)
    thinkplot.Config(axis=[0, 15, 0, 0.04])
    thinkplot.Show()
def MakeStep(male, female):
    """Plots the weekend alcohol consumption PMFs of two groups.

    male: object with an alcwknd attribute
    female: object with an alcwknd attribute
    """
    pmfs = [
        thinkstats2.Pmf(male.alcwknd, label='Male'),
        thinkstats2.Pmf(female.alcwknd, label='Female'),
    ]
    thinkplot.Pmfs(pmfs)
    thinkplot.Config(
        xlabel='Alcohol Consumption (grams)',
        ylabel='PMF',
        axis=[0, 800, 0, 0.1],
        title='Weekend Alcohol Consumption',
    )
    thinkplot.Show()
def MakePmfs(male, female):
    """Plots the weekend alcohol consumption PMFs with thinkplot.Hist.

    The male Pmf is drawn left-aligned and the female Pmf right-aligned.

    male: object with an alcwknd attribute
    female: object with an alcwknd attribute
    """
    bar_width = 0.45

    pmf_male = thinkstats2.Pmf(male.alcwknd, label='Male')
    pmf_female = thinkstats2.Pmf(female.alcwknd, label='Female')

    thinkplot.Hist(pmf_male, align='left', width=bar_width)
    thinkplot.Hist(pmf_female, align='right', width=bar_width)
    thinkplot.Config(
        xlabel='Alcohol Consumption (grams)',
        ylabel='PMF',
        axis=[0, 800, 0, 0.1],
        title='Weekend Alcohol Consumption',
    )
    thinkplot.Show()
def testPmfMax(self):
    """Checks Pmf.Max on the sum of three six-sided dice."""
    die = thinkstats2.Pmf(range(1, 7))
    pair = die + die
    triple = pair + die

    max_cdf = triple.Max(6)
    thinkplot.Cdf(max_cdf)
    self.assertAlmostEqual(max_cdf[14], 0.558230962626)
def main(script):
    """Checks PmfMean and PmfVar against the Pmf's own Mean and Var.

    script: string script name
    """
    live, _, _ = first.MakeFrames()
    pmf = thinkstats2.Pmf(live.prglngth)

    # the hand-written mean must match the built-in one exactly
    computed_mean = PmfMean(pmf)
    print('Mean of preg length', computed_mean)
    assert computed_mean == pmf.Mean(), computed_mean

    # likewise for the variance
    computed_var = PmfVar(pmf)
    print('Variance of preg length', computed_var)
    assert computed_var == pmf.Var(), computed_var

    # NOTE: the AllModes check is currently disabled.

    print('%s: All tests passed.' % script)
def main():
    """Saves the PMF of observed running speeds and the actual/observed CDFs.

    Reads the relay results, bins the speeds, biases the Pmf with
    ObservedPmf(pmf, 7.5) and writes two figures with thinkplot.Save.
    """
    binned = relay.BinData(relay.GetSpeeds(relay.ReadResults()), 3, 12, 100)

    # distribution of actual speeds
    actual = thinkstats2.Pmf(binned, 'actual speeds')

    # biased distribution seen by the observer
    observed = ObservedPmf(actual, 7.5, label='observed speeds')

    thinkplot.Pmf(observed)
    thinkplot.Save(
        root='observed_speeds',
        title='PMF of running speed',
        xlabel='speed (mph)',
        ylabel='PMF',
    )

    # compare the two distributions as CDFs
    cdfs = [thinkstats2.Cdf(actual), thinkstats2.Cdf(observed)]

    thinkplot.PrePlot(2)
    thinkplot.Cdfs(cdfs)
    thinkplot.Save(
        root='observed_speeds_cdf',
        title='CDF of running speed',
        xlabel='speed (mph)',
        ylabel='CDF',
    )
def testCdf(self):
    """Checks Cdf construction from a Pmf, items, a list and a Counter.

    Uses assertEqual / assertAlmostEqual: the assertEquals and
    assertAlmostEquals aliases are deprecated and no longer exist in
    unittest as of Python 3.12.
    """
    t = [1, 2, 2, 3, 5]
    pmf = thinkstats2.Pmf(t)

    def check_probs(cdf):
        # 3 of the 5 values are <= 2, so Prob(2) and Value(0.6) must agree
        self.assertAlmostEqual(cdf.Prob(2), 0.6)
        self.assertAlmostEqual(cdf.Value(0.6), 2)

    # every supported input type must produce the same Cdf
    for source in (pmf, pmf.Items(), t, Counter(t)):
        cdf = thinkstats2.Cdf(source)
        self.assertEqual(len(cdf), 4)
        check_probs(cdf)

    # a copy must behave like the Cdf it came from
    cdf2 = cdf.Copy()
    check_probs(cdf2)
def ClassSizes():
    """Generates PMFs of actual and observed class size and saves a plot.

    Prints the mean and variance of the actual, biased and unbiased Pmfs,
    then saves a figure comparing the actual and biased ones.
    """
    # actual distribution of class sizes, from the book
    # (presumably class size -> number of classes)
    size_counts = {
        7: 8,
        12: 8,
        17: 14,
        22: 4,
        27: 6,
        32: 12,
        37: 8,
        42: 3,
        47: 2,
    }

    actual = thinkstats2.Pmf(size_counts, label='actual')
    print('mean', actual.Mean())
    print('var', actual.Var())

    # the distribution as seen by students
    observed = BiasPmf(actual, label='observed')
    print('mean', observed.Mean())
    print('var', observed.Var())

    # invert the bias again
    recovered = UnbiasPmf(observed, label='unbiased')
    print('mean', recovered.Mean())
    print('var', recovered.Var())

    # only the actual and observed Pmfs are plotted
    thinkplot.PrePlot(2)
    thinkplot.Pmfs([actual, observed])
    thinkplot.Save(
        root='class_size1',
        xlabel='class size',
        ylabel='PMF',
        axis=[0, 52, 0, 0.27],
    )
def MakeModel(self):
    """Builds the pooled model from the two samples in self.data.

    Sets self.n, self.pool, self.values and self.expected_probs; the
    expected probabilities come from the Pmf of the pooled data, evaluated
    at the observed values.
    """
    observed, expected = self.data
    self.n = len(observed)
    self.pool = np.hstack((observed, expected))

    pooled_pmf = thinkstats2.Pmf(self.pool)
    self.values = observed
    self.expected_probs = np.array(pooled_pmf.Probs(self.values))
def MakeModel(self):
    """Builds the pooled model from the two samples in self.data.

    Sets self.n, self.pool, self.values (35 through 43) and
    self.expected_probs, the pooled Pmf evaluated at those values.
    """
    first_sample, other_sample = self.data
    self.n = len(first_sample)
    self.pool = np.hstack((first_sample, other_sample))

    pooled_pmf = thinkstats2.Pmf(self.pool)
    self.values = range(35, 44)
    self.expected_probs = np.array(pooled_pmf.Probs(self.values))
def main():
    """Prints the mean of the pairwise differences and plots their PMF."""
    live, _, _ = first.MakeFrames()
    differences = PairwiseDiff(live)

    print('Mean: ', thinkstats2.Mean(differences))

    thinkplot.Hist(thinkstats2.Pmf(differences))
    thinkplot.Show(xlabel='Diff in wks', ylabel='PMF')
def main():
    """Checks PmfMean and PmfVar against Pmf.Mean and Pmf.Var.

    Uses the pregnancy lengths of the rows with outcome == 1.
    """
    pregnancies = nsfg.ReadFemPreg()
    live_births = pregnancies[pregnancies.outcome == 1]
    length_pmf = thinkstats2.Pmf(live_births.prglngth)

    # both hand-written statistics must match the built-in ones exactly
    assert length_pmf.Mean() == PmfMean(length_pmf)
    assert length_pmf.Var() == PmfVar(length_pmf)

    print('All test pased')
def testPmfProbLess(self):
    """Checks ProbLess and ProbGreater against scalars and other Pmfs."""
    die = thinkstats2.Pmf(range(1, 7))
    self.assertEqual(die.ProbLess(4), 0.5)
    self.assertEqual(die.ProbGreater(3), 0.5)

    pair = die + die
    triple = pair + die

    # Pmf no longer supports magic comparators, so only the named
    # methods are exercised
    self.assertAlmostEqual(pair.ProbGreater(triple), 0.15200617284)
    self.assertAlmostEqual(pair.ProbLess(triple), 0.778549382716049)
def MakeUniformPmf(low, high):
    """Builds a Pmf from the values MakeRange yields between low and high.

    low: lowest value (inclusive)
    high: highest value (inclusive)

    returns: thinkstats2.Pmf
    """
    return thinkstats2.Pmf(MakeRange(low, high))
def MakeFigures(live, firsts, others):
    """Saves PMF and CDF figures of birth weight for first babies and others.

    live: DataFrame (not used here)
    firsts: DataFrame
    others: DataFrame
    """
    weights_first = firsts.totalwgt_lb
    valid_first = weights_first.dropna()
    print('Firsts', len(weights_first), len(valid_first))
    # disabled check: expected 4381 non-null weights

    weights_other = others.totalwgt_lb
    valid_other = weights_other.dropna()
    print('Others', len(weights_other), len(valid_other))
    # disabled check: expected 4706 non-null weights

    pmf_first = thinkstats2.Pmf(valid_first, label='first')
    pmf_other = thinkstats2.Pmf(valid_other, label='other')

    bar_width = 0.4 / 16

    # PMFs of birth weight, drawn with opposite alignments
    thinkplot.PrePlot(2)
    thinkplot.Hist(pmf_first, align='right', width=bar_width)
    thinkplot.Hist(pmf_other, align='left', width=bar_width)
    thinkplot.Save(
        root='cumulative_birthwgt_pmf',
        title='Birth weight',
        xlabel='weight (pounds)',
        ylabel='PMF',
    )

    # CDFs of birth weight, built from the unfiltered columns
    cdfs = [
        thinkstats2.Cdf(firsts.totalwgt_lb, label='first'),
        thinkstats2.Cdf(others.totalwgt_lb, label='other'),
    ]

    thinkplot.PrePlot(2)
    thinkplot.Cdfs(cdfs)
    thinkplot.Save(
        root='cumulative_birthwgt_cdf',
        title='Birth weight',
        xlabel='weight (pounds)',
        ylabel='CDF',
        axis=[0, 12.5, 0, 1],
    )
def ProbilityMassFunction(group):
    """Returns the probability mass function of the values in group.

    group: sequence of values (anything thinkstats2.Pmf accepts)

    returns: thinkstats2.Pmf
    """
    # The function used to build a Hist and divide each frequency by the
    # total, then discard that map and return the Pmf. That hand-rolled
    # version never affected the result, so only the Pmf is built.
    return thinkstats2.Pmf(group)
def testPmfProbLess(self):
    """Checks ProbLess/ProbGreater and the equivalent < and > operators."""
    die = thinkstats2.Pmf(range(1, 7))
    self.assertEqual(die.ProbLess(4), 0.5)
    self.assertEqual(die.ProbGreater(3), 0.5)

    pair = die + die
    triple = pair + die

    # comparison operators
    self.assertAlmostEqual(pair > triple, 0.15200617284)
    self.assertAlmostEqual(pair < triple, 0.778549382716049)

    # named methods must give the same answers
    self.assertAlmostEqual(pair.ProbGreater(triple), 0.15200617284)
    self.assertAlmostEqual(pair.ProbLess(triple), 0.778549382716049)
def testPmf(self):
    """Checks Pmf construction from several input types, plus Copy.

    Uses assertEqual throughout: the assertEquals alias is deprecated and
    no longer exists in unittest as of Python 3.12.
    """
    # from a string (sequence of characters)
    pmf = thinkstats2.Pmf('allen')
    self.assertEqual(len(pmf), 4)
    self.assertEqual(pmf.Prob('l'), 0.4)

    # from a Counter
    pmf = thinkstats2.Pmf(Counter('allen'))
    self.assertEqual(len(pmf), 4)
    self.assertEqual(pmf.Prob('l'), 0.4)

    # from another Pmf
    pmf = thinkstats2.Pmf(pmf)
    self.assertEqual(len(pmf), 4)
    self.assertEqual(pmf.Prob('l'), 0.4)

    # from the items of the underlying dictionary
    pmf = thinkstats2.Pmf(pmf.d.items())
    self.assertEqual(len(pmf), 4)
    self.assertEqual(pmf.Prob('l'), 0.4)

    pmf2 = pmf.Copy()
    self.assertEqual(pmf, pmf2)
def summ_stats(x):
    """Prints mean, mode, variance and kurtosis for each column named in x.

    x: iterable of column names, looked up in the module-level df
    """
    template = ("{} Crime Stats: mean = {:.2f}, mode = {:.2f}, "
                "spread = {:.2f}, tails = {:.2f}.")
    for column in x:
        pmf = thinkstats2.Pmf(df[column])
        center = pmf.Mean()
        most_common = pmf.Mode()
        variance = pmf.Var()
        kurt = df[column].kurtosis()
        print(template.format(column, center, most_common, variance, kurt))
def Experiment5(lam=2.5, m=100):
    """Plots the distribution of SimulateGame results over m runs.

    lam: parameter passed to SimulateGame
    m: number of simulated games
    """
    outcomes = thinkstats2.Pmf()
    for _ in range(m):
        outcomes.Incr(SimulateGame(lam))
    outcomes.Normalize()

    thinkplot.Hist(outcomes)
    thinkplot.Show()
def RunLoop(gap_times, nums, lam=0.0333):
    """Runs the basic analysis for a range of num_passengers.

    Plots, for each value in nums, the posterior probability of waiting
    more than 15 minutes, and saves the figure. Sets the module-level
    UPPER_BOUND to 4000 and seeds the random generator with 18.

    gap_times: sequence of float
    nums: sequence of values for num_passengers
    lam: arrival rate in passengers per second
    """
    global UPPER_BOUND
    UPPER_BOUND = 4000

    thinkplot.Clf()
    RandomSeed(18)

    # resample gap_times
    sample_size = 220
    resampled = thinkstats2.Cdf(gap_times).Sample(sample_size)
    pmf_z = thinkstats2.Pmf(resampled)

    # bias the pmf, sample from it and add some long delays
    biased_cdf = BiasPmf(pmf_z).MakeCdf()
    long_delays = [1800, 2400, 3000]
    sample_zb = numpy.append(biased_cdf.Sample(sample_size), long_delays)

    # smooth the distribution of zb
    pdf_zb = thinkstats2.EstimatedPdf(sample_zb)
    grid = MakeRange(low=60)
    pmf_zb = pdf_zb.MakePmf(xs=grid)

    # unbias the distribution of zb and make the wait time calculator
    pmf_z = UnbiasPmf(pmf_zb)
    wtc = WaitTimeCalculator(pmf_z)

    # posterior probability of waiting more than 15 minutes (900 s),
    # one entry per passenger count
    probs = [
        1 - ElapsedTimeEstimator(wtc, lam, count).pmf_y.MakeCdf().Prob(900)
        for count in nums
    ]

    thinkplot.Plot(nums, probs)
    thinkplot.Save(
        root='redline5',
        xlabel='Num passengers',
        ylabel='P(y > 15 min)',
        formats=FORMATS,
    )
def main():
    """Reads the race results and shows the PMF of binned running speeds."""
    binned = BinData(GetSpeeds(ReadResults()), 3, 12, 100)

    speed_pmf = thinkstats2.Pmf(binned, 'speeds')
    thinkplot.Pmf(speed_pmf)
    thinkplot.Show(
        title='PMF of running speed',
        xlabel='speed (mph)',
        ylabel='probability',
    )
def MakeFigures(firsts, others):
    """Plots Pmfs of pregnancy length and their week-by-week differences.

    firsts: DataFrame
    others: DataFrame
    """
    pmf_first = thinkstats2.Pmf(firsts.prglngth, label='first')
    pmf_other = thinkstats2.Pmf(others.prglngth, label='other')
    bar_width = 0.45

    # first subplot: the two PMFs drawn with thinkplot.Hist
    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(pmf_first, align='right', width=bar_width)
    thinkplot.Hist(pmf_other, align='left', width=bar_width)
    thinkplot.Config(
        xlabel='weeks',
        ylabel='probability',
        axis=[27, 46, 0, 0.6],
    )

    # second subplot: the same PMFs drawn with thinkplot.Pmfs
    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([pmf_first, pmf_other])
    thinkplot.Save(
        root='probability_nsfg_pmf',
        xlabel='weeks',
        axis=[27, 46, 0, 0.6],
    )

    # difference between the PMFs, in percentage points, for weeks 35-45
    weeks = range(35, 46)
    diffs = [
        100 * (pmf_first.Prob(week) - pmf_other.Prob(week))
        for week in weeks
    ]

    thinkplot.Bar(weeks, diffs)
    thinkplot.Save(
        root='probability_nsfg_diffs',
        title='Difference in PMFs',
        xlabel='weeks',
        ylabel='percentage points',
        legend=False,
    )
def ResampleRowsWeighted(df, attr='finalwgt'):
    """Resamples a DataFrame using probabilities proportional to a column.

    Rows are drawn with replacement; the chance of drawing a row is
    proportional to its value in the weight column.

    df: DataFrame
    attr: string column name to use as weights

    returns: DataFrame with len(df) rows
    """
    weights = df[attr]

    # Hand the Pmf a mapping from row label to weight. Passing the Series
    # itself makes the Pmf a distribution over the weight *values*, and the
    # sampled numbers would then be used as row labels in df.loc below.
    # NOTE(review): duplicate index labels collapse to one entry in the
    # mapping; this assumes the index is unique.
    cdf = thinkstats2.Pmf(weights.to_dict()).MakeCdf()

    indices = cdf.Sample(len(weights))
    sample = df.loc[indices]
    return sample
def SimulateManyGames(lam, iters=1000000):
    """Simulates many games and plots the CDF of the results.

    Prints the mean error and RMSE of the simulated values relative to
    lam, then plots their CDF with vertical lines at the 5th and 95th
    percentiles.

    lam: parameter passed to SimulateGame
    iters: number of games to simulate
    """
    lam_est = [SimulateGame(lam) for _ in np.arange(iters)]

    print('Mean Error =', MeanError(lam_est, lam))
    print('RMSE =', RMSE(lam_est, lam))

    lam_cdf = thinkstats2.Cdf(lam_est)
    ci = lam_cdf.Percentile(5), lam_cdf.Percentile(95)

    # The Pmf that used to be built here was never used, so it is gone;
    # with the default iters it was a needless pass over a million values.

    thinkplot.Cdf(lam_cdf)
    # mark both ends of the 90% interval with a vertical line
    for bound in ci:
        thinkplot.Plot([bound, bound], [0, 1], linewidth=2, color='0.8')
    thinkplot.Config(xlabel='Goals per game', ylabel='CDF', legend=False)