def PlotHazard(complete, ongoing):
    """Draws the hazard function and survival functions on one figure.

    A faint survival curve built from the complete lifetimes alone is drawn
    first; the hazard function, and the survival function derived from it,
    are estimated from both the complete and the ongoing lifetimes.

    complete: list of complete lifetimes
    ongoing: list of ongoing lifetimes
    """
    # baseline: S(t) computed from the complete cases only
    naive_sf = SurvivalFunction(thinkstats2.Cdf(complete))
    thinkplot.Plot(naive_sf, label='old S(t)', alpha=0.1)

    thinkplot.PrePlot(2)

    # hazard function estimated from complete and ongoing cases
    hazard = EstimateHazardFunction(complete, ongoing)
    thinkplot.Plot(hazard, label='lams(t)', alpha=0.5)

    # survival function implied by that hazard function
    thinkplot.Plot(hazard.MakeSurvival(), label='S(t)')
    thinkplot.Show(xlabel='t (weeks)')
def testNormalPdf(self):
    """Checks NormalPdf and the Pmf and Cdf objects derived from it."""
    normal = thinkstats2.NormalPdf(mu=1, sigma=2)
    self.assertEqual(len(str(normal)), 29)
    self.assertAlmostEqual(normal.Density(3), 0.12098536226)

    # Pmf obtained through the Pdf's own method
    discrete = normal.MakePmf()
    self.assertAlmostEqual(discrete[1.0], 0.0239951295619)
    grid, heights = normal.Render()
    self.assertEqual(grid[0], -5.0)
    self.assertAlmostEqual(heights[0], 0.0022159242059690038)

    # Pmf obtained through the Pmf constructor
    discrete = thinkstats2.Pmf(normal)
    self.assertAlmostEqual(discrete[1.0], 0.0239951295619)
    grid, heights = discrete.Render()
    self.assertEqual(grid[0], -5.0)
    self.assertAlmostEqual(heights[0], 0.00026656181123)

    # Cdf obtained through the Cdf constructor
    cumulative = thinkstats2.Cdf(normal)
    self.assertAlmostEqual(cumulative[1.0], 0.51199756478094904)
    grid, heights = cumulative.Render()
    self.assertEqual(grid[0], -5.0)
    self.assertAlmostEqual(heights[0], 0.0)
def BinnedPercentiles(df):
    """Plots the 75th, 50th and 25th percentiles of weight per age bin.

    Rows are binned by agepreg using edges 10, 13, ..., 46; the first and
    last groups are dropped before plotting, and the figure is saved as a
    jpg under the root 'chap07scatter3'.

    df: DataFrame with agepreg and totalwgt_lb columns
    """
    edges = np.arange(10, 48, 3)
    grouped = df.groupby(np.digitize(df.agepreg, edges))

    # one pass over the groups, collecting the mean age and the weight Cdf
    ages, cdfs = [], []
    for _, group in grouped:
        ages.append(group.agepreg.mean())
        cdfs.append(thinkstats2.Cdf(group.totalwgt_lb))

    # discard the two end groups
    ages, cdfs = ages[1:-1], cdfs[1:-1]

    thinkplot.PrePlot(3)
    for percent in [75, 50, 25]:
        thinkplot.Plot(ages,
                       [cdf.Percentile(percent) for cdf in cdfs],
                       label='%dth' % percent)

    thinkplot.Save(root='chap07scatter3',
                   formats=['jpg'],
                   xlabel="mother's age (years)",
                   ylabel='birth weight (lbs)')
def BinPerc(df):
    """Draws the 25th, 50th and 75th percentiles of birth weight per age bin.

    Rows are binned by agepreg using edges 10, 13, ..., 46; the first and
    last groups are dropped. The curves are drawn on the current matplotlib
    axes with the 'ggplot' style.

    param: df (data frame) - contains agepreg and totalwgt_lb columns
    """
    edges = np.arange(10, 48, 3)
    grouped = df.groupby(np.digitize(df.agepreg, edges))

    # one pass over the groups, collecting the mean age and the weight Cdf
    ages, cdfs = [], []
    for _, group in grouped:
        ages.append(group.agepreg.mean())
        cdfs.append(thinkstats2.Cdf(group.totalwgt_lb))

    # discard the two end groups
    ages, cdfs = ages[1:-1], cdfs[1:-1]

    plt.style.use('ggplot')
    for p in [25, 50, 75]:
        plt.plot(ages, [cdf.Percentile(p) for cdf in cdfs], label=str(p))

    plt.title("Percentiles of Birth weight vs Mother's Age")
    plt.xlabel("Age (years)")
    plt.ylabel("Birth Weight (lbs)")
    plt.legend()
    plt.xlim(14, 45)
def main():
    """Tallies values from repeated Pareto samples and prints the distinct count.

    NOTE: the function returns immediately after that print, so the PMF/CDF
    plotting and the figure-generating calls further down never execute.
    """
    tally = Counter()
    for _ in range(10000):
        draws = ParetoSample(1.7, 0.001, 10000)
        tally.update(Counter(draws))

    print(len(tally))
    return

    # --- unreachable from here on (early return above) ---
    pmf = thinkstats2.Pmf(tally)
    print('mean', pmf.Mean())
    for x, _ in pmf.Largest(10):
        print(x)

    ccdf_source = thinkstats2.Cdf(pmf)
    thinkplot.Cdf(ccdf_source, complement=True)
    thinkplot.Show(xscale='log', yscale='log')
    return

    # --- unreachable as well (second return above) ---
    MakeFigure()
    MakeParetoCdf()
    print(TallestPareto(iters=2))
def MakeBabyBoom():
    """Saves a two-panel figure of the interarrival-time distribution.

    The left panel shows the CDF on linear axes; the right panel shows the
    complementary CDF with a log-scaled y axis.
    """
    # differences between consecutive values of the minutes column
    births = ReadBabyBoom()
    gaps = births.minutes.diff()
    gap_cdf = thinkstats2.Cdf(gaps, label='actual')

    # options common to both panels
    shared = dict(xlabel='minutes', legend=False)

    thinkplot.PrePlot(cols=2)
    thinkplot.Cdf(gap_cdf)
    thinkplot.Config(ylabel='CDF', **shared)

    thinkplot.SubPlot(2)
    thinkplot.Cdf(gap_cdf, complement=True)
    thinkplot.Config(ylabel='CCDF', yscale='log', **shared)

    thinkplot.Save(root='analytic_interarrivals', legend=False)
def PlotResidualPercentiles(model, results, index=1, num_bins=20):
    """Plots the 75th, 50th and 25th percentiles of the residuals per bin.

    The chosen exogenous variable is cut at num_bins evenly spaced edges
    between its minimum and maximum; the first and last groups are dropped.

    model: StatsModel model object
    results: StatsModel results object
    index: which exogenous variable to use
    num_bins: how many bin edges to place along the x-axis
    """
    xvals = model.exog[:, index]
    frame = pandas.DataFrame(dict(exog=xvals, resid=results.resid.values))

    edges = np.linspace(np.min(xvals), np.max(xvals), num_bins)
    grouped = frame.groupby(np.digitize(xvals, edges))

    # one pass over the groups, collecting the bin mean and the residual Cdf
    means, cdfs = [], []
    for _, group in grouped:
        means.append(group.exog.mean())
        cdfs.append(thinkstats2.Cdf(group.resid))

    # discard the two end groups
    means, cdfs = means[1:-1], cdfs[1:-1]

    thinkplot.PrePlot(3)
    for percent in [75, 50, 25]:
        thinkplot.Plot(means,
                       [cdf.Percentile(percent) for cdf in cdfs],
                       label='%dth' % percent)
def EstimateGoals(lam, m):
    """Simulates m games and plots the sampling distribution of goals scored.

    Prints the RMSE and mean error of the simulated goal counts relative to
    lam, draws their CDF with vertical markers at the 5th and 95th
    percentiles, and writes the figure to 'Q9_sampling_dist.png'.

    lam: value passed to SimulateGame and used as the reference for the errors
    m: number of games to simulate
    """
    def VertLine(x, y=1):
        """Draws a light grey vertical line at x, from 0 up to y."""
        thinkplot.Plot([x, x], [0, y], color='0.8', linewidth=3)

    goals = [SimulateGame(lam) for _ in range(m)]

    print('RMSE of Goals: ', estimation.RMSE(goals, lam))
    print('Mean Error of Goals: ', estimation.MeanError(goals, lam))

    cdf = thinkstats2.Cdf(goals)
    ci = cdf.Percentile(5), cdf.Percentile(95)
    VertLine(ci[0])
    VertLine(ci[1])

    thinkplot.Cdf(cdf)
    #thinkplot.Show(xlabel = 'Goals', ylabel = 'CumProb', title = 'Sampling Distribution, lam = ' + str(lam))

    # SaveFormat only writes the file; axis labels and the title have to be
    # applied with Config beforehand, otherwise they never reach the figure.
    thinkplot.Config(xlabel='Goals',
                     ylabel='CumProb',
                     title='Sampling Distribution, lam = ' + str(lam))
    thinkplot.SaveFormat(root='Q9_sampling_dist', fmt='png')
xs, ps = thinkstats2.RenderExpoCdf(lam, 0, 3.0, 50) label = r'$\lambda=%g$' % lam thinkplot.Plot(xs, ps, label=label) thinkplot.Config(title='Exponential CDF', xlabel='x', ylabel='CDF', loc='lower right') #%% [markdown] # Here's the distribution of interarrival times from a dataset of birth times. #%% df = analytic.ReadBabyBoom() diffs = df.minutes.diff() cdf = thinkstats2.Cdf(diffs, label='actual') thinkplot.Cdf(cdf) thinkplot.Config(xlabel='Time between births (minutes)', ylabel='CDF') #%% [markdown] # Here's what the CCDF looks like on a log-y scale. A straight line is consistent with an exponential distribution. #%% thinkplot.Cdf(cdf, complement=True) thinkplot.Config(xlabel='Time between births (minutes)', ylabel='CCDF', yscale='log', loc='upper right') #%% [markdown]
# each range arrays = [] for _, row in df.iterrows(): vals = np.linspace(row.log_lower, row.log_upper, row.freq) arrays.append(vals) # collect the arrays into a single sample log_sample = np.concatenate(arrays) return log_sample #%% # create a log_sample (using modified InterpolateSample) log_sample = InterpolateSample(df) #%% get the cdf and plot it log_cdf = thinkstats2.Cdf(log_sample) thinkplot.Cdf(log_cdf) # get a sample to calc mean, median sample = np.power(10, log_sample) mean, median = density.Summarize(sample) #print("The mean is: {}".format(mean)) #print("The median is: {}".format(median)) #%% # fraction of households below the mean cdf = thinkstats2.Cdf(sample) print('The fraction of households below the mean: {:.2f}'.format(cdf[mean]))
def Median(xs):
    """Returns the value at cumulative probability 0.5 of the Cdf of xs.

    xs: sequence of values
    """
    return thinkstats2.Cdf(xs).Value(0.5)
## make pdf of birth weight and calculate statistics
pdf = thinkstats2.EstimatedPdf(birth_weights)
thinkplot.Pdf(pdf, label='birth weight')
# the x axis carries the weights and the y axis the density
# (the two labels were swapped before)
thinkplot.Show(xlabel='lbs', ylabel='PDF')

## make adult weight data frames
adult_weights = df.wtkg2.dropna()

## evaluate skewness of adult weights
pdf = thinkstats2.EstimatedPdf(adult_weights)
thinkplot.Pdf(pdf, label='Adult weight')
thinkplot.Show(xlabel='Adult weight (kg)', ylabel='PDF')

## weight kurtosis
print('Kurtosis(adult_weights):\n', Kurtosis(adult_weights))
print('SampleExcessKertosis(adult_weights):\n', SampleExcessKertosis(adult_weights))

## compute statistics of income data
# NOTE: df is rebound here from the weight data to the income data
df = hinc.ReadData()
log_sample = hinc2.InterpolateSample(df, log_upper=6.0)

## Convert sample from log $ to $
sample = np.power(10, log_sample)
cdf = thinkstats2.Cdf(sample, label='interp. data')
thinkplot.Cdf(cdf)
thinkplot.Show(xlabel='Income ($)', ylabel='CDF')

## Compute statistics
SampleStatistics(sample)
axis=[140, 210, 20, 200], legend=False) ## bin data cleaned = df.dropna(subset=['htm3', 'wtkg2']) bins = np.arange(135, 210, 5) indices = np.digitize(cleaned.htm3, bins) groups = cleaned.groupby(indices) ## print binned data for i, group in groups: print(i, len(group)) ## compute cdf for each group mean_heights = [group.htm3.mean() for i, group in groups] cdfs = [thinkstats2.Cdf(group.wtkg2) for i, group in groups] ## extract 25th, 50th, 75th percentiles for percent in [75, 50, 25]: weight_percentiles = [cdf.Percentile(percent) for cdf in cdfs] label = '%dth' % percent thinkplot.Plot(mean_heights, weight_percentiles, label=label) thinkplot.Show(xlabel='Height (cm)', ylabel='Weight (kg)', axis=[140, 210, 20, 200], legend=False) ## re-bin data and make new cdfs bins = np.arange(135, 210, 15) indices = np.digitize(cleaned.htm3, bins)
def MakeCdf(self):
    """Builds the CDF of lifetimes from this object's ts and ss.

    The cumulative probabilities are the complement of ss.

    returns: Cdf
    """
    cumulative = 1 - self.ss
    return thinkstats2.Cdf(self.ts, cumulative)
#PMF #creating a variable for PMF of NO2 AQI & SO2 AQI no2_pmf = thinkstats2.Pmf(grp_pollution_df['NO2AQI']) so2_pmf = thinkstats2.Pmf(grp_pollution_df['SO2AQI']) thinkplot.PrePlot(2, cols=2) thinkplot.Hist(no2_pmf, label='NO2', align='right', width=0.75) thinkplot.Hist(so2_pmf, label='SO2', align='left', width=0.75) thinkplot.Show(xlabel='Parts per Billion', ylabel='Probability', axis=[0, 80, 0, 0.10]) #creating the CDF of O3 AQI t = (grp_pollution_df['O3AQI']) cdf = thinkstats2.Cdf(t, label='O3') thinkplot.Clf() thinkplot.Cdf(cdf) thinkplot.Show(xlabel='Parts per Million', ylabel='CDF') #plotting a complementary CDF (CCDF) of O3 thinkplot.Cdf(cdf, complement=True) thinkplot.Show(xlabel='minutes', ylabel='CCDF', yscale='log') #normal CDF with a range of parameters thinkplot.PrePlot(3) mus = [1.0, 2.0, 3.0] #should change to my own numbers instead sigmas = [0.5, 0.4, 0.3] for mu, sigma in zip(mus, sigmas):
def testCdf(self):
    """Exercises Cdf lookups, inverse lookups, sampling and constructors."""
    t = [1, 2, 2, 3, 5]
    pmf = thinkstats2.Pmf(t)
    hist = thinkstats2.Hist(t)

    cdf = thinkstats2.Cdf(pmf)
    self.assertEqual(len(str(cdf)), 37)

    # forward lookups, including values that are not in the sample
    expected_probs = [0, 0.2, 0.6, 0.8, 0.8, 1, 1]
    self.assertEqual(cdf[0], 0)
    for x in range(1, 7):
        self.assertAlmostEqual(cdf[x], expected_probs[x])

    for got, want in zip(cdf.Probs(range(7)), expected_probs):
        self.assertAlmostEqual(got, want)

    # inverse lookups
    expected_values = [
        (0, 1), (0.1, 1), (0.2, 1),
        (0.3, 2), (0.4, 2), (0.5, 2), (0.6, 2),
        (0.7, 3), (0.8, 3),
        (0.9, 5), (1, 5),
    ]
    for p, want in expected_values:
        self.assertEqual(cdf.Value(p), want)

    values = cdf.ValueArray(np.linspace(0, 1, 11))
    self.assertTrue((values == [1, 1, 1, 2, 2, 2, 2, 3, 3, 5, 5]).all())

    # sampling with a fixed seed
    np.random.seed(17)
    draws = cdf.Sample(7)
    self.assertListEqual(draws.tolist(), [2, 2, 1, 1, 3, 3, 3])

    # a Cdf built from probabilities might carry some floating point
    # representation error, so Prob is compared approximately here
    self.assertEqual(len(cdf), 4)
    self.assertAlmostEqual(cdf.Prob(2), 0.6)
    self.assertAlmostEqual(cdf[2], 0.6)
    self.assertEqual(cdf.Value(0.6), 2)

    from_probabilities = [
        (thinkstats2.MakeCdfFromPmf, pmf),
        (thinkstats2.MakeCdfFromItems, pmf.Items()),
        (thinkstats2.Cdf, pmf.d),
        (thinkstats2.MakeCdfFromDict, pmf.d),
    ]
    for build, source in from_probabilities:
        cdf = build(source)
        self.assertEqual(len(cdf), 4)
        self.assertAlmostEqual(cdf.Prob(2), 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

    # constructors fed with a Hist, a list or a Counter: Prob is compared exactly
    from_counts = [
        (thinkstats2.Cdf, hist),
        (thinkstats2.MakeCdfFromHist, hist),
        (thinkstats2.Cdf, t),
        (thinkstats2.MakeCdfFromList, t),
        (thinkstats2.Cdf, Counter(t)),
    ]
    for build, source in from_counts:
        cdf = build(source)
        self.assertEqual(len(cdf), 4)
        self.assertEqual(cdf.Prob(2), 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

    # a copy of the last Cdf gives the same answers
    cdf2 = cdf.Copy()
    self.assertEqual(cdf2.Prob(2), 0.6)
    self.assertEqual(cdf2.Value(0.6), 2)
#%%
import random

# draw 500 values from a normal distribution and compare the estimated
# density of the sample with the pdf defined earlier
sample = [random.gauss(mean, std) for _ in range(500)]
sample_pdf = thinkstats2.EstimatedPdf(sample)
thinkplot.Pdf(sample_pdf, label='sample KDE')
thinkplot.Pdf(pdf, label='normal')
thinkplot.Show(xlabel='height (cm)', ylabel='density')

#%%
import numpy as np

# histogram of the sample values rounded down to whole numbers
hist = thinkstats2.Hist(np.floor(sample))
thinkplot.Hist(hist)

#%%
cdf = thinkstats2.Cdf(np.floor(sample))
thinkplot.Cdf(cdf)

#%% [markdown]
#
# ### Raw moment
# $ m_k = \frac{1}{n} \sum_{i}{{x_i}^k} $

#%%
def RawMoment(xs, k):
    """Returns the k-th raw moment of xs: the mean of x**k over the sample.

    xs: sequence of numbers (must not be empty)
    k: exponent
    """
    return sum(x**k for x in xs) / len(xs)

#%% [markdown]
#
def testCdfProbs(self):
    """Prints the cumulative probabilities of a sample evaluated at its own values."""
    t = [-1, 1, 2, 2, 3, 5]
    print(thinkstats2.Cdf(t).Probs(t))
def PlotSamplingDistributions(live):
    """Plots confidence intervals for the fitted line and sampling distributions.

    Prints summary statistics of the least-squares fit of birth weight on
    mother's age, then saves three figures: 'linear3' (90% and 50% bands),
    'linear5' (scatter with bands) and 'linear4' (CDF of the slope under the
    null hypothesis next to its sampling distribution).

    live: DataFrame with agepreg and totalwgt_lb columns
    """
    ages = live.agepreg
    weights = live.totalwgt_lb

    # least-squares fit and goodness-of-fit summary
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    residuals = thinkstats2.Residuals(ages, weights, inter, slope)
    r_squared = thinkstats2.CoefDetermination(weights, residuals)

    print('rho', thinkstats2.Corr(ages, weights))
    print('R2', r_squared)
    print('R', math.sqrt(r_squared))
    print('Std(ys)', thinkstats2.Std(weights))
    print('Std(res)', thinkstats2.Std(residuals))

    axis_labels = dict(xlabel='age (years)', ylabel='birth weight (lbs)')

    # figure 1: the 90% and 50% confidence bands with text annotations
    inters, slopes = SamplingDistributions(live, iters=1001)
    bands = [
        (90, 0.3, '90% CI', 7.53, '90%'),
        (50, 0.5, '50% CI', 7.59, '50%'),
    ]
    for percent, alpha, label, text_y, text in bands:
        PlotConfidenceIntervals(ages, inters, slopes, percent=percent,
                                alpha=alpha, label=label)
        thinkplot.Text(42, text_y, text)

    thinkplot.Save(root='linear3', legend=False, **axis_labels)

    # figure 2: scatter plot with the bands, with and without residuals
    thinkplot.PrePlot(2)
    thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)
    PlotConfidenceIntervals(ages, inters, slopes, res=residuals, alpha=0.2)
    PlotConfidenceIntervals(ages, inters, slopes)
    thinkplot.Save(root='linear5',
                   title='90% CI',
                   axis=[10, 45, 0, 15],
                   legend=False,
                   **axis_labels)

    # figure 3: sampling distribution of the slope under the null
    # hypothesis and under the alternate hypothesis
    sampling_cdf = thinkstats2.Cdf(slopes)
    print('p-value, sampling distribution', sampling_cdf[0])

    slope_test = SlopeTest((ages, weights))
    print('p-value, slope test', slope_test.PValue())

    for name, fitted, sampled in [('inter', inter, inters),
                                  ('slope', slope, slopes)]:
        print(name, fitted, thinkstats2.Mean(sampled))
        Summarize(sampled, fitted)

    thinkplot.PrePlot(2)
    thinkplot.Plot([0, 0], [0, 1], color='0.8')
    slope_test.PlotCdf(label='null hypothesis')
    thinkplot.Cdf(sampling_cdf, label='sampling distribution')
    thinkplot.Save(root='linear4',
                   xlabel='slope (lbs / year)',
                   ylabel='CDF',
                   xlim=[-0.03, 0.03],
                   loc='upper left')
print('Cohen\'s d for pregnancy length in weeks:', plen_cohend) #--- Chapter3 Ex1 actual_pmf = thinkstats2.Pmf(resp.numkdhh, label='actual') biased_pmf = BiasPmf(actual_pmf, label='biased') thinkplot.PrePlot(2) actual_hist = thinkplot.Pmf(actual_pmf) biased_hist = thinkplot.Pmf(biased_pmf) thinkplot.Show(xlabel='#kids in household', ylabel='PMF') print('Actual Mean:', actual_pmf.Mean()) print('Biased Mean:', biased_pmf.Mean()) #--- Chapter4 Ex2 my_seq = np.random.random(1000) my_pmf = thinkstats2.Pmf(my_seq) my_cdf = thinkstats2.Cdf(my_seq) thinkplot.Pmf(my_pmf, linewidth=0.1) thinkplot.Show(xlabel='Random variable', ylabel='PMF') thinkplot.Cdf(my_cdf) thinkplot.Show(xlabel='Random variable', ylabel='CDF') #--- Chapter5 Ex1 mu = 178 sigma = 7.7 mhgt_dist = scipy.stats.norm(loc=mu, scale=sigma) m1 = 177.8 #5'10" in cm m2 = 185.42 #6'1" in cm print('Percent Male population between 5\'10" and 6\'1" is %.2f' % (100 * (mhgt_dist.cdf(m2) - mhgt_dist.cdf(m1)))) #--- Chapter7 Ex1
#%% t = [1, 2, 2, 3, 5] for x in range(6): print("CDF({0}) = {1}".format(x, EvalCdf(t, x))) #%% [markdown] # ## 4.4 CDF の表現 #%% import thinkstats2 import first import thinkplot live , firsts,others = first.MakeFrames() cdf = thinkstats2.Cdf(live.prglngth, label='prglngth') thinkplot.Cdf(cdf) thinkplot.show(xlabel='weeks', ylabel='CDF') #%% print("10% {0} weeks".format(cdf.Value(0.1))) print("90% {0} weeks".format(cdf.Value(0.9))) #%% [markdown] # ## 4.5 CDFを比較する #%% first_cdf = thinkstats2.Cdf(firsts.totalwgt_lb, label='first') other_cdf = thinkstats2.Cdf(others.totalwgt_lb, label='other')
def ComputeProbSurvival(ts, ss, t):
    """Evaluates a survival curve at t.

    A Cdf is built from ts and the complements of ss; the result is one
    minus that Cdf's probability at t.

    ts: sequence of times
    ss: sequence of survival probabilities, one per time
    t: time at which to evaluate the curve
    """
    cdf = thinkstats2.Cdf(ts, [1 - s for s in ss])
    return 1 - cdf.Prob(t)
def testShift(self):
    """Checks that a Cdf shifted by 1 has at x=2 the value the original has at x=1."""
    base = thinkstats2.Cdf([1, 2, 2, 3, 5])
    shifted = base.Shift(1)
    self.assertEqual(base[1], shifted[2])
#Plot pmf of age range for clicked ads vs non clicked ads width=1000 axis = [10000, 70000, 0, 0.01] thinkplot.PrePlot(2) #thinkplot.SubPlot(2) thinkplot.Pmfs([clicked_pmf, nonclicked_pmf]) thinkplot.Config(xlabel='Area Income', axis=axis) thinkplot.show() ############################################################################ #############################Section 3 -CDF################################# ############################################################################ age_grp_30_to_39_cdf = thinkstats2.Cdf(age_grp_30_to_39_ds.Daily_Time_Spent, label='30-39') age_grp_18_to_29_cdf = thinkstats2.Cdf(age_grp_18_to_29_ds.Daily_Time_Spent, label='18-29') thinkplot.PrePlot(2) thinkplot.Cdfs([age_grp_30_to_39_cdf, age_grp_18_to_29_cdf]) thinkplot.Config(xlabel='Daily Time Spent in minutes', ylabel='CDF') thinkplot.show() male_cdf = thinkstats2.Cdf(male_ds.Daily_Time_Spent, label='male') female_cdf = thinkstats2.Cdf(female_ds.Daily_Time_Spent, label='female') thinkplot.PrePlot(2) thinkplot.Cdfs([male_cdf, female_cdf]) thinkplot.Config(xlabel='Daily Time Spent in minutes', ylabel='CDF') thinkplot.show()