Пример #1
0
def MakePdfs(greq, less):
    """Show KDE-estimated birth-weight PDFs for two groups in adjacent subplots.

    greq: object exposing a `totalwgt_lb` column; plotted in subplot 1 with
          the label 'greater/equal to 30'
    less: object exposing a `totalwgt_lb` column; plotted in subplot 2 with
          the label 'less than 30'
    """
    # Both densities are estimated before any plotting call is made.
    panels = [
        (thinkstats2.EstimatedPdf(frame.totalwgt_lb.dropna()), caption)
        for frame, caption in ((greq, 'greater/equal to 30'),
                               (less, 'less than 30'))
    ]

    thinkplot.PrePlot(rows=1, cols=2)
    for position, (density, caption) in enumerate(panels, start=1):
        thinkplot.SubPlot(position)
        thinkplot.Pdf(density, label=caption)
        thinkplot.Config(xlabel='Birth weight (lbs)', ylabel='PDF')
    thinkplot.Show()
Пример #2
0
def RunSimpleProcess(gap_times, lam=0.0333, num_passengers=15, plot=True):
    """Run the wait-time analysis once and optionally draw its figures.

    Sets the module-level UPPER_BOUND to 1200 as a side effect.

    gap_times: sequence of float
    lam: arrival rate in passengers per second
    num_passengers: int number of passengers on the platform
    plot: boolean, whether to generate plots

    Returns: WaitTimeCalculator, ElapsedTimeEstimator
    """
    global UPPER_BOUND
    UPPER_BOUND = 1200

    # Print CredibleInterval(90) of the gap-time CDF rescaled by 1/60.
    scaled_gap_cdf = thinkstats2.Cdf(gap_times).Scale(1.0 / 60)
    interval = scaled_gap_cdf.CredibleInterval(90)
    print('CI z', interval)

    # Smooth the gap times with a KDE and discretize it on the grid.
    grid = MakeRange(low=10)
    gap_pmf = thinkstats2.EstimatedPdf(gap_times).MakePmf(xs=grid, label="z")

    calculator = WaitTimeCalculator(gap_pmf, inverse=False)
    if plot:
        calculator.PlotPmfs()
        calculator.MakePlot()

    estimator = ElapsedTimeEstimator(calculator, lam, num_passengers)
    if plot:
        estimator.MakePlot()

    return calculator, estimator
Пример #3
0
def main():
    """Plot four views of the values read from 'mystery0.dat' on a 2x2 grid.

    The panels are, in order: a histogram of the raw PMF, a histogram of
    the binned values, the KDE evaluated as a PMF, and the CDF.
    """
    filename = 'mystery0.dat'
    values = read_file(filename)

    raw_pmf = thinkstats2.MakePmfFromList(values)
    raw_cdf = thinkstats2.MakeCdfFromList(values)

    # Evaluate the kernel density estimate at 101 points spanning the data.
    estimated = thinkstats2.EstimatedPdf(values)
    lo, hi = min(values), max(values)
    grid = numpy.linspace(lo, hi, 101)
    smooth_pmf = estimated.MakePmf(grid)

    binned_pmf = thinkstats2.MakePmfFromList(BinData(values, lo, hi, 51))

    # Each entry: panel title and the call that draws into the panel.
    panels = (
        ('Naive Pmf', lambda: thinkplot.Hist(raw_pmf, width=0.1)),
        ('Binned Hist', lambda: thinkplot.Hist(binned_pmf)),
        ('KDE PDF', lambda: thinkplot.Pmf(smooth_pmf)),
        ('CDF', lambda: thinkplot.Cdf(raw_cdf)),
    )
    for slot, (title, draw) in enumerate(panels, start=1):
        thinkplot.SubPlot(2, 2, slot)
        draw()
        thinkplot.Config(title=title)

    thinkplot.Show()
0
def SimulateSample(mu=90, sigma=7.5, n=9, m=1000):
    """Simulate sample means of Gaussian draws and save a plot of their CDF.

    Draws m samples of n values each from random.gauss(mu, sigma), prints
    RMSE(means, mu) and the 5th/95th percentiles of the sample means, then
    saves the CDF of the means under the root name 'estimate1'.

    mu: mean passed to random.gauss
    sigma: standard deviation passed to random.gauss
    n: number of values per sample
    m: number of samples (and therefore of sample means)
    """
    means = [
        numpy.mean([random.gauss(mu, sigma) for _ in range(n)])
        for _ in range(m)
    ]

    # The original used Python 2 print statements, which are a SyntaxError
    # on Python 3.  print('...' % ...) emits the same text on both versions.
    print('rmse %s' % (RMSE(means, mu),))

    cdf = thinkstats2.MakeCdfFromList(means)
    print('confidence interval %s %s'
          % (cdf.Percentile(5), cdf.Percentile(95)))

    # estimate the PDF by KDE, evaluated within 3 standard errors of mu
    pdf = thinkstats2.EstimatedPdf(means)
    stderr = sigma / math.sqrt(n)
    vals = numpy.linspace(mu - 3 * stderr, mu + 3 * stderr, 101)
    pmf = pdf.MakePmf(vals)
    # The KDE is not drawn by default; call thinkplot.Pmf(pmf) to overlay it.

    # plot the CDF
    thinkplot.Cdf(cdf)
    thinkplot.Save(root='estimate1',
                   xlabel='sample mean',
                   ylabel='CDF',
                   title='Sampling distribution'
                   )
Пример #5
0
def TestGte():
    """Exercise GapTimeEstimator on a small, hand-written set of gap times."""
    random.seed(17)

    grid = [60, 120, 240]
    observed_gaps = [60] * 5 + [120] * 3 + [240] * 2

    # distribution of gap time (z), smoothed by KDE and evaluated on the grid
    gap_pmf = thinkstats2.EstimatedPdf(observed_gaps).MakePmf(xs=grid,
                                                              label="z")
    calculator = WaitTimeCalculator(gap_pmf, inverse=False)

    arrival_rate = 0.0333
    sample_size = 100
    passengers = calculator.GenerateSamplePassengers(arrival_rate,
                                                     sample_size)

    estimator = GapTimeEstimator(grid, [0, 0, 0], passengers)

    # Start from a clean figure, then draw the calculator's pmf_zb as a CDF
    # followed by the estimator's own plot.
    thinkplot.Clf()
    thinkplot.Cdf(calculator.pmf_zb.MakeCdf(label="actual zb"))
    estimator.MakePlot()
0
 def testEstimatedPdf(self):
     """Check EstimatedPdf's string length, one density and two PMF values."""
     estimated = thinkstats2.EstimatedPdf([1, 2, 2, 3, 5])
     rendered = str(estimated)
     self.assertEqual(len(rendered), 30)

     density_at_three = estimated.Density(3)[0]
     self.assertAlmostEqual(density_at_three, 0.19629968)

     # PMF built with the default range
     default_pmf = estimated.MakePmf()
     self.assertAlmostEqual(default_pmf[1.0], 0.010172282816895044)

     # PMF built over an explicit [0, 6] range
     ranged_pmf = estimated.MakePmf(low=0, high=6)
     self.assertAlmostEqual(ranged_pmf[0.0], 0.0050742294053582942)
Пример #7
0
def ComputeSkewnesses():
    """Save KDE plots of birth weight and adult weight.

    Each plot also carries gray vertical lines, with text labels, at the
    mean and median returned by Summarize.
    """
    def draw_marker(x, height):
        """Draw a thin gray vertical line at x from 0 up to height."""
        thinkplot.Plot([x, x], [0, height], color='0.6', linewidth=1)

    # ---- birth weight ----
    live, _, _ = first.MakeFrames()
    birth_weights = live.totalwgt_lb.dropna()
    print('Birth weight')
    mean, median = Summarize(birth_weights)

    top = 0.35
    label_y = 0.1 * top
    draw_marker(mean, top)
    thinkplot.Text(mean - 0.15, label_y, 'mean', horizontalalignment='right')
    draw_marker(median, top)
    thinkplot.Text(median + 0.1, label_y, 'median',
                   horizontalalignment='left')

    thinkplot.Pdf(thinkstats2.EstimatedPdf(birth_weights),
                  label='birth weight')
    thinkplot.Save(root='density_totalwgt_kde', xlabel='lbs', ylabel='PDF')

    # ---- adult weight ----
    respondents = brfss.ReadBrfss(nrows=None)
    adult_weights = respondents.wtkg2.dropna()
    print('Adult weight')
    mean, median = Summarize(adult_weights)

    top = 0.02499
    label_y = 0.1 * top
    draw_marker(mean, top)
    thinkplot.Text(mean + 1, label_y, 'mean', horizontalalignment='left')
    draw_marker(median, top)
    thinkplot.Text(median - 1.5, label_y, 'median',
                   horizontalalignment='right')

    thinkplot.Pdf(thinkstats2.EstimatedPdf(adult_weights),
                  label='adult weight')
    thinkplot.Save(root='density_wtkg2_kde',
                   xlabel='kg',
                   ylabel='PDF',
                   xlim=[0, 200])
Пример #8
0
def RunLoop(gap_times, nums, lam=0.0333):
    """Plot P(y > 900) against the number of passengers and save the figure.

    Sets the module-level UPPER_BOUND to 4000 and seeds the random number
    generator via RandomSeed(18) as side effects.  Nothing is returned.

    gap_times: sequence of float
    nums: sequence of values for num_passengers
    lam: arrival rate in passengers per second
    """
    global UPPER_BOUND
    UPPER_BOUND = 4000

    thinkplot.Clf()
    RandomSeed(18)

    # resample gap_times
    sample_size = 220
    resampled = thinkstats2.Cdf(gap_times).Sample(sample_size)
    resampled_pmf = thinkstats2.Pmf(resampled)

    # compute the biased pmf and add some long delays
    biased_cdf = BiasPmf(resampled_pmf).MakeCdf()
    long_delays = [1800, 2400, 3000]
    biased_sample = numpy.append(biased_cdf.Sample(sample_size), long_delays)

    # smooth the distribution of zb
    smoothed = thinkstats2.EstimatedPdf(biased_sample)
    grid = MakeRange(low=60)
    biased_pmf = smoothed.MakePmf(xs=grid)

    # unbias the distribution of zb and make the wait-time calculator
    calculator = WaitTimeCalculator(UnbiasPmf(biased_pmf))

    def prob_long_wait(count):
        """Return 1 - CDF(900) of pmf_y for the given passenger count."""
        estimator = ElapsedTimeEstimator(calculator, lam, count)
        return 1 - estimator.pmf_y.MakeCdf().Prob(900)

    probs = [prob_long_wait(count) for count in nums]

    thinkplot.Plot(nums, probs)
    thinkplot.Save(
        root='redline5',
        xlabel='Num passengers',
        ylabel='P(y > 15 min)',
        formats=FORMATS,
    )
Пример #9
0
def GenerateSampleData(gap_times, lam=0.0333, n=10):
    """Build a WaitTimeCalculator from gap times and sample passenger data.

    gap_times: sequence of float
    lam: arrival rate in passengers per second
    n: number of simulated observations

    Returns: WaitTimeCalculator, and the result of its
             GenerateSamplePassengers(lam, n) call
    """
    grid = MakeRange(low=10)

    # Smooth the observed gaps with a KDE, then discretize on the grid.
    gap_pmf = thinkstats2.EstimatedPdf(gap_times).MakePmf(xs=grid, label="z")

    calculator = WaitTimeCalculator(gap_pmf, inverse=False)
    return calculator, calculator.GenerateSamplePassengers(lam, n)
Пример #10
0
def main():
    """Show the CDF of log household income, then the KDE PDF of income.

    Between the two plots, prints the value of the income CDF at the mean.
    """
    frame = hinc.ReadData()
    log_incomes = InterpolateSample(frame, log_upper=6.0)

    thinkplot.Cdf(thinkstats2.Cdf(log_incomes))
    thinkplot.Show(xlabel='household income', ylabel='CDF')

    # Raise 10 to each value to move from the log scale back to incomes.
    incomes = np.power(10, log_incomes)
    mean, _ = density.Summarize(incomes)

    income_cdf = thinkstats2.Cdf(incomes)
    print('cdf[mean]', income_cdf[mean])

    thinkplot.Pdf(thinkstats2.EstimatedPdf(incomes))
    thinkplot.Show(xlabel='household income', ylabel='PDF')
Пример #11
0
def main():
    """Show the CDF of log household income and print income statistics.

    After the CDF plot, prints the median, mean, skewness, Pearson's median
    skewness and the value of the income CDF at the mean, then draws the
    KDE-estimated PDF of income.
    """
    df = hinc.ReadData()
    log_sample = InterpolateSample(df, log_upper=6.0)

    log_cdf = thinkstats2.Cdf(log_sample)
    thinkplot.Cdf(log_cdf)
    thinkplot.Show(xlabel='household income', ylabel='CDF')

    # Raise 10 to each value to move from the log scale back to incomes.
    sample = np.power(10, log_sample)
    mean = np.mean(sample)
    cdf = thinkstats2.Cdf(sample)

    # The original used Python 2 print statements, which are a SyntaxError
    # on Python 3.  print('...' % ...) emits the same text on both versions.
    print("Median: %s" % (np.median(sample),))
    print("Mean: %s" % (mean,))
    print("Skewness: %s" % (thinkstats2.Skewness(sample),))
    print("Pearson's Skewness: %s"
          % (thinkstats2.PearsonMedianSkewness(sample),))
    print("Percent of people with incomes <= mean: %s" % (cdf[mean],))

    pdf = thinkstats2.EstimatedPdf(sample)
    # NOTE(review): no thinkplot.Show/Save follows this call, so whether the
    # PDF is ever displayed depends on the caller -- confirm this is intended.
    thinkplot.Pdf(pdf)
Пример #12
0
def MakePdfExample():
    """Save a plot comparing a Gaussian PDF with a KDE of 100 random draws.

    Also prints the Gaussian density one standard deviation above the mean.
    """
    # mean and var of women's heights in cm, from the BRFSS
    mean, var = 163, 52.8
    std = math.sqrt(var)

    # analytic density, evaluated once at mean + std
    gaussian = thinkstats2.GaussianPdf(mean, std)
    density_at_one_std = gaussian.Density(mean + std)
    print(density_at_one_std)

    # two curves share the figure: the analytic PDF first...
    thinkplot.PrePlot(2)
    thinkplot.Pdf(gaussian, label='Gaussian')

    # ...then the KDE built from a random sample
    draws = [random.gauss(mean, std) for _ in range(100)]
    thinkplot.Pdf(thinkstats2.EstimatedPdf(draws), label='sample KDE')

    thinkplot.Save(root='pdf_example', xlabel='Height (cm)', ylabel='Density')
Пример #13
0
def SimulateSample(mu=90, sigma=7.5, n=9, m=1000):
    """Simulate sample means of Gaussian draws and show their CDF.

    Draws m samples of n values each from random.gauss(mu, sigma), prints
    RMSE(means, mu) and the 5th/95th percentiles of the sample means, then
    displays the CDF of the means.

    mu: mean passed to random.gauss
    sigma: standard deviation passed to random.gauss
    n: number of values per sample
    m: number of samples (and therefore of sample means)
    """
    means = [
        numpy.mean([random.gauss(mu, sigma) for _ in range(n)])
        for _ in range(m)
    ]

    # The original used Python 2 print statements, which are a SyntaxError
    # on Python 3.  print('...' % ...) emits the same text on both versions.
    print('rmse %s' % (RMSE(means, mu),))

    cdf = thinkstats2.MakeCdfFromList(means)
    print('confidence interval %s %s'
          % (cdf.Percentile(5), cdf.Percentile(95)))

    # KDE of the means, evaluated within 3 standard errors of mu
    pdf = thinkstats2.EstimatedPdf(means)
    stderr = sigma / math.sqrt(n)
    vals = numpy.linspace(mu - 3 * stderr, mu + 3 * stderr, 101)
    pmf = pdf.MakePmf(vals)
    # The KDE is not drawn by default; call thinkplot.Pmf(pmf) to overlay it.

    thinkplot.Cdf(cdf)
    thinkplot.Show()
Пример #14
0
def MakePdfExample(n=500):
    """Save a plot comparing a normal PDF with a KDE of n random draws.

    Also prints the normal density one standard deviation above the mean.

    n: sample size
    """
    # mean and var of women's heights in cm, from the BRFSS
    mean, var = 163, 52.8
    std = math.sqrt(var)

    # analytic density, evaluated once at mean + std
    normal = thinkstats2.NormalPdf(mean, std)
    density_at_one_std = normal.Density(mean + std)
    print(density_at_one_std)

    # two curves share the figure: the analytic PDF first...
    thinkplot.PrePlot(2)
    thinkplot.Pdf(normal, label='normal')

    # ...then the KDE built from a random sample of size n
    draws = [random.gauss(mean, std) for _ in range(n)]
    thinkplot.Pdf(thinkstats2.EstimatedPdf(draws), label='sample KDE')

    thinkplot.Save(root='pdf_example', xlabel='Height (cm)', ylabel='Density')
Пример #15
0
def main():
    """Save a plot comparing a Gaussian PMF with a KDE of 1000 random draws.

    Seeds the random module with 17, prints the Gaussian density one
    standard deviation above the mean, and evaluates both curves on the
    same 100-point grid spanning mean +/- 3 sigma.
    """
    random.seed(17)

    # mean and var of women's heights in cm, from the BRFSS
    mean, var = 163, 52.8
    sigma = math.sqrt(var)

    # make a PDF and compute a density, FWIW
    pdf = thinkstats2.GaussianPdf(mean, sigma)
    # The original `print expr` statement is a SyntaxError on Python 3; with
    # a single argument, print(expr) prints the same text on Python 2 and 3.
    print(pdf.Density(mean + sigma))

    # discretize the PDF on the grid and plot it
    xs = numpy.linspace(mean - 3 * sigma, mean + 3 * sigma, 100)
    pmf = pdf.MakePmf(xs)
    thinkplot.Pmf(pmf, label='Gaussian')

    # make a sample, make an estimated PDF, and plot it on the same grid
    sample = [random.gauss(mean, sigma) for _ in range(1000)]
    sample_pdf = thinkstats2.EstimatedPdf(sample)
    sample_pmf = sample_pdf.MakePmf(xs)
    thinkplot.Pmf(sample_pmf, label='KDE')

    thinkplot.Save(root='pdf_example', xlabel='Height (cm)', ylabel='Density')
Пример #16
0
import thinkplot
# Plot the existing `pdf` object.
# NOTE(review): `pdf`, `mean`, `std` and `thinkstats2` are not defined in
# these cells; presumably they come from earlier cells -- confirm.
thinkplot.Pdf(pdf, label='normal')
# Axis label spelling fixed: 'dencity' -> 'density'.
thinkplot.Show(xlabel='height (cm)', ylabel='density')

#%%
# Discretized version of the same pdf.
pmf = pdf.MakePmf()

#%% [markdown]
# ## 6.2 KDE
#
# - Kernel density estimation

#%%
import random
# Draw 500 Gaussian values and overlay their KDE on the analytic pdf.
sample = [random.gauss(mean, std) for _ in range(500)]
sample_pdf = thinkstats2.EstimatedPdf(sample)
thinkplot.Pdf(sample_pdf, label='sample KDE')
thinkplot.Pdf(pdf, label='normal')
# Axis label spelling fixed: 'dencity' -> 'density'.
thinkplot.Show(xlabel='height (cm)', ylabel='density')

#%%
import numpy as np
# Histogram of the sample after rounding each value down with np.floor.
hist = thinkstats2.Hist(np.floor(sample))
thinkplot.Hist(hist)

#%%
# CDF of the same floored sample.
cdf = thinkstats2.Cdf(np.floor(sample))
thinkplot.Cdf(cdf)

#%% [markdown]
#
Пример #17
0
    pdf = thinkstats2.NormalPdf(mean, std)
    pmf = pdf.MakePmf()
    thinkplot.PrePlot(2)
    thinkplot.Pdf(pdf, label='normal pdf')

    thinkplot.Pmf(pmf, label='normal pmf')
    thinkplot.Show(xlabel='x', xlim=[140, 186])

    ## KDE of normal pdf
    i = 6
    thinkplot.PrePlot(i + 1)
    thinkplot.Pdf(pdf, label='normal')

    for _ in range(i):
        sample = np.random.normal(mean, std, 500)
        sample_pdf = thinkstats2.EstimatedPdf(sample, label='sample')
        thinkplot.Pdf(sample_pdf, label='sample KDE')

    thinkplot.Show(xlabel='x', ylabel='PDF', xlim=[140, 186])

    ## calculate moments
    print('RawMoment')
    print(RawMoment(female_heights, 1), RawMoment(female_heights, 2))
    print('\n CentralMoment')
    print(CentralMoment(female_heights, 1), CentralMoment(female_heights, 2),
          CentralMoment(female_heights, 3))
    print('\n StandardizedMoment')
    print(StandardizedMoment(female_heights, 1),
          StandardizedMoment(female_heights, 2),
          StandardizedMoment(female_heights, 3))