示例#1
0
def main():
    filename = 'mystery0.dat'
    data = read_file(filename)

    pmf = thinkstats2.MakePmfFromList(data)
    cdf = thinkstats2.MakeCdfFromList(data)

    pdf = thinkstats2.EstimatedPdf(data)
    low, high = min(data), max(data)
    xs = numpy.linspace(low, high, 101)
    kde_pmf = pdf.MakePmf(xs)

    bin_data = BinData(data, low, high, 51)
    bin_pmf = thinkstats2.MakePmfFromList(bin_data)

    thinkplot.SubPlot(2, 2, 1)
    thinkplot.Hist(pmf, width=0.1)
    thinkplot.Config(title='Naive Pmf')

    thinkplot.SubPlot(2, 2, 2)
    thinkplot.Hist(bin_pmf)
    thinkplot.Config(title='Binned Hist')

    thinkplot.SubPlot(2, 2, 3)
    thinkplot.Pmf(kde_pmf)
    thinkplot.Config(title='KDE PDF')

    thinkplot.SubPlot(2, 2, 4)
    thinkplot.Cdf(cdf)
    thinkplot.Config(title='CDF')

    thinkplot.Show()
示例#2
0
def MakeFigures(df):
    """Generates CDFs and normal prob plots for weights and log weights."""
    weights = df.wtkg2.dropna()
    log_weights = np.log10(weights)

    # plot weights on linear and log scales
    thinkplot.PrePlot(cols=2)
    MakeNormalModel(weights)
    thinkplot.Config(xlabel='adult weight (kg)', ylabel='CDF')

    thinkplot.SubPlot(2)
    MakeNormalModel(log_weights)
    thinkplot.Config(xlabel='adult weight (log10 kg)')

    thinkplot.Save(root='brfss_weight')

    # make normal probability plots on linear and log scales
    thinkplot.PrePlot(cols=2)
    MakeNormalPlot(weights)
    thinkplot.Config(xlabel='z', ylabel='weights (kg)')

    thinkplot.SubPlot(2)
    MakeNormalPlot(log_weights)
    thinkplot.Config(xlabel='z', ylabel='weights (log10 kg)')

    thinkplot.Save(root='brfss_weight_normal')
示例#3
0
def MakeFigures(df):
    """Make scatterplots.
    """
    sample = thinkstats2.SampleRows(df, 5000)

    # simple scatter plot
    thinkplot.PrePlot(cols=2)
    heights, weights = GetHeightWeight(sample)
    ScatterPlot(heights, weights)

    # scatter plot with jitter
    thinkplot.SubPlot(2)
    heights, weights = GetHeightWeight(sample, hjitter=1.3, wjitter=0.5)
    ScatterPlot(heights, weights)

    thinkplot.Save(root='scatter1')

    # with jitter and transparency
    thinkplot.PrePlot(cols=2)
    ScatterPlot(heights, weights, alpha=0.1)

    # hexbin plot
    thinkplot.SubPlot(2)
    heights, weights = GetHeightWeight(df, hjitter=1.3, wjitter=0.5)
    HexBin(heights, weights)
    thinkplot.Save(root='scatter2')
示例#4
0
def ex3():
    def VertLine(x, y=1):
        thinkplot.Plot([x, x], [0, y], color='0.8', linewidth=3)

    lam = 4
    goal_totals = [SimulateGame(lam=lam) for _ in range(1000)]
    print('RMSE', RMSE(goal_totals, lam))
    hist = thinkstats2.Hist(goal_totals)
    cdf = thinkstats2.Cdf(goal_totals)
    thinkplot.PrePlot(rows=2, cols=2)
    thinkplot.SubPlot(1)
    thinkplot.Hist(hist)
    thinkplot.SubPlot(2)
    thinkplot.Cdf(cdf)
    VertLine(cdf.Percentile(5))
    VertLine(cdf.Percentile(95))
    thinkplot.SubPlot(3)

    # lambda vs. rmse
    # rmse goes up as lambda goes up
    lams = range(1, 15)
    rmses = [RMSE([SimulateGame(lam=l) for _ in range(1000)], l) for l in lams]
    thinkplot.Plot(lams, rmses)
    thinkplot.SubPlot(4)

    # m vs. rmse
    # maybe rmse very slowly goes down as m goes up?
    # not at all clear that's really the case...
    ms = np.arange(10, 1000, 10)
    rmses = [RMSE([SimulateGame() for _ in range(m)], 4) for m in ms]
    thinkplot.Plot(ms, rmses)

    thinkplot.show()
示例#5
0
def MakePdfs(greq, less):
    greqpdf = thinkstats2.EstimatedPdf(greq.totalwgt_lb.dropna())
    lesspdf = thinkstats2.EstimatedPdf(less.totalwgt_lb.dropna())
    thinkplot.PrePlot(rows=1, cols=2)
    thinkplot.SubPlot(1)
    thinkplot.Pdf(greqpdf, label='greater/equal to 30')
    thinkplot.Config(xlabel='Birth weight (lbs)', ylabel='PDF')
    thinkplot.SubPlot(2)
    thinkplot.Pdf(lesspdf, label='less than 30')
    thinkplot.Config(xlabel='Birth weight (lbs)', ylabel='PDF')
    thinkplot.Show()
示例#6
0
def PlotRemainingLifetime(sf1, sf2):
    """Plots remaining lifetimes for pregnancy and age at first marriage.

    sf1: SurvivalFunction for pregnancy length
    sf2: SurvivalFunction for age at first marriage
    """
    thinkplot.PrePlot(cols=2)
    rem_life1 = sf1.RemainingLifetime()
    thinkplot.Plot(rem_life1)
    thinkplot.Config(title='remaining pregnancy length',
                     xlabel='weeks',
                     ylabel='mean remaining weeks')

    thinkplot.SubPlot(2)
    func = lambda pmf: pmf.Percentile(50)
    rem_life2 = sf2.RemainingLifetime(filler=np.inf, func=func)
    thinkplot.Plot(rem_life2)
    thinkplot.Config(title='years until first marriage',
                     ylim=[0, 15],
                     xlim=[11, 31],
                     xlabel='age (years)',
                     ylabel='median remaining years')

    thinkplot.Save(root='survival6',
                   formats=FORMATS)
示例#7
0
def main():
    df = ReadData()
    cdf = thinkstats2.Cdf(df['ps'])

    thinkplot.PrePlot(rows=1, cols=2)
    thinkplot.SubPlot(1)
    scale = thinkplot.Cdf(cdf, xscale='log')
    thinkplot.Config(title='logx', **scale)

    thinkplot.SubPlot(2)
    scale = thinkplot.Cdf(cdf, transform='pareto')
    thinkplot.Config(title='pareto', **scale)

    thinkplot.show(legend=False)
    
    print(df)
示例#8
0
def MakeBabyBoom():
    """Plot CDF of interarrival time on log and linear scales.
    """
    # compute the interarrival times
    df = ReadBabyBoom()
    diffs = df.minutes.diff()
    cdf = thinkstats2.Cdf(diffs, label='actual')

    thinkplot.PrePlot(cols=2)
    thinkplot.Cdf(cdf)
    thinkplot.Config(xlabel='minutes', ylabel='CDF', legend=False)

    thinkplot.SubPlot(2)
    thinkplot.Cdf(cdf, complement=True)
    thinkplot.Config(xlabel='minutes',
                     ylabel='CCDF',
                     yscale='log',
                     legend=False)

    thinkplot.Save(root='analytic_interarrivals')

    n = len(diffs)
    lam = 44 / 24 * 60.0
    sample = [random.expovariate(lam) for _ in range(n)]
    model = thinkstats2.Cdf(sample, label='model')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([cdf, model], complement=True)
    thinkplot.Save(root='analytic_interarrivals_model',
                   title='Time between births',
                   xlabel='minutes',
                   ylabel='CCDF',
                   yscale='log')
示例#9
0
def MakeHists(live):
    """Plot Hists for live births

    live: DataFrame
    others: DataFrame
    """
    hist = thinkstats2.Hist(np.floor(live.agepreg), label='agepreg')
    thinkplot.PrePlot(2, cols=2)

    thinkplot.SubPlot(1)
    thinkplot.Hist(hist)
    thinkplot.Config(xlabel='years', ylabel='frequency', axis=[0, 45, 0, 700])

    thinkplot.SubPlot(2)
    thinkplot.Pmf(hist)

    thinkplot.Save(root='probability_agepreg_hist',
                   xlabel='years',
                   axis=[0, 45, 0, 700])
示例#10
0
def main(script, filename='mystery0.dat'):
    data = ReadFile(filename)
    cdf = thinkstats2.Cdf(data)

    thinkplot.PrePlot(num=6, rows=2, cols=3)
    thinkplot.SubPlot(1)
    thinkplot.Cdf(cdf, color='C0', label=filename)
    thinkplot.Config(title='CDF on linear scale', ylabel='CDF')

    thinkplot.SubPlot(2)
    scale = thinkplot.Cdf(cdf, xscale='log', color='C0')
    thinkplot.Config(title='CDF on log-x scale', ylabel='CDF', **scale)

    thinkplot.SubPlot(3)
    scale = thinkplot.Cdf(cdf, transform='exponential', color='C0')
    thinkplot.Config(title='CCDF on log-y scale', ylabel='log CCDF', **scale)

    thinkplot.SubPlot(4)
    xs, ys = thinkstats2.NormalProbability(data)
    thinkplot.Plot(xs, ys, color='C0')
    thinkplot.Config(title='Normal probability plot',
                     xlabel='random normal',
                     ylabel='data')

    thinkplot.SubPlot(5)
    scale = thinkplot.Cdf(cdf, transform='pareto', color='C0')
    thinkplot.Config(title='CCDF on log-log scale', ylabel='log CCDF', **scale)

    thinkplot.SubPlot(6)
    scale = thinkplot.Cdf(cdf, transform='weibull', color='C0')
    thinkplot.Config(title='CCDF on loglog-y log-x scale',
                     ylabel='log log CCDF',
                     **scale)

    thinkplot.Show(legend=False)
示例#11
0
def main():
    filename = 'mystery0.dat'
    data = read_file(filename)
    cdf = thinkstats2.MakeCdfFromList(data)

    thinkplot.SubPlot(2, 3, 1)
    thinkplot.Cdf(cdf)
    thinkplot.Config(title='linear')

    thinkplot.SubPlot(2, 3, 2)
    scale = thinkplot.Cdf(cdf, xscale='log')
    thinkplot.Config(title='logx', **scale)

    thinkplot.SubPlot(2, 3, 3)
    scale = thinkplot.Cdf(cdf, transform='exponential')
    thinkplot.Config(title='expo', **scale)

    thinkplot.SubPlot(2, 3, 4)
    xs, ys = thinkstats2.NormalProbability(data)
    thinkplot.Plot(xs, ys)
    thinkplot.Config(title='normal')

    thinkplot.SubPlot(2, 3, 5)
    scale = thinkplot.Cdf(cdf, transform='pareto')
    thinkplot.Config(title='pareto', **scale)

    thinkplot.SubPlot(2, 3, 6)
    scale = thinkplot.Cdf(cdf, transform='weibull')
    thinkplot.Config(title='weibull', **scale)

    thinkplot.Show()
示例#12
0
def main(script, filename='mystery0.dat'):
    data = ReadFile(filename)
    cdf = thinkstats2.Cdf(data)

    thinkplot.PrePlot(rows=2, cols=3)
    thinkplot.SubPlot(1)
    thinkplot.Cdf(cdf)
    thinkplot.Config(title='linear')

    thinkplot.SubPlot(2)
    scale = thinkplot.Cdf(cdf, xscale='log')
    thinkplot.Config(title='logx', **scale)

    thinkplot.SubPlot(3)
    scale = thinkplot.Cdf(cdf, transform='exponential')
    thinkplot.Config(title='expo', **scale)

    thinkplot.SubPlot(4)
    xs, ys = thinkstats2.NormalProbability(data)
    thinkplot.Plot(xs, ys)
    thinkplot.Config(title='normal')

    thinkplot.SubPlot(5)
    scale = thinkplot.Cdf(cdf, transform='pareto')
    thinkplot.Config(title='pareto', **scale)

    thinkplot.SubPlot(6)
    scale = thinkplot.Cdf(cdf, transform='weibull')
    thinkplot.Config(title='weibull', **scale)

    thinkplot.Show(legend=False)
示例#13
0
def MakeHists(male, female):
    """Plot Hists for live births

    live: DataFrame
    others: DataFrame
    """
    thinkplot.PrePlot(rows=1, cols=2)
    hist = thinkstats2.Hist(male.alcwknd)
    thinkplot.SubPlot(1)
    thinkplot.Config(axis=[0, 800, 0, 600],
                     ylabel='Number of people',
                     xlabel='Alcohol consumed (grams)',
                     title='Weekend Alcohol Consumption for Men')
    thinkplot.Hist(hist, alpha=1)

    hist = thinkstats2.Hist(female.alcwknd)
    thinkplot.SubPlot(2)
    thinkplot.Config(axis=[0, 800, 0, 1200],
                     ylabel='Number of people',
                     xlabel='Alcohol consumed (grams)',
                     title='Weekend Alcohol Consumption for Women')
    thinkplot.Hist(hist, alpha=1)
    thinkplot.Show()
def MakeFigures():
    """Plots the CDF of populations in several forms.

    On a log-log scale the tail of the CCDF looks like a straight line,
    which suggests a Pareto distribution, but that turns out to be misleading.

    On a log-x scale the distribution has the characteristic sigmoid of
    a lognormal distribution.

    The normal probability plot of log(sizes) confirms that the data fit the
    lognormal model very well.

    Many phenomena that have been described with Pareto models can be described
    as well, or better, with lognormal models.
    """
    pops = ReadData()
    print('Number of cities/towns', len(pops))
    
    log_pops = np.log10(pops)
    cdf = thinkstats2.Cdf(pops, label='data')
    cdf_log = thinkstats2.Cdf(log_pops, label='data')

    # pareto plot
    xs, ys = thinkstats2.RenderParetoCdf(xmin=5000, alpha=1.4, low=0, high=1e7)
    thinkplot.Plot(np.log10(xs), 1-ys, label='model', color='0.8')

    thinkplot.Cdf(cdf_log, complement=True) 
    thinkplot.Config(xlabel='log10 population',
                     ylabel='CCDF',
                     yscale='log')
    thinkplot.Save(root='populations_pareto')

    # lognormal plot
    thinkplot.PrePlot(cols=2)

    mu, sigma = log_pops.mean(), log_pops.std()
    xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=0, high=8)
    thinkplot.Plot(xs, ps, label='model', color='0.8')

    thinkplot.Cdf(cdf_log) 
    thinkplot.Config(xlabel='log10 population',
                     ylabel='CDF')

    thinkplot.SubPlot(2)
    thinkstats2.NormalProbabilityPlot(log_pops, label='data')
    thinkplot.Config(xlabel='z',
                     ylabel='log10 population',
                     xlim=[-5, 5])

    thinkplot.Save(root='populations_normal')
示例#15
0
def pmf_stuff(width, x_low, x_high, third, pmf_one, pmf_two, label,
              y_axis_scale):
    width = width
    axis = [x_low, x_high, third, y_axis_scale]
    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(pmf_one, align='right', width=width)
    thinkplot.Hist(pmf_two, align='left', width=width)
    thinkplot.Config(xlabel=label, ylabel='PMF', axis=axis)

    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([pmf_one, pmf_two])
    thinkplot.Config(xlabel=label, ylabel='PMF', axis=axis)
    thinkplot.Show()
示例#16
0
def plot_bar_step(first_pmf, other_pmf):
    """PrePlot takes optional parameters rows and cols to make a grid of figures for bar grapg"""
    width = 0.5
    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(first_pmf, align="left", width=width)
    thinkplot.Hist(other_pmf, align="right", width=width)
    thinkplot.Config(xlabel="weeks",
                     ylabel="probability",
                     axis=[27, 46, 0, 0.6])
    #for step graph
    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([first_pmf, other_pmf])
    thinkplot.Show(xlabel="weeks", axis=[27, 46, 0, 0.6])
示例#17
0
def NormalPlotSamples(samples, plot=1, ylabel=''):
    """Makes normal probability plots for samples.

    samples: list of samples
    label: string
    """
    for n, sample in samples:
        thinkplot.SubPlot(plot)
        thinkstats2.NormalProbabilityPlot(sample)

        thinkplot.Config(title='n=%d' % n,
                         legend=False,
                         xticks=[],
                         yticks=[],
                         ylabel=ylabel)
        plot += 1
def PlotDailies(dailies):
    """Makes a plot with daily prices for different qualities.

    dailies: map from name to DataFrame
    """
    thinkplot.PrePlot(rows=3)
    for i, (name, daily) in enumerate(dailies.items()):
        thinkplot.SubPlot(i + 1)
        title = 'price per gram ($)' if i == 0 else ''
        thinkplot.Config(ylim=[0, 20], title=title)
        thinkplot.Scatter(daily.ppg, s=10, label=name)
        if i == 2:
            pyplot.xticks(rotation=30)
        else:
            thinkplot.Config(xticks=[])

    thinkplot.Save(root='timeseries1', formats=FORMATS)
示例#19
0
def MakeAcfPlot(dailies):
    """Makes a figure showing autocorrelation functions.

    dailies: map from category name to DataFrame of daily prices    
    """
    axis = [0, 41, -0.2, 0.2]

    thinkplot.PrePlot(cols=2)
    PlotAutoCorrelation(dailies, add_weekly=False)
    thinkplot.Config(axis=axis,
                     loc='lower right',
                     ylabel='correlation',
                     xlabel='lag (day)')

    thinkplot.SubPlot(2)
    PlotAutoCorrelation(dailies, add_weekly=True)
    thinkplot.Save(root='timeseries9',
                   axis=axis,
                   loc='lower right',
                   xlabel='lag (days)')
def MakeBabyBoom():
    """Plot CDF of interarrival time on log and linear scales.
    """
    # compute the interarrival times
    df = ReadBabyBoom()
    diffs = df.minutes.diff()
    cdf = thinkstats2.Cdf(diffs, label='actual')

    thinkplot.PrePlot(cols=2)
    thinkplot.Cdf(cdf)
    thinkplot.Config(xlabel='minutes', ylabel='CDF', legend=False)

    thinkplot.SubPlot(2)
    thinkplot.Cdf(cdf, complement=True)
    thinkplot.Config(xlabel='minutes',
                     ylabel='CCDF',
                     yscale='log',
                     legend=False)

    thinkplot.Save(root='analytic_interarrivals', legend=False)
示例#21
0
def PlotMarriageData(resp):
    """Plots hazard and survival functions.

    resp: DataFrame of respondents
    """
    hf, sf = EstimateMarriageSurvival(resp)

    thinkplot.PrePlot(rows=2)
    thinkplot.Plot(hf)
    thinkplot.Config(ylabel='hazard', legend=False)

    thinkplot.SubPlot(2)
    thinkplot.Plot(sf)
    thinkplot.Save(root='survival2',
                   xlabel='age (years)',
                   ylabel='prob unmarried',
                   ylim=[0, 1],
                   legend=False,
                   formats=FORMATS)
    return sf
示例#22
0
def PlotSurvival(complete):
    """Plots survival and hazard curves.

    complete: list of complete lifetimes
    """
    thinkplot.PrePlot(3, rows=2)

    cdf = thinkstats2.Cdf(complete, label='cdf')
    sf = MakeSurvivalFromCdf(cdf, label='survival')
    print(cdf[13])
    print(sf[13])

    thinkplot.Plot(sf)
    thinkplot.Cdf(cdf, alpha=0.2)
    thinkplot.Config()

    thinkplot.SubPlot(2)
    hf = sf.MakeHazardFunction(label='hazard')
    print(hf[39])
    thinkplot.Plot(hf)
    thinkplot.Config(ylim=[0, 0.75])
def PlotRollingMean(daily, name):
    """Plots rolling mean and EWMA.

    daily: DataFrame of daily prices
    """
    dates = pandas.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)

    thinkplot.PrePlot(cols=2)
    thinkplot.Scatter(reindexed.ppg, s=15, alpha=0.1, label=name)
    roll_mean = pandas.rolling_mean(reindexed.ppg, 30)
    thinkplot.Plot(roll_mean, label='rolling mean')
    pyplot.xticks(rotation=30)
    thinkplot.Config(ylabel='price per gram ($)')

    thinkplot.SubPlot(2)
    thinkplot.Scatter(reindexed.ppg, s=15, alpha=0.1, label=name)
    ewma = pandas.ewma(reindexed.ppg, span=30)
    thinkplot.Plot(ewma, label='EWMA')
    pyplot.xticks(rotation=30)
    thinkplot.Save(root='timeseries10', formats=FORMATS)
示例#24
0
def MakeFigures(firsts, others):
    """Plot Pmfs of pregnancy length.

    firsts: DataFrame
    others: DataFrame
    """
    # plot the PMFs
    first_pmf = thinkstats2.Pmf(firsts.prglngth, label='first')
    other_pmf = thinkstats2.Pmf(others.prglngth, label='other')
    width = 0.45

    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(first_pmf, align='right', width=width)
    thinkplot.Hist(other_pmf, align='left', width=width)
    thinkplot.Config(xlabel='weeks',
                     ylabel='probability',
                     axis=[27, 46, 0, 0.6])

    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([first_pmf, other_pmf])
    thinkplot.Save(root='probability_nsfg_pmf',
                   xlabel='weeks',
                   axis=[27, 46, 0, 0.6])

    # plot the differences in the PMFs
    weeks = range(35, 46)
    diffs = []
    for week in weeks:
        p1 = first_pmf.Prob(week)
        p2 = other_pmf.Prob(week)
        diff = 100 * (p1 - p2)
        diffs.append(diff)

    thinkplot.Bar(weeks, diffs)
    thinkplot.Save(root='probability_nsfg_diffs',
                   title='Difference in PMFs',
                   xlabel='weeks',
                   ylabel='percentage points',
                   legend=False)
示例#25
0
def PlotMarriageData():
    resp = chap01ex_soln.ReadFemResp()
    resp.cmmarrhx.replace([9997, 9998, 9999], np.nan, inplace=True)

    resp['agemarry'] = (resp.cmmarrhx - resp.cmbirth) / 12.0
    cdf = thinkstats2.Cdf(resp.agemarry)
    resp['age'] = (resp.cmintvw - resp.cmbirth) / 12.0
    cdf = thinkstats2.Cdf(resp.age)

    complete = resp[resp.evrmarry == 1].agemarry
    ongoing = resp[resp.evrmarry == 0].age

    hf = EstimateHazardFunction(complete, ongoing, label='hazard')
    sf = hf.MakeSurvival(label='survival')

    thinkplot.PrePlot(rows=2)
    thinkplot.Plot(hf)
    thinkplot.Config()

    thinkplot.SubPlot(2)
    thinkplot.Plot(sf)
    thinkplot.Save(root='survival2', xlabel='age (years)', ylim=[0, 1])
示例#26
0
def main():
    p1 = thinkbayes2.MakeNormalPmf(0, 1, 3, n=101)
    p1.label = 'p1'
    p2 = p1.Copy(label='p2')

    q1 = thinkbayes2.MakeNormalPmf(0, 1, 3, n=101)
    q1.label = 'q1'
    q2 = q1.Copy(label='q2')

    p1, q1 = Update(p1, q1, True)
    p1, q2 = Update(p1, q2, True)
    p2, q1 = Update(p2, q1, True)
    p2, q2 = Update(p2, q2, False)

    thinkplot.PrePlot(num=4, rows=2)
    thinkplot.Pmfs([p1, p2])
    thinkplot.Config(legend=True)

    thinkplot.SubPlot(2)
    thinkplot.Pmfs([q1, q2])
    thinkplot.Show()

    print('Prob p1 > p2', p1 > p2)
    print('Prob q1 > q2', q1 > q2)
示例#27
0
def compareDetroitAirport(flights):
    """Create PMF to compare Atlanta airport versus other airports
       Per JD Power: Detroit Metropolitan Wayne County Airport ranks highest in passenger satisfaction among mega airports with a score of 786. 
       https://www.jdpower.com/business/press-releases/2019-north-america-airport-satisfaction-study 

    """
    detroit = flights[flights.DESTINATION_AIRPORT == 'DTW']
    others = flights[flights.AIRLINE != 'DTW']
    detroit_pmf = thinkstats2.Pmf(detroit.ARRIVAL_DELAY,
                                  label='Detroit Metro Arrival Delay')
    other_pmf = thinkstats2.Pmf(others.ARRIVAL_DELAY, label='other')
    width = 0.45

    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(detroit_pmf, align='right', width=width)
    thinkplot.Hist(other_pmf, align='left', width=width)
    thinkplot.Save(root='-100to100DetroitDelayBarPMF',
                   title='-100 to 100 min Arrival Delay',
                   xlabel='detroit metro arrival delay',
                   ylabel='probability -100 to 100 mins',
                   axis=[-100, 100, 0, 0.032])

    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([detroit_pmf, other_pmf])
    thinkplot.Save(root='-100to100DetroitDelayStepPMF',
                   title='-100 to 100 min Arrival Delay',
                   xlabel='detroit metro arrival delay',
                   ylabel='probability -100 to 100 mins',
                   axis=[-100, 100, 0, 0.032])

    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(detroit_pmf, align='right', width=width)
    thinkplot.Hist(other_pmf, align='left', width=width)
    thinkplot.Save(root='-30to30DetroitDelayBarPMF',
                   title='-30 to 30 min Arrival Delay',
                   xlabel='detroit metro arrival delay',
                   ylabel='probability -30 to 30 mins',
                   axis=[-30, 30, 0, 0.032])

    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([detroit_pmf, other_pmf])
    thinkplot.Save(root='-30to30DetroitDelayStepPMF',
                   title='-30 to 30 min Arrival Delay',
                   xlabel='detroit metro arrival delay',
                   ylabel='probability -30 to 30 mins',
                   axis=[-30, 30, 0, 0.032])

    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(detroit_pmf, align='right', width=width)
    thinkplot.Hist(other_pmf, align='left', width=width)
    thinkplot.Save(root='-60to0DetroitDelayBarPMF',
                   title='-60 to 0 min Arrival Delay',
                   xlabel='detroit metro arrival delay',
                   ylabel='probability -60 to 0 mins',
                   axis=[-60, 0, 0, 0.032])

    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([detroit_pmf, other_pmf])
    thinkplot.Save(root='-60to0DetroitDelayStepPMF',
                   title='-60 to 0 min Arrival Delay',
                   xlabel='detroit metro arrival delay',
                   ylabel='probability -60 to 0 mins',
                   axis=[-60, 0, 0, 0.032])

    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(detroit_pmf, align='right', width=width)
    thinkplot.Hist(other_pmf, align='left', width=width)
    thinkplot.Save(root='0to60DetroitDelayBarPMF',
                   title='0 to 60 min Arrival Delay',
                   xlabel='detroit metro arrival delay',
                   ylabel='probability 0 to 60 mins',
                   axis=[0, 60, 0, 0.032])

    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([detroit_pmf, other_pmf])
    thinkplot.Save(root='0to60DetroitDelayStepPMF',
                   title='0 to 60 min Arrival Delay',
                   xlabel='detroit metro arrival delay',
                   ylabel='probability 0 to 60 mins',
                   axis=[0, 60, 0, 0.032])
示例#28
0
def compareDay4(flights):
    """Create PMF to compare Day 4 (Thursday) with other days.
       I chose Day 4 (Thursday) because it showed the most flights for that day in the scatterplot

    """
    labelString = "Day 4 Arrival Delay"
    xLabelString = "day 4 arrival delay"
    day = flights[flights.DAY == 4]
    others = flights[flights.DAY != 4]
    day_pmf = thinkstats2.Pmf(day.ARRIVAL_DELAY, label=labelString)
    other_pmf = thinkstats2.Pmf(others.ARRIVAL_DELAY, label='other')
    width = 0.45

    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(day_pmf, align='right', width=width)
    thinkplot.Hist(other_pmf, align='left', width=width)
    thinkplot.Save(root='Thursday-100to100ArrivalDelayBarPMF',
                   title='-100 to 100 min Arrival Delay',
                   xlabel=xLabelString,
                   ylabel='probability -100 to 100 mins',
                   axis=[-100, 100, 0, 0.032])

    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([day_pmf, other_pmf])
    thinkplot.Save(root='Thursday-100to100ArrivalDelayStepPMF',
                   title='-100 to 100 min Arrival Delay',
                   xlabel=xLabelString,
                   ylabel='probability -100 to 100 mins',
                   axis=[-100, 100, 0, 0.032])

    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(day_pmf, align='right', width=width)
    thinkplot.Hist(other_pmf, align='left', width=width)
    thinkplot.Save(root='Thursday-30to30ArrivalDelayBarPMF',
                   title='-30 to 30 min Arrival Delay',
                   xlabel=xLabelString,
                   ylabel='probability -30 to 30 mins',
                   axis=[-30, 30, 0, 0.032])

    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([day_pmf, other_pmf])
    thinkplot.Save(root='Thursday-30to30ArrivalDelayStepPMF',
                   title='-30 to 30 min Arrival Delay',
                   xlabel=xLabelString,
                   ylabel='probability -30 to 30 mins',
                   axis=[-30, 30, 0, 0.032])

    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(day_pmf, align='right', width=width)
    thinkplot.Hist(other_pmf, align='left', width=width)
    thinkplot.Save(root='Thursday-60to0ArrivalDelayBarPMF',
                   title='-60 to 0 min Arrival Delay',
                   xlabel=xLabelString,
                   ylabel='probability -60 to 0 mins',
                   axis=[-60, 0, 0, 0.032])

    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([day_pmf, other_pmf])
    thinkplot.Save(root='Thursday-60to0ArrivalDelayStepPMF',
                   title='-60 to 0 min Arrival Delay',
                   xlabel=xLabelString,
                   ylabel='probability -60 to 0 mins',
                   axis=[-60, 0, 0, 0.032])

    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(day_pmf, align='right', width=width)
    thinkplot.Hist(other_pmf, align='left', width=width)
    thinkplot.Save(root='Thursday0to60ArrivalDelayBarPMF',
                   title='0 to 60 min Arrival Delay',
                   xlabel=xLabelString,
                   ylabel='probability 0 to 60 mins',
                   axis=[0, 60, 0, 0.032])

    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([day_pmf, other_pmf])
    thinkplot.Save(root='Thursday0to60ArrivalDelayStepPMF',
                   title='0 to 60 min Arrival Delay',
                   xlabel=xLabelString,
                   ylabel='probability 0 to 60 mins',
                   axis=[0, 60, 0, 0.032])
示例#29
0
    def RunModel(self):
        np.random.shuffle(self.fake_data)
        return self.fake_data


## main scripts
if __name__ == '__main__':

    ## read csv and group by quality and day
    transactions = pd.read_csv('mj-clean.csv', parse_dates=[5])
    dailies = GroupByQualityAndDay(transactions)

    ## plot time series by quality
    thinkplot.PrePlot(rows=3)
    for i, (name, daily) in enumerate(dailies.items()):
        thinkplot.SubPlot(i + 1)
        title = 'Price per gram ($)' if i == 0 else ''
        thinkplot.Config(ylim=[0, 20], title=title)
        thinkplot.Scatter(daily.ppg, s=10, label=name)
        if i == 2:
            plt.xticks(rotation=30)
            thinkplot.Config()
        else:
            thinkplot.Config(xticks=[])

    plt.show()

    ## calculate linear regressions for each quality

    for name, daily in dailies.items():
        model, results = RunLinearModel(daily)
示例#30
0
    ## make HexBin plot
    thinkplot.HexBin(heights, weights)
    thinkplot.Show(xlabel='Height (cm)',
                   ylabel='Weight (kg)',
                   axis=[140, 210, 20, 200],
                   legend=False)

    ## now use entire dataset
    heights_all, weights_all = sample.htm3, sample.wtkg2
    heights_all = Jitter(heights, 1.4)
    weights_all = Jitter(weights, 0.5)

    ## make scatter plot
    thinkplot.PrePlot(num=2, cols=2)
    thinkplot.SubPlot(1)
    thinkplot.Scatter(heights_all, weights_all, alpha=0.1, s=10)
    thinkplot.Show(xlabel='Height (cm)',
                   ylabel='Weight (kg)',
                   axis=[140, 210, 20, 200],
                   legend=False)

    thinkplot.SubPlot(2)
    thinkplot.HexBin(heights_all, weights_all)
    thinkplot.Show(xlabel='Height (cm)',
                   ylabel='Weight (kg)',
                   axis=[140, 210, 20, 200],
                   legend=False)

    ## bin data
    cleaned = df.dropna(subset=['htm3', 'wtkg2'])