Пример #1
0
def MakeDF(script, nrows=1000):
    """Reads BRFSS data, prints summaries, and returns the DataFrame.

    When nrows is 1000, a handful of known value counts are also checked
    and a success message is printed.

    script: string script name, used only in the success message
    nrows: number of rows to read; converted with int()

    returns: the object returned by ReadBrfss
    """
    thinkstats2.RandomSeed(17)

    nrows = int(nrows)
    df = ReadBrfss(nrows=nrows)

    # (column, label) pairs handed to Summarize, in output order
    summaries = [
        ('htm3', 'Height (cm):'),
        ('wtkg2', 'Weight (kg):'),
        ('wtyrago', 'Weight year ago (kg):'),
    ]
    for column, label in summaries:
        Summarize(df, column, label)

    # the regression checks below only apply to the 1000-row read
    if nrows != 1000:
        return df

    # (column, value, expected count) triples
    expected_counts = [
        ('age', 40, 28),
        ('sex', 2, 668),
        ('wtkg2', 90.91, 49),
        ('wtyrago', 160/2.2, 49),
        ('htm3', 163, 103),
        ('finalwt', 185.870345, 13),
    ]
    for column, value, count in expected_counts:
        assert getattr(df, column).value_counts()[value] == count
    print('%s: All tests passed.' % script)

    return df
Пример #2
0
def main():
    """Runs the three estimation examples and the sample simulation.

    The random seed is fixed at 17 before anything else runs.
    """
    thinkstats2.RandomSeed(17)

    # estimation examples, in order
    Estimate1()
    Estimate2()
    Estimate3(m=1000)

    # sample simulation
    SimulateSample()
Пример #3
0
def MakeFigures():
    """Makes scatter plots and a hexbin plot of BRFSS height and weight.

    Element 100 of each height/weight series is spot-checked with an
    assertion before the corresponding plot is made.
    """
    thinkstats2.RandomSeed(17)

    df = brfss.ReadBrfss(nrows=None)
    subset = SampleRows(df, 5000, replace=False)

    # sample of 5000 rows, no jitter
    hts, wts = GetHeightWeight(subset)
    assert hts.values[100] == 175
    assert wts.values[100] == 86.36
    ScatterPlot('brfss_scatter1', hts, wts)

    # same sample with jitter; plotted twice, the second time with alpha
    hts, wts = GetHeightWeight(subset, hjitter=1.5, wjitter=1.1)
    assert int(hts.values[100]) == 173
    assert int(wts.values[100]) == 85
    ScatterPlot('brfss_scatter2', hts, wts)
    ScatterPlot('brfss_scatter3', hts, wts, alpha=0.1)

    # hexbin of all records, with jitter
    hts, wts = GetHeightWeight(df, hjitter=1.3, wjitter=1.1)
    assert int(hts.values[100]) == 171
    assert int(wts.values[100]) == 55
    HexBin('brfss_scatter4', hts, wts)
def main():
    """Runs RunTests on seven samples of `live`, halving the size each time.

    The first sample has as many rows as `live` itself.
    """
    thinkstats2.RandomSeed(18)
    live, _, _ = first.MakeFrames()

    size = len(live)
    for _ in range(7):
        RunTests(thinkstats2.SampleRows(live, size))
        size //= 2
Пример #5
0
def main():
    """Plots adult weights and pregnancy lengths, then runs TestIntervention.

    The random seed is fixed at 17 before anything else runs.
    """
    thinkstats2.RandomSeed(17)
    live, firsts, others = first.MakeFrames()

    # plots built from the frames
    PlotAdultWeights(live)
    PlotPregLengths(live, firsts, others)

    # takes no data from the frames
    TestIntervention()
Пример #6
0
def main(name, data_dir=''):
    """Runs the example, the sampling test, and the figure-making functions.

    name: string script name (unused)
    data_dir: string directory name (unused)
    """
    thinkstats2.RandomSeed(17)

    # runs before the frames are read
    MakeExample()

    live, firsts, others = first.MakeFrames()

    # steps that need only the `live` frame
    RandomFigure(live)
    TestSample(live)
    MakeCdf(live)

    # step that needs all three frames
    MakeFigures(live, firsts, others)
Пример #7
0
def main():
    """Runs ex3 with the random seed fixed at 17.

    Estimate1, Estimate2, Estimate3(m=1000), SimulateSample, ex1 and ex2
    were commented out in this entry point and are not run.
    """
    thinkstats2.RandomSeed(17)
    ex3()
Пример #8
0
def main(script):
    """Runs Correlations on BRFSS rows that have both htm3 and wtkg2.

    script: string script name (unused)
    """
    thinkstats2.RandomSeed(17)

    df = brfss.ReadBrfss(nrows=None)
    # keep only rows where both columns are present
    df = df.dropna(subset=['htm3', 'wtkg2'])
    Correlations(df)
    # MakeFigures(df) and BinnedPercentiles(df) previously sat after an
    # unconditional `return` and could never run; the dead calls were removed.
Пример #9
0
def main():
    """Estimates birth weight, then plots sampling distributions and fits.

    EstimateBirthWeight gets the full `live` frame; the plotting functions
    get only rows where both agepreg and totalwgt_lb are present.
    """
    thinkstats2.RandomSeed(17)

    live, _, _ = first.MakeFrames()
    EstimateBirthWeight(live)

    # rows with both columns present
    complete = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    PlotSamplingDistributions(complete)
    PlotFit(complete)
    PlotResiduals(complete)
Пример #10
0
def main():
    """Runs the resample test, then RunTests on successively halved samples.

    Seven samples of `live` are drawn; the first has as many rows as `live`
    and each later one has half the size of the one before (floor division).
    """
    # fixing the seed makes the random samples repeatable
    thinkstats2.RandomSeed(23)

    live, firsts, others = first.MakeFrames()
    RunResampleTest(firsts, others)

    size = len(live)
    for _ in range(7):
        RunTests(thinkstats2.SampleRows(live, size))
        size //= 2
Пример #11
0
def main():
    """Runs Estimate4 with the random seed fixed at 17."""
    thinkstats2.RandomSeed(17)
    Estimate4()
    # A SimulateSample loop over n in [10, 100, 1000] and calls to
    # Estimate1() and Estimate2() previously sat after an unconditional
    # `return` and could never run; the dead code was removed.
def main(name, data_dir='.'):
    """Runs the logistic regression example and the regression models.

    name: string script name (unused)
    data_dir: string directory name (unused)
    """
    thinkstats2.RandomSeed(17)
    LogisticRegressionExample()

    live, _, _ = first.MakeFrames()
    # boolean column: True where birthord equals 1
    live['isfirst'] = live.birthord == 1

    RunLogisticModels(live)
    RunSimpleRegression(live)
    RunModels(live)
    PredictBirthWeight(live)
def main(script):
    """Prints correlations of agepreg and totalwgt_lb and saves a scatter plot.

    script: string script name (unused)
    """
    thinkstats2.RandomSeed(17)

    live, _, _ = first.MakeFrames()
    # keep only rows where both columns are present
    live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    BinnedPercentiles(live)

    ages, weights = live.agepreg, live.totalwgt_lb

    pearson = thinkstats2.Corr(ages, weights)
    print('thinkstats2 Corr', pearson)
    spearman = thinkstats2.SpearmanCorr(ages, weights)
    print('thinkstats2 SpearmanCorr', spearman)

    ScatterPlot(ages, weights, alpha=0.1)
    thinkplot.Save(root='chap07scatter1', legend=False, formats=['jpg'])
Пример #14
0
def main(name, nrows=None):
    """Tests the correlation of BRFSS htm3 and wtkg2 values.

    name: string script name (unused)
    nrows: number of rows to read, converted with int(); None is passed
           through to brfss.ReadBrfss unchanged (presumably meaning
           "read everything" -- confirm against ReadBrfss)
    """
    thinkstats2.RandomSeed(17)

    if nrows is not None:
        nrows = int(nrows)

    df = brfss.ReadBrfss(nrows=nrows)

    # keep only rows where both columns are present
    columns = df[['htm3', 'wtkg2']].dropna()
    heights, weights = columns.htm3.values, columns.wtkg2.values

    TestCorrelation(heights, weights)
    # ComputeCorrelations only runs when no row limit was given;
    # identity comparison is the idiomatic None check
    if nrows is None:
        ComputeCorrelations(heights, weights)
Пример #15
0
def main():
    """Runs the hypothesis-testing examples and prints their results.

    In order: coin test, pregnancy-length tests, birth-weight difference in
    means, age/weight correlation, dice test, pregnancy-length chi-squared
    test, false negative rate, and ReplicateTests.
    """
    thinkstats2.RandomSeed(17)

    # coin test
    coin_test = CoinTest((140, 110))
    print('coin test p-value', coin_test.PValue())

    # pregnancy lengths of firsts vs. others
    print('\nprglngth')
    live, firsts, others = first.MakeFrames()
    RunTests((firsts.prglngth.values, others.prglngth.values))

    # birth weights of firsts vs. others, missing values dropped
    print('\nbirth weight')
    first_weights = firsts.totalwgt_lb.dropna().values
    other_weights = others.totalwgt_lb.dropna().values
    weight_test = DiffMeansPermute((first_weights, other_weights))
    weight_p = weight_test.PValue(iters=1000)
    print('means permute two-sided')
    PrintTest(weight_p, weight_test)

    # correlation of agepreg and totalwgt_lb over rows that have both
    complete = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    corr_test = CorrelationPermute(
        (complete.agepreg.values, complete.totalwgt_lb.values))
    corr_p = corr_test.PValue()
    print('\nage weight correlation')
    print('n=', len(complete))
    PrintTest(corr_p, corr_test)

    # dice test
    RunDiceTest()

    # pregnancy lengths again, chi-squared version
    chi_test = PregLengthTest(
        (firsts.prglngth.values, others.prglngth.values))
    chi_p = chi_test.PValue()
    print('\npregnancy length chi-squared')
    PrintTest(chi_p, chi_test)

    # false negative rate for the difference in pregnancy length
    miss_rate = FalseNegRate(
        (firsts.prglngth.values, others.prglngth.values))
    print('false neg rate', miss_rate)

    # rerun the tests with new nsfg data
    ReplicateTests()
Пример #16
0
def main():
    """Makes the CLT plots, prints the gorilla example, and runs the tests.

    The gorilla example builds Normal(90, 7.5**2), takes dist.Sum(9) / 9,
    and prints its sigma and its 5th and 95th percentiles.
    """
    thinkstats2.RandomSeed(17)

    MakeCltPlots()

    print('Gorilla example')
    population = Normal(90, 7.5**2)
    print(population)

    mean_of_nine = population.Sum(9) / 9
    print(mean_of_nine.sigma)
    low = mean_of_nine.Percentile(5)
    high = mean_of_nine.Percentile(95)
    print(low, high)

    live, firsts, others = first.MakeFrames()
    TestCorrelation(live)
    PlotPregLengths(live, firsts, others)

    TestChiSquared()
Пример #17
0
def main():
    """Runs three tests on pregnancy length and saves a CDF plot for each.

    Prints MeanVar of live.prglngth, then for each test prints the p-value
    (1000 iterations), plots the test's CDF, and saves the figure.
    """
    thinkstats2.RandomSeed(17)

    def run_and_save(ht, root, title, xlabel):
        """Prints the p-value of ht, plots its CDF, and saves the figure."""
        p_value = ht.PValue(iters=1000)
        print('p-value =', p_value)

        ht.PlotCdf()
        thinkplot.Save(root=root,
                       title=title,
                       xlabel=xlabel,
                       ylabel='CDF',
                       legend=False)

    # get the data
    live, firsts, others = first.MakeFrames()
    print('(Mean, Var) of prglength for live births',
          thinkstats2.MeanVar(live.prglngth))
    data = firsts.prglngth.values, others.prglngth.values

    # difference in means, permutation
    run_and_save(DiffMeansPermute(data),
                 root='hypothesis1',
                 title='Permutation test',
                 xlabel='difference in means (weeks)')

    # difference in std, permutation
    run_and_save(DiffStdPermute(data),
                 root='hypothesis2',
                 title='Permutation test',
                 xlabel='difference in std (weeks)')

    # NOTE(review): this section is labelled as a resampling test of the
    # difference in means, but it builds DiffStdPermute, the same class as
    # the previous section -- looks like a copy/paste slip; confirm which
    # test class was intended.
    run_and_save(DiffStdPermute(data),
                 root='hypothesis3',
                 title='Resampling test',
                 xlabel='difference in means (weeks)')
def main(name):
    """Makes the time series plots and saves two prediction figures.

    name: string script name (unused; 'high' and 'medium' are hard-coded
          as the keys looked up in `dailies`)
    """
    thinkstats2.RandomSeed(18)
    transactions = ReadData()

    dailies = GroupByQualityAndDay(transactions)
    PlotDailies(dailies)
    RunModels(dailies)
    PrintSerialCorrelations(dailies)
    MakeAcfPlot(dailies)

    years = np.linspace(0, 5, 101)

    def save_predictions(root):
        """Saves the current figure with x limits padded around `years`."""
        xlim = years[0] - 0.1, years[-1] + 0.1
        thinkplot.Save(root=root,
                       title='predictions',
                       xlabel='years',
                       xlim=xlim,
                       ylabel='price per gram ($)',
                       formats=FORMATS)

    # 'high': model plots, then predictions
    quality = 'high'
    daily = dailies[quality]

    PlotLinearModel(daily, quality)
    PlotRollingMean(daily, quality)
    PlotFilled(daily, quality)

    thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=quality)
    PlotPredictions(daily, years)
    save_predictions('timeseries4')

    # 'medium': intervals and predictions
    quality = 'medium'
    daily = dailies[quality]

    thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=quality)
    PlotIntervals(daily, years)
    PlotPredictions(daily, years)
    save_predictions('timeseries5')
def main():
    """Makes the analytic CDF figures and the normal model/plot of weights.

    Weights come from totalwgt_lb with missing values dropped, once for all
    pregnancies and once for those with prglngth >= 37.
    """
    thinkstats2.RandomSeed(18)
    MakeExampleNormalPlot()

    # analytic CDFs
    MakeExpoCdf()
    MakeBabyBoom()
    MakeParetoCdf()
    MakeParetoCdf2()
    MakeNormalCdf()

    # birth weights, tested against a normal model
    preg = nsfg.ReadFemPreg()
    all_weights = preg.totalwgt_lb.dropna()
    term_weights = preg[preg.prglngth >= 37].totalwgt_lb.dropna()

    MakeNormalModel(all_weights)
    MakeNormalPlot(all_weights, term_weights)
Пример #20
0
def main():
    """Makes the survival plots and saves the two by-decade figures.

    Uses pregnancy data plus respondent data read by ReadFemResp1995,
    ReadFemResp2002 and ReadFemResp2010.
    """
    thinkstats2.RandomSeed(17)

    def save_survival(root):
        """Saves the current figure with the shared axis settings."""
        thinkplot.Save(root=root,
                       xlabel='age (years)',
                       ylabel='prob unmarried',
                       xlim=[13, 45],
                       ylim=[0, 1],
                       formats=FORMATS)

    sf1 = PlotPregnancyData(nsfg.ReadFemPreg())

    # plots based on Cycle 6
    resp6 = ReadFemResp2002()
    sf2 = PlotMarriageData(resp6)
    ResampleSurvival(resp6)
    PlotRemainingLifetime(sf1, sf2)

    # Cycles 5 and 7
    resp5 = ReadFemResp1995()
    resp7 = ReadFemResp2010()
    resps = [resp5, resp6, resp7]

    # resampled survival functions by decade
    PlotResampledByDecade(resps)
    save_survival('survival4')

    # the same, with predictions
    PlotResampledByDecade(resps, predict_flag=True, omit=[5])
    save_survival('survival5')
Пример #21
0
def main():
    """Plots survival functions by decade from Cycle 6 and saves a PDF.

    Reads the 2002 respondent data, resamples survival by decade with and
    without predictions, plots the version without predictions, and saves
    it as 'marriage1'.
    """
    thinkstats2.RandomSeed(17)

    # make the plots based on Cycle 6
    resp6 = ReadFemResp2002()
    resps = [resp6]

    sf_map = ResampleSurvivalByDecade(resps)
    # NOTE(review): sf_map_pred is never used below; the call is kept
    # because it ran in the original and may consume random state -- confirm
    # before removing it.
    sf_map_pred = ResampleSurvivalByDecade(resps, predict_flag=True)
    PlotSurvivalFunctions(sf_map)
    thinkplot.Save(root='marriage1', formats=['pdf'])
    # Read/validate sections for 2013, 2010, 2002, 1995, 1988 and 1982
    # (ReadFemRespYYYY followed by ValidateYYYY) previously sat after an
    # unconditional `return` and could never run; the dead code was removed.
Пример #22
0
def main():
    """Makes the PDF example and computes the skewnesses.

    The random seed is fixed at 17 before anything else runs.
    """
    thinkstats2.RandomSeed(17)
    MakePdfExample()
    ComputeSkewnesses()
Пример #23
0
def main():
    """Runs the flight-delay analysis from data loading to regression fits.

    Reads the flight, airline and airport data, then produces histograms,
    PMF/CDF comparisons, a normal model of arrival delays, scatter plots and
    correlations, hypothesis tests for airline 'AS' vs. the rest, and fits.
    """
    thinkstats2.RandomSeed(17)

    flights = ReadFlightData()
    airlines = ReadAirlineData()
    airports = ReadAirportData()

    # Chapter 1: use a minimum of 5 variables from the dataset and describe
    # what they mean.
    #   DAY_OF_WEEK - integer 1-7 for the day of the week; 1 is Monday and
    #       7 is Sunday.
    #   AIRLINE - letter code for the airline of the flight.
    #   ORIGIN_AIRPORT - airport code of the flight's origin airport.
    #   DESTINATION_AIRPORT - airport code of the flight's destination
    #       airport.
    #   DEPARTURE_DELAY - integer departure delay of the flight, computed
    #       from SCHEDULED_DEPARTURE and DEPARTURE_TIME.
    #   ARRIVAL_DELAY - integer arrival delay of the flight, computed from
    #       SCHEDULED_ARRIVAL and ARRIVAL_TIME.

    # Chapter 2: a histogram of each variable, identifying outliers and how
    # to handle them, plus mean, mode, spread and tails.
    createHistograms(flights, airlines, airports)

    # Chapter 3: compare two scenarios of the same variable using a PMF
    # (a filter on the data, not two different variables).
    alaska_flights = flights[flights.AIRLINE == 'AS']
    other_flights = flights[flights.AIRLINE != 'AS']
    compareAlaskaAirlinesPmf(alaska_flights, other_flights)
    compareDetroitAirport(flights)
    compareDay4(flights)

    # Chapter 4: one CDF of one of the variables.
    compareAlaskaAirlinesCdf(alaska_flights, other_flights)

    # Chapter 5: one analytical distribution applied to the dataset.
    arrival_delays = flights.ARRIVAL_DELAY.dropna()
    MakeNormalModel(arrival_delays)
    MakeNormalPlot(arrival_delays)

    # Chapter 7: two scatter plots comparing two variables, with covariance,
    # Pearson's correlation and non-linear relationships considered.
    MakeAirlineArrivalDelayScatterPlots(flights)
    MakeArrivalDepartureDelayScatterPlots(flights)
    ComputeArrivalDepartureDelayCorrelations(flights)
    ComputeAirlineArrivalDelayCorrelations(flights)

    # Chapter 9: test the hypothesis.
    # Rows with a missing arrival delay are dropped; most of those rows seem
    # to be missing other attributes too, so this should not affect the
    # analysis.
    delay_samples = (
        alaska_flights.ARRIVAL_DELAY.dropna().values,
        other_flights.ARRIVAL_DELAY.dropna().values,
    )
    RunAlaskaTests(delay_samples)

    # Chapters 10 & 11: regression analysis on one dependent and one
    # explanatory variable, or on multiple explanatory variables.
    PlotAirlineArrivalDelayFit(flights)
    PlotArrivalDepartureDelayFit(flights)
    live2 = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    data = live2.agepreg.values, live2.totalwgt_lb.values
    ht = hypothesis.CorrelationPermute(data)
    p3 = ht.PValue(iters=iters)

    # compare pregnancy lengths (chi-squared)
    data = firsts.prglngth.values, others.prglngth.values
    ht = hypothesis.PregLengthTest(data)
    p4 = ht.PValue(iters=iters)

    print("{}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}".format(n, p1, p2, p3, p4))


#%%
# seed the random number generators
thinkstats2.RandomSeed(18)

# read the frames and run the sample test on firsts vs. others
live, firsts, others = first.MakeFrames()
# pass the `firsts` frame unpacked above; `first` is the module that
# provides MakeFrames, not a data set
RunSampleTest(firsts, others)

#%%
# run the tests on seven samples, halving the sample size each time
n = len(live)
print("nval\t Test1\t  Test2\t  Test3\t  Test4\t")
for i in range(7):
    sample = thinkstats2.SampleRows(live, n)
    RunTests(sample)
    n //= 2

#%% [markdown]
Пример #25
0
def main():
    """Makes the figures with the random seed fixed at 17."""
    thinkstats2.RandomSeed(17)

    MakeFigures()