Exemplo n.º 1
0
def MakeFigures(df):
    """Draw the height/weight scatter figures and save them.

    Two figures are produced, each laid out with two columns:

    scatter1: a 5000-row sample plotted as-is, then the same sample
              re-read with jitter
    scatter2: the jittered sample drawn with alpha=0.1, then a hexbin
              plot built from the whole of df (also with jitter)

    df: DataFrame handed to SampleRows and GetHeightWeight
    """
    jitter = dict(hjitter=1.3, wjitter=0.5)
    subset = thinkstats2.SampleRows(df, 5000)

    # first figure, panel 1: sample without jitter
    thinkplot.PrePlot(cols=2)
    plain_hts, plain_wts = GetHeightWeight(subset)
    ScatterPlot(plain_hts, plain_wts)

    # first figure, panel 2: same sample, jitter applied
    thinkplot.SubPlot(2)
    jit_hts, jit_wts = GetHeightWeight(subset, **jitter)
    ScatterPlot(jit_hts, jit_wts)

    thinkplot.Save(root='scatter1')

    # second figure, panel 1: reuse the jittered sample, now transparent
    thinkplot.PrePlot(cols=2)
    ScatterPlot(jit_hts, jit_wts, alpha=0.1)

    # second figure, panel 2: hexbin over every row of df
    thinkplot.SubPlot(2)
    all_hts, all_wts = GetHeightWeight(df, **jitter)
    HexBin(all_hts, all_wts)
    thinkplot.Save(root='scatter2')
Exemplo n.º 2
0
def PlotArrivalDepartureDelayFit(flights):
    """Plot departure delay against arrival delay with a fitted line.

    A 1000-row sample of flights is scattered, the line returned by
    LeastSquares/FitLine is drawn over it, the figure is saved, and an
    OLS model of DEPARTURE_DELAY on ARRIVAL_DELAY over the same sample
    is passed to regression.SummarizeResults.

    flights: DataFrame with ARRIVAL_DELAY and DEPARTURE_DELAY columns
    """
    subset = thinkstats2.SampleRows(flights, 1000)
    xs = subset.ARRIVAL_DELAY
    ys = subset.DEPARTURE_DELAY

    intercept, gradient = thinkstats2.LeastSquares(xs, ys)
    line_xs, line_ys = thinkstats2.FitLine(xs, intercept, gradient)

    thinkplot.Scatter(xs, ys, color='gray', alpha=0.1)

    # the fit is drawn twice: a wider white stroke, then a blue one on top
    for stroke_color, stroke_width in (('white', 3), ('blue', 2)):
        thinkplot.Plot(line_xs, line_ys,
                       color=stroke_color, linewidth=stroke_width)

    save_options = dict(
        root='ArrivalDepartureDelayFit_linear1',
        xlabel='arrival delay (min)',
        ylabel='departure delay (min)',
        legend=False,
    )
    thinkplot.Save(**save_options)

    fitted = smf.ols('DEPARTURE_DELAY ~ ARRIVAL_DELAY', data=subset).fit()
    regression.SummarizeResults(fitted)
Exemplo n.º 3
0
def MakeArrivalDepartureDelayScatterPlots(flights):
    """Scatter departure delay against arrival delay and save the figure.

    A 10000-row sample of flights is read through
    GetArrivalDepartureDelay with hjitter=1.3 and wjitter=0.5, plotted
    with arrival delay on the x axis, and saved under the root
    'ArrivalDepartureDelayScatterplot'.

    flights: DataFrame handed to SampleRows
    """
    subset = thinkstats2.SampleRows(flights, 10000)

    # NOTE(review): the layout is prepared for two columns but only one
    # panel is drawn below -- confirm whether cols=2 is still wanted.
    thinkplot.PrePlot(cols=2)

    jitter = dict(hjitter=1.3, wjitter=0.5)
    dep_delays, arr_delays = GetArrivalDepartureDelay(subset, **jitter)

    thinkplot.Scatter(arr_delays, dep_delays, alpha=1)

    axis_options = dict(
        xlabel='arrival delay (min)',
        ylabel='departure delay (min)',
        legend=False,
    )
    thinkplot.Config(**axis_options)

    thinkplot.Save(root='ArrivalDepartureDelayScatterplot')
def main():
    """Run RunTests on samples of the live frame of shrinking size.

    The seed is set to 18, then seven samples are drawn from live: the
    first has len(live) rows and each later one has half as many
    (integer division) as the one before.
    """
    thinkstats2.RandomSeed(18)
    live, _firsts, _others = first.MakeFrames()

    full_size = len(live)
    # shifting right by k is the same as halving (floor) k times
    for halvings in range(7):
        subset = thinkstats2.SampleRows(live, full_size >> halvings)
        RunTests(subset)
Exemplo n.º 5
0
def ResampleRows(df):
    """Draw len(df) rows from df with replacement.

    df: DataFrame

    returns: DataFrame
    """
    row_count = len(df)
    resampled = thinkstats2.SampleRows(df, row_count, replace=True)
    return resampled
Exemplo n.º 6
0
def main():
    """Run the resample test, then RunTests on ever smaller samples.

    The seed is set to 23 before any sampling. After RunResampleTest on
    firsts and others, seven samples of live are tested: len(live) rows
    first, then half as many (integer division) each round.
    """
    # seed the generator up front; presumably so the samples repeat
    # from run to run
    thinkstats2.RandomSeed(23)

    live, firsts, others = first.MakeFrames()
    RunResampleTest(firsts, others)

    # len(live), len(live) // 2, len(live) // 4, ... (seven sizes)
    sizes = [len(live) >> halvings for halvings in range(7)]
    for size in sizes:
        RunTests(thinkstats2.SampleRows(live, size))
Exemplo n.º 7
0
   
    # compare total time spent between males and females (chi-squared)
    data = male_ds.Daily_Time_Spent.values, female_ds.Daily_Time_Spent.values
    ht = Totaltimespent(data)
    p4 = ht.PValue(iters=iters)
    
    print('\nn\tp1\tp2\tp3\tp4' )
    print('%d\t%0.2f\t%0.2f\t%0.2f\t%0.2f' % (n, p1, p2, p3, p4))

    

# Driver: run the resample test on the male and female subsets, print a
# legend for the result columns, then run RunTests on progressively
# smaller samples of advertisement_data.
RunResampleTest(male_ds, female_ds)
# legend for the n / p1..p4 columns (presumably the columns RunTests prints)
print('\nn-number of sample records\np1-p-value for total time spent on site\np2-p value for daily internet usage\np3-correlation pvalue \np4-Chi-squared p-value')
# start with as many rows as the full dataset and halve the size each round
n = len(advertisement_data)
for _ in range(8):
    sample = thinkstats2.SampleRows(advertisement_data, n)
    RunTests(sample)
    # integer halving; n stays a module-level name after the loop
    n //= 2






#########################################################################################################
############################# Section 6 -Multiple Regression Model  #####################################
#########################################################################################################

         
# Build and run a regression model with one dependent variable and multiple explanatory variables
Exemplo n.º 8
0
    ht = hypothesis.DiffMeansOneSided(data)
    p2 = ht.PValue(iters=iters)

    data = firsts_n.totalwgt_lb.values, others_n.totalwgt_lb.values
    ht = hypothesis.DiffMeansPermute(data)
    p3 = ht.PValue(iters=iters)

    data = df.agepreg.values, df.totalwgt_lb.values
    ht = hypothesis.CorrelationPermute(data)
    p4 = ht.PValue(iters=iters)

    data = firsts_n.prglngth.values, others_n.prglngth.values
    ht = hypothesis.PregLengthTest(data)
    p5 = ht.PValue(iters=iters)

    data = firsts_n.totalwgt_lb.values, others_n.totalwgt_lb.values
    ht = hypothesis.PregLengthTest(data)
    p6 = ht.PValue(iters=iters)

    print('%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n' %
          (n, p1, p2, p3, p4, p5, p6))


# Driver: load the frames, then run RunTests on samples of live whose
# size is halved on every round (eight rounds in total).
live, firsts, others = first.MakeFrames()

# first round samples as many rows as live contains
n = len(live)
for _ in range(8):
    live_n = thinkstats2.SampleRows(live, n)
    # second argument is presumably the iteration count -- TODO confirm
    # against the RunTests signature
    RunTests(live_n, 5000)
    # integer halving; n stays a module-level name after the loop
    n //= 2
Exemplo n.º 9
0
    live, firsts, others = nsfg2.MakeFrames()

    ## test difference in pregnancy lengths
    data_length = firsts.prglngth.values, others.prglngth.values
    ht = DiffMeansPermute(data_length)
    pvalue = ht.PValue()
    print('pvalue:\n', pvalue)
    ht.PlotCdf()
    thinkplot.Show(xlabel='test statistic', ylabel='CDF')

    ## evaluate p value vs. sample size
    ns = np.logspace(2, 12, num=50, base=2, dtype=int)
    pvalues = []

    for n in ns:
        firsts_sub = thinkstats2.SampleRows(firsts, n)
        others_sub = thinkstats2.SampleRows(others, n)
        data = firsts_sub.prglngth.values, others_sub.prglngth.values

        ht = DiffMeansPermute(data)
        pvalue = ht.PValue()
        pvalues.append(pvalue)
        print('sample size: %d \npvalue: %f\n' % (n, pvalue))

    thinkplot.Plot(ns, pvalues, '.')
    thinkplot.axhline(0.05)
    thinkplot.Show(xlabel='sample size', ylabel='p values')

    ## test difference in pregnancy lengths with resampling
    data_length = firsts.prglngth.values, others.prglngth.values
    ht = DiffMeansResample(data_length)
# Clara Garcia-Sanchez
# 12/08/2020
################################# DO NOT CHANGE #####################################
# some_file.py
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../common_functions/')
import numpy as np  #
import matplotlib.pyplot as plt  #
import pandas as pd  #
import thinkstats2
import brfss
from scipy import stats
#####################################################################################
# Load the BRFSS data (nrows=None presumably means no row limit -- confirm
# against brfss.ReadBrfss) and take a 5000-row sample of it.
df = brfss.ReadBrfss(nrows=None)
sample = thinkstats2.SampleRows(df, 5000)
# htm3 / wtkg2 are the sample columns used as heights and weights
heights, weights = sample.htm3, sample.wtkg2
print(heights)