def MakeFigures(df):
    """Save two two-panel figures comparing scatter-plot variants.

    Figure 'scatter1' is drawn from a 5000-row random sample of df: the
    first panel plots the values returned by GetHeightWeight with no
    jitter arguments, the second plots them with hjitter=1.3 and
    wjitter=0.5.

    Figure 'scatter2' re-plots those jittered sample values with
    alpha=0.1 in the first panel and draws a hexbin plot of the whole
    DataFrame (same jitter arguments) in the second.

    df: DataFrame handed to thinkstats2.SampleRows and GetHeightWeight
    """
    subset = thinkstats2.SampleRows(df, 5000)
    jitter = dict(hjitter=1.3, wjitter=0.5)

    # figure 1, panel 1: sampled values, no jitter requested
    thinkplot.PrePlot(cols=2)
    plain_heights, plain_weights = GetHeightWeight(subset)
    ScatterPlot(plain_heights, plain_weights)

    # figure 1, panel 2: same sample with jitter
    thinkplot.SubPlot(2)
    jit_heights, jit_weights = GetHeightWeight(subset, **jitter)
    ScatterPlot(jit_heights, jit_weights)

    thinkplot.Save(root='scatter1')

    # figure 2, panel 1: the jittered sample again, now mostly transparent
    thinkplot.PrePlot(cols=2)
    ScatterPlot(jit_heights, jit_weights, alpha=0.1)

    # figure 2, panel 2: hexbin over every row of df, not just the sample
    thinkplot.SubPlot(2)
    all_heights, all_weights = GetHeightWeight(df, **jitter)
    HexBin(all_heights, all_weights)

    thinkplot.Save(root='scatter2')
def PlotArrivalDepartureDelayFit(flights):
    """Plot departure delay against arrival delay with a least-squares line.

    Draws a 1000-row random sample of flights as a scatter plot, overlays
    the fitted line, saves the figure under the root
    'ArrivalDepartureDelayFit_linear1', and then fits the same relation
    with statsmodels OLS and prints a summary via
    regression.SummarizeResults.

    flights: DataFrame with ARRIVAL_DELAY and DEPARTURE_DELAY columns
    """
    subset = thinkstats2.SampleRows(flights, 1000)
    arrival = subset.ARRIVAL_DELAY
    departure = subset.DEPARTURE_DELAY

    # fit departure delay as a linear function of arrival delay
    intercept, gradient = thinkstats2.LeastSquares(arrival, departure)
    line_xs, line_ys = thinkstats2.FitLine(arrival, intercept, gradient)

    thinkplot.Scatter(arrival, departure, color='gray', alpha=0.1)
    # the same line is drawn twice: a wider white stroke under a
    # narrower blue one
    for stroke_color, stroke_width in (('white', 3), ('blue', 2)):
        thinkplot.Plot(line_xs, line_ys,
                       color=stroke_color, linewidth=stroke_width)

    save_options = dict(
        root='ArrivalDepartureDelayFit_linear1',
        xlabel='arrival delay (min)',
        ylabel='departure delay (min)',
        legend=False,
    )
    thinkplot.Save(**save_options)

    # repeat the fit with statsmodels to get the full regression summary
    fitted = smf.ols('DEPARTURE_DELAY ~ ARRIVAL_DELAY', data=subset).fit()
    regression.SummarizeResults(fitted)
def MakeArrivalDepartureDelayScatterPlots(flights):
    """Save a scatter plot of departure delay versus arrival delay.

    Takes a 10000-row random sample of flights, obtains the two delay
    series from GetArrivalDepartureDelay (called with hjitter=1.3 and
    wjitter=0.5), and saves the plot under the root
    'ArrivalDepartureDelayScatterplot'.

    flights: DataFrame handed to thinkstats2.SampleRows
    """
    subset = thinkstats2.SampleRows(flights, 10000)

    # a two-column layout is requested although only one panel is drawn
    thinkplot.PrePlot(cols=2)

    jitter = dict(hjitter=1.3, wjitter=0.5)
    departure, arrival = GetArrivalDepartureDelay(subset, **jitter)

    # arrival delay on x, departure delay on y; fully opaque markers
    thinkplot.Scatter(arrival, departure, alpha=1)

    axis_options = dict(
        xlabel='arrival delay (min)',
        ylabel='departure delay (min)',
        legend=False,
    )
    thinkplot.Config(**axis_options)
    thinkplot.Save(root='ArrivalDepartureDelayScatterplot')
def main():
    """Run RunTests on random samples of successively halved size.

    Seeds thinkstats2 with 18, loads the frames from first.MakeFrames,
    then calls RunTests seven times on samples of the live frame whose
    sizes are len(live), len(live) // 2, len(live) // 4, and so on.
    """
    thinkstats2.RandomSeed(18)

    # only the first frame is used here
    live, _firsts, _others = first.MakeFrames()
    total = len(live)

    for halvings in range(7):
        # integer-dividing by 2**k matches halving with // k times
        size = total // (2 ** halvings)
        RunTests(thinkstats2.SampleRows(live, size))
def ResampleRows(df):
    """Draw a with-replacement sample of df that has as many rows as df.

    df: DataFrame

    returns: whatever thinkstats2.SampleRows returns for
             (df, len(df), replace=True)
    """
    row_count = len(df)
    resampled = thinkstats2.SampleRows(df, row_count, replace=True)
    return resampled
def main():
    """Run the resample test, then RunTests on successively halved samples.

    Seeds thinkstats2 with 23, loads the frames from first.MakeFrames,
    calls RunResampleTest on the firsts/others frames, and finally calls
    RunTests seven times on samples of the live frame whose sizes are
    len(live), len(live) // 2, len(live) // 4, and so on.
    """
    # fixed seed, presumably so the random samples repeat between runs
    thinkstats2.RandomSeed(23)

    live, firsts, others = first.MakeFrames()
    RunResampleTest(firsts, others)

    total = len(live)
    for halvings in range(7):
        # integer-dividing by 2**k matches halving with // k times
        size = total // (2 ** halvings)
        RunTests(thinkstats2.SampleRows(live, size))
# compare total time spent between males and females (chi-squared) data = male_ds.Daily_Time_Spent.values, female_ds.Daily_Time_Spent.values ht = Totaltimespent(data) p4 = ht.PValue(iters=iters) print('\nn\tp1\tp2\tp3\tp4' ) print('%d\t%0.2f\t%0.2f\t%0.2f\t%0.2f' % (n, p1, p2, p3, p4)) RunResampleTest(male_ds, female_ds) print('\nn-number of sample records\np1-p-value for total time spent on site\np2-p value for daily internet usage\np3-correlation pvalue \np4-Chi-squared p-value') n = len(advertisement_data) for _ in range(8): sample = thinkstats2.SampleRows(advertisement_data, n) RunTests(sample) n //= 2 ######################################################################################################### ############################# Section 6 -Multiple Regression Model ##################################### ######################################################################################################### # Build and run a regression model with one dependent variable and multiple explanatory variables
ht = hypothesis.DiffMeansOneSided(data) p2 = ht.PValue(iters=iters) data = firsts_n.totalwgt_lb.values, others_n.totalwgt_lb.values ht = hypothesis.DiffMeansPermute(data) p3 = ht.PValue(iters=iters) data = df.agepreg.values, df.totalwgt_lb.values ht = hypothesis.CorrelationPermute(data) p4 = ht.PValue(iters=iters) data = firsts_n.prglngth.values, others_n.prglngth.values ht = hypothesis.PregLengthTest(data) p5 = ht.PValue(iters=iters) data = firsts_n.totalwgt_lb.values, others_n.totalwgt_lb.values ht = hypothesis.PregLengthTest(data) p6 = ht.PValue(iters=iters) print('%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n' % (n, p1, p2, p3, p4, p5, p6)) live, firsts, others = first.MakeFrames() n = len(live) for _ in range(8): live_n = thinkstats2.SampleRows(live, n) RunTests(live_n, 5000) n //= 2
live, firsts, others = nsfg2.MakeFrames()

# --- permutation test on pregnancy length: firsts vs. others ---
data_length = (firsts.prglngth.values, others.prglngth.values)
ht = DiffMeansPermute(data_length)
pvalue = ht.PValue()
print('pvalue:\n', pvalue)
ht.PlotCdf()
thinkplot.Show(xlabel='test statistic', ylabel='CDF')

# --- how the p-value changes with sample size ---
# 50 log-spaced sizes from 2**2 to 2**12, truncated to integers
ns = np.logspace(start=2, stop=12, num=50, base=2, dtype=int)
pvalues = []
for n in ns:
    # draw n rows from each group, firsts before others
    firsts_sub = thinkstats2.SampleRows(firsts, n)
    others_sub = thinkstats2.SampleRows(others, n)
    data = (firsts_sub.prglngth.values, others_sub.prglngth.values)
    ht = DiffMeansPermute(data)
    pvalue = ht.PValue()
    pvalues += [pvalue]
    print('sample size: %d \npvalue: %f\n' % (n, pvalue))

thinkplot.Plot(ns, pvalues, '.')
# horizontal reference line at the 0.05 level
thinkplot.axhline(0.05)
thinkplot.Show(xlabel='sample size', ylabel='p values')

# --- same comparison on the full groups, using the resampling test ---
data_length = (firsts.prglngth.values, others.prglngth.values)
ht = DiffMeansResample(data_length)
# Clara Garcia-Sanchez
# 12/08/2020
################################# DO NOT CHANGE #####################################
# some_file.py
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../common_functions/')
import numpy as np
# import matplotlib.pyplot as plt
# import pandas as pd
# import thinkstats2
import brfss
from scipy import stats
#####################################################################################

# thinkstats2.SampleRows is called below, but the import of thinkstats2 in the
# protected section above is commented out, so the script would stop with a
# NameError. Import it here, after the sys.path.insert above has run and
# without editing the protected section.
import thinkstats2

# Read the BRFSS data (nrows=None: presumably no row limit -- confirm against
# brfss.ReadBrfss), then work on a 5000-row random sample of it.
df = brfss.ReadBrfss(nrows=None)
sample = thinkstats2.SampleRows(df, 5000)

# htm3 / wtkg2 are the columns this script treats as heights and weights
heights, weights = sample.htm3, sample.wtkg2
print(heights)