def ComputeCorrelations(heights, weights):
    """Report correlations and a least squares fit of log weight on height.

    Prints the results of thinkstats2.Corr and thinkstats2.SpearmanCorr for
    heights and weights, thinkstats2.Corr for heights and log(weights), and
    the least squares fit of log(weights) on heights.  Each correlation is
    asserted against a hard-coded reference value, and sqrt(R^2) of the fit
    is asserted to match the correlation with log(weights).

    heights: sequence
    weights: sequence
    """
    rho = thinkstats2.Corr(heights, weights)
    assert almostEquals(rho, 0.508736478973)
    print('Pearson correlation (weights):', rho)

    log_wts = np.log(weights)
    rho_log = thinkstats2.Corr(heights, log_wts)
    assert almostEquals(rho_log, 0.531728260598)
    print('Pearson correlation (log weights):', rho_log)

    rank_rho = thinkstats2.SpearmanCorr(heights, weights)
    print('Spearman correlation (weights):', rank_rho)
    assert almostEquals(rank_rho, 0.541535836332)

    inter, slope = thinkstats2.LeastSquares(heights, log_wts)
    print('Least squares inter, slope (log weights):', inter, slope)

    residuals = thinkstats2.Residuals(heights, log_wts, inter, slope)
    coef_det = thinkstats2.CoefDetermination(log_wts, residuals)
    root_coef_det = math.sqrt(coef_det)
    print('Coefficient of determination:', coef_det)
    print('sqrt(R^2):', root_coef_det)

    # the square root of R^2 should reproduce the correlation computed above
    assert almostEquals(root_coef_det, rho_log)
def testCov(self):
    """Check Cov, Corr and SpearmanCorr against known values.

    Uses a list, the same values as a NumPy array (and its negation), and
    a second list holding the same ten integers in a different order.
    """
    values = [0, 4, 7, 3, 8, 1, 6, 2, 9, 5]
    arr = np.array(values)
    reordered = [5, 4, 3, 0, 8, 9, 7, 6, 2, 1]

    check = self.assertAlmostEqual

    # covariance of a sequence with itself and with its negation
    check(thinkstats2.Cov(values, arr), 8.25)
    check(thinkstats2.Cov(values, -arr), -8.25)

    # correlation
    check(thinkstats2.Corr(values, arr), 1)
    check(thinkstats2.Corr(values, -arr), -1)
    check(thinkstats2.Corr(values, reordered), -0.1878787878)

    # rank correlation
    check(thinkstats2.SpearmanCorr(values, -arr), -1)
    check(thinkstats2.SpearmanCorr(values, reordered), -0.1878787878)
def TestStatistic(self, data):
    """Computes the test statistic.

    data: tuple of xs and ys

    returns: absolute value of thinkstats2.Corr for the two sequences
    """
    xs, ys = data
    return abs(thinkstats2.Corr(xs, ys))
def Correlations(df):
    """Print covariance and correlations of df.htm3 and df.wtkg2.

    Each of covariance, correlation and Spearman correlation is computed
    with pandas and with thinkstats2 so the results can be compared side by
    side.  The Spearman and Pearson correlations of htm3 with log(wtkg2)
    are printed last.

    df: DataFrame with columns htm3 and wtkg2
    """
    heights = df.htm3
    weights = df.wtkg2

    print('pandas cov', heights.cov(weights))
    print('thinkstats2 Cov', thinkstats2.Cov(heights, weights))
    print()

    print('pandas corr', heights.corr(weights))
    print('thinkstats2 Corr', thinkstats2.Corr(heights, weights))
    print()

    print('pandas corr spearman', heights.corr(weights, method='spearman'))
    print('thinkstats2 SpearmanCorr',
          thinkstats2.SpearmanCorr(heights, weights))

    # The transformed column is wtkg2, so the labels name wtkg2.
    log_weights = np.log(weights)
    print('thinkstats2 SpearmanCorr log wtkg2',
          thinkstats2.SpearmanCorr(heights, log_weights))
    print()

    print('thinkstats2 Corr log wtkg2',
          thinkstats2.Corr(heights, log_weights))
    print()
def main(script):
    """Summarize and plot agepreg versus totalwgt_lb.

    Seeds the thinkstats2 random generator, drops rows missing either
    column, plots binned percentiles, prints the correlation and Spearman
    correlation, and saves a scatter plot as 'chap07scatter1'.

    script: not used in the body
    """
    thinkstats2.RandomSeed(17)

    # only the first of the three frames is used here
    live, _firsts, _others = first.MakeFrames()
    live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    BinnedPercentiles(live)

    ages, weights = live.agepreg, live.totalwgt_lb

    pearson = thinkstats2.Corr(ages, weights)
    print('thinkstats2 Corr', pearson)
    spearman = thinkstats2.SpearmanCorr(ages, weights)
    print('thinkstats2 SpearmanCorr', spearman)

    ScatterPlot(ages, weights, alpha=0.1)
    thinkplot.Save(root='chap07scatter1',
                   legend=False,
                   formats=['jpg'])
def PValue(xs, ys, n=10):
    """Estimate a p-value for the correlation of xs and ys by simulation.

    xs: sequence
    ys: sequence
    n: number of calls to SimulateNull

    returns: fraction of simulated correlations whose magnitude is at least
             that of the observed correlation
    """
    actual = thinkstats2.Corr(xs, ys)

    # SimulateNull is handed copies, so the caller's sequences are not the
    # objects it operates on
    xs_scratch, ys_scratch = list(xs), list(ys)
    simulated = [SimulateNull(xs_scratch, ys_scratch) for _ in range(n)]

    # what does the distribution of simulated correlations look like?

    extreme = sum(1 for corr in simulated if abs(corr) >= abs(actual))
    return extreme / float(n)
def main(name, data_dir='.'):
    """Plot the data from ReadData and print correlation statistics.

    Saves a scatter plot as 'correlate1', prints the Pearson and Spearman
    correlations, ten results of SimulateNull, and the result of PValue
    with 1000 iterations.

    name: not used in the body
    data_dir: directory passed to ReadData
    """
    xs, ys = ReadData(data_dir)

    thinkplot.Scatter(xs, ys, alpha=0.05)
    thinkplot.Save(root='correlate1',
                   xlabel='Age (years)',
                   ylabel='Birth weight (oz)',
                   axis=[9, 45, 0, 250])

    # Single-argument print(...) is valid on Python 3 and emits the same
    # text as the former print statements on Python 2.
    print('Pearson %s' % (thinkstats2.Corr(xs, ys),))
    print('Spearman %s' % (thinkstats2.SpearmanCorr(xs, ys),))

    # each simulation gets fresh copies of the data
    for _ in range(10):
        print(SimulateNull(list(xs), list(ys)))

    print(PValue(xs, ys, 1000))
def main():
    """Generate correlated pairs, print summary statistics and plot them.

    Seeds the random module, draws 1000 pairs from CorrelatedGenerator
    with rho = -0.8, prints mean/variance, covariance and correlations,
    and shows a scatter plot.
    """
    random.seed(17)

    rho = -0.8
    pairs = CorrelatedGenerator(1000, rho)
    xs, ys = zip(*pairs)

    # linear transform of xs; with a = 1 and b = 0 the values are unchanged
    a = 1.0
    b = 0.0
    xs = [a * x + b for x in xs]

    # Single-argument print(...) is valid on Python 3 and emits the same
    # text as the former print statements on Python 2.  Values are wrapped
    # in a 1-tuple so a tuple result is formatted as one item.
    print('mean, var of x %s' % (thinkstats2.MeanVar(xs),))
    print('mean, var of y %s' % (thinkstats2.MeanVar(ys),))

    print('covariance %s' % (thinkstats2.Cov(xs, ys),))
    print('Pearson corr %s' % (thinkstats2.Corr(xs, ys),))
    print('Spearman corr %s' % (thinkstats2.SpearmanCorr(xs, ys),))

    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
def ComputeLeastSquares(ages, weights):
    """Computes least squares fit for ages and weights.

    Prints the Pearson and Spearman correlations, the fitted intercept and
    slope, and the coefficient of determination, followed by a blank line.

    ages: sequence of x values
    weights: sequence of y values

    returns: tuple of (inter, slope, R2)
    """
    # Single-argument print(...) is valid on Python 3 and emits the same
    # text as the former print statements on Python 2.

    # compute the correlation between age and weight
    print('Pearson correlation %s' % (thinkstats2.Corr(ages, weights),))
    print('Spearman correlation %s'
          % (thinkstats2.SpearmanCorr(ages, weights),))

    # compute least squares fit
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    print('(inter, slope): %s %s' % (inter, slope))

    res = thinkstats2.Residuals(ages, weights, inter, slope)
    R2 = thinkstats2.CoefDetermination(weights, res)

    print('R^2 %s' % (R2,))
    # blank separator line (a bare `print` does nothing on Python 3)
    print('')
    return inter, slope, R2
def main():
    """Generate data with SatIqData, fit a line and plot the result.

    Seeds the random module, draws 1000 pairs with rho = 0.8, prints
    mean/variance, correlation, the least squares intercept and slope and
    R2, then shows the fitted line over a scatter plot.
    """
    random.seed(17)

    rho = 0.8
    xs, ys = SatIqData(1000, rho)

    # Single-argument print(...) is valid on Python 3 and emits the same
    # text as the former print statements on Python 2.  Values are wrapped
    # in a 1-tuple so a tuple result is formatted as one item.
    print('mean, var of x %s' % (thinkstats2.MeanVar(xs),))
    print('mean, var of y %s' % (thinkstats2.MeanVar(ys),))
    print('Pearson corr %s' % (thinkstats2.Corr(xs, ys),))

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print('inter %s' % (inter,))
    print('slope %s' % (slope,))

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print('R2 %s' % (R2,))

    thinkplot.Plot(fxs, fys, color='gray', alpha=0.2)
    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
def ComputeAirlineArrivalDelayCorrelations(flights):
    """Print covariance and correlations of airline code and arrival delay.

    Similar to Correlations() in scatter.py: each statistic is computed
    with pandas and with thinkstats2 and printed next to the other.

    flights: DataFrame with AIRLINE, AIRLINE_CODE and ARRIVAL_DELAY columns
    """
    flights = flights.dropna(subset=['AIRLINE', 'ARRIVAL_DELAY'])

    # NOTE(review): rows are filtered on AIRLINE, but the statistics below
    # use AIRLINE_CODE -- confirm the two columns are missing together.
    codes = flights.AIRLINE_CODE
    delays = flights.ARRIVAL_DELAY

    # covariance
    print('pandas cov', codes.cov(delays))
    print('thinkstats2 Cov', thinkstats2.Cov(codes, delays))
    print()

    # Pearson correlation
    print('pandas corr Pearson', codes.corr(delays))
    print('thinkstats2 Corr Pearson', thinkstats2.Corr(codes, delays))
    print()

    # Spearman rank correlation
    print('pandas corr spearman', codes.corr(delays, method='spearman'))
    print('thinkstats2 SpearmanCorr',
          thinkstats2.SpearmanCorr(codes, delays))
    print()
def SerialCorr(series, lag=1):
    """Correlation between a series and itself shifted by lag.

    series: object supporting slicing and .shift(), e.g. a pandas Series
    lag: number of positions to shift by

    returns: result of thinkstats2.Corr on the two aligned slices
    """
    # drop the first `lag` entries of both, where the shifted copy has no
    # counterpart from the original
    current = series[lag:]
    lagged = series.shift(lag)[lag:]
    return thinkstats2.Corr(current, lagged)
def TestStatistic(self, data):
    """Computes the test statistic.

    data: tuple of xs and ys

    returns: absolute value of thinkstats2.Corr for the two sequences
    """
    first_seq, second_seq = data
    return abs(thinkstats2.Corr(first_seq, second_seq))
def TestStatistic(self, data):
    """Computes the test statistic.

    data: tuple of xs and ys

    returns: result of ts2.Corr for the two sequences (sign preserved)
    """
    first_seq, second_seq = data
    return ts2.Corr(first_seq, second_seq)
import thinkstats2
import thinkplot
import first
import numpy as np

# keep only rows where both agepreg and totalwgt_lb are present
live, firsts, others = first.MakeFrames()
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])

# correlation and rank correlation of the two columns
rho = thinkstats2.Corr(live.agepreg, live.totalwgt_lb)
rho_s = thinkstats2.SpearmanCorr(live.agepreg, live.totalwgt_lb)

print("Pearson's Correlation, Mother's age and Birth weight: ", rho)
print("Spearman's Rank Correlation, Mother's age and Birth weight: ", rho_s)

# scatter plot, saved without a legend
thinkplot.LEGEND = False
thinkplot.Scatter(live.agepreg, live.totalwgt_lb)
#thinkplot.Show(xlabel = 'Mother\'s age', ylabel = 'Birth weight')
thinkplot.SaveFormat(root='age_weight_scatter',
                     fmt='png',
                     xlabel="Mother's age",
                     ylabel='Birth weight')
thinkplot.LEGEND = True

# group rows into age bins of width 2.5 starting at 10
bins = np.arange(10, 45, 2.5)
indices = np.digitize(live.agepreg, bins)
groups = live.groupby(indices)

# mean age and birth-weight Cdf of each group
ages = [group.agepreg.mean() for i, group in groups]
cdfs = [thinkstats2.Cdf(group.totalwgt_lb) for i, group in groups]

# one line per percentile of birth weight, plotted against mean age
for percent in [75, 50, 25]:
    weights = [cdf.Percentile(percent) for cdf in cdfs]
    label = '%dth' % percent
    thinkplot.Plot(ages, weights, label=label)
def PlotSamplingDistributions(live):
    """Plots confidence intervals for the fitted curve and sampling dists.

    Fits totalwgt_lb against agepreg by least squares, prints summary
    statistics of the fit, and saves three figures: 'linear3' (90% and 50%
    confidence intervals), 'linear5' (scatter plot with confidence
    intervals) and 'linear4' (CDFs of the slope under the null hypothesis
    and of the sampling distribution).

    live: DataFrame
    """
    ages = live.agepreg
    weights = live.totalwgt_lb
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    res = thinkstats2.Residuals(ages, weights, inter, slope)
    r2 = thinkstats2.CoefDetermination(weights, res)

    # summary statistics of the fit
    print('rho', thinkstats2.Corr(ages, weights))
    print('R2', r2)
    print('R', math.sqrt(r2))
    print('Std(ys)', thinkstats2.Std(weights))
    print('Std(res)', thinkstats2.Std(res))

    # plot the confidence intervals
    inters, slopes = SamplingDistributions(live, iters=1001)
    PlotConfidenceIntervals(ages, inters, slopes, percent=90,
                            alpha=0.3, label='90% CI')
    thinkplot.Text(42, 7.53, '90%')
    PlotConfidenceIntervals(ages, inters, slopes, percent=50,
                            alpha=0.5, label='50% CI')
    thinkplot.Text(42, 7.59, '50%')

    thinkplot.Save(root='linear3',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   legend=False)

    # plot the confidence intervals over a scatter plot of the data;
    # the first call passes the residuals, the second does not

    thinkplot.PrePlot(2)
    thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)
    PlotConfidenceIntervals(ages, inters, slopes, res=res, alpha=0.2)
    PlotConfidenceIntervals(ages, inters, slopes)

    thinkplot.Save(root='linear5',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   title='90% CI',
                   axis=[10, 45, 0, 15],
                   legend=False)

    # plot the sampling distribution of slope under null hypothesis
    # and alternate hypothesis
    sampling_cdf = thinkstats2.Cdf(slopes)
    # the Cdf is evaluated at slope 0
    print('p-value, sampling distribution', sampling_cdf[0])

    ht = SlopeTest((ages, weights))
    pvalue = ht.PValue()
    print('p-value, slope test', pvalue)

    # compare each fitted parameter with the mean of its sampled values
    print('inter', inter, thinkstats2.Mean(inters))
    Summarize(inters, inter)
    print('slope', slope, thinkstats2.Mean(slopes))
    Summarize(slopes, slope)

    thinkplot.PrePlot(2)
    # vertical reference line at slope 0
    thinkplot.Plot([0, 0], [0, 1], color='0.8')
    ht.PlotCdf(label='null hypothesis')
    thinkplot.Cdf(sampling_cdf, label='sampling distribution')

    thinkplot.Save(root='linear4',
                   xlabel='slope (lbs / year)',
                   ylabel='CDF',
                   xlim=[-0.03, 0.03],
                   loc='upper left')
# mean age and birth-weight Cdf for each group of rows
groups = live_ss.groupby(indices)
age_means = [g.agepreg.mean() for _, g in groups]
wgt_cdfs = [thinkstats2.Cdf(g.totalwgt_lb) for _, g in groups]

# one line per percentile, plotted against the mean age of each group
percentiles = [75, 50, 25]
thinkplot.PrePlot(len(percentiles))
for percent in percentiles:
    wgt_percentile = [cdf.Percentile(percent) for cdf in wgt_cdfs]
    label = '%dth' % percent
    thinkplot.Plot(age_means, wgt_percentile, label=label)

thinkplot.Config(xlabel='Mother age (years)',
                 ylabel='Birth weight (lbs)',
                 legend=True)

p_corr = thinkstats2.Corr(live_ss.agepreg, live_ss.totalwgt_lb)
s_corr = thinkstats2.SpearmanCorr(live_ss.agepreg, live_ss.totalwgt_lb)

print("Pearson's Correlation:", p_corr)
print("Spearman's Correlation:", s_corr)


#--- Chapter8 Ex2
def SimulateSample(lam=2, n=10, iters=1000):
    """Repeatedly estimate the rate of an exponential distribution.

    lam: rate parameter of the exponential distribution sampled from
    n: size of each sample
    iters: number of samples to draw

    returns: list of estimates, each 1 / (mean of one sample)
    """
    def estimate():
        # draw one sample with scale 1/lam and invert its mean
        sample = np.random.exponential(1.0 / lam, n)
        return 1 / np.mean(sample)

    return [estimate() for _ in np.arange(iters)]
def CorrelationPlots(df, xlabel, ylabel, xjitter=0, yjitter=0, axis=None, nbins=5, **options):
    """Draw a 2x2 grid of plots relating two columns and print correlations.

    The four subplots are a scatter plot, a hexbin plot, percentile lines of
    ylabel against binned xlabel, and CDFs of ylabel for each xlabel bin.
    The correlation and Spearman correlation of the (jittered) values are
    printed at the end.

    df: DataFrame
    xlabel: name of the column used for x; also used as the axis label
    ylabel: name of the column used for y; also used as the axis label
    xjitter: jitter passed to thinkstats2.Jitter for the x values
    yjitter: jitter passed to thinkstats2.Jitter for the y values
    axis: axis limits; defaults to [xmin, xmax, ymin, ymax] of the data
    nbins: controls the binning of x (see the review notes in the body)
    options: accepted but not used in the body
    """
    # rows missing either column are dropped before anything else
    cleaned = df.dropna(subset=[xlabel, ylabel])
    xs = cleaned[xlabel]
    ys = cleaned[ylabel]

    xs = thinkstats2.Jitter(xs, xjitter)
    ys = thinkstats2.Jitter(ys, yjitter)

    # limits are taken from the jittered values
    xmin, xmax = min(xs), max(xs)
    ymin, ymax = min(ys), max(ys)
    if axis is None:
        axis = [xmin, xmax, ymin, ymax]

    PrePlot(num=4, rows=2, cols=2)

    # make scatter plot
    SubPlot(1)
    Scatter(xs, ys, alpha=0.1, s=10)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # make HexBin plot
    SubPlot(2)
    HexBin(xs, ys)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # plot percentiles
    SubPlot(3)
    xs_cdf = thinkstats2.Cdf(xs)
    lower = xs_cdf.Percentile(1)
    upper = xs_cdf.Percentile(99)

    # NOTE(review): nbins is used here as the step between bin edges, not
    # as a number of bins -- confirm this is intended.
    bins = np.arange(lower, upper, nbins)
    # rows are grouped by the bin index of their jittered x value
    indices = np.digitize(xs, bins)
    groups = cleaned.groupby(indices)
    mean_xs = [group[xlabel].mean() for i, group in groups]
    cdfs = [thinkstats2.Cdf(group[ylabel]) for i, group in groups]

    for percent in [75, 50, 25]:
        y_percentiles = [cdf.Percentile(percent) for cdf in cdfs]
        label = '%dth' % percent
        Plot(mean_xs, y_percentiles, label=label)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=True)

    # plot CDFs
    # NOTE(review): here the step is derived from nbins instead; the divisor
    # is zero when nbins == 2 -- confirm callers never pass that.
    n = (upper - lower) // (nbins - 2)
    bins = np.arange(lower, upper, n)
    # this time the un-jittered column is binned
    indices = np.digitize(cleaned[xlabel], bins)
    groups = cleaned.groupby(indices)
    mean_xs = [group[xlabel].mean() for i, group in groups]
    cdfs = [thinkstats2.Cdf(group[ylabel]) for i, group in groups]

    ## plot the cdfs
    SubPlot(4)
    PrePlot(len(cdfs))
    for i, cdf in enumerate(cdfs):
        # first and last CDFs are labelled as open-ended ranges
        if i == 0:
            label = '<%d ' % bins[0] + xlabel
        elif i == len(cdfs) - 1:
            label = '>%d ' % bins[-1] + xlabel
        else:
            label = '%d - %d ' % (bins[i - 1], bins[i]) + xlabel
        Cdf(cdf, label=label)
    Config(xlabel=ylabel, ylabel='CDF', legend=True)

    #print statistics
    print('Correlation:\n', thinkstats2.Corr(xs, ys))
    print('Spearman Correlation Coefficient:\n', thinkstats2.SpearmanCorr(xs, ys))
return greq, less def SplitFrames(df): df = df.dropna(subset=['agepreg', 'totalwgt_lb']) age = df.agepreg wgt = df.totalwgt_lb return age, wgt def PlotScatter(age, wgt, xmin, xmax, ymin, ymax): thinkplot.Scatter(age, wgt, alpha=1.0) thinkplot.Config(xlabel='Age (Years)', ylabel='Birth Weight (lbs)', xlim=[xmin, xmax], ylim=[ymin, ymax], legend=False) thinkplot.Show() greq, less = MakeFrames() greqage, greqwgt = SplitFrames(greq) lessage, lesswgt = SplitFrames(less) PlotScatter(greqage, greqwgt, 30, 50, 0, 14) PlotScatter(lessage, lesswgt, 5, 30, 0, 14) print "Greq 30 Pearson's corr:", thinkstats2.Corr(greqage, greqwgt) print "Greq 30 Spearman corr:", thinkstats2.SpearmanCorr(greqage, greqwgt) print "Less 30 Pearson's corr:", thinkstats2.Corr(lessage, lesswgt) print "Less 30 Spearman corr:", thinkstats2.SpearmanCorr(lessage, lesswgt)
groups = data.groupby(indices)
# the first and last groups are dropped from both lists
means = [group.htm3.mean() for i, group in groups][1:-1]
cdfs = [thinkstats2.Cdf(group.residual) for i, group in groups][1:-1]

# plot the percentiles
for p in [75, 50, 25]:
    ys = [cdf.Percentile(p) for cdf in cdfs]
    label = str(p) + 'th'
    thinkplot.Plot(means, ys, label=label)

thinkplot.Config(xlabel='height (cm)', ylabel='residual weight (kg)')

#%%
# calculate correlation and coefficient of determination
rho = thinkstats2.Corr(heights, logWeight)
r2 = thinkstats2.CoefDetermination(logWeight, res)

# check if R^2 = rho^2
print("Correlation: {:.3f}".format(rho))
print("Coefficient of determination: {:.3f}".format(r2))
# the difference is computed in the order the label states
print("R^2 - rho^2: {:.3f}".format(r2 - rho**2))

#%%
# calc standard deviation (RMSE) of prediction w/o height
std_ys = thinkstats2.Std(logWeight)
print("Standard deviation w/o height: {:.3f}".format(std_ys))

#%%
# calc standard deviation (RMSE) of prediction w/ height
def SimulateNull(xs, ys):
    """Shuffle both sequences in place and return their correlation.

    The arguments are modified: callers that need the original order must
    pass copies.

    xs: mutable sequence
    ys: mutable sequence

    returns: result of thinkstats2.Corr on the shuffled sequences
    """
    for seq in (xs, ys):
        random.shuffle(seq)
    return thinkstats2.Corr(xs, ys)
cdf = thinkstats2.Cdf(df.Age)
thinkplot.Cdf(cdf)
thinkplot.Config(xlabel='Age', ylabel='CDF')

#plot normal distribution
# the model line uses the sample mean as intercept and the sample standard
# deviation as slope
mean = df.Age.mean()
std = df.Age.std()
xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, inter=mean, slope=std)
thinkplot.Plot(fxs, fys, color='gray', label='model')
xs, ys = thinkstats2.NormalProbability(df.Age)
thinkplot.Plot(xs, ys, label='Age')

#scatter plots and correlation
#year vs. age
year = thinkstats2.Jitter(df.Year, .25)
thinkplot.Scatter(year, df.Age)
thinkplot.Show(xlabel='Year', ylabel='Age')
# the correlation was previously computed and discarded; report it
print('Corr(Year, Age): %s' % (thinkstats2.Corr(df.Year, df.Age),))

#drug vs. age
thinkplot.Scatter(df.Age, df.Drug)
thinkplot.Show(xlabel='Age', ylabel='Drug')

#testing a difference in gender
data = male.Age.values, female.Age.values
ht = DiffMeansPermute(data)
pvalue = ht.PValue()
print(pvalue)
ht.PlotCdf()
thinkplot.Config(xlabel='test statistic', ylabel='CDF')
# bin the data
bins = np.arange(120, 200, 6)
indices = np.digitize(df.htm3, bins)
groups = df.groupby(indices)

# make cdfs (the first and last groups are left out)
height_means = [grp.htm3.mean() for _, grp in groups][1:-1]
cdfs = [thinkstats2.Cdf(grp.residual) for _, grp in groups][1:-1]

# make plot of percentiles
PlotPercentileLines(height_means,
                    cdfs,
                    xlabel='height(cm)',
                    ylabel='residual log_10 weight (log_10 kg)')

## calculate correlation
rho = thinkstats2.Corr(heights, log_weights)
print('rho:\n',rho)

## coefficient of determination
res = df.residual
r2 = CoefDetermination(log_weights, res)
print('r2:\n',r2)

## confirm that R^2 = rho^2
print('rho**2:\n',rho**2)
print('r2:\n',r2)

## Std(ys)
std_log_weights = Std(log_weights)
print('Std(log_weights):\n',std_log_weights)
std_res = Std(res)
print('Std(res):\n',std_res)

ratio = 1 - (std_res / std_log_weights)