def Summarize(estimates, actual=None):
    """Prints the mean, standard error and 90% confidence interval.

    The value labelled SE is thinkstats2.Std of the estimates with
    `actual` passed as mu.

    estimates: sequence of estimates
    actual: value forwarded to thinkstats2.Std as mu (default None)
    """
    center = thinkstats2.Mean(estimates)
    spread = thinkstats2.Std(estimates, mu=actual)
    interval = thinkstats2.Cdf(estimates).ConfidenceInterval(90)
    print('mean: ', center, '\nSE: ', spread, '\nCI: ', interval)
def Summarize(estimates, actual=None):
    """Prints the mean, standard error and 90% confidence interval.

    All three values are printed on a single line after the label
    'mean, SE, CI'.

    estimates: sequence of estimates
    actual: float actual value, passed to thinkstats2.Std as mu
    """
    # The tuple is evaluated left to right: Mean, then Std, then the CI.
    summary = (
        thinkstats2.Mean(estimates),
        thinkstats2.Std(estimates, mu=actual),
        thinkstats2.Cdf(estimates).ConfidenceInterval(90),
    )
    print('mean, SE, CI', *summary)
def PlotSamplingDistributions(live):
    """Plots confidence bands for the fitted line and the slope's sampling CDF.

    Fits birth weight against mother's age, prints summary statistics,
    and saves three figures via thinkplot.Save with roots 'linear3',
    'linear5' and 'linear4'.

    live: DataFrame (the columns agepreg and totalwgt_lb are read)
    """
    xs = live.agepreg
    ys = live.totalwgt_lb

    # Fit on the full data set and report goodness-of-fit numbers.
    inter, slope = thinkstats2.LeastSquares(xs, ys)
    residuals = thinkstats2.Residuals(xs, ys, inter, slope)
    coef_det = thinkstats2.CoefDetermination(ys, residuals)

    print('rho', thinkstats2.Corr(xs, ys))
    print('R2', coef_det)
    print('R', math.sqrt(coef_det))
    print('Std(ys)', thinkstats2.Std(ys))
    print('Std(res)', thinkstats2.Std(residuals))

    # Axis labels shared by the two age-vs-weight figures.
    age_weight_labels = dict(xlabel='age (years)',
                             ylabel='birth weight (lbs)')

    # Figure 'linear3': 90% and 50% confidence bands, each with a text tag.
    inters, slopes = SamplingDistributions(live, iters=1001)
    band_specs = [
        (90, 0.3, '90% CI', 7.53, '90%'),
        (50, 0.5, '50% CI', 7.59, '50%'),
    ]
    for percent, alpha, label, text_y, text in band_specs:
        PlotConfidenceIntervals(xs, inters, slopes,
                                percent=percent, alpha=alpha, label=label)
        thinkplot.Text(42, text_y, text)

    thinkplot.Save(root='linear3', legend=False, **age_weight_labels)

    # Figure 'linear5': scatter plot with the bands drawn with and
    # without the residuals passed in.
    thinkplot.PrePlot(2)
    thinkplot.Scatter(xs, ys, color='gray', alpha=0.1)
    PlotConfidenceIntervals(xs, inters, slopes, res=residuals, alpha=0.2)
    PlotConfidenceIntervals(xs, inters, slopes)
    thinkplot.Save(root='linear5',
                   title='90% CI',
                   axis=[10, 45, 0, 15],
                   legend=False,
                   **age_weight_labels)

    # Sampling distribution of the slope versus the null-hypothesis
    # distribution produced by SlopeTest.
    slope_cdf = thinkstats2.Cdf(slopes)
    print('p-value, sampling distribution', slope_cdf[0])

    slope_test = SlopeTest((xs, ys))
    print('p-value, slope test', slope_test.PValue())

    print('inter', inter, thinkstats2.Mean(inters))
    Summarize(inters, inter)
    print('slope', slope, thinkstats2.Mean(slopes))
    Summarize(slopes, slope)

    # Figure 'linear4': both CDFs, with a vertical reference line at 0.
    thinkplot.PrePlot(2)
    thinkplot.Plot([0, 0], [0, 1], color='0.8')
    slope_test.PlotCdf(label='null hypothesis')
    thinkplot.Cdf(slope_cdf, label='sampling distribution')
    thinkplot.Save(root='linear4',
                   xlabel='slope (lbs / year)',
                   ylabel='CDF',
                   xlim=[-0.03, 0.03],
                   loc='upper left')
def Summarize(estimates, actual=None):
    """Prints the mean, standard error and 90% confidence interval.

    The mean and SE are formatted to three decimal places; the
    confidence interval is printed as returned by the Cdf.

    estimates: sequence of estimates
    actual: value forwarded to thinkstats2.Std as mu (default None)
    """
    template = 'mean: {:.3f} SE: {:.3f} CI: {}'
    center = thinkstats2.Mean(estimates)
    spread = thinkstats2.Std(estimates, mu=actual)
    interval = thinkstats2.Cdf(estimates).ConfidenceInterval(90)
    print(template.format(center, spread, interval))
thinkplot.Config(xlabel='height (cm)', ylabel='residual weight (kg)') #%% # calculate correlation and coefficient of determination rho = thinkstats2.Corr(heights, logWeight) r2 = thinkstats2.CoefDetermination(logWeight, res) # check if R^2 = rho^2 print("Correlation: {:.3f}".format(rho)) print("Coefficent of determination: {:.3f}".format(r2)) print("R^2 - rho^2: {:.3f}".format(rho**2 - r2)) #%% # calc standard deviation (RMSE) of prediction w/o height std_ys = thinkstats2.Std(logWeight) print("Standard deviation w/o height: {:.3f}".format(std_ys)) #%% # calc standard deviation (RMSE) of prediction w/ height std_res = thinkstats2.Std(res) print("Standard deviation w/ height: {:.3f}".format(std_res)) #%% # How does RMSE get impacted by height info print("Impact: {:.3f}".format(1 - (std_res / std_ys))) #%% # Resampling to compute inter and slope t = [] for _ in range(100):