def main():
    """Prints summary statistics for an interpolated log-income sample and plots its CDF.

    Reads the data with hinc.ReadData(), builds a sample with
    InterpolateSample(..., log_upper=6.0), then reports the median, both
    skewness measures, the mean and the CDF probability evaluated at the
    mean, before showing the CDF with thinkplot.
    """
    income_table = hinc.ReadData()
    sample = InterpolateSample(income_table, log_upper=6.0)
    cdf = thinkstats2.Cdf(sample)

    median = thinkstats2.Median(sample)
    print("median", median)

    gp = thinkstats2.PearsonMedianSkewness(sample)
    print("pearson's median skewness", gp)

    g1 = thinkstats2.Skewness(sample)
    print("skewness", g1)

    print("mean", cdf.Mean())
    print(
        "the higher our log_upper, the more right-skewed (according to g_1) "
        "or at least less left-skewed (according to g_p) things get"
    )
    print("the mean moves to the right a bit, too.")

    # Probability mass of the CDF evaluated at its own mean.
    below_mean = cdf.Prob(cdf.Mean())
    print("proportion of the population with income < mean", below_mean)
    print("the higher the upper bound, the greater the proprtion below the mean.")

    thinkplot.Cdf(cdf)
    thinkplot.Show(xlabel='household income', ylabel='CDF')
def PearsonMedianSkewness(xs):
    """Computes Pearson's median skewness: 3 * (mean - median) / std.

    xs: sequence of values

    The mean is taken as the first raw moment and the standard deviation
    as the square root of the second central moment.

    returns: float skewness coefficient
    """
    middle = thinkstats2.Median(xs)
    center = RawMoment(xs, 1)
    spread = math.sqrt(CentralMoment(xs, 2))
    return 3 * (center - middle) / spread
def Summarize(data):
    """Prints mean, std, median and two skewness measures of data.

    data: object providing .mean() and .std() (e.g. a pandas Series)

    returns: (mean, median) tuple
    """
    center = data.mean()
    spread = data.std()
    middle = thinkstats2.Median(data)

    for label, value in (('mean', center), ('std', spread), ('median', middle)):
        print(label, value)

    g1 = thinkstats2.Skewness(data)
    print('skewness', g1)

    gp = thinkstats2.PearsonMedianSkewness(data)
    print('pearson skewness', gp)

    return center, middle
def Summarize(data):
    """Reports location, spread and skewness of a data set.

    Prints, in order: mean, standard deviation, median, skewness and
    Pearson's median skewness.

    data: pandas Series

    returns: tuple of (mean, median)
    """
    avg = data.mean()
    print_rows = [
        ('mean', avg),
        ('std', data.std()),
        ('median', thinkstats2.Median(data)),
    ]
    for name, stat in print_rows:
        print(name, stat)

    # The skewness measures are computed only after the basic statistics
    # have been printed, matching the original output/evaluation order.
    print('skewness', thinkstats2.Skewness(data))
    print('pearson skewness', thinkstats2.PearsonMedianSkewness(data))

    return avg, print_rows[2][1]
def Experiment1(n=6, m=1000):
    """Compares the sample mean and sample median as estimators of mu.

    Draws m samples of size n from a Gaussian with mu=0 and sigma=1,
    records the mean and the median of each sample, and prints the RMSE
    of each estimator relative to mu.

    n: sample size
    m: number of simulated samples
    """
    mu = 0
    sigma = 1

    means = []
    medians = []
    for _ in range(m):
        xs = [random.gauss(mu, sigma) for _ in range(n)]
        means.append(numpy.mean(xs))
        medians.append(thinkstats2.Median(xs))

    # Single pre-formatted argument: valid on Python 3 (the old print
    # statement is a SyntaxError there) and produces the same text as
    # the Python 2 statement `print label, value`.
    print('rmse xbar %s' % (RMSE(means, mu),))
    print('rmse median %s' % (RMSE(medians, mu),))
def Experiment3(n=7, m=1000):
    """Compares two estimators of the rate of an exponential distribution.

    Draws m samples of size n with random.expovariate(lam), lam=2, and
    for each sample computes
        L  = 1 / mean(xs)
        Lm = log(2) / median(xs)
    then prints the RMSE and the mean error of both estimators relative
    to lam.

    n: sample size
    m: number of simulated samples
    """
    lam = 2

    # Despite the names, these hold the estimates L and Lm, not raw
    # sample means/medians.
    means = []
    medians = []
    for _ in range(m):
        xs = [random.expovariate(lam) for _ in range(n)]
        L = 1 / numpy.mean(xs)
        Lm = math.log(2) / thinkstats2.Median(xs)
        means.append(L)
        medians.append(Lm)

    # Single pre-formatted argument: valid on Python 3 (the old print
    # statement is a SyntaxError there) and produces the same text as
    # the Python 2 statement `print label, value`.
    print('rmse L %s' % (RMSE(means, lam),))
    print('rmse Lm %s' % (RMSE(medians, lam),))
    print('mean error L %s' % (MeanError(means, lam),))
    print('mean error Lm %s' % (MeanError(medians, lam),))
def describe_inc_dist(log_upper):
    """Prints summary statistics of the income distribution for a given upper bound.

    Builds a sample with hinc2.InterpolateSample(df, log_upper=log_upper),
    converts it with 10 ** sample, and prints the mean, median, skewness,
    Pearson median skewness and the CDF probability at the mean.

    log_upper: value forwarded to InterpolateSample as its log_upper argument

    Note: reads the name `df` from the enclosing (module) scope.
    """
    # Use the argument itself; the body previously read an unrelated name
    # `j`, so the parameter was ignored (or a NameError was raised).
    log_sample = hinc2.InterpolateSample(df, log_upper=log_upper)
    incomes = np.power(10, log_sample)

    inc_mean = thinkstats2.Mean(incomes)
    inc_med = thinkstats2.Median(incomes)
    inc_skew = thinkstats2.Skewness(incomes)
    inc_pearskew = thinkstats2.PearsonMedianSkewness(incomes)

    print('log_upper = ', log_upper)
    print('Mean Income: ', inc_mean)
    print('Median Income: ', inc_med)
    print('Skewness: ', inc_skew)
    print('Pearson Median Skewness: ', inc_pearskew)

    cdf = thinkstats2.Cdf(incomes)
    inc_below_mean = cdf.Prob(inc_mean)
    print('Pct. below mean: ', inc_below_mean)
    print('\n')
def SimulateSampleExpo(lam=2.0, n=10, iters=1000):
    """Simulates repeated exponential samples and estimates lam from each.

    Every iteration draws n values with np.random.exponential(1.0 / lam, n)
    and records two estimates of lam: 1 / mean and log(2) / median.

    lam: float, used as 1.0 / lam for the scale of the draws
    n: sample size
    iters: number of iterations

    return: Ls - estimates of lam based on the mean
            Lms - estimates of lam based on the median
    """
    pairs = []
    for _ in range(iters):
        sample = np.random.exponential(1.0 / lam, n)
        from_mean = 1 / np.mean(sample)
        from_median = np.log(2) / thinkstats2.Median(sample)
        pairs.append((from_mean, from_median))

    Ls = [pair[0] for pair in pairs]
    Lms = [pair[1] for pair in pairs]
    return Ls, Lms
def Estimate3(n=7, iters=1000):
    """Evaluates mean-based and median-based estimators of an exponential rate.

    With lam fixed at 2, draws iters samples of size n using
    np.random.exponential(1.0 / lam, n), estimates lam from each sample as
    1 / mean and as log(2) / median, and prints RMSE and MeanError of both
    sets of estimates relative to lam.

    n: int sample size
    iters: int number of iterations

    return: None
    """
    lam = 2
    scale = 1.0 / lam

    mean_based = []
    median_based = []
    for _ in range(iters):
        sample = np.random.exponential(scale, n)
        mean_based.append(1 / np.mean(sample))
        median_based.append(np.log(2) / thinkstats2.Median(sample))

    rmse_mean = RMSE(mean_based, lam)
    print('RMSE(means, lam):\n', rmse_mean)

    rmse_median = RMSE(median_based, lam)
    print('RMSE(medians, lam):\n', rmse_median)

    bias_mean = MeanError(mean_based, lam)
    print('MeanError(means, lam):\n', bias_mean)

    bias_median = MeanError(median_based, lam)
    print('MeanError(medians, lam):\n', bias_median)
# Script fragment: three pieces of an exercise script run together on the line below.
#   1. For each n in n_arr: calls SimulateSample(lam, n, 1000) and passes the result to
#      SampleDistrPLot(lams, n, lam); thinkplot.Config then sets the labels
#      'L estimate' / 'CDF', the title, xlim=[0, 4] and the legend.
#      NOTE(review): the original indentation is lost, so whether thinkplot.Config
#      runs inside the loop or once after it cannot be determined from here.
#   2. "Chapter6 Ex1": reads data with hinc.ReadData(), builds a sample with
#      hinc2.InterpolateSample(df, log_upper=6.0), converts it with np.power(10, ...),
#      and prints the mean, median, skewness, Pearson median skewness and
#      income_cdf.Prob(mean) * 100.
#   3. "Chapter8 Ex3": SimulateGame(lam) adds random.expovariate(lam) draws to t and
#      leaves the loop once t > 1.
#      NOTE(review): `goals` is initialised but never updated or returned in the
#      visible text -- the definition appears to be truncated; confirm against the
#      full source before relying on it.
for n in n_arr: lams = SimulateSample(lam, n, 1000) SampleDistrPLot(lams, n, lam) thinkplot.Config(xlabel='L estimate', ylabel='CDF', title='Sampling distribution', xlim=[0, 4], legend=True) #--- Chapter6 Ex1 df = hinc.ReadData() log_sample = hinc2.InterpolateSample(df, log_upper=6.0) sample = np.power(10, log_sample) print('Mean = ', sample.mean()) print('Median =', thinkstats2.Median(sample)) print('Skewness =', thinkstats2.Skewness(sample)) print('Pearson Median Skweness =', thinkstats2.PearsonMedianSkewness(sample)) income_cdf = thinkstats2.Cdf(sample) print(income_cdf.Prob(sample.mean()) * 100) #--- Chapter8 Ex3 def SimulateGame(lam): t = 0 goals = 0 while True: time_int = random.expovariate(lam) t += time_int if t > 1: break
# Fragment: tail of one function, the MakePdfs definition, and a driver script.
#   1. Splits `preg` into rows with agepreg >= 30 (greq) and agepreg < 30 (less),
#      asserts the group sizes are 2635 and 10606, and returns both frames.
#      NOTE(review): the `return` shows this is a function body whose `def` line is
#      not visible here; presumably MakeFrames (called below) -- confirm.
#   2. MakePdfs(greq, less): builds a thinkstats2.EstimatedPdf from each group's
#      totalwgt_lb column (NaNs dropped) and plots them side by side in a 1x2
#      thinkplot layout, labelled 'greater/equal to 30' and 'less than 30'.
#   3. Driver: calls MakeFrames() and MakePdfs(), then prints the skewness, mean and
#      median of totalwgt_lb for both groups.
#      NOTE(review): the driver uses Python 2 print statements, which are a
#      SyntaxError on Python 3.
greq = preg[preg.agepreg >= 30] less = preg[preg.agepreg < 30] assert len(greq) == 2635 assert len(less) == 10606 return greq, less def MakePdfs(greq, less): greqpdf = thinkstats2.EstimatedPdf(greq.totalwgt_lb.dropna()) lesspdf = thinkstats2.EstimatedPdf(less.totalwgt_lb.dropna()) thinkplot.PrePlot(rows=1, cols=2) thinkplot.SubPlot(1) thinkplot.Pdf(greqpdf, label='greater/equal to 30') thinkplot.Config(xlabel='Birth weight (lbs)', ylabel='PDF') thinkplot.SubPlot(2) thinkplot.Pdf(lesspdf, label='less than 30') thinkplot.Config(xlabel='Birth weight (lbs)', ylabel='PDF') thinkplot.Show() greq, less = MakeFrames() MakePdfs(greq, less) print "greater/equal to 30 skew:", thinkstats2.Skewness(greq.totalwgt_lb.dropna()) print "less than 30 skew:", thinkstats2.Skewness(less.totalwgt_lb.dropna()) print "greater/equal to 30 mean:", thinkstats2.Mean(greq.totalwgt_lb.dropna()) print "greater/equal to 30 median:", thinkstats2.Median(greq.totalwgt_lb.dropna()) print "less than 30 mean:", thinkstats2.Mean(less.totalwgt_lb.dropna()) print "less than 30 median:", thinkstats2.Median(less.totalwgt_lb.dropna())