def MakeGaussianModel(weights): """Plots a CDF with a Gaussian model. weights: sequence """ cdf = thinkstats2.Cdf(weights, label='weights') mean, var = thinkstats2.TrimmedMeanVar(weights) std = math.sqrt(var) print('n, mean, std', len(weights), mean, std) xmin = mean - 4 * std xmax = mean + 4 * std xs, ps = thinkstats2.RenderGaussianCdf(mean, std, xmin, xmax) thinkplot.Plot(xs, ps, label='model', linewidth=4, color='0.8') thinkplot.Cdf(cdf)
def MakeFigures(): """Plots the CDF of populations in several forms. On a log-log scale the tail of the CCDF looks like a straight line, which suggests a Pareto distribution, but that turns out to be misleading. On a log-x scale the distribution has the characteristic sigmoid of a lognormal distribution. The normal probability plot of log(sizes) confirms that the data fit the lognormal model very well. Many phenomena that have been described with Pareto models can be described as well, or better, with lognormal models. """ pops = ReadData() print('Number of cities/towns', len(pops)) log_pops = np.log10(pops) cdf = thinkstats2.Cdf(pops, label='data') cdf_log = thinkstats2.Cdf(log_pops, label='data') # pareto plot xs, ys = thinkstats2.RenderParetoCdf(xmin=5000, alpha=1.4, low=0, high=1e7) thinkplot.Plot(np.log10(xs), 1 - ys, label='model', color='0.8') thinkplot.Cdf(cdf_log, complement=True) thinkplot.Config(xlabel='log10 population', ylabel='CCDF', yscale='log') thinkplot.Save(root='populations_pareto') # lognormal plot thinkplot.PrePlot(cols=2) mu, sigma = log_pops.mean(), log_pops.std() xs, ps = thinkstats2.RenderGaussianCdf(mu, sigma, low=0, high=8) thinkplot.Plot(xs, ps, label='model', color='0.8') thinkplot.Cdf(cdf_log) thinkplot.Config(xlabel='log10 population', ylabel='CDF') thinkplot.SubPlot(2) thinkstats2.NormalProbabilityPlot(log_pops, label='data') thinkplot.Config(xlabel='z', ylabel='log10 population', xlim=[-5, 5]) thinkplot.Save(root='populations_normal')
def MakeFigures(df): """Plots the CDF of income in several forms. """ xs, ps = df.income.values, df.ps.values cdf = SmoothCdf(xs, ps, label='data') cdf_log = SmoothCdf(np.log10(xs), ps, label='data') # linear plot thinkplot.Cdf(cdf) thinkplot.Save(root='hinc_linear', xlabel='household income', ylabel='CDF') # pareto plot # for the model I chose parameters by hand to fit the tail xs, ys = thinkstats2.RenderParetoCdf(xmin=55000, alpha=2.5, low=0, high=250000) thinkplot.Plot(xs, 1 - ys, label='model', color='0.8') thinkplot.Cdf(cdf, complement=True) thinkplot.Save(root='hinc_pareto', xlabel='log10 household income', ylabel='CCDF', xscale='log', yscale='log') # lognormal plot # for the model I estimate mu and sigma using # percentile-based statistics median = cdf_log.Percentile(50) iqr = cdf_log.Percentile(75) - cdf_log.Percentile(25) std = iqr / 1.349 # choose std to match the upper tail std = 0.35 print(median, std) xs, ps = thinkstats2.RenderGaussianCdf(median, std, low=3.5, high=5.5) thinkplot.Plot(xs, ps, label='model', color='0.8') thinkplot.Cdf(cdf_log) thinkplot.Save(root='hinc_normal', xlabel='log10 household income', ylabel='CDF')
def MakeGaussianCdf(): """Generates a plot of the gaussian CDF.""" thinkplot.PrePlot(3) mus = [1.0, 2.0, 3.0] sigmas = [0.5, 0.4, 0.3] for mu, sigma in zip(mus, sigmas): xs, ps = thinkstats2.RenderGaussianCdf(mu=mu, sigma=sigma, low=-1.0, high=4.0) label = 'mu=%g, sigma=%g' % (mu, sigma) thinkplot.Plot(xs, ps, label=label) thinkplot.Save(root='analytic_gaussian_cdf', title='Gaussian CDF', xlabel='x', ylabel='CDF', loc=2)
def MakeGaussianModel(weights): """Plot the CDF of birthweights with a gaussian model.""" # estimate parameters: trimming outliers yields a better fit mu, var = thinkstats2.TrimmedMeanVar(weights, p=0.01) print('Mean, Var', mu, var) # plot the model sigma = math.sqrt(var) print('Sigma', sigma) xs, ps = thinkstats2.RenderGaussianCdf(mu, sigma, low=0, high=12.5) thinkplot.Plot(xs, ps, label='model', color='0.8') # plot the data cdf = thinkstats2.Cdf(weights, label='data') thinkplot.PrePlot(1) thinkplot.Cdf(cdf) thinkplot.Save(root='analytic_birthwgt_model', title='Birth weights', xlabel='birth weight (lbs)', ylabel='CDF')