def Render(self):
    """Evaluates the normal CDF of this object for plotting.

    The curve is rendered from three standard deviations below
    self.mu to three standard deviations above it.

    returns: pair of sequences (xs, ys)
    """
    center = self.mu
    spread = self.sigma
    half_width = 3 * spread

    xs, ys = thinkstats2.RenderNormalCdf(
        center, spread, center - half_width, center + half_width)
    return xs, ys
def MakeNormalModel(arrivalDelays, low=0, high=12.5):
    """Plots the CDF of arrival delays with a normal model.

    This is a modified copy from analytic.py.

    arrivalDelays: sequence of arrival delays
    low: left end of the range over which the model CDF is rendered
    high: right end of the range over which the model CDF is rendered

    The defaults (0 and 12.5) are the values that used to be hard-coded
    here; pass a different range when they do not cover the data.

    Prints the estimated mean, variance and sigma, then saves the
    figure under the root name 'NormalModel_arrivaldelay_model'.
    """
    # estimate parameters: trimming outliers yields a better fit
    mu, var = thinkstats2.TrimmedMeanVar(arrivalDelays, p=0.01)
    print('Mean, Var', mu, var)

    # plot the model
    sigma = math.sqrt(var)
    print('Sigma', sigma)
    xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=low, high=high)
    thinkplot.Plot(xs, ps, label='model', color='0.8')

    # plot the data; the model curve is drawn before PrePlot is called,
    # as in the original
    cdf = thinkstats2.Cdf(arrivalDelays, label='data')
    thinkplot.PrePlot(1)
    thinkplot.Cdf(cdf)

    thinkplot.Save(root='NormalModel_arrivaldelay_model',
                   title='Arrival Delays',
                   xlabel='arrival delays (min)',
                   ylabel='CDF')
def PlotNormalModel(sample, title="", xlabel=""):
    """Plots the CDF of a sample together with a fitted normal model.

    sample: sequence of values
    title: string title passed to thinkplot.Config
    xlabel: string x-axis label passed to thinkplot.Config
    """
    actual_cdf = thinkstats2.Cdf(sample, label="actual")

    # fit the model with TrimmedMeanVar, p=0.01
    mu, var = thinkstats2.TrimmedMeanVar(sample, p=0.01)
    sigma = np.sqrt(var)

    # render the model four standard deviations either side of the mean
    lower = mu - 4.0 * sigma
    upper = mu + 4.0 * sigma
    model_xs, model_ys = thinkstats2.RenderNormalCdf(mu, sigma, lower, upper)

    thinkplot.Cdf(actual_cdf)

    model_label = r'model $\mu$={:.2f} $\sigma$={:.2f}'.format(mu, sigma)
    thinkplot.plot(model_xs, model_ys, label=model_label)

    thinkplot.Config(title=title, xlabel=xlabel, ylabel="CDF")
def MakeNormalModel(data, label):
    """Plots the CDF of a sample on top of a fitted normal model.

    data: sequence of values
    label: string label for the empirical CDF

    Prints the sample size and the estimated mean and std.
    """
    empirical = thinkstats2.Cdf(data, label=label)

    # estimate the model parameters with TrimmedMeanVar
    mean, var = thinkstats2.TrimmedMeanVar(data)
    std = np.sqrt(var)
    print('n, mean, std', len(data), mean, std)

    # the model spans four standard deviations on each side of the mean
    half_width = 4 * std
    model_xs, model_ps = thinkstats2.RenderNormalCdf(
        mean, std, mean - half_width, mean + half_width)

    # wide grey model curve first, data on top
    thinkplot.Plot(model_xs, model_ps, label='model', linewidth=4, color='0.8')
    thinkplot.Cdf(empirical)
def MakeFigures():
    """Plots the CDF of populations in several forms.

    On a log-log scale the tail of the CCDF looks like a straight line,
    which suggests a Pareto distribution, but that turns out to be
    misleading.

    On a log-x scale the distribution has the characteristic sigmoid of
    a lognormal distribution.

    The normal probability plot of log(sizes) confirms that the data fit
    the lognormal model very well.

    Many phenomena that have been described with Pareto models can be
    described as well, or better, with lognormal models.

    Reads the data with ReadData(), prints the number of entries and
    saves two figures: 'populations_pareto' and 'populations_normal'.
    """
    pops = ReadData()
    print('Number of cities/towns', len(pops))

    # every plot below works on the log10 populations; the CDF of the raw
    # populations was built here before but never used, so it is gone
    log_pops = np.log10(pops)
    cdf_log = thinkstats2.Cdf(log_pops, label='data')

    # pareto plot: model and data as CCDFs, with a log y axis
    xs, ys = thinkstats2.RenderParetoCdf(xmin=5000, alpha=1.4,
                                         low=0, high=1e7)
    thinkplot.Plot(np.log10(xs), 1 - ys, label='model', color='0.8')

    thinkplot.Cdf(cdf_log, complement=True)
    thinkplot.Config(xlabel='log10 population',
                     ylabel='CCDF',
                     yscale='log')
    thinkplot.Save(root='populations_pareto')

    # lognormal plot: normal model fitted to the log10 populations
    thinkplot.PrePlot(cols=2)

    mu, sigma = log_pops.mean(), log_pops.std()
    xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=0, high=8)
    thinkplot.Plot(xs, ps, label='model', color='0.8')

    thinkplot.Cdf(cdf_log)
    thinkplot.Config(xlabel='log10 population',
                     ylabel='CDF')

    # second panel: normal probability plot of the log10 populations
    thinkplot.SubPlot(2)
    thinkstats2.NormalProbabilityPlot(log_pops, label='data')
    thinkplot.Config(xlabel='z',
                     ylabel='log10 population',
                     xlim=[-5, 5])

    thinkplot.Save(root='populations_normal')
def MakeNormalModel(weights):
    """Draws the CDF of the weights over a fitted normal model.

    weights: sequence

    Prints the number of values and the estimated mean and std.
    """
    data_cdf = thinkstats2.Cdf(weights, label='weights')

    # model parameters come from TrimmedMeanVar
    mean, var = thinkstats2.TrimmedMeanVar(weights)
    std = math.sqrt(var)
    print('n, mean, std', len(weights), mean, std)

    # render the model out to four standard deviations on both sides
    reach = 4 * std
    left, right = mean - reach, mean + reach
    model = thinkstats2.RenderNormalCdf(mean, std, left, right)
    xs, ps = model

    thinkplot.Plot(xs, ps, label='model', linewidth=4, color='0.8')
    thinkplot.Cdf(data_cdf)
def MakeNormalModel(age):
    """Draws the CDF of a variable over a fitted normal model.

    age: sequence

    Prints the number of values and the estimated mean and std.
    """
    observed = thinkstats2.Cdf(age, label='variable')

    # model parameters come from TrimmedMeanVar
    mean, var = thinkstats2.TrimmedMeanVar(age)
    std = np.sqrt(var)
    print('n, mean, std', len(age), mean, std)

    # model range: four standard deviations below and above the mean
    span = 4 * std
    bounds = (mean - span, mean + span)
    xs, ps = thinkstats2.RenderNormalCdf(mean, std, bounds[0], bounds[1])

    thinkplot.Plot(xs, ps, label='model', linewidth=4, color='0.8')
    thinkplot.Cdf(observed)
def MakeNormalCdf():
    """Plots normal CDFs for three parameter pairs and saves the figure.

    The figure is saved under the root name 'analytic_normal_cdf'.
    """
    thinkplot.PrePlot(3)

    # (mu, sigma) pairs, one curve each
    params = [(1.0, 0.5), (2.0, 0.4), (3.0, 0.3)]

    for mu, sigma in params:
        curve_xs, curve_ps = thinkstats2.RenderNormalCdf(
            mu=mu, sigma=sigma, low=-1.0, high=4.0)
        curve_label = r'$\mu=%g$, $\sigma=%g$' % (mu, sigma)
        thinkplot.Plot(curve_xs, curve_ps, label=curve_label)

    thinkplot.Save(root='analytic_normal_cdf',
                   title='Normal CDF',
                   xlabel='x',
                   ylabel='CDF',
                   loc=2)
def MakeFigures(df):
    """Plots the CDF of income in several forms.

    df: object with `income` and `ps` columns (their `.values` are used)

    Saves three figures: 'hinc_linear', 'hinc_pareto' and 'hinc_normal'.
    """
    incomes = df.income.values
    probs = df.ps.values

    income_cdf = SmoothCdf(incomes, probs, label='data')
    log_income_cdf = SmoothCdf(np.log10(incomes), probs, label='data')

    # linear plot
    thinkplot.Cdf(income_cdf)
    thinkplot.Save(root='hinc_linear',
                   xlabel='household income',
                   ylabel='CDF')

    # pareto plot
    # the model parameters were chosen by hand to fit the tail
    pareto_xs, pareto_ys = thinkstats2.RenderParetoCdf(
        xmin=55000, alpha=2.5, low=0, high=250000)
    thinkplot.Plot(pareto_xs, 1 - pareto_ys, label='model', color='0.8')

    thinkplot.Cdf(income_cdf, complement=True)
    thinkplot.Save(root='hinc_pareto',
                   xlabel='log10 household income',
                   ylabel='CCDF',
                   xscale='log',
                   yscale='log')

    # lognormal plot
    # mu and sigma are estimated with percentile-based statistics
    median = log_income_cdf.Percentile(50)
    upper_quartile = log_income_cdf.Percentile(75)
    lower_quartile = log_income_cdf.Percentile(25)
    iqr_based_std = (upper_quartile - lower_quartile) / 1.349

    # the IQR-based estimate above is not used: std is chosen by hand
    # to match the upper tail
    std = 0.35
    print(median, std)

    normal_xs, normal_ps = thinkstats2.RenderNormalCdf(
        median, std, low=3.5, high=5.5)
    thinkplot.Plot(normal_xs, normal_ps, label='model', color='0.8')

    thinkplot.Cdf(log_income_cdf)
    thinkplot.Save(root='hinc_normal',
                   xlabel='log10 household income',
                   ylabel='CDF')
def MakeNormalModel(weights):
    """Draws the birth weight CDF against a normal model and saves it.

    weights: sequence of birth weights

    Prints the estimated mean, variance and sigma; the figure is saved
    under the root name 'analytic_birthwgt_model'.
    """
    # trimming outliers before estimating yields a better fit
    estimate = thinkstats2.TrimmedMeanVar(weights, p=0.01)
    mu, var = estimate
    print('Mean, Var', mu, var)

    sigma = math.sqrt(var)
    print('Sigma', sigma)

    # model curve, rendered over 0 to 12.5 and drawn first
    model_xs, model_ps = thinkstats2.RenderNormalCdf(
        mu, sigma, low=0, high=12.5)
    thinkplot.Plot(model_xs, model_ps, label='model', color='0.8')

    # empirical CDF of the data
    data_cdf = thinkstats2.Cdf(weights, label='data')
    thinkplot.PrePlot(1)
    thinkplot.Cdf(data_cdf)

    save_options = dict(root='analytic_birthwgt_model',
                        title='Birth weights',
                        xlabel='birth weight (lbs)',
                        ylabel='CDF')
    thinkplot.Save(**save_options)
thinkplot.Cdf(cdf) thinkplot.Show(xlabel='Parts per Million', ylabel='CDF') #plotting a complementary CDF (CCDF) of O3 thinkplot.Cdf(cdf, complement=True) thinkplot.Show(xlabel='minutes', ylabel='CCDF', yscale='log') #normal CDF with a range of parameters thinkplot.PrePlot(3) mus = [1.0, 2.0, 3.0] #should change to my own numbers instead sigmas = [0.5, 0.4, 0.3] for mu, sigma in zip(mus, sigmas): xs, ps = thinkstats2.RenderNormalCdf(mu=mu, sigma=sigma, low=-1.0, high=4.0) label = r'$\mu=%g$, $\sigma=%g$' % (mu, sigma) thinkplot.Plot(xs, ps, label=label) thinkplot.Config(title='Normal CDF', xlabel='x', ylabel='CDF', loc='upper left') thinkplot.Show() #Scatterplots thinkplot.Scatter(grp_pollution_df['NO2AQI'], grp_pollution_df['SO2AQI'], alpha=1) thinkplot.Config(xlabel='NO2 & SO2 AQI',
yscale='log', loc='upper right') #%% [markdown] # ## Normal distribution # # Here's what the normal CDF looks like with a range of parameters. #%% thinkplot.PrePlot(3) mus = [1.0, 2.0, 3.0] sigmas = [0.5, 0.4, 0.3] for mu, sigma in zip(mus, sigmas): xs, ps = thinkstats2.RenderNormalCdf(mu=mu, sigma=sigma, low=-1.0, high=4.0) label = r'$\mu=%g$, $\sigma=%g$' % (mu, sigma) thinkplot.Plot(xs, ps, label=label) thinkplot.Config(title='Normal CDF', xlabel='x', ylabel='CDF', loc='upper left') #%% [markdown] # I'll use a normal model to fit the distribution of birth weights from the NSFG. #%% preg = nsfg.ReadFemPreg() weights = preg.totalwgt_lb.dropna()
# CCDF of the log10 populations with a Pareto model whose parameters
# (xmin, alpha) are fixed by hand.
thinkplot.Cdf(log_cdf, complement=True)
xmin = 5000
alpha = 1.4
xs, ys = thinkstats2.RenderParetoCdf(xmin=xmin, alpha=alpha, low=0, high=1.0e7)
# the model is rendered in population units, so its xs are converted to
# log10 before plotting against the log10 data
thinkplot.Plot(np.log10(xs), 1 - ys,
               label=r'model $x_m={}$ $\alpha={}$'.format(xmin, alpha))
thinkplot.Config(yscale='log', xlabel='log10 population', ylabel='CCDF')

#%%
# CDF of the log10 populations with a normal model fitted with
# TrimmedMeanVar (p=0.01) and rendered over mu +/- 4 sigma.
# NOTE: xmin is reused here as the lower end of the model range.
thinkplot.Cdf(log_cdf)
mu, var = thinkstats2.TrimmedMeanVar(log_pops, p=0.01)
sigma = np.sqrt(var)
xmin = mu - 4.0 * sigma
xmax = mu + 4.0 * sigma
xs, ys = thinkstats2.RenderNormalCdf(mu, sigma, xmin, xmax)
thinkplot.plot(xs, ys,
               label=r'model $\mu$={:.2f} $\sigma$={:.2f}'.format(mu, sigma))
thinkplot.Config(xlabel='log10 population', ylabel='CDF')

#%%
PlotNormalProbability(log_pops, ylabel="log10 population")

#%% [markdown]
# ## 5.6 random

#%%
import math
max(filmsdata.budget) # In[444]: min(filmsdata.budget) # In[445]: # plot the model sigma = np.sqrt(var) print('Sigma', sigma) xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=7000, high=380000000) thinkplot.Plot(xs, ps, label='model', color='0.6') cdf = thinkstats2.Cdf(filmsdata.budget, label='data') thinkplot.PrePlot(1) thinkplot.Cdf(cdf) thinkplot.Config(title='Film Budgets Normal Distribution', xlabel='Film Budgets', ylabel='CDF') # Next, I will observe the relationship between profit and budget. I will begin by plotting a scatterplot of these two variables. # In[446]:
def CDFVisualDist(cdf):
    """Shows a 2x3 grid of diagnostic plots for a CDF.

    The panels are, in order: linear CDF, lognormal model, Pareto
    transform, exponential model, normal model and Weibull transform.

    cdf: Cdf object; its xs and ps attributes and its NaNMean and NaNVar
         methods are used
    """
    values, probs = cdf.xs, cdf.ps

    # set up subplots
    PrePlot(num=6, cols=3, rows=2)

    # panel 1: linear plot
    SubPlot(1)
    Cdf(cdf, color='C0')
    Config(xlabel='x', ylabel='CDF', title='Linear Plot')

    # panel 2: lognormal plot, a normal model fitted to log10(x) using
    # the median and an IQR-derived std
    SubPlot(2)
    log_values = np.log10(values)
    log_cdf = thinkstats2.Cdf(log_values, probs, label='data')
    log_median = log_cdf.Percentile(50)
    log_iqr = thinkstats2.IQRFromCDF(log_cdf)
    log_std = thinkstats2.StdFromIQR(log_iqr)
    # infinite entries are excluded when picking the model range
    log_low = np.nanmin(log_values[log_values != -np.inf])
    log_high = np.nanmax(log_values[log_values != np.inf])
    lognorm_xs, lognorm_ps = thinkstats2.RenderNormalCdf(
        log_median, log_std, low=log_low, high=log_high)
    Plot(lognorm_xs, lognorm_ps, label='model', color='0.8')
    Cdf(log_cdf, color='C0')
    Config(xlabel='log10 x', ylabel='CDF', title='Lognormal Plot')

    # panel 3: pareto plot; Cdf returns the axis options for Config
    SubPlot(3)
    pareto_scale = Cdf(cdf, transform='pareto', color='C0')
    Config(xlabel='x', ylabel='CCDF', title='Pareto Plot', **pareto_scale)

    # panel 4: exponential plot, model rate is 1 / mean
    SubPlot(4)
    mean = cdf.NaNMean()
    rate = 1 / mean
    expo_low = np.nanmin(values[values != -np.inf])
    expo_high = np.nanmax(values[values != np.inf])
    expo_xs, expo_ps = thinkstats2.RenderExpoCdf(rate, expo_low, expo_high)
    Plot(expo_xs, 1 - expo_ps, label='model', color='0.8')
    expo_scale = Cdf(cdf, transform='exponential', color='C0')
    Config(xlabel='x', ylabel='CCDF', title='Exponential Plot', **expo_scale)

    # panel 5: normal plot, model spans mean +/- 4 std
    SubPlot(5)
    std = np.sqrt(cdf.NaNVar())
    norm_low = mean - 4 * std
    norm_high = mean + 4 * std
    norm_xs, norm_ps = thinkstats2.RenderNormalCdf(
        mean, std, norm_low, norm_high)
    Cdf(cdf, color='C0')
    Plot(norm_xs, norm_ps, label='model', linewidth=4, color='0.8')
    Config(xlabel='x', ylabel='CDF', title='Normal Plot')

    # panel 6: weibull plot
    SubPlot(6)
    weibull_scale = Cdf(cdf, transform='weibull', color='C0')
    Config(title='weibull transform',
           xlabel='log x',
           ylabel='log log CCDF',
           **weibull_scale)

    Show()