def test_get_rdataset():
    """Fetch Duncan (carData) and check a repeated fetch is served from cache.

    The test is skipped when ``check_internet`` reports the probe URL as
    unavailable, or when the first download raises one of the listed
    network errors.
    """
    probe_url = ("https://raw.githubusercontent.com/vincentarelbundock/"
                 "Rdatasets/master/csv/datasets/cars.csv")
    if not check_internet(probe_url):
        pytest.skip('Unable to retrieve file - skipping test')

    try:
        first_fetch = get_rdataset("Duncan", "carData", cache=cur_dir)
    except (HTTPError, URLError, SSLError, timeout):
        pytest.skip('Failed with HTTPError or URLError, these are random')
    assert_(isinstance(first_fetch, utils.Dataset))

    # Same request again: this time the result must be flagged as cached.
    second_fetch = get_rdataset("Duncan", "carData", cache=cur_dir)
    assert_(second_fetch.from_cache)
def test_get_rdataset():
    """Download Duncan (carData), then verify a second request is cached.

    Skips when ``check_internet`` reports the probe URL as unavailable or
    when the download raises one of ``IGNORED_EXCEPTIONS``.
    """
    url = ("https://raw.githubusercontent.com/vincentarelbundock/"
           "Rdatasets/master/csv/datasets/cars.csv")
    reachable = check_internet(url)
    if not reachable:
        pytest.skip('Unable to retrieve file - skipping test')

    try:
        downloaded = get_rdataset("Duncan", "carData", cache=cur_dir)
    except IGNORED_EXCEPTIONS:
        pytest.skip('Failed with HTTPError or URLError, these are random')
    assert_(isinstance(downloaded, utils.Dataset))

    cached = get_rdataset("Duncan", "carData", cache=cur_dir)
    assert_(cached.from_cache)
def test_get_rdataset_write_read_cache():
    """Check that a downloaded dataset is written to and read from the cache.

    The first fetch of Guerry (HistData) must not be flagged ``from_cache``;
    the second one must be.  The two cache files are deleted afterwards even
    when an assertion fails or the test is skipped: a leftover file would make
    the ``from_cache is False`` check fail on every subsequent run.
    """
    cache_names = (
        "raw.githubusercontent.com,vincentarelbundock,Rdatasets,master,csv,"
        "HistData,Guerry.csv.zip",
        "raw.githubusercontent.com,vincentarelbundock,Rdatasets,master,doc,"
        "HistData,rst,Guerry.rst.zip",
    )
    cache_paths = [os.path.join(cur_dir, name) for name in cache_names]

    try:
        try:
            guerry = get_rdataset("Guerry", "HistData", cache=cur_dir)
        except (HTTPError, URLError, SSLError, timeout):
            pytest.skip('Failed with HTTPError or URLError, these are random')
        assert_(guerry.from_cache is False)

        guerry2 = get_rdataset("Guerry", "HistData", cache=cur_dir)
        assert_(guerry2.from_cache is True)

        # Previously a missing cache file surfaced as an os.remove error;
        # keep that check explicit now that removal is tolerant.
        for cached_file in cache_paths:
            assert_(os.path.exists(cached_file))
    finally:
        # Always leave the cache directory clean for the next run.
        for cached_file in cache_paths:
            if os.path.exists(cached_file):
                os.remove(cached_file)
def test_results_on_the_quakes_dataset(self):
    """
    Compare distance statistics on ``quakes`` with reference values from R.

    R code:
    ------

    > data("quakes")
    > x = quakes[1:50, 1:3]
    > y = quakes[51:100, 1:3]
    > dcov.test(x, y, R=200)

        dCov independence test (permutation test)

    data:  index 1, replicates 200
    nV^2 = 45046, p-value = 0.4577
    sample estimates:
        dCov
    30.01526
    """
    first_three_columns = get_rdataset("quakes").data.values[:, :3]
    x, y = first_three_columns[:50], first_three_columns[50:100]

    dstats = ddm.distance_statistics(x, y)

    assert_almost_equal(np.round(dstats.test_statistic), 45046, 0)
    assert_almost_equal(dstats.distance_correlation, 0.1894193, 4)
    assert_almost_equal(dstats.distance_covariance, 30.01526, 4)
    assert_almost_equal(dstats.dvar_x, 170.1702, 4)
    assert_almost_equal(dstats.dvar_y, 147.5545, 4)
    assert_almost_equal(dstats.S, 52265, 0)

    # Only the statistic and the reported method are checked; the middle
    # element of the returned triple is ignored.
    statistic, _, chosen_method = ddm.distance_covariance_test(x, y, B=199)

    assert_almost_equal(np.round(statistic), 45046, 0)
    assert chosen_method == "emp"
def test_results_on_the_iris_dataset(self):
    """
    Compare distance statistics on ``iris`` with reference values from R.

    R code example from the `energy` package documentation for
    `energy::distance_covariance.test`:

    > x <- iris[1:50, 1:4]
    > y <- iris[51:100, 1:4]
    > set.seed(1)
    > dcov.test(x, y, R=200)

        dCov independence test (permutation test)

    data:  index 1, replicates 200
    nV^2 = 0.5254, p-value = 0.9552
    sample estimates:
         dCov
    0.1025087
    """
    first_four_columns = get_rdataset("iris").data.values[:, :4]
    x, y = first_four_columns[:50], first_four_columns[50:100]

    dstats = ddm.distance_statistics(x, y)

    assert_almost_equal(dstats.test_statistic, 0.5254, 4)
    assert_almost_equal(dstats.distance_correlation, 0.3060479, 4)
    assert_almost_equal(dstats.distance_covariance, 0.1025087, 4)
    assert_almost_equal(dstats.dvar_x, 0.2712927, 4)
    assert_almost_equal(dstats.dvar_y, 0.4135274, 4)
    assert_almost_equal(dstats.S, 0.667456, 4)

    # Only the statistic and the reported method are checked; the middle
    # element of the returned triple is ignored.
    statistic, _, chosen_method = ddm.distance_covariance_test(x, y, B=199)

    assert_almost_equal(statistic, 0.5254, 4)
    assert chosen_method == "emp"
def test_get_rdataset_write_read_cache():
    """Check that a downloaded dataset is written to and read from the cache.

    The first fetch of Guerry (HistData) must not be flagged ``from_cache``;
    the second one must be.  The two cache files are deleted afterwards even
    when an assertion fails or the test is skipped: a leftover file would make
    the ``from_cache is False`` check fail on every subsequent run.
    """
    cache_names = (
        "raw.githubusercontent.com,vincentarelbundock,Rdatasets,master,csv,"
        "HistData,Guerry.csv.zip",
        "raw.githubusercontent.com,vincentarelbundock,Rdatasets,master,doc,"
        "HistData,rst,Guerry.rst.zip",
    )
    cache_paths = [os.path.join(cur_dir, name) for name in cache_names]

    try:
        try:
            guerry = get_rdataset("Guerry", "HistData", cache=cur_dir)
        except IGNORED_EXCEPTIONS:
            pytest.skip('Failed with HTTPError or URLError, these are random')
        assert_(guerry.from_cache is False)

        guerry2 = get_rdataset("Guerry", "HistData", cache=cur_dir)
        assert_(guerry2.from_cache is True)

        # Previously a missing cache file surfaced as an os.remove error;
        # keep that check explicit now that removal is tolerant.
        for cached_file in cache_paths:
            assert_(os.path.exists(cached_file))
    finally:
        # Always leave the cache directory clean for the next run.
        for cached_file in cache_paths:
            if os.path.exists(cached_file):
                os.remove(cached_file)
def pull_data_function():
    """Fetch the Boston (MASS) dataset and save a raw CSV copy.

    Progress messages are printed along the way.  The CSV is written to
    ``<path>/data/raw/input_raw.csv``, where ``path`` is a name defined
    outside this function.

    Returns
    -------
    The ``.data`` attribute of the fetched dataset.
    """
    print("Initiating data pull...")
    boston = dt.get_rdataset("Boston", "MASS")
    data = boston.data
    print("Data pull complete...")
    print()

    print("Saving...")
    destination = path + "/data/raw/input_raw.csv"
    data.to_csv(destination, header=True)
    print("Save complete!")

    return data
def test_get_rdataset():
    """Smoke test: Duncan (car) must load from the shipped cache on Python 2."""
    if PY3:
        # NOTE: there's no way to test both since the cached files were
        # created with Python 2.x, they're strings, but Python 3 expects
        # bytes and the index file path is hard-coded so both can't live
        # side by side
        return

    cached = get_rdataset("Duncan", "car", cache=cur_dir)
    assert_(cached.from_cache)
def test_get_rdataset():
    """Smoke test: Duncan (car) loads as a ``utils.Dataset`` from the cache.

    Raises ``SkipTest`` when ``check_internet`` reports the probe URL as
    unavailable.
    """
    probe_url = ("https://raw.githubusercontent.com/vincentarelbundock/"
                 "Rdatasets/master/csv/datasets/cars.csv")
    if not check_internet(probe_url):
        raise SkipTest('Unable to retrieve file - skipping test')

    dataset = get_rdataset("Duncan", "car", cache=cur_dir)
    for condition in (isinstance(dataset, utils.Dataset), dataset.from_cache):
        assert_(condition)
def test_get_rdataset():
    """Smoke-test ``get_rdataset`` and its cache write/read round trip.

    First loads Duncan (car), which must be a ``utils.Dataset`` served from
    the cache.  Then fetches Guerry (HistData) twice: the first result must
    not be flagged ``from_cache`` and the second must be.  The Guerry cache
    files are deleted afterwards even when an assertion or download fails;
    a leftover file would make the ``from_cache is False`` check fail on
    every subsequent run.

    Raises ``SkipTest`` when ``check_internet`` reports the probe URL as
    unavailable.
    """
    # smoke test
    test_url = ("https://raw.githubusercontent.com/vincentarelbundock/"
                "Rdatasets/master/csv/datasets/cars.csv")
    internet_available = check_internet(test_url)
    if not internet_available:
        raise SkipTest('Unable to retrieve file - skipping test')
    duncan = get_rdataset("Duncan", "car", cache=cur_dir)
    assert_(isinstance(duncan, utils.Dataset))
    assert_(duncan.from_cache)

    # test writing and reading cache
    cache_names = (
        "raw.github.com,vincentarelbundock,Rdatasets,master,csv,"
        "HistData,Guerry.csv.zip",
        "raw.github.com,vincentarelbundock,Rdatasets,master,doc,"
        "HistData,rst,Guerry.rst.zip",
    )
    cache_paths = [os.path.join(cur_dir, name) for name in cache_names]
    try:
        guerry = get_rdataset("Guerry", "HistData", cache=cur_dir)
        assert_(guerry.from_cache is False)
        guerry2 = get_rdataset("Guerry", "HistData", cache=cur_dir)
        assert_(guerry2.from_cache is True)

        # Previously a missing cache file surfaced as an os.remove error;
        # keep that check explicit now that removal is tolerant.
        for cached_file in cache_paths:
            assert_(os.path.exists(cached_file))
    finally:
        # Always leave the cache directory clean for the next run.
        for cached_file in cache_paths:
            if os.path.exists(cached_file):
                os.remove(cached_file)
def test_get_rdataset():
    """Smoke test: Duncan (car) loads as a ``utils.Dataset``.

    The ``from_cache`` flag is only checked on Python 2.  Raises ``SkipTest``
    when ``check_internet`` reports the probe URL as unavailable.
    """
    probe_url = ("https://raw.githubusercontent.com/vincentarelbundock/"
                 "Rdatasets/master/csv/datasets/cars.csv")
    if not check_internet(probe_url):
        raise SkipTest('Unable to retrieve file - skipping test')

    dataset = get_rdataset("Duncan", "car", cache=cur_dir)
    assert_(isinstance(dataset, utils.Dataset))

    if PY3:
        # NOTE: there's no way to test both since the cached files were
        # created with Python 2.x, they're strings, but Python 3 expects
        # bytes and the index file path is hard-coded so both can't live
        # side by side
        return
    assert_(dataset.from_cache)
def test_results_on_the_quakes_dataset(self):
    """
    Compare distance statistics on ``quakes`` with reference values from R.

    Skipped when fetching the dataset raises one of ``IGNORED_EXCEPTIONS``.

    R code:
    ------

    > data("quakes")
    > x = quakes[1:50, 1:3]
    > y = quakes[51:100, 1:3]
    > dcov.test(x, y, R=200)

        dCov independence test (permutation test)

    data:  index 1, replicates 200
    nV^2 = 45046, p-value = 0.4577
    sample estimates:
        dCov
    30.01526
    """
    try:
        table = get_rdataset("quakes").data.values[:, :3]
    except IGNORED_EXCEPTIONS:
        pytest.skip('Failed with HTTPError or URLError, these are random')

    # Force a float dtype before handing the two row blocks to ddm.
    x = np.asarray(table[:50], dtype=float)
    y = np.asarray(table[50:100], dtype=float)

    dstats = ddm.distance_statistics(x, y)
    assert_almost_equal(np.round(dstats.test_statistic), 45046, 0)

    # (attribute, reference value from R, decimals), checked in this order.
    expected = (
        ("distance_correlation", 0.1894193, 4),
        ("distance_covariance", 30.01526, 4),
        ("dvar_x", 170.1702, 4),
        ("dvar_y", 147.5545, 4),
        ("S", 52265, 0),
    )
    for attribute, reference, decimals in expected:
        assert_almost_equal(getattr(dstats, attribute), reference, decimals)

    statistic, _, chosen_method = ddm.distance_covariance_test(x, y, B=199)
    assert_almost_equal(np.round(statistic), 45046, 0)
    assert chosen_method == "emp"
def test_results_on_the_iris_dataset(self):
    """
    Compare distance statistics on ``iris`` with reference values from R.

    Skipped when fetching the dataset raises one of ``IGNORED_EXCEPTIONS``.

    R code example from the `energy` package documentation for
    `energy::distance_covariance.test`:

    > x <- iris[1:50, 1:4]
    > y <- iris[51:100, 1:4]
    > set.seed(1)
    > dcov.test(x, y, R=200)

        dCov independence test (permutation test)

    data:  index 1, replicates 200
    nV^2 = 0.5254, p-value = 0.9552
    sample estimates:
         dCov
    0.1025087
    """
    try:
        table = get_rdataset("iris").data.values[:, :4]
    except IGNORED_EXCEPTIONS:
        pytest.skip('Failed with HTTPError or URLError, these are random')

    # Force a float dtype before handing the two row blocks to ddm.
    x = np.asarray(table[:50], dtype=float)
    y = np.asarray(table[50:100], dtype=float)

    dstats = ddm.distance_statistics(x, y)

    # (attribute, reference value from R, decimals), checked in this order.
    expected = (
        ("test_statistic", 0.5254, 4),
        ("distance_correlation", 0.3060479, 4),
        ("distance_covariance", 0.1025087, 4),
        ("dvar_x", 0.2712927, 4),
        ("dvar_y", 0.4135274, 4),
        ("S", 0.667456, 4),
    )
    for attribute, reference, decimals in expected:
        assert_almost_equal(getattr(dstats, attribute), reference, decimals)

    statistic, _, chosen_method = ddm.distance_covariance_test(x, y, B=199)
    assert_almost_equal(statistic, 0.5254, 4)
    assert chosen_method == "emp"
def main():
    """Fit and plot two Bayesian survival models for the mastectomy data.

    Steps, in order:

    1. Load the ``mastectomy`` dataset (package ``HSAUR``) and recode the
       ``event`` and ``metastized`` columns as integers.
    2. Plot the censoring structure and a histogram of event times per
       3-month interval.
    3. Build the per-subject, per-interval ``death`` and ``exposure`` matrices.
    4. Sample a Poisson model with a single ``beta`` coefficient and plot
       cumulative hazard and survival curves.
    5. Sample a second model in which ``beta`` follows a Gaussian random walk
       over the intervals, and plot the same curves for it.

    Takes no arguments and returns ``None``; all output is figures (shown via
    ``plt.show()``) plus a final ``print('x')``.
    """
    # Load mastectomy dataset
    df = datasets.get_rdataset('mastectomy', 'HSAUR', cache=True).data
    # Change event to integer
    df.event = df.event.astype(np.int64)
    # Change metastized to integer (1 for yes, 0 for no)
    df.metastized = (df.metastized == 'yes').astype(np.int64)
    # Count the number of patients
    n_patients = df.shape[0]
    # Create array with one index per patient
    patients = np.arange(n_patients)

    # Censoring - we do not observe the death of every subject, and subjects
    # may still be alive at the end of the observation period
    # 1 - observation is not censored (death was observed)
    # 0 - observation is censored (death was not observed)
    # Fraction of uncensored observations; not used further in this function
    nonCensored = df.event.mean()

    # Create censoring plot
    fig, ax = plt.subplots(figsize=(8, 6))
    blue, _, red = sns.color_palette()[:3]
    # Create horizontal blue lines for censored observations
    ax.hlines(patients[df.event.values == 0], 0,
              df[df.event.values == 0].time, color=blue, label='Censored')
    # Create horizontal red lines for uncensored observations
    ax.hlines(patients[df.event.values == 1], 0,
              df[df.event.values == 1].time, color=red, label='Uncensored')
    # Mark the end time of every subject whose cancer had metastized
    ax.scatter(df[df.metastized.values == 1].time,
               patients[df.metastized.values == 1], color='k', zorder=10,
               label='Metastized')
    ax.set_xlim(left=0)
    ax.set_xlabel('Months since mastectomy')
    ax.set_yticks([])
    ax.set_ylabel('Subject')
    ax.set_ylim(-0.25, n_patients + 0.25)
    ax.legend(loc='center right')

    # To understand the impact of metastization on survival time, we use a
    # risk regression model
    # Cox proportional hazards model
    # Make intervals 3 months long
    interval_length = 3
    interval_bounds = np.arange(0, df.time.max() + interval_length + 1,
                                interval_length)
    n_intervals = interval_bounds.size - 1
    # Interval indices; not used further in this function
    intervals = np.arange(n_intervals)

    # Check how deaths and censored observations are distributed in intervals
    fig, ax = plt.subplots(figsize=(8, 6))
    # Plot histogram of uncensored events
    ax.hist(df[df.event == 1].time.values, bins=interval_bounds, color=red,
            alpha=0.5, lw=0, label='Uncensored')
    # Plot histogram of censored events
    ax.hist(df[df.event == 0].time.values, bins=interval_bounds, color=blue,
            alpha=0.5, lw=0, label='Censored')
    ax.set_xlim(0, interval_bounds[-1])
    ax.set_xlabel('Months since mastectomy')
    ax.set_yticks([0, 1, 2, 3])
    ax.set_ylabel('Number of observations')
    ax.legend()

    # Index of the interval containing each subject's last observed time; the
    # 0.01 offset puts a time lying exactly on a boundary into the earlier
    # interval
    last_period = np.floor((df.time - 0.01) / interval_length).astype(int)
    # Create an all-zero (patients x intervals) matrix to store deaths
    death = np.zeros((n_patients, n_intervals))
    # For each patient (row), store the event flag in the column of the last
    # observed interval
    death[patients, last_period] = df.event

    # Create matrix of the amount of time a subject (row) was at risk in an
    # interval (column): a full interval length wherever the interval starts
    # at or before the subject's time ...
    exposure = np.greater_equal.outer(df.time, interval_bounds[:-1]) * interval_length
    # ... then, in the last observed interval, only the time actually spent
    exposure[patients, last_period] = df.time - interval_bounds[last_period]

    # Define parameters for PyMC
    SEED = 5078864
    n_samples = 1000
    n_tune = 1000

    # Create PyMC model -> lambda(t) = lambda0(t) * e ^ (X*beta)
    with pm.Model() as model:
        # Define prior distribution of hazards as vague Gamma distribution
        lambda0 = pm.Gamma('lambda0', 0.01, 0.01, shape=n_intervals)
        # Define hazard regression coefficient (beta) for the metastized
        # covariate with a normal prior
        beta = pm.Normal('beta', 0, sd=1000)
        # Create equation for lambda(t) as a deterministic node - record
        # sampled values as part of output
        # T.outer = symbolic vector-vector outer product
        lambda_ = pm.Deterministic(
            'lambda_', T.outer(T.exp(beta * df.metastized), lambda0))
        # Mu is created from our lambda values (hazard) times patient
        # exposure per interval
        mu = pm.Deterministic('mu', exposure * lambda_)
        # The observed death indicators are modelled as Poisson with mean mu
        obs = pm.Poisson('obs', mu, observed=death)

    with model:
        trace = pm.sample(n_samples, tune=n_tune, random_seed=SEED)

    pm.traceplot(trace)
    # exp of the mean sampled beta, i.e. the factor the model applies to
    # lambda0 for subjects with metastized cancer; not used further in this
    # function
    hazardRate = np.exp(trace['beta'].mean())
    pm.plot_posterior(trace, varnames=['beta'], color='#87ceeb')
    pm.autocorrplot(trace, varnames=['beta'])

    # Store base hazard as well as metastized hazard for each sample per
    # interval (sample x number of intervals); the beta samples are turned
    # into a column so they multiply every interval of the same sample
    base_hazard = trace['lambda0']
    met_hazard = trace['lambda0'] * np.exp(np.atleast_2d(trace['beta']).T)

    # Calculate cumulative hazard along the last axis
    def cum_hazard(hazard):
        return (interval_length * hazard).cumsum(axis=-1)

    # Calculate survival as e^(-cumulative hazard)
    def survival(hazard):
        return np.exp(-cum_hazard(hazard))

    # Plot f(mean hazard) as a step line with a shaded percentile band
    # NOTE(review): despite the name, the band comes from np.percentile
    # (alpha/2 and 1 - alpha/2), not from a highest-posterior-density routine
    def plot_with_hpd(x, hazard, f, ax, color=None, label=None, alpha=0.05):
        # Use function f on hazard mean (mean taken over axis 0)
        mean = f(hazard.mean(axis=0))
        # Lower and upper percentiles of f(hazard) along axis 0
        percentiles = 100 * np.array([alpha / 2., 1. - alpha / 2.])
        hpd = np.percentile(f(hazard), percentiles, axis=0)
        ax.fill_between(x, hpd[0], hpd[1], color=color, alpha=0.25)
        ax.step(x, mean, color=color, label=label)

    # Create figure
    fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2, sharex=True,
                                             sharey=False, figsize=(16, 6))
    # Plot cumulative hazard with band for non-metastized cancer
    plot_with_hpd(interval_bounds[:-1], base_hazard, cum_hazard, hazard_ax,
                  color=blue, label='Had not metastized')
    # Plot cumulative hazard with band for metastized cancer
    plot_with_hpd(interval_bounds[:-1], met_hazard, cum_hazard, hazard_ax,
                  color=red, label='Metastized')
    hazard_ax.set_xlim(0, df.time.max())
    hazard_ax.set_xlabel('Months since mastectomy')
    hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$')
    hazard_ax.legend(loc=2)
    # Plot survival with band for non-metastized cancer
    plot_with_hpd(interval_bounds[:-1], base_hazard, survival, surv_ax,
                  color=blue)
    # Plot survival with band for metastized cancer
    plot_with_hpd(interval_bounds[:-1], met_hazard, survival, surv_ax,
                  color=red)
    surv_ax.set_xlim(0, df.time.max())
    surv_ax.set_xlabel('Months since mastectomy')
    surv_ax.set_ylabel('Survival function $S(t)$')
    fig.suptitle('Bayesian survival model')

    # Consider time varying effects
    with pm.Model() as time_varying_model:
        lambda0 = pm.Gamma('lambda0', 0.01, 0.01, shape=n_intervals)
        # Beta is now modeled as a normal random walk instead of a normal
        # distribution
        # This is due to the fact that the regression coefficients can vary
        # over time
        beta = GaussianRandomWalk('beta', tau=1., shape=n_intervals)
        # One beta per interval, so the outer product is taken inside exp
        lambda_ = pm.Deterministic(
            'h', lambda0 * T.exp(T.outer(T.constant(df.metastized), beta)))
        mu = pm.Deterministic('mu', exposure * lambda_)
        obs = pm.Poisson('obs', mu, observed=death)

    with time_varying_model:
        time_varying_trace = pm.sample(n_samples, tune=n_tune,
                                       random_seed=SEED)

    pm.traceplot(time_varying_trace)
    pm.plot_posterior(time_varying_trace, varnames=['beta'], color='#87ceeb')
    pm.forestplot(time_varying_trace, varnames=['beta'])

    # Create plot to show the mean trace of beta
    fig, ax = plt.subplots(figsize=(8, 6))
    # 2.5th and 97.5th percentiles of the sampled beta per interval
    beta_hpd = np.percentile(time_varying_trace['beta'], [2.5, 97.5], axis=0)
    beta_low = beta_hpd[0]
    beta_high = beta_hpd[1]
    # Fill percentile interval
    ax.fill_between(interval_bounds[:-1], beta_low, beta_high, color=blue,
                    alpha=0.25)
    # Create the mean estimate for beta from trace samples
    beta_hat = time_varying_trace['beta'].mean(axis=0)
    # Plot a stepwise line for beta_hat per interval
    ax.step(interval_bounds[:-1], beta_hat, color=blue)
    # Plot points where cancer was metastized, differentiating between death
    # and censorship
    ax.scatter(interval_bounds[last_period[(df.event.values == 1) & (df.metastized == 1)]],
               beta_hat[last_period[(df.event.values == 1) & (df.metastized == 1)]],
               c=red, zorder=10, label='Died, cancer metastized')
    ax.scatter(interval_bounds[last_period[(df.event.values == 0) & (df.metastized == 1)]],
               beta_hat[last_period[(df.event.values == 0) & (df.metastized == 1)]],
               c=blue, zorder=10, label='Censored, cancer metastized')
    ax.set_xlim(0, df.time.max())
    ax.set_xlabel('Months since mastectomy')
    ax.set_ylabel(r'$\beta_j$')
    ax.legend()

    # Store hazards of the time-varying model; beta already has one value per
    # interval here, so no transpose is applied
    tv_base_hazard = time_varying_trace['lambda0']
    tv_met_hazard = time_varying_trace['lambda0'] * np.exp(
        np.atleast_2d(time_varying_trace['beta']))

    # Plot cumulative hazard functions with and without time-varying effect
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.step(interval_bounds[:-1], cum_hazard(base_hazard.mean(axis=0)),
            color=blue, label='Had not metastized')
    ax.step(interval_bounds[:-1], cum_hazard(met_hazard.mean(axis=0)),
            color=red, label='Metastized')
    ax.step(interval_bounds[:-1], cum_hazard(tv_base_hazard.mean(axis=0)),
            color=blue, linestyle='--',
            label='Had not metastized (time varying effect)')
    ax.step(interval_bounds[:-1], cum_hazard(tv_met_hazard.mean(axis=0)),
            color=red, linestyle='--',
            label='Metastized (time varying effect)')
    # NOTE(review): the "- 4" trims the right edge of the x-axis; the reason
    # is not evident from this function
    ax.set_xlim(0, df.time.max() - 4)
    ax.set_xlabel('Months since mastectomy')
    ax.set_ylim(0, 2)
    ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$')
    ax.legend(loc=2)

    # Plot cumulative hazard and survival of the time-varying model with bands
    fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2, sharex=True,
                                             sharey=False, figsize=(16, 6))
    plot_with_hpd(interval_bounds[:-1], tv_base_hazard, cum_hazard, hazard_ax,
                  color=blue, label='Had not metastized')
    plot_with_hpd(interval_bounds[:-1], tv_met_hazard, cum_hazard, hazard_ax,
                  color=red, label='Metastized')
    hazard_ax.set_xlim(0, df.time.max())
    hazard_ax.set_xlabel('Months since mastectomy')
    hazard_ax.set_ylim(0, 2)
    hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$')
    hazard_ax.legend(loc=2)
    plot_with_hpd(interval_bounds[:-1], tv_base_hazard, survival, surv_ax,
                  color=blue)
    plot_with_hpd(interval_bounds[:-1], tv_met_hazard, survival, surv_ax,
                  color=red)
    surv_ax.set_xlim(0, df.time.max())
    surv_ax.set_xlabel('Months since mastectomy')
    surv_ax.set_ylabel('Survival function $S(t)$')
    fig.suptitle('Bayesian survival model with time varying effects')

    plt.show()
    print('x')
# hittersRegTree.py
# Code to plot figures 8.1 and 8.2
# How to find region boundaries (e.g., 4.5, 117.5) from tree estimator?
from statsmodels import datasets
from sklearn.tree import DecisionTreeRegressor, plot_tree
import numpy as np
import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-<name>"
# and later releases no longer accept the old names.  Try the current name
# first and fall back to the legacy one for older installations.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')

# Keep only the three columns used here and drop rows with any missing value.
hitters = datasets.get_rdataset('Hitters', 'ISLR').data
hitters_use = hitters[['Hits', 'Years', 'Salary']].copy()
hitters_use.dropna(how='any', inplace=True)

# Regression tree with three leaves fitted to log(Salary).
tree = DecisionTreeRegressor(max_leaf_nodes=3)
X = hitters_use[['Hits', 'Years']]
y = np.log(hitters_use['Salary'])
tree.fit(X, y)

fig, ax = plt.subplots()
plot_tree(tree, feature_names=['Hits', 'Years'], ax=ax)

# Plot decision tree regions and training data
xx = np.linspace(hitters_use['Years'].min(), hitters_use['Years'].max())
yy = np.linspace(hitters_use['Hits'].min(), hitters_use['Hits'].max())
x_grid, y_grid = np.meshgrid(xx, yy)
# The tree was fitted on columns (Hits, Years), so the Hits grid (y_grid)
# must come first when stacking the prediction inputs.
zz = tree.predict(np.vstack((y_grid.ravel(), x_grid.ravel())).T)
z_grid = zz.reshape(x_grid.shape)
# Figure 4.3 import matplotlib.pyplot as plt from statsmodels import datasets import pandas as pd import numpy as np default = datasets.get_rdataset('Default', 'ISLR').data default['balance_grp'] = pd.cut(default['balance'], bins=np.linspace(0, 2700, 10)) student_balance = default.loc[default['student'] == 'Yes', ['balance', 'balance_grp']].groupby( 'balance_grp').mean() student_defrate = default.loc[default['student'] == 'Yes'].groupby( 'balance_grp').apply(lambda x: np.sum(x['default'] == 'Yes') / x.shape[0]) student_defrate.name = 'default_rate' student_data = pd.merge(student_balance, student_defrate, left_index=True, right_index=True) notstudent_balance = default.loc[default['student'] == 'No', ['balance', 'balance_grp']].groupby( 'balance_grp').mean() notstudent_defrate = default.loc[default['student'] == 'No'].groupby( 'balance_grp').apply(lambda x: np.sum(x['default'] == 'Yes') / x.shape[0]) notstudent_defrate.name = 'default_rate' notstudent_data = pd.merge(notstudent_balance, notstudent_defrate, left_index=True,
# Plot figure 4.1
from statsmodels import datasets
import matplotlib.pyplot as plt
import numpy as np

credit = datasets.get_rdataset('Default', 'ISLR', cache=True)
credit_data = credit.data

# Random subset of 1000 distinct rows; only its non-defaulters are drawn,
# while defaulters are taken from the full table below.
sample_rows = np.random.choice(credit_data.shape[0], 1000, replace=False)
credit_sample = credit_data.iloc[sample_rows]

fig = plt.figure(figsize=(8, 4))
ax1 = fig.add_subplot(121)

sampled_non_defaulters = credit_sample.loc[credit_sample['default'] == 'No']
sampled_non_defaulters.plot(x='balance', y='income', alpha=0.5,
                            kind='scatter', ax=ax1)

all_defaulters = credit_data.loc[credit_data['default'] == 'Yes']
all_defaulters.plot(x='balance', y='income', marker='+', color='brown',
                    kind='scatter', alpha=0.9, ax=ax1)

ax1.set_xlabel('Balance')
ax1.set_ylabel('Income')

ax2 = fig.add_subplot(143)
# Figure 7.1
# Polynomial fit along with confidence intervals for Wage data
# Confidence intervals are drawn for ols regression only
# statsmodels does not provide confidence intervals for logistic regression
from statsmodels import datasets
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

wage = datasets.get_rdataset('Wage', 'ISLR').data

# Degree-4 polynomial of age fitted by ordinary least squares.
wage_model = smf.ols('wage ~ age + I(age ** 2) + I(age ** 3) + I(age ** 4)',
                     data=wage)
wage_fit = wage_model.fit()

# Compute the interval bounds once and reuse them for both columns instead
# of running the prediction twice (column 0 = lower, column 1 = upper).
wage_conf_int = wage_fit.get_prediction().conf_int()
res_df = pd.DataFrame({
    'age': wage['age'],
    'wage_fit': wage_fit.fittedvalues,
    'wage_lower': wage_conf_int[:, 0],
    'wage_upper': wage_conf_int[:, 1]
})
# Sort by age so the fitted curve and its bounds are drawn left to right.
res_df.sort_values('age', inplace=True)

fig = plt.figure(figsize=(7, 4))
ax1 = fig.add_subplot(121)
wage.plot(x='age', y='wage', kind='scatter', s=10, alpha=0.5, ax=ax1)
res_df.plot(x='age', y='wage_fit', c='k', ax=ax1)
res_df.plot(x='age', y='wage_lower', linestyle='--', c='r', ax=ax1)
# Plot figure 3.15
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.datasets as datasets
import statsmodels.formula.api as smf
import numpy as np

credit = datasets.get_rdataset('Credit', 'ISLR').data


def calcRSS(beta1, beta2, var1, var2, yvar, df):
    """Return the residual sum of squares for two fixed slope coefficients.

    The intercept is not passed in: it is set to the mean of
    ``df[yvar] - beta1 * df[var1] - beta2 * df[var2]``.
    """
    response = df[yvar]
    first, second = df[var1], df[var2]
    # Solve for intercept
    intercept = np.mean(response - beta1 * first - beta2 * second)
    residuals = response - intercept - beta1 * first - beta2 * second
    return np.sum(residuals**2)


age_limit_reg = smf.ols(formula='Balance ~ Limit + Age', data=credit)
age_limit_fit = age_limit_reg.fit()
# Coefficient table of the fit; the 'Std.Err.' column is read from it below.
age_limit_res = age_limit_fit.summary2().tables[1]

beta_limit = age_limit_fit.params['Limit']
beta_age = age_limit_fit.params['Age']
beta_limit_se = age_limit_res.loc['Limit', 'Std.Err.']
beta_age_se = age_limit_res.loc['Age', 'Std.Err.']

# Grid spanning four standard errors either side of each estimate.
beta_limit_xx = np.linspace(beta_limit - 4 * beta_limit_se,
                            beta_limit + 4 * beta_limit_se)
beta_age_xx = np.linspace(beta_age - 4 * beta_age_se,
                          beta_age + 4 * beta_age_se)
beta_limit_grid, beta_age_grid = np.meshgrid(beta_limit_xx, beta_age_xx)
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.datasets as datasets

credit = datasets.get_rdataset('Credit', package='ISLR').data

# Pairwise scatter plots of the selected Credit columns.
columns = ['Balance', 'Age', 'Cards', 'Education', 'Income', 'Limit', 'Rating']
axes = pd.plotting.scatter_matrix(credit[columns], alpha=0.6, s=5,
                                  figsize=(6, 6))

# Set every axis label and major tick label to size 7, in the same order as
# before: x labels, x tick labels, y labels, y tick labels.
for item in axes.ravel():
    plt.setp(item.xaxis.get_label(), 'size', 7)
for item in axes.ravel():
    plt.setp(item.xaxis.get_majorticklabels(), 'size', 7)
for item in axes.ravel():
    plt.setp(item.yaxis.get_label(), 'size', 7)
for item in axes.ravel():
    plt.setp(item.yaxis.get_majorticklabels(), 'size', 7)

plt.tight_layout()
def plantTraits():
    """Return the ``plantTraits`` data (package ``cluster``), index named "ID".

    The dataset is requested through ``datasets.get_rdataset`` with
    ``cache=True``.
    """
    dataset = datasets.get_rdataset("plantTraits", "cluster", cache=True)
    frame = dataset.data
    frame.index.name = "ID"
    return frame
# Figure 5.2
import numpy as np
from statsmodels import datasets
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import polyfit, polyval

auto = datasets.get_rdataset('Auto', 'ISLR').data

# Half of the rows (rounded down) go to training, the rest to testing.
n_obs = auto.shape[0]
n_train = int(n_obs / 2)
n_test = n_obs - n_train

np.random.seed(911)
train_ind = np.random.choice(range(n_obs), n_train, replace=False)
# Test rows are all positions that were not drawn for training.
test_ind = list(set(range(n_obs)).difference(set(train_ind)))

auto_train = auto.iloc[train_ind]
auto_test = auto.iloc[test_ind]

# Test-set mean squared error of mpg ~ polynomial(horsepower), degrees 1-10.
poly_degree = []
poly_mse = []
for p in range(1, 11):
    poly_fit = polyfit(auto_train['horsepower'], auto_train['mpg'], deg=p)
    mpg_test = polyval(auto_test['horsepower'], poly_fit)
    squared_error = (mpg_test - auto_test['mpg'])**2
    mse_test = np.mean(squared_error)
    poly_degree.append(p)
    poly_mse.append(mse_test)

fig = plt.figure(figsize=(8, 4))
ax1 = fig.add_subplot(121)
ax1.plot(poly_degree, poly_mse, marker='o', c='r')
datasets_df = read_csv(documentation_file, usecols=datasets_usecols)
logger.info(datasets_df.shape)

# Make sure every package has its own output folder.
packages = datasets_df['Package'].unique()
for package in packages:
    package_folder = data_folder + package
    if exists(package_folder):
        continue
    logger.info('creating output folder {}'.format(package_folder))
    mkdir(package_folder)

# Load each dataset from its pickle if present, otherwise download it and
# write the pickle for next time.
for index, row in datasets_df.iterrows():
    package_name, item_name = row['Package'], row['Item']
    logger.info('loading {} / {} data'.format(package_name, item_name))
    current_pickle = data_folder + package_name + '/' + item_name + '.pkl'
    if not exists(current_pickle):
        current_bundle = get_rdataset(item_name, package_name)
        with open(current_pickle, 'wb') as current_fp:
            pickle.dump(current_bundle, current_fp)
    else:
        # NOTE: pickle.load runs code embedded in the file; these files are
        # the ones written by the branch above.
        with open(current_pickle, 'rb') as current_fp:
            current_bundle = pickle.load(current_fp)

    current_data = current_bundle.data
    logger.info('{} data has variables {}'.format(item_name, list(current_data)))
    if len(current_data.shape) > 1:
        n_rows, n_columns = current_data.shape[0], current_data.shape[1]
        logger.info('{} data has {} rows and {} variables'.format(item_name, n_rows, n_columns))
    if 'title' in current_bundle.keys():
        current_title = current_bundle.title
        logger.info('{} data has title {}'.format(item_name, current_title))

return_X_y = False
if not exists(data_folder + 'statsmodels'):
    logger.info('creating output folder {}'.format(data_folder + 'statsmodels'))