def test_dropna(self):
    """Compare the number of retained observations with and without ``dropna``.

    By default the plotter keeps as many ``x`` values as there are non-null
    entries in ``y_na``; with ``dropna=False`` it keeps every row.
    """
    default_plotter = lm._RegressionPlotter("x", "y_na", data=self.df)
    n_not_null = pd.notnull(self.df.y_na).sum()
    assert len(default_plotter.x) == n_not_null

    keep_all_plotter = lm._RegressionPlotter(
        "x", "y_na", data=self.df, dropna=False
    )
    n_rows = len(self.df.y_na)
    assert len(keep_all_plotter.x) == n_rows
def test_estimate_cis(self):
    """Check how the ``ci`` setting affects the intervals in ``estimate_data``.

    With a shared seed, the ``ci=50`` intervals must be narrower than the
    ``ci=95`` ones, and ``ci=None`` must yield ``None`` for every interval.
    """
    seed = 123

    def intervals(**options):
        # Third element of ``estimate_data`` holds the per-level intervals.
        plotter = lm._RegressionPlotter(
            self.df.d, self.df.y, x_estimator=np.mean, **options
        )
        _, _, cis = plotter.estimate_data
        return cis

    ci_big = intervals(ci=95, seed=seed)
    ci_wee = intervals(ci=50, seed=seed)
    npt.assert_array_less(np.diff(ci_wee), np.diff(ci_big))

    ci_nil = intervals(ci=None)
    npt.assert_array_equal(ci_nil, [None] * len(ci_nil))
def test_variables_must_be_1d(self):
    """A two-dimensional array is rejected with ``ValueError`` as either variable."""
    array_2d = np.random.randn(20, 2)
    array_1d = np.random.randn(20)

    # Try the 2D array first as x, then as y.
    for x_var, y_var in ((array_2d, array_1d), (array_1d, array_2d)):
        with pytest.raises(ValueError):
            lm._RegressionPlotter(x_var, y_var)
def test_regression_options(self):
    """Combining ``lowess=True`` with ``order`` or ``logistic`` raises ``ValueError``."""
    conflicting_options = ({"order": 2}, {"logistic": True})
    for extra in conflicting_options:
        with pytest.raises(ValueError):
            lm._RegressionPlotter(
                "x", "y", data=self.df, lowess=True, **extra
            )
def test_ci(self):
    """Check the stored ``ci`` and ``x_ci`` attributes.

    When ``x_ci`` is not passed, the plotter reports the ``ci`` value for it;
    an explicit ``x_ci`` (a number or ``"sd"``) is stored as given.
    """
    cases = (
        ({}, 95),
        ({"x_ci": 68}, 68),
        ({"x_ci": "sd"}, "sd"),
    )
    for extra, expected_x_ci in cases:
        plotter = lm._RegressionPlotter(
            "x", "y", data=self.df, ci=95, **extra
        )
        assert plotter.ci == 95
        assert plotter.x_ci == expected_x_ci
def test_regression_limits(self):
    """Check the extent of the grid returned by ``fit_regression``.

    When an axes is passed, the grid spans the axes' x limits; with
    ``truncate=True`` and no axes, it spans the range of the x data.
    """
    f, ax = plt.subplots()
    ax.scatter(self.df.x, self.df.y)

    untruncated = lm._RegressionPlotter("x", "y", data=self.df)
    axes_grid, _, _ = untruncated.fit_regression(ax)
    xlim = ax.get_xlim()
    assert axes_grid.min() == xlim[0]
    assert axes_grid.max() == xlim[1]

    truncated = lm._RegressionPlotter("x", "y", data=self.df, truncate=True)
    data_grid, _, _ = truncated.fit_regression()
    assert data_grid.min() == self.df.x.min()
    assert data_grid.max() == self.df.x.max()
def test_variables_from_mix(self):
    """A column name and a vector can be mixed when ``data`` is supplied."""
    plotter = lm._RegressionPlotter("x", self.df.y + 1, data=self.df)

    expected_x = self.df.x
    npt.assert_array_equal(plotter.x, expected_x)

    expected_y = self.df.y + 1
    npt.assert_array_equal(plotter.y, expected_y)

    # The plotter should still expose the frame it was given.
    pdt.assert_frame_equal(plotter.data, self.df)
def test_lowess_regression(self):
    """A lowess fit returns matching grid/prediction lengths and no error bands."""
    plotter = lm._RegressionPlotter("x", "y", data=self.df, lowess=True)
    result = plotter.fit_regression(x_range=(-3, 3))
    grid, yhat, err_bands = result

    assert len(grid) == len(yhat)
    assert err_bands is None
def test_robust_regression(self):
    """Default and ``robust=True`` fits produce predictions of equal length."""
    x_range = (-3, 3)

    def predictions(**options):
        # Build a plotter on the shared frame and return only its yhat vector.
        plotter = lm._RegressionPlotter(
            "x", "y", data=self.df, n_boot=self.n_boot, **options
        )
        _, yhat, _ = plotter.fit_regression(x_range=x_range)
        return yhat

    ols_yhat = predictions()
    robust_yhat = predictions(robust=True)

    assert len(ols_yhat) == len(robust_yhat)
def test_variables_from_series(self):
    """Vectors passed directly are stored as given and ``data`` stays ``None``."""
    plotter = lm._RegressionPlotter(self.df.x, self.df.y, units=self.df.s)

    # Map each plotter attribute to the frame column it was built from.
    attribute_to_column = (("x", "x"), ("y", "y"), ("units", "s"))
    for attribute, column in attribute_to_column:
        npt.assert_array_equal(
            getattr(plotter, attribute), getattr(self.df, column)
        )

    assert plotter.data is None
def test_variables_from_frame(self):
    """Column names are resolved against ``data`` into the matching series."""
    plotter = lm._RegressionPlotter("x", "y", data=self.df, units="s")

    # Map each plotter attribute to the frame column it should equal.
    attribute_to_column = (("x", "x"), ("y", "y"), ("units", "s"))
    for attribute, column in attribute_to_column:
        pdt.assert_series_equal(
            getattr(plotter, attribute), getattr(self.df, column)
        )

    pdt.assert_frame_equal(plotter.data, self.df)
def test_regress_bootstrap_seed(self):
    """Two plotters built with the same seed yield identical bootstrap fits."""
    seed = 200

    # Construct both plotters before fitting either one.
    first, second = (
        lm._RegressionPlotter(
            "x", "y", data=self.df, n_boot=self.n_boot, seed=seed
        )
        for _ in range(2)
    )

    _, boots_first = first.fit_fast(self.grid)
    _, boots_second = second.fit_fast(self.grid)

    npt.assert_array_equal(boots_first, boots_second)
def test_estimate_data(self):
    """Check the three components of ``estimate_data`` with a mean estimator.

    The x values are the sorted unique levels of ``d``, the y values match
    the per-level mean of ``y``, and each interval brackets its estimate.
    """
    plotter = lm._RegressionPlotter(
        self.df.d, self.df.y, x_estimator=np.mean
    )
    levels, estimates, intervals = plotter.estimate_data

    expected_levels = np.sort(np.unique(self.df.d))
    npt.assert_array_equal(levels, expected_levels)

    expected_means = self.df.groupby("d").y.mean()
    npt.assert_array_almost_equal(estimates, expected_means)

    # Column 0 holds the lower bounds, column 1 the upper bounds.
    lower_bounds = np.array(intervals)[:, 0]
    npt.assert_array_less(lower_bounds, estimates)
    upper_bounds = np.array(intervals)[:, 1]
    npt.assert_array_less(estimates, upper_bounds)
def test_estimate_units(self):
    """Intervals computed with ``units="s"`` are wider than those without units."""
    # Use a fixed seed for both plotters rather than relying on global state.
    seed = 345

    def interval_widths(**options):
        plotter = lm._RegressionPlotter(
            "x", "y", data=self.df, seed=seed, x_bins=3, **options
        )
        _, _, cis = plotter.estimate_data
        return np.diff(cis, axis=1)

    ci_big = interval_widths(units="s")
    ci_wee = interval_widths()

    npt.assert_array_less(ci_wee, ci_big)
def test_logistic_regression(self):
    """Logistic predictions lie strictly between 0 and 1."""
    plotter = lm._RegressionPlotter(
        "x", "c", data=self.df, logistic=True, n_boot=self.n_boot
    )
    fit = plotter.fit_regression(x_range=(-3, 3))
    _, predicted, _ = fit

    npt.assert_array_less(predicted, 1)
    npt.assert_array_less(0, predicted)
def test_logistic_perfect_separation(self):
    """A perfectly separable outcome yields all-NaN logistic predictions."""
    # The outcome is fully determined by whether x exceeds its mean.
    separable = self.df.x > self.df.x.mean()
    plotter = lm._RegressionPlotter(
        "x", separable, data=self.df, logistic=True, n_boot=10
    )

    # Silence floating-point warnings raised while fitting.
    with np.errstate(all="ignore"):
        _, predicted, _ = plotter.fit_regression(x_range=(-3, 3))

    assert np.isnan(predicted).all()
def test_regress_poly(self):
    """A first-order ``fit_poly`` agrees with a statsmodels OLS fit."""
    plotter = lm._RegressionPlotter(
        "x", "y", data=self.df, n_boot=self.n_boot
    )

    # Degree-one polynomial fit on the shared grid.
    poly_fit = plotter.fit_poly(self.grid, 1)
    yhat_poly, _ = poly_fit

    # Reference fit through the statsmodels code path with an OLS model.
    smod_fit = plotter.fit_statsmodels(self.grid, smlm.OLS)
    yhat_smod, _ = smod_fit

    # Both prediction vectors should match to numerical precision.
    npt.assert_array_almost_equal(yhat_poly, yhat_smod)
def test_fast_regression(self):
    """``fit_fast`` agrees with a statsmodels OLS fit on the same grid."""
    plotter = lm._RegressionPlotter(
        "x", "y", data=self.df, n_boot=self.n_boot
    )

    # The "fast" path, which just does linear algebra.
    fast_fit = plotter.fit_fast(self.grid)
    yhat_fast, _ = fast_fit

    # Reference fit through the statsmodels code path with an OLS model.
    smod_fit = plotter.fit_statsmodels(self.grid, smlm.OLS)
    yhat_smod, _ = smod_fit

    # Both prediction vectors should match to numerical precision.
    npt.assert_array_almost_equal(yhat_fast, yhat_smod)
def test_scatter_data(self):
    """Check ``scatter_data`` with and without jitter.

    Without jitter the returned points equal the inputs. With ``x_jitter``
    or ``y_jitter`` only the jittered coordinate moves, and every moved
    value stays within the requested jitter size of its original.
    """
    p = lm._RegressionPlotter(self.df.x, self.df.y)
    x, y = p.scatter_data
    npt.assert_array_equal(x, self.df.x)
    npt.assert_array_equal(y, self.df.y)

    p = lm._RegressionPlotter(self.df.d, self.df.y)
    x, y = p.scatter_data
    npt.assert_array_equal(x, self.df.d)
    npt.assert_array_equal(y, self.df.y)

    x_jitter = .1
    p = lm._RegressionPlotter(self.df.d, self.df.y, x_jitter=x_jitter)
    x, y = p.scatter_data
    assert (x != self.df.d).any()
    npt.assert_array_less(
        np.abs(self.df.d - x), np.repeat(x_jitter, len(x))
    )
    npt.assert_array_equal(y, self.df.y)

    y_jitter = .05
    p = lm._RegressionPlotter(self.df.d, self.df.y, y_jitter=y_jitter)
    x, y = p.scatter_data
    npt.assert_array_equal(x, self.df.d)
    # Mirror the x_jitter checks: y must actually move, and the bound is the
    # jitter size itself. The previous bound of .1 would have accepted
    # displacements twice as large as requested.
    assert (y != self.df.y).any()
    npt.assert_array_less(
        np.abs(self.df.y - y), np.repeat(y_jitter, len(y))
    )
def test_partial(self):
    """Partialling out a shared variable lowers the x/y correlation.

    ``y`` and ``z`` are both built from ``x`` plus noise. Removing ``x`` from
    one or both of them must give a smaller correlation than the raw one,
    whether the inputs are arrays or pandas Series.
    """
    x = self.rs.randn(100)
    y = x + self.rs.randn(100)
    z = x + self.rs.randn(100)

    def correlation(*variables, **partials):
        # Correlation between the plotter's (possibly adjusted) x and y.
        plotter = lm._RegressionPlotter(*variables, **partials)
        _, r = np.corrcoef(plotter.x, plotter.y)[0]
        return r

    r_orig = correlation(y, z)

    r_semipartial = correlation(y, z, y_partial=x)
    assert r_semipartial < r_orig

    r_partial = correlation(y, z, x_partial=x, y_partial=x)
    assert r_partial < r_orig

    # Repeat with Series inputs for x and y (z stays an array).
    x = pd.Series(x)
    y = pd.Series(y)
    r_partial = correlation(y, z, x_partial=x, y_partial=x)
    assert r_partial < r_orig
def test_regress_logx(self):
    """Compare ``fit_logx`` with ``fit_fast`` at three grid positions.

    The linear prediction is above the log-x one at index 0 and index 90,
    and below it at index 20.
    """
    x = np.arange(1, 10)
    y = np.arange(1, 10)
    grid = np.linspace(1, 10, 100)
    plotter = lm._RegressionPlotter(x, y, n_boot=self.n_boot)

    linear, _ = plotter.fit_fast(grid)
    logarithmic, _ = plotter.fit_logx(grid)

    assert linear[0] > logarithmic[0]
    assert logarithmic[20] > linear[20]
    assert linear[90] > logarithmic[90]
def test_regress_n_boot(self):
    """Every fitting path returns bootstraps shaped ``(n_boot, grid.size)``."""
    plotter = lm._RegressionPlotter(
        "x", "y", data=self.df, n_boot=self.n_boot
    )

    # Callables are evaluated lazily so each fit runs just before its check.
    fitters = (
        # Fast (linear algebra) version
        lambda: plotter.fit_fast(self.grid),
        # Slower (np.polyfit) version
        lambda: plotter.fit_poly(self.grid, 1),
        # Slowest (statsmodels) version
        lambda: plotter.fit_statsmodels(self.grid, smlm.OLS),
    )
    for fit in fitters:
        _, boots = fit()
        npt.assert_equal(boots.shape, (self.n_boot, self.grid.size))
def test_regress_without_bootstrap(self):
    """With ``ci=None`` every fitting path returns ``None`` for the bootstraps."""
    plotter = lm._RegressionPlotter(
        "x", "y", data=self.df, n_boot=self.n_boot, ci=None
    )

    # Callables are evaluated lazily so each fit runs just before its check.
    fitters = (
        # Fast (linear algebra) version
        lambda: plotter.fit_fast(self.grid),
        # Slower (np.polyfit) version
        lambda: plotter.fit_poly(self.grid, 1),
        # Slowest (statsmodels) version
        lambda: plotter.fit_statsmodels(self.grid, smlm.OLS),
    )
    for fit in fitters:
        _, boots = fit()
        assert boots is None
def test_singleton(self, x, y):
    """The plotter built from the supplied ``x`` and ``y`` has ``fit_reg`` falsy.

    NOTE(review): ``x`` and ``y`` are presumably injected by a pytest
    parametrization or fixture that is not visible here; confirm.
    """
    plotter = lm._RegressionPlotter(x, y)
    fits_regression = plotter.fit_reg
    assert not fits_regression
def test_provided_bins(self):
    """Binning with explicit bins yields exactly those bins as unique values."""
    plotter = lm._RegressionPlotter(self.df.x, self.df.y)
    binned_x, _ = plotter.bin_predictor(self.bins_given)

    distinct_values = np.unique(binned_x)
    npt.assert_array_equal(distinct_values, self.bins_given)
def test_bin_results(self):
    """Observations assigned to a higher bin all exceed those in the bin below.

    Checked for the bin pairs (-1, 0) and (0, 1).
    """
    plotter = lm._RegressionPlotter(self.df.x, self.df.y)
    binned_x, _ = plotter.bin_predictor(self.bins_given)

    for lower_bin, upper_bin in ((-1, 0), (0, 1)):
        smallest_in_upper = self.df.x[binned_x == upper_bin].min()
        largest_in_lower = self.df.x[binned_x == lower_bin].max()
        assert smallest_in_upper > largest_in_lower
def test_numeric_bins(self):
    """Passing a bin count yields that many bins, all of which are used."""
    plotter = lm._RegressionPlotter(self.df.x, self.df.y)
    binned_x, bin_values = plotter.bin_predictor(self.bins_numeric)

    n_bins = len(bin_values)
    npt.assert_equal(n_bins, self.bins_numeric)

    distinct_values = np.unique(binned_x)
    npt.assert_array_equal(distinct_values, bin_values)
import seaborn as sns import matplotlib.pyplot as plt import numpy as np from scipy import stats from bokeh.plotting import figure, show, output_file from bokeh.models import HoverTool from seaborn.regression import _RegressionPlotter tips = sns.load_dataset("tips") tips.sort_values(by='total_bill', inplace=True) regplot = _RegressionPlotter('total_bill', 'tip', data=tips) grid, yhat, err_bands = regplot.fit_regression(grid=tips.total_bill) tips['yhat'] = yhat tips['ci1'] = err_bands[0] tips['ci2'] = err_bands[1] hover = HoverTool(tooltips=[ ("(x, y)", "($x, $y)"), ]) tools = [hover, 'pan', 'wheel_zoom'] p = figure(title="Bokeh Regplot", toolbar_location='right', tools=tools) p.scatter('total_bill', 'tip', source=tips) p.line('total_bill', 'yhat', source=tips, line_width=2, line_color='grey') p.line('total_bill', 'ci1', source=tips, alpha=0.7,