def test_stateful_transform():
    """PatsyTransformer must apply training-set statistics to test data."""
    # Training data: x1 is constant 1, so center(x1) learns mean == 1.
    train = patsy.demo_data("x1", "x2", "y")
    train['x1'][:] = 1
    # Test data: x1 is constant 0.
    test = patsy.demo_data("x1", "x2", "y")
    test['x1'][:] = 0

    transformer = PatsyTransformer("center(x1) + x2")
    transformer.fit(train)
    transformed = transformer.transform(test)
    # Centering must subtract the *training* mean (1), giving 0 - 1 == -1,
    # proving the test-set mean was not used.
    assert_array_equal(transformed[:, 0], -1)
def test_proflogit_with_patsy_demo_data_no_intercept(self):
    """Test on simple demo data from patsy w/o intercept."""
    # patsy.demo_data returns a dict; categorical variables come back as
    # lists of strings, numeric ones are drawn from a fixed-seed normal.
    rng = np.random.RandomState(42)
    data = patsy.demo_data("a", "b", "x1", "x2", nlevels=3)
    y = rng.randint(2, size=len(data["a"]))

    # dmatrix builds the design matrix alone (no left-hand side);
    # "- 1" drops the intercept column from the formula.
    X = patsy.dmatrix("a + b + x1 + x2 - 1", data)

    clf = ProfLogitCCP(
        rga_kws={"niter": 10, "disp": False, "random_state": 42},
        intercept=False,
    )
    clf.fit(X, y)

    expected_coefs = [0.27466536, 0.0, -0.24030505, 0.0, 0.0, -0.82215168, 0.0]
    npt.assert_array_almost_equal(clf.rga.res.x, expected_coefs)
    self.assertAlmostEqual(clf.rga.res.fun, 12.310732234783764)
    self.assertAlmostEqual(clf.score(X, y), 12.4444444445)
def test_proflogit_with_patsy_build_in_transformation_functions(self):
    """Test patsy built-in transformation functions (standardize)."""
    # patsy.demo_data returns a dict; categorical variables come back as
    # lists of strings, numeric ones are drawn from a fixed-seed normal.
    rng = np.random.RandomState(42)
    data = patsy.demo_data("a", "b", "x1", "x2", nlevels=3)
    y = rng.randint(2, size=len(data["a"]))

    # dmatrix builds the design matrix alone (no left-hand side). `data`
    # only needs dict-style indexing, so a pandas.DataFrame also works.
    # Strings and booleans are treated as categorical variables, with the
    # first level as the baseline.
    X = patsy.dmatrix("a + b + standardize(x1) + standardize(x2)", data)

    clf = ProfLogitCCP(
        rga_kws={"niter": 10, "disp": False, "random_state": 42},
    )
    clf.fit(X, y)

    npt.assert_array_almost_equal(
        clf.rga.res.x,
        [0.71321495, 0.0, -0.6815996, 0.0, 0.0, -0.92505635, 0.0],
    )
    self.assertAlmostEqual(clf.rga.res.fun, 12.2837788495)
    self.assertAlmostEqual(clf.score(X, y), 12.4444444445)
def test_error_on_y_transform():
    """Formulas with a left-hand side must be rejected by the transformer."""
    data = patsy.demo_data("x1", "x2", "x3", "y")
    transformer = PatsyTransformer("y ~ x1 + x2")
    expected_msg = ("encountered outcome variables for a model"
                    " that does not expect them")
    # Both fit and fit_transform must raise with the same message.
    assert_raise_message(patsy.PatsyError, expected_msg,
                         transformer.fit, data)
    assert_raise_message(patsy.PatsyError, expected_msg,
                         transformer.fit_transform, data)
def test_proflogit_with_patsy_demo_data(self):
    """Test on simple categorical/numerical demo data from patsy."""
    # patsy.demo_data returns a dict; categorical variables come back as
    # lists of strings, numeric ones are drawn from a fixed-seed normal.
    rng = np.random.RandomState(42)
    data = patsy.demo_data("a", "b", "x1", "x2", nlevels=3)
    y = rng.randint(2, size=len(data["a"]))

    # dmatrix builds the design matrix alone (no left-hand side). `data`
    # only needs dict-style indexing, so a pandas.DataFrame also works.
    X = patsy.dmatrix("a + b + x1 + x2", data)

    clf = ProfLogitCCP(
        rga_kws={"niter": 10, "disp": False, "random_state": 42},
    )
    clf.fit(X, y)

    expected_coefs = [
        0.26843982,   # Intercept
        0.0,          # Categorical variable 'a' - level a2
        -0.21947001,  # Categorical variable 'a' - level a3
        0.12036944,   # Categorical variable 'b' - level b2
        0.0,          # Categorical variable 'b' - level b3
        -0.47514314,  # Numeric variable 'x1'
        -0.08812723,  # Numeric variable 'x2'
    ]
    npt.assert_array_almost_equal(clf.rga.res.x, expected_coefs)
    self.assertAlmostEqual(clf.rga.res.fun, 12.3541334628)
    self.assertAlmostEqual(clf.score(X, y), 12.4444444445)
def test_stateful_transform_dataframe():
    """Training-set statistics must be applied when transforming test data,
    and a 'dataframe' return_type must yield a pandas DataFrame."""
    data_train = pd.DataFrame(patsy.demo_data("x1", "x2", "y"))
    # Direct column assignment instead of chained `data_train['x1'][:] = 1`:
    # chained assignment raises SettingWithCopyWarning and silently stops
    # mutating the frame under pandas copy-on-write (default in pandas 3.0).
    data_train['x1'] = 1  # mean of x1 is 1
    data_test = pd.DataFrame(patsy.demo_data("x1", "x2", "y"))
    data_test['x1'] = 0  # center x1

    est = PatsyTransformer("center(x1) + x2", return_type='dataframe')
    est.fit(data_train)
    data_trans = est.transform(data_test)
    # make sure result is pandas dataframe
    assert type(data_trans) is pd.DataFrame
    # make sure that mean of training, not test data was removed
    assert_array_equal(data_trans['center(x1)'][:], -1)
def test_stateful_model():
    """PatsyModel must reuse training-set statistics at predict time."""
    # Training data: x1 constant 1, so center(x1) learns mean == 1.
    data_train = patsy.demo_data("x1", "x2", "y")
    data_train['x1'][:] = 1
    # Test data: x1 constant 0, so centering with the training mean gives -1.
    data_test = patsy.demo_data("x1", "x2", "y")
    data_test['x1'][:] = 0

    model = PatsyModel(CheckingClassifier(), "y ~ center(x1) + x2")
    model.fit(data_train)

    def check_centering(X):
        return np.all(X[:, 0] == -1)

    # Install the check after fitting so it only applies to predict's data;
    # it verifies the *training* mean, not the test mean, was removed.
    model.estimator_.check_X = check_centering
    model.predict(data_test)
def test_stateful_transform_dataframe():
    """Training-set statistics must be applied when transforming test data,
    and a 'dataframe' return_type must yield a pandas DataFrame."""
    data_train = pd.DataFrame(patsy.demo_data("x1", "x2", "y"))
    # Direct column assignment instead of chained `data_train['x1'][:] = 1`:
    # chained assignment raises SettingWithCopyWarning and silently stops
    # mutating the frame under pandas copy-on-write (default in pandas 3.0).
    data_train['x1'] = 1  # mean of x1 is 1
    data_test = pd.DataFrame(patsy.demo_data("x1", "x2", "y"))
    data_test['x1'] = 0  # center x1

    est = PatsyTransformer("center(x1) + x2", return_type='dataframe')
    est.fit(data_train)
    data_trans = est.transform(data_test)
    # make sure result is pandas dataframe
    assert type(data_trans) is pd.DataFrame
    # make sure that mean of training, not test data was removed
    assert_array_equal(data_trans['center(x1)'][:], -1)
def test_intercept_transformer():
    """PatsyTransformer only adds an intercept column when asked to."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    # Without add_intercept, X holds only the two features.
    transformer = PatsyTransformer("x1 + x2")
    transformer.fit(data)
    assert_equal(transformer.transform(data).shape[1], 2)

    # With add_intercept, a leading all-ones column is prepended.
    transformer = PatsyTransformer("x1 + x2", add_intercept=True)
    transformer.fit(data)
    transformed = transformer.transform(data)
    assert_array_equal(transformed[:, 0], 1)
    assert_equal(transformer.transform(data).shape[1], 3)
def test_scope_model():
    """Functions from the caller's scope are usable inside model formulas."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    # `myfunc` must keep this name: the formula references it by name.
    def myfunc(x):
        constant = np.ones_like(x)
        constant.fill(42)
        return constant

    def check_X(X):
        return np.all(X[:, 1] == 42)

    # CheckingClassifier raises unless check_X returns True, which proves
    # that myfunc was actually applied to the x2 column.
    model = PatsyModel(CheckingClassifier(check_X=check_X),
                       "y ~ x1 + myfunc(x2)")
    model.fit(data)
def test_scope_transformer():
    """Caller-scope functions are visible to PatsyTransformer formulas."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    # `myfunc` must keep this name: the formula references it by name.
    def myfunc(x):
        filled = np.ones_like(x)
        filled.fill(42)
        return filled

    # fit followed by transform
    transformer = PatsyTransformer("x1 + myfunc(x2)")
    transformer.fit(data)
    result = transformer.transform(data)
    assert_array_equal(result[:, 1], 42)

    # fit_transform in one step
    transformer = PatsyTransformer("x1 + myfunc(x2)")
    result = transformer.fit_transform(data)
    assert_array_equal(result[:, 1], 42)
def test_scope_transformer():
    """Caller-scope functions work in formulas and show up in feature names."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    # `myfunc` must keep this name: the formula references it by name.
    def myfunc(x):
        filled = np.ones_like(x)
        filled.fill(42)
        return filled

    # fit followed by transform
    transformer = PatsyTransformer("x1 + myfunc(x2)")
    transformer.fit(data)
    result = transformer.transform(data)
    assert_array_equal(result[:, 1], 42)

    # fit_transform in one step
    transformer = PatsyTransformer("x1 + myfunc(x2)")
    result = transformer.fit_transform(data)
    assert_array_equal(result[:, 1], 42)

    # test feature names
    assert_equal(transformer.feature_names_, ["x1", "myfunc(x2)"])
def test_scope_model():
    """Caller-scope functions work in model formulas and feature names."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    # `myfunc` must keep this name: the formula references it by name.
    def myfunc(x):
        constant = np.ones_like(x)
        constant.fill(42)
        return constant

    def check_X(X):
        return np.all(X[:, 1] == 42)

    # CheckingClassifier raises unless check_X returns True, which proves
    # that myfunc was actually applied to the x2 column.
    model = PatsyModel(CheckingClassifier(check_X=check_X),
                       "y ~ x1 + myfunc(x2)")
    model.fit(data)

    # test feature names
    assert_equal(model.feature_names_, ["x1", "myfunc(x2)"])
def test_intercept_model():
    """PatsyModel only passes an intercept column when add_intercept is set."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    def check_X_no_intercept(X):
        return X.shape[1] == 2

    # Without add_intercept the design matrix holds just the two features.
    model = PatsyModel(CheckingClassifier(check_X=check_X_no_intercept),
                       "y ~ x1 + x2")
    model.fit(data)
    # predict applies the same checks to new data.
    model.predict(data)

    def check_X_intercept(X):
        return X.shape[1] == 3 and np.all(X[:, 0] == 1)

    # With add_intercept the first column is all ones.
    model = PatsyModel(CheckingClassifier(check_X=check_X_intercept),
                       "y ~ x1 + x2", add_intercept=True)
    model.fit(data)
    model.predict(data)
def ser_types(ser):
    """Return the distinct Python types (as strings) of the values in *ser*."""
    return list(set(ser.apply(lambda x: str(type(x)))))


def df_coltypes(df):
    """Return a per-column summary joining pandas dtypes with the Python
    value types actually present in each column."""
    panda_types = pd.DataFrame(df.dtypes, columns=["PandaType"])
    python_types = pd.DataFrame(df.apply(ser_types), columns=["PythonTypes"])
    return pd.merge(panda_types, python_types, how="outer",
                    left_index=True, right_index=True)


data = patsy.demo_data('city', 'state', 'population', 'xLocation',
                       'yLatitude', min_rows=100)
df = pd.DataFrame(data)
x = df_coltypes(df)

# Renamed from `dir`, which shadowed the `dir` builtin; also dropped a
# dead trailing `pass` statement.
home_dir = os.path.expanduser("~")
fpath = os.path.join(home_dir, "sample.csv")
df.to_csv(fpath)
print("done")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Demo: fitting LM by hand-built design matrix and via a patsy formula."""
from patsy import demo_data
from LM import LM
import numpy as np

data = demo_data("x", "y", "a")
print(data["x"])

# Old and boring approach (but it still works): stack an explicit
# all-ones intercept column next to x by hand.
design = np.column_stack(([1] * len(data["y"]), data["x"]))
print(design)
print(LM((data["y"], design)))

# Formula interface.
model = LM("y ~ x", data)
print(model)
print(model.loglik(data))
print(model.loglik({"x": [10, 20, 30], "y": [-1, -2, -3]}))

# Your users get support for categorical predictors for free:
print(LM("y ~ a", data))
print(LM("y ~ np.log(x ** 2)", data))
# -*- coding: utf-8 -*-
"""Minimal patsy demo: build outcome/predictor matrices from a formula."""
import numpy as np
from patsy import dmatrices, dmatrix, demo_data

# demo_data builds a dict of sample columns; note that names may even
# contain spaces ("z column").
data = demo_data('a', 'b', 'x1', 'x2', 'y', 'z column')
print(f'data:\n{data}')

# dmatrices splits the formula into (outcome, predictors).
y, X = dmatrices("y ~ x1 + x2", data)
print(f'y={y}')
print(f"X={X}")
import argparse

import pandas as pd
import patsy

# CLI: --columns points at a whitespace-separated list of demo_data
# variable names; --data is the output feather path.
parser = argparse.ArgumentParser()
parser.add_argument('--columns')
parser.add_argument('--data')
args = parser.parse_args()

with open(args.columns, 'r') as fh:
    column_names = fh.read().split()

demo = patsy.demo_data(*column_names)
pd.DataFrame(demo).to_feather(args.data)
from patsy import dmatrix, demo_data # demo of how patsy handles categorical variables # Patsy notation is described here #http://statsmode#ls.sourceforge.net/devel/example_formulas.html #http://patsy.readthedocs.org/en/latest/categorical-coding.html data = demo_data("a", nlevels=3) dmatrix("a", data) ''' DesignMatrix with shape (6, 3) Intercept a[T.a2] a[T.a3] 1 0 0 1 1 0 1 0 1 1 0 0 1 1 0 1 0 1 Terms: 'Intercept' (column 0) 'a' (columns 1:3) ''' data = demo_data("a", nlevels=3) dmatrix("a-1", data) ''' DesignMatrix with shape (6, 3) a[a1] a[a2] a[a3] 1 0 0 0 1 0
# Round-trip various formulas through ModelDesc to show how patsy
# parses redundant terms, intercept removal, interactions and nesting.
print(ModelDesc.from_formula("y ~ x + x + x").describe())
print(ModelDesc.from_formula("y ~ -1 + x").describe())
print(ModelDesc.from_formula("~ -1").describe())
print(ModelDesc.from_formula("y ~ a:b").describe())
print(ModelDesc.from_formula("y ~ a*b").describe())
print(ModelDesc.from_formula("y ~ (a + b + c + d) ** 2").describe())
print(ModelDesc.from_formula("y ~ (a + b)/(c + d)").describe())
# Embedded Python expressions are allowed inside formulas.
print(ModelDesc.from_formula(
    "np.log(x1 + x2) + (x + {6: x3, 8 + 1: x4}[3 * i])").describe())

# Sometimes it is easier to read a processed formula after putting it
# back into formula notation with ModelDesc.describe():
parsed = ModelDesc.from_formula("y ~ (a + b + c + d) ** 2")
print(parsed.describe())

demo = demo_data("a", "b", "x1", "x2")
design = dmatrix("x1:x2 + a:b + b + x1:a:b + a + x2:a:x1", demo)
print(design.design_info.term_names)

demo = demo_data("a", "b", "y")
mat1 = dmatrices("y ~ 0 + a:b", demo)[1]
mat2 = dmatrices("y ~ 1 + a + b + a:b", demo)[1]
# Compare the rank of each parameterization and of the two stacked
# side by side.
np.linalg.matrix_rank(mat1)
print(np.linalg.matrix_rank(mat2))
print(np.linalg.matrix_rank(np.column_stack((mat1, mat2))))
print(mat1)
print(mat2)
def generate_count_matrix(
    n_factors=1,
    n_replicates=4,
    n_features=1000,
    intercept_mean=4,
    intercept_std=2,
    coefficient_stds=0.4,
    size_factors=None,
    size_factors_std=0.1,
    dispersion_function=None,
):
    """
    Generate count matrix for groups of samples by sampling from a
    negative binomial distribution.

    Parameters
    ----------
    n_factors : int
        Number of categorical factors; factor columns are named with the
        first `n_factors` uppercase letters.
    n_replicates : int
        Target number of replicates per group (the sample table is tiled
        ceil(n_replicates / 2) times when > 1).
    n_features : int
        Number of features (rows of the returned count matrix).
    intercept_mean, intercept_std : float
        Normal distribution parameters for the intercept coefficients.
    coefficient_stds : float or sequence of float
        Std of the (zero-mean) normal for each factor's coefficients; a
        scalar is broadcast to all factors.
    size_factors : array-like or None
        Per-sample scaling; drawn from Normal(1, size_factors_std) if None.
    size_factors_std : float
        Std used when sampling size factors.
    dispersion_function : callable or None
        Maps mean expression to a dispersion estimate; defaults to the
        module-level `_disp` (defined elsewhere in this file).

    Returns
    -------
    (dnum, dcat) : tuple of pandas.DataFrame
        `dnum` is the features x samples count matrix, `dcat` the
        samples x factors annotation table.
    """
    import patsy

    # Broadcast a scalar std to one entry per factor.
    if isinstance(coefficient_stds, (int, float)):
        coefficient_stds = [coefficient_stds] * n_factors
    if dispersion_function is None:
        dispersion_function = _disp

    # Build sample vs factors table
    # demo_data supplies categorical levels for each lowercase factor name;
    # column names and level labels are then upper-cased.
    dcat = pd.DataFrame(
        patsy.demo_data(*(list(string.ascii_lowercase[:n_factors]))))
    dcat.columns = dcat.columns.str.upper()
    for col in dcat.columns:
        dcat[col] = dcat[col].str.upper()
    # Tile the table to reach the requested replicate count.
    # NOTE(review): tiling ceil(n_replicates / 2) times presumably relies on
    # demo_data already emitting 2 rows per group — confirm against demo_data.
    if n_replicates > 1:
        dcat = (pd.concat([
            dcat for _ in range(int(np.ceil(n_replicates / 2)))
        ]).sort_values(dcat.columns.tolist()).reset_index(drop=True))
    # Sample names encode position and group membership, e.g. "S01_AB".
    dcat.index = [
        "S{}_{}".format(str(i + 1).zfill(2), dcat.loc[i, :].sum())
        for i in dcat.index
    ]
    m_samples = dcat.shape[0]

    # make model design table
    design = np.asarray(
        patsy.dmatrix(
            "~ 1 + " + " + ".join(string.ascii_uppercase[:n_factors]), dcat))

    # get means
    # One intercept row plus one coefficient row per factor; transposed to
    # (n_features, 1 + n_factors). Uses the global np.random state.
    beta = np.asarray(
        [np.random.normal(intercept_mean, intercept_std, n_features)] +
        [np.random.normal(0, std, n_features) for std in coefficient_stds]).T
    if size_factors is None:
        size_factors = np.random.normal(1, size_factors_std, (m_samples, 1))
    # Coefficients act on log2 scale: mean = 2**(design @ beta^T), scaled
    # per sample by the size factors.
    mean = (2**(design @ beta.T) * size_factors).T

    # now sample counts
    # Per-feature dispersion from the non-intercept coefficients, averaged
    # across factors.
    dispersion = (1 / dispersion_function(2**(beta[:, 1:]))).mean(1).reshape(
        -1, 1)
    dnum = pd.DataFrame(np.random.negative_binomial(n=mean, p=dispersion,
                                                    size=mean.shape),
                        columns=dcat.index)
    dcat.index.name = dnum.columns.name = "sample_name"
    return dnum, dcat
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Walk through the basic patsy design-matrix API."""
import numpy as np
import patsy
from patsy import dmatrices, dmatrix, demo_data

demo = demo_data("a", "b", "x1", "x2", "y", "z column")
print(demo)

# dmatrices returns (outcome, predictors); show the pair first...
both = dmatrices("y ~ x1 + x2", demo)
print(both)
# ...then unpack it.
outcome, predictors = dmatrices("y ~ x1 + x2", demo)
print(outcome)
print(predictors)

# Ordinary least squares via lstsq on the design matrix.
betas = np.linalg.lstsq(predictors, outcome, rcond=None)[0].ravel()
print(betas)
for name, beta in zip(predictors.design_info.column_names, betas):
    print(f"{name}: {beta}")

# Right-hand side only, with and without the intercept column.
d = dmatrix("x1 + x2", demo)
print(d)
d = dmatrix("x1 + x2 - 1", demo)
print(d)
# Arbitrary Python expressions are allowed inside formulas.
d = dmatrix("x1 + np.log(x2 + 10)", demo)
print(d)

# Formulas can also reference variables from the calling scope, so
# `new_x2` must keep this exact name.
new_x2 = demo["x2"] * 100
d = dmatrix("new_x2")
print(d)
# Impute missing `age` values with the column median.
imputer_age = SimpleImputer(strategy="median")
# Fixed: this line previously called `imputer_embark_town.fit_transform`,
# ignoring the median imputer created just above for the age column.
titanic["age"] = imputer_age.fit_transform(titanic[["age"]])
msno.matrix(titanic)
plt.show()
print("==========================")

# Patsy package
# Here we use the patsy package to select specific columns from a data
# frame and to derive new columns by combining existing ones. For the
# examples, build a demo data frame with patsy's demo_data() function,
# which fills variables whose names start with "x" with random reals.
from patsy import demo_data

df = pd.DataFrame(demo_data("x1", "x2", "x3", "x4", "x5"))
df  # notebook-style display; no effect when run as a script

from patsy import dmatrix

dmatrix("x1+0", data=df)
dmatrix("x1 + x2 + x3 + 0", data=df)
# dmatrix() can also apply mathematical transforms to variables.
dmatrix("x1 + np.log(np.abs(x2))", df)


def ten_times(x):
    return 10 * x