def transform(self, X):
    # add a column of ones to X
    X = hstack([np.ones((shape(X)[0], 1)), X])
    d_x = shape(X)[1]  # note: d_x now includes the constant column
    d_y, d_t = self._d_y, self._d_t
    # for each row, create d_y * d_t * d_x features, each of which is a d_y-by-d_t matrix
    return reshape(np.einsum('nx,fyt->nfxyt', X, self._fts),
                   (shape(X)[0], d_y * d_t * d_x, d_y, d_t))
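# A minimal standalone shape-check sketch (not part of the original file); it assumes
# self._fts is the standard basis of d_y-by-d_t matrices,
# i.e. np.eye(d_y * d_t).reshape(d_y * d_t, d_y, d_t).
#
#     import numpy as np
#     n, d_x, d_y, d_t = 5, 2, 3, 2
#     X = np.random.normal(size=(n, d_x))
#     X1 = np.hstack([np.ones((n, 1)), X])  # prepend the constant column
#     fts = np.eye(d_y * d_t).reshape(d_y * d_t, d_y, d_t)
#     out = np.einsum('nx,fyt->nfxyt', X1, fts).reshape(n, d_y * d_t * (d_x + 1), d_y, d_t)
#     assert out.shape == (5, 18, 3, 2)  # one d_y-by-d_t matrix feature per (basis, column) pair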
def fit(self, X, y, sample_weight=None):
    """
    Fit the ordinary least squares model.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data
    y : array_like, shape (n_samples, 1) or (n_samples,)
        Target values
    sample_weight : array_like, shape (n_samples,)
        Individual weights for each sample

    Returns
    -------
    self
    """
    assert ndim(y) == 1 or (ndim(y) == 2 and shape(y)[1] == 1)
    y = reshape(y, (-1,))
    if self.fit_intercept:
        X = add_constant(X, has_constant='add')
    if sample_weight is not None:
        ols = WLS(y, X, weights=sample_weight, hasconst=self.fit_intercept)
    else:
        ols = WLS(y, X, hasconst=self.fit_intercept)
    self.results = ols.fit(**self.fit_args)
    return self
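# Hedged standalone sketch of the underlying statsmodels call, assuming the context
# fit_intercept=True and fit_args={}; WLS with unit weights reduces to plain OLS.
#
#     import numpy as np
#     import statsmodels.api as sm
#     X = np.random.normal(size=(100, 3))
#     y = X @ np.array([1.0, 2.0, 3.0]) + np.random.normal(size=100)
#     results = sm.WLS(y, sm.add_constant(X, has_constant='add'),
#                      weights=np.ones(100), hasconst=True).fit()
#     print(results.params)  # intercept followed by the three slope coefficients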
def test_hermite_results(self):
    inputs = np.random.normal(size=(5, 1))
    hf = HermiteFeatures(3).fit_transform(inputs)
    # first polynomials are 1, x, x*x-1, x*x*x-3*x
    ones = np.ones(shape(inputs))
    polys = np.hstack([ones, inputs, inputs * inputs - ones, inputs * inputs * inputs - 3 * inputs])
    assert(np.allclose(hf, polys * np.exp(-inputs * inputs / 2)))
    for j in [True, False]:
        hf = HermiteFeatures(1, shift=1, joint=j).fit_transform(inputs)
        # first derivatives are -x, -x^2+1 (since there's just one column, joint-ness doesn't matter)
        polys = np.hstack([-inputs, -inputs * inputs + ones])
        assert(np.allclose(hf, reshape(polys * np.exp(-inputs * inputs / 2), (5, 1, 2))))
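# Hedged cross-check (not part of the original test): the polynomials hard-coded above are
# the probabilists' Hermite polynomials He_k, which numpy exposes as the "HermiteE" basis.
#
#     import numpy as np
#     from numpy.polynomial.hermite_e import hermeval
#     x = np.linspace(-2, 2, 7)
#     # the coefficient vector [0, 0, 0, 1] selects He_3(x) = x**3 - 3*x
#     assert np.allclose(hermeval(x, [0, 0, 0, 1]), x ** 3 - 3 * x)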
def _test_sparse(n_p, d_w, n_r):
    # need at least as many rows in e_y as there are distinct columns
    # in [X;X⊗W;W⊗W;X⊗e_t] to find a solution for e_t:
    # n_p (X) + n_p*d_w (X⊗W) + d_w*(d_w+1)/2 (W⊗W) + n_p (X⊗e_t)
    assert n_p * n_r >= 2 * n_p + n_p * d_w + d_w * (d_w + 1) / 2
    a = np.random.normal(size=(n_p,))  # one effect per product
    n = n_p * n_r
    p = np.tile(range(n_p), n_r)  # product id
    b = np.random.normal(size=(d_w + n_p,))
    g = np.random.normal(size=(d_w + n_p,))
    x = np.empty((2 * n, n_p))  # product dummies
    w = np.empty((2 * n, d_w))
    y = np.empty(2 * n)
    t = np.empty(2 * n)
    for fold in range(0, 2):
        x_f = OneHotEncoder().fit_transform(np.reshape(p, (-1, 1))).toarray()
        w_f = np.random.normal(size=(n, d_w))
        xw_f = hstack([x_f, w_f])
        e_t_f, e_y_f = TestDML._generate_recoverable_errors(a, x_f, W=w_f)
        t_f = xw_f @ b + e_t_f
        y_f = t_f * np.choose(p, a) + xw_f @ g + e_y_f
        x[fold * n:(fold + 1) * n, :] = x_f
        w[fold * n:(fold + 1) * n, :] = w_f
        y[fold * n:(fold + 1) * n] = y_f
        t[fold * n:(fold + 1) * n] = t_f
    dml = SparseLinearDMLCateEstimator(LinearRegression(fit_intercept=False),
                                       LinearRegression(fit_intercept=False),
                                       featurizer=FunctionTransformer())
    dml.fit(y, t, x, w)
    # note that this would fail for the non-sparse DMLCateEstimator
    np.testing.assert_allclose(a, dml.coef_.reshape(-1))
    eff = reshape(t * np.choose(np.tile(p, 2), a), (-1, 1))
    np.testing.assert_allclose(eff, dml.effect(0, t, x))
    dml = SparseLinearDMLCateEstimator(LinearRegression(fit_intercept=False),
                                       LinearRegression(fit_intercept=False),
                                       featurizer=Pipeline([("id", FunctionTransformer()),
                                                            ("matrix", MatrixFeatures(1, 1))]))
    dml.fit(y, t, x, w)
    np.testing.assert_allclose(eff, dml.effect(0, t, x))
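# A quick worked check of the row-count bound above, with hypothetical sizes:
#
#     n_p, d_w, n_r = 2, 3, 10
#     distinct_cols = 2 * n_p + n_p * d_w + d_w * (d_w + 1) // 2  # 4 + 6 + 6 = 16
#     assert n_p * n_r >= distinct_cols  # 20 >= 16, so these sizes are admissible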
def test_complex_features(self):
    # recover simple features by initializing complex features appropriately
    for _ in range(10):
        d_w = np.random.randint(0, 4)
        d_x = np.random.randint(1, 3)
        d_y = np.random.randint(1, 3)
        d_t = np.random.randint(1, 3)
        n = 20
        with self.subTest(d_w=d_w, d_x=d_x, d_y=d_y, d_t=d_t):
            W, X, Y, T = [np.random.normal(size=(n, d)) for d in [d_w, d_x, d_y, d_t]]
            # using the full set of matrix features should be equivalent to using the non-matrix featurizer
            dml = DMLCateEstimator(model_y=LinearRegression(), model_t=LinearRegression(),
                                   featurizer=MatrixFeatures(d_y, d_t))
            dml.fit(Y, T, X, W)
            coef1 = dml.coef_
            dml = DMLCateEstimator(model_y=LinearRegression(), model_t=LinearRegression())
            dml.fit(Y, T, X, W)
            coef2 = dml.coef_
            np.testing.assert_allclose(coef1, reshape(coef2, -1))
def test_dominicks():
    file_name = "oj_large.csv"
    if not os.path.isfile(file_name):
        print("Downloading file (this might take a few seconds)...")
        urllib.request.urlretrieve(
            "https://msalicedatapublic.blob.core.windows.net/datasets/OrangeJuice/oj_large.csv",
            file_name)
    oj_data = pd.read_csv(file_name)

    brands = sorted(set(oj_data["brand"]))
    stores = sorted(set(oj_data["store"]))
    featnames = ["week", "feat"] + list(oj_data.columns[6:])

    # Preprocess data
    # Convert 'week' to a date (left disabled, as in the original notebook):
    # import datetime
    # week_zero = datetime.datetime.strptime("09/07/89", "%m/%d/%y")
    # oj_data["week"] = pd.to_timedelta(oj_data["week"], unit='w') + week_zero

    # Take log of price
    oj_data["logprice"] = np.log(oj_data["price"])
    oj_data.drop("price", axis=1, inplace=True)

    # Make brand numeric
    oj_data["brand"] = [brands.index(b) for b in oj_data["brand"]]

    class PriceFeaturizer(TransformerMixin):
        def __init__(self, n_prods, own_price=True,
                     cross_price_groups=False, cross_price_indiv=True,
                     per_product_effects=True):
            base_arrays = []
            effect_names = []
            one_hots = [(0,) * p + (1,) + (0,) * (n_prods - p - 1) for p in range(n_prods)]
            if own_price:
                base_arrays.append(np.eye(n_prods))
                effect_names.append("own price")
            if cross_price_groups:
                base_arrays.append((np.ones((n_prods, n_prods)) - np.eye(n_prods)) / (n_prods - 1))
                effect_names.append("group cross price")
            if cross_price_indiv:
                for p in range(n_prods):
                    base_arrays.append(one_hots[p] * np.ones((n_prods, 1)) - np.diag(one_hots[p]))
                    effect_names.append("cross price effect {} ->".format(p))
            if per_product_effects:
                all_effects = [(np.diag(one_hots[p]) @ arr, nm + " {}".format(p))
                               for arr, nm in zip(base_arrays, effect_names)
                               for p in range(n_prods)]
                # remove meaningless features (e.g. cross-price effects of products on themselves),
                # which have all-zero coefficients
                nonempty = [(arr, nm) for arr, nm in all_effects if np.count_nonzero(arr) > 0]
                self._features = [arr for arr, _ in nonempty]
                self._names = [nm for _, nm in nonempty]
            else:
                self._features = base_arrays
                self._names = effect_names

        def fit(self, X):
            self._is_fitted = True
            assert shape(X)[1] == 0
            return self

        def transform(self, X):
            assert self._is_fitted
            assert shape(X)[1] == 0
            return np.tile(self._features, (shape(X)[0], 1, 1, 1))

        @property
        def names(self):
            return self._names

    for name, op, xp_g, xp_i, pp in [("Homogeneous treatment effect", True, False, False, False),
                                     ("Heterogeneous treatment effects", True, False, False, True),
                                     ("Heterogeneous treatment effects with group effects",
                                      True, True, False, True),
                                     ("Heterogeneous treatment effects with cross price effects",
                                      True, False, True, True)]:
        print(name)
        np.random.seed(42)
        ft = PriceFeaturizer(n_prods=3, own_price=op, cross_price_groups=xp_g,
                             cross_price_indiv=xp_i, per_product_effects=pp)
        names = ft.names
        dml = LinearDMLCateEstimator(model_y=RandomForestRegressor(),
                                     model_t=RandomForestRegressor(),
                                     featurizer=ft, n_splits=2)
        effects = []
        for store in stores:
            data = oj_data[oj_data['store'] == store].sort_values(by=['week', 'brand'])
            dml.fit(T=reshape(data[["logprice"]].values, (-1, 3)),
                    Y=reshape(data[["logmove"]].values, (-1, 3)),
                    W=reshape(data[featnames].values, (-1, 3 * len(featnames))))
            effects.append(dml.coef_)
        effects = np.array(effects)
        for nm, eff in zip(names, effects.T):
            print(" Effect: {}".format(nm))
            print(" Mean: {}".format(np.mean(eff)))
            print(" Std.: {}".format(np.std(eff)))

    class ConstFt(TransformerMixin):
        def fit(self, X):
            return self

        def transform(self, X):
            return np.ones((shape(X)[0], 1))

    print("Vanilla HTE+XP")
    np.random.seed(42)
    dml = LinearDMLCateEstimator(model_y=RandomForestRegressor(),
                                 model_t=RandomForestRegressor(),
                                 featurizer=ConstFt(), n_splits=2)
    effects = []
    for store in stores:
        data = oj_data[oj_data['store'] == store].sort_values(by=['week', 'brand'])
        dml.fit(T=reshape(data[["logprice"]].values, (-1, 3)),
                Y=reshape(data[["logmove"]].values, (-1, 3)),
                W=reshape(data[featnames].values, (-1, 3 * len(featnames))))
        effects.append(dml.coef_)
    effects = np.array(effects)
    names = ["{} on {}".format(i, j) for j in range(3) for i in range(3)]
    for nm, eff in zip(names, reshape(effects, (-1, 9)).T):
        print(" Effect: {}".format(nm))
        print(" Mean: {}".format(np.mean(eff)))
        print(" Std.: {}".format(np.std(eff)))
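# Hedged standalone sketch (not from the original): the basis matrices PriceFeaturizer
# builds for n_prods = 3. Column p of each matrix multiplies product p's log-price.
#
#     import numpy as np
#     n_prods = 3
#     one_hots = [(0,) * p + (1,) + (0,) * (n_prods - p - 1) for p in range(n_prods)]
#     own = np.eye(n_prods)  # own-price elasticities
#     cross = [one_hots[p] * np.ones((n_prods, 1)) - np.diag(one_hots[p]) for p in range(n_prods)]
#     # cross[p] has a 1 at (i, p) for every i != p: product p's price affecting product i
#     assert cross[0][1, 0] == 1 and cross[0][0, 0] == 0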
def predict(self, X):
    predictions = self.model.predict(X)
    return reshape(predictions, (-1, 1)) if self.needs_unravel else predictions
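# Hedged illustration of the reshape above (hypothetical wrapper state): when the wrapped
# model was fit on a flattened (n,) target but callers expect (n, 1) outputs, the
# needs_unravel flag restores the column dimension.
#
#     import numpy as np
#     predictions = np.array([1.0, 2.0, 3.0])  # shape (n,) from the wrapped model
#     assert np.reshape(predictions, (-1, 1)).shape == (3, 1)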