def test_can_use_vectors(self): """Test that we can pass vectors for T and Y (not only 2-dimensional arrays).""" dml = DMLCateEstimator(LinearRegression(), LinearRegression(), featurizer=FunctionTransformer()) dml.fit(np.array([1, 2, 3, 1, 2, 3]), np.array([1, 2, 3, 1, 2, 3]), np.ones((6, 1))) self.assertAlmostEqual(dml.coef_.reshape(())[()], 1)
def test_dominicks(): file_name = "oj_large.csv" if not os.path.isfile(file_name): print("Downloading file (this might take a few seconds)...") urllib.request.urlretrieve( "https://msalicedatapublic.blob.core.windows.net/datasets/OrangeJuice/oj_large.csv", file_name) oj_data = pd.read_csv(file_name) brands = sorted(set(oj_data["brand"])) stores = sorted(set(oj_data["store"])) featnames = ["week", "feat"] + list(oj_data.columns[6:]) # Preprocess data import datetime import numpy as np # Convert 'week' to a date # week_zero = datetime.datetime.strptime("09/07/89", "%m/%d/%y") # oj_data["week"] = pd.to_timedelta(oj_data["week"], unit='w') + week_zero # Take log of price oj_data["logprice"] = np.log(oj_data["price"]) oj_data.drop("price", axis=1, inplace=True) # Make brand numeric oj_data["brand"] = [brands.index(b) for b in oj_data["brand"]] class PriceFeaturizer(TransformerMixin): def __init__(self, n_prods, own_price=True, cross_price_groups=False, cross_price_indiv=True, per_product_effects=True): base_arrays = [] effect_names = [] one_hots = [(0, ) * p + (1, ) + (0, ) * (n_prods - p - 1) for p in range(n_prods)] if own_price: base_arrays.append(np.eye(n_prods)) effect_names.append("own price") if cross_price_groups: base_arrays.append((np.ones( (n_prods, n_prods)) - np.eye(n_prods)) / (n_prods - 1)) effect_names.append("group cross price") if cross_price_indiv: for p in range(n_prods): base_arrays.append(one_hots[p] * np.ones((n_prods, 1)) - np.diag(one_hots[p])) effect_names.append("cross price effect {} ->".format(p)) if per_product_effects: all = [(np.diag(one_hots[p]) @ arr, nm + " {}".format(p)) for arr, nm in zip(base_arrays, effect_names) for p in range(n_prods)] # remove meaningless features (e.g. cross-price effects of products on themselves), # which have all zero coeffs nonempty = [(arr, nm) for arr, nm in all if np.count_nonzero(arr) > 0] self._features = [arr for arr, _ in nonempty] self._names = [nm for _, nm in nonempty] else: self._features = base_arrays self._names = effect_names def fit(self, X): self._is_fitted = True assert shape(X)[1] == 0 return self def transform(self, X): assert self._is_fitted assert shape(X)[1] == 0 return np.tile(self._features, (shape(X)[0], 1, 1, 1)) @property def names(self): return self._names for name, op, xp_g, xp_i, pp in [ ("Homogeneous treatment effect", True, False, False, False), ("Heterogeneous treatment effects", True, False, False, True), (("Heterogeneous treatment effects" " with group effects"), True, True, False, True), (("Heterogeneous treatment effects" " with cross price effects"), True, False, True, True) ]: print(name) np.random.seed(42) ft = PriceFeaturizer(n_prods=3, own_price=op, cross_price_groups=xp_g, cross_price_indiv=xp_i, per_product_effects=pp) names = ft.names dml = DMLCateEstimator(model_y=RandomForestRegressor(), model_t=RandomForestRegressor(), featurizer=ft, n_splits=2) effects = [] for store in stores: data = oj_data[oj_data['store'] == store].sort_values( by=['week', 'brand']) dml.fit(T=reshape(data.as_matrix(["logprice"]), (-1, 3)), Y=reshape(data.as_matrix(["logmove"]), (-1, 3)), W=reshape(data.as_matrix(featnames), (-1, 3 * len(featnames)))) effects.append(dml.coef_) effects = np.array(effects) for nm, eff in zip(names, effects.T): print(" Effect: {}".format(nm)) print(" Mean: {}".format(np.mean(eff))) print(" Std.: {}".format(np.std(eff))) class ConstFt(TransformerMixin): def fit(self, X): return self def transform(self, X): return np.ones((shape(X)[0], 1)) print("Vanilla HTE+XP") np.random.seed(42) dml = DMLCateEstimator(model_y=RandomForestRegressor(), model_t=RandomForestRegressor(), featurizer=ConstFt(), n_splits=2) effects = [] for store in stores: data = oj_data[oj_data['store'] == store].sort_values( by=['week', 'brand']) dml.fit(T=reshape(data.as_matrix(["logprice"]), (-1, 3)), Y=reshape(data.as_matrix(["logmove"]), (-1, 3)), W=reshape(data.as_matrix(featnames), (-1, 3 * len(featnames)))) effects.append(dml.coef_) effects = np.array(effects) names = ["{} on {}".format(i, j) for j in range(3) for i in range(3)] for nm, eff in zip(names, reshape(effects, (-1, 9)).T): print(" Effect: {}".format(nm)) print(" Mean: {}".format(np.mean(eff))) print(" Std.: {}".format(np.std(eff)))