def test_imputation_shape(self): """Verify the shapes of the imputed matrix for different strategies.""" X = np.random.randn(10, 2) X[::2] = np.nan for strategy in ['mean', 'median', 'most_frequent']: imputer = Imputer(strategy=strategy) X_imputed = imputer.fit_transform(X) assert_equal(X_imputed.shape, (10, 2)) X_imputed = imputer.fit_transform(sparse.csr_matrix(X)) assert_equal(X_imputed.shape, (10, 2))
def test_imputation_pickle(self): """Test for pickling imputers.""" import pickle l = 100 X = sparse_random_matrix(l, l, density=0.10) for strategy in ["mean", "median", "most_frequent"]: imputer = Imputer(missing_values=0, strategy=strategy) imputer.fit(X) imputer_pickled = pickle.loads(pickle.dumps(imputer)) assert_array_equal(imputer.transform(X.copy()), imputer_pickled.transform(X.copy()), "Fail to transform the data after pickling " "(strategy = %s)" % (strategy))
def test_imputation_pipeline_grid_search(self): """Test imputation within a pipeline + gridsearch.""" pipeline = Pipeline([('imputer', Imputer(missing_values=0)), ('tree', tree.DecisionTreeRegressor(random_state=0))]) parameters = { 'imputer__strategy': ["mean", "median", "most_frequent"], 'imputer__axis': [0, 1] } l = 100 X = sparse_random_matrix(l, l, density=0.10) Y = sparse_random_matrix(l, 1, density=0.10).toarray() gs = grid_search.GridSearchCV(pipeline, parameters) gs.fit(X, Y)
def calculate_all_metafeatures(X, y, categorical, dataset_name, calculate=None, dont_calculate=None, densify_threshold=1000): logger = get_logger(__name__) """Calculate all metafeatures.""" helper_functions.clear() metafeatures.clear() mf_ = dict() visited = set() to_visit = deque() to_visit.extend(metafeatures) X_transformed = None y_transformed = None # TODO calculate the numpy metafeatures after all others to consume less # memory while len(to_visit) > 0: name = to_visit.pop() if calculate is not None and name not in calculate: continue if dont_calculate is not None and name in dont_calculate: continue if name in npy_metafeatures: if X_transformed is None: # TODO make sure this is done as efficient as possible (no copy for # sparse matrices because of wrong sparse format) sparse = scipy.sparse.issparse(X) ohe = OneHotEncoder(categorical_features=categorical, sparse=True) X_transformed = ohe.fit_transform(X) imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype) X_transformed = imputer.fit_transform(X_transformed) standard_scaler = StandardScaler(copy=False) X_transformed = standard_scaler.fit_transform(X_transformed) # Transform the array which indicates the categorical metafeatures number_numerical = np.sum(~np.array(categorical)) categorical_transformed = [True] * (X_transformed.shape[1] - number_numerical) + \ [False] * number_numerical # Densify the transformed matrix if not sparse and scipy.sparse.issparse(X_transformed): bytes_per_float = X_transformed.dtype.itemsize num_elements = X_transformed.shape[0] * X_transformed.shape[1] megabytes_required = num_elements * bytes_per_float / 1000 / 1000 if megabytes_required < densify_threshold: X_transformed = X_transformed.todense() # This is not only important for datasets which are somehow # sorted in a strange way, but also prevents lda from failing in # some cases. # Because this is advanced indexing, a copy of the data is returned!!! X_transformed = check_array(X_transformed, force_all_finite=True, accept_sparse='csr') rs = np.random.RandomState(42) indices = np.arange(X_transformed.shape[0]) rs.shuffle(indices) # TODO Shuffle inplace X_transformed = X_transformed[indices] y_transformed = y[indices] X_ = X_transformed y_ = y_transformed categorical_ = categorical_transformed else: X_ = X y_ = y categorical_ = categorical dependency = metafeatures.get_dependency(name) if dependency is not None: is_metafeature = dependency in metafeatures is_helper_function = dependency in helper_functions if is_metafeature and is_helper_function: raise NotImplementedError() elif not is_metafeature and not is_helper_function: raise ValueError(dependency) elif is_metafeature and not metafeatures.is_calculated(dependency): to_visit.appendleft(name) continue elif is_helper_function and not helper_functions.is_calculated( dependency): logger.info("%s: Going to calculate: %s", dataset_name, dependency) value = helper_functions[dependency](X_, y_, categorical_) helper_functions.set_value(dependency, value) mf_[dependency] = value logger.info("%s: Going to calculate: %s", dataset_name, name) value = metafeatures[name](X_, y_, categorical_) metafeatures.set_value(name, value) mf_[name] = value visited.add(name) mf_ = DatasetMetafeatures(dataset_name, mf_) return mf_
def test_imputation_copy(self): """Test imputation with copy""" X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0) # copy=True, dense => copy X = X_orig.copy().toarray() imputer = Imputer(missing_values=0, strategy="mean", copy=True) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 assert_false(np.all(X == Xt)) # copy=True, sparse csr => copy X = X_orig.copy() imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=True) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) # copy=False, dense => no copy X = X_orig.copy().toarray() imputer = Imputer(missing_values=0, strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 assert_true(np.all(X == Xt)) # copy=False, sparse csr, axis=1 => no copy X = X_orig.copy() imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=1) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_true(np.all(X.data == Xt.data)) # copy=False, sparse csc, axis=0 => no copy X = X_orig.copy().tocsc() imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=0) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_true(np.all(X.data == Xt.data)) # copy=False, sparse csr, axis=0 => copy X = X_orig.copy() imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=0) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) # copy=False, sparse csc, axis=1 => copy X = X_orig.copy().tocsc() imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=1) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) # copy=False, sparse csr, axis=1, missing_values=0 => copy X = X_orig.copy() imputer = Imputer(missing_values=0, strategy="mean", copy=False, axis=1) Xt = imputer.fit(X).transform(X) assert_false(sparse.issparse(Xt))
def _check_statistics(self, X, X_true, strategy, statistics, missing_values): """Utility function for testing imputation for a given strategy. Test: - along the two axes - with dense and sparse arrays Check that: - the statistics (mean, median, mode) are correct - the missing values are imputed correctly""" err_msg = "Parameters: strategy = %s, missing_values = %s, " \ "axis = {0}, sparse = {1}" % (strategy, missing_values) # Normal matrix, axis = 0 imputer = Imputer(missing_values, strategy=strategy, axis=0) X_trans = imputer.fit(X).transform(X.copy()) assert_array_equal(imputer.statistics_, statistics, err_msg.format(0, False)) assert_array_equal(X_trans, X_true, err_msg.format(0, False)) # Normal matrix, axis = 1 imputer = Imputer(missing_values, strategy=strategy, axis=1) imputer.fit(X.transpose()) if np.isnan(statistics).any(): assert_raises(ValueError, imputer.transform, X.copy().transpose()) else: X_trans = imputer.transform(X.copy().transpose()) assert_array_equal(X_trans, X_true.transpose(), err_msg.format(1, False)) # Sparse matrix, axis = 0 imputer = Imputer(missing_values, strategy=strategy, axis=0) imputer.fit(sparse.csc_matrix(X)) X_trans = imputer.transform(sparse.csc_matrix(X.copy())) if sparse.issparse(X_trans): X_trans = X_trans.toarray() assert_array_equal(imputer.statistics_, statistics, err_msg.format(0, True)) assert_array_equal(X_trans, X_true, err_msg.format(0, True)) # Sparse matrix, axis = 1 imputer = Imputer(missing_values, strategy=strategy, axis=1) imputer.fit(sparse.csc_matrix(X.transpose())) if np.isnan(statistics).any(): assert_raises(ValueError, imputer.transform, sparse.csc_matrix(X.copy().transpose())) else: X_trans = imputer.transform(sparse.csc_matrix(X.copy().transpose())) if sparse.issparse(X_trans): X_trans = X_trans.toarray() assert_array_equal(X_trans, X_true.transpose(), err_msg.format(1, True))