def test_lasso_readonly_data():
    """Fit Lasso on memory-mapped copies of the toy data and check the fit."""
    features = np.array([[-1], [0], [1]])
    targets = np.array([-1, 0, 1])  # just a straight line
    queries = np.array([[2], [3], [4]])  # test sample

    # TempMemmap hands back replacements for the arrays passed in; the model
    # is fitted on those replacements, not on the in-memory originals.
    with TempMemmap((features, targets)) as (features, targets):
        model = Lasso(alpha=0.5)
        model.fit(features, targets)
        predictions = model.predict(queries)

        assert_array_almost_equal(model.coef_, [.25])
        assert_array_almost_equal(predictions, [0.5, 0.75, 1.])
        assert_almost_equal(model.dual_gap_, 0)
def test_lasso_positive_constraint():
    """Check that ``positive=True`` never yields a negative coefficient.

    The data lie on a line with negative slope, so an unconstrained fit would
    produce a negative coefficient. The check is run both without and with
    ``precompute=True``.
    """
    X = [[-1], [0], [1]]
    y = [1, 0, -1]  # just a straight line with negative slope

    lasso = Lasso(alpha=0.1, max_iter=1000, positive=True)
    lasso.fit(X, y)
    # Plain ``assert`` replaces the legacy nose-style ``assert_true`` helper.
    assert min(lasso.coef_) >= 0

    lasso = Lasso(alpha=0.1, max_iter=1000, precompute=True, positive=True)
    lasso.fit(X, y)
    assert min(lasso.coef_) >= 0
def test_lasso_positive_constraint():
    """Verify the ``positive=True`` constraint with and without precompute."""
    features = [[-1], [0], [1]]
    targets = [1, 0, -1]  # just a straight line with negative slope

    # First fit uses the estimator's default precompute setting.
    plain = Lasso(alpha=0.1, max_iter=1000, positive=True)
    plain.fit(features, targets)
    assert min(plain.coef_) >= 0

    # Second fit requests precompute=True explicitly.
    precomputed = Lasso(alpha=0.1, max_iter=1000, precompute=True,
                        positive=True)
    precomputed.fit(features, targets)
    assert min(precomputed.coef_) >= 0
def test_lasso_alpha_warning():
    """Fitting Lasso with ``alpha=0`` must emit at least one warning."""
    check_warnings()  # Skip if unsupported Python version

    with warnings.catch_warnings(record=True) as caught:
        # Record every warning, including ones that would normally be
        # suppressed after their first occurrence.
        warnings.simplefilter("always")

        features = [[-1], [0], [1]]
        targets = [-1, 0, 1]  # just a straight line

        model = Lasso(alpha=0)
        model.fit(features, targets)

        assert_greater(len(caught), 0)  # warnings should be raised
def test_lasso_alpha_warning():
    """Check that an ``alpha=0`` Lasso fit is accompanied by a warning."""
    check_warnings()  # Skip if unsupported Python version

    samples = [[-1], [0], [1]]
    labels = [-1, 0, 1]  # just a straight line

    with warnings.catch_warnings(record=True) as recorded:
        warnings.simplefilter('always')
        estimator = Lasso(alpha=0)
        estimator.fit(samples, labels)
        n_warnings = len(recorded)
        assert_greater(n_warnings, 0)  # warnings should be raised
def test_fit_simple_backupsklearn(backend='auto'):
    """Compare the h2o4gpu Lasso wrapper against scikit-learn's Lasso.

    Loads ``./open_data/simple.txt`` (whitespace-separated; the last column
    is used as the target, all preceding columns as features), fits three
    models on it and prints their predictions and scores:

    * an h2o4gpu ``Lasso`` with ``glm_stop_early=False``,
    * an h2o4gpu ``Lasso`` configured like the scikit-learn one,
    * the scikit-learn ``Lasso`` itself.

    Unless ``backend == 'h2o4gpu'``, the wrapper's coefficients, intercept
    and iteration count are asserted to match scikit-learn's.

    Parameters
    ----------
    backend : str, default 'auto'
        Passed through to the h2o4gpu solver's ``backend`` argument.
    """
    # ``sep=r"\s+"`` is the documented equivalent of the deprecated
    # ``delim_whitespace=True``.
    df = pd.read_csv("./open_data/simple.txt", sep=r"\s+")
    n_features = df.shape[1] - 1
    X = np.array(df.iloc[:, :n_features], dtype='float32', order='C')
    y = np.array(df.iloc[:, n_features], dtype='float32', order='C')

    Solver = h2o4gpu.Lasso

    enet = Solver(glm_stop_early=False, backend=backend)
    print("h2o4gpu fit()")
    enet.fit(X, y)
    print("h2o4gpu predict()")
    print(enet.predict(X))
    print("h2o4gpu score()")
    print(enet.score(X, y))

    enet_wrapper = Solver(precompute=True, random_state=1234, backend=backend)
    print("h2o4gpu scikit wrapper fit()")
    enet_wrapper.fit(X, y)
    print("h2o4gpu scikit wrapper predict()")
    print(enet_wrapper.predict(X))
    print("h2o4gpu scikit wrapper score()")
    print(enet_wrapper.score(X, y))

    # Use the public import path; the private ``coordinate_descent`` module
    # is not importable in newer scikit-learn releases.
    from sklearn.linear_model import Lasso
    enet_sk = Lasso(precompute=True, random_state=1234)
    print("Scikit fit()")
    enet_sk.fit(X, y)
    print("Scikit predict()")
    print(enet_sk.predict(X))
    print("Scikit score()")
    print(enet_sk.score(X, y))

    # Round-trip through a float32 sparse matrix so the reference
    # coefficients come out as a dense 2-D float32 array.
    enet_sk_coef = csr_matrix(enet_sk.coef_, dtype=np.float32).toarray()
    enet_sk_sparse_coef = csr_matrix(enet_sk.sparse_coef_,
                                     dtype=np.float32).toarray()

    if backend != 'h2o4gpu':
        print(enet_sk.coef_)
        print(enet_sk.sparse_coef_)

        print(enet_sk_coef)
        print(enet_sk_sparse_coef)

        print(enet_wrapper.coef_)
        print(enet_wrapper.sparse_coef_)

        print(enet_sk.intercept_)
        print(enet_wrapper.intercept_)

        print(enet_sk.n_iter_)
        print(enet_wrapper.n_iter_)

        assert np.allclose(enet_wrapper.coef_, enet_sk_coef)
        assert np.allclose(enet_wrapper.intercept_, enet_sk.intercept_)
        assert np.allclose(enet_wrapper.n_iter_, enet_sk.n_iter_)
def test_sparse_lasso_not_as_toy_dataset():
    """Sparse and dense Lasso fits should both converge and generalise.

    The second half of the generated samples is used for training and the
    first half for scoring.
    """
    n_samples = 100
    max_iter = 1000
    n_informative = 10

    X, y = make_sparse_data(n_samples=n_samples, n_informative=n_informative)

    half = n_samples // 2
    X_train, y_train = X[half:], y[half:]
    X_test, y_test = X[:half], y[:half]

    lasso_kwargs = dict(alpha=0.1, fit_intercept=False, max_iter=max_iter,
                        tol=1e-7)

    sparse_model = Lasso(**lasso_kwargs)
    sparse_model.fit(X_train, y_train)
    assert_almost_equal(sparse_model.dual_gap_, 0, 4)
    assert_greater(sparse_model.score(X_test, y_test), 0.85)

    # check the convergence is the same as the dense version
    dense_model = Lasso(**lasso_kwargs)
    dense_model.fit(X_train.toarray(), y_train)
    assert_almost_equal(dense_model.dual_gap_, 0, 4)
    assert_greater(dense_model.score(X_test, y_test), 0.85)

    # check that the coefs are sparse
    n_nonzero = np.sum(sparse_model.coef_ != 0.0)
    assert_equal(n_nonzero, n_informative)
def test_sparse_lasso_not_as_toy_dataset():
    """Check Lasso on generated sparse data, against a dense reference fit.

    The second half of the samples trains the models and the first half is
    used for scoring. Both the sparse-input and the dense-input fit must
    reach a near-zero dual gap and a score above 0.85, and the sparse fit
    must keep exactly ``n_informative`` non-zero coefficients.
    """
    n_samples = 100
    max_iter = 1000
    n_informative = 10

    X, y = make_sparse_data(n_samples=n_samples, n_informative=n_informative)

    # Floor division: ``n_samples / 2`` is a float on Python 3 and cannot be
    # used as a slice index.
    half = n_samples // 2
    X_train, X_test = X[half:], X[:half]
    y_train, y_test = y[half:], y[:half]

    s_clf = Lasso(alpha=0.1, fit_intercept=False, max_iter=max_iter, tol=1e-7)
    s_clf.fit(X_train, y_train)
    assert_almost_equal(s_clf.dual_gap_, 0, 4)
    assert_greater(s_clf.score(X_test, y_test), 0.85)

    # check the convergence is the same as the dense version
    d_clf = Lasso(alpha=0.1, fit_intercept=False, max_iter=max_iter, tol=1e-7)
    # ``toarray()`` yields a plain ndarray; ``todense()`` would yield the
    # legacy ``np.matrix`` type.
    d_clf.fit(X_train.toarray(), y_train)
    assert_almost_equal(d_clf.dual_gap_, 0, 4)
    assert_greater(d_clf.score(X_test, y_test), 0.85)

    # check that the coefs are sparse
    assert_equal(np.sum(s_clf.coef_ != 0.0), n_informative)
class LassoImpl:
    """Thin adapter that forwards hyperparameters and calls to ``Op``.

    The constructor arguments are collected into ``self._hyperparams`` and
    passed unchanged to ``Op`` to build ``self._wrapped_model``.
    """

    def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
                 precompute=False, copy_X=True, max_iter=1000, tol=0.0001,
                 warm_start=False, positive=False, random_state=None,
                 selection='cyclic'):
        self._hyperparams = dict(
            alpha=alpha,
            fit_intercept=fit_intercept,
            normalize=normalize,
            precompute=precompute,
            copy_X=copy_X,
            max_iter=max_iter,
            tol=tol,
            warm_start=warm_start,
            positive=positive,
            random_state=random_state,
            selection=selection,
        )
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is forwarded only when given.

        Returns ``self``.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        model = self._wrapped_model
        return model.predict(X)
def test_lasso_toy():
    """
    Fit Lasso on a three-point straight line for several alpha values and
    compare coefficient, predictions and dual gap with known results.

    When validating this against glmnet notice that glmnet divides it
    against nobs.
    """
    X = [[-1], [0], [1]]
    Y = [-1, 0, 1]  # just a straight line
    T = [[2], [3], [4]]  # test sample

    # (alpha, expected coefficient, expected predictions on T)
    cases = [
        (1e-8, [1], [2, 3, 4]),
        (0.1, [0.85], [1.7, 2.55, 3.4]),
        (0.5, [0.25], [0.5, 0.75, 1.0]),
        (1, [0.0], [0, 0, 0]),
    ]

    for alpha, expected_coef, expected_pred in cases:
        clf = Lasso(alpha=alpha)
        clf.fit(X, Y)
        pred = clf.predict(T)
        assert_array_almost_equal(clf.coef_, expected_coef)
        assert_array_almost_equal(pred, expected_pred)
        assert_almost_equal(clf.dual_gap_, 0)
def test_lasso_toy():
    """
    Exercise Lasso on a toy straight-line problem across a range of alphas.

    When validating this against glmnet notice that glmnet divides it
    against nobs.
    """
    train_x = [[-1], [0], [1]]
    train_y = [-1, 0, 1]  # just a straight line
    test_x = [[2], [3], [4]]  # test sample

    def check(alpha, coef, predictions):
        # Fit a fresh estimator and compare it with the expected solution.
        model = Lasso(alpha=alpha)
        model.fit(train_x, train_y)
        predicted = model.predict(test_x)
        assert_array_almost_equal(model.coef_, coef)
        assert_array_almost_equal(predicted, predictions)
        assert_almost_equal(model.dual_gap_, 0)

    check(1e-8, [1], [2, 3, 4])
    check(0.1, [.85], [1.7, 2.55, 3.4])
    check(0.5, [.25], [0.5, 0.75, 1.])
    check(1, [.0], [0, 0, 0])
def test_coef_shape_not_zero():
    """A single-feature fit without intercept must give ``coef_`` shape (1,)."""
    targets = np.ones(3)
    features = np.c_[np.ones(3)]  # one column of ones

    model = Lasso(fit_intercept=False)
    model.fit(features, targets)

    expected_shape = (1,)
    assert model.coef_.shape == expected_shape
def test_coef_shape_not_zero():
    """Check ``coef_`` keeps shape (1,) for one feature and no intercept."""
    column = np.c_[np.ones(3)]
    response = np.ones(3)

    fitted = Lasso(fit_intercept=False).fit(column, response)

    coef_shape = fitted.coef_.shape
    assert coef_shape == (1,)
class TrimRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive using Trim in combination with Lasso. Code based on JPL's implementation of Lasso.
    Trim deconfounding paper: https://arxiv.org/pdf/1811.05352.pdf
    `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html>`_
    """

    # Overview of the visible behaviour:
    #   * ``fit`` runs Lasso twice: once on the transformed system (F X, F y)
    #     to obtain ``_beta``, then on (X, y - X beta) to obtain ``_delta``.
    #   * ``produce`` predicts with ``X . (_beta + _delta)``; no intercept
    #     term is added.
    #   * The first column of both inputs and outputs is dropped before
    #     fitting/predicting (the original comments call it the d3mIndex
    #     column) and the first input column is prepended to the predictions.

    __author__ = "ISI"
    metadata = metadata_base.PrimitiveMetadata({
        "id": "de250522-5edb-4697-8945-56d04baba0e4",
        "version": "1.0.0",
        "name": "TrimRegressor",
        "description": "Lasso enhanced by spectral deconfounding",
        "python_path": "d3m.primitives.regression.trim_regressor.TrimRegressor",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/serbanstan/trim-regressor"]
        },
        "algorithm_types": ["REGULARIZED_LEAST_SQUARES", 'FEATURE_SCALING'],
        "primitive_family": "REGRESSION",
        "installation": [config.INSTALLATION]
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        """Create the underlying Lasso estimator and reset all fitted state.

        Only the ``alpha`` hyperparameter and the random seed are forwarded
        to Lasso; every other Lasso argument keeps its library default.
        """
        super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                         docker_containers=docker_containers)

        self._clf = Lasso(
            alpha=self.hyperparams['alpha'],
            random_state=self.random_seed,
        )

        # Coefficient vectors learned by ``fit``; None until fitted or
        # restored through ``set_params``.
        self._beta = None
        self._delta = None

        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._fitted = False

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Store the training columns/targets and mark the primitive unfitted."""
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            inputs, self.hyperparams)
        self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(
            outputs, self.hyperparams)
        self._fitted = False

    # Computes the linear transform F, so we work in the system (FX, FY) to recover the
    # true betas.
    def _compute_F(self, X_data):
        """Return the trim transform ``F`` and its companion ``F_inv``.

        With the SVD ``X = U diag(d) V``, each singular value ``d_i`` is
        mapped to ``min(d_i, tau) / d_i``, where ``tau`` is the singular
        value at position ``int(r * trim_perc)`` in ascending order. Then
        ``F = U D_hat U^T`` and ``F_inv = U D_hat_inv U^T``, where the
        diagonal matrices are zero outside their leading ``r x r`` block.

        NOTE(review): a ``trim_perc`` of 1.0 would index past the end of the
        sorted singular values, and a zero singular value would divide by
        zero; confirm the hyperparameter bounds rule these out.
        """
        X = numpy.array(X_data)
        U, d, _ = numpy.linalg.svd(X)

        r = len(d)
        tau = sorted(d)[int(r * self.hyperparams['trim_perc'])]
        d_hat = numpy.array([min(x, tau) / x for x in d])

        D_hat = numpy.zeros(U.shape)
        D_hat[:r, :r] = numpy.diag(d_hat)
        D_hat_inv = numpy.zeros(U.shape)
        D_hat_inv[:r, :r] = numpy.diag(1 / d_hat)

        F = numpy.dot(U, numpy.dot(D_hat, U.T))
        F_inv = numpy.dot(U, numpy.dot(D_hat_inv, U.T))
        return F, F_inv

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """Fit ``_beta`` on the transformed data and ``_delta`` on the residual.

        Does nothing if already fitted.

        Raises:
            ValueError: if no training data has been set.
        """
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")

        self._target_columns_metadata = self._get_target_columns_metadata(
            self._training_outputs.metadata)

        # Don't want to use the d3mIndex column
        X = numpy.array(
            self._training_inputs[self._training_inputs.columns[1:]])
        y = numpy.array(
            self._training_outputs[self._training_outputs.columns[1:]])
        if y.shape[1] == 1:
            y = y.ravel()

        F, _ = self._compute_F(X)
        new_inputs = numpy.dot(F, X)
        new_outputs = numpy.dot(F, y)

        self._beta = self._clf.fit(new_inputs, new_outputs).coef_
        # Second pass: fit whatever the first solution leaves unexplained on
        # the untransformed data.
        remainder = y - numpy.dot(X, self._beta)
        self._delta = self._clf.fit(X, remainder).coef_

        self._fitted = True
        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """Predict targets for ``inputs`` as ``X . (_beta + _delta)``.

        The first input column is excluded from the product and then
        prepended to the predictions before they are wrapped and combined
        with the inputs according to the ``return_result`` and
        ``add_index_columns`` hyperparameters.
        """
        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            sk_inputs = inputs.iloc[:, self._training_indices]

        # do prediction without index column
        sk_output = numpy.dot(sk_inputs[sk_inputs.columns[1:]],
                              self._beta + self._delta)
        if len(sk_output.shape) == 1:
            sk_output = sk_output.reshape(sk_output.shape[0], 1)

        # but add it back in afterwards
        idx_col = sk_inputs[sk_inputs.columns[0]].values
        if len(idx_col.shape) == 1:
            idx_col = idx_col.reshape(idx_col.shape[0], 1)
        sk_output = numpy.concatenate((idx_col, sk_output), axis=1)

        if sparse.issparse(sk_output):
            sk_output = sk_output.toarray()

        output = self._wrap_predictions(inputs, sk_output)
        output.columns = self._target_names
        outputs = common_utils.combine_columns(
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
            inputs=inputs,
            column_indices=self._target_column_indices,
            columns_list=[output])

        return CallResult(outputs)

    def get_params(self) -> Params:
        """Return the learned state; ``beta``/``delta`` are None if unfitted."""
        fitted = self._fitted
        return Params(
            beta=self._beta if fitted else None,
            delta=self._delta if fitted else None,
            training_indices_=self._training_indices,
            target_names_=self._target_names,
            target_column_indices_=self._target_column_indices,
            target_columns_metadata_=self._target_columns_metadata)

    def set_params(self, *, params: Params) -> None:
        """Restore state produced by ``get_params``.

        The primitive counts as fitted only when both ``beta`` and ``delta``
        are present.
        """
        # No trailing commas here: they would wrap each value in a 1-tuple
        # and break ``self._beta + self._delta`` in ``produce``.
        self._beta = params['beta']
        self._delta = params['delta']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']

        # Only keys that ``get_params`` actually emits are consulted.
        self._fitted = (params['beta'] is not None
                        and params['delta'] is not None)

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        """Return ``(selected inputs, selected column indices)``.

        All columns are used unless ``use_semantic_types`` is set, in which
        case the selection honours ``use_input_columns`` /
        ``exclude_input_columns`` and ``_can_produce_column``.
        """
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index,
                                           hyperparams)

        columns_to_produce, columns_not_to_produce = common_utils.get_columns_to_use(
            inputs_metadata,
            use_columns=hyperparams['use_input_columns'],
            exclude_columns=hyperparams['exclude_input_columns'],
            can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int,
                            hyperparams: Hyperparams) -> bool:
        """Accept numeric columns that carry the ``Attribute`` semantic type."""
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add(
            "https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'],
                          accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))

        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False

    @classmethod
    def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
        """Return ``(targets, target column names, target column indices)``.

        Without ``use_semantic_types`` the data is returned as is, with all
        of its column names and an empty index list. Otherwise only columns
        carrying the ``TrueTarget`` semantic type are eligible.
        """
        if not hyperparams['use_semantic_types']:
            return data, list(data.columns), []

        metadata = data.metadata

        def can_produce_column(column_index: int) -> bool:
            accepted_semantic_types = set()
            accepted_semantic_types.add(
                "https://metadata.datadrivendiscovery.org/types/TrueTarget")
            column_metadata = metadata.query(
                (metadata_base.ALL_ELEMENTS, column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            if len(semantic_types) == 0:
                cls.logger.warning(
                    "No semantic types found in column metadata")
                return False
            # Making sure all accepted_semantic_types are available in semantic_types
            if len(accepted_semantic_types - semantic_types) == 0:
                return True
            return False

        target_column_indices, target_columns_not_to_produce = common_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_output_columns'],
            exclude_columns=hyperparams['exclude_output_columns'],
            can_use_column=can_produce_column)
        targets = common_utils.select_columns(data, target_column_indices)
        target_column_names = [data.columns[idx]
                               for idx in target_column_indices]
        return targets, target_column_names, target_column_indices

    @classmethod
    def _get_target_columns_metadata(
            cls, outputs_metadata: metadata_base.DataMetadata) -> List[OrderedDict]:
        """Copy each output column's metadata, retagged as a predicted target.

        ``PredictedTarget`` is added and the ``TrueTarget``,
        ``SuggestedTarget`` and ``Attribute`` semantic types are removed.
        """
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(
                outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = list(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = [
                "https://metadata.datadrivendiscovery.org/types/TrueTarget",
                "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
                "https://metadata.datadrivendiscovery.org/types/Attribute"
            ]
            if 'https://metadata.datadrivendiscovery.org/types/PredictedTarget' not in semantic_types:
                semantic_types.append(
                    'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
                )
            semantic_types = [
                semantic_type for semantic_type in semantic_types
                if semantic_type not in semantic_types_to_remove
            ]
            column_metadata['semantic_types'] = semantic_types

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    @classmethod
    def _update_predictions_metadata(
            cls, inputs_metadata: metadata_base.DataMetadata,
            outputs: Optional[Outputs],
            target_columns_metadata: List[OrderedDict]
    ) -> metadata_base.DataMetadata:
        """Build output metadata and apply the per-column target metadata."""
        outputs_metadata = inputs_metadata.clear(for_value=outputs,
                                                 generate_metadata=True)

        for column_index, column_metadata in enumerate(
                target_columns_metadata):
            outputs_metadata = outputs_metadata.update_column(
                column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs,
                          predictions: ndarray) -> Outputs:
        """Wrap raw predictions in a d3m dataframe with target metadata."""
        outputs = d3m_dataframe(predictions, generate_metadata=False)
        outputs.metadata = self._update_predictions_metadata(
            inputs.metadata, outputs, self._target_columns_metadata)
        return outputs

    @classmethod
    def _add_target_columns_metadata(
            cls, outputs_metadata: metadata_base.DataMetadata):
        """Create fresh ``PredictedTarget`` metadata for every output column.

        Columns without a ``name`` in their metadata are named
        ``output_<index>``.
        """
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict()
            semantic_types = []
            semantic_types.append(
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
            )
            column_name = outputs_metadata.query(
                (metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)
            column_metadata["semantic_types"] = semantic_types
            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata


# TrimRegressor.__doc__ = Lasso.__doc__