def pinball_loss(y_true, y_pred, probs):
    """Compute the pinball loss.

    Parameters
    ----------
    y_true : {array-like}, shape = [n_samples]
        Targets.

    y_pred : {array-like}, shape = [n_quantiles, n_samples] or [n_samples]
        Predictions.

    probs : {array-like} or scalar
        Quantile levels; flattened to one dimension and paired, in order,
        with the columns of the transposed predictions.

    Returns
    -------
    l : {array}, shape = [n_quantiles]
        Average loss for each quantile level.
    """
    probs = asarray(probs).reshape(-1)
    # Convert up front so plain lists work: ``.T`` and ``.reshape`` below are
    # ndarray methods and used to be called on the raw inputs.
    y_true = asarray(y_true)
    y_pred = asarray(y_pred)
    check_consistent_length(y_true, y_pred.T)
    y_true = check_array(y_true.reshape((-1, 1)), ensure_2d=True)
    y_pred = check_array(y_pred.T.reshape((y_true.shape[0], -1)),
                         ensure_2d=True)
    # Broadcast the (n_samples, 1) targets against (n_samples, n_quantiles).
    residual = y_true - y_pred
    loss = npsum([fmax(prob * res, (prob - 1) * res)
                  for (res, prob) in zip(residual.T, probs)], axis=1)
    return loss / y_true.size
def fit(self, X, y=None):
    """Fit the encoder: learn (or validate) the categories of each column.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The data whose categories are determined, one feature per column.
    y : array-like, optional
        Target values; only read when ``encoding == 'target'``.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        For an unsupported ``encoding`` or ``handle_unknown`` value, for
        ``handle_unknown='ignore'`` combined with ``encoding='ordinal'``,
        for unsorted user-supplied categories, or when user-supplied
        categories do not cover the data and ``handle_unknown='error'``.
    """
    if self.encoding not in ['similarity', 'target', 'ordinal', 'onehot',
                             'onehot-dense', 'ngram-count',
                             'ngram-presence', 'ngram-tfidf']:
        template = ("Encoding %s has not been implemented yet")
        # Report the offending encoding (the message used to interpolate
        # ``handle_unknown`` by mistake).
        raise ValueError(template % self.encoding)

    if self.handle_unknown not in ['error', 'ignore']:
        template = ("handle_unknown should be either 'error' or "
                    "'ignore', got %s")
        raise ValueError(template % self.handle_unknown)

    if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
        raise ValueError("handle_unknown='ignore' is not supported for"
                         " encoding='ordinal'")

    if self.categories != 'auto':
        for cats in self.categories:
            if not np.all(np.sort(cats) == np.array(cats)):
                raise ValueError("Unsorted categories are not yet "
                                 "supported")

    X_temp = check_array(X, dtype=None)
    if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
        # Inputs without a dtype that were inferred as strings are
        # re-validated with the generic object dtype (``np.object`` was a
        # plain alias of ``object`` and has been removed from NumPy).
        X = check_array(X, dtype=object)
    else:
        X = X_temp

    n_samples, n_features = X.shape

    self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

    for i in range(n_features):
        le = self._label_encoders_[i]
        Xi = X[:, i]
        if self.categories == 'auto':
            le.fit(Xi)
        else:
            if self.handle_unknown == 'error':
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    diff = np.unique(Xi[~valid_mask])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during fit".format(diff, i))
                    raise ValueError(msg)
            le.classes_ = np.array(self.categories[i])

    self.categories_ = [le.classes_ for le in self._label_encoders_]

    if self.encoding == 'target':
        # Per-category mean of y for every column, plus the global mean of y
        # repeated once per column.
        self.Eyx_ = [{cat: np.mean(y[X[:, i] == cat])
                      for cat in self.categories_[i]}
                     for i in range(len(self.categories_))]
        self.Ey_ = [np.mean(y) for i in range(len(self.categories_))]

    return self
def predict_proba(self,X):
    """Create predictions. Start a vw process. Convert data to vw format and send.
    Returns class probability estimates for the given test data.

    X : pandas dataframe or array-like
        Test samples

    Returns
    -------
    proba : array-like, shape = (n_samples, n_outputs)
        Class probability estimates, or ``None`` when there are no rows.

    Caveats :
    1. A seldon specific fork of wabbit_wappa is needed to allow vw to run in server mode without save_resume. Save_resume seems to cause issues with the scores returned. Maybe connected to https://github.com/JohnLangford/vowpal_wabbit/issues/262
    """
    self._start_vw_if_needed("test")
    if isinstance(X,pd.DataFrame):
        df_base = self._exclude_include_features(X)
        df_base = df_base.fillna(0)
    else:
        check_array(X)
        df_base = pd.DataFrame(X)
    df_vw = df_base.apply(self._convert_row,axis=1)
    # Collect one score row per sample and stack once at the end instead of
    # re-allocating the whole result with np.vstack on every iteration.
    rows = []
    # ``items()`` is the supported spelling; ``iteritems()`` was removed in
    # pandas 2.0.
    for _, val in df_vw.items():
        # The return value is not used here; scores are read back through
        # _get_full_scores() below.
        self.vw.send_line(val,parse_result=True)
        self._start_raw_predictions()
        rows.append(self._get_full_scores())
    if not rows:
        return None
    # Same construction as before: the first row seeds the array and the
    # remaining rows are stacked underneath it.
    predictions = np.array([rows[0]])
    if len(rows) > 1:
        predictions = np.vstack([predictions] + rows[1:])
    return predictions
def __call__(self, y_true, y_pred, eps=1e-15, normalize=True,
             sample_weight=None):
    """Compute the log loss, reusing the label binarizer across calls.

    On the first call (``self.lb_`` is None) a ``LabelBinarizer`` is fitted
    on ``y_true`` and stored; later calls only transform with it.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like of float
        Predicted probabilities; a 1-D or single-column input is expanded
        to two columns ``(1 - p, p)``.
    eps : float
        Predictions are clipped to ``[eps, 1 - eps]``.
    normalize, sample_weight
        Forwarded to ``_weighted_sum``.

    Raises
    ------
    ValueError
        If the clipped predictions are not an ndarray, or the number of
        columns of the binarized labels and the predictions differ.
    """
    if self.lb_ is not None:
        T = self.lb_.transform(y_true)
    else:
        self.lb_ = LabelBinarizer()
        T = self.lb_.fit_transform(y_true)

    # A single indicator column means a binary problem: add the complement.
    if T.shape[1] == 1:
        T = np.append(1 - T, T, axis=1)

    Y = np.clip(y_pred, eps, 1 - eps)
    if not isinstance(Y, np.ndarray):
        raise ValueError("y_pred should be an array of floats.")

    if Y.ndim == 1:
        Y = Y[:, np.newaxis]
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    check_consistent_length(T, Y)
    T = check_array(T)
    Y = check_array(Y)
    n_true_cols = T.shape[1]
    n_pred_cols = Y.shape[1]
    if n_true_cols != n_pred_cols:
        raise ValueError("y_true and y_pred have different number of classes "
                         "%d, %d" % (n_true_cols, n_pred_cols))

    # Renormalise each row so that it sums to one.
    row_totals = Y.sum(axis=1)
    Y /= row_totals[:, np.newaxis]
    per_sample = -(T * np.log(Y)).sum(axis=1)
    return _weighted_sum(per_sample, sample_weight, normalize)
def test_check_array_force_all_finiteinvalid(value, force_all_finite,
                                             match_msg, retype):
    """check_array must raise ValueError matching ``match_msg``.

    The arguments are presumably supplied by a pytest parametrization that
    is not visible in this block: ``value`` is written into a 2x2 float
    matrix, which ``retype`` converts to the container under test.
    """
    # ``np.float`` was only an alias of the builtin and has been removed from
    # NumPy; the builtin ``float`` yields the same float64 array.
    X = retype(np.arange(4).reshape(2, 2).astype(float))
    X[0, 0] = value
    with pytest.raises(ValueError, match=match_msg):
        check_array(X, force_all_finite=force_all_finite,
                    accept_sparse=True)
def fit(self, X_train, y_train, n_more_iter=0):
    """ Fit model with specified loss.

    Parameters
    ----------
    X_train : scipy.sparse.csc_matrix, (n_samples, n_features)

    y_train : float | ndarray, shape = (n_samples, )

    n_more_iter : int
        Number of iterations to continue from the current Coefficients.

    Returns
    -------
    self
    """
    check_consistent_length(X_train, y_train)
    y_train = check_array(y_train, ensure_2d=False, dtype=np.float64)
    X_train = check_array(X_train, accept_sparse="csc", dtype=np.float64,
                          order="F")

    # The iteration budget grows by the requested number of extra iterations.
    self.n_iter += n_more_iter

    continuing = n_more_iter > 0
    if continuing:
        _check_warm_start(self, X_train)
        self.warm_start = True

    fitted = ffm.ffm_als_fit(self, X_train, y_train)
    self.w0_, self.w_, self.V_ = fitted

    if self.iter_count == 0:
        self.iter_count = self.n_iter
    else:
        self.iter_count += n_more_iter

    # reset to default setting
    self.warm_start = False
    return self
def _transform_new(self, X):
    """New implementation assuming categorical input.

    Encodes ``X`` with ``self._transform`` and assembles a one-hot CSR
    matrix with one column per known category of each feature. Entries
    whose mask is False are left out of the matrix.

    Returns a dense array when ``self.sparse`` is falsy, otherwise the
    sparse matrix.
    """
    X_temp = check_array(X, dtype=None)
    if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
        # ``np.object`` was an alias of the builtin ``object`` and has been
        # removed from NumPy.
        X = check_array(X, dtype=object)
    else:
        X = X_temp

    n_samples, n_features = X.shape

    X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)

    mask = X_mask.ravel()
    n_values = [cats.shape[0] for cats in self.categories_]
    n_values = np.array([0] + n_values)
    # Column offset at which each feature's block of categories starts.
    feature_indices = np.cumsum(n_values)

    indices = (X_int + feature_indices[:-1]).ravel()[mask]
    # Row pointer: cumulative count of kept entries per sample.
    indptr = X_mask.sum(axis=1).cumsum()
    indptr = np.insert(indptr, 0, 0)
    data = np.ones(n_samples * n_features)[mask]

    out = sparse.csr_matrix((data, indices, indptr),
                            shape=(n_samples, feature_indices[-1]),
                            dtype=self.dtype)
    if not self.sparse:
        return out.toarray()
    else:
        return out
def fit_transform(self,X,y=None):
    """
    Generates sets of hyper-spheres for anomaly scores

    One set is built per ensemble member, each from the first
    ``sample_size`` rows of a fresh random permutation of X.

    Parameters
    ----------
    X : numpy array (nb_samples, nb_features)
        data set
    y : ignored by this method

    Returns
    -------
    self
    """
    started = time()
    check_array(X)
    self._sets_of_spheres = []
    if self.verbose:
        logger.info('generating sets of spheres...')
    for _ in range(self.ensemble_size):
        shuffled = np.random.permutation(X)
        subsample = shuffled[:self.sample_size, :]
        new_set = self._generate_spheres(subsample)
        self._sets_of_spheres.append(new_set)
    elapsed = time() - started
    # Break the elapsed seconds into hours / minutes / seconds for the log.
    minutes, seconds = divmod(elapsed, 60)
    hours, minutes = divmod(minutes, 60)
    if self.verbose:
        logger.info('Total run time: %i:%i:%i' % (hours, minutes, seconds))
    return self
def query(self, X, **query_kwargs):
    """
    Select instances from X by delegating to ``self.query_strategy``.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        The pool of samples from which the query strategy should choose
        instances to request labels.

    query_kwargs: keyword arguments
        Keyword arguments for the query strategy function

    Returns
    -------
    query_idx: numpy.ndarray of shape (n_instances, )
        The indices of the instances from X chosen to be labelled.

    X[query_idx]: numpy.ndarray of shape (n_instances, n_features)
        The instances from X chosen to be labelled.
    """
    check_array(X, ensure_2d=True)

    # The strategy returns a pair; only the indices are used, and the
    # instances are re-read from X itself.
    chosen, _ = self.query_strategy(self, X, **query_kwargs)
    return chosen, X[chosen]
def fit(self, X, y):
    """Fit by feeding the samples to ``partial_fit`` one at a time.

    Parameters
    ----------
    X : array-like
        Training samples; iterated row by row.
    y : array-like
        One target per sample.

    Returns
    -------
    self
    """
    # Validate the two inputs separately. ``check_array(X, y)`` bound ``y``
    # to check_array's second positional parameter (``accept_sparse``), so
    # ``y`` itself was never validated.
    check_array(X)
    check_array(y, ensure_2d=False, dtype=None)
    for x_i, y_i in izip(X, y):
        self.partial_fit(x_i, y_i)
    return self
def _add_training_data(self, X, y):
    """
    Adds the new data and label to the known data, but does not retrain the model.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        The new samples for which the labels are supplied by the expert.

    y: numpy.ndarray of shape (n_samples, )
        Labels corresponding to the new instances in X.

    Raises
    ------
    ValueError
        If the new data cannot be stacked onto the stored training data.

    Note
    ----
    If the classifier has been fitted, the features in X have to agree
    with the training samples which the classifier has seen.
    """
    X, y = check_array(X), check_array(y, ensure_2d=False)
    assert len(X) == len(y), 'the number of new data points and number of labels must match'

    if self._X_training is not None:
        try:
            self._X_training = np.vstack((self._X_training, X))
            self._y_training = np.concatenate((self._y_training, y))
        except ValueError:
            # The two literals are concatenated; the separating space was
            # missing, which produced "mustagree" in the message.
            raise ValueError('the dimensions of the new training data and label must '
                             'agree with the training data and labels provided so far')
    else:
        # First batch: it simply becomes the training set.
        self._X_training = X
        self._y_training = y
def csr_to_fm(self, X_csr, return_oh=True, indices=None):
    """Convert a sparse matrix into (one-hot index matrix, values).

    Parameters
    ----------
    X_csr : sparse matrix of shape (self.n_samples, self.n_features)
    return_oh : bool
        When False only the value vector ``y`` is returned.
    indices : None, tuple or scipy.sparse.csc_matrix
        ``None`` uses the stored entries of ``X_csr``. A tuple is read as
        ``(indices_samples, indices_features)``; a csc matrix is converted
        with ``self.fm_to_indices``.

    Returns
    -------
    y, or ``(X_oh, y)`` when ``return_oh`` is true.

    Raises
    ------
    TypeError
        If ``indices`` is neither None, a tuple nor a csc matrix.
    """
    assert (X_csr.shape == (self.n_samples, self.n_features))

    if indices is None:
        y = check_array(X_csr.data, ensure_2d=False, copy=True)
    else:
        if isinstance(indices, tuple):
            indices_samples, indices_features = indices
        elif isinstance(indices, sp.csc_matrix):
            indices_samples, indices_features = self.fm_to_indices(indices)
        else:
            # Previously fell through and failed later with a NameError.
            raise TypeError("indices must be None, a tuple or a "
                            "scipy.sparse.csc_matrix, got %r"
                            % type(indices))
        y = X_csr[indices_samples, indices_features].A[0].copy()

    if not return_oh:
        return y

    X = check_array(X_csr, accept_sparse='coo', force_all_finite=False)
    n_rows, n_cols = X_csr.shape
    assert ((n_rows, n_cols) == (self.n_samples, self.n_features))

    # Built for both branches: it used to be defined only when ``indices``
    # was None, leaving ``encoder`` undefined otherwise.
    encoder = OneHotEncoder(n_values=[self.n_samples, self.n_features])
    if indices is None:
        X_ix = np.column_stack([X.row, X.col])
    else:
        # ``np.sorted`` does not exist, and asserting on an element-wise
        # ``==`` of arrays is ambiguous; compare the sorted arrays as a whole.
        assert np.array_equal(np.sort(indices_samples), np.sort(X.row))
        assert np.array_equal(np.sort(indices_features), np.sort(X.col))
        X_ix = np.column_stack([indices_samples, indices_features])
    X_oh = encoder.fit_transform(X_ix)
    return X_oh, y
def vote(self, X, **predict_kwargs):
    """
    Collect the prediction of every learner in the Committee for X.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        The samples to cast votes.

    predict_kwargs: keyword arguments
        Keyword arguments to be passed for the learners .predict() method.

    Returns
    -------
    vote: numpy.ndarray of shape (n_samples, n_learners)
        The predicted class for each learner in the Committee and each
        sample in X.
    """
    check_array(X, ensure_2d=True)
    ballot_shape = (X.shape[0], len(self._learner_list))
    prediction = np.zeros(shape=ballot_shape)

    # One column per committee member.
    for column, member in enumerate(self._learner_list):
        prediction[:, column] = member.predict(X, **predict_kwargs)

    return prediction
def _transform(self, X, handle_unknown='error'):
    """Encode every column of X with its fitted label encoder.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    handle_unknown : str
        ``'error'`` raises on values missing from ``self.categories_``; any
        other value masks them out instead.

    Returns
    -------
    X_int : integer ndarray with the shape of X
        Encoded categories.
    X_mask : boolean ndarray with the shape of X
        False where the entry was an unknown category.

    Raises
    ------
    ValueError
        If unknown categories are found and ``handle_unknown == 'error'``.
    """
    X_temp = check_array(X, dtype=None)
    if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
        # ``np.object`` / ``np.int`` / ``np.bool`` were aliases of the
        # builtins and have been removed from NumPy; the builtins select the
        # same dtypes.
        X = check_array(X, dtype=object)
    else:
        X = X_temp

    _, n_features = X.shape
    X_int = np.zeros_like(X, dtype=int)
    X_mask = np.ones_like(X, dtype=bool)

    for i in range(n_features):
        Xi = X[:, i]
        valid_mask = np.in1d(Xi, self.categories_[i])

        if not np.all(valid_mask):
            if handle_unknown == 'error':
                diff = np.unique(X[~valid_mask, i])
                msg = ("Found unknown categories {0} in column {1}"
                       " during transform".format(diff, i))
                raise ValueError(msg)
            else:
                # Set the problematic rows to an acceptable value and
                # continue. The rows are marked in `X_mask` and will be
                # removed later.
                X_mask[:, i] = valid_mask
                Xi = Xi.copy()
                Xi[~valid_mask] = self.categories_[i][0]
        X_int[:, i] = self._label_encoders_[i].transform(Xi)

    return X_int, X_mask
def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
    """Return the per-sample log loss (negative log-likelihood).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; binarized with a freshly fitted
        ``LabelBinarizer``.
    y_pred : array-like of float
        Predicted probabilities; a 1-D or single-column input is expanded
        to two columns ``(1 - p, p)``.
    eps : float
        Predictions are clipped to ``[eps, 1 - eps]``.
    normalize, sample_weight
        Accepted but not used by this function's body.

    Returns
    -------
    loss : ndarray
        One loss value per sample (no reduction is applied).

    Raises
    ------
    ValueError
        If the clipped predictions are not an ndarray, or the number of
        columns of the binarized labels and the predictions differ.
    """
    binarizer = LabelBinarizer()
    T = binarizer.fit_transform(y_true)
    # A single indicator column means a binary problem: add the complement.
    if T.shape[1] == 1:
        T = np.append(1 - T, T, axis=1)

    # Keep probabilities away from 0 and 1.
    Y = np.clip(y_pred, eps, 1 - eps)

    # Reached when elements of y_pred have type "str".
    if not isinstance(Y, np.ndarray):
        raise ValueError("y_pred should be an array of floats.")

    # A one-dimensional y_pred is treated as the binary case.
    if Y.ndim == 1:
        Y = Y[:, np.newaxis]
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    # Validate that both sides agree in length and in number of classes.
    check_consistent_length(T, Y)
    T = check_array(T)
    Y = check_array(Y)
    n_true_cols = T.shape[1]
    n_pred_cols = Y.shape[1]
    if n_true_cols != n_pred_cols:
        raise ValueError("y_true and y_pred have different number of classes "
                         "%d, %d" % (n_true_cols, n_pred_cols))

    # Make every row of probabilities sum to one.
    row_totals = Y.sum(axis=1)
    Y /= row_totals[:, np.newaxis]
    per_sample = -(T * np.log(Y)).sum(axis=1)

    return per_sample
def fit(self, X, y):
    """Fit OVK ridge regression model.

    Parameters
    ----------
    X : {array-like}, shape = [n_samples, n_features]
        Training data (sparse input is rejected by the validation below).

    y : {array-like}, shape = [n_samples] or [n_samples, n_targets]
        Target values. Non-finite entries are only tolerated when y has
        more than one dimension.

    Returns
    -------
    self : returns an instance of self.
    """
    X = check_array(X, force_all_finite=True, accept_sparse=False,
                    ensure_2d=True)
    dense_1d_ok = dict(accept_sparse=False, ensure_2d=False)
    y = check_array(y, force_all_finite=False, **dense_1d_ok)
    if y.ndim == 1:
        # One-dimensional targets must be entirely finite.
        y = check_array(y, force_all_finite=True, **dense_1d_ok)
    self._validate_params()

    self.linop_ = self._get_kernel_map(X, y)
    gram = self.linop_._Gram(X)
    if not self.lbda > 0:
        # TODO: Check A is invertible!!
        self.dual_coefs_ = solve(gram, y)
    else:
        scaled_gram = -gram / self.lbda
        self.dual_coefs_ = dlyap(scaled_gram, self.linop_.A, y / self.lbda)
    return self
def transform(self, X):
    """Scale the columns of X by ``self.weights_``.

    Parameters
    ----------
    X : array-like or sparse matrix (csr / csc)

    Returns
    -------
    The weighted data. The input object itself is left untouched.
    """
    # Use the validated array: the result of check_array used to be
    # discarded, so array-likes such as lists reached the arithmetic below
    # unconverted.
    X = check_array(X, accept_sparse=['csr', 'csc'])
    if issparse(X):
        mult = spdiags(self.weights_, 0, self.length, self.length)
        return X * mult
    # Out-of-place multiply: the previous ``X *= self.weights_`` rewrote the
    # caller's dense array.
    return X * self.weights_
def __init__(self, X, y, n_classes, batch_size):
    """Validate the data and record the batch input/output shapes.

    Parameters
    ----------
    X : array-like
        Features; converted to float32, any number of dimensions allowed.
    y : array-like
        Targets; dtype is preserved.
    n_classes : int
        Number of classes; values above 1 add a class axis to the output
        shape.
    batch_size : int
        Leading dimension of both recorded shapes.
    """
    self.X = check_array(X, dtype=np.float32, ensure_2d=False,
                         allow_nd=True)
    self.y = check_array(y, ensure_2d=False, dtype=None)
    self.n_classes = n_classes
    self.batch_size = batch_size
    # Read the shape from the validated array: the raw ``X`` may be a list
    # or another array-like without a ``.shape`` attribute.
    self._input_shape = [batch_size] + list(self.X.shape[1:])
    self._output_shape = [batch_size, n_classes] if n_classes > 1 else [batch_size]
def test_check_array_on_mock_dataframe():
    """check_array on a MockDataFrame keeps or converts the dtype."""
    values = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]])
    frame = MockDataFrame(values)

    # Without an explicit dtype the wrapped array's dtype is kept.
    as_is = check_array(frame)
    assert_equal(as_is.dtype, values.dtype)

    # An explicit dtype is honoured.
    as_float32 = check_array(frame, dtype=np.float32)
    assert_equal(as_float32.dtype, np.dtype(np.float32))
def fit(self, X, y):
    """Fit by passing the samples to ``partial_fit`` one pair at a time.

    Each row of X is reshaped to a column ``(-1, 1)`` and each row of y to
    a single row ``(1, -1)`` before being handed over.

    Returns
    -------
    self
    """
    X = check_array(X)
    y = check_array(y)
    for sample, target in izip(X, y):
        sample_col = sample.reshape(-1, 1)
        target_row = target.reshape(1, -1)
        self.partial_fit(sample_col, target_row)
    return self
def partial_fit(self, X, y):
    """Append a batch of samples and targets to the stored data.

    Both inputs are validated (copied when ``self.copy`` is set). The first
    batch is stored as is; later batches are stacked row-wise underneath.
    """
    X = check_array(X, copy=self.copy)
    y = check_array(y, copy=self.copy)

    if self._X is not None:
        self._X = np.vstack((self._X, X))
        self._y = np.vstack((self._y, y))
    else:
        self._X = X
        self._y = y
def fit_score(self,X,y=None):
    """
    Generate set of hyper-sphere and return anomaly score for all points in dataset

    The score of a point is the mean of ``self._score`` over all generated
    sets of spheres. On the first call (no ``X_scored`` attribute yet) the
    data with the scores appended as a last column is stored in
    ``self.X_scored``.

    Parameters
    ----------
    X : numpy array
        data set
    y : ignored by this method

    Return
    ------
    scores : numpy array
        1-d vector with the anomaly scores for all data points
    """
    started = time()
    check_array(X)
    self._sets_of_spheres = []
    if self.verbose:
        logger.info('generating sets of spheres...')
    for _ in range(self.ensemble_size):
        shuffled = np.random.permutation(X)
        subsample = shuffled[:self.sample_size, :]
        new_set = self._generate_spheres(subsample)
        self._sets_of_spheres.append(new_set)

    n_points = X.shape[0]
    scores = np.zeros(n_points)
    for i in range(n_points):
        # Progress is only reported for every 1000th point.
        report_point = i % 1000 == 0
        if report_point and self.verbose:
            logger.info('Getting anomaly score for data point %i' % i)
            logger.info('X shape: %i X %i' % X.shape)
        member_scores = []
        for j, spheres in enumerate(self._sets_of_spheres):
            score = self._score(X[i], spheres)
            if report_point and j % 10 == 0 and self.verbose:
                logger.info('Anomaly score for data point %i from estimator %i: %f' % (i,j,score))
            member_scores.append(score)
        scores[i] = np.mean(member_scores)

    if 'X_scored' not in dir(self):
        self.X_scored = np.column_stack((X, scores))

    elapsed = time() - started
    minutes, seconds = divmod(elapsed, 60)
    hours, minutes = divmod(minutes, 60)
    if self.verbose:
        logger.info('Total run time: %i:%i:%i' % (hours, minutes, seconds))

    return scores
def __init__(self, X, y, n_classes, batch_size):
    """Validate the data and derive batch shapes and dtypes.

    X is validated with any number of dimensions allowed and the dtype
    list ``[np.float32, np.int64]``; y is converted to float32. Shapes are
    computed by ``_get_in_out_shape`` and the dtypes are taken from the
    validated arrays.
    """
    accepted_x_dtypes = [np.float32, np.int64]
    self.X = check_array(X, ensure_2d=False, allow_nd=True,
                         dtype=accepted_x_dtypes)
    self.y = check_array(y, ensure_2d=False, dtype=np.float32)

    self.n_classes = n_classes
    self.batch_size = batch_size

    shapes = _get_in_out_shape(self.X.shape, self.y.shape, n_classes,
                               batch_size)
    self.input_shape, self.output_shape = shapes
    self.input_dtype = self.X.dtype
    self.output_dtype = self.y.dtype
def test_check_array_warn_on_dtype_deprecation():
    """Passing ``warn_on_dtype`` must emit a DeprecationWarning."""
    features = np.asarray([[0.0], [1.0]])
    targets = np.asarray([[2.0], [3.0]])
    expected = "'warn_on_dtype' is deprecated"

    with pytest.warns(DeprecationWarning, match=expected):
        check_array(features, warn_on_dtype=True)

    with pytest.warns(DeprecationWarning, match=expected):
        check_X_y(features, targets, warn_on_dtype=True)
def _init(self, X):
    """Initialize statistic and dictionary.

    Validates ``self.projection`` and ``X``, sets up the random state,
    builds the dictionary ``D_`` (from ``self.dict_init`` when given,
    otherwise from random draws), rescales it with ``enet_scale`` and
    allocates the zero-filled statistics ``A_``, ``B_``, ``counter_``,
    ``n_iter_`` and ``code_``.

    Raises
    ------
    ValueError
        If ``projection`` is not 'partial' or 'full', if ``dict_init`` does
        not have shape ``(n_components, n_cols)``, or if an intercept is
        fitted and the first row of ``dict_init`` is not constant.
    """
    if self.projection not in ["partial", "full"]:
        raise ValueError("projection should be in {'partial', 'full'},"
                         " got %s" % self.projection)

    X = check_array(X, dtype="float", order="F", accept_sparse="csr")

    self.sparse_ = sp.issparse(X)

    n_rows, n_cols = X.shape

    # An explicit n_samples setting wins over the number of rows of X.
    if self.n_samples is not None:
        self.n_samples_ = self.n_samples
    else:
        self.n_samples_ = n_rows

    self.random_state_ = check_random_state(self.random_state)

    # D dictionary
    if self.dict_init is not None:
        if self.dict_init.shape != (self.n_components, n_cols):
            raise ValueError(
                "Initial dictionary and X shape mismatch: %r != %r"
                % (self.dict_init.shape, (self.n_components, n_cols))
            )
        # Work on a copy so the user-supplied dict_init is not modified.
        self.D_ = check_array(self.dict_init, order="C", dtype="float", copy=True)
        if self.fit_intercept:
            # The first component must be constant; it is then set to 1.
            if not (np.all(self.D_[0] == self.D_[0].mean())):
                raise ValueError(
                    "When fitting intercept and providing "
                    "initial dictionary, first component of"
                    " the dictionary should be "
                    "proportional to [1, ..., 1]"
                )
            self.D_[0] = 1
    else:
        self.D_ = np.empty((self.n_components, n_cols), order="C")

        if self.fit_intercept:
            self.D_[0] = 1
            # Remaining components: transposed Q factor of a random
            # (n_cols, n_components - 1) Gaussian matrix.
            U = self.random_state_.randn(n_cols, self.n_components - 1)
            Q, _ = np.linalg.qr(U)
            self.D_[1:] = Q.T
        else:
            self.D_[:] = self.random_state_.randn(self.n_components, n_cols)

    # Applied to the dictionary from either branch above; stored in
    # Fortran order.
    self.D_ = np.asfortranarray(enet_scale(self.D_, l1_ratio=self.l1_ratio, radius=1))

    self.A_ = np.zeros((self.n_components, self.n_components), order="F")
    self.B_ = np.zeros((self.n_components, n_cols), order="F")

    self.counter_ = np.zeros(n_cols + 1, dtype="int")

    self.n_iter_ = np.zeros(1, dtype="long")

    self.code_ = np.zeros((self.n_samples_, self.n_components))
def test_check_array_pandas_dtype_object_conversion():
    """Data-frame like objects with dtype object get converted to float."""
    # ``np.object`` was an alias of the builtin ``object`` and has been
    # removed from NumPy; the builtin selects the same dtype.
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=object)
    X_df = MockDataFrame(X)
    assert_equal(check_array(X_df).dtype.kind, "f")
    assert_equal(check_array(X_df, ensure_2d=False).dtype.kind, "f")
    # smoke-test against dataframes with column named "dtype"
    X_df.dtype = "Hans"
    assert_equal(check_array(X_df, ensure_2d=False).dtype.kind, "f")
def test_check_array_series():
    """Regression test: check_array works on pandas Series."""
    pd = importorskip("pandas")

    numeric = pd.Series([1, 2, 3])
    checked = check_array(numeric, ensure_2d=False)
    assert_array_equal(checked, np.array([1, 2, 3]))

    # with categorical dtype (not a numpy dtype) (GH12699)
    categorical = pd.Series(['a', 'b', 'c']).astype('category')
    checked = check_array(categorical, dtype=None, ensure_2d=False)
    expected = np.array(['a', 'b', 'c'], dtype=object)
    assert_array_equal(checked, expected)
def fit(self, X, y):
    """Fit OVK ridge regression model.

    Parameters
    ----------
    X : {array-like}, shape = [n_samples, n_features]
        Training data (sparse input is rejected by the validation below).

    y : {array-like}, shape = [n_samples] or [n_samples, n_targets]
        Target values. numpy.NaN for missing targets (semi-supervised
        learning).

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    ValueError
        If the dtype of y is not numeric.
    """
    X = check_array(X, force_all_finite=True, accept_sparse=False,
                    ensure_2d=True)
    dense_1d_ok = dict(accept_sparse=False, ensure_2d=False)
    y = check_array(y, force_all_finite=False, **dense_1d_ok)
    if y.ndim == 1:
        y = check_array(y, force_all_finite=True, **dense_1d_ok)
    self._validate_params()

    solver_params = self.solver_params or {}

    self.linop_ = self._get_kernel_map(X, y)
    gram = self.linop_(X)
    objective = OVKRidgeRisk(self.lbda)

    if not issubdtype(y.dtype, number):
        raise ValueError("Unknown label type: %r" % y.dtype)

    # A sample counts as supervised unless all of its targets are NaN.
    if y.ndim > 1:
        supervised = ~all(isnan(y), axis=1)
        n_targets = y.shape[1]
    else:
        supervised = ~isnan(y)
        n_targets = 1

    unsupervised = ~supervised
    if sum(unsupervised) > 0:
        affinity = rbf_kernel(X[unsupervised, :], gamma=self.gamma_m)
        self.L_ = _graph_Laplacian(affinity)
    else:
        self.L_ = empty((0, 0))

    semisup = _SemisupLinop(self.lbda_m, supervised, self.L_, n_targets)
    weight, zeronan = semisup.gen()

    start = zeros(gram.shape[1])
    self.solver_res_ = minimize(objective.functional_grad_val, start,
                                args=(y.ravel(), gram, weight, zeronan),
                                method=self.solver,
                                jac=True, options=solver_params)
    self.dual_coefs_ = self.solver_res_.x
    return self
def test_check_array_force_all_finite_object():
    """Object arrays with NaN: passed through unless finiteness is forced."""
    X = np.array([['a', 'b', np.nan]], dtype=object).T

    # Both permissive settings must hand back the very same object.
    for permissive in ('allow-nan', False):
        X_checked = check_array(X, dtype=None, force_all_finite=permissive)
        assert X is X_checked

    with pytest.raises(ValueError, match='Input contains NaN'):
        check_array(X, dtype=None, force_all_finite=True)
def __init__(self, y_true, y_pred, name='real_metrics'):
    """Store validated true and predicted vectors.

    Both inputs go through ``check_array`` with 1-D and empty inputs
    allowed.

    Raises
    ------
    ValueError
        If the two validated arrays differ in shape.
    """
    super(BaseRealMetrics, self).__init__(name=name)

    # check inputs
    vector_opts = dict(ensure_2d=False, ensure_min_samples=0)
    self._y_true = check_array(y_true, **vector_opts)
    self._y_pred = check_array(y_pred, **vector_opts)

    same_shape = self._y_true.shape == self._y_pred.shape
    if not same_shape:
        raise ValueError(
            'The sizes of true and predicted vectors must be equal')
def decision_function(self, X):
    """Return the outlier scores of X as a numpy array.

    Requires the fitted attributes ``decision_scores_``, ``threshold_``
    and ``labels_``; the scores come from
    ``self._calculate_decision_score``.
    """
    fitted_attrs = ['decision_scores_', 'threshold_', 'labels_']
    check_is_fitted(self, fitted_attrs)
    X = check_array(X)
    return np.array(self._calculate_decision_score(X))
def test_check_array_force_all_finite_valid(value, force_all_finite, retype):
    """check_array must accept the input and return it unchanged.

    The arguments are presumably supplied by a pytest parametrization that
    is not visible in this block.
    """
    base = np.arange(4).reshape(2, 2).astype(float)
    X = retype(base)
    X[0, 0] = value
    X_checked = check_array(X, force_all_finite=force_all_finite,
                            accept_sparse=True)
    assert_allclose_dense_sparse(X, X_checked)
def _fit(self, X, y, sample_weight=None, check_input=True):
    """Fit with daal4py when possible, otherwise with scikit-learn.

    The scikit-learn implementation is used directly when X is sparse, has
    more columns than rows, is not float32/float64, or when sample weights
    are given. It is also used as a fallback when the daal4py fit returns
    ``None``.

    Parameters
    ----------
    X, y : training data and targets
    sample_weight : array-like or None
    check_input : bool
        When True, X and y are validated here. When False, a dense ndarray
        X must be Fortran contiguous.

    Returns
    -------
    The result of whichever fit was used.

    Raises
    ------
    ValueError
        If ``check_input`` is False and X is an ndarray that is not
        Fortran contiguous.
    """
    # check X and y
    if check_input:
        X, y = check_X_y(
            X,
            y,
            copy=False,
            accept_sparse='csc',
            dtype=[np.float64, np.float32],
            multi_output=True,
            y_numeric=True,
        )
        y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False)

    if not sp.issparse(X):
        # Good for daal when 1-D or when rows >= columns.
        self.fit_shape_good_for_daal_ = bool(
            X.ndim <= 1 or X.shape[0] >= X.shape[1])
    else:
        self.fit_shape_good_for_daal_ = False

    log_str = "sklearn.linear_model." + self.__class__.__name__ + ".fit: "
    sklearn_ready = sp.issparse(X) or not self.fit_shape_good_for_daal_ or \
        X.dtype not in [np.float64, np.float32] or sample_weight is not None

    if sklearn_ready:
        if hasattr(self, 'daal_model_'):
            del self.daal_model_
        logging.info(
            log_str + get_patch_message("sklearn")
        )
        if sklearn_check_version('0.23'):
            res_new = super(ElasticNet, self).fit(
                X, y, sample_weight=sample_weight, check_input=check_input)
        else:
            res_new = super(ElasticNet, self).fit(
                X, y, check_input=check_input)
        self._gap = res_new.dual_gap_
        return res_new
    self.n_iter_ = None
    self._gap = None

    if not check_input:
        # only for compliance with Sklearn,
        # this assert is not required for Intel(R) oneAPI Data
        # Analytics Library
        # (A leftover debug ``print`` of type(X) and its flags was removed
        # here: it wrote to stdout on every call and read ``X.flags``
        # before the isinstance guard below.)
        if isinstance(X, np.ndarray) and \
                X.flags['F_CONTIGUOUS'] is False:
            raise ValueError("ndarray is not Fortran contiguous")

    if sklearn_check_version('1.0'):
        self._normalize = _deprecate_normalize(
            self.normalize,
            default=False,
            estimator_name=self.__class__.__name__)

    # only for pass tests
    # "check_estimators_fit_returns_self(readonly_memmap=True) and
    # check_regressors_train(readonly_memmap=True)
    if not X.flags.writeable:
        X = np.copy(X)
    if not y.flags.writeable:
        y = np.copy(y)
    logging.info(log_str + get_patch_message("daal"))

    if self.__class__.__name__ == "ElasticNet":
        res = _daal4py_fit_enet(self, X, y, check_input=check_input)
    else:
        res = _daal4py_fit_lasso(self, X, y, check_input=check_input)
    if res is None:
        # daal4py could not fit: drop any stale model and fall back.
        if hasattr(self, 'daal_model_'):
            del self.daal_model_
        logging.info(
            log_str + get_patch_message("sklearn_after_daal")
        )
        if sklearn_check_version('0.23'):
            res_new = super(ElasticNet, self).fit(
                X, y, sample_weight=sample_weight, check_input=check_input)
        else:
            res_new = super(ElasticNet, self).fit(
                X, y, check_input=check_input)
        self._gap = res_new.dual_gap_
        return res_new
    return res
def _fit_resample(self, X, y):
    """Resample X, y after one-hot encoding the categorical columns.

    Steps visible in this method:

    1. Split X into continuous and categorical columns using
       ``self.continuous_features_`` / ``self.categorical_features_``.
    2. Compute ``self.median_std_``, the median of the per-feature standard
       deviations of the continuous features of the minority class.
    3. One-hot encode the categorical columns and replace every non-zero
       entry by ``median_std_ / 2``.
    4. Delegate the resampling of the encoded matrix to the parent class.
    5. Decode the categorical part and restore the original column order.

    Returns
    -------
    X_resampled, y_resampled
        X_resampled is sparse (CSR) when X is sparse, dense otherwise.
    """
    self.n_features_ = X.shape[1]
    self._validate_estimator()

    # compute the median of the standard deviation of the minority class
    target_stats = Counter(y)
    class_minority = min(target_stats, key=target_stats.get)

    X_continuous = X[:, self.continuous_features_]
    X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"])
    X_minority = _safe_indexing(
        X_continuous, np.flatnonzero(y == class_minority)
    )

    if sparse.issparse(X):
        # NOTE(review): the format test is made on X while the variance is
        # computed on X_minority (derived from X_continuous) -- confirm
        # both always share the same sparse format.
        if X.format == "csr":
            _, var = csr_mean_variance_axis0(X_minority)
        else:
            _, var = csc_mean_variance_axis0(X_minority)
    else:
        var = X_minority.var(axis=0)
    self.median_std_ = np.median(np.sqrt(var))

    X_categorical = X[:, self.categorical_features_]
    # Encode with the dtype of the continuous block unless it is object,
    # in which case float64 is used.
    if X_continuous.dtype.name != "object":
        dtype_ohe = X_continuous.dtype
    else:
        dtype_ohe = np.float64
    self.ohe_ = OneHotEncoder(
        sparse=True, handle_unknown="ignore", dtype=dtype_ohe
    )

    # the input of the OneHotEncoder needs to be dense
    X_ohe = self.ohe_.fit_transform(
        X_categorical.toarray()
        if sparse.issparse(X_categorical)
        else X_categorical
    )

    # we can replace the 1 entries of the categorical features with the
    # median of the standard deviation. It will ensure that whenever
    # distance is computed between 2 samples, the difference will be equal
    # to the median of the standard deviation as in the original paper.
    X_ohe.data = (
        np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / 2
    )
    X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr")

    X_resampled, y_resampled = super()._fit_resample(X_encoded, y)

    # reverse the encoding of the categorical features
    X_res_cat = X_resampled[:, self.continuous_features_.size:]
    # Reset the stored entries to 1 before asking the encoder to invert.
    X_res_cat.data = np.ones_like(X_res_cat.data)
    X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat)

    if sparse.issparse(X):
        X_resampled = sparse.hstack(
            (
                X_resampled[:, : self.continuous_features_.size],
                X_res_cat_dec,
            ),
            format="csr",
        )
    else:
        X_resampled = np.hstack(
            (
                X_resampled[:, : self.continuous_features_.size].toarray(),
                X_res_cat_dec,
            )
        )

    # The stacked matrix holds continuous columns first, then categorical
    # ones; argsort of the concatenated original indices gives the
    # permutation used below to reorder the columns.
    indices_reordered = np.argsort(
        np.hstack((self.continuous_features_, self.categorical_features_))
    )
    if sparse.issparse(X_resampled):
        # the matrix is supposed to be in the CSR format after the stacking
        col_indices = X_resampled.indices.copy()
        for idx, col_idx in enumerate(indices_reordered):
            mask = X_resampled.indices == col_idx
            col_indices[mask] = idx
        X_resampled.indices = col_indices
    else:
        X_resampled = X_resampled[:, indices_reordered]

    return X_resampled, y_resampled
def _fit(self, X, Y, weights, check_input):
    """Run the scenario-based binning pipeline and store its results.

    Stages, each timed into a ``self._time_*`` attribute: pre-processing
    (splitting clean / missing / special samples per scenario),
    pre-binning (from ``self.user_splits`` when given), optimization and
    post-processing (one binning table per scenario plus an aggregated
    one).

    Parameters
    ----------
    X, Y : sequences with one entry per scenario
    weights : forwarded to the data split and the optimizer
    check_input : forwarded to ``split_data_scenarios``

    Returns
    -------
    self
    """
    time_init = time.perf_counter()

    # Check parameters and input arrays. The parameters are validated once;
    # a second identical ``_check_parameters`` call further down was
    # removed.
    _check_parameters(**self.get_params())
    _check_X_Y_weights(X, Y, weights)

    self._n_scenarios = len(X)

    if self.verbose:
        logging.info("Optimal binning started.")
        logging.info("Options: check parameters.")

    # Pre-processing
    if self.verbose:
        logging.info("Pre-processing started.")

    time_preprocessing = time.perf_counter()

    self._n_samples_scenario = [len(x) for x in X]
    self._n_samples = sum(self._n_samples_scenario)

    if self.verbose:
        logging.info("Pre-processing: number of samples: {}"
                     .format(self._n_samples))

    [x_clean, y_clean, x_missing, y_missing, x_special, y_special,
     w] = split_data_scenarios(X, Y, weights, self.special_codes,
                               check_input)

    self._time_preprocessing = time.perf_counter() - time_preprocessing

    if self.verbose:
        n_clean = len(x_clean)
        n_missing = len(x_missing)
        n_special = len(x_special)

        logging.info("Pre-processing: number of clean samples: {}"
                     .format(n_clean))

        logging.info("Pre-processing: number of missing samples: {}"
                     .format(n_missing))

        logging.info("Pre-processing: number of special samples: {}"
                     .format(n_special))

        logging.info("Pre-processing terminated. Time: {:.4f}s"
                     .format(self._time_preprocessing))

    # Pre-binning
    if self.verbose:
        logging.info("Pre-binning started.")

    time_prebinning = time.perf_counter()

    if self.user_splits is not None:
        user_splits = check_array(
            self.user_splits, ensure_2d=False, dtype=None,
            force_all_finite=True)

        # De-duplicate and sort the validated array (the validated result
        # used to be discarded in favour of the raw attribute).
        user_splits = np.unique(user_splits)

        splits, n_nonevent, n_event = self._prebinning_refinement(
            user_splits, x_clean, y_clean, y_missing, y_special)
    else:
        splits, n_nonevent, n_event = self._fit_prebinning(
            w, x_clean, y_clean, y_missing, y_special, self.class_weight)

    self._n_prebins = len(n_nonevent)

    self._time_prebinning = time.perf_counter() - time_prebinning

    if self.verbose:
        logging.info("Pre-binning: number of prebins: {}"
                     .format(self._n_prebins))
        logging.info("Pre-binning: number of refinements: {}"
                     .format(self._n_refinements))

        logging.info("Pre-binning terminated. Time: {:.4f}s"
                     .format(self._time_prebinning))

    # Optimization
    self._fit_optimizer(splits, n_nonevent, n_event, weights)

    # Post-processing
    if self.verbose:
        logging.info("Post-processing started.")
        logging.info("Post-processing: compute binning information.")

    time_postprocessing = time.perf_counter()

    self._n_nonevent = 0
    self._n_event = 0
    self._binning_tables = []

    for s in range(self._n_scenarios):
        # Column s of the pre-bin counts belongs to scenario s.
        s_n_nonevent, s_n_event = bin_info(
            self._solution, n_nonevent[:, s], n_event[:, s],
            self._n_nonevent_missing[s], self._n_event_missing[s],
            self._n_nonevent_special[s], self._n_event_special[s], None,
            None, [])

        # Accumulate the totals used for the aggregated table below.
        self._n_nonevent += s_n_nonevent
        self._n_event += s_n_event

        binning_table = BinningTable(
            self.name, self.dtype, self._splits_optimal, s_n_nonevent,
            s_n_event, None, None, self.user_splits)

        self._binning_tables.append(binning_table)

    self._binning_table = BinningTable(
        self.name, self.dtype, self._splits_optimal, self._n_nonevent,
        self._n_event, None, None, self.user_splits)

    self._time_postprocessing = time.perf_counter() - time_postprocessing

    if self.verbose:
        logging.info("Post-processing terminated. Time: {:.4f}s"
                     .format(self._time_postprocessing))

    self._time_total = time.perf_counter() - time_init

    if self.verbose:
        logging.info("Optimal binning terminated. Status: {}. "
                     "Time: {:.4f}s".format(
                        self._status, self._time_total))

    # Completed successfully
    self._logger.close()
    self._is_fitted = True

    return self
def _daal_fit_classifier(self, X, y, sample_weight=None):
    """Train a daal4py decision-forest classifier and store its model.

    Validates y, folds the expanded class weights into ``sample_weight``,
    draws a seed from the estimator's random state, configures
    ``daal4py.decision_forest_classification_training`` from the
    estimator's hyper-parameters and stores the trained model in
    ``self.daal_model_``.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If fewer than two classes are present, or if ``oob_score`` is
        requested without ``bootstrap``.
    """
    y = check_array(y, ensure_2d=False, dtype=None)
    y, expanded_class_weight = self._validate_y_class_weight(y)
    n_classes_ = self.n_classes_[0]
    self.n_features_ = X.shape[1]
    self.n_features_in_ = X.shape[1]

    # Combine the class weights with any user-supplied sample weights.
    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight
    if sample_weight is not None:
        # Wrapped in a one-element list before being passed to compute().
        sample_weight = [sample_weight]

    rs_ = check_random_state(self.random_state)
    # Seed drawn in [0, max of the C int type) for the daal engine.
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)
    daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    features_per_node_ = _to_absolute_max_features(self.max_features,
                                                   X.shape[1],
                                                   is_classification=True)

    n_samples_bootstrap_ = _get_n_samples_bootstrap(
        n_samples=X.shape[0],
        max_samples=self.max_samples)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    # Fractional min_samples_leaf / min_samples_split are converted to
    # absolute counts relative to the number of rows of X; ``None`` for
    # max_depth, min_impurity_split and max_leaf_nodes is passed as 0.
    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(n_classes_),
        fptype=X_fptype,
        method='hist' if daal_check_version(
            (2021, 'P', 200)) else 'defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap_
        if self.bootstrap is True else 1.,
        featuresPerNode=int(features_per_node_),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(self.min_samples_leaf if isinstance(
            self.min_samples_leaf, numbers.Integral) else int(
            ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine_,
        impurityThreshold=float(0.0 if self.min_impurity_split is None
                                else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(self.min_samples_split if isinstance(
            self.min_samples_split, numbers.Integral) else int(
            ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None
        else self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize)
    self._cached_estimators_ = None
    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y, sample_weight)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    #if self.oob_score:
    #    self.estimators_ = self._estimators_
    #    self._set_oob_score(X, y)

    return self
def fit(self, X, seq_length=[], n_iter=2):
    """Iteratively re-estimate the model parameters from observations.

    Every iteration runs the forward and backward passes on each sequence,
    accumulates start-probability, transition, mixture-weight, mean and
    covariance statistics, and hands the normalised estimates to
    ``self._update_param``. The loop stops early once ``self._converged``
    is set after ``self._update_convergence``.

    Parameters
    ----------
    X : array-like, 2-D
        Observations; validated with ``check_array`` and unpacked as
        ``(n_obs, n_dim)``.
    seq_length : sequence, default=[]
        Forwarded to ``self._get_seq`` to select sequence ``j`` of ``X``;
        ``max(len(seq_length), 1)`` sequences are processed. The default
        list is not mutated by this method.
    n_iter : int, default=2
        Maximum number of iterations.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If parameter initialisation fails, or if ``self.method`` is not
        ``'gmm'``.
    """
    X = check_array(X)
    n_seq = max(len(seq_length), 1)
    n_sts = len(self.states)
    _, n_dim = X.shape
    n_mix = self.gmm_k

    if self.initialized is False:
        try:
            self._init_param(X)
        except Exception as err:
            # Catch Exception only: a bare ``except`` would also convert
            # KeyboardInterrupt/SystemExit into ValueError. Chain the cause
            # so the real failure stays visible in the traceback.
            raise ValueError("Problem with initialization") from err

    if self.method != 'gmm':
        raise ValueError("\"{0}\" method not supported".format(
            self.method))

    for i in range(n_iter):
        # Per-iteration accumulators, summed over all sequences.
        start_prob_accum = np.zeros(n_sts)
        xisum_accum = np.zeros((n_sts, n_sts))
        gamma_sum_accum = np.zeros((n_sts, n_mix))
        means_num_accum = np.zeros((n_sts, n_mix, n_dim))
        sigmas_num_accum = np.zeros((n_sts, n_mix, n_dim, n_dim))
        logA = np.log(self.A)
        obs_logprob_accum = 0
        print('in loop 1', i)

        for j in range(n_seq):
            print('in loop 2 ', j)
            S = self._get_seq(X, seq_length, j)
            emitlogprob = self._log_emission(S)
            fw = self._pass_forward(emitlogprob, logA)
            bw = self._pass_backward(emitlogprob, logA)
            # NOTE(review): the per-sequence log-probability is read from
            # fw[0]; confirm this matches the row convention used by
            # _pass_forward.
            obs_logprob_accum += self._precision_lse(fw[0], axis=0)
            print('fw: \n', fw)
            print('bw: \n', bw)
            print('framelogprob: \n', emitlogprob)

            # accumulate start prob
            temp = fw[0] + bw[0]
            self._log_normalize(temp, axis=0)
            start_prob_accum += np.exp(temp)

            # accumulate logxisum
            logxisum = self._log_sum_xi(S, fw, bw, emitlogprob, logA)
            print('logxisum: \n', logxisum)
            xisum_accum += np.exp(logxisum)

            # accumulate gamma_sum
            gamma = np.exp(self._log_gamma(S, fw, bw))
            print('gamma; \n', gamma)
            gamma_sum_accum += gamma.sum(axis=0)

            # accumulate means: sum over the first axis of gamma[j, i, k] * S[j, h]
            means_num_accum += np.einsum('jik,jh->ikh', gamma, S)

            # accumulate sigma: gamma-weighted outer products of the
            # deviations from the current means
            d = S[:, None, None, :] - self.means[None, :, :, :]
            sigmas_num_accum += np.sum(
                gamma[:, :, :, None, None] *
                (d[:, :, :, :, None] * d[:, :, :, None, :]),
                axis=0)

        # update start_prob
        prev_start_prob = self.start_prob.copy()
        self.start_prob = start_prob_accum
        self._normalize(self.start_prob, axis=0)
        print('update startprob: ', prev_start_prob, '->', self.start_prob)

        # update A
        new_A = xisum_accum
        self._normalize(new_A, axis=1)

        # update weights
        new_weights = gamma_sum_accum.copy()
        print('new_weights num\n', new_weights)
        self._normalize(new_weights, axis=1)

        # update mean
        means_den = gamma_sum_accum[:, :, None]
        new_means = means_num_accum / means_den
        print('new_means num\n', means_num_accum)

        # update sigma
        sigmas_den = gamma_sum_accum[:, :, None, None]
        new_sigmas = sigmas_num_accum / sigmas_den
        print('new_covs num\n', sigmas_num_accum)

        # update parameter
        self._update_param({
            'A': new_A,
            'C': new_weights,
            'MU': new_means,
            'SIGMA': new_sigmas
        })

        # update Convergence
        self._update_convergence(obs_logprob_accum)
        if self._converged:
            print('Done')
            print('Convergence: \n', self._convergence)
            break
    return
def fit(self, X, y=None):
    """Fit the minimum-divergence / maximum-entropy model.

    The model is fitted subject to constraints on the feature expectations
    ``<f_i(X)> = X[0]``.

    Parameters
    ----------
    X : ndarray (dense) of shape [1, n_features]
        A row vector (1 x n_features matrix) holding the desired feature
        expectations. The unusual shape is deliberate: models of minimum
        divergence / maximum entropy depend on the data only through the
        feature expectations.
    y : ignored
        Placeholder so the estimator can be used in a Pipeline.

    Returns
    -------
    self
    """
    X = check_array(np.atleast_2d(X))
    if X.shape[0] != 1:
        raise ValueError('X must have only one row')

    # The single row is the 1-d vector of target feature expectations.
    target_expectations = X[0]
    assert target_expectations.ndim == 1
    self.K = target_expectations
    self._check_features()

    # Reuse existing parameters when present (their length must match);
    # otherwise create a fresh parameter vector.
    if hasattr(self, 'params'):
        assert len(self.params) == len(target_expectations)
    else:
        self.resetparams(len(target_expectations))

    # The function/gradient evaluation counters are deliberately not reset.
    start_params = np.array(self.params)
    log_callback = self.log

    # A single optimize.minimize call, selected via ``self.algorithm``,
    # replaces the former per-algorithm fmin_* dispatch (CG, LBFGSB, BFGS,
    # Powell, Nelder-Mead).
    result = optimize.minimize(
        self.dual,
        start_params,
        args=(),
        method=self.algorithm,
        jac=self.grad,
        tol=self.tol,
        options={
            'maxiter': self.maxiter,
            'disp': self.verbose
        },
        callback=log_callback)
    fitted_params = result.x
    n_function_calls = result.nfev

    # Only push the parameters back if the optimiser actually moved them.
    if np.any(self.params != fitted_params):
        self.setparams(fitted_params)
    self.func_calls = n_function_calls
    return self
def calculate_all_metafeatures(X,
                               y,
                               categorical,
                               dataset_name,
                               calculate=None,
                               dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures.

    Parameters
    ----------
    X : array or scipy sparse matrix
        Feature matrix.
    y : array
        Targets; must support indexing with an integer index array.
    categorical : iterable of bool
        Per-feature categorical flags. Must be all falsy when any metafeature
        listed in ``npy_metafeatures`` is computed (checked with ``assert``).
    dataset_name : str
        Used in log messages and passed to ``DatasetMetafeatures``.
    calculate : container of str or None, default=None
        If given, only metafeatures whose name is in this container are
        computed.
    dont_calculate : container of str or None, default=None
        If given, metafeatures whose name is in this container are skipped.
    densify_threshold : number, default=1000
        A transformed matrix that became sparse although ``X`` was dense is
        converted back to dense when it would need fewer than this many
        megabytes.

    Returns
    -------
    DatasetMetafeatures
        Built from ``dataset_name`` and a dict mapping each computed
        metafeature / helper-function name to its value.

    Raises
    ------
    NotImplementedError
        If a dependency is registered both as a metafeature and as a helper
        function.
    ValueError
        If a dependency is registered as neither.
    """
    logger = getLogger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while to_visit:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # The imputed / scaled / shuffled copy is built once and then
                # reused for every metafeature in npy_metafeatures.
                # TODO make sure this is done as efficient as possible (no
                # copy for sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)
                assert not any(categorical)

                X_transformed = X
                imputer = Imputer(strategy='mean', copy=False)
                X_transformed = imputer.fit_transform(X_transformed)
                center = not scipy.sparse.isspmatrix(X_transformed)
                standard_scaler = StandardScaler(copy=False, with_mean=center)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = (X_transformed.shape[0] *
                                    X_transformed.shape[1])
                    megabytes_required = (num_elements * bytes_per_float /
                                          1000 / 1000)
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing
                # in some cases.
                # Because this is advanced indexing, a copy of the data is
                # returned!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(
                    dependency):
                # Re-queue at the far end of the deque so it is retried
                # after the metafeatures still waiting.
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info('%s: Going to calculate: %s', dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info('%s: Going to calculate: %s', dataset_name, name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
def solve(self, X, missing_mask):
    """Iteratively fill the masked entries of ``X`` and record losses.

    After one initial ``self._svd_step`` call (``tuning=False``), up to
    ``self.max_iters`` further steps (``tuning=True``) are run. After every
    step the entries selected by ``missing_mask`` are overwritten with the
    clipped reconstruction, and training / validation losses are appended to
    ``self.loss_record`` and ``self.valloss_record``.

    Parameters
    ----------
    X : array-like
        Input matrix; validated with ``check_array`` with
        ``force_all_finite=False``.
    missing_mask : boolean array
        Selects the entries of ``X`` to overwrite with reconstructed values.

    Returns
    -------
    tuple
        ``(X_filled, U_thresh, V_thresh, S_thresh, self.loss_record,
        self.valloss_record, self.match_u, self.match_i, self.var_u,
        self.var_i, var_whole_u, var_whole_i, self.group_pre_u,
        self.group_pre_i)``.
    """
    # Pick the loss according to the task type. No loss is configured for
    # any other ``task_type`` value.
    if self.task_type == "Regression":
        self.sign = "MAE"
        self.loss_fn = tf.keras.losses.MeanAbsoluteError()
    elif self.task_type == 'Classification':
        self.loss_fn = tf.keras.losses.BinaryCrossentropy()
        self.sign = "BCE"
    self.group_pre_u = {}
    self.group_pre_i = {}
    X = check_array(X, force_all_finite=False)
    # NOTE(review): X_init and observed_mask are assigned but not read
    # anywhere in this method.
    X_init = X.copy()
    self.loss_record = []
    self.valloss_record = []
    # X_filled is the same object as X; the masked assignments below write
    # into it in place.
    X_filled = X
    observed_mask = ~missing_mask
    max_singular_value = self._max_singular_value(X_filled)
    if self.verbose:
        if self.auto_tune == False:
            print("[SoftImpute] Max Singular Value of X_init = %f" %
                  (max_singular_value))
    if self.shrinkage_value:
        shrinkage_value = self.shrinkage_value
    else:
        # totally hackish heuristic: keep only components
        # with at least 1/50th the max singular value
        shrinkage_value = max_singular_value / 50.0
    if self.auto_tune == False:
        print('#####mf_training#####')
    # Initial reconstruction (tuning disabled).
    X_reconstruction, rank, U_thresh, V_thresh, S_thresh = self._svd_step(
        X_filled, shrinkage_value, tuning=False, max_rank=self.max_rank)
    X_reconstruction = self.clip(X_reconstruction)
    converged = self._converged(X_old=X_filled,
                                X_new=X_reconstruction,
                                missing_mask=missing_mask)
    X_filled[missing_mask] = X_reconstruction[missing_mask]
    # ini_u references the same array as X_filled, so it also reflects the
    # in-place updates made inside the loop.
    self.ini_u = X_filled
    for i in range(self.max_iters):
        X_reconstruction, rank, U_thresh, V_thresh, S_thresh = self._svd_step(
            X_filled, shrinkage_value, tuning=True, max_rank=self.max_rank)
        X_reconstruction = self.clip(X_reconstruction)
        pred = self.predict(X_reconstruction, self.tr_Xi)
        # Validation predictions: from the reconstruction when wc == 'warm',
        # otherwise from the thresholded factors.
        if self.wc == 'warm':
            predval = self.predict(X_reconstruction, self.val_Xi)
        else:
            predval = self.predict_cold(U_thresh, V_thresh, S_thresh)
        if self.task_type == 'Classification':
            #self.loss_record.append(tf.keras.losses.MeanAbsoluteError(self.tr_y.ravel(),tf.sigmoid(pred.ravel()+self.pred_tr.ravel()).numpy()).numpy())
            #self.valloss_record.append(tf.keras.losses.MeanAbsoluteError(self.val_y.ravel(),tf.sigmoid(predval.ravel()+self.pred_val.ravel()).numpy()).numpy())
            # The prediction is added to pred_tr / pred_val and passed
            # through a sigmoid before the loss is evaluated.
            self.loss_record.append(
                self.loss_fn(
                    self.tr_y.ravel(),
                    tf.sigmoid(pred.ravel() +
                               self.pred_tr.ravel()).numpy()).numpy())
            self.valloss_record.append(
                self.loss_fn(
                    self.val_y.ravel(),
                    tf.sigmoid(predval.ravel() +
                               self.pred_val.ravel()).numpy()).numpy())
        else:
            self.loss_record.append(
                self.loss_fn(self.tr_y.ravel(),
                             pred.ravel() + self.pred_tr.ravel()).numpy())
            self.valloss_record.append(
                self.loss_fn(self.val_y.ravel(),
                             predval.ravel() + self.pred_val.ravel()).numpy())
        # print error on observed data
        if self.verbose:
            self._verbose(X_reconstruction, i, rank)
        converged = self._converged(X_old=X_filled,
                                    X_new=X_reconstruction,
                                    missing_mask=missing_mask)
        X_filled[missing_mask] = X_reconstruction[missing_mask]
        # print(X_reconstruction[observed_mask])
        if converged:
            break
    # NOTE(review): ``i`` is only bound if the loop ran at least once, i.e.
    # max_iters > 0 — confirm callers guarantee this when verbose is set.
    if self.verbose:
        if self.auto_tune == False:
            print("[SoftImpute] Stopped after iteration %d for lambda=%f" %
                  (i + 1, shrinkage_value))
    # With change_mode set, the full reconstruction is returned instead of
    # the matrix whose masked entries were filled in.
    if self.change_mode:
        X_filled = X_reconstruction
    var_whole_u = np.var(U_thresh)
    var_whole_i = np.var(V_thresh.T)
    print('final num of user group:', len(self.match_u))
    print('final num of item group:', len(self.match_i))
    return X_filled, U_thresh, V_thresh, S_thresh, self.loss_record, self.valloss_record, self.match_u, self.match_i, self.var_u, self.var_i, var_whole_u, var_whole_i, self.group_pre_u, self.group_pre_i
def robust_single_linkage(X,
                          cut,
                          k=5,
                          alpha=1.4142135623730951,
                          gamma=5,
                          metric='euclidean',
                          algorithm='best',
                          memory=Memory(cachedir=None, verbose=0),
                          leaf_size=40,
                          core_dist_n_jobs=4,
                          **kwargs):
    r"""Perform robust single linkage clustering from a vector array
    or distance matrix.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    cut : float
        The reachability distance value to cut the cluster hierarchy at
        to derive a flat cluster labelling.

    k : int, optional (default=5)
        Reachability distances will be computed with regard to the `k`
        nearest neighbors.

    alpha : float, optional (default=np.sqrt(2))
        Distance scaling for reachability distance computation. Reachability
        distance is computed as
        $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.

    gamma : int, optional (default=5)
        Ignore any clusters in the flat clustering with size less than gamma,
        and declare points in such clusters as noise points.

    metric : string, or callable, optional (default='euclidean')
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    algorithm : string, optional (default='best')
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``generic``
            * ``best``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    memory : Instance of joblib.Memory or string (optional)
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    leaf_size : int, optional (default=40)
        Leaf size for trees responsible for fast nearest
        neighbour queries.

    core_dist_n_jobs : int, optional
        Number of parallel jobs to run in core distance computations (if
        supported by the specific algorithm). For ``core_dist_n_jobs``
        below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
        (default 4)

    **kwargs
        Extra metric arguments forwarded to the tree builder; ``p`` is
        required when ``metric='minkowski'``.

    Returns
    -------
    labels : ndarray, shape (n_samples, )
        Cluster labels for each point.  Noisy samples are given the label -1.

    single_linkage_tree : ndarray, shape (n_samples - 1, 4)
        The single linkage tree produced during clustering in scipy
        hierarchical clustering format
        (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

    Raises
    ------
    ValueError
        If ``k``, ``alpha``, ``gamma`` or ``leaf_size`` is invalid, or the
        Minkowski ``p`` is negative.
    TypeError
        If ``metric='minkowski'`` without ``p``, or ``algorithm`` is unknown.

    References
    ----------
    .. [1] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
       cluster tree. In Advances in Neural Information Processing Systems
       (pp. 343-351).

    """
    if not isinstance(k, int) or k < 1:
        raise ValueError('k must be an integer greater than zero!')

    if not isinstance(alpha, float) or alpha < 1.0:
        raise ValueError('alpha must be a float greater than or equal to 1.0!')

    if not isinstance(gamma, int) or gamma < 1:
        raise ValueError('gamma must be an integer greater than zero!')

    if not isinstance(leaf_size, int) or leaf_size < 1:
        raise ValueError('Leaf size must be at least one!')

    if metric == 'minkowski':
        if 'p' not in kwargs or kwargs['p'] is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if kwargs['p'] < 0:
            raise ValueError('Minkowski metric with negative p value is not'
                             ' defined!')

    X = check_array(X, accept_sparse='csr')
    if isinstance(memory, six.string_types):
        memory = Memory(cachedir=memory, verbose=0)

    # Select the tree builder. Only the Boruvka variants take the extra
    # (leaf_size, core_dist_n_jobs) arguments.
    if algorithm == 'best':
        if issparse(X) or metric not in FAST_METRICS:
            # We can't do much with sparse matrices ...
            builder, takes_tree_args = _rsl_generic, False
        elif metric in KDTree.valid_metrics:
            # Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 128:
                builder, takes_tree_args = _rsl_prims_kdtree, False
            else:
                builder, takes_tree_args = _rsl_boruvka_kdtree, True
        else:
            # Metric is a valid BallTree metric but not a KDTree one, so the
            # Prim's fallback must be the BallTree variant as well.
            if X.shape[1] > 128:
                builder, takes_tree_args = _rsl_prims_balltree, False
            else:
                builder, takes_tree_args = _rsl_boruvka_balltree, True
    elif algorithm == 'generic':
        builder, takes_tree_args = _rsl_generic, False
    elif algorithm == 'prims_kdtree':
        builder, takes_tree_args = _rsl_prims_kdtree, False
    elif algorithm == 'prims_balltree':
        builder, takes_tree_args = _rsl_prims_balltree, False
    elif algorithm == 'boruvka_kdtree':
        builder, takes_tree_args = _rsl_boruvka_kdtree, True
    elif algorithm == 'boruvka_balltree':
        builder, takes_tree_args = _rsl_boruvka_balltree, True
    else:
        raise TypeError('Unknown algorithm type %s specified' % algorithm)

    cached_builder = memory.cache(builder)
    if takes_tree_args:
        single_linkage_tree = cached_builder(X, k, alpha, metric, leaf_size,
                                             core_dist_n_jobs, **kwargs)
    else:
        single_linkage_tree = cached_builder(X, k, alpha, metric, **kwargs)

    labels = single_linkage_tree.get_clusters(cut, gamma)

    return labels, single_linkage_tree.to_numpy()
def normalize(X, norm='l2', axis=1, copy=True):
    """Scale input vectors individually to unit norm (vector length).

    Parameters
    ----------
    X : array or scipy.sparse matrix with shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        un-necessary copy.

    norm : 'l1' or 'l2', optional ('l2' by default)
        The norm to use to normalize each non zero sample (or each non-zero
        feature if axis is 0).

    axis : 0 or 1, optional (1 by default)
        axis used to normalize the data along. If 1, independently normalize
        each sample, otherwise (if 0) normalize each feature.

    copy : boolean, optional, default True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    Returns
    -------
    X : array or scipy.sparse matrix
        The normalized data.

    Raises
    ------
    ValueError
        If ``norm`` is not 'l1' or 'l2', or ``axis`` is not 0 or 1.

    See also
    --------
    :class:`sklearn.preprocessing.Normalizer` to perform normalization
    using the ``Transformer`` API (e.g. as part of a preprocessing
    :class:`sklearn.pipeline.Pipeline`)
    """
    if norm not in ('l1', 'l2'):
        raise ValueError("'%s' is not a supported norm" % (norm,))

    if axis == 0:
        sparse_format = 'csc'
    elif axis == 1:
        sparse_format = 'csr'
    else:
        # %s (not %d) so that a non-integer axis still yields the intended
        # ValueError instead of a TypeError from string formatting.
        raise ValueError("'%s' is not a supported axis" % (axis,))

    X = check_array(X, sparse_format, copy=copy)
    warn_if_not_float(X, 'The normalize function')
    if axis == 0:
        # Work on the transpose so features can be normalised as rows.
        X = X.T

    if sparse.issparse(X):
        # The in-place routines below are CSR-only. For axis=0 the CSC input
        # has just been transposed into CSR, so always validate against
        # 'csr' here rather than converting it back to CSC.
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        if norm == 'l1':
            inplace_csr_row_normalize_l1(X)
        elif norm == 'l2':
            inplace_csr_row_normalize_l2(X)
    else:
        if norm == 'l1':
            norms = np.abs(X).sum(axis=1)
        elif norm == 'l2':
            norms = row_norms(X)
        # All-zero rows keep their values: divide by 1 instead of 0.
        norms[norms == 0.0] = 1.0
        X /= norms[:, np.newaxis]

    if axis == 0:
        X = X.T

    return X
def split_data(dtype,
               x,
               y,
               special_codes=None,
               cat_cutoff=None,
               user_splits=None,
               check_input=True,
               outlier_detector=None,
               outlier_params=None,
               fix_lb=None,
               fix_ub=None,
               class_weight=None,
               sample_weight=None):
    """Split data into clean, missing and special values data.

    Parameters
    ----------
    dtype : str
        The variable data type. Supported data types are "numerical" for
        continuous and ordinal variables and "categorical" for categorical
        and nominal variables.

    x : array-like, shape = (n_samples)
        Data samples, where n_samples is the number of samples.

    y : array-like, shape = (n_samples)
        Target vector relative to x.

    special_codes : array-like or None, optional (default=None)
        List of special codes. Use special codes to specify the data values
        that must be treated separately.

    cat_cutoff : float or None, optional (default=None)
        Generate bin others with categories in which the fraction of
        occurrences is below the ``cat_cutoff`` value. This option is
        available when ``dtype`` is "categorical".

    user_splits : array-like or None, optional (default=None)
        The list of pre-binning split points when ``dtype`` is "numerical" or
        the list of prebins when ``dtype`` is "categorical".

    check_input : bool, (default=True)
        If False, the input arrays x and y will not be checked.

    outlier_detector : str or None (default=None)
        The outlier detection method. Supported methods are "range" to use
        the interquartile range based method or "zscore" to use the modified
        Z-score method.

    outlier_params : dict or None (default=None)
        Dictionary of parameters to pass to the outlier detection method.

    fix_lb : float or None (default=None)
        Lower bound or minimum admissible value.

    fix_ub : float or None (default=None)
        Upper bound or maximum admissible value.

    class_weight : dict, "balanced" or None, optional (default=None)
        Weights associated with classes in the form ``{class_label: weight}``.
        If None, all classes are supposed to have weight one.

    sample_weight : array-like of shape (n_samples,) (default=None)
        Array of weights that are assigned to individual samples. The array
        passed in is not modified.

    Returns
    -------
    x_clean : array, shape = (n_clean)
        Clean data samples

    y_clean : array, shape = (n_clean)
        Clean target samples.

    x_missing : array, shape = (n_missing)
        Missing data samples.

    y_missing : array, shape = (n_missing)
        Missing target samples.

    x_special : array, shape = (n_special)
        Special data samples.

    y_special : array, shape = (n_special)
        Special target samples.

    y_others : array, shape = (n_others)
        Others target samples.

    categories : array, shape (n_categories)
        List of categories.

    others : array, shape (n_other_categories)
        List of other categories.

    sw_clean : array-like
        Clean data sample weight.

    sw_missing : array-like
        Missing data sample weight.

    sw_special : array-like
        Special data sample weight.

    sw_others : array-like
        Others data sample weight.

    Raises
    ------
    ValueError
        If ``outlier_detector`` is not a supported value, if ``fix_lb`` or
        ``fix_ub`` is not a number, or if ``fix_lb > fix_ub``.
    TypeError
        If ``outlier_params`` is neither a dict nor None.
    """
    if outlier_detector is not None:
        if outlier_detector not in ("range", "zscore"):
            raise ValueError('Invalid value for outlier_detector. Allowed '
                             'string values are "range" and "zscore".')

        if outlier_params is not None:
            if not isinstance(outlier_params, dict):
                raise TypeError("outlier_params must be a dict or None; "
                                "got {}.".format(outlier_params))

    if fix_lb is not None:
        if not isinstance(fix_lb, numbers.Number):
            raise ValueError("fix_lb must be a number; got {}.".format(fix_lb))

    if fix_ub is not None:
        if not isinstance(fix_ub, numbers.Number):
            raise ValueError("fix_ub must be a number; got {}.".format(fix_ub))

    if fix_lb is not None and fix_ub is not None:
        if fix_lb > fix_ub:
            raise ValueError("fix_lb must be <= fix_ub; got {} <= {}.".format(
                fix_lb, fix_ub))

    if check_input:
        x = check_array(x,
                        ensure_2d=False,
                        dtype=None,
                        force_all_finite='allow-nan')
        y = check_array(y, ensure_2d=False, dtype=None, force_all_finite=True)
        check_consistent_length(x, y)

    x = np.asarray(x)
    y = np.asarray(y)

    sample_weight = _check_sample_weight(sample_weight, x, dtype=x.dtype)

    if class_weight is not None:
        classes = np.unique(y)
        le = LabelEncoder()
        class_weight_ = compute_class_weight(class_weight,
                                             classes=classes,
                                             y=y)
        # Out-of-place product: the validated weights may still be the
        # caller's own array, which must not be modified.
        sample_weight = sample_weight * class_weight_[le.fit_transform(y)]

    # A sample is "missing" when either x or y is missing.
    if np.issubdtype(x.dtype, np.number) and np.issubdtype(
            y.dtype, np.number):
        missing_mask = np.isnan(x) | np.isnan(y)
    else:
        missing_mask = pd.isnull(x) | pd.isnull(y)

    if special_codes is None:
        clean_mask = ~missing_mask
        x_special = []
        y_special = []
        sw_special = []
    else:
        special_mask = pd.Series(x).isin(special_codes).values
        clean_mask = ~missing_mask & ~special_mask
        x_special = x[special_mask]
        y_special = y[special_mask]
        sw_special = sample_weight[special_mask]

    x_clean = x[clean_mask]
    y_clean = y[clean_mask]
    sw_clean = sample_weight[clean_mask]

    x_missing = x[missing_mask]
    y_missing = y[missing_mask]
    sw_missing = sample_weight[missing_mask]

    if dtype == "numerical":
        if outlier_detector is not None:
            if outlier_detector == "range":
                detector = RangeDetector()
            elif outlier_detector == "zscore":
                detector = ModifiedZScoreDetector()

            if outlier_params is not None:
                detector.set_params(**outlier_params)

            mask_outlier = detector.fit(x_clean).get_support()
            x_clean = x_clean[~mask_outlier]
            y_clean = y_clean[~mask_outlier]
            sw_clean = sw_clean[~mask_outlier]

        if fix_lb is not None or fix_ub is not None:
            # Apply every bound that was given; when both are set the clean
            # data is restricted to the closed interval [fix_lb, fix_ub].
            mask = np.ones(x_clean.shape, dtype=bool)
            if fix_lb is not None:
                mask &= x_clean >= fix_lb
            if fix_ub is not None:
                mask &= x_clean <= fix_ub

            x_clean = x_clean[mask]
            y_clean = y_clean[mask]
            sw_clean = sw_clean[mask]

    if dtype == "categorical" and user_splits is None:
        if cat_cutoff is not None:
            mask_others, others = categorical_cutoff(x_clean, y_clean,
                                                     cat_cutoff)

            y_others = y_clean[mask_others]
            sw_others = sw_clean[mask_others]

            x_clean = x_clean[~mask_others]
            y_clean = y_clean[~mask_others]
            sw_clean = sw_clean[~mask_others]
        else:
            y_others = []
            others = []
            sw_others = []

        categories, x_clean = categorical_transform(x_clean, y_clean)

        return (x_clean, y_clean, x_missing, y_missing, x_special, y_special,
                y_others, categories, others, sw_clean, sw_missing,
                sw_special, sw_others)

    return (x_clean, y_clean, x_missing, y_missing, x_special, y_special, [],
            [], [], sw_clean, sw_missing, sw_special, [])
def test_check_array_force_all_finite_object_unsafe_casting(
        X, err_msg, force_all_finite):
    """Unsafe casts to ``int`` must fail for every ``force_all_finite``.

    Casting a float array containing NaN or inf to an int dtype should raise
    a ``ValueError`` matching ``err_msg`` irrespective of the
    ``force_all_finite`` parameter.
    """
    expected_failure = pytest.raises(ValueError, match=err_msg)
    with expected_failure:
        check_array(X, dtype=int, force_all_finite=force_all_finite)
def fit(self, X, y=None):
    """Compute clustering of the data.

    Parameters
    ----------
    X : ndarray, shape = [n_samples, n_features]
        Training data. At least two samples and two features are required.
    y : Ignored

    Returns
    -------
    self : `ReNA` object
        With ``labels_``, ``n_clusters_`` and ``sizes_`` set.

    Raises
    ------
    ValueError
        If ``mask_img`` is neither a string nor a ``Nifti1Image``, or if
        ``n_clusters`` or ``n_iter`` is not greater than 0.
    """
    X = check_array(X,
                    ensure_min_features=2,
                    ensure_min_samples=2,
                    estimator=self)
    n_features = X.shape[1]

    if not isinstance(self.mask_img, (str, Nifti1Image)):
        raise ValueError("The mask image should be a Niimg-like "
                         "object. Instead a %s object was provided." %
                         type(self.mask_img))

    # Accept either a ready Memory object or a cache directory (None/str).
    if self.memory is None or isinstance(self.memory, str):
        self.memory_ = Memory(cachedir=self.memory,
                              verbose=max(0, self.verbose - 1))
    else:
        self.memory_ = self.memory

    if self.n_clusters <= 0:
        raise ValueError("n_clusters should be an integer greater than 0."
                         " %s was provided." % str(self.n_clusters))

    if self.n_iter <= 0:
        raise ValueError("n_iter should be an integer greater than 0."
                         " %s was provided." % str(self.n_iter))

    if self.n_clusters > n_features:
        # Cap the request: there cannot be more clusters than features.
        self.n_clusters = n_features
        warnings.warn("n_clusters should be at most the number of "
                      "features. Taking n_clusters = %s instead." %
                      str(n_features))

    n_components, labels = self.memory_.cache(
        recursive_neighbor_agglomeration)(X,
                                          self.mask_img,
                                          self.n_clusters,
                                          n_iter=self.n_iter,
                                          threshold=self.threshold,
                                          verbose=self.verbose)

    # Cluster sizes, ignoring label values that never occur.
    sizes = np.bincount(labels)
    sizes = sizes[sizes > 0]

    self.labels_ = labels
    self.n_clusters_ = np.unique(self.labels_).shape[0]
    self.sizes_ = sizes

    return self
def fit(self, X, y, sample_weight=None, check_input=True):
    """Fit model with coordinate descent.

    Parameters
    ----------
    X : {ndarray, sparse matrix} of (n_samples, n_features)
        Data

    y : {ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)
        Target. Will be cast to X's dtype if necessary

    sample_weight : float or array-like of shape (n_samples,), default=None
        Sample weight.

    check_input : bool, default=True
        Allow to bypass several input checking.
        Don't use this parameter unless you know what you do.

    Notes
    -----
    Coordinate descent is an algorithm that considers each column of
    data at a time hence it will automatically convert the X input
    as a Fortran-contiguous numpy array if necessary.

    To avoid memory re-allocation it is advised to allocate the
    initial data in memory directly using that format.
    """

    def _fit_with_sklearn(patch_key, X_, y_):
        # Discard any stale daal model and delegate to the stock
        # implementation, keeping its dual gap.
        if hasattr(self, 'daal_model_'):
            del self.daal_model_
        logging.info("sklearn.linear_model.Lasso.fit: " +
                     get_patch_message(patch_key))
        fitted = super(ElasticNet, self).fit(X_,
                                             y_,
                                             sample_weight=sample_weight,
                                             check_input=check_input)
        self._gap = fitted.dual_gap_
        return fitted

    # Validate X and y.
    if check_input:
        X, y = check_X_y(X,
                         y,
                         copy=False,
                         accept_sparse='csc',
                         dtype=[np.float64, np.float32],
                         multi_output=True,
                         y_numeric=True)
        y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False)
    elif isinstance(X, np.ndarray) and not X.flags['F_CONTIGUOUS']:
        # Only for compliance with scikit-learn; this check is not required
        # by Intel(R) oneAPI Data Analytics Library.
        raise ValueError("ndarray is not Fortran contiguous")

    # The daal path is only taken for ndarrays that are at most 1-d or have
    # at least as many rows as columns.
    if isinstance(X, np.ndarray):
        self.fit_shape_good_for_daal_ = (X.ndim <= 1
                                         or X.shape[0] >= X.shape[1])
    else:
        self.fit_shape_good_for_daal_ = False

    if (sp.issparse(X) or sample_weight is not None
            or not self.fit_shape_good_for_daal_
            or not (X.dtype == np.float64 or X.dtype == np.float32)):
        return _fit_with_sklearn("sklearn", X, y)

    self.n_iter_ = None
    self._gap = None

    # Read-only inputs (e.g. memmaps used by the estimator checks with
    # readonly_memmap=True) are copied before fitting.
    if not X.flags.writeable:
        X = np.copy(X)
    if not y.flags.writeable:
        y = np.copy(y)

    logging.info("sklearn.linear_model.Lasso.fit: " +
                 get_patch_message("daal"))
    res = _daal4py_fit_lasso(self, X, y, check_input=check_input)
    if res is None:
        # The daal fit produced nothing; fall back to scikit-learn.
        return _fit_with_sklearn("sklearn_after_daal", X, y)
    return res
def __init__(
    self,
    data,
    metric="euclidean",
    metric_kwds=None,
    n_neighbors=15,
    n_trees=8,
    leaf_size=15,
    pruning_level=0,
    tree_init=True,
    random_state=np.random,
    algorithm="standard",
    max_candidates=20,
    n_iters=10,
    delta=0.001,
    rho=0.5,
):
    """Build the neighbor graph and the pruned search graph for ``data``.

    Parameters
    ----------
    data : array-like, 2-D
        Validated with ``check_array`` and cast to float32; a copy is
        kept in ``self._raw_data``.
    metric : str or callable, default="euclidean"
        Either a callable distance function or a key of
        ``dist.named_distances``.
    metric_kwds : dict or None
        Its values (in dict order) are passed as extra distance arguments.
    n_neighbors : int
        Forwarded to the graph construction and to ``prune``.
    n_trees, leaf_size : int
        Number of trees built for initialisation and the leaf size
        handed to the tree builders.
    pruning_level : int
        Forwarded to ``prune`` as ``prune_level``.
    tree_init : bool
        Tree initialisation is skipped when this is false or
        ``n_trees == 0``.
    random_state : seed, RandomState or the ``np.random`` module
        Resolved through ``check_random_state``.
    algorithm : {"standard", "alternative"}
        Which graph construction routine to run.
    max_candidates, n_iters, delta, rho
        Forwarded unchanged to the "standard" NN-descent routine.

    Raises
    ------
    ValueError
        If ``metric`` is neither callable nor a known metric name, or if
        ``algorithm`` is not recognised.
    """
    self.n_trees = n_trees
    self.n_neighbors = n_neighbors
    self.metric = metric
    if metric_kwds is None:
        metric_kwds = dict()
    self.metric_kwds = metric_kwds
    self.leaf_size = leaf_size
    self.prune_level = pruning_level
    self.max_candidates = max_candidates
    self.n_iters = n_iters
    self.delta = delta
    self.rho = rho

    # Validate before touching ``.shape`` so plain nested lists work too.
    data = check_array(data).astype(np.float32)
    self.dim = data.shape[1]

    self.tree_init = bool(tree_init) and n_trees != 0

    self._dist_args = tuple(metric_kwds.values())

    self.random_state = check_random_state(random_state)

    self._raw_data = data.copy()

    if callable(metric):
        self._distance_func = metric
    elif metric in dist.named_distances:
        self._distance_func = dist.named_distances[metric]
    else:
        # Fail here rather than with an AttributeError on
        # ``self._distance_func`` further down.
        raise ValueError("Metric is neither callable, nor a recognised string")

    self._angular_trees = metric in ("cosine", "correlation", "dice", "jaccard")

    self.rng_state = self.random_state.randint(INT32_MIN, INT32_MAX, 3).astype(
        np.int64
    )

    indices = np.arange(data.shape[0])

    if self.tree_init:
        make_tree = (
            make_angular_tree if self._angular_trees else make_euclidean_tree
        )
        self._rp_forest = [
            flatten_tree(
                make_tree(data, indices, self.rng_state, self.leaf_size),
                self.leaf_size,
            )
            for _ in range(n_trees)
        ]
        leaf_array = np.vstack([tree.indices for tree in self._rp_forest])
    else:
        self._rp_forest = None
        # Single-row placeholder; it also forces the "standard" branch below.
        leaf_array = np.array([[-1]])

    if algorithm == "standard" or leaf_array.shape[0] == 1:
        nn_descent = make_nn_descent(self._distance_func, self._dist_args)
        self._neighbor_graph = nn_descent(
            self._raw_data,
            self.n_neighbors,
            self.rng_state,
            self.max_candidates,
            self.n_iters,
            self.delta,
            self.rho,
            True,
            leaf_array,
        )
    elif algorithm == "alternative":
        self._search = make_initialized_nnd_search(
            self._distance_func, self._dist_args
        )

        init_heaps = make_heap_initializer(self._distance_func, self._dist_args)
        graph_heap, search_heap = init_heaps(
            self._raw_data, self.n_neighbors, leaf_array
        )
        graph = lil_matrix((data.shape[0], data.shape[0]))
        graph.rows, graph.data = deheap_sort(graph_heap)
        graph = graph.maximum(graph.transpose())
        self._neighbor_graph = deheap_sort(
            self._search(
                self._raw_data,
                graph.indptr,
                graph.indices,
                search_heap,
                self._raw_data,
            )
        )
    else:
        raise ValueError("Unknown algorithm selected")

    # Symmetrised neighbor graph, pruned and then reduced to a 0/1 int8
    # connectivity matrix.
    self._search_graph = lil_matrix(
        (data.shape[0], data.shape[0]), dtype=np.float32
    )
    self._search_graph.rows = self._neighbor_graph[0]
    self._search_graph.data = self._neighbor_graph[1]
    self._search_graph = self._search_graph.maximum(
        self._search_graph.transpose()
    ).tocsr()
    self._search_graph = prune(
        self._search_graph,
        prune_level=self.prune_level,
        n_neighbors=self.n_neighbors,
    )
    self._search_graph = (self._search_graph != 0).astype(np.int8)

    self._random_init, self._tree_init = make_initialisations(
        self._distance_func, self._dist_args
    )

    self._search = make_initialized_nnd_search(self._distance_func, self._dist_args)
def estimate_sigma(
    X: np.ndarray,
    subsample: Optional[int] = None,
    method: str = "median",
    percent: Optional[float] = 0.15,
    scale: float = 1.0,
    random_state: Optional[int] = None,
) -> float:
    """A function to provide a reasonable estimate of the sigma values
    for the RBF kernel using different methods.

    Parameters
    ----------
    X : array, (n_samples, d_dimensions)
        The data matrix to be estimated.

    subsample : int, default=None
        If given, the rows of X are randomly permuted and only the first
        ``subsample`` rows are used for the distance-based methods.

    method : str, default: 'median'
        different methods used to estimate the sigma for the rbf kernel
        matrix.
        * mean
        * median
        * silverman
        * scott - very common for density estimation

    percent : float, default=0.15
        The kth percentage of distance chosen. With ``None`` the
        mean/median is taken over all pairwise distances instead.

    scale : float, default=1.0
        Option to scale the sigma chosen. Typically used with the
        median or mean method as they are data dependent. ``None``
        disables scaling.

    random_state : int, (default: None)
        controls the seed for the subsamples drawn to represent
        the data distribution

    Returns
    -------
    sigma : float
        The estimated sigma value

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.

    Resources
    ---------
    - Original MATLAB function: https://goo.gl/xYoJce

    Information
    -----------
    Author : J. Emmanuel Johnson
    Date   : 6 - July - 2018
    """
    X = check_array(X, ensure_2d=True)

    rng = check_random_state(random_state)

    # The rule-of-thumb methods below use the size of the full data set.
    n_samples, d_dimensions = X.shape

    # subsampling
    if subsample is not None:
        X = rng.permutation(X)[:subsample, :]

    if method in ("mean", "median"):
        reduce_fn = np.mean if method == "mean" else np.median
        if percent is None:
            sigma = reduce_fn(pdist(X))
        else:
            # Index into the distance matrix that is actually built: after
            # subsampling it has X.shape[0] columns, not n_samples. Clamp
            # so that percent == 1.0 selects the farthest neighbour
            # instead of running off the end.
            n_rows = X.shape[0]
            kth_sample = min(int(percent * n_rows), n_rows - 1)
            sigma = reduce_fn(np.sort(squareform(pdist(X)))[:, kth_sample])

    elif method == "silverman":
        sigma = np.power(
            n_samples * (d_dimensions + 2.0) / 4.0, -1.0 / (d_dimensions + 4)
        )

    elif method == "scott":
        sigma = np.power(n_samples, -1.0 / (d_dimensions + 4))

    else:
        raise ValueError('Unrecognized mode "{}".'.format(method))

    # scale the sigma by a factor
    if scale is not None:
        sigma *= scale

    return sigma
def _make_samples(self, X, y_type, nn_data, nn_num, n_samples, step_size=1.):
    """A support function that returns artificial samples constructed along
    the line connecting nearest neighbours.

    Randomness comes from a private generator seeded with ``self.rs_``, so
    the global NumPy random state is left untouched and every synthetic
    sample gets its own step length.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Points from which the points will be created.

    y_type : str or int
        The minority target value, just so the function can return the
        target values for the synthetic variables with correct length in
        a clear format.

    nn_data : ndarray, shape (n_samples_all, n_features)
        Data set carrying all the neighbours to be used

    nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
        The nearest neighbours of each sample in nn_data.

    n_samples : int
        The number of samples to generate.

    step_size : float, optional (default=1.)
        The step size to create samples.

    Returns
    -------
    X_new : ndarray, shape (n_samples_new, n_features)
        Synthetically generated samples.

    y_new : ndarray, shape (n_samples_new, )
        Target values for synthetic samples.

    """
    # Check the consistency of X
    X = check_array(X)

    # One local generator for the whole call. Reseeding the global RNG with
    # the same seed before every draw made each step identical whenever a
    # seed was set.
    rng = np.random.RandomState(self.rs_)

    # Randomly pick (sample, neighbour) cells of the NN matrix
    samples = rng.randint(low=0, high=nn_num.size, size=n_samples)

    # One random step in [0, step_size) per synthetic sample
    steps = step_size * rng.uniform(size=n_samples)

    # A matrix to store the synthetic samples
    X_new = np.zeros((n_samples, X.shape[1]))

    for i, (n, step) in enumerate(zip(samples, steps)):
        # NN lines relate to original sample, columns to its
        # nearest neighbours
        row, col = divmod(n, nn_num.shape[1])

        # Move from the sample towards the chosen neighbour
        X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]])

    # The returned target vector is simply a repetition of the
    # minority label
    y_new = np.array([y_type] * len(X_new))

    if self.verbose:
        print("Generated {} new samples ...".format(len(X_new)))

    return X_new, y_new
def fit_transform(self, X, y, sample_weight=None):
    """Fit and Transform data into modified features
    (before being passed to penalised regression step).

    If `linear_features=True` then this will be scaled linear features
    followed by the one-hot-encoding signifying which rules are "on".
    Otherwise this is just the one-hot-encoding signifying which rules
    are "on".

    Fitting process involves fitted bagged/boosted tree model to generate
    rules and then using these in a penalised logistic regression.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Features
    y : pandas.Series or numpy.ndarray
        Target
    sample_weight : array-like, default=None
        Passed to both the tree model fit and the logistic regression fit.

    Returns
    -------
    sparse array
        The transformed feature matrix the logistic regression was fit on.

    Raises
    ------
    NotImplementedError
        If ``base_estimator`` is not one of the supported classes, or if
        ``rand_tree_size`` is truthy (incremental training is not
        implemented).
    """
    # Instantiate rule ensemble generator and set parameters
    if isinstance(self.base_estimator, XGBClassifier):
        # NOTE(review): ``silent`` is true exactly when verbose > 0, which
        # looks inverted -- confirm before changing.
        self.base_estimator.set_params(n_estimators=self.n_estimators,
                                       silent=(self.verbose > 0),
                                       max_depth=self.max_depth,
                                       n_jobs=self.n_jobs)
    elif isinstance(self.base_estimator, RandomForestClassifier):
        warnings.warn(
            'This base_estimator implementation has not been tested in a while!'
        )
        self.base_estimator.set_params(n_estimators=self.n_estimators,
                                       verbose=self.verbose,
                                       max_depth=self.max_depth,
                                       n_jobs=self.n_jobs)
    elif isinstance(self.base_estimator, GradientBoostingClassifier):
        warnings.warn(
            'This base_estimator implementation has not been tested in a while!'
        )
        self.base_estimator.set_params(n_estimators=self.n_estimators,
                                       verbose=self.verbose,
                                       max_depth=self.max_depth,
                                       n_jobs=self.n_jobs)
    else:
        raise NotImplementedError

    # Name features
    if isinstance(X, DataFrame):
        self.features = X.columns.values
    else:
        self.features = ['f' + str(i) for i in range(X.shape[1])]

    # Check input
    X = check_array(X)

    # Generate and extract rules
    if self.rand_tree_size:
        # TODO: work out how to incrementally train XGB
        # The exception used to be constructed but never raised, letting
        # execution continue with no (or a stale) rule dump.
        raise NotImplementedError(
            'rand_tree_size is not supported yet')
    self.base_estimator.fit(X, y, sample_weight=sample_weight)
    if isinstance(self.base_estimator, XGBClassifier):
        # NOTE(review): only the XGB branch sets ``_rule_dump``; the other
        # estimators rely on it existing already -- confirm.
        self._rule_dump = self.base_estimator._Booster.get_dump()

    if self.verbose > 0:
        print('fitting trees')

    # For each tree: get leaf numbers and map them to [0, num leaves]
    # before one-hot encoding them
    leaves_l = [
        [int(i) for i in re.findall(r'([0-9]+):leaf=', tree_i)]
        for tree_i in self._rule_dump
    ]
    self._one_hot_encoder = LabelOneHotEncoder(leaves_l)

    if self.verbose > 0:
        print('setup encoding')

    # Scale and centre linear features
    # NOTE(review): the trees above were fit on X before this scaling but
    # are applied to the scaled X below -- confirm this is intended.
    X = self.ext_scaler.fit_transform(X)

    if self.linear_features:
        # Linear features must be scaled to have same weighting as an
        # average rule
        self._scaler = FriedScaler(quantile=self.linear_feature_quantile)
        X_scale = self._scaler.fit_transform(X)

    rule_encoding = self._one_hot_encoder.fit_transform(
        self.base_estimator.apply(X).reshape(-1, self.n_estimators))

    if self.linear_features:
        X_transform = hstack([X_scale, rule_encoding])
    else:
        X_transform = rule_encoding

    if self.verbose > 0:
        print('encoded')

    # Fit sparse linear model to rules (and optionally linear features)
    self.LR = LogisticRegression(C=self.C,
                                 penalty=self.penalty,
                                 class_weight=self.class_weight,
                                 warm_start=self.warm_start,
                                 solver='saga',
                                 verbose=self.verbose)
    self.LR.fit(X_transform, y, sample_weight=sample_weight)

    if self.verbose > 0:
        print('fitted')

    # Mask features with zero co-efficients
    self.feature_mask_ = self.LR.coef_.nonzero()[1]

    self.coef_ = self.LR.coef_[0, self.feature_mask_]
    self.intercept_ = self.LR.intercept_
    self.get_feature_names()
    assert self.features_.size == self.feature_mask_.size
    return X_transform
def bootstrap(self, X_list, n_sampling, start_from_t=1):
    """Evaluate the statistical reliability of DAG based on the bootstrapping.

    Parameters
    ----------
    X_list : array-like, shape (X, ...)
        Longitudinal multiple datasets for training, where ``X`` is an
        dataset. The shape of ``X`` is (n_samples, n_features), where
        ``n_samples`` is the number of samples and ``n_features`` is the
        number of features.
    n_sampling : int
        Number of bootstrapping samples.
    start_from_t : int, default=1
        First time point used as the source of a total effect; earlier
        time points are skipped in the total-effect computation.

    Returns
    -------
    results : array-like, shape (BootstrapResult, ...)
        Returns the results of bootstrapping for multiple datasets.

    Raises
    ------
    ValueError
        If ``X_list`` is not a list/ndarray, has fewer than two items, or
        its datasets do not all share the shape of the first one.
    """
    # Check parameters
    if not isinstance(X_list, (list, np.ndarray)):
        raise ValueError('X_list must be a array-like.')

    if len(X_list) < 2:
        raise ValueError(
            'X_list must be a list containing at least two items')

    self._T = len(X_list)
    self._n, self._p = check_array(X_list[0]).shape
    expected_shape = (self._n, self._p)

    X_t = []
    for dataset in X_list:
        checked = check_array(dataset)
        if checked.shape != expected_shape:
            raise ValueError('X_list must be a list with the same shape')
        X_t.append(checked)

    # Bootstrapping
    adjacency_matrices = np.zeros(
        (n_sampling, self._T, 1 + self._n_lags, self._p, self._p))
    total_effects = np.zeros(
        (n_sampling, self._T * self._p, self._T * self._p))

    for i in range(n_sampling):
        # The same resampled row indices are used at every time point.
        resampled_X_t = np.empty((self._T, self._n, self._p))
        indices = np.random.randint(0, self._n, size=(self._n, ))
        for t, X in enumerate(X_t):
            resampled_X_t[t] = X[indices, :]

        self.fit(resampled_X_t)
        adjacency_matrices[i] = self._adjacency_matrices

        # Calculate total effects
        for from_t in range(start_from_t, self._T):
            order = self._causal_orders[from_t]
            for c, from_ in enumerate(order):
                source = from_t * self._p + from_

                # Targets: variables later in the causal order at the same
                # time point, then every variable at each later time point.
                targets = [(from_t, to) for to in order[c + 1:]]
                targets.extend(
                    (to_t, to)
                    for to_t in range(from_t + 1, self._T)
                    for to in self._causal_orders[to_t])

                for to_t, to in targets:
                    total_effects[i, to_t * self._p + to, source] = \
                        self.estimate_total_effect(
                            X_t, from_t, from_, to_t, to)

    return LongitudinalBootstrapResult(self._T, adjacency_matrices,
                                       total_effects)
def dtw_region(x, y, dist='square', region=None, return_cost=False,
               return_accumulated=False, return_path=False):
    """Dynamic Time Warping (DTW) distance with a constraint region.

    Parameters
    ----------
    x : array-like, shape = (n_timestamps_1,)
        First array.

    y : array-like, shape = (n_timestamps_2,)
        Second array

    dist : 'square', 'absolute' or callable (default = 'square')
        Distance used. If 'square', the squared difference is used.
        If 'absolute', the absolute difference is used. If callable,
        it must be a function with a numba.njit() decorator that takes
        as input two numbers (two arguments) and returns a number.

    region : None or array-like, shape = (2, n_timestamps_1)
        Constraint region. If None, no constraint region is used.
        Otherwise, the first row consists of the starting indices
        (included) and the second row consists of the ending indices
        (excluded) of the valid rows for each column.

    return_cost : bool (default = False)
        If True, the cost matrix is returned.

    return_accumulated : bool (default = False)
        If True, the accumulated cost matrix is returned.

    return_path : bool (default = False)
        If True, the optimal path is returned.

    Returns
    -------
    dtw_dist : float
        The DTW distance between the two arrays. For ``dist='square'``
        this is the square root of the final accumulated cost.

    cost_mat : array, shape = (n_timestamps_1, n_timestamps_2)
        Cost matrix. Only returned if ``return_cost=True``.

    acc_cost_mat : array, shape = (n_timestamps_1, n_timestamps_2)
        Accumulated cost matrix. Only returned if
        ``return_accumulated=True``.

    path : array, shape = (2, path_length)
        The optimal path along the cost matrix. The first row consists
        of the indices of the optimal path for x while the second row
        consists of the indices of the optimal path for y. Only returned
        if ``return_path=True``.

    Examples
    --------
    >>> from pyts.metrics import dtw_region
    >>> x = [0, 1, 1]
    >>> y = [2, 0, 1]
    >>> region = [[0, 1, 1], [2, 2, 3]]
    >>> dtw_region(x, y, region=region)
    2.23...

    """
    x, y, n_timestamps_1, n_timestamps_2 = _check_input_dtw(x, y)

    if region is not None:
        region = check_array(region, dtype='int64')
        expected_shape = (2, n_timestamps_1)
        if region.shape != expected_shape:
            raise ValueError("If 'region' is not None, it must be array-like "
                             "with shape (2, n_timestamps_1).")

    cost_mat = cost_matrix(x, y, dist=dist, region=region)
    acc_cost_mat = accumulated_cost_matrix(cost_mat)

    final_cost = acc_cost_mat[-1, -1]
    dtw_dist = sqrt(final_cost) if dist == 'square' else final_cost

    return _return_results(dtw_dist, cost_mat, acc_cost_mat,
                           return_cost, return_accumulated, return_path)
def partial_fit(self, X, y, monitor=None, sample_weight=None, **kwargs):
    """Fit the model on a batch of training data.

    On the first call the target encoding is fit, the TensorFlow graph is
    built and its variables are initialised; later calls only continue
    training on the new batch.

    Parameters
    ----------
    X : numpy array or sparse matrix of shape [n_samples, n_features]
        Training data
    y : numpy array of shape [n_samples, n_targets]
        Target values
    monitor : callable, optional
        The monitor is called after each iteration with the current
        iteration, a reference to the estimator, and a dictionary with
        {'loss': loss_value} representing the loss calculated by the
        objective function at this iteration.
        If the callable returns True the fitting procedure is stopped.
        The monitor can be used for various things such as computing
        held-out estimates, early stopping, model introspection,
        and snapshoting.
    sample_weight : numpy array of shape [n_samples,]
        Per-sample weights. Re-scale the loss per sample.
        Higher weights force the estimator to put more emphasis
        on these samples. Sample weights are normalized per-batch.
    **kwargs
        Forwarded to ``_fit_targets`` on the first call only.

    Returns
    -------
    self : returns an instance of self.
    """
    X, y = self._check_inputs(X, y)
    assert self.batch_size > 0, "batch_size <= 0"

    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)

    # Initialize the model if it hasn't been already by a previous call.
    first_call = not self._is_fitted
    if first_call:
        self._random_state = check_random_state(self.random_state)
        self._fit_targets(y, **kwargs)

    y = self._transform_targets(y)

    if first_call:
        self.is_sparse_ = sp.issparse(X)
        self.input_layer_sz_ = X.shape[1]

        # Set which layer transform function points to
        n_hidden = len(self.hidden_units)
        self._transform_layer_index = (
            n_hidden - 1 if self.transform_layer_index is None
            else self.transform_layer_index)

        if not -1 <= self._transform_layer_index < n_hidden:
            raise ValueError(
                "`transform_layer_index` must be in the range "
                "[-1, len(hidden_units)-1]!")

        # Instantiate the graph.  TensorFlow seems easier to use by just
        # adding to the default graph, and as_default lets you temporarily
        # set a graph to be treated as the default graph.
        self.graph_ = Graph()
        with self.graph_.as_default():
            tf_random_seed.set_random_seed(
                self._random_state.randint(0, 10000000))

            tf.get_variable_scope().set_initializer(
                tf.contrib.layers.xavier_initializer())

            self._build_tf_graph()

            # Train model parameters.
            self._session.run(tf.global_variables_initializer())

        # Set an attributed to mark this as at least partially fitted.
        self._is_fitted = True

    # Train the model with the given data.
    with self.graph_.as_default():
        n_examples = X.shape[0]
        indices = np.arange(n_examples)

        for epoch in range(self.n_epochs):
            self._random_state.shuffle(indices)
            for start_idx in range(0, n_examples, self.batch_size):
                batch_ind = indices[start_idx:start_idx + self.batch_size]
                batch_sample_weight = (
                    None if sample_weight is None
                    else sample_weight[batch_ind])
                feed_dict = self._make_feed_dict(
                    X[batch_ind], y[batch_ind],
                    sample_weight=batch_sample_weight)
                obj_val, _ = self._session.run(
                    [self._obj_func, self._train_step],
                    feed_dict=feed_dict)
                _LOGGER.debug("objective: %.4f, epoch: %d, idx: %d",
                              obj_val, epoch, start_idx)

            # Per-epoch summary: reports the last mini-batch of the epoch.
            _LOGGER.info("objective: %.4f, epoch: %d, idx: %d",
                         obj_val, epoch, start_idx)

            if monitor and monitor(epoch, self, {'loss': obj_val}):
                _LOGGER.info(
                    "stopping early due to monitor function.")
                return self

    return self
def fit(self, X, y=None):
    """Fit detector. y is ignored in unsupervised methods.

    Trains one discriminator against ``self.k`` sub-generators for
    ``3 * self.stop_epochs`` epochs. The sub-generators stop being
    updated (they are only evaluated) once ``stop_epochs`` is exceeded.
    The discriminator output on ``X`` is stored in
    ``self.decision_scores_``.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : Ignored
        Not used, present for API consistency by convention.

    Returns
    -------
    self : object
        Fitted estimator.
    """

    X = check_array(X)
    self._set_n_classes(y)
    self.train_history = defaultdict(list)
    # ``names`` is used purely as a string-keyed registry for the
    # per-generator objects ('sub_generator0', 'combine_model0', ...).
    names = locals()
    epochs = self.stop_epochs * 3
    stop = 0  # becomes 1 once the generators should no longer be trained
    latent_size = X.shape[1]
    data_size = X.shape[0]
    # Create discriminator
    self.discriminator = create_discriminator(latent_size, data_size)
    self.discriminator.compile(optimizer=SGD(lr=self.lr_d, decay=self.decay, momentum=self.momentum), loss='binary_crossentropy')

    # Create k combine models
    for i in range(self.k):
        names['sub_generator' + str(i)] = create_generator(latent_size)
        latent = Input(shape=(latent_size, ))
        names['fake' + str(i)] = names['sub_generator' + str(i)](latent)
        # The discriminator is marked non-trainable before it is wired
        # into the combined (generator -> discriminator) model.
        self.discriminator.trainable = False
        names['fake' + str(i)] = self.discriminator(names['fake' + str(i)])
        names['combine_model' + str(i)] = Model(latent, names['fake' + str(i)])
        names['combine_model' + str(i)].compile(optimizer=SGD( lr=self.lr_g, decay=self.decay, momentum=self.momentum), loss='binary_crossentropy')

    # Start iteration
    for epoch in range(epochs):
        if self.verbose:
            print('Epoch {} of {}'.format(epoch + 1, epochs))
        batch_size = min(500, data_size)
        # Integer number of full batches; a trailing partial batch is
        # not visited.
        num_batches = int(data_size / batch_size)

        for index in range(num_batches):
            if self.verbose:
                print('\nTesting for epoch {} index {}:'.format( epoch + 1, index + 1))

            # Generate noise
            noise_size = batch_size
            noise = np.random.uniform(0, 1, (int(noise_size), latent_size))

            # Get training data
            data_batch = X[index * batch_size:(index + 1) * batch_size]

            # Generate potential outliers
            # The noise rows are split into k consecutive slices of
            # decreasing size; the last generator takes whatever remains.
            block = ((1 + self.k) * self.k) // 2
            for i in range(self.k):
                if i != (self.k - 1):
                    noise_start = int( (((self.k + (self.k - i + 1)) * i) / 2) * (noise_size // block))
                    noise_end = int( (((self.k + (self.k - i)) * (i + 1)) / 2) * (noise_size // block))
                    names['noise' + str(i)] = noise[noise_start:noise_end]
                    names['generated_data' + str(i)] = names['sub_generator' + str(i)].predict(names['noise' + str(i)], verbose=0)
                else:
                    noise_start = int( (((self.k + (self.k - i + 1)) * i) / 2) * (noise_size // block))
                    names['noise' + str(i)] = noise[noise_start:noise_size]
                    names['generated_data' + str(i)] = names['sub_generator' + str(i)].predict(names['noise' + str(i)], verbose=0)

            # Concatenate real data to generated data
            for i in range(self.k):
                if i == 0:
                    x = np.concatenate( (data_batch, names['generated_data' + str(i)]))
                else:
                    x = np.concatenate( (x, names['generated_data' + str(i)]))
            # Labels: 1 for real rows, 0 for generated rows. This rebinds
            # the (ignored) ``y`` argument.
            y = np.array([1] * batch_size + [0] * int(noise_size))

            # Train discriminator
            discriminator_loss = self.discriminator.train_on_batch(x, y)
            self.train_history['discriminator_loss'].append( discriminator_loss)

            # Get the target value of sub-generator
            pred_scores = self.discriminator.predict(X).ravel()

            # Generator i is trained towards the (i / k)-th percentile of
            # the discriminator scores on the full data.
            for i in range(self.k):
                names['T' + str(i)] = np.percentile( pred_scores, i / self.k * 100)
                names['trick' + str(i)] = np.array( [float(names['T' + str(i)])] * noise_size)

            # Train generator
            noise = np.random.uniform(0, 1, (int(noise_size), latent_size))
            if stop == 0:
                for i in range(self.k):
                    names['sub_generator' + str(i) + '_loss'] = \
                        names['combine_model' + str(i)].train_on_batch( noise, names['trick' + str(i)])
                    self.train_history['sub_generator{}_loss'.format( i)].append(names['sub_generator' + str(i) + '_loss'])
            else:
                # Generators are frozen: only record their current loss.
                for i in range(self.k):
                    names['sub_generator' + str(i) + '_loss'] = names['combine_model' + str(i)].evaluate( noise, names['trick' + str(i)])
                    self.train_history['sub_generator{}_loss'.format( i)].append(names['sub_generator' + str(i) + '_loss'])

            # Average of the k sub-generator losses
            generator_loss = 0
            for i in range(self.k):
                generator_loss = generator_loss + names['sub_generator' + str(i) + '_loss']
            generator_loss = generator_loss / self.k

            self.train_history['generator_loss'].append(generator_loss)

            # Stop training generator
            if epoch + 1 > self.stop_epochs:
                stop = 1

    # Detection result
    self.decision_scores_ = self.discriminator.predict(X).ravel()
    self._process_decision_scores()
    return self
def _compute_shap_values(pipeline, features, training_data=None):
    """Computes SHAP values for each feature.

    Tree-based estimators are explained with ``shap.TreeExplainer``; all
    other estimators use ``shap.KernelExplainer`` on a sample of at most
    100 rows of ``training_data``.

    Arguments:
        pipeline (PipelineBase): Trained pipeline whose predictions we want to explain with SHAP.
        features (pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on.
        training_data (pd.DataFrame): Training data the pipeline was fit on.
            For non-tree estimators, we need a sample of training data for the KernelSHAP algorithm.

    Returns:
        dict or list(dict): For regression problems, a dictionary mapping a feature name to a list of SHAP values.
            For classification problems, returns a list of dictionaries. One for each class.

    Raises:
        ValueError: For baseline pipelines, for a non-tree estimator without ``training_data``,
            or when SHAP returns neither a list nor an ndarray.
        NotImplementedError: For xgboost models and for catboost multiclass problems.
    """
    estimator = pipeline.estimator
    family = estimator.model_family
    if family == ModelFamily.BASELINE:
        raise ValueError(
            "You passed in a baseline pipeline. These are simple enough that SHAP values are not needed."
        )

    # Capture the column names before the frame is turned into an ndarray.
    feature_names = features.columns
    is_catboost = family == ModelFamily.CATBOOST

    # This is to make sure all dtypes are numeric - SHAP algorithms will complain otherwise.
    # Sklearn components do this under-the-hood so we're not changing the data the model was trained on.
    # Catboost can naturally handle string-encoded categorical features so we don't need to convert to numeric.
    if not is_catboost:
        features = check_array(features.values)

    if family.is_tree_estimator():
        # Because of this issue: https://github.com/slundberg/shap/issues/1215
        if family == ModelFamily.XGBOOST:
            raise NotImplementedError(
                "SHAP values cannot currently be computed for xgboost models.")
        if is_catboost and pipeline.problem_type == ProblemTypes.MULTICLASS:
            # Will randomly segfault
            raise NotImplementedError(
                "SHAP values cannot currently be computed for catboost models for multiclass problems."
            )
        # Use tree_path_dependent to avoid linear runtime with dataset size
        with warnings.catch_warnings(record=True) as ws:
            explainer = shap.TreeExplainer(
                estimator._component_obj,
                feature_perturbation="tree_path_dependent")
        if ws:
            logger.debug(
                f"_compute_shap_values TreeExplainer: {ws[0].message}")
        shap_values = explainer.shap_values(features, check_additivity=False)
        # shap only outputs values for positive class for Catboost binary estimators.
        # this modifies the output to match the output format of other binary estimators.
        # Ok to fill values of negative class with zeros since the negative class will get dropped
        # in the UI anyways.
        if is_catboost and pipeline.problem_type == ProblemTypes.BINARY:
            shap_values = [np.zeros(shap_values.shape), shap_values]
    else:
        if training_data is None:
            raise ValueError(
                "You must pass in a value for parameter 'training_data' when the pipeline "
                "does not have a tree-based estimator. "
                f"Current estimator model family is {estimator.model_family}.")

        # More than 100 datapoints can negatively impact runtime according to SHAP
        # https://github.com/slundberg/shap/blob/master/shap/explainers/kernel.py#L114
        sampled_training_data_features = check_array(
            shap.sample(training_data, 100))

        is_regression = pipeline.problem_type == ProblemTypes.REGRESSION
        link_function = "identity" if is_regression else "logit"
        decision_function = (
            estimator._component_obj.predict if is_regression
            else estimator._component_obj.predict_proba)

        with warnings.catch_warnings(record=True) as ws:
            explainer = shap.KernelExplainer(decision_function,
                                             sampled_training_data_features,
                                             link_function)
            shap_values = explainer.shap_values(features)
        if ws:
            logger.debug(
                f"_compute_shap_values KernelExplainer: {ws[0].message}")

    # classification problem
    if isinstance(shap_values, list):
        return [
            _create_dictionary(class_shap_values, feature_names)
            for class_shap_values in shap_values
        ]
    # regression problem
    if isinstance(shap_values, np.ndarray):
        return _create_dictionary(shap_values, feature_names)
    raise ValueError(
        f"Unknown shap_values datatype {str(type(shap_values))}!")
def fit(self, X, lengths=None):
    """Estimate model parameters with hard (Viterbi) assignments.

    Each iteration decodes every sequence with the Viterbi algorithm,
    turns the decoded path into one-hot transition and state-occupancy
    statistics, accumulates them and runs the M-step. When ``update_dnn``
    is truthy, ``self.mlp`` is retrained on the decoded paths and the
    state frequencies are stored in ``self.aucoustic_model``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix of the concatenated sequences.
    lengths : array-like of int, optional
        Lengths of the individual sequences in ``X``; forwarded to
        ``iter_from_X_lengths``.

    Returns
    -------
    self : object

    Notes
    -----
    ``fast_update`` and ``update_dnn`` are not defined in this function.
    NOTE(review): presumably module-level flags -- confirm.
    """
    X = check_array(X)
    self._init(X, lengths=lengths)
    self._check()

    self.monitor_ = ConvergenceMonitor(self.tol, self.n_iter, self.verbose)
    for iteration in range(self.n_iter):
        print('iteration: {}'.format(iteration))
        stats = self._initialize_sufficient_statistics()
        curr_logprob = 0
        tt = 0  # number of sequences processed in this iteration
        path_list = list()

        for start, end in iter_from_X_lengths(X, lengths):
            logprob, state_sequence = self.decode(X[start:end],
                                                  algorithm="viterbi")
            curr_logprob += logprob

            n_steps = state_sequence.shape[0]
            epsilon = np.zeros((n_steps - 1, self.n_components,
                                self.n_components))
            gamma = np.zeros((n_steps, self.n_components))

            # One-hot transition indicator along the decoded path.
            for t in range(n_steps - 1):
                epsilon[t, state_sequence[t], state_sequence[t + 1]] = 1

            # Occupancy at step t is the row sum of the outgoing
            # transitions; the last step has none and copies the previous
            # step's occupancy.
            gamma[:-1] = epsilon.sum(axis=2)
            if n_steps > 1:
                gamma[-1] = gamma[-2]

            path_list.append(state_sequence)
            # The sequence bounds must not be reused as loop indices above:
            # an inner ``for i in range(self.n_components)`` used to
            # overwrite the start index, so the wrong slice of X was
            # accumulated here.
            self._accumulate_sufficient_statistics(
                stats, X[start:end], epsilon, gamma, state_sequence, None)
            tt += 1

        print('average loss: {}'.format(curr_logprob / tt))

        if not fast_update:
            stats['start'] /= tt
            stats['trans'] /= tt

        self._do_mstep(stats)

        if update_dnn:
            # Stack all decoded paths into one (n_samples, 1) column; the
            # empty float seed keeps the float dtype of the stacked array.
            temp_path = np.vstack(
                [np.zeros((0, 1))]
                + [np.array(path).reshape(-1, 1) for path in path_list])
            self.mlp.train(X, temp_path, 20)

            # Re-decode with the updated model and count state visits.
            acoustic_model = np.zeros(self.n_components)
            for start, end in iter_from_X_lengths(X, lengths):
                logprob, state_sequence = self.decode(X[start:end],
                                                      algorithm="viterbi")
                for state in state_sequence:
                    acoustic_model[state] += 1
            self.aucoustic_model = acoustic_model / np.sum(acoustic_model)

        self.monitor_.report(curr_logprob)
        history = self.monitor_.history
        if self.monitor_.iter == self.monitor_.n_iter or \
                (len(history) == 2 and
                 abs(history[1] - history[0]) <
                 self.monitor_.tol * abs(history[1])):
            break

        print('----------------------------------------------')
    return self
def find_bots(self, priors):
    # Python 2 code (print statements).
    #
    # One expansion round: select well-connected followers, load or fetch
    # their profile and timeline, extract features, run LocalOutlierFactor
    # over ``priors`` plus the new feature dicts, and add followers scoring
    # at or below the LOF threshold to ``self.clique`` / ``self.to_check``.
    #
    # priors: list of feature dicts placed in front of the new features.
    # NOTE(review): ``priors`` is extended in place below, so the caller's
    # list grows on every call -- confirm this is intended.
    print "Getting all user info..."
    self.users_to_query = set()
    followers_set = set(self.followers)
    print "Number of followers: " + str(len(self.followers))
    follower_counts = Counter(self.followers).most_common()
    # should fix this to be a more precise measure
    size_to_keep = int(.15*len(self.followers))
    connectedness_threshold = floor(0.3*self.n)
    # Keep followers seen at least ``connectedness_threshold`` times; if
    # that yields too few, top up from the most common ones seen more
    # than once.
    tmp_followers = [f[0] for f in follower_counts if f[1] >= connectedness_threshold]
    if len(tmp_followers) < size_to_keep:
        tmp_followers.extend([f[0] for f in follower_counts[:size_to_keep] if f[1] > 1])
    followers_set = set(tmp_followers)
    print "Number of connected followers: " + str(len(followers_set))
    # Pass 1: use cached profile info where available, otherwise queue the
    # user for a query.
    for follower in followers_set:
        user_info = None
        follower = str(follower)
        if follower not in self.users and follower not in self.ignore_users:
            self.cur.execute('SELECT suspended, deleted, other_error, user_info_json FROM followers WHERE user_id = %s', (follower,))
            record = self.cur.fetchone()
            if record:
                if record[0] or record[1] or record[2]:
                    self.ignore_users.add(follower)
                    # print "User is suspended or deleted"
                    continue
                if record[3]:
                    # print "Already have profile information for user number " + follower
                    self.user_info[follower] = ast.literal_eval(record[3])
                    continue
            self.users_to_query.add(follower)
    get_user_info(self)
    print "Getting all timeline info and extracting features"
    # Pass 2: obtain each user's timeline (from file when the DB row has
    # one, otherwise fetched) and extract features.
    for follower in followers_set:
        timeline = None
        follower = str(follower)
        if follower not in self.users and follower not in self.ignore_users:
            self.users.add(follower)
            self.cur.execute('SELECT suspended, deleted, other_error, timeline FROM followers WHERE user_id = %s', (follower,))
            record = self.cur.fetchone()
            if record:
                if record[0] or record[1] or record[2]:
                    self.ignore_users.add(follower)
                    # print "User is suspended or deleted"
                    continue
                if record[3]:
                    # print "Already have timeline information for user number " + follower
                    # Have to read in file to get timeline info
                    timeline = get_timeline_from_file(self, follower)
                else:
                    timeline = get_user_timeline(self, follower)
            else:
                timeline = get_user_timeline(self, follower)
            # Only users with profile info and more than 50 timeline
            # entries are featurised.
            if timeline and self.user_info.get(follower) and len(timeline) > 50:
                gf = GetFeatures(follower, self.user_info[follower], timeline)
                try:
                    gf.user_features()
                    gf.collect_tweets()
                    gf.content_features()
                    gf.temporal_features()
                except Exception as e:
                    # NOTE(review): errors are only printed; the possibly
                    # incomplete ``gf.features`` is still appended below.
                    print "ERROR GETTING FEATURES"
                    print e
                    print follower
                    print self.user_info[follower]
                # need to incorporate other network features
                #gf.features['num_shared_edges'] = follower_counts[user]
                #cself.user_features[user] = gf.features
                self.current_level_users.append(follower)
                self.features_list.append(gf.features)
    # Axis=0 should be vertical
    len_priors = len(priors)
    current_features = priors
    current_features.extend(self.features_list)
    print "Performing anomaly detection"
    #json.dump(priors, open('test.json', 'w'), indent=4, separators=(',', ': '))
    X = self.vec.fit_transform(current_features).toarray()
    current_features = {}
    # NOTE(review): X_norm is computed but the model below is fit on the
    # un-normalised X -- confirm which one was intended.
    X_norm = normalize(X)
    #print np.any(np.isnan(X))
    #print np.all(np.isfinite(X))
    print X.shape
    # X = np.stack([current_features, priors], axis=0)
    # Every round will find outliers, how do we stop exploring?
    clf = LocalOutlierFactor(n_neighbors=20)
    clf.fit(X)
    check_is_fitted(clf, ["threshold_", "negative_outlier_factor_", "n_neighbors_", "_distances_fit_X_"])
    # X was just produced by ``.toarray()`` above, so it is not None here
    # and the else branch is not taken.
    if X is not None:
        X = check_array(X, accept_sparse='csr')
        y_pred = clf._decision_function(X)
    else:
        y_pred = clf.negative_outlier_factor_
    #y_pred = clf.fit_predict(X)
    # Drop the scores belonging to the priors; the rest are paired
    # positionally with self.current_level_users.
    y_pred_new = y_pred[len_priors:]
    # Do anomaly detection and set connected followers to certain outliers
    # this line is a stand-in
    users_scores = zip(self.current_level_users, y_pred_new)
    connected_followers = [u[0] for u in users_scores if u[1] <= clf.threshold_]
    #How do I add back in the outliers to the anomaly detection? Mueen said not to so I will leave for now
    self.level += 1
    # Add highly connected followers to the clique and to_check
    for follower in connected_followers:
        self.clique.add((follower, self.level))
        self.to_check.add(follower)
    print self.clique
    self.n = float(len(self.clique))
    print "Current size of cluster: " + str(self.n)
def update(self, X, y=None, **fit_params):
    """Add one dataset to the aligned model and re-run the joint layout.

    A new ``UMAP`` mapper is fitted on ``X`` and appended to
    ``self.mappers_``; an initial embedding for it is derived from the most
    recent existing embedding, and then every embedding in
    ``self.embeddings_`` is passed through
    ``optimize_layout_aligned_euclidean`` again.

    Parameters
    ----------
    X : array-like
        Data for the new model; validated with ``check_array``.
    y : ignored
        Accepted in the signature but never read by this method.
    **fit_params
        Must contain ``"relations"``, which is forwarded to ``invert_dict``
        and ``init_from_existing``. The complete mapping is also handed to
        ``set_aligned_params`` together with ``self.__dict__``.

    Raises
    ------
    ValueError
        If ``"relations"`` is not among ``fit_params``.
    """
    if "relations" not in fit_params:
        raise ValueError(
            "Aligned UMAP requires relations between data to be "
            "specified")

    latest_relations = fit_params["relations"]
    X = check_array(X)

    self.__dict__ = set_aligned_params(fit_params, self.__dict__,
                                       self.n_models_)
    # NOTE(review): the counter is bumped *before* it is used as the index
    # into the per-model parameters below -- confirm this is the position
    # get_nth_item_or_val / set_aligned_params expect.
    self.n_models_ += 1

    # Hyper-parameters that may be given per model; each one is resolved for
    # the current model index, in this order.
    per_model_names = (
        "n_neighbors",
        "min_dist",
        "n_epochs",
        "repulsion_strength",
        "learning_rate",
        "spread",
        "negative_sample_rate",
        "local_connectivity",
        "set_op_mix_ratio",
        "unique",
    )
    umap_kwargs = {
        name: get_nth_item_or_val(getattr(self, name), self.n_models_)
        for name in per_model_names
    }
    fresh_mapper = UMAP(**umap_kwargs, n_components=self.n_components).fit(X)

    self.mappers_ += [fresh_mapper]

    # TODO: We can likely make this more efficient and not recompute each time
    self.dict_relations_ += [invert_dict(latest_relations)]

    n_epochs = 200 if self.n_epochs is None else self.n_epochs

    int32_vector = numba.types.int32[::1]
    indptr_list = numba.typed.List.empty_list(int32_vector)
    indices_list = numba.typed.List.empty_list(int32_vector)
    heads = numba.typed.List.empty_list(int32_vector)
    tails = numba.typed.List.empty_list(int32_vector)
    epochs_per_samples = numba.typed.List.empty_list(
        numba.types.float64[::1])

    newest = len(self.mappers_) - 1
    for position, fitted in enumerate(self.mappers_):
        indptr_list.append(fitted.graph_.indptr)
        indices_list.append(fitted.graph_.indices)
        graph_coo = fitted.graph_.tocoo()
        heads.append(graph_coo.row)
        tails.append(graph_coo.col)
        if position == newest:
            # Only the just-fitted model gets a real sampling schedule.
            schedule = make_epochs_per_sample(graph_coo.data, n_epochs)
        else:
            # Earlier models get a constant n_epochs + 1 -- presumably so
            # their own edges are not sampled within n_epochs; confirm
            # against the optimizer.
            schedule = np.full(fitted.embedding_.shape[0], n_epochs + 1,
                               dtype=np.float64)
        epochs_per_samples.append(schedule)

    expanded_relations = expand_relations(self.dict_relations_)
    regularisation_weights = build_neighborhood_similarities(
        indptr_list,
        indices_list,
        expanded_relations,
    )

    seed_embedding = init_from_existing(self.embeddings_[-1],
                                        fresh_mapper.graph_,
                                        latest_relations)

    rng = check_random_state(self.random_state)
    rng_state = rng.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    self.embeddings_.append(seed_embedding)

    # The same list is passed as the first two positional arguments, exactly
    # as before.
    self.embeddings_ = optimize_layout_aligned_euclidean(
        self.embeddings_,
        self.embeddings_,
        heads,
        tails,
        n_epochs,
        epochs_per_samples,
        regularisation_weights,
        expanded_relations,
        rng_state,
        lambda_=self.alignment_regularisation,
    )
def test_check_array():
    """Exercise ``check_array`` on dense, sparse, nested-list and array-like input.

    The test walks through, in order: rejection of sparse input by default,
    the ``ensure_2d`` switch, rejection of 3-D input unless ``allow_nd`` is
    set, dtype/order/copy handling for dense arrays, format/dtype/copy
    handling for sparse matrices with an ``accept_sparse`` list, and finally
    conversion of lists and of a ``_NotAnArray`` wrapper to ``ndarray``.
    """
    # accept_sparse is left at its default here:
    # sparse input must raise TypeError
    X = [[1, 2], [3, 4]]
    X_csr = sp.csr_matrix(X)
    with pytest.raises(TypeError):
        check_array(X_csr)
    # ensure_2d=False lets a 1-D input through unchanged in rank
    X_array = check_array([0, 1, 2], ensure_2d=False)
    assert X_array.ndim == 1
    # ensure_2d=True with 1d array
    with pytest.raises(ValueError, match="Expected 2D array,"
                                         " got 1D array instead"):
        check_array([0, 1, 2], ensure_2d=True)
    # ensure_2d=True with scalar array
    with pytest.raises(ValueError, match="Expected 2D array,"
                                         " got scalar array instead"):
        check_array(10, ensure_2d=True)
    # a 3-D array (ndim >= 3) is rejected unless allow_nd=True
    X_ndim = np.arange(8).reshape(2, 2, 2)
    with pytest.raises(ValueError):
        check_array(X_ndim)
    check_array(X_ndim, allow_nd=True)  # doesn't raise

    # dtype and order enforcement.
    X_C = np.arange(4).reshape(2, 2).copy("C")
    X_F = X_C.copy("F")
    X_int = X_C.astype(int)
    X_float = X_C.astype(float)
    Xs = [X_C, X_F, X_int, X_float]
    dtypes = [np.int32, int, float, np.float32, None, bool, object]
    orders = ['C', 'F', None]
    copys = [True, False]

    # Every combination of input array, requested dtype, memory order and
    # copy flag is checked.
    for X, dtype, order, copy in product(Xs, dtypes, orders, copys):
        X_checked = check_array(X, dtype=dtype, order=order, copy=copy)
        # dtype=None means "keep the input dtype"
        if dtype is not None:
            assert X_checked.dtype == dtype
        else:
            assert X_checked.dtype == X.dtype
        # a requested order must be the only contiguity flag set
        if order == 'C':
            assert X_checked.flags['C_CONTIGUOUS']
            assert not X_checked.flags['F_CONTIGUOUS']
        elif order == 'F':
            assert X_checked.flags['F_CONTIGUOUS']
            assert not X_checked.flags['C_CONTIGUOUS']
        if copy:
            assert X is not X_checked
        else:
            # doesn't copy if it was already good
            if (X.dtype == X_checked.dtype and
                    X_checked.flags['C_CONTIGUOUS'] == X.flags['C_CONTIGUOUS']
                    and X_checked.flags['F_CONTIGUOUS'] == X.flags['F_CONTIGUOUS']):
                assert X is X_checked

    # accept_sparse given as a list of allowed formats
    X_csc = sp.csc_matrix(X_C)
    X_coo = X_csc.tocoo()
    X_dok = X_csc.todok()
    X_int = X_csc.astype(int)
    X_float = X_csc.astype(float)

    Xs = [X_csc, X_coo, X_dok, X_int, X_float]
    accept_sparses = [['csr', 'coo'], ['coo', 'dok']]
    for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses,
                                                 copys):
        # Record warnings so they can be inspected instead of being emitted.
        with warnings.catch_warnings(record=True) as w:
            X_checked = check_array(X, dtype=dtype,
                                    accept_sparse=accept_sparse, copy=copy)
        if (dtype is object or sp.isspmatrix_dok(X)) and len(w):
            # XXX unreached code as of v0.22
            message = str(w[0].message)
            messages = ["object dtype is not supported by sparse matrices",
                        "Can't check dok sparse matrix for nan or inf."]
            assert message in messages
        else:
            assert len(w) == 0
        if dtype is not None:
            assert X_checked.dtype == dtype
        else:
            assert X_checked.dtype == X.dtype
        if X.format in accept_sparse:
            # no change if allowed
            assert X.format == X_checked.format
        else:
            # got converted to the first format in the accept list
            assert X_checked.format == accept_sparse[0]
        if copy:
            assert X is not X_checked
        else:
            # doesn't copy if it was already good
            if X.dtype == X_checked.dtype and X.format == X_checked.format:
                assert X is X_checked

    # other input formats
    # convert lists to arrays
    X_dense = check_array([[1, 2], [3, 4]])
    assert isinstance(X_dense, np.ndarray)
    # raise on too deep lists
    with pytest.raises(ValueError):
        check_array(X_ndim.tolist())
    check_array(X_ndim.tolist(), allow_nd=True)  # doesn't raise
    # convert weird stuff to arrays
    X_no_array = _NotAnArray(X_dense)
    result = check_array(X_no_array)
    assert isinstance(result, np.ndarray)
def _fit(
    X,
    alpha=1e-2,
    gamma=1e-3,
    tol=1e-3,
    max_iter=1000,
    verbose=0,
    return_history=True,
    compute_objective=True,
    warm_start=None,
    return_n_iter=False,
    adjust_gamma=False,
    A=None,
    T=0,
    rho=1,
    update_gamma=0.5,
    line_search=False,
):
    """Iterate gradient step -> symmetrization -> soft-thresholding on ``theta``.

    Each iteration takes a step of size ``gamma`` along the negative of
    ``_gradient_ising``, symmetrizes the result, and applies
    ``soft_thresholding_od`` with threshold ``alpha * gamma``. Iteration
    stops after ``max_iter`` rounds or as soon as ``abs(check[2]) < tol``,
    where ``check`` is the record built by ``convergence`` (the verbose
    output labels ``check[2]`` as the iterate norm).

    Parameters
    ----------
    X : ndarray, 2-D
        Input data; ``X.shape`` is unpacked as ``(n, d)``.
    alpha : float
        Multiplied by ``gamma`` to form the soft-thresholding level; also
        passed to ``objective``.
    gamma : float
        Step size. With ``line_search=True`` it is shrunk in place and the
        reduced value carries over to later iterations.
    tol : float
        Stopping threshold on ``abs(check[2])``.
    max_iter : int
        Maximum number of outer iterations.
    verbose : int or bool
        When truthy, print a progress line per iteration (and the current
        ``theta`` at every line-search step).
    return_history : bool
        Append the list of all iterates and the list of convergence records
        to the result.
    compute_objective : bool
        Currently unused; kept for interface compatibility.
    warm_start : array-like or None
        Initial ``theta`` (validated with ``check_array``); a ``(d, d)``
        zero matrix when ``None``.
    return_n_iter : bool
        Append the index of the last iteration executed to the result.
    adjust_gamma : bool
        Currently unused; kept for interface compatibility.
    A, T, rho
        Forwarded unchanged to ``_gradient_ising``.
    update_gamma : float
        Factor applied to ``gamma`` whenever a line-search step is rejected.
    line_search : bool
        Enable the step-size search branch.

    Returns
    -------
    list
        ``[theta]``, followed by ``thetas`` and ``checks`` when
        ``return_history`` is set, followed by the 0-based index of the last
        iteration run when ``return_n_iter`` is set (``-1`` if the loop never
        ran, e.g. ``max_iter <= 0``).
    """
    n, d = X.shape
    if warm_start is None:
        theta = np.zeros((d, d))
    else:
        theta = check_array(warm_start)

    thetas = [theta]
    checks = []
    # Bound up front so ``return_n_iter`` cannot hit an unbound name when
    # the loop body never executes.
    iter_ = -1
    for iter_ in range(max_iter):
        theta_old = thetas[-1]
        if not line_search:
            grad = _gradient_ising(X, theta, n, A, rho, T)
            theta_new = theta - gamma * grad
            theta = (theta_new + theta_new.T) / 2
            theta = soft_thresholding_od(theta, alpha * gamma)
        else:
            # NOTE(review): this loop has no iteration cap; if the loss stays
            # NaN/inf, gamma keeps shrinking and the loop may not terminate.
            while True:
                grad = _gradient_ising(X, theta, n, A, rho, T)
                theta_new = theta - gamma * grad
                theta = (theta_new + theta_new.T) / 2
                theta = soft_thresholding_od(theta, alpha * gamma)
                if verbose:
                    # Diagnostic dump; previously printed unconditionally.
                    print(theta)
                loss_new = loss(X, theta)
                loss_old = loss(X, theta_old)

                # Line search
                diff_theta2 = np.linalg.norm(theta_old - theta) ** 2
                grad_diff = np.trace(grad.dot(theta_old - theta))
                diff = loss_old - grad_diff + (diff_theta2 / (2 * gamma))

                if loss_new > diff or np.isinf(loss_new) or np.isnan(loss_new):
                    # Step rejected: shrink gamma and restart from theta_old.
                    gamma = update_gamma * gamma
                    theta = theta_old - gamma * grad
                    theta = soft_thresholding_od(theta, alpha * gamma)
                    # NOTE(review): both values below are recomputed at the
                    # top of the next pass before being read.
                    loss_new = loss(X, theta)
                    diff = loss_old - grad_diff + (diff_theta2 / (2 * gamma))
                else:
                    break

        thetas.append(theta)
        # Warnings are silenced here; the relative norm can divide by zero
        # when the current iterate is all zeros.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            obj = objective(X, theta, alpha)
            iter_norm = np.linalg.norm(thetas[-2] - thetas[-1])
            check = convergence(
                iter=iter_,
                obj=obj,
                iter_norm=iter_norm,
                iter_r_norm=iter_norm / np.linalg.norm(thetas[-1]),
            )
        checks.append(check)
        # if adjust_gamma:  # TODO multiply or divide
        if verbose:
            print("Iter: %d, objective: %.4f, iter_norm %.4f"
                  % (check[0], check[1], check[2]))

        if np.abs(check[2]) < tol:
            break

    return_list = [thetas[-1]]
    if return_history:
        return_list.append(thetas)
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iter_)
    return return_list