def predict_proba(self, X: np.ndarray) -> np.ndarray:
    """ Predict the likelihood of each class.

    This function will only work as expected if training used the
    `binary:logistic` loss.

    Parameters
    ----------
    X : numpy.ndarray
        The input data

    Returns
    -------
    y_proba_pred : numpy.ndarray
        The probabilistic predictions
    """
    validation_utils.check_is_fitted(self, 'best_booster_', self.name)

    # apply the same scaling used during training, if any
    if self.scaler_ is not None:
        msg = "transforming the input data"
        self.log(msg, logging.DEBUG)
        X = self.scaler_.transform(X)

    d_x = xgb.DMatrix(X)
    y_proba_pred = xgbooster_predict_proba(self.best_booster_, d_x)
    return y_proba_pred
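# A minimal, self-contained sketch of the probability step above, assuming
# the `binary:logistic` objective. `_sketch_xgbooster_predict_proba` below is
# a plausible stand-in for this package's `xgbooster_predict_proba` helper:
# for that objective, `Booster.predict` returns P(y = 1), so the helper
# presumably stacks the complementary column. This is an illustration, not
# the package's actual implementation; all names are demo values.
import numpy as np
import xgboost as xgb

def _sketch_xgbooster_predict_proba(booster, d_x):
    # for binary:logistic, predict gives the probability of the positive class
    p_pos = booster.predict(d_x)
    return np.column_stack((1.0 - p_pos, p_pos))

def _demo_predict_proba():
    X_demo = np.random.rand(20, 4)
    y_demo = (X_demo[:, 0] > 0.5).astype(int)

    booster_demo = xgb.train(
        {'objective': 'binary:logistic'},
        xgb.DMatrix(X_demo, label=y_demo),
        num_boost_round=5,
    )

    proba_demo = _sketch_xgbooster_predict_proba(booster_demo,
        xgb.DMatrix(X_demo))
    return proba_demo   # shape: (20, 2); each row sums to 1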
def inverse_transform(self, y):
    """Transform labels back to the original encoding.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Encoded target values. That is, these should be integers in the
        range [0, n_classes).

    Returns
    -------
    y : numpy array of shape [n_samples]
    """
    check_is_fitted(self, 'classes_')

    # mark the NaNs and map them to the missing-value class (the last one)
    m_nan = pd.isnull(y)
    y[m_nan] = len(self.classes_) - 1

    diff = np.setdiff1d(y[~m_nan], np.arange(len(self.classes_), dtype=object))
    if len(diff) > 0:
        raise ValueError("y contains new labels: {}".format(str(diff)))

    y = np.asarray(y, dtype=int)
    return self.classes_[y]
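# A self-contained sketch of the decoding logic above: unseen integer codes
# are caught with np.setdiff1d before indexing into `classes_`. All names
# below are hypothetical demo values, not part of the class.
import numpy as np

def _demo_inverse_transform():
    classes_demo = np.array(['a', 'b', 'c'], dtype=object)
    y_demo = np.array([0, 2, 1])

    diff = np.setdiff1d(y_demo, np.arange(len(classes_demo)))
    if len(diff) > 0:
        raise ValueError("y contains new labels: {}".format(str(diff)))

    return classes_demo[y_demo]   # array(['a', 'c', 'b'], dtype=object)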
def kneighbors_graph(self, n_neighbors=None, as_nx=True):
    """ Build the k-nearest neighbors graph for the training data

    Please see the `sklearn` documentation for more details of the
    semantics of this method:
    http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html

    Parameters
    ----------
    n_neighbors: int
        The number of neighbors. Default: the value passed to the
        constructor

    as_nx: bool
        Whether to return the graph as a networkx Graph data structure
        (`True`) or a `scipy.sparse` matrix (`False`)

    Returns
    -------
    kneighbors_graph: graph
        The k-nearest neighbors graph. Please see the documentation
        referenced above for more details.
    """
    check_is_fitted(self, ["knn_"])

    if n_neighbors is None:
        n_neighbors = self.n_neighbors

    kneighbors_graph = self.knn_.kneighbors_graph(n_neighbors=n_neighbors)

    if as_nx:
        kneighbors_graph = nx.from_scipy_sparse_matrix(kneighbors_graph)

    return kneighbors_graph
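# A runnable sketch of the same graph construction using sklearn and networkx
# directly; all names are demo values. Note that `from_scipy_sparse_matrix`
# (used above) exists in networkx < 3.0; newer releases renamed it to
# `from_scipy_sparse_array`.
import networkx as nx
import numpy as np
from sklearn.neighbors import NearestNeighbors

def _demo_kneighbors_graph(n_neighbors=3):
    X_demo = np.random.rand(10, 2)
    knn_demo = NearestNeighbors().fit(X_demo)

    g_sparse = knn_demo.kneighbors_graph(n_neighbors=n_neighbors)
    return nx.from_scipy_sparse_matrix(g_sparse)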
def transform(self, y):
    """Transform labels to normalized encoding.

    Parameters
    ----------
    y : array-like of shape [n_samples]
        Target values.

    Returns
    -------
    y : array-like of shape [n_samples]
    """
    check_is_fitted(self, 'classes_')
    y = sklearn.utils.column_or_1d(y, warn=True)
    y = np.array(y, dtype=object)

    # mark the NaNs
    m_nan = pd.isnull(y)

    # check if we want to treat unknown labels as NaNs
    if self.treat_unknown_as_missing:
        m_unknown = np.array([y_i not in self.classes_ for y_i in y])
        m_nan |= m_unknown

    # use our marker for NaNs
    y[m_nan] = self.missing_value_marker

    # then make sure we know all of the labels
    observed_labels = np.unique(y)

    if len(self.classes_) < 20:
        msg = ("[nan_label_encoder.transform] observed labels: {}. "
            "self.classes_: {}".format(observed_labels, self.classes_))
        logger.debug(msg)
    else:
        msg = ("[nan_label_encoder.transform] too many classes to print "
            "observed labels")
        logger.debug(msg)

    unknown_labels = [label for label in observed_labels
        if label not in self.classes_]

    if len(unknown_labels) > 0:
        msg = ("[nan_label_encoder.transform] y contains new labels: {}".
            format(str(unknown_labels)))
        raise ValueError(msg)

    # encode everything; however, put the NaNs back in afterward
    ret = np.array([self.classes_[y_i] for y_i in y]).astype(float)
    ret[m_nan] = np.nan
    return ret
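# A self-contained sketch of the NaN-handling pattern above: mask the missing
# values with pd.isnull, encode the rest, then write np.nan back into the
# masked positions. The dict stands in for `self.classes_`, which this class
# uses as a label-to-code lookup; all names are demo values.
import numpy as np
import pandas as pd

def _demo_nan_transform():
    classes_demo = {'a': 0, 'b': 1, 'c': 2}
    y_demo = np.array(['a', np.nan, 'c'], dtype=object)

    m_nan = pd.isnull(y_demo)

    # encode the observed labels; masked positions get a placeholder for now
    ret = np.array([classes_demo[y_i] if not m else 0
        for y_i, m in zip(y_demo, m_nan)]).astype(float)

    # and put the NaNs back in
    ret[m_nan] = np.nan
    return ret   # array([ 0., nan,  2.])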
def transform(self, X, *_):
    to_check = [
        'col_mean_',
        'col_std_',
        'columns_',
        'col_ignore_',
    ]
    validation_utils.check_is_fitted(self, to_check)

    # if we did not see a column during training, or if it had only one
    # value, we cannot do anything meaningful with it, so ignore it

    # do not overwrite the original data
    X = X.copy()

    # now, actually grab the columns, depending on the type of X
    if isinstance(X, pd.DataFrame):
        X_cols = X[self.columns_].copy()
        X_cols.iloc[:, self.col_ignore_] = 0
    elif isinstance(X, np.ndarray):
        # check if we have a single vector
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        X_cols = X[:, self.columns_]
        X_cols[:, self.col_ignore_] = 0
    else:
        msg = ("[NanStandardScaler.transform]: unrecognized data type: {}".
            format(type(X)))
        raise ValueError(msg)

    X_transform = (X_cols - self.col_mean_) / self.col_std_

    # and stick the transformed columns back
    if isinstance(X, pd.DataFrame):
        X[self.columns_] = X_transform
    else:
        X[:, self.columns_] = X_transform

    return X
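# A self-contained sketch of the scaling applied above. The fit method is not
# shown here, so the use of np.nanmean / np.nanstd is an assumption based on
# the class name; constant columns are zeroed out rather than scaled,
# mirroring `col_ignore_`. All names are demo values.
import numpy as np

def _demo_nan_standard_scale(X):
    col_mean = np.nanmean(X, axis=0)
    col_std = np.nanstd(X, axis=0)

    # columns with zero variance cannot be scaled; zero them out instead
    col_ignore = (col_std == 0)
    col_mean[col_ignore] = 0
    col_std[col_ignore] = 1

    X = X.copy()
    X[:, col_ignore] = 0
    return (X - col_mean) / col_std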
def transform(self, *_, **__):
    """ Transform the data provided to the constructor """
    validation_utils.check_is_fitted(self, ["num_pipeline_", "bow_pipelines_"])

    Xt_num = self.num_pipeline_.transform(self.num_data)
    Xt_num = scipy.sparse.csr_matrix(Xt_num)

    it = zip(self.bow_pipelines_, self.bow_tokens)
    Xt = [p.transform(nt) for p, nt in it]
    Xt.append(Xt_num)

    Xt = scipy.sparse.hstack(Xt)

    # we generally want to index the result, so convert to csr
    Xt = Xt.tocsr()

    return Xt
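# A self-contained sketch of the stacking step above: heterogeneous feature
# blocks are combined column-wise with scipy.sparse.hstack, and the result is
# converted to CSR because hstack returns COO, which does not support row
# indexing. All names are demo values.
import numpy as np
import scipy.sparse

def _demo_sparse_hstack():
    bow_block = scipy.sparse.random(4, 6, density=0.3, format='csr')
    num_block = scipy.sparse.csr_matrix(np.random.rand(4, 2))

    Xt = scipy.sparse.hstack([bow_block, num_block])
    return Xt.tocsr()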
def transform(self, X, *_):
    check_is_fitted(self, "enc_")
    X = X.copy()

    # make sure we have a matrix rather than a vector
    if len(X.shape) == 1:
        X = X.reshape(-1, 1)

    # first, replace the missing categorical values with 0
    masks = {}
    for f in self.categorical_features:
        m = pd.isnull(X[:, f])
        masks[f] = m

        # this overwrites data in our copy of X, not the passed array
        X[m, f] = 0

    # now, encode the categorical values (ignoring whatever is in the
    # other fields)
    Xt = _encode_selected(
        X,
        self.enc_,
        selected=self.categorical_features,
        copy=True
    )

    if self.sparse:
        Xt = Xt.tocsr()

    # and clear out the missing values
    for i, f in enumerate(self.categorical_features):
        m = masks[f]
        indices = self.enc_.feature_indices_[i:i+1]
        Xt[m, indices] = 0

    return Xt
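# A self-contained sketch of the missing-value pattern above, adapted to the
# modern sklearn OneHotEncoder (the `feature_indices_` attribute used above
# belongs to the pre-0.22 encoder). Missing entries are filled with a
# placeholder before encoding, and their one-hot rows are zeroed afterward.
# All names are demo values.
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def _demo_one_hot_with_missing():
    X_demo = np.array([['a'], ['b'], [np.nan]], dtype=object)

    m = pd.isnull(X_demo[:, 0])
    X_demo[m, 0] = 'a'   # any category seen during fitting works here

    enc = OneHotEncoder().fit(X_demo)
    Xt = enc.transform(X_demo).toarray()

    # clear out the rows that were actually missing
    Xt[m, :] = 0
    return Xt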
def inverse_transform(self, X, *_, **__):
    """ Transform labels back to the original encoding """
    check_is_fitted(self, "le_")

    # make a copy to keep around everything we do not encode
    X = X.copy()

    for c in self.columns:
        le = self.le_[c]

        # make sure we actually grab a column
        if self.is_np_array_:
            # so np.array
            y = X[:, c]
            y = le.inverse_transform(y)
            X[:, c] = y
        else:
            # then pd.DataFrame
            y = X[c]
            y = le.inverse_transform(y)
            X[c] = y

    return X
def transform(self, X, *_, **__):
    """ Encode the respective columns of X """
    check_is_fitted(self, "le_")

    # make a copy to keep around everything we do not encode
    X = X.copy()

    for c in self.columns:
        le = self.le_[c]

        # make sure we actually grab a column
        if self.is_np_array_:
            # so np.array
            y = X[:, c]
            y = le.transform(y)
            X[:, c] = y
        else:
            # then pd.DataFrame
            y = X[c]
            y = le.transform(y)
            X[c] = y

    return X
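# A self-contained sketch of the per-column round trip implemented by
# `transform` and `inverse_transform` above, using one sklearn LabelEncoder
# per column of a DataFrame. All names are demo values.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def _demo_column_label_encoding():
    df = pd.DataFrame({
        'color': ['red', 'blue', 'red'],
        'size': ['s', 'm', 's'],
    })
    encoders = {c: LabelEncoder().fit(df[c]) for c in df.columns}

    encoded = df.copy()
    for c in df.columns:
        encoded[c] = encoders[c].transform(df[c])

    decoded = encoded.copy()
    for c in df.columns:
        decoded[c] = encoders[c].inverse_transform(encoded[c])

    return encoded, decoded   # decoded equals the original df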
def kneighbors(self, X, n_neighbors=None, return_distance=False, as_np=False):
    """ Find the k nearest neighbors of each instance in `X`

    If specified in the constructor, then the data will be scaled (using
    the respective parameters learned from the training data).

    N.B. This method is not implemented especially efficiently.

    Parameters
    ----------
    X: data matrix
        Missing values are represented using np.nan

    n_neighbors: int
        The number of neighbors. Default: the value passed to the
        constructor

    return_distance: bool
        Whether to return the distances to the neighbors (`True`) or not
        (`False`)

    as_np: bool
        Whether to return the neighbors as an `np.array` (`True`) or a
        list of lists (`False`)

    Returns
    -------
    distance: np.array
        The distances to the neighbors. This is only present if
        `return_distance` is `True`.

    neighbors: np.array or list of lists
        The indices of the nearest neighbors of each entity in `X` from
        the original training set
    """
    check_is_fitted(self, ["knn_"])

    # check if we need to scale the data
    if self.scale:
        X = self.scaler_.transform(X)

    # ensure X is the correct shape
    X = np.atleast_2d(X)

    # first, find the distance from each query point to each indexed
    # point. we need n_query rows and n_indexed columns. this is a bit
    # confusing, so use clearer variable names
    queries = X
    indexed = self.X

    distance_matrix = scipy.spatial.distance.cdist(queries, indexed,
        metric=self.knn_metric_)

    # convert the infs to very large numbers
    distance_matrix = np.nan_to_num(distance_matrix)

    if n_neighbors is None:
        n_neighbors = self.n_neighbors

    ret = self.knn_.kneighbors(X=distance_matrix, n_neighbors=n_neighbors,
        return_distance=return_distance)

    # find the indices of the neighbors
    ret_indices = ret
    if return_distance:
        ret_distances = ret[0]
        ret_indices = ret[1]

    # check if we want the list of lists
    if not as_np:
        ret_indices = [
            list(ret_indices[i]) for i in range(ret_indices.shape[0])
        ]

    ret = ret_indices
    if return_distance:
        ret = (ret_distances, ret_indices)

    return ret
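# A self-contained sketch of the distance-matrix pattern above: compute all
# query-to-indexed distances with scipy's cdist, then hand that matrix to a
# NearestNeighbors instance fitted with metric='precomputed', which is what
# passing `X=distance_matrix` to `kneighbors` above implies about `knn_`.
# All names are demo values.
import numpy as np
import scipy.spatial.distance
from sklearn.neighbors import NearestNeighbors

def _demo_precomputed_kneighbors(n_neighbors=2):
    indexed = np.random.rand(20, 3)
    queries = np.random.rand(5, 3)

    # fit on the square indexed-to-indexed distance matrix
    knn = NearestNeighbors(metric='precomputed')
    knn.fit(scipy.spatial.distance.cdist(indexed, indexed))

    # query with the rectangular queries-to-indexed distance matrix
    distance_matrix = scipy.spatial.distance.cdist(queries, indexed)
    distances, indices = knn.kneighbors(X=distance_matrix,
        n_neighbors=n_neighbors, return_distance=True)

    return distances, indices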