def _validate_data(self, X, y=None, reset=True, validate_separately=False, **check_params): """Validate input data and set or check the `n_features_in_` attribute. Parameters ---------- X : {array-like, sparse matrix, dataframe} of shape \ (n_samples, n_features) The input samples. y : array-like of shape (n_samples,), default=None The targets. If None, `check_array` is called on `X` and `check_X_y` is called otherwise. reset : bool, default=True Whether to reset the `n_features_in_` attribute. If False, the input will be checked for consistency with data provided when reset was last True. validate_separately : False or tuple of dicts, default=False Only used if y is not None. If False, call validate_X_y(). Else, it must be a tuple of kwargs to be used for calling check_array() on X and y respectively. **check_params : kwargs Parameters passed to :func:`sklearn.utils.check_array` or :func:`sklearn.utils.check_X_y`. Ignored if validate_separately is not False. Returns ------- out : {ndarray, sparse matrix} or tuple of these The validated input. A tuple is returned if `y` is not None. """ if y is None: if self._get_tags()['requires_y']: raise ValueError( f"This {self.__class__.__name__} estimator " f"requires y to be passed, but the target y is None." ) X = check_array(X, **check_params) out = X else: if validate_separately: # We need this because some estimators validate X and y # separately, and in general, separately calling check_array() # on X and y isn't equivalent to just calling check_X_y() # :( check_X_params, check_y_params = validate_separately X = check_array(X, **check_X_params) y = check_array(y, **check_y_params) else: X, y = check_X_y(X, y, **check_params) out = X, y if check_params.get('ensure_2d', True): self._check_n_features(X, reset=reset) return out
def get_submatrix(self, i, data): """Return the submatrix corresponding to bicluster `i`. Parameters ---------- i : int The index of the cluster. data : array-like The data. Returns ------- submatrix : ndarray The submatrix corresponding to bicluster i. Notes ----- Works with sparse matrices. Only works if ``rows_`` and ``columns_`` attributes exist. """ data = check_array(data, accept_sparse='csr') row_ind, col_ind = self.get_indices(i) return data[row_ind[:, np.newaxis], col_ind]
def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"): """Check that y_true and y_pred belong to the same regression task Parameters ---------- y_true : array-like y_pred : array-like multioutput : array-like or string in ['raw_values', uniform_average', 'variance_weighted'] or None None is accepted due to backward compatibility of r2_score(). Returns ------- type_true : one of {'continuous', continuous-multioutput'} The type of the true target data, as output by 'utils.multiclass.type_of_target' y_true : array-like of shape (n_samples, n_outputs) Ground truth (correct) target values. y_pred : array-like of shape (n_samples, n_outputs) Estimated target values. multioutput : array-like of shape (n_outputs) or string in ['raw_values', uniform_average', 'variance_weighted'] or None Custom output weights if ``multioutput`` is array-like or just the corresponding argument if ``multioutput`` is a correct keyword. dtype: str or list, default="numeric" the dtype argument passed to check_array """ check_consistent_length(y_true, y_pred) y_true = check_array(y_true, ensure_2d=False, dtype=dtype) y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype) # irrelevant in af, dim[1] always valid #if y_true.numdims() == 1: #y_true = y_true.reshape((-1, 1)) #if y_pred.numdims() == 1: #y_pred = y_pred.reshape((-1, 1)) print(type(y_true)) print(type(y_pred)) if y_true.numdims() != 1 and y_pred.numdims() !=1: if y_true.shape[1] != y_pred.shape[1]: raise ValueError("y_true and y_pred have different number of output " "({0}!={1})".format(y_true.shape[1], y_pred.shape[1])) n_outputs = 1 if y_true.numdims() == 1 else y_true.shape[1] allowed_multioutput_str = ('raw_values', 'uniform_average', 'variance_weighted') if isinstance(multioutput, str): if multioutput not in allowed_multioutput_str: raise ValueError("Allowed 'multioutput' string values are {}. " "You provided multioutput={!r}".format( allowed_multioutput_str, multioutput)) elif multioutput is not None: multioutput = check_array(multioutput, ensure_2d=False) if n_outputs == 1: raise ValueError("Custom weights are useful only in " "multi-output cases.") elif n_outputs != len(multioutput): raise ValueError(("There must be equally many custom weights " "(%d) as outputs (%d).") % (len(multioutput), n_outputs)) y_type = 'continuous' if n_outputs == 1 else 'continuous-multioutput' return y_type, y_true, y_pred, multioutput
def unique_labels(*ys): """Extract an ordered array of unique labels We don't allow: - mix of multilabel and multiclass (single label) targets - mix of label indicator matrix and anything else, because there are no explicit labels) - mix of label indicator matrices of different sizes - mix of string and integer labels At the moment, we also don't allow "multiclass-multioutput" input type. Parameters ---------- *ys : array-likes Returns ------- out : numpy array of shape [n_unique_labels] An ordered array of unique labels. Examples -------- >>> from sklearn.utils.multiclass import unique_labels >>> unique_labels([3, 5, 5, 5, 7, 7]) array([3, 5, 7]) >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4]) array([1, 2, 3, 4]) >>> unique_labels([1, 2, 10], [5, 11]) array([ 1, 2, 5, 10, 11]) """ if not ys: raise ValueError('No argument has been passed.') # Check that we don't mix label format ys_types = set(type_of_target(x) for x in ys) if ys_types == {"binary", "multiclass"}: ys_types = {"multiclass"} if len(ys_types) > 1: raise ValueError("Mix type of y not allowed, got types %s" % ys_types) label_type = ys_types.pop() # Check consistency for the indicator format if (label_type == "multilabel-indicator" and len( set( check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1] for y in ys)) > 1): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") # Get the unique set of labels _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None) if not _unique_labels: raise ValueError("Unknown label type: %s" % repr(ys)) #ys_labels = set(chain.from_iterable(_unique_labels(y.tolist()) for y in ys)) ys_labels = set( chain.from_iterable(_unique_labels(y.to_list()) for y in ys)) # Check that we don't mix string type with number type if (len(set(isinstance(label, str) for label in ys_labels)) > 1): raise ValueError("Mix of label input types (string and number)") return np.array(sorted(ys_labels))
def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False): """Binarize labels in a one-vs-all fashion Several regression and binary classification algorithms are available in scikit-learn. A simple way to extend these algorithms to the multi-class classification case is to use the so-called one-vs-all scheme. This function makes it possible to compute this transformation for a fixed set of class labels known ahead of time. Parameters ---------- y : array-like Sequence of integer labels or multilabel data to encode. classes : array-like of shape [n_classes] Uniquely holds the label for each class. neg_label : int (default: 0) Value with which negative labels must be encoded. pos_label : int (default: 1) Value with which positive labels must be encoded. sparse_output : boolean (default: False), Set to true if output binary array is desired in CSR sparse format Returns ------- Y : numpy array or CSR matrix of shape [n_samples, n_classes] Shape will be [n_samples, 1] for binary problems. Examples -------- >>> from sklearn.preprocessing import label_binarize >>> label_binarize([1, 6], classes=[1, 2, 4, 6]) array([[1, 0, 0, 0], [0, 0, 0, 1]]) The class ordering is preserved: >>> label_binarize([1, 6], classes=[1, 6, 4, 2]) array([[1, 0, 0, 0], [0, 1, 0, 0]]) Binary targets transform to a column vector >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes']) array([[1], [0], [0], [1]]) See also -------- LabelBinarizer : class used to wrap the functionality of label_binarize and allow for fitting to classes independently of the transform operation """ if not isinstance(y, list): # XXX Workaround that will be removed when list of list format is # dropped y = check_array(y, accept_sparse='csr', ensure_2d=False, dtype=None) else: if _num_samples(y) == 0: raise ValueError('y has 0 samples: %r' % y) if neg_label >= pos_label: raise ValueError("neg_label={0} must be strictly less than " "pos_label={1}.".format(neg_label, pos_label)) if (sparse_output and (pos_label == 0 or neg_label != 0)): raise ValueError("Sparse binarization is only supported with non " "zero pos_label and zero neg_label, got " "pos_label={0} and neg_label={1}" "".format(pos_label, neg_label)) # To account for pos_label == 0 in the dense case pos_switch = pos_label == 0 if pos_switch: pos_label = -neg_label y_type = type_of_target(y) if 'multioutput' in y_type: raise ValueError("Multioutput target data is not supported with label " "binarization") if y_type == 'unknown': raise ValueError("The type of target data is not known") n_samples = y.shape[0] if sp.issparse(y) else len(y) n_classes = len(classes) classes = np.asarray(classes) if y_type == "binary": if n_classes == 1: if sparse_output: return sp.csr_matrix((n_samples, 1), dtype=int) else: Y = np.zeros((len(y), 1), dtype=np.int) Y += neg_label return Y elif len(classes) >= 3: y_type = "multiclass" sorted_class = np.sort(classes) if y_type == "multilabel-indicator": y_n_classes = y.shape[1] if hasattr(y, 'shape') else len(y[0]) if classes.size != y_n_classes: raise ValueError("classes {0} mismatch with the labels {1}" " found in the data".format( classes, unique_labels(y))) if y_type in ("binary", "multiclass"): y = column_or_1d(y) # pick out the known labels from y y_in_classes = af_in1d(y, classes) y_in_classes = af.interop.from_ndarray(y_in_classes, copy=True) y[y_in_classes] y_seen = y[y_in_classes] y_seen = y_seen.to_ndarray() indices = np.searchsorted(sorted_class, y_seen) indptr = np.hstack((0, np.cumsum(y_in_classes))) data = np.empty_like(indices) data.fill(pos_label) Y = data #Y = sp.csr_matrix((data, indices, indptr), #shape=(n_samples, n_classes)) elif y_type == "multilabel-indicator": Y = sp.csr_matrix(y) if pos_label != 1: data = np.empty_like(Y.data) data.fill(pos_label) Y.data = data else: raise ValueError("%s target data is not supported with label " "binarization" % y_type) if not sparse_output: #Y = Y.toarray() #TODO: test if ndarray, then cast if not Y = Y.astype(int, copy=False) if neg_label != 0: Y[Y == 0] = neg_label if pos_switch: Y[Y == pos_label] = 0 else: Y.data = Y.data.astype(int, copy=False) # preserve label ordering if np.any(classes != sorted_class): indices = np.searchsorted(sorted_class, classes) Y = Y[:, indices] if y_type == "binary": if sparse_output: Y = Y.getcol(-1) else: Y = Y[:, -1].reshape((-1, 1)) #return Y return Y
def _unique_indicator(y): return np.arange( check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1])