def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
    """Compute the log loss (cross-entropy) of predicted class probabilities.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. They are encoded with ``LabelBinarizer``; a
        single-column encoding is expanded to two columns ``[1 - T, T]``.
    y_pred : array-like of float
        Predicted probabilities. A 1-D input or a single column is expanded
        to two columns ``[1 - Y, Y]``.
    eps : float, default=1e-15
        Predictions are clipped to ``[eps, 1 - eps]`` before the logarithm
        is taken.
    normalize : bool, default=True
        Forwarded unchanged to ``_weighted_sum``.
    sample_weight : array-like or None, default=None
        Forwarded unchanged to ``_weighted_sum``.

    Returns
    -------
    loss
        The result of ``_weighted_sum`` applied to the per-sample losses.

    Raises
    ------
    ValueError
        If clipping ``y_pred`` does not yield a ``numpy.ndarray``, or if
        ``y_true`` and ``y_pred`` disagree on the number of classes.
    """
    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)
    if T.shape[1] == 1:
        T = np.append(1 - T, T, axis=1)

    # Clipping
    Y = np.clip(y_pred, eps, 1 - eps)

    # This happens in cases when elements in y_pred have type "str".
    if not isinstance(Y, np.ndarray):
        raise ValueError("y_pred should be an array of floats.")

    # If y_pred is of single dimension, assume y_true to be binary
    # and then check.
    if Y.ndim == 1:
        Y = Y[:, np.newaxis]
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    # Check if dimensions are consistent.
    val.check_consistent_length(T, Y)
    T = val.check_array(T)
    Y = val.check_array(Y)
    if T.shape[1] != Y.shape[1]:
        raise ValueError("y_true and y_pred have different number of classes "
                         "%d, %d" % (T.shape[1], Y.shape[1]))

    # Renormalize so that every row of probabilities sums to one.
    Y /= Y.sum(axis=1)[:, np.newaxis]
    loss = -(T * np.log(Y)).sum(axis=1)

    return _weighted_sum(loss, sample_weight, normalize)
def pinball_loss(y_true, y_pred, probs):
    """Compute the pinball loss.

    Parameters
    ----------
    y_true : {array-like}, shape = [n_samples]
        Targets.

    y_pred : {array-like}, shape = [n_quantiles, n_samples] or [n_samples]
        Predictions.

    probs : {array-like}, shape = [n_quantiles], or scalar
        Quantile levels, paired in order with the rows of ``y_pred``.

    Returns
    -------
    l : {array}, shape = [n_quantiles]
        Average loss for each quantile level.
    """
    probs = asarray(probs).reshape(-1)
    # Convert up front so that plain sequences (which lack ``.T`` and
    # ``.reshape``) are accepted, as the "array-like" contract promises.
    y_true = asarray(y_true)
    y_pred = asarray(y_pred)

    check_consistent_length(y_true, y_pred.T)
    y_true = check_array(y_true.reshape((-1, 1)), ensure_2d=True)
    y_pred = check_array(y_pred.T.reshape((y_true.shape[0], -1)),
                         ensure_2d=True)

    # residual has one column per quantile level.
    residual = y_true - y_pred
    loss = npsum([fmax(prob * res, (prob - 1) * res)
                  for (res, prob) in zip(residual.T, probs)], axis=1)
    return loss / y_true.size
def _check_rows_and_columns(a, b):
    """Validate two (rows, columns) pairs and return their four arrays.

    Each of ``a`` and ``b`` is first checked for consistent length across
    its members, then every member is passed through ``check_array`` with
    ``ensure_2d=False``.
    """
    for pair in (a, b):
        check_consistent_length(*pair)

    def _as_array(member):
        return check_array(member, ensure_2d=False)

    a_rows, a_cols = (_as_array(member) for member in a)
    b_rows, b_cols = (_as_array(member) for member in b)
    return a_rows, a_cols, b_rows, b_cols
def test_check_dataframe_fit_attribute():
    # check pandas dataframe with 'fit' column does not raise error
    # https://github.com/scikit-learn/scikit-learn/issues/8415
    #
    # Only the import is guarded: an ImportError raised by the code under
    # test must fail the test instead of being reported as a skip.
    try:
        import pandas as pd
    except ImportError:
        raise SkipTest("Pandas not found")

    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    X_df = pd.DataFrame(X, columns=['a', 'b', 'fit'])
    check_consistent_length(X_df)
def fit(self, X, y, sample_weight=None):
    """
    Train the classifier on (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The training input samples.

    y : array-like, shape = [n_samples]
        The target values (class labels in classification).

    sample_weight : array-like, shape = [n_samples] or None
        Individual weights for each sample.

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    ValueError
        If fewer than two distinct classes are present in ``y``.
    """
    self._validate_params(**self.get_params())

    X, y = check_X_y(X, y, accept_sparse=True)
    self._is_sparse_train_X = bool(sp.isspmatrix(X))
    self._n_samples, self._n_features = X.shape

    sample_weight = self._get_sample_weight(sample_weight)
    check_consistent_length(X, y, sample_weight)
    check_classification_targets(y)

    self._classes = sorted(np.unique(y))
    self._n_classes = len(self._classes)
    self._classes_map = {}

    self._set_params_with_dependencies()
    params = self._get_params()

    if self._n_classes < 2:
        raise ValueError("Classifier can't predict when only one class is present.")

    if self._n_classes == 2:
        first_label, second_label = self._classes
        self._classes_map[0] = first_label
        self._classes_map[1] = second_label
        self._estimators = [None]
        # The binary target is 1 where y equals the first sorted class.
        y = (y == first_label).astype(int)
        self._fit_binary_task(X, y, sample_weight, params)
    else:
        if sp.isspmatrix_dok(X):
            X = X.tocsr().tocoo()  # Fix to avoid scipy 7699 issue
        self._estimators = [None] * self._n_classes
        self._fit_multiclass_task(X, y, sample_weight, params)

    self._fitted = True
    return self
def _indexable(X, y):
    """Make arrays indexable for cross-validation.

    ``X`` and ``y`` are passed through ``_validate_X`` and ``_validate_y``
    respectively, and the two results are then checked for consistent
    length.

    Parameters
    ----------
    X : array-like or pandas DataFrame, shape = [n_samples, n_features]
        Input data, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    Returns
    -------
    result : list
        Two-element list ``[validated X, validated y]``.
    """
    X_checked = _validate_X(X)
    y_checked = _validate_y(y)
    validated = [X_checked, y_checked]
    check_consistent_length(*validated)
    return validated
def fit(self, X, y, sample_weight=None):
    """
    Train the regressor on (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The training input samples.

    y : array-like, shape = [n_samples]
        The target values (real numbers in regression).

    sample_weight : array-like, shape = [n_samples] or None
        Individual weights for each sample.

    Returns
    -------
    self : object
        Returns self.
    """
    self._validate_params(**self.get_params())

    X, y = check_X_y(X, y, accept_sparse=True)
    self._is_sparse_train_X = bool(sp.isspmatrix(X))
    self._n_samples, self._n_features = X.shape

    sample_weight = self._get_sample_weight(sample_weight)
    check_consistent_length(X, y, sample_weight)

    self._set_params_with_dependencies()
    fit_params = self._get_params()

    # Regression uses a single estimator slot.
    self._estimators = [None]
    self._fit_regression_task(X, y, sample_weight, fit_params)

    self._fitted = True
    return self
def _my_lrap(y_true, y_score):
    """Simple implementation of label ranking average precision"""
    check_consistent_length(y_true, y_score)
    y_true = check_array(y_true)
    y_score = check_array(y_score)
    n_samples, n_labels = y_true.shape
    per_sample = np.empty((n_samples, ))

    for row in range(n_samples):
        # Rank 1 is the best; larger ranks are worse. np.unique sorts in
        # ascending order, so the inverse index is flipped.
        distinct_scores, inverse = np.unique(y_score[row],
                                             return_inverse=True)
        n_distinct = distinct_scores.size
        rank = n_distinct - inverse

        # Correct the ranks for ties: labels tied at the top rank all
        # receive the rank of the last of them (two tied for first -> 2).
        tie_adjusted = np.bincount(rank, minlength=n_distinct + 1).cumsum()
        rank = tie_adjusted[rank]

        relevant = y_true[row].nonzero()[0]
        if relevant.size in (0, n_labels):
            per_sample[row] = 1
            continue

        accumulated = 0.
        for label in relevant:
            # Number of relevant labels ranked at least as well as `label`.
            n_ranked_above = np.count_nonzero(rank[relevant] <= rank[label])
            # Weight by the rank of the actual label
            accumulated += n_ranked_above / rank[label]
        per_sample[row] = accumulated / relevant.size

    return per_sample.mean()
def test_check_consistent_length():
    # Inputs whose first dimensions agree must be accepted silently.
    check_consistent_length([1], [2], [3], [4], [5])
    check_consistent_length([[1, 2], [[1, 2]]], [1, 2], ['a', 'b'])
    check_consistent_length([1], (2,), np.array([3]), sp.csr_matrix((1, 2)))

    assert_raises_regexp(ValueError, 'inconsistent numbers of samples',
                         check_consistent_length, [1, 2], [1])
    # Raw strings: "\w" inside a normal string literal is an invalid escape
    # sequence. The pattern text itself is unchanged.
    assert_raises_regexp(TypeError, r"got <\w+ 'int'>",
                         check_consistent_length, [1, 2], 1)
    assert_raises_regexp(TypeError, r"got <\w+ 'object'>",
                         check_consistent_length, [1, 2], object())

    # A 0-d array has no length.
    assert_raises(TypeError, check_consistent_length, [1, 2], np.array(1))
    # Despite ensembles having __len__ they must raise TypeError
    assert_raises_regexp(TypeError, 'estimator', check_consistent_length,
                         [1, 2], RandomForestRegressor())
def test_check_consistent_length():
    # Inputs whose first dimensions agree must be accepted silently.
    accepted = (
        ([1], [2], [3], [4], [5]),
        ([[1, 2], [[1, 2]]], [1, 2], ['a', 'b']),
        ([1], (2, ), np.array([3]), sp.csr_matrix((1, 2))),
    )
    for arrays in accepted:
        check_consistent_length(*arrays)

    # (expected exception, message pattern or None, second argument)
    rejected = (
        (ValueError, 'inconsistent numbers of samples', [1]),
        (TypeError, r"got <\w+ 'int'>", 1),
        (TypeError, r"got <\w+ 'object'>", object()),
        (TypeError, None, np.array(1)),
        # Despite ensembles having __len__ they must raise TypeError
        (TypeError, 'Expected sequence or array-like',
         RandomForestRegressor()),
    )
    for exc_type, pattern, bad_input in rejected:
        if pattern is None:
            assert_raises(exc_type, check_consistent_length,
                          [1, 2], bad_input)
        else:
            assert_raises_regex(exc_type, pattern, check_consistent_length,
                                [1, 2], bad_input)
def check_consistent_length(u, i, r):
    """Validate that ``u``, ``i`` and ``r`` have matching lengths.

    Delegates the check to ``skval.check_consistent_length`` and returns
    the three inputs as numpy arrays; ``r`` is converted with
    ``dtype=DTYPE``.
    """
    skval.check_consistent_length(u, i, r)
    u_arr = np.asarray(u)
    i_arr = np.asarray(i)
    r_arr = np.asarray(r, dtype=DTYPE)
    return u_arr, i_arr, r_arr
def wpearsonr(x, y, w=None):
    """Utility function to calculate the weighted Pearson correlation of two
    samples.

    See https://stats.stackexchange.com/questions/221246/such-thing-as-a-weighted-correlation
    for more information

    Parameters
    ----------
    x : array, shape (n,)
        Input x.

    y : array, shape (n,)
        Input y.

    w : array, shape (n,)
        Weights w.

    Returns
    -------
    scores : float in range of [-1,1]
        Weighted Pearson Correlation between x and y. When ``w`` is None,
        the result of ``pearsonr(x, y)`` is returned unchanged instead, so
        the return type differs on that path.

    Raises
    ------
    ValueError
        If ``x``, ``y`` and ``w`` do not have the same number of samples.
    """
    # unweighted version
    # note the return is different
    # TODO: fix output differences
    if w is None:
        return pearsonr(x, y)

    x = np.asarray(x)
    y = np.asarray(y)
    w = np.asarray(w)

    # The arrays must be passed as separate arguments. Wrapped in a single
    # list, the helper sees one object and never compares the lengths.
    check_consistent_length(x, y, w)

    w_sum = w.sum()
    mx = np.sum(x * w) / w_sum
    my = np.sum(y * w) / w_sum

    xm, ym = (x - mx), (y - my)

    r_num = np.sum(xm * ym * w) / w_sum

    xm2 = np.sum(xm * xm * w) / w_sum
    ym2 = np.sum(ym * ym * w) / w_sum

    r_den = np.sqrt(xm2 * ym2)
    r = r_num / r_den

    # Clamp away floating point drift outside [-1, 1].
    r = max(min(r, 1.0), -1.0)

    # TODO: p value calculation is disabled due to a python 2.7 break
    return r
def _daal_fit(self, X, y):
    """Train a decision forest classifier through daal4py.

    Validates ``X`` and ``y``, derives class information via
    ``self._validate_y_class_weight``, configures
    ``daal4py.decision_forest_classification_training`` from the
    estimator's hyper-parameters and stores the trained model on
    ``self.daal_model_``.

    Parameters
    ----------
    X : array-like
        Training data; converted by ``check_array`` to single or double
        precision.
    y : array-like
        Target values. Only a single output column is supported.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``y`` has more than one output column, or if fewer than two
        classes are present.
    """
    self._check_daal_supported_parameters()
    _supported_dtypes_ = [np.single, np.double]
    X = check_array(X, dtype=_supported_dtypes_)
    y = np.asarray(y)
    y = np.atleast_1d(y)

    # A column vector is accepted, but the caller is warned about it.
    if y.ndim == 2 and y.shape[1] == 1:
        warnings.warn("A column-vector y was passed when a 1d array was"
                      " expected. Please change the shape of y to "
                      "(n_samples,), for example using ravel().",
                      DataConversionWarning, stacklevel=2)

    check_consistent_length(X, y)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]
    if self.n_outputs_ != 1:
        _class_name = self.__class__.__name__
        raise ValueError(_class_name + " does not currently support multi-output data. Consider using OneHotEncoder")

    y = check_array(y, ensure_2d=False, dtype=None)
    # NOTE(review): n_classes_ / classes_ are presumably populated as
    # per-output lists by _validate_y_class_weight; the single-output
    # entry is unwrapped here -- confirm against that helper.
    y, _ = self._validate_y_class_weight(y)
    self.n_classes_ = self.n_classes_[0]
    self.classes_ = self.classes_[0]

    self.n_features_ = X.shape[1]

    # Draw the engine seed from the estimator's random state.
    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if self.n_classes_ < 2:
        raise ValueError("Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)
    daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    _featuresPerNode = _to_absolute_max_features(self.max_features, X.shape[1], is_classification=True)

    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(self.n_classes_),
        fptype=X_fptype,
        method='defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=1,
        featuresPerNode=int(_featuresPerNode),
        # max_depth=None is passed to daal4py as 0.
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=int(self.min_samples_leaf),
        engine=daal_engine_,
        impurityThreshold=float(0.0 if self.min_impurity_split is None else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap)
    )
    # Reset the estimator cache before training a new model.
    self._cached_estimators_ = None
    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    if self.oob_score:
        self._set_oob_score(X, y)

    return self
def fit(self, data, sites, discrete_covariates=None, continuous_covariates=None):
    """Compute the parameters to perform the harmonization/normalization

    Parameters
    ----------
    data : array-like, shape [n_samples, n_features]
        The data used to compute the per-feature statistics
        used for later harmonization along the acquisition sites.
    sites : array-like, shape [n_samples, 1]
        The target variable for harmonization problems (e.g. acquisition
        sites or batches).
    discrete_covariates : array-like, shape [n_samples, n_discrete_covariates]
        The covariates which are categorical
        (e.g. schizophrenia patient or healthy control).
    continuous_covariates : array-like, shape [n_samples, n_continuous_covariates]
        The covariates which are continuous
        (e.g. age and clinical scores)

    Returns
    -------
    self : object
        The fitted instance; the adjustments are stored on
        ``gamma_star`` and ``delta_star``.

    Raises
    ------
    ValueError
        If ``sites`` or a provided covariate array does not have the same
        number of samples as ``data``.
    """
    # Reset internal state before fitting
    self._reset()

    data = check_array(data, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES)
    sites = check_array(sites, copy=self.copy, estimator=self)

    check_consistent_length(data, sites)

    if discrete_covariates is not None:
        self.discrete_covariates_used = True
        discrete_covariates = check_array(discrete_covariates, copy=self.copy, dtype=None, estimator=self)
        # Reject mismatched covariates here rather than relying on the
        # later design-matrix steps to catch them.
        check_consistent_length(data, discrete_covariates)

    if continuous_covariates is not None:
        self.continuous_covariates_used = True
        continuous_covariates = check_array(continuous_covariates, copy=self.copy, estimator=self,
                                            dtype=FLOAT_DTYPES)
        check_consistent_length(data, continuous_covariates)

    # To have a similar code to neuroCombat and Combat original scripts
    data = data.T

    sites_names, n_samples_per_site = np.unique(sites, return_counts=True)

    self.sites_names = sites_names
    self.n_sites = len(sites_names)

    n_samples = sites.shape[0]
    # Row indices of the samples belonging to each site.
    idx_per_site = [list(np.where(sites == idx)[0]) for idx in sites_names]

    design = self._make_design_matrix(sites, discrete_covariates, continuous_covariates, fitting=True)

    standardized_data, _ = self._standardize_across_features(
        data, design, n_samples, n_samples_per_site, fitting=True)

    gamma_hat, delta_hat = self._fit_ls_model(standardized_data, design, idx_per_site)

    gamma_bar, tau_2, a_prior, b_prior = self._find_priors(
        gamma_hat, delta_hat)

    self.gamma_star, self.delta_star = self._find_parametric_adjustments(
        standardized_data, idx_per_site, gamma_hat, delta_hat,
        gamma_bar, tau_2, a_prior, b_prior)

    return self
def test_check_consistent_length():
    # Inputs whose first dimensions agree must be accepted silently.
    accepted = (
        ([1], [2], [3], [4], [5]),
        ([[1, 2], [[1, 2]]], [1, 2], ['a', 'b']),
        ([1], (2,), np.array([3]), sp.csr_matrix((1, 2))),
    )
    for arrays in accepted:
        check_consistent_length(*arrays)

    # (expected exception, message pattern or None, second argument)
    rejected = (
        (ValueError, "inconsistent numbers of samples", [1]),
        (TypeError, r"got <\w+ 'int'>", 1),
        (TypeError, r"got <\w+ 'object'>", object()),
        (TypeError, None, np.array(1)),
        # Despite ensembles having __len__ they must raise TypeError
        (TypeError, "Expected sequence or array-like",
         RandomForestRegressor()),
    )
    for exc_type, pattern, bad_input in rejected:
        with pytest.raises(exc_type, match=pattern):
            check_consistent_length([1, 2], bad_input)
def fit(self, X, Y):
    """Fit model to data

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of predictors.

    Y : array-like, shape = [n_samples, 1]
        Target vector, where n_samples is the number of samples.
        This implementation only supports a single response (target) variable.

    Returns
    -------
    self : object
        The fitted instance. Scores and loadings are stored on ``T``,
        ``P``, ``Q`` and the ``*_ortho_`` attributes.
    """
    # copy since this will contains the residuals (deflated) matrices
    check_consistent_length(X, Y)
    X = check_array(X, dtype=np.float64, copy=True, ensure_min_samples=2)
    Y = check_array(Y, dtype=np.float64, copy=True, ensure_2d=False)
    if Y.ndim == 1:
        Y = Y.reshape(-1, 1)

    X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = _center_scale_xy(
        X, Y, self.scale)

    Z = X.copy()
    w = np.dot(X.T, Y)  # calculate weight vector
    w /= np.linalg.norm(w)  # normalize weight vector

    T = []
    P = []
    Q = []
    W_ortho = []
    T_ortho = []
    P_ortho = []
    Q_ortho = []

    for _ in range(self.n_components):
        t = np.dot(Z, w)  # scores vector
        t_sq = np.dot(t.T, t).item()  # shared by both loading vectors
        q = np.dot(Y.T, t) / t_sq  # loadings of y
        p = np.dot(Z.T, t) / t_sq  # loadings of X

        w_ortho = p - np.dot(w.T, p).item() / np.dot(
            w.T, w).item() * w  # orthogonal weight
        w_ortho = w_ortho / np.linalg.norm(
            w_ortho)  # normalize orthogonal weight
        t_ortho = np.dot(Z, w_ortho)  # orthogonal components
        t_ortho_sq = np.dot(t_ortho.T, t_ortho).item()
        p_ortho = np.dot(Z.T, t_ortho) / t_ortho_sq
        # not sure if q_ortho is OK, but it follows q
        q_ortho = np.dot(Y.T, t_ortho) / t_ortho_sq

        # Deflate Z by the orthogonal component just extracted.
        Z -= np.dot(t_ortho, p_ortho.T)

        T.append(t)
        P.append(p)
        Q.append(q)
        W_ortho.append(w_ortho)
        T_ortho.append(t_ortho)
        P_ortho.append(p_ortho)
        Q_ortho.append(q_ortho)

    self.T = np.hstack(T)
    self.P = np.hstack(P)
    self.Q = np.hstack(Q)
    self.W_ortho_ = np.hstack(W_ortho)
    self.T_ortho_ = np.hstack(T_ortho)
    self.P_ortho_ = np.hstack(P_ortho)
    self.Q_ortho_ = np.hstack(Q_ortho)

    self._vipscore()
    return self
def from_arrays(cls, x, y, d, z=None, use_other_treat_as_covariate=True,
                force_all_x_finite=True):
    """
    Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s.

    The arrays are validated, stacked column-wise into a
    :class:`pandas.DataFrame` with generated column names (``X1, X2, ...``,
    ``y``, ``d`` or ``d1, d2, ...``, ``z`` or ``z1, z2, ...``) and handed to
    the class constructor.

    Parameters
    ----------
    x : :class:`numpy.ndarray`
        Array of covariates.

    y : :class:`numpy.ndarray`
        Array of the outcome variable.

    d : :class:`numpy.ndarray`
        Array of treatment variables.

    z : None or :class:`numpy.ndarray`
        Array of instrumental variables.
        Default is ``None``.

    use_other_treat_as_covariate : bool
        Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
        Default is ``True``.

    force_all_x_finite : bool or str
        Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
        Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
        allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
        Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
        for the nuisance functions are capable to provide valid predictions with missings and / or infinite values
        in the covariates ``x``.
        Default is ``True``.

    Examples
    --------
    >>> from doubleml import DoubleMLData
    >>> from doubleml.datasets import make_plr_CCDDHNR2018
    >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array')
    >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d)
    """
    # Validate the finiteness option: anything that is neither str nor bool
    # is a type error; the only accepted string is 'allow-nan'.
    if not isinstance(force_all_x_finite, (str, bool)):
        raise TypeError("Invalid force_all_x_finite. " +
                        "force_all_x_finite must be True, False or 'allow-nan'.")
    if isinstance(force_all_x_finite, str) and force_all_x_finite != 'allow-nan':
        raise ValueError("Invalid force_all_x_finite " + force_all_x_finite + ". " +
                         "force_all_x_finite must be True, False or 'allow-nan'.")

    x = check_array(x, ensure_2d=False, allow_nd=False,
                    force_all_finite=force_all_x_finite)
    d = check_array(d, ensure_2d=False, allow_nd=False)
    y = column_or_1d(y, warn=True)

    x = _assure_2d_array(x)
    d = _assure_2d_array(d)

    y_col = 'y'
    blocks = [x, y, d]
    if z is not None:
        z = check_array(z, ensure_2d=False, allow_nd=False)
        z = _assure_2d_array(z)
        blocks.append(z)
    check_consistent_length(*blocks)

    if z is None:
        z_cols = None
    else:
        n_instr = z.shape[1]
        z_cols = ['z'] if n_instr == 1 else [f'z{k}' for k in range(1, n_instr + 1)]

    n_treat = d.shape[1]
    d_cols = ['d'] if n_treat == 1 else [f'd{k}' for k in range(1, n_treat + 1)]

    x_cols = [f'X{k}' for k in range(1, x.shape[1] + 1)]

    columns = x_cols + [y_col] + d_cols
    if z_cols is not None:
        columns = columns + z_cols
    data = pd.DataFrame(np.column_stack(tuple(blocks)), columns=columns)

    return cls(data, y_col, d_cols, x_cols, z_cols,
               use_other_treat_as_covariate, force_all_x_finite)
def fit(self, X, y, treatment, estimator_trmnt_fit_params=None, estimator_ctrl_fit_params=None):
    """Fit the model according to the given training data.

    The samples are split by ``treatment`` into a control group
    (``treatment == 0``) and a treatment group (``treatment == 1``), and
    the two estimators are fitted according to ``self.method``:

    * ``'vanilla'``: each estimator is fitted on its own group.
    * ``'ddr_control'``: the control estimator is fitted first; its
      predictions on the treatment group are appended as an extra feature
      before fitting the treatment estimator.
    * ``'ddr_treatment'``: the mirror image of ``'ddr_control'``.

    Args:
        X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number of
            samples and n_features is the number of features.
        y (array-like, shape (n_samples,)): Target vector relative to X.
        treatment (array-like, shape (n_samples,)): Binary treatment vector relative to X.
        estimator_trmnt_fit_params (dict, optional): Parameters to pass to the fit method
            of the treatment estimator.
        estimator_ctrl_fit_params (dict, optional): Parameters to pass to the fit method
            of the control estimator.

    Returns:
        object: self

    Raises:
        TypeError: For the ``'ddr_*'`` methods, if the feature subset that
            needs the extra column is neither a numpy.ndarray nor a
            pandas.DataFrame.
    """
    # TODO: check the treatment is binary
    check_consistent_length(X, y, treatment)
    self._type_of_target = type_of_target(y)

    X_ctrl, y_ctrl = X[treatment == 0], y[treatment == 0]
    X_trmnt, y_trmnt = X[treatment == 1], y[treatment == 1]

    if estimator_trmnt_fit_params is None:
        estimator_trmnt_fit_params = {}
    if estimator_ctrl_fit_params is None:
        estimator_ctrl_fit_params = {}

    if self.method == 'vanilla':
        self.estimator_ctrl.fit(
            X_ctrl, y_ctrl, **estimator_ctrl_fit_params
        )
        self.estimator_trmnt.fit(
            X_trmnt, y_trmnt, **estimator_trmnt_fit_params
        )

    if self.method == 'ddr_control':
        self.estimator_ctrl.fit(
            X_ctrl, y_ctrl, **estimator_ctrl_fit_params
        )
        # For binary targets the positive-class probability is the feature.
        if self._type_of_target == 'binary':
            ddr_control = self.estimator_ctrl.predict_proba(X_trmnt)[:, 1]
        else:
            ddr_control = self.estimator_ctrl.predict(X_trmnt)

        if isinstance(X_trmnt, np.ndarray):
            X_trmnt_mod = np.column_stack((X_trmnt, ddr_control))
        elif isinstance(X_trmnt, pd.DataFrame):
            X_trmnt_mod = X_trmnt.assign(ddr_control=ddr_control)
        else:
            raise TypeError(
                "Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X_trmnt)
            )

        self.estimator_trmnt.fit(
            X_trmnt_mod, y_trmnt, **estimator_trmnt_fit_params
        )

    if self.method == 'ddr_treatment':
        self.estimator_trmnt.fit(
            X_trmnt, y_trmnt, **estimator_trmnt_fit_params
        )
        if self._type_of_target == 'binary':
            ddr_treatment = self.estimator_trmnt.predict_proba(X_ctrl)[:, 1]
        else:
            ddr_treatment = self.estimator_trmnt.predict(X_ctrl)

        if isinstance(X_ctrl, np.ndarray):
            X_ctrl_mod = np.column_stack((X_ctrl, ddr_treatment))
        # The type check must inspect X_ctrl, the object being augmented
        # (this branch previously tested X_trmnt).
        elif isinstance(X_ctrl, pd.DataFrame):
            X_ctrl_mod = X_ctrl.assign(ddr_treatment=ddr_treatment)
        else:
            raise TypeError(
                "Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X_ctrl)
            )

        self.estimator_ctrl.fit(
            X_ctrl_mod, y_ctrl, **estimator_ctrl_fit_params
        )

    return self
def _binary_clf_curve(self, y_true, y_score, pos_label=None, sample_weight=None):
    """Calculate true and false positives per binary classification threshold.

    ``self`` is not used in the body.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True targets of binary classification
    y_score : array, shape = [n_samples]
        Estimated probabilities or decision function
    pos_label : int or str, default=None
        The label of the positive class
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    fps : array, shape = [n_thresholds]
        A count of false positives, at index i being the number of negative
        samples assigned a score >= thresholds[i]. The total number of
        negative samples is equal to fps[-1] (thus true negatives are given by
        fps[-1] - fps).
    tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
        An increasing count of true positives, at index i being the number
        of positive samples assigned a score >= thresholds[i]. The total
        number of positive samples is equal to tps[-1] (thus false negatives
        are given by tps[-1] - tps). Weighted by ``sample_weight`` when it
        is given.
    thresholds : array, shape = [n_thresholds]
        Decreasing score values.
    positives : array, shape = [n_thresholds]
        Unweighted cumulative count of positive samples with a score
        >= thresholds[i].

    Raises
    ------
    ValueError
        If the target type is not supported, or if ``pos_label`` is None
        and ``y_true`` is not one of the recognised binary encodings.
    """
    # Check to make sure y_true is valid
    y_type = type_of_target(y_true)
    if not (y_type == "binary" or
            (y_type == "multiclass" and pos_label is not None)):
        raise ValueError("{0} format is not supported".format(y_type))

    check_consistent_length(y_true, y_score, sample_weight)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    assert_all_finite(y_true)
    assert_all_finite(y_score)

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)

    # ensure binary classification if pos_label is not specified
    # classes.dtype.kind in ('O', 'U', 'S') is required to avoid
    # triggering a FutureWarning by calling np.array_equal(a, b)
    # when elements in the two arrays are not comparable.
    classes = np.unique(y_true)
    if (pos_label is None and (
            classes.dtype.kind in ('O', 'U', 'S') or
            not (np.array_equal(classes, [0, 1]) or
                 np.array_equal(classes, [-1, 1]) or
                 np.array_equal(classes, [0]) or
                 np.array_equal(classes, [-1]) or
                 np.array_equal(classes, [1])))):
        classes_repr = ", ".join(repr(c) for c in classes)
        raise ValueError("y_true takes value in {{{classes_repr}}} and "
                         "pos_label is not specified: either make y_true "
                         "take value in {{0, 1}} or {{-1, 1}} or "
                         "pass pos_label explicitly.".format(
                             classes_repr=classes_repr))
    elif pos_label is None:
        pos_label = 1.

    # make y_true a boolean vector
    y_true = (y_true == pos_label)

    # sort scores and corresponding truth values
    # (stable mergesort, then reversed to obtain descending scores)
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]
    if sample_weight is not None:
        weight = sample_weight[desc_score_indices]
    else:
        weight = 1.

    # y_score typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    distinct_value_indices = np.where(np.diff(y_score))[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    # accumulate the true positives with decreasing threshold
    tps = stable_cumsum(y_true * weight)[threshold_idxs]
    # unweighted running count of positives, returned as the fourth value
    positives = stable_cumsum(y_true)[threshold_idxs]
    # Note that the number of positive should be computed differently
    if sample_weight is not None:
        # express fps as a cumsum to ensure fps is increasing even in
        # the presence of floating point errors
        # NOTE(review): unlike tps, this cumsum does not multiply by
        # `weight`, so fps is an unweighted count even when sample_weight
        # is given -- confirm this is intentional.
        fps = stable_cumsum((1 - y_true))[threshold_idxs]
    else:
        fps = 1 + threshold_idxs - tps
    return fps, tps, y_score[threshold_idxs], positives
def _validate_and_reformat_input(X, y=None, expect_y=True, enforce_binary_labels=False, **kwargs):
    """Validate input data and return the data in an appropriate format.

    The :code:`**kwargs` can contain :code:`sensitive_features=` and :code:`control_features=`
    parameters.

    Parameters
    ----------
    X : numpy.ndarray, pandas.DataFrame
        The feature matrix
    y : numpy.ndarray, pandas.DataFrame, pandas.Series, or list
        The label vector
    expect_y : bool
        If True y needs to be provided, otherwise ignores the argument; default True
    enforce_binary_labels : bool
        If True raise exception if there are more than two distinct
        values in the `y` data; default False

    Returns
    -------
    Tuple(pandas.DataFrame, pandas.Series, pandas.Series, pandas.Series)
        The validated and reformatted X, y, sensitive_features and control_features; note
        that certain estimators rely on metadata encoded in X which may be stripped during
        the reformatting process, so mitigation methods should ideally use the input X instead
        of the returned X for training estimators and leave potential reformatting of X to the
        estimator. ``control_features`` is returned as None when it was not supplied.

    Raises
    ------
    ValueError
        If y is required but missing, if binary labels are enforced and y
        contains values other than 0 and 1, or if no sensitive features
        were supplied.
    """
    if y is None:
        if expect_y:
            raise ValueError(_MESSAGE_Y_NONE)
        X = check_array(X)
    else:
        # calling check_X_y with a 2-dimensional y causes a warning, so ensure it is 1-dimensional
        if isinstance(y, np.ndarray) and y.ndim == 2 and y.shape[1] == 1:
            y = y.reshape(-1)
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            y = y.to_numpy().reshape(-1)

        X, y = check_X_y(X, y, dtype=None, force_all_finite=False)
        y = check_array(y, ensure_2d=False, dtype='numeric')
        if enforce_binary_labels:
            observed_labels = set(np.unique(y))
            if not observed_labels <= {0, 1}:
                raise ValueError(_LABELS_NOT_0_1_ERROR_MESSAGE)

    sensitive_features = kwargs.get(_KW_SENSITIVE_FEATURES)
    if sensitive_features is None:
        raise ValueError(_MESSAGE_SENSITIVE_FEATURES_NONE)

    check_consistent_length(X, sensitive_features)
    sensitive_features = check_array(sensitive_features, ensure_2d=False, dtype=None)

    # compress multiple sensitive features into a single column
    if sensitive_features.ndim > 1 and sensitive_features.shape[1] > 1:
        sensitive_features = _merge_columns(sensitive_features)

    # Handle the control features
    control_features = kwargs.get(_KW_CONTROL_FEATURES)
    if control_features is not None:
        check_consistent_length(X, control_features)
        control_features = check_array(control_features, ensure_2d=False, dtype=None)

        # compress multiple control features into a single column
        if control_features.ndim > 1 and control_features.shape[1] > 1:
            control_features = _merge_columns(control_features)

        control_features = pd.Series(control_features.squeeze())

    # If we don't have a y, then need to fiddle with return type to
    # avoid a warning from pandas
    result_y = pd.Series(dtype="float64") if y is None else pd.Series(y)

    result_X = pd.DataFrame(X)
    result_sensitive = pd.Series(sensitive_features.squeeze())
    return result_X, result_y, result_sensitive, control_features
def cv_split(cv, X, y, groups):
    """Return all splits produced by ``cv.split(X, y, groups)`` as a list.

    ``X``, ``y`` and ``groups`` are checked for consistent length before
    the splitter is invoked.
    """
    check_consistent_length(X, y, groups)
    splits = cv.split(X, y, groups)
    return list(splits)
def _fit_regressor(self, X, y, sample_weight=None):
    """Fit the random forest regressor, using oneDAL when supported.

    A chain of patching conditions decides whether the accelerated
    ``_daal_fit_regressor`` path can be used; otherwise the call is
    delegated to the parent class ``fit``.

    Parameters
    ----------
    X : array-like
        Training data. Sparse input disables the accelerated path.
    y : array-like
        Target values. Sparse ``y`` is rejected.
    sample_weight : array-like or None, default=None
        Sample weights; validated with ``check_sample_weight`` when given.

    Returns
    -------
    self on the accelerated path, otherwise whatever the parent ``fit``
    returns.

    Raises
    ------
    ValueError
        If ``y`` is sparse.
    """
    if sp.issparse(y):
        raise ValueError(
            "sparse multilabel-indicator for y is not supported."
        )
    _check_parameters(self)
    if sample_weight is not None:
        sample_weight = check_sample_weight(sample_weight, X)

    if sklearn_check_version('1.0') and self.criterion == "mse":
        warnings.warn(
            "Criterion 'mse' was deprecated in v1.0 and will be "
            "removed in version 1.2. Use `criterion='squared_error'` "
            "which is equivalent.",
            FutureWarning
        )

    # Each entry is (condition, message); the message is paired with a
    # condition that blocks the accelerated path.
    _patching_status = PatchingConditionsChain(
        "sklearn.ensemble.RandomForestRegressor.fit")
    _dal_ready = _patching_status.and_conditions([
        (self.oob_score and daal_check_version((2021, 'P', 500)) or not self.oob_score,
            "OOB score is only supported starting from 2021.5 version of oneDAL."),
        (self.warm_start is False, "Warm start is not supported."),
        (self.criterion in ["mse", "squared_error"],
            f"'{self.criterion}' criterion is not supported. "
            "Only 'mse' and 'squared_error' criteria are supported."),
        (self.ccp_alpha == 0.0,
            f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."),
        (not sp.issparse(X), "X is sparse. Sparse input is not supported.")
    ])

    if _dal_ready:
        if sklearn_check_version("1.0"):
            self._check_feature_names(X, reset=True)
        X = check_array(X, dtype=[np.float64, np.float32])
        y = np.asarray(y)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn("A column-vector y was passed when a 1d array was"
                          " expected. Please change the shape of y to "
                          "(n_samples,), for example using ravel().",
                          DataConversionWarning, stacklevel=2)

        y = check_array(y, ensure_2d=False, dtype=X.dtype)
        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity against vs
            # [:, np.newaxis] that does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        # Second gate: only single-output targets are accelerated.
        # NOTE: if this check fails, the fallback below receives the
        # already converted X and the reshaped y.
        _dal_ready = _patching_status.and_conditions([
            (self.n_outputs_ == 1, f"Number of outputs ({self.n_outputs_}) is not 1.")])

    _patching_status.write_log()
    if _dal_ready:
        _daal_fit_regressor(self, X, y, sample_weight=sample_weight)
        self.estimators_ = self._estimators_
        return self
    # Fall back to the parent class implementation.
    return super(RandomForestRegressor, self).fit(
        X, y, sample_weight=sample_weight)
def from_arrays(cls, x, y, d, z=None, use_other_treat_as_covariate=True):
    """
    Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s.

    The arrays are validated, stacked column-wise into a
    :class:`pandas.DataFrame` with generated column names (``X1, X2, ...``,
    ``y``, ``d`` or ``d1, d2, ...``, ``z`` or ``z1, z2, ...``) and handed to
    the class constructor.

    Parameters
    ----------
    x : :class:`numpy.ndarray`
        Array of covariates.

    y : :class:`numpy.ndarray`
        Array of the outcome variable.

    d : :class:`numpy.ndarray`
        Array of treatment variables.

    z : None or :class:`numpy.ndarray`
        Array of instrumental variables.
        Default is ``None``.

    use_other_treat_as_covariate : bool
        Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
        Default is ``True``.

    Examples
    --------
    >>> from doubleml import DoubleMLData
    >>> from doubleml.datasets import make_plr_CCDDHNR2018
    >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array')
    >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d)
    """
    x = check_array(x, ensure_2d=False, allow_nd=False)
    d = check_array(d, ensure_2d=False, allow_nd=False)
    y = column_or_1d(y, warn=True)

    x = _assure_2d_array(x)
    d = _assure_2d_array(d)

    y_col = 'y'
    blocks = [x, y, d]
    if z is not None:
        z = check_array(z, ensure_2d=False, allow_nd=False)
        z = _assure_2d_array(z)
        blocks.append(z)
    check_consistent_length(*blocks)

    if z is None:
        z_cols = None
    else:
        n_instr = z.shape[1]
        z_cols = ['z'] if n_instr == 1 else [f'z{k}' for k in range(1, n_instr + 1)]

    n_treat = d.shape[1]
    d_cols = ['d'] if n_treat == 1 else [f'd{k}' for k in range(1, n_treat + 1)]

    x_cols = [f'X{k}' for k in range(1, x.shape[1] + 1)]

    columns = x_cols + [y_col] + d_cols
    if z_cols is not None:
        columns = columns + z_cols
    data = pd.DataFrame(np.column_stack(tuple(blocks)), columns=columns)

    return cls(data, y_col, d_cols, x_cols, z_cols, use_other_treat_as_covariate)
def fit(self, X, y, sample_weight=None):
    """
    Build a RGF Classifier from the training set (X, y).

    Binary problems train one underlying binary model; problems with more
    than two classes train one one-vs-rest binary model per class in
    parallel.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The training input samples.

    y : array-like, shape = [n_samples]
        The target values (class labels in classification).

    sample_weight : array-like, shape = [n_samples] or None
        Individual weights for each sample.

    Returns
    -------
    self : object
        Returns self.
    """
    _validate_params(**self.get_params())

    X, y = check_X_y(X, y, accept_sparse=True)
    n_samples, self._n_features = X.shape

    # Resolve the hyper-parameters whose effective value depends on the data
    # or on other parameters.
    self._sl2 = self.l2 if self.sl2 is None else self.sl2

    if isinstance(self.min_samples_leaf, _FLOATS):
        self._min_samples_leaf = ceil(self.min_samples_leaf * n_samples)
    else:
        self._min_samples_leaf = self.min_samples_leaf

    if self.n_iter is not None:
        self._n_iter = self.n_iter
    elif self.loss == "LS":
        self._n_iter = 10
    else:
        self._n_iter = 5

    if sample_weight is None:
        sample_weight = np.ones(n_samples, dtype=np.float32)
    else:
        sample_weight = column_or_1d(sample_weight, warn=True)
        if (sample_weight <= 0).any():
            raise ValueError("Sample weights must be positive.")
    check_consistent_length(X, y, sample_weight)
    check_classification_targets(y)

    self._classes = sorted(np.unique(y))
    self._n_classes = len(self._classes)
    self._classes_map = {}

    params = dict(max_leaf=self.max_leaf,
                  test_interval=self.test_interval,
                  algorithm=self.algorithm,
                  loss=self.loss,
                  reg_depth=self.reg_depth,
                  l2=self.l2,
                  sl2=self._sl2,
                  normalize=self.normalize,
                  min_samples_leaf=self._min_samples_leaf,
                  n_iter=self._n_iter,
                  n_tree_search=self.n_tree_search,
                  opt_interval=self.opt_interval,
                  learning_rate=self.learning_rate,
                  memory_policy=self.memory_policy,
                  verbose=self.verbose)

    if self._n_classes < 2:
        raise ValueError("Classifier can't predict when only one class is present.")

    if self._n_classes == 2:
        first_label, second_label = self._classes
        self._classes_map[0] = first_label
        self._classes_map[1] = second_label
        # The single binary model is trained on "is the first class".
        binary_target = (y == first_label).astype(int)
        binary_model = _RGFBinaryClassifier(**params)
        self._estimators = [binary_model]
        binary_model.fit(X, binary_target, sample_weight)
    else:
        if sp.isspmatrix_dok(X):
            X = X.tocsr().tocoo()  # Fix to avoid scipy 7699 issue
        self._classes_map.update(enumerate(self._classes))
        ovr_targets = [(y == cls_num).astype(int) for cls_num in self._classes]
        self._estimators = [_RGFBinaryClassifier(**params)
                            for _ in range(self._n_classes)]
        self._estimators = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_ovr_binary)(self._estimators[i], X, ovr_targets[i],
                                     sample_weight)
            for i in range(self._n_classes))

    self._fitted = True
    return self
def fit(self, X, y, sample_weight=None):
    """
    Build a RGF Regressor from the training set (X, y).

    The training data is written to text files under ``_TEMP_PATH``, the
    external RGF executable is run on them, and the location of the latest
    model file it produced is remembered on the estimator.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The training input samples.

    y : array-like, shape = [n_samples]
        The target values (real numbers in regression).

    sample_weight : array-like, shape = [n_samples] or None
        Individual weights for each sample.

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    ValueError
        If any provided sample weight is not strictly positive.
    Exception
        If no model file can be found after the training run. In that case
        the estimator is NOT marked as fitted.
    """
    _validate_params(**self.get_params())

    X, y = check_X_y(X, y, accept_sparse=True, multi_output=False, y_numeric=True)
    n_samples, self._n_features = X.shape

    self._sl2 = self.l2 if self.sl2 is None else self.sl2

    if isinstance(self.min_samples_leaf, _FLOATS):
        self._min_samples_leaf = ceil(self.min_samples_leaf * n_samples)
    else:
        self._min_samples_leaf = self.min_samples_leaf

    if self.n_iter is not None:
        self._n_iter = self.n_iter
    elif self.loss == "LS":
        self._n_iter = 10
    else:
        self._n_iter = 5

    if sample_weight is None:
        sample_weight = np.ones(n_samples, dtype=np.float32)
    else:
        sample_weight = column_or_1d(sample_weight, warn=True)
        if (sample_weight <= 0).any():
            raise ValueError("Sample weights must be positive.")
    check_consistent_length(X, y, sample_weight)

    # Dump the training data where the executable can read it.
    train_x_loc = os.path.join(_TEMP_PATH, self._file_prefix + ".train.data.x")
    train_y_loc = os.path.join(_TEMP_PATH, self._file_prefix + ".train.data.y")
    train_weight_loc = os.path.join(_TEMP_PATH, self._file_prefix + ".train.data.weight")
    if sp.isspmatrix(X):
        _sparse_savetxt(train_x_loc, X)
    else:
        np.savetxt(train_x_loc, X, delimiter=' ', fmt="%s")
    np.savetxt(train_y_loc, y, delimiter=' ', fmt="%s")
    np.savetxt(train_weight_loc, sample_weight, delimiter=' ', fmt="%s")

    # Format train command
    params = []
    if self.verbose > 0:
        params.append("Verbose")
    if self.verbose > 5:
        params.append("Verbose_opt")  # Add some info on weight optimization
    if self.normalize:
        params.append("NormalizeTarget")
    params.extend([
        "train_x_fn=%s" % train_x_loc,
        "train_y_fn=%s" % train_y_loc,
        "algorithm=%s" % self.algorithm,
        "loss=%s" % self.loss,
        "max_leaf_forest=%s" % self.max_leaf,
        "test_interval=%s" % self.test_interval,
        "reg_L2=%s" % self.l2,
        "reg_sL2=%s" % self._sl2,
        "reg_depth=%s" % self.reg_depth,
        "min_pop=%s" % self._min_samples_leaf,
        "num_iteration_opt=%s" % self._n_iter,
        "num_tree_search=%s" % self.n_tree_search,
        "opt_interval=%s" % self.opt_interval,
        "opt_stepsize=%s" % self.learning_rate,
        "memory_policy=%s" % self.memory_policy.title(),
        "model_fn_prefix=%s" % os.path.join(_TEMP_PATH, self._file_prefix + ".model"),
        "train_w_fn=%s" % train_weight_loc,
    ])

    cmd = (_EXE_PATH, "train", ",".join(params))

    # Train. stderr is merged into stdout, so communicate() returns
    # (stdout_text, None); only the text is worth printing.
    stdout_text, _ = subprocess.Popen(cmd,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.STDOUT,
                                      universal_newlines=True).communicate()
    if self.verbose:
        print(stdout_text)

    # Find latest model location
    model_glob = os.path.join(_TEMP_PATH, self._file_prefix + ".model*")
    model_files = glob(model_glob)
    if not model_files:
        raise Exception('Model learning result is not found in {0}. '
                        'Training is abnormally finished.'.format(_TEMP_PATH))
    self._latest_model_loc = sorted(model_files, reverse=True)[0]

    # Only flag the estimator as fitted once a model file really exists.
    self._fitted = True
    return self
def cv_split(cv, X, y, groups, is_pairwise, cache):
    """Materialise the splits of ``cv`` and wrap them in a ``CVCache``.

    ``X``, ``y`` and ``groups`` are first checked for consistent length; the
    splits produced by ``cv.split`` are then collected into a list and passed
    to ``CVCache`` together with ``is_pairwise``, ``cache`` and the number of
    samples in ``X``.
    """
    check_consistent_length(X, y, groups)
    splits = list(cv.split(X, y, groups))
    n_samples = _num_samples(X)
    return CVCache(splits, is_pairwise, cache, n_samples)
def _check_reg_targets(y_true, y_pred, multioutput):
    """Check that y_true and y_pred belong to the same regression task

    Parameters
    ----------
    y_true : array-like,

    y_pred : array-like,

    multioutput : array-like or string in ['raw_values', 'uniform_average',
        'variance_weighted'] or None
        None is accepted due to backward compatibility of r2_score().

    Returns
    -------
    type_true : one of {'continuous', 'continuous-multioutput'}
        The type of the true target data, as output by
        'utils.multiclass.type_of_target'

    y_true : array-like of shape = (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape = (n_samples, n_outputs)
        Estimated target values.

    multioutput : array-like of shape = (n_outputs) or string in ['raw_values',
        'uniform_average', 'variance_weighted'] or None
        Custom output weights if ``multioutput`` is array-like or
        just the corresponding argument if ``multioutput`` is a
        correct keyword.

    Raises
    ------
    ValueError
        If the targets have a different number of outputs, if ``multioutput``
        is a string that is not one of the allowed keywords, or if custom
        weights do not match the number of outputs.
    """
    check_consistent_length(y_true, y_pred)
    y_true = check_array(y_true, ensure_2d=False)
    y_pred = check_array(y_pred, ensure_2d=False)

    if y_true.ndim == 1:
        y_true = y_true.reshape((-1, 1))

    if y_pred.ndim == 1:
        y_pred = y_pred.reshape((-1, 1))

    if y_true.shape[1] != y_pred.shape[1]:
        raise ValueError("y_true and y_pred have different number of output "
                         "({0}!={1})".format(y_true.shape[1], y_pred.shape[1]))

    n_outputs = y_true.shape[1]
    allowed_multioutput_str = ('raw_values', 'uniform_average',
                               'variance_weighted')
    # Dispatch on the type instead of using ``multioutput not in (...)``:
    # a membership test compares an ndarray of weights against None and
    # strings with ``==``, which is element-wise and makes the truth value
    # ambiguous (ValueError) on current NumPy.
    if isinstance(multioutput, str):
        if multioutput not in allowed_multioutput_str:
            raise ValueError("Allowed 'multioutput' string values are {}. "
                             "You provided multioutput={!r}".format(
                                 allowed_multioutput_str, multioutput))
    elif multioutput is not None:
        multioutput = check_array(multioutput, ensure_2d=False)
        if n_outputs == 1:
            raise ValueError("Custom weights are useful only in "
                             "multi-output cases.")
        elif n_outputs != len(multioutput):
            raise ValueError(("There must be equally many custom weights "
                              "(%d) as outputs (%d).") %
                             (len(multioutput), n_outputs))
    y_type = 'continuous' if n_outputs == 1 else 'continuous-multioutput'

    return y_type, y_true, y_pred, multioutput
def fit(self, X, y, sample_weight=None):
    """
    Build a RGF Regressor from the training set (X, y).

    The training data is written to text files under ``_TEMP_PATH``, the
    external RGF executable is run on them, and the location of the latest
    model file it produced is remembered on the estimator.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The training input samples.

    y : array-like, shape = [n_samples]
        The target values (real numbers in regression).

    sample_weight : array-like, shape = [n_samples] or None
        Individual weights for each sample.

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    ValueError
        If any provided sample weight is not strictly positive.
    Exception
        If no model file can be found after the training run. In that case
        the estimator is NOT marked as fitted.
    """
    _validate_params(**self.get_params())

    X, y = check_X_y(X, y, accept_sparse=True, multi_output=False,
                     y_numeric=True)
    n_samples, self._n_features = X.shape

    self._sl2 = self.l2 if self.sl2 is None else self.sl2

    if isinstance(self.min_samples_leaf, _FLOATS):
        self._min_samples_leaf = ceil(self.min_samples_leaf * n_samples)
    else:
        self._min_samples_leaf = self.min_samples_leaf

    if self.n_iter is not None:
        self._n_iter = self.n_iter
    elif self.loss == "LS":
        self._n_iter = 10
    else:
        self._n_iter = 5

    if sample_weight is None:
        sample_weight = np.ones(n_samples, dtype=np.float32)
    else:
        sample_weight = column_or_1d(sample_weight, warn=True)
        if (sample_weight <= 0).any():
            raise ValueError("Sample weights must be positive.")
    check_consistent_length(X, y, sample_weight)

    # Dump the training data where the executable can read it.
    train_x_loc = os.path.join(_TEMP_PATH,
                               self._file_prefix + ".train.data.x")
    train_y_loc = os.path.join(_TEMP_PATH,
                               self._file_prefix + ".train.data.y")
    train_weight_loc = os.path.join(
        _TEMP_PATH, self._file_prefix + ".train.data.weight")
    if sp.isspmatrix(X):
        _sparse_savetxt(train_x_loc, X)
    else:
        np.savetxt(train_x_loc, X, delimiter=' ', fmt="%s")
    np.savetxt(train_y_loc, y, delimiter=' ', fmt="%s")
    np.savetxt(train_weight_loc, sample_weight, delimiter=' ', fmt="%s")

    # Format train command
    params = []
    if self.verbose > 0:
        params.append("Verbose")
    if self.verbose > 5:
        params.append("Verbose_opt")  # Add some info on weight optimization
    if self.normalize:
        params.append("NormalizeTarget")
    params.extend([
        "train_x_fn=%s" % train_x_loc,
        "train_y_fn=%s" % train_y_loc,
        "algorithm=%s" % self.algorithm,
        "loss=%s" % self.loss,
        "max_leaf_forest=%s" % self.max_leaf,
        "test_interval=%s" % self.test_interval,
        "reg_L2=%s" % self.l2,
        "reg_sL2=%s" % self._sl2,
        "reg_depth=%s" % self.reg_depth,
        "min_pop=%s" % self._min_samples_leaf,
        "num_iteration_opt=%s" % self._n_iter,
        "num_tree_search=%s" % self.n_tree_search,
        "opt_interval=%s" % self.opt_interval,
        "opt_stepsize=%s" % self.learning_rate,
        "memory_policy=%s" % self.memory_policy.title(),
        "model_fn_prefix=%s" % os.path.join(_TEMP_PATH,
                                            self._file_prefix + ".model"),
        "train_w_fn=%s" % train_weight_loc,
    ])

    cmd = (_EXE_PATH, "train", ",".join(params))

    # Train. stderr is merged into stdout, so communicate() returns
    # (stdout_text, None); only the text is worth printing.
    process = subprocess.Popen(cmd,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT,
                               universal_newlines=True)
    stdout_text, _ = process.communicate()
    if self.verbose:
        print(stdout_text)

    # Find latest model location
    model_glob = os.path.join(_TEMP_PATH, self._file_prefix + ".model*")
    model_files = glob(model_glob)
    if not model_files:
        raise Exception(
            'Model learning result is not found in {0}. '
            'Training is abnormally finished.'.format(_TEMP_PATH))
    self._latest_model_loc = sorted(model_files, reverse=True)[0]

    # Only flag the estimator as fitted once a model file really exists.
    self._fitted = True
    return self
def plot_auc_test(self,
                  X,
                  y,
                  cv=1,
                  groups=None,
                  title=None,
                  ax=None,
                  save_fig=False):
    '''Plot ROC curves of the fitted estimator on one or several test sets.

    The estimator must provide continuous predictions (decision_function
    or predict_proba). Either pass iterables of X, y directly, or pass a
    single X, y together with cv > 1 to have them split first; in the
    latter case the test part of every split is evaluated.

    X
        -2D array or list of 2D ndarrays
    y
        -binary or list of class labels
    cv
        -int, cross-validation generator or an iterable
        - if cv>1, generate splits by StratifyKfold method
    title
        - title added to plot header as to indicate (X, y)

    return
    --------
    ax, mean-auc, std-auc,

    data_splits:
        list of test data set in the form of DataFrame (combined X & y)
    '''
    estimator = self.estimator

    # split test set by cv, then re-enter with the test folds and cv=1
    if cv > 1:
        folds = tuple(
            _split_cv(X, y=y, cv=cv, groups=groups, random_state=self.seed))
        test_xs = [x_set[1] for x_set, _ in folds]
        test_ys = [y_set[1] for _, y_set in folds]
        return self.plot_auc_test(X=test_xs,
                                  y=test_ys,
                                  cv=1,
                                  groups=groups,
                                  title=title,
                                  ax=ax,
                                  save_fig=save_fig)

    self._check_fitted(estimator)
    X = get_flat_list(X)
    y = get_flat_list(y)
    validation.check_consistent_length(X, y)

    fprs, tprs, aucs = [], [], []
    n_sample = 0
    for x_part, y_part in zip(X, y):
        scores = self._pre_continueous(estimator, x_part)
        fpr, tpr, _ = roc_curve(y_part, scores, drop_intermediate=True)
        fprs.append(fpr)
        tprs.append(tpr)
        aucs.append(auc(fpr, tpr))
        n_sample += len(x_part)

    # -- plot
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    ax = plotter_auc(fprs, tprs, ax=ax)

    header_parts = [
        _get_estimator_name(estimator), 'testCV',
        '{} samples'.format(n_sample)
    ]
    if isinstance(title, str):
        header_parts.insert(0, title)
    ax.set_title('-'.join(header_parts))

    # One DataFrame per evaluated set: features and labels side by side.
    data_splits = [
        pd.concat((pd.DataFrame(part) for part in pair), axis=1)
        for pair in zip(X, y)
    ]

    if save_fig is True:
        plot_name = ('plots/roc_test_' + title + '.pdf'
                     if isinstance(title, str) else 'plots/roc_test.pdf')
        self.folder.write(plt.gcf(), plot_name)
        plt.close()

    return ax, np.mean(aucs), np.std(aucs), data_splits
def fit(self, X, y, sample_weight=None):
    """
    Build a RGF Classifier from the training set (X, y).

    Binary problems train one underlying binary model; problems with more
    than two classes train one one-vs-rest binary model per class in
    parallel.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The training input samples.

    y : array-like, shape = [n_samples]
        The target values (class labels in classification).

    sample_weight : array-like, shape = [n_samples] or None
        Individual weights for each sample.

    Returns
    -------
    self : object
        Returns self.
    """
    _validate_params(**self.get_params())

    X, y = check_X_y(X, y, accept_sparse=True)
    n_samples, self._n_features = X.shape

    # Resolve the hyper-parameters whose effective value depends on the data
    # or on other parameters.
    self._sl2 = self.l2 if self.sl2 is None else self.sl2

    if isinstance(self.min_samples_leaf, _FLOATS):
        self._min_samples_leaf = ceil(self.min_samples_leaf * n_samples)
    else:
        self._min_samples_leaf = self.min_samples_leaf

    if self.n_iter is not None:
        self._n_iter = self.n_iter
    elif self.loss == "LS":
        self._n_iter = 10
    else:
        self._n_iter = 5

    if sample_weight is None:
        sample_weight = np.ones(n_samples, dtype=np.float32)
    else:
        sample_weight = column_or_1d(sample_weight, warn=True)
        if (sample_weight <= 0).any():
            raise ValueError("Sample weights must be positive.")
    check_consistent_length(X, y, sample_weight)
    check_classification_targets(y)

    self._classes = sorted(np.unique(y))
    self._n_classes = len(self._classes)
    self._classes_map = {}

    params = dict(max_leaf=self.max_leaf,
                  test_interval=self.test_interval,
                  algorithm=self.algorithm,
                  loss=self.loss,
                  reg_depth=self.reg_depth,
                  l2=self.l2,
                  sl2=self._sl2,
                  normalize=self.normalize,
                  min_samples_leaf=self._min_samples_leaf,
                  n_iter=self._n_iter,
                  n_tree_search=self.n_tree_search,
                  opt_interval=self.opt_interval,
                  learning_rate=self.learning_rate,
                  memory_policy=self.memory_policy,
                  verbose=self.verbose)

    if self._n_classes < 2:
        raise ValueError(
            "Classifier can't predict when only one class is present.")

    if self._n_classes == 2:
        first_label, second_label = self._classes
        self._classes_map[0] = first_label
        self._classes_map[1] = second_label
        # The single binary model is trained on "is the first class".
        binary_target = (y == first_label).astype(int)
        binary_model = _RGFBinaryClassifier(**params)
        self._estimators = [binary_model]
        binary_model.fit(X, binary_target, sample_weight)
    else:
        if sp.isspmatrix_dok(X):
            X = X.tocsr().tocoo()  # Fix to avoid scipy 7699 issue
        self._classes_map.update(enumerate(self._classes))
        ovr_targets = [(y == cls_num).astype(int)
                       for cls_num in self._classes]
        self._estimators = [_RGFBinaryClassifier(**params)
                            for _ in range(self._n_classes)]
        self._estimators = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_ovr_binary)(self._estimators[i], X, ovr_targets[i],
                                     sample_weight)
            for i in range(self._n_classes))

    self._fitted = True
    return self
def transform(self,
              data,
              sites,
              discrete_covariates=None,
              continuous_covariates=None):
    """Transform data to harmonized space

    Parameters
    ----------
    data : array-like
        Input data that will be transformed.
    sites : array-like
        Site info of the inputted data
    discrete_covariates : array-like
        The covariates which are categorical
    continuous_covariates : array-like
        The covariates which are continuous

    Raises
    ------
    ValueError
        If ``sites`` contains a site name that is not in ``self.sites_names``.
    """
    check_is_fitted(self, 'n_sites')

    data = check_array(data,
                       copy=self.copy,
                       estimator=self,
                       dtype=FLOAT_DTYPES)
    sites = check_array(sites, copy=self.copy, estimator=self)
    check_consistent_length(data, sites)

    # Covariates are only validated when the corresponding marker attribute
    # is present on the estimator.
    if hasattr(self, 'discrete_covariates_used'):
        discrete_covariates = check_array(discrete_covariates,
                                          copy=self.copy,
                                          dtype=None,
                                          estimator=self)

    if hasattr(self, 'continuous_covariates_used'):
        continuous_covariates = check_array(continuous_covariates,
                                            copy=self.copy,
                                            estimator=self,
                                            dtype=FLOAT_DTYPES)

    # To have a similar code to neuroCombat and Combat original scripts
    data = data.T

    # Check all sites from new_data were seen
    if any(site_name not in self.sites_names
           for site_name in np.unique(sites)):
        raise ValueError(
            'There is a site unseen during the fit method in the data.')

    n_samples = sites.shape[0]

    site_counts = []
    idx_per_site = []
    for site_name in self.sites_names:
        site_counts.append(np.sum(sites == site_name))
        idx_per_site.append(list(np.where(sites == site_name)[0]))
    n_samples_per_site = np.array(site_counts)

    design = self._make_design_matrix(sites,
                                      discrete_covariates,
                                      continuous_covariates,
                                      fitting=False)

    standardized_data, standardized_mean = self._standardize_across_features(
        data, design, n_samples, n_samples_per_site, fitting=False)

    bayes_data = self._adjust_data_final(standardized_data, design,
                                         standardized_mean,
                                         n_samples_per_site, n_samples,
                                         idx_per_site)

    return bayes_data.T
def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3):
    """Compute uplift at first k observations by uplift of the total sample.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.
        k (float or int): If float, should be between 0.0 and 1.0 and represent the proportion of the dataset
            to include in the computation of uplift. If int, represents the absolute number of samples.
        strategy (string, ['overall', 'by_group']): Determines the calculating strategy.

            * ``'overall'``:
                The first step is taking the first k observations of all test data ordered by uplift prediction
                (overall both groups - control and treatment) and conversions in treatment and control groups
                calculated only on them. Then the difference between these conversions is calculated.

            * ``'by_group'``:
                Separately calculates conversions in top k observations in each group (control and treatment)
                sorted by uplift predictions. Then the difference between these conversions is calculated



    .. versionchanged:: 0.1.0

        * Add supporting absolute values for ``k`` parameter
        * Add parameter ``strategy``

    Returns:
        float: Uplift score at first k observations of the total sample.

    Raises:
        ValueError: If ``strategy`` is unknown, if ``k`` is not an int or float
            or is out of range, or (``'by_group'`` only) if ``k`` exceeds the
            size of the control or treatment group.

    See also:
        :func:`.uplift_auc_score`: Compute normalized Area Under the Uplift curve from prediction scores.

        :func:`.qini_auc_score`: Compute normalized Area Under the Qini Curve from prediction scores.
    """

    # ToDo: checker that treatment is binary and all groups is not empty
    check_consistent_length(y_true, uplift, treatment)

    y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(
        treatment)

    strategy_methods = ['overall', 'by_group']
    if strategy not in strategy_methods:
        raise ValueError(
            f'Uplift score supports only calculating methods in {strategy_methods},'
            f' got {strategy}.')

    n_samples = len(y_true)
    # Stable sort, then reversed: highest predicted uplift first.
    order = np.argsort(uplift, kind='mergesort')[::-1]
    _, treatment_counts = np.unique(treatment, return_counts=True)
    n_samples_ctrl = treatment_counts[0]
    n_samples_trmnt = treatment_counts[1]

    k_type = np.asarray(k).dtype.kind

    if (k_type == 'i' and (k >= n_samples or k <= 0)
            or k_type == 'f' and (k <= 0 or k >= 1)):
        raise ValueError(
            f'k={k} should be either positive and smaller'
            f' than the number of samples {n_samples} or a float in the '
            f'(0, 1) range')

    if k_type not in ('i', 'f'):
        raise ValueError(f'Invalid value for k: {k_type}')

    if strategy == 'overall':
        n_size = int(n_samples * k) if k_type == 'f' else k

        # ToDo: _checker_ there are observations among two groups among first k
        top_y = y_true[order][:n_size]
        top_treatment = treatment[order][:n_size]
        score_ctrl = top_y[top_treatment == 0].mean()
        score_trmnt = top_y[top_treatment == 1].mean()

    else:  # strategy == 'by_group':
        if k_type == 'f':
            n_ctrl = int((treatment == 0).sum() * k)
            n_trmnt = int((treatment == 1).sum() * k)
        else:
            n_ctrl = k
            n_trmnt = k

        if n_ctrl > n_samples_ctrl:
            raise ValueError(
                f'With k={k}, the number of the first k observations'
                ' bigger than the number of samples '
                f'in the control group: {n_samples_ctrl}')
        if n_trmnt > n_samples_trmnt:
            # Report the size of the treatment group here, not the control
            # group's.
            raise ValueError(
                f'With k={k}, the number of the first k observations'
                ' bigger than the number of samples '
                f'in the treatment group: {n_samples_trmnt}')

        score_ctrl = y_true[order][treatment[order] == 0][:n_ctrl].mean()
        score_trmnt = y_true[order][treatment[order] == 1][:n_trmnt].mean()

    return score_trmnt - score_ctrl
def _check_reg_targets(y_true, y_pred, multioutput):
    """Validate a pair of regression targets and the ``multioutput`` argument.

    Parameters
    ----------
    y_true : array-like,

    y_pred : array-like,

    multioutput : array-like or string in ['raw_values', 'uniform_average',
        'variance_weighted'] or None
        None is accepted due to backward compatibility of r2_score().

    Returns
    -------
    type_true : one of {'continuous', 'continuous-multioutput'}
        'continuous' when there is a single output column, otherwise
        'continuous-multioutput'.

    y_true : array-like of shape = (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape = (n_samples, n_outputs)
        Estimated target values.

    multioutput : array-like of shape = (n_outputs) or string in ['raw_values',
        'uniform_average', 'variance_weighted'] or None
        Custom output weights if ``multioutput`` is array-like or
        just the corresponding argument if ``multioutput`` is a
        correct keyword.
    """
    check_consistent_length(y_true, y_pred)
    y_true = check_array(y_true, ensure_2d=False)
    y_pred = check_array(y_pred, ensure_2d=False)

    # Work with column matrices throughout.
    if y_true.ndim == 1:
        y_true = y_true.reshape((-1, 1))
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape((-1, 1))

    n_outputs = y_true.shape[1]
    n_pred_outputs = y_pred.shape[1]
    if n_outputs != n_pred_outputs:
        raise ValueError("y_true and y_pred have different number of output "
                         "({0}!={1})".format(n_outputs, n_pred_outputs))

    allowed_multioutput_str = ('raw_values', 'uniform_average',
                               'variance_weighted')
    if isinstance(multioutput, string_types):
        if multioutput not in allowed_multioutput_str:
            raise ValueError("Allowed 'multioutput' string values are {}. "
                             "You provided multioutput={!r}".format(
                                 allowed_multioutput_str, multioutput))
    elif multioutput is not None:
        multioutput = check_array(multioutput, ensure_2d=False)
        if n_outputs == 1:
            raise ValueError("Custom weights are useful only in "
                             "multi-output cases.")
        n_weights = len(multioutput)
        if n_outputs != n_weights:
            raise ValueError(
                ("There must be equally many custom weights "
                 "(%d) as outputs (%d).") % (n_weights, n_outputs))

    if n_outputs == 1:
        y_type = 'continuous'
    else:
        y_type = 'continuous-multioutput'

    return y_type, y_true, y_pred, multioutput
def response_rate_by_percentile(y_true,
                                uplift,
                                treatment,
                                group,
                                strategy='overall',
                                bins=10):
    """Compute response rate (target mean in the control or treatment group) at each percentile.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.
        group (string, ['treatment', 'control']): Group type for computing response rate: treatment or control.

            * ``'treatment'``:
                Values equal 1 in the treatment column.

            * ``'control'``:
                Values equal 0 in the treatment column.

        strategy (string, ['overall', 'by_group']): Determines the calculating strategy. Default is 'overall'.

            * ``'overall'``:
                The data, ordered by uplift prediction, is split into bins first; inside each bin only the
                observations of the requested group are used.

            * ``'by_group'``:
                Only the observations of the requested group, ordered by uplift prediction, are split into bins.

        bins (int): Determines the number of bins (and relative percentile) in the data. Default is 10.

    Returns:
        array (shape = [>2]), array (shape = [>2]), array (shape = [>2]):
        response rate at each percentile for control or treatment group,
        variance of the response rate at each percentile,
        group size at each percentile.
    """

    group_types = ['treatment', 'control']
    strategy_methods = ['overall', 'by_group']

    n_samples = len(y_true)
    check_consistent_length(y_true, uplift, treatment)

    if group not in group_types:
        raise ValueError(
            f'Response rate supports only group types in {group_types},'
            f' got {group}.')

    if strategy not in strategy_methods:
        raise ValueError(
            f'Response rate supports only calculating methods in {strategy_methods},'
            f' got {strategy}.')

    if not isinstance(bins, int) or bins <= 0:
        raise ValueError(
            f'Bins should be positive integer. Invalid value bins: {bins}')

    if bins >= n_samples:
        raise ValueError(
            f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}'
        )

    y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(
        treatment)
    # Stable sort, then reversed: highest predicted uplift first.
    order = np.argsort(uplift, kind='mergesort')[::-1]

    trmnt_flag = 1 if group == 'treatment' else 0

    if strategy == 'overall':
        y_chunks = np.array_split(y_true[order], bins)
        trmnt_chunks = np.array_split(treatment[order], bins)
        # Keep only the requested group inside every bin.
        per_bin_targets = [
            y_chunk[trmnt_chunk == trmnt_flag]
            for y_chunk, trmnt_chunk in zip(y_chunks, trmnt_chunks)
        ]
    else:  # strategy == 'by_group'
        per_bin_targets = np.array_split(
            y_true[order][treatment[order] == trmnt_flag], bins)

    group_size = np.array([len(targets) for targets in per_bin_targets])
    response_rate = np.array([np.mean(targets) for targets in per_bin_targets])

    variance = np.multiply(response_rate,
                           np.divide((1 - response_rate), group_size))

    return response_rate, variance, group_size
def _daal_fit(self, X, y):
    """Train the forest with daal4py's decision forest regression.

    Validates ``X`` and ``y``, configures a
    ``daal4py.decision_forest_regression_training`` algorithm from the
    estimator's parameters, runs it and stores the resulting model in
    ``self.daal_model_``. When ``self.oob_score`` is set, the out-of-bag
    score is computed afterwards.

    Returns
    -------
    self : object
        Returns self.
    """
    self._check_daal_supported_parameters()
    _supported_dtypes_ = [np.double, np.single]
    X = check_array(X, dtype=_supported_dtypes_)

    y = np.atleast_1d(np.asarray(y))
    if y.ndim == 2 and y.shape[1] == 1:
        warnings.warn("A column-vector y was passed when a 1d array was"
                      " expected. Please change the shape of y to "
                      "(n_samples,), for example using ravel().",
                      DataConversionWarning, stacklevel=2)

    y = check_array(y, ensure_2d=False, dtype=X.dtype)
    check_consistent_length(X, y)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]
    self.n_features_ = X.shape[1]

    rs_ = check_random_state(self.random_state)

    if self.oob_score and not self.bootstrap:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    X_fptype = getFPType(X)
    seed_ = rs_.randint(0, np.iinfo('i').max)
    daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)

    _featuresPerNode = _to_absolute_max_features(self.max_features,
                                                 X.shape[1],
                                                 is_classification=False)

    # None means "no limit" / "no threshold", which the algorithm takes as 0.
    max_tree_depth = 0 if self.max_depth is None else self.max_depth
    impurity_threshold = (0.0 if self.min_impurity_split is None
                          else self.min_impurity_split)

    # create algorithm
    dfr_algorithm = daal4py.decision_forest_regression_training(
        fptype=X_fptype,
        method='defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=1,
        featuresPerNode=int(_featuresPerNode),
        maxTreeDepth=int(max_tree_depth),
        minObservationsInLeafNode=1,
        engine=daal_engine,
        impurityThreshold=float(impurity_threshold),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap)
    )

    self._cached_estimators_ = None

    # get resulting model
    self.daal_model_ = dfr_algorithm.compute(X, y).model

    # compute oob_score_
    if self.oob_score:
        self._set_oob_score(X, y)

    return self
def uplift_by_percentile(y_true,
                         uplift,
                         treatment,
                         strategy='overall',
                         bins=10,
                         std=False,
                         total=False):
    """Compute metrics: uplift, group size, group response rate, standard deviation at each percentile.

    Metrics in columns and percentiles in rows of pandas DataFrame:

        - ``n_treatment``, ``n_control`` - group sizes.
        - ``response_rate_treatment``, ``response_rate_control`` - group response rates.
        - ``uplift`` - treatment response rate substract control response rate.
        - ``std_treatment``, ``std_control`` - (optional) response rates standard deviation.
        - ``std_uplift`` - (optional) uplift standard deviation.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.
        strategy (string, ['overall', 'by_group']): Determines the calculating strategy. Default is 'overall'.

            * ``'overall'``:
                The first step is taking the first k observations of all test data ordered by uplift prediction
                (overall both groups - control and treatment) and conversions in treatment and control groups
                calculated only on them. Then the difference between these conversions is calculated.

            * ``'by_group'``:
                Separately calculates conversions in top k observations in each group (control and treatment)
                sorted by uplift predictions. Then the difference between these conversions is calculated

        std (bool): If True, add columns with the uplift standard deviation and the response rate standard deviation.
            Default is False.
        total (bool): If True, add the last row with the total values. Default is False.
            The total uplift is a weighted average uplift. See :func:`.weighted_average_uplift`.
            The total response rate is a response rate on the full data amount.
        bins (int): Determines the number of bins (and the relative percentile) in the data. Default is 10.

    Returns:
        pandas.DataFrame: DataFrame where metrics are by columns and percentiles are by rows.
    """

    strategy_methods = ['overall', 'by_group']

    n_samples = len(y_true)
    check_consistent_length(y_true, uplift, treatment)

    if strategy not in strategy_methods:
        raise ValueError(
            f'Response rate supports only calculating methods in {strategy_methods},'
            f' got {strategy}.')

    if not isinstance(total, bool):
        raise ValueError(f'Flag total should be bool: True or False.'
                         f' Invalid value total: {total}')

    if not isinstance(std, bool):
        raise ValueError(f'Flag std should be bool: True or False.'
                         f' Invalid value std: {std}')

    if not isinstance(bins, int) or bins <= 0:
        raise ValueError(f'Bins should be positive integer.'
                         f' Invalid value bins: {bins}')

    if bins >= n_samples:
        raise ValueError(
            f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}'
        )

    y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(
        treatment)

    # Per-percentile statistics for each of the two groups.
    response_rate_trmnt, variance_trmnt, n_trmnt = response_rate_by_percentile(
        y_true,
        uplift,
        treatment,
        group='treatment',
        strategy=strategy,
        bins=bins)

    response_rate_ctrl, variance_ctrl, n_ctrl = response_rate_by_percentile(
        y_true,
        uplift,
        treatment,
        group='control',
        strategy=strategy,
        bins=bins)

    uplift_scores = response_rate_trmnt - response_rate_ctrl
    uplift_variance = variance_trmnt + variance_ctrl

    percentiles = [round(p * 100 / bins, 1) for p in range(1, bins + 1)]

    df = pd.DataFrame({
        'percentile': percentiles,
        'n_treatment': n_trmnt,
        'n_control': n_ctrl,
        'response_rate_treatment': response_rate_trmnt,
        'response_rate_control': response_rate_ctrl,
        'uplift': uplift_scores
    })

    if total:
        # A single bin over the whole data gives the totals of each group.
        response_rate_trmnt_total, _, n_trmnt_total = response_rate_by_percentile(
            y_true,
            uplift,
            treatment,
            strategy=strategy,
            group='treatment',
            bins=1)

        response_rate_ctrl_total, _, n_ctrl_total = response_rate_by_percentile(
            y_true, uplift, treatment, strategy=strategy, group='control', bins=1)

        weighted_avg_uplift = 1 / n_trmnt_total * np.dot(
            n_trmnt, uplift_scores)

        total_row = [
            'total', n_trmnt_total, n_ctrl_total, response_rate_trmnt_total,
            response_rate_ctrl_total, weighted_avg_uplift
        ]
        df.loc[-1, :] = total_row

    if std:
        std_columns = {
            'std_treatment': np.sqrt(variance_trmnt),
            'std_control': np.sqrt(variance_ctrl),
            'std_uplift': np.sqrt(uplift_variance),
        }
        for column, values in std_columns.items():
            if total:
                # The value for the 'total' row is the sum over the percentiles.
                values = np.append(values, np.sum(values))
            df.loc[:, column] = values

    indexed = df.set_index('percentile', drop=True, inplace=False)
    df = indexed.astype({'n_treatment': 'int32', 'n_control': 'int32'})

    return df
def cv_split(cv, X, y, groups, is_pairwise, cache):
    """Materialize the splits of ``cv`` and wrap them in a ``CVCache``.

    ``X``, ``y`` and ``groups`` are first passed through
    ``check_consistent_length``. The iterator returned by
    ``cv.split(X, y, groups)`` is then fully consumed into a list, which is
    handed to ``CVCache`` together with ``is_pairwise`` and ``cache``.
    """
    check_consistent_length(X, y, groups)
    # Exhaust the splitter up front so the splits can be reused.
    splits = list(cv.split(X, y, groups))
    return CVCache(splits, is_pairwise, cache)
def _check_sample_weight(sample_weight, X, dtype=None):
    """Check that ``sample_weight`` and ``X`` have consistent lengths.

    Parameters
    ----------
    sample_weight : object
        Sample weights; passed to ``check_consistent_length`` alongside ``X``.
    X : object
        Input data the weights are compared against.
    dtype : object, optional
        Accepted but not used by this implementation.

    Returns
    -------
    sample_weight : object
        The ``sample_weight`` argument, returned as passed in. No conversion
        or default construction is performed here.
    """
    # Any error raised by check_consistent_length propagates to the caller.
    check_consistent_length(sample_weight, X)
    return sample_weight
def kaplan_meier_estimator(event, time_exit, time_enter=None, time_min=None):
    """Kaplan-Meier estimator of survival function.

    Parameters
    ----------
    event : array-like, shape = (n_samples,)
        Contains binary event indicators.

    time_exit : array-like, shape = (n_samples,)
        Contains event/censoring times.

    time_enter : array-like, shape = (n_samples,), optional
        Contains time when each individual entered the study for
        left truncated survival data.

    time_min : float, optional
        Compute estimator conditional on survival at least up to
        the specified time. Only unique times greater than or equal to
        `time_min` are kept.

    Returns
    -------
    time : array, shape = (n_times,)
        Unique times.

    prob_survival : array, shape = (n_times,)
        Survival probability at each unique time point.
        If `time_enter` is provided, estimates are conditional probabilities.

    Examples
    --------
    Creating a Kaplan-Meier curve:

    >>> x, y = kaplan_meier_estimator(event, time)
    >>> plt.step(x, y, where="post")
    >>> plt.ylim(0, 1)
    >>> plt.show()

    References
    ----------
    .. [1] Kaplan, E. L. and Meier, P., "Nonparametric estimation from
           incomplete observations", Journal of The American Statistical
           Association, vol. 53, pp. 457-481, 1958.
    """
    event, time_enter, time_exit = check_y_survival(
        event, time_enter, time_exit)
    check_consistent_length(event, time_enter, time_exit)

    if time_enter is None:
        uniq_times, n_events, n_at_risk = _compute_counts(event, time_exit)
    else:
        uniq_times, n_events, n_at_risk = _compute_counts_truncated(
            event, time_enter, time_exit)

    # The risk set may be empty at a time point (presumably only with left
    # truncation). A plain division would then evaluate 0 / 0 = NaN, and the
    # cumulative product below would carry that NaN to every later time.
    # Divide only where events occurred; elsewhere the hazard is 0.
    ratio = numpy.divide(
        n_events,
        n_at_risk,
        out=numpy.zeros(numpy.shape(n_events), dtype=float),
        where=numpy.not_equal(n_events, 0),
    )
    values = 1.0 - ratio

    if time_min is not None:
        mask = uniq_times >= time_min
        uniq_times = numpy.compress(mask, uniq_times)
        values = numpy.compress(mask, values)

    y = numpy.cumprod(values)
    return uniq_times, y
def group_predict(train, test, labels, *, K=20, mu=0.4, t=20):
    """
    Propagates `labels` from `train` data to `test` data via SNF

    Parameters
    ----------
    train : `m`-list of (S1, F) array_like
        Input subject x feature training data. Subjects in these data sets
        should have been previously labelled (see: `labels`).
    test : `m`-list of (S2, F) array_like
        Input subject x feature testing data. These should be similar to the
        data in `train` (though the first dimension can differ). Labels will be
        propagated to these subjects.
    labels : (S1,) array_like
        Cluster labels for `S1` subjects in `train` data sets. These could have
        been obtained from some ground-truth labelling or via a previous
        iteration of SNF with only the `train` data (e.g., the output of
        :py:func:`sklearn.cluster.spectral_clustering` would be appropriate).
    K : (0, N) int, optional
        Hyperparameter normalization factor for scaling. See `Notes` of
        `snf.affinity_matrix` for more details. Default: 20
    mu : (0, 1) float, optional
        Hyperparameter normalization factor for scaling. See `Notes` of
        `snf.affinity_matrix` for more details. Default: 0.4
    t : int, optional
        Number of iterations to perform information swapping during SNF.
        Default: 20

    Returns
    -------
    predicted_labels : (S2,) np.ndarray
        Cluster labels for subjects in `test` assigning to groups in `labels`

    Raises
    ------
    ValueError
        If `train` and `test` hold a different number of data types, if any
        training data set does not have one subject per label, or if a
        train/test pair differs in its number of features.
    """
    # check inputs are legit
    try:
        check_consistent_length(train, test)
    except ValueError as err:
        raise ValueError('Training and testing set must have same number of '
                         'data types.') from err

    # array_like labels (e.g. plain lists) must support element-wise `==`
    labels = np.asarray(labels)
    if not all(len(labels) == len(tr) for tr in train):
        raise ValueError('Training data must have the same number of subjects '
                         'as provided labels.')

    # generate affinity matrices for stacked train/test data sets
    affinities = []
    for tr, te in zip(train, test):
        try:
            # transposing makes the length check compare feature counts
            check_consistent_length(tr.T, te.T)
        except ValueError as err:
            raise ValueError('Train and test data must have same number of '
                             'features for each data type. Make sure to '
                             'supply data types in the same order.') from err
        # np.vstack is the non-deprecated spelling of np.row_stack
        affinities.append(make_affinity(np.vstack([tr, te]), K=K, mu=mu))

    # fuse with SNF
    fused_aff = snf(*affinities, K=K, t=t)

    # get unique groups in training data and generate array to hold all labels
    groups = np.unique(labels)
    all_labels = np.zeros((len(fused_aff), groups.size))

    # reassign training labels to all_labels array (one-hot per group)
    for col, group in enumerate(groups):
        all_labels[np.where(labels == group)[0], col] = 1

    # propagate labels from train data to test data using SNF fused array
    propagated_labels = _label_prop(fused_aff, all_labels, t=1000)

    # rows after the first len(train[0]) correspond to the stacked test data
    predicted_labels = groups[propagated_labels[len(train[0]):].argmax(axis=1)]

    return predicted_labels
def resample(*arrays, **options):
    """Resample arrays or sparse matrices in a consistent way

    The default strategy implements one step of the bootstrapping
    procedure.

    Parameters
    ----------
    *arrays : sequence of indexable data-structures
        Indexable data-structures can be arrays, lists, dataframes or scipy
        sparse matrices with consistent first dimension.

    Other Parameters
    ----------------
    replace : boolean, True by default
        Implements resampling with replacement. If False, this will implement
        (sliced) random permutations.

    n_samples : int, None by default
        Number of samples to generate. If left to None this is
        automatically set to the first dimension of the arrays.
        If replace is False it should not be larger than the length of
        arrays.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data.  If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by `np.random`.

    stratify : array-like or None (default=None)
        If not None, data is split in a stratified fashion, using this as
        the class labels. The value is converted with ``np.asarray``, so
        plain (nested) lists are accepted as well as arrays.

    Returns
    -------
    resampled_arrays : sequence of indexable data-structures
        Sequence of resampled copies of the collections. The original arrays
        are not impacted. A single array is returned directly rather than in
        a one-element sequence; ``None`` is returned when no arrays are given.

    Raises
    ------
    ValueError
        If an unexpected keyword argument is passed, or if ``n_samples``
        exceeds the length of the arrays while ``replace`` is False.

    Examples
    --------
    It is possible to mix sparse and dense arrays in the same run::

      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
      >>> y = np.array([0, 1, 2])

      >>> from scipy.sparse import coo_matrix
      >>> X_sparse = coo_matrix(X)

      >>> from sklearn.utils import resample
      >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
      >>> X
      array([[1., 0.],
             [2., 1.],
             [1., 0.]])

      >>> X_sparse                   # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
      <3x2 sparse matrix of type '<... 'numpy.float64'>'
          with 4 stored elements in Compressed Sparse Row format>

      >>> X_sparse.toarray()
      array([[1., 0.],
             [2., 1.],
             [1., 0.]])

      >>> y
      array([0, 1, 0])

      >>> resample(y, n_samples=2, random_state=0)
      array([0, 1])


    See also
    --------
    :func:`sklearn.utils.shuffle`
    """
    random_state = check_random_state(options.pop('random_state', None))
    replace = options.pop('replace', True)
    max_n_samples = options.pop('n_samples', None)
    stratify = options.pop('stratify', None)
    if options:
        raise ValueError("Unexpected kw arguments: %r" % options.keys())

    if not arrays:
        return None

    first = arrays[0]
    n_samples = first.shape[0] if hasattr(first, 'shape') else len(first)

    if max_n_samples is None:
        max_n_samples = n_samples
    elif (max_n_samples > n_samples) and (not replace):
        raise ValueError("Cannot sample %d out of arrays with dim %d "
                         "when replace is False" % (max_n_samples,
                                                    n_samples))

    check_consistent_length(*arrays)

    if stratify is None:
        if replace:
            indices = random_state.randint(0, n_samples,
                                           size=(max_n_samples,))
        else:
            indices = np.arange(n_samples)
            random_state.shuffle(indices)
            indices = indices[:max_n_samples]
    else:
        # Code adapted from StratifiedShuffleSplit()
        # Coerce to ndarray first: `stratify` is documented as array-like,
        # and a plain list has neither `.ndim` nor row-wise `.astype`.
        y = np.asarray(stratify)
        if y.ndim == 2:
            # for multi-label y, map each distinct row to a string repr
            # using join because str(row) uses an ellipsis if len(row) > 1000
            y = np.array([' '.join(row.astype('str')) for row in y])

        classes, y_indices = np.unique(y, return_inverse=True)
        n_classes = classes.shape[0]

        class_counts = np.bincount(y_indices)

        # Find the sorted list of instances for each class:
        # (np.unique above performs a sort, so code is O(n logn) already)
        class_indices = np.split(np.argsort(y_indices, kind='mergesort'),
                                 np.cumsum(class_counts)[:-1])

        # if there are ties in the class-counts, we want
        # to make sure to break them anew in each iteration
        n_i = _approximate_mode(class_counts, max_n_samples, random_state)

        # Draw per class in class order so the random stream is consumed
        # deterministically for a given random_state.
        indices = []
        for i in range(n_classes):
            indices_i = random_state.choice(class_indices[i], n_i[i],
                                            replace=replace)
            indices.extend(indices_i)

        indices = random_state.permutation(indices)

    # convert sparse matrices to CSR for row-based indexing
    arrays = [a.tocsr() if issparse(a) else a for a in arrays]
    resampled_arrays = [safe_indexing(a, indices) for a in arrays]
    if len(resampled_arrays) == 1:
        # syntactic sugar for the unit argument case
        return resampled_arrays[0]
    else:
        return resampled_arrays
def _daal_check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True,
                    dtype="numeric", order=None, copy=False,
                    force_all_finite=True, ensure_2d=True, allow_nd=False,
                    multi_output=False, ensure_min_samples=1,
                    ensure_min_features=1, y_numeric=False, estimator=None):
    """Validate a feature matrix ``X`` together with its targets ``y``.

    ``X`` is validated by ``_daal_check_array`` using the options given here.
    ``y`` is validated either as a vector (``column_or_1d`` followed by a
    finiteness check) or, when ``multi_output`` is True, as an array or CSR
    sparse matrix that must contain only finite values. Finally the lengths
    of ``X`` and ``y`` are checked for consistency.

    Parameters
    ----------
    X : nd-array, list or sparse matrix
        Input data.

    y : nd-array, list or sparse matrix
        Labels. Must not be None.

    accept_sparse : string, boolean or list of string (default=False)
        Allowed sparse matrix format(s) for X, such as 'csc' or 'csr'.
        A sparse input in a format that is not allowed is converted to the
        first listed format. True allows any format; False makes sparse
        input an error.

    accept_large_sparse : bool (default=True)
        Forwarded to the validation of X. Relates to accepting CSR, CSC,
        COO or BSR matrices only when their indices use a 32-bit dtype.

    dtype : string, type, list of types or None (default="numeric")
        Data type of the validated X. None preserves the input dtype;
        "numeric" preserves it unless it is object. With a list of types,
        conversion to the first type happens only when the input dtype is
        not in the list.

    order : 'F', 'C' or None (default=None)
        Whether X is forced to be fortran- or c-style.

    copy : boolean (default=False)
        Whether a copy of X is forced. Even with copy=False a conversion
        may trigger a copy.

    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise on np.inf, np.nan, pd.NA in X. Has no influence on
        the check applied to y.

        - True: all values of X must be finite.
        - False: np.inf, np.nan and pd.NA are accepted in X.
        - 'allow-nan': only np.nan or pd.NA are accepted in X; values
          cannot be infinite.

    ensure_2d : boolean (default=True)
        Whether to raise a value error if X is not 2D.

    allow_nd : boolean (default=False)
        Whether to allow X.ndim > 2.

    multi_output : boolean (default=False)
        Whether to allow 2D y (array or sparse matrix). If False, y is
        validated as a vector.

    ensure_min_samples : int (default=1)
        Minimum number of samples required in the first axis of X.

    ensure_min_features : int (default=1)
        Minimum number of features (columns) required in a 2D X. The
        default of 1 rejects empty datasets; 0 disables the check.

    y_numeric : boolean (default=False)
        Whether to ensure that y has a numeric type: a y with object dtype
        is converted to float64. Should only be used for regression
        algorithms.

    estimator : str or estimator instance (default=None)
        If passed, the estimator name is included in warning messages.

    Returns
    -------
    X_converted : object
        The converted and validated X.

    y_converted : object
        The converted and validated y.

    Raises
    ------
    ValueError
        If ``y`` is None.
    """
    if y is None:
        raise ValueError("y cannot be None")

    X = _daal_check_array(
        X,
        accept_sparse=accept_sparse,
        accept_large_sparse=accept_large_sparse,
        dtype=dtype,
        order=order,
        copy=copy,
        force_all_finite=force_all_finite,
        ensure_2d=ensure_2d,
        allow_nd=allow_nd,
        ensure_min_samples=ensure_min_samples,
        ensure_min_features=ensure_min_features,
        estimator=estimator,
    )

    if not multi_output:
        # Single-output targets: flatten to 1D, then reject non-finite values.
        y = column_or_1d(y, warn=True)
        _daal_assert_all_finite(y)
    else:
        # Multi-output targets may be 2D or CSR-sparse; dtype is left as is.
        y = _daal_check_array(y, accept_sparse='csr', force_all_finite=True,
                              ensure_2d=False, dtype=None)

    has_object_dtype = hasattr(y, 'dtype') and y.dtype.kind == 'O'
    if y_numeric and has_object_dtype:
        y = y.astype(np.float64)

    check_consistent_length(X, y)

    return X, y
def weighted_average_uplift(y_true, uplift, treatment,
                            strategy='overall', bins=10):
    """Weighted average uplift.

    Averages the per-percentile uplift (treatment response rate minus control
    response rate), weighting each percentile by the size of its treatment
    group.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.
        strategy (string, ['overall', 'by_group']): Determines the calculating
            strategy. Default is 'overall'.

            * ``'overall'``:
                The first step is taking the first k observations of all test
                data ordered by uplift prediction (overall both groups -
                control and treatment) and conversions in treatment and
                control groups calculated only on them. Then the difference
                between these conversions is calculated.
            * ``'by_group'``:
                Separately calculates conversions in top k observations in
                each group (control and treatment) sorted by uplift
                predictions. Then the difference between these conversions is
                calculated.
        bins (int): Determines the number of bins (and the relative
            percentile) in the data. Default is 10.

    Returns:
        float: Weighted average uplift.

    Raises:
        ValueError: If ``strategy`` is not a supported method, if ``bins`` is
            not a positive integer, or if ``bins`` is not smaller than the
            number of samples.
    """
    strategy_methods = ['overall', 'by_group']

    n_samples = len(y_true)
    check_consistent_length(y_true, uplift, treatment)

    if strategy not in strategy_methods:
        raise ValueError(
            f'Response rate supports only calculating methods in {strategy_methods},'
            f' got {strategy}.')

    bins_is_positive_int = isinstance(bins, int) and bins > 0
    if not bins_is_positive_int:
        raise ValueError(f'Bins should be positive integer.'
                         f' Invalid value bins: {bins}')

    if n_samples <= bins:
        raise ValueError(
            f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}'
        )

    # Both groups are evaluated with the same strategy and binning.
    shared = dict(strategy=strategy, bins=bins)
    rate_trmnt, _, n_trmnt = response_rate_by_percentile(
        y_true, uplift, treatment, group='treatment', **shared)
    rate_ctrl, _, _ = response_rate_by_percentile(
        y_true, uplift, treatment, group='control', **shared)

    uplift_scores = rate_trmnt - rate_ctrl

    return np.dot(n_trmnt, uplift_scores) / np.sum(n_trmnt)
def gap_train_test_split(*arrays, **options):
    """Split arrays or matrices into random train and test subsets (with a gap)

    The leading samples form the training set, the trailing samples form the
    test set, and ``gap_size`` samples in between are dropped.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    gap_size : float or int, default=0
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset between the training and the test set. If int,
        represents the absolute number of the dropped samples.

    test_size : float, int, or None, default=None
        If float, should be between 0.0 and 1.0 and equal to
        test / (train + test). If int, represents the absolute number of
        test samples. If None, the value is set to the complement of the
        train size and the gap. If `train_size` is also None,
        it will be set to 0.25.

    train_size : float, int, or None, default=None
        If float, should be between 0.0 and 1.0 and equal to
        train / (train + test). If int, represents the absolute number of
        train samples. If None, the value is automatically set to
        the complement of the test size and the gap size.
        Ignored (with a warning) when `test_size` is also given.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    Raises
    ------
    ValueError
        If no arrays are passed.
    TypeError
        If `gap_size` is not a real number or an unknown keyword is passed.

    Examples
    --------
    >>> import numpy as np
    >>> from tscv import gap_train_test_split
    >>> X, y = np.arange(10).reshape((5, 2)), range(5)
    >>> X
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    >>> list(y)
    [0, 1, 2, 3, 4]
    >>> X_train, X_test, y_train, y_test = gap_train_test_split(
    ...     X, y, test_size=0.33, gap_size=1)
    ...
    >>> X_train
    array([[0, 1],
           [2, 3],
           [4, 5]])
    >>> y_train
    [0, 1, 2]
    >>> X_test
    array([[8, 9]])
    >>> y_test
    [4]
    >>> gap_train_test_split(list(range(10)), gap_size=0.1)
    [[0, 1, 2, 3, 4, 5, 6], [8, 9]]
    """
    if not arrays:
        raise ValueError("At least one array required as input")
    check_consistent_length(*arrays)

    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    gap_size = options.pop('gap_size', 0)
    if not isinstance(gap_size, numbers.Real):
        raise TypeError("The gap size should be a real number.")

    # Anything still left in `options` is an unrecognized keyword.
    if options:
        raise TypeError("Invalid parameters passed: %s. \n"
                        "Check the spelling of keyword parameters."
                        % str(options))

    arrays = indexable(*arrays)
    n_samples = _num_samples(arrays[0])

    def size_to_number(size, n):
        # The integer part is read as an absolute count, the fractional part
        # as a proportion of `n`; the larger of the two wins.
        fraction, whole = modf(size)
        return int(max(whole, round(fraction * n)))

    n_gap = size_to_number(gap_size, n_samples)
    n_remain = n_samples - n_gap

    if test_size is None and train_size is None:
        test_size = 0.25

    if test_size is None:
        # Only train_size was supplied: the test set takes what is left.
        n_train = size_to_number(train_size, n_remain)
    else:
        if train_size is not None:
            warnings.warn(
                "The train_size argument is overridden by test_size; "
                "in case of nonzero gap_size, "
                "an explicit value should be provided "
                "and cannot be implied by 1 - train_size - test_size.",
                Warning)
        n_test = size_to_number(test_size, n_remain)
        n_train = n_remain - n_test

    train = np.arange(n_train)
    test = np.arange(n_train + n_gap, n_samples)

    # Emit train part then test part for each input, in input order.
    splitting = []
    for a in arrays:
        splitting.append(_safe_indexing(a, train))
        splitting.append(_safe_indexing(a, test))
    return splitting