def transform(self, X):
    """Clip ``X`` to quantile-derived bounds, then rescale it linearly.

    The bounds are built from the quantiles of ``X`` itself, widened by the
    whisker-to-IQR ratios recorded by ``fit``. After clipping, the data is
    mapped so that the lower bound lands on ``self.min_`` and the upper
    bound on ``self.max_``.

    Parameters
    ----------
    X : array-like
        The data to scale. Sparse input is not handled yet (see TODO).

    Returns
    -------
    X_scaled : array
        The clipped and rescaled data.
    """
    check_is_fitted(self, 'iqr_', 'max_')
    X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                    force_all_finite=True)

    # TODO sparse data
    # How far the training extremes sat beyond the training quantiles,
    # expressed in units of the training inter-quantile range.
    upper_ratio = (self.max_ - self.q_upper_) / self.iqr_
    lower_ratio = (self.q_lower_ - self.min_) / self.iqr_

    q_lo, q_hi = nanpercentile(X, (self.q_lower, self.q_upper))
    spread = _handle_zeros_in_scale(q_hi - q_lo, copy=False)

    hi = q_hi + upper_ratio * spread
    lo = q_lo - lower_ratio * spread

    # Never let the lower bound fall below the smallest observed value.
    observed_min = np.nanmin(X)
    lo = observed_min if lo < observed_min else lo

    X[X > hi] = hi
    X[X < lo] = lo

    unit = (X - lo) / (hi - lo)
    return unit * (self.max_ - self.min_) + self.min_
def fit(self, X, y=None):
    """Record the quantiles, inter-quantile range and extremes of ``X``.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        Training data. All statistics are taken over the whole array
        (no ``axis`` is passed to the NumPy reductions).
    y : ignored

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``0 <= q_lower <= q_upper <= 100`` does not hold.
    """
    X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                    force_all_finite=True)

    lower_q, upper_q = self.q_lower, self.q_upper
    if not (0 <= lower_q <= upper_q <= 100):
        raise ValueError(
            "Invalid quantile parameter values: q_lower %s, q_upper: %s"
            % (str(lower_q), str(upper_q)))

    # TODO sparse data
    bounds = np.nanpercentile(X, (lower_q, upper_q))
    low, high = bounds[0], bounds[1]

    self.q_lower_ = low
    self.q_upper_ = high
    self.iqr_ = _handle_zeros_in_scale(high - low, copy=False)
    self.max_ = np.nanmax(X)
    self.min_ = np.nanmin(X)
    return self
def fit(self, X, y=None):
    """Learn the quantiles, inter-quantile range, minimum and maximum.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        Training data; statistics are computed over the flattened array.
    y : ignored

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the configured quantiles are not ordered within ``[0, 100]``.
    """
    X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                    force_all_finite=True)

    q_pair = (self.q_lower, self.q_upper)
    if not (0 <= q_pair[0] <= q_pair[1] <= 100):
        raise ValueError(
            "Invalid quantile parameter values: q_lower %s, q_upper: %s"
            % (str(q_pair[0]), str(q_pair[1])))

    # TODO sparse data
    cut_points = nanpercentile(X, q_pair)
    width = cut_points[1] - cut_points[0]

    self.q_lower_ = cut_points[0]
    self.q_upper_ = cut_points[1]
    self.iqr_ = _handle_zeros_in_scale(width, copy=False)
    self.max_ = np.nanmax(X)
    self.min_ = np.nanmin(X)
    return self
def transform(self, X):
    """Clip ``X`` to bounds derived from its own quantiles and rescale it.

    The quantiles of ``X`` are extended by the fitted
    ``(max_ - q_upper_) / iqr_`` and ``(q_lower_ - min_) / iqr_`` ratios to
    obtain an upper and a lower bound. Values outside the bounds are
    clipped, and the result is mapped linearly onto
    ``[self.min_, self.max_]``.

    Parameters
    ----------
    X : array-like
        The data to scale. Sparse input is not handled yet (see TODO).

    Returns
    -------
    X_scaled : array
        The clipped and rescaled data.
    """
    check_is_fitted(self, 'iqr_', 'max_')
    X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                    force_all_finite=True)

    # TODO sparse data
    fitted_iqr = self.iqr_
    above = (self.max_ - self.q_upper_) / fitted_iqr
    below = (self.q_lower_ - self.min_) / fitted_iqr

    cuts = np.nanpercentile(X, (self.q_lower, self.q_upper))
    local_iqr = _handle_zeros_in_scale(cuts[1] - cuts[0], copy=False)

    ceiling = cuts[1] + above * local_iqr
    floor = cuts[0] - below * local_iqr

    # The lower bound is never allowed below the smallest observed value.
    smallest = np.nanmin(X)
    if floor < smallest:
        floor = smallest

    too_high = X > ceiling
    X[too_high] = ceiling
    too_low = X < floor
    X[too_low] = floor

    fraction = (X - floor) / (ceiling - floor)
    return fraction * (self.max_ - self.min_) + self.min_
def fit(self, X, y=None):
    """Compute the weighted mean and std to be used for later scaling.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        Input handed to ``weighted_data``, which returns the data and its
        per-sample weights.
    y : Passthrough for ``Pipeline`` compatibility.

    Returns
    -------
    self
    """
    # Start from a clean slate so stale statistics never leak through.
    self._reset()

    X, weights = weighted_data(X)
    stats = DescrStatsW(X, weights=weights, ddof=0)

    self.mean_ = stats.mean  # weighted mean
    self.var_ = stats.var    # weighted variance, computed with ddof=0
    self.n_samples_seen_ = sum(weights)

    self.scale_ = (_handle_zeros_in_scale(np.sqrt(self.var_))
                   if self.with_std else None)
    return self
def scale(x, data_mean, data_std):
    """Mean/variance scaling.

    Subtract ``data_mean`` from ``x`` and divide by ``data_std``. The
    standard deviation is first passed through
    ``_handle_zeros_in_scale(..., copy=False)``.

    NOTE(review): with ``copy=False`` the helper presumably patches zero
    entries of ``data_std`` in place - confirm if callers reuse the array.

    Args:
        x (array): Input data
        data_mean (array): Means for each feature dimension.
        data_std (array): Standard deviation for each feature dimension.

    Returns:
        array: Scaled data.

    Examples:
        >>> from nnmnkwii.preprocessing import meanstd, scale
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> lengths = [len(y) for y in Y]
        >>> data_mean, data_std = meanstd(Y, lengths)
        >>> scaled_y = scale(Y[0], data_mean, data_std)

    See also:
        :func:`nnmnkwii.preprocessing.inv_scale`
    """
    centered = x - data_mean
    divisor = _handle_zeros_in_scale(data_std, copy=False)
    return centered / divisor
def fit(self, X, y=None):
    """Compute the per-column median and inter-quantile range of ``X``.

    Parameters
    ----------
    X : dask array or dask DataFrame
        Training data. A DataFrame is first converted, partition by
        partition, into a single stacked dask array.
    y : ignored

    Returns
    -------
    self
        With ``center_`` (the 50th percentile of each column) and
        ``scale_`` (``q_max`` percentile minus ``q_min`` percentile).

    Raises
    ------
    ValueError
        If ``quantile_range`` is not ordered within ``[0, 100]``.
    """
    q_min, q_max = self.quantile_range
    if not 0 <= q_min <= q_max <= 100:
        raise ValueError("Invalid quantile range: %s" %
                         str(self.quantile_range))

    if isinstance(X, dd.DataFrame):
        n_columns = len(X.columns)
        # Partition lengths are needed to give each delayed block a shape.
        partition_lengths = X.map_partitions(len).compute()
        # ``np.find_common_type`` was deprecated in NumPy 1.25 and removed
        # in 2.0; ``np.result_type`` is the documented replacement.
        dtype = np.result_type(*X.dtypes)
        blocks = X.to_delayed()
        X = da.vstack(
            [
                da.from_delayed(
                    block.values, shape=(length, n_columns), dtype=dtype
                )
                for block, length in zip(blocks, partition_lengths)
            ]
        )

    # One percentile computation per column: [q_min, median, q_max].
    quantiles = [da.percentile(col, [q_min, 50., q_max]) for col in X.T]
    quantiles = da.vstack(quantiles).compute()

    self.center_ = quantiles[:, 1]
    self.scale_ = quantiles[:, 2] - quantiles[:, 0]
    self.scale_ = skdata._handle_zeros_in_scale(self.scale_, copy=False)
    return self
def fit(self, X, y=None, sample_weight=None):
    """Compute the (optionally weighted) mean and std for later scaling.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        Training data. Sparse input is rejected when weights are given.
    y : Passthrough for ``Pipeline`` compatibility. Optional, so that
        ``fit(X, sample_weight=w)`` and ``fit_transform(X)`` work.
    sample_weight : array-like, shape [n_samples], optional
        Per-sample weights. When ``None`` the parent class ``fit`` is used.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``X`` is sparse and ``sample_weight`` is given.
    """
    if sample_weight is None:
        return super(StandardScalerW, self).fit(X, y)

    if sparse.issparse(X):
        raise ValueError("Sparse matrix not supported")

    self._reset()

    # Always computed: np.average also validates the weights against X.
    average = np.average(X, axis=0, weights=sample_weight)
    if self.with_mean:
        self.mean_ = average

    if self.with_std:
        # The private helper moved from ``preprocessing.data`` to
        # ``preprocessing._data`` in newer scikit-learn releases.
        try:
            from sklearn.preprocessing._data import _handle_zeros_in_scale
        except ImportError:
            from sklearn.preprocessing.data import _handle_zeros_in_scale

        # np.cov requires non-negative weights, hence the absolute value.
        abs_weight = np.abs(sample_weight)
        self.var_ = np.array(
            [np.cov(column, aweights=abs_weight) for column in X.T])
        self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
    else:
        self.scale_ = None

    return self
def partial_fit(self, X, y=None):
    """Online computation of mean and std on X for later scaling.

    All of X is processed as a single batch. This is intended for cases
    when `fit` is not feasible due to very large number of `n_samples`
    or because X is read from a continuous stream.

    The algorithm for incremental mean and std is given in Equation 1.5a,b
    in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms
    for computing the sample variance: Analysis and recommendations."
    The American Statistician 37.3 (1983): 242-247.

    Dense input is delegated to the parent class; only sparse input is
    handled here.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        The data used to compute the mean and standard deviation
        used for later scaling along the features axis.
    y : Passthrough for ``Pipeline`` compatibility.

    Returns
    -------
    self
    """
    X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
                    ensure_2d=False, warn_on_dtype=True,
                    estimator=self, dtype=FLOAT_DTYPES)

    if not sparse.issparse(X):
        return super(SparseScaler, self).partial_fit(X)

    # Without std scaling there is nothing to accumulate for sparse data.
    if not self.with_std:
        self.scale_ = None
        return self

    # The mean is tracked even though it is not used for centering: the
    # incremental variance update needs it.
    if hasattr(self, 'n_samples_seen_'):
        # Subsequent batches: fold the new block into the running stats.
        updated = incr_mean_variance_axis(X, axis=0,
                                          last_mean=self.mean_,
                                          last_var=self.var_,
                                          last_n=self.n_samples_seen_)
        self.mean_, self.var_, self.n_samples_seen_ = updated
    else:
        # First batch: plain one-shot statistics.
        self.mean_, self.var_ = mean_variance_axis(X, axis=0)
        self.n_samples_seen_ = X.shape[0]

    self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
    return self
def fit(self, Z):
    """Compute the mean and std to be used for later scaling.

    Per-block statistics are computed with ``map`` and merged pairwise
    with ``treeReduce``.

    Parameters
    ----------
    Z : DictRDD containing (X, y) pairs
        X - Training vector.
            {array-like, sparse matrix}, shape [n_samples, n_features]
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.
        y - Target labels
            Passthrough for ``Pipeline`` compatibility.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the blocks are sparse and ``with_mean`` is set.
    """
    # Reset internal state before fitting
    self._reset()

    X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
    check_rdd(X, (np.ndarray, sp.spmatrix))

    def block_stats(block):
        """Return (row count, column means, column variances) of a block."""
        block = check_array(block, ('csr', 'csc'), dtype=np.float64)
        if hasattr(block, "toarray"):  # sparse matrix
            mean, var = mean_variance_axis(block, axis=0)
            return block.shape[0], mean, var
        return block.shape[0], np.mean(block, axis=0), np.var(block, axis=0)

    def merge_stats(left, right):
        """Combine the statistics of two disjoint groups of rows."""
        n_a, mean_a, var_a = left
        n_b, mean_b, var_b = right
        total = n_a + n_b
        merged_mean = ((mean_a * n_a) + (mean_b * n_b)) / total
        pooled = ((n_a * var_a) + (n_b * var_b)) / total
        shift = (mean_b - mean_a) / total
        merged_var = pooled + ((n_a * n_b) * shift ** 2)
        return (total, merged_mean, merged_var)

    if check_rdd_dtype(X, (sp.spmatrix)) and self.with_mean:
        raise ValueError(
            "Cannot center sparse matrices: pass `with_mean=False` "
            "instead. See docstring for motivation and alternatives.")

    combined = X.map(block_stats).treeReduce(merge_stats)
    self.n_samples_seen_, self.mean_, self.var_ = combined

    self.scale_ = (_handle_zeros_in_scale(np.sqrt(self.var_))
                   if self.with_std else None)
    return self
def fit(self, Z):
    """Compute the mean and std to be used for later scaling.

    Each block of the RDD is summarised independently and the summaries
    are then merged pairwise with ``treeReduce``.

    Parameters
    ----------
    Z : DictRDD containing (X, y) pairs
        X - Training vector.
            {array-like, sparse matrix}, shape [n_samples, n_features]
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.
        y - Target labels
            Passthrough for ``Pipeline`` compatibility.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the blocks are sparse and ``with_mean`` is set.
    """
    # Reset internal state before fitting
    self._reset()

    if isinstance(Z, DictRDD):
        X = Z[:, 'X']
    else:
        X = Z
    check_rdd(X, (np.ndarray, sp.spmatrix))

    def summarise(chunk):
        """Return the row count, column means and column variances."""
        chunk = check_array(chunk, ('csr', 'csc'), dtype=np.float64)
        n_rows = chunk.shape[0]
        if hasattr(chunk, "toarray"):  # sparse matrix
            col_mean, col_var = mean_variance_axis(chunk, axis=0)
        else:
            col_mean = np.mean(chunk, axis=0)
            col_var = np.var(chunk, axis=0)
        return n_rows, col_mean, col_var

    def combine(first, second):
        """Merge two (count, mean, variance) summaries into one."""
        n_a, mean_a, var_a = first
        n_b, mean_b, var_b = second
        n_ab = n_a + n_b
        mean_ab = ((mean_a * n_a) + (mean_b * n_b)) / n_ab
        within = ((n_a * var_a) + (n_b * var_b)) / n_ab
        between = (n_a * n_b) * ((mean_b - mean_a) / n_ab) ** 2
        return (n_ab, mean_ab, within + between)

    is_sparse_rdd = check_rdd_dtype(X, (sp.spmatrix))
    if is_sparse_rdd and self.with_mean:
        raise ValueError(
            "Cannot center sparse matrices: pass `with_mean=False` "
            "instead. See docstring for motivation and alternatives.")

    self.n_samples_seen_, self.mean_, self.var_ = \
        X.map(summarise).treeReduce(combine)

    if not self.with_std:
        self.scale_ = None
    else:
        self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
    return self
def meanstd(dataset, lengths=None, mean_=0., var_=0.,
            last_sample_count=0, return_last_sample_count=False):
    r"""Mean/std-deviation computation given an iterable dataset

    Dataset can have variable length samples. In that case, you need to
    explicitly specify lengths for all the samples.

    The mean and variance come from :func:`meanvar`; the square root of the
    variance is then passed through ``_handle_zeros_in_scale``.

    Args:
        dataset (nnmnkwii.datasets.Dataset): Dataset
        lengths: (list): Frame lengths for each dataset sample.
        mean\_ (array or scalar): Initial value for mean vector.
        var\_ (array or scalar): Initial value for variance vector.
        last_sample_count (int): Last sample count. Default is 0. If you set
          non-default ``mean_`` and ``var_``, you need to set
          ``last_sample_count`` property. Typically this will be the number of
          time frames ever seen.
        return_last_sample_count (bool): Return ``last_sample_count`` if True.

    Returns:
        tuple: Mean and standard deviation for each dimension. If
          ``return_last_sample_count`` is True, returns ``last_sample_count``
          as well.

    See also:
        :func:`nnmnkwii.preprocessing.meanvar`,
        :func:`nnmnkwii.preprocessing.scale`

    Examples:
        >>> from nnmnkwii.preprocessing import meanstd
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> lengths = [len(y) for y in Y]
        >>> data_mean, data_std = meanstd(Y, lengths)
    """
    ret = meanvar(dataset, lengths, mean_, var_,
                  last_sample_count, return_last_sample_count)
    m, v = ret[0], ret[1]
    # Convert variance to standard deviation before zero handling.
    v = _handle_zeros_in_scale(np.sqrt(v))
    if return_last_sample_count:
        # meanvar is expected to append the running sample count.
        assert len(ret) == 3
        return m, v, ret[2]
    return m, v
def partial_fit(self, X, y=None):
    """Online computation of min and max on X for later scaling.

    All of X is processed as a single batch. This is intended for cases
    when `fit` is not feasible due to very large number of `n_samples`
    or because X is read from a continuous stream.

    The minimum and maximum are taken over the whole array (no axis is
    given), so one global range is tracked rather than one per feature.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data used to update the running minimum and maximum.
    y : Passthrough for ``Pipeline`` compatibility.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the lower end of ``feature_range`` is not strictly below the
        upper end.
    """
    feature_range = self.feature_range
    range_low = feature_range[0]
    range_high = feature_range[1]
    if range_low >= range_high:
        raise ValueError(
            "Minimum of desired feature range must be smaller than maximum. "
            "Got %s." % str(feature_range))

    X = check_array(X, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES,
                    ensure_2d=False, allow_nd=True)

    batch_min = np.min(X)
    batch_max = np.max(X)

    if hasattr(self, 'n_samples_seen_'):
        # Later batches: widen the running extremes.
        batch_min = np.minimum(self.data_min_, batch_min)
        batch_max = np.maximum(self.data_max_, batch_max)
        self.n_samples_seen_ += X.shape[0]
    else:
        # First batch.
        self.n_samples_seen_ = X.shape[0]

    span = batch_max - batch_min
    self.scale_ = (range_high - range_low) / _handle_zeros_in_scale(span)
    self.min_ = range_low - batch_min * self.scale_
    self.data_min_ = batch_min
    self.data_max_ = batch_max
    self.data_range_ = span
    return self
def fit(self, X, y=None):
    """Compute the NaN-aware median and quantile range of each column.

    Parameters
    ----------
    X : array-like
        Training data; sparse matrices are rejected.
    y : ignored

    Returns
    -------
    self
        ``center_`` is set when ``with_centering`` is true, ``scale_`` when
        ``with_scaling`` is true.

    Raises
    ------
    TypeError
        If ``X`` is a sparse matrix.
    ValueError
        If scaling is requested and ``quantile_range`` is not ordered
        within ``[0, 100]``.
    """
    if sparse.issparse(X):
        raise TypeError("RobustScaler cannot be fitted on sparse inputs")

    X = self._check_array(X, self.copy)

    if self.with_centering:
        self.center_ = np.nanmedian(X, axis=0)

    if not self.with_scaling:
        return self

    low_q, high_q = self.quantile_range
    if not (0 <= low_q <= high_q <= 100):
        raise ValueError("Invalid quantile range: %s" %
                         str(self.quantile_range))

    bounds = np.nanpercentile(X, self.quantile_range, axis=0)
    self.scale_ = bounds[1] - bounds[0]
    self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
    return self
def fit(self, X):
    """Fit the Normalizer by computing the norm of each sample or feature.

    With ``axis == 1`` one norm per row is stored in ``self.norms``; with
    ``axis == 0`` the data is transposed first, giving one norm per column.

    The sparse ``'l1'``/``'l2'`` branches now compute the norms. Previously
    they normalised ``X`` in place and never assigned ``self.norms``.

    :param X: array-like or sparse matrix
    :return: nothing
    :raises ValueError: if ``norm`` or ``axis`` is not supported
    """
    if self.norm not in ('l1', 'l2', 'max'):
        raise ValueError("'%s' is not a supported norm" % self.norm)

    if self.axis == 0:
        self.sparse_format = 'csc'
    elif self.axis == 1:
        self.sparse_format = 'csr'
    else:
        # ``%s`` rather than ``%d``: a non-integer axis must still produce
        # this ValueError, not a TypeError from string formatting.
        raise ValueError("'%s' is not a supported axis" % (self.axis,))

    X = check_array(X, self.sparse_format, copy=self.copy,
                    estimator='the normalize function', dtype=FLOAT_DTYPES)
    if self.axis == 0:
        X = X.T

    if sparse.issparse(X):
        if self.norm == 'l1':
            # Sparse ``sum`` returns a column matrix; flatten it to 1-D.
            norms = np.asarray(abs(X).sum(axis=1)).ravel()
        elif self.norm == 'l2':
            norms = row_norms(X)
        else:  # 'max'
            _, norms = min_max_axis(X, 1)
    else:
        if self.norm == 'l1':
            norms = np.abs(X).sum(axis=1)
        elif self.norm == 'l2':
            norms = row_norms(X)
        else:  # 'max'
            norms = np.max(X, axis=1)

    self.norms = _handle_zeros_in_scale(norms, copy=False)
def fit(self, X, y=None):
    """Compute the NaN-aware mean and std to be used for later scaling.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data used to compute the mean and standard deviation
        used for later scaling along the features axis. NaNs are ignored.
        Sparse matrices are not supported.
    y : Passthrough for ``Pipeline`` compatibility.

    Returns
    -------
    self
        ``mean_`` is the column-wise NaN-mean (``0.0`` when ``with_mean`` is
        false). ``var_`` and ``scale_`` are the NaN-variance and its
        zero-safe square root (both ``None`` when ``with_std`` is false).

    Raises
    ------
    NotImplementedError
        If ``X`` is a sparse matrix.
    """
    # Reset internal state before fitting
    self._reset()

    X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
                    force_all_finite=False, warn_on_dtype=True,
                    estimator=self, dtype=FLOAT_DTYPES)

    # Reject sparse input before any fitted attribute is written, so a
    # failed fit does not leave the estimator looking half-fitted.
    if sparse.issparse(X):
        raise NotImplementedError("sparse input is not supported")

    if self.with_mean:
        self.mean_ = np.nanmean(X, 0)
    else:
        self.mean_ = .0

    if self.with_std:
        # Store the real variance instead of a 0.0 placeholder; its square
        # root is what np.nanstd returns.
        self.var_ = np.nanvar(X, 0)
        self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_), copy=False)
    else:
        self.var_ = None
        # Always define scale_ so later code can test ``scale_ is None``.
        self.scale_ = None

    return self
def fit(self, X, y=None):
    """Fit the scaling factor for each residue type.

    For every ``(code, residue indices)`` pair in ``self.code2rindex`` the
    matching columns of ``X`` are passed to ``self.scale_func`` and the
    zero-safe result is stored in ``self.scale_factors_[code]``.

    Parameters
    ----------
    X : np.ndarray, shape=(n_observations, n_residues)
        Array of values to fit scaling upon.
    y : Passthrough for Pipeline compatibility.

    Returns
    -------
    self

    Raises
    ------
    exception.InvalidData
        If ``X.shape[1]`` differs from ``self.top.n_residues``.
    """
    expected = self.top.n_residues
    if X.shape[1] != expected:
        message = "Given data had shape {s} and top had n_residues {n}".format(
            s=X.shape, n=self.top.n_residues)
        raise exception.InvalidData(message)

    factors = self.scale_factors_ = {}
    for code, columns in self.code2rindex.items():
        if code is None:
            warnings.warn(exception.SuspiciousDataWarning(
                "ResidueTypeScaler Topology had 'None' values as residue "
                "codes. These will be scaled as though they are the same "
                "residue type."))
        raw_factor = self.scale_func(X[:, columns])
        factors[code] = _handle_zeros_in_scale(raw_factor, copy=False)
    return self
def scale(x, data_mean, data_std):
    """Return ``(x - data_mean) / data_std`` with a zero-safe divisor.

    ``data_std`` is passed through ``_handle_zeros_in_scale(copy=False)``
    before the division.
    """
    shifted = x - data_mean
    safe_std = _handle_zeros_in_scale(data_std, copy=False)
    return shifted / safe_std
def __minmax_scale_factor(data_min, data_max, feature_range):
    """Return the factor mapping the data range onto ``feature_range``.

    The factor is the width of ``feature_range`` divided by
    ``data_max - data_min``, after that difference has gone through
    ``_handle_zeros_in_scale``.
    """
    observed_span = data_max - data_min
    target_span = feature_range[1] - feature_range[0]
    return target_span / _handle_zeros_in_scale(observed_span, copy=False)
def zero_one_scale(serie):
    """Scale ``serie`` in place by the reciprocal of its maximum.

    The maximum goes through ``_handle_zeros_in_scale`` before being
    inverted. No offset is subtracted and nothing is returned; the caller's
    object is modified through ``*=``.
    """
    peak = _handle_zeros_in_scale(serie.max())
    serie *= 1 / peak
def nanscale(X, axis=0, with_mean=True, with_std=True, copy=True):
    """Standardize a dataset along any axis, ignoring NaN entries

    Center to the mean and component wise scale to unit variance.
    The statistics are computed with ``np.nanmean`` / ``np.nanstd``, so
    missing values stay NaN in the output and do not affect other entries.

    Read more in the :ref:`User Guide <preprocessing_scaler>`.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        The data to center and scale.

    axis : int (0 by default)
        axis used to compute the means and standard deviations along. If 0,
        independently standardize each feature, otherwise (if 1) standardize
        each sample.

    with_mean : boolean, True by default
        If True, center the data before scaling.

    with_std : boolean, True by default
        If True, scale the data to unit variance (or equivalently,
        unit standard deviation).

    copy : boolean, optional, default True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSC matrix and if axis is 1).

    Returns
    -------
    X : {ndarray, sparse matrix}
        The standardized data.

    Raises
    ------
    ValueError
        If ``X`` is sparse and ``with_mean`` is True or ``axis`` is not 0.

    Notes
    -----
    This implementation will refuse to center scipy.sparse matrices
    since it would make them non-sparse and would potentially crash the
    program with memory exhaustion problems.

    Instead the caller is expected to either set explicitly
    `with_mean=False` (in that case, only variance scaling will be
    performed on the features of the CSC matrix) or to call `X.toarray()`
    if he/she expects the materialized dense array to fit in memory.

    To avoid memory copy the caller should pass a CSC matrix.

    For a comparison of the different scalers, transformers, and normalizers,
    see :ref:`examples/preprocessing/plot_all_scaling.py
    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.

    See also
    --------
    StandardScaler: Performs scaling to unit variance using the``Transformer`` API
        (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`).

    """  # noqa
    X = check_array(X, accept_sparse='csc', copy=copy, ensure_2d=False,
                    warn_on_dtype=True, estimator='the scale function',
                    force_all_finite=False, dtype=FLOAT_DTYPES)
    if sparse.issparse(X):
        if with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` instead"
                " See docstring for motivation and alternatives.")
        if axis != 0:
            raise ValueError("Can only scale sparse matrix on axis=0, "
                             " got axis=%d" % axis)
        if with_std:
            _, var = mean_variance_axis(X, axis=0)
            var = _handle_zeros_in_scale(var, copy=False)
            inplace_column_scale(X, 1 / np.sqrt(var))
    else:
        X = np.asarray(X)
        if with_mean:
            mean_ = np.nanmean(X, axis)
        if with_std:
            scale_ = np.nanstd(X, axis)
        # Xr is a view on the original array that enables easy use of
        # broadcasting on the axis in which we are interested in
        Xr = np.rollaxis(X, axis)
        if with_mean:
            Xr -= mean_
            # Use the NaN-aware mean: a plain ``.mean`` is NaN for every
            # slice containing a missing value, and subtracting it would
            # wipe out that whole slice.
            mean_1 = np.nanmean(Xr, axis=0)
            # Verify that mean_1 is 'close to zero'. If X contains very
            # large values, mean_1 can also be very large, due to a lack of
            # precision of mean_. In this case, a pre-scaling of the
            # concerned feature is efficient, for instance by its mean or
            # maximum. All-NaN slices give a NaN mean and are excluded.
            if not np.allclose(mean_1[np.isfinite(mean_1)], 0):
                warnings.warn("Numerical issues were encountered "
                              "when centering the data "
                              "and might not be solved. Dataset may "
                              "contain too large values. You may need "
                              "to prescale your features.")
                Xr -= mean_1
        if with_std:
            scale_ = _handle_zeros_in_scale(scale_, copy=False)
            Xr /= scale_
            if with_mean:
                mean_2 = np.nanmean(Xr, axis=0)
                # If mean_2 is not 'close to zero', it comes from the fact
                # that scale_ is very small so that mean_2 = mean_1/scale_
                # > 0, even if mean_1 was close to zero. The problem is
                # thus essentially due to the lack of precision of mean_.
                # A solution is then to subtract the mean again:
                if not np.allclose(mean_2[np.isfinite(mean_2)], 0):
                    warnings.warn("Numerical issues were encountered "
                                  "when scaling the data "
                                  "and might not be solved. The standard "
                                  "deviation of the data is probably "
                                  "very close to 0. ")
                    Xr -= mean_2
    return X
def minmax_scale_params(data_min, data_max, feature_range=(0, 1)):
    """Return the ``(min_, scale_)`` pair of a min-max scaling.

    ``scale_`` is the width of ``feature_range`` divided by the zero-safe
    data range; ``min_`` is the offset that maps ``data_min`` onto
    ``feature_range[0]`` once multiplied by ``scale_``.
    """
    observed_span = data_max - data_min
    target_low = feature_range[0]
    target_width = feature_range[1] - target_low
    factor = target_width / _handle_zeros_in_scale(observed_span, copy=False)
    offset = target_low - data_min * factor
    return offset, factor