Exemplo n.º 1
0
    def fit(self, X, y=None):
        """Compute the minimum and maximum to be used for later scaling.

        Parameters
        ----------
        X : array-like or sparse matrix, shape [n_samples, n_features]
            The data used to compute the per-feature minimum and maximum
            used for later scaling along the features axis. Sparse input
            is passed through ``check_arrays`` with ``sparse_format="csc"``
            and only the explicitly stored entries of a column take part
            in its minimum / maximum.

        y : ignored

        Returns
        -------
        self
            With ``scale_``, ``min_``, ``data_range`` and ``data_min`` set.

        Raises
        ------
        ValueError
            If ``feature_range[0] >= feature_range[1]``.
        """
        X = check_arrays(X, sparse_format="csc", copy=self.copy)[0]
        warn_if_not_float(X, estimator=self)
        feature_range = self.feature_range
        if feature_range[0] >= feature_range[1]:
            raise ValueError("Minimum of desired feature range must be smaller"
                             " than maximum. Got %s." % str(feature_range))

        if sparse.issparse(X):
            data_min = []
            data_max = []
            for i in range(X.shape[1]):
                start, stop = X.indptr[i], X.indptr[i + 1]
                if start == stop:
                    # Column without any stored entry: treat it as all-zero.
                    data_min.append(0)
                    data_max.append(0)
                else:
                    stored = X.data[start:stop]
                    data_min.append(stored.min())
                    data_max.append(stored.max())
            data_min = np.array(data_min)
            data_max = np.array(data_max)
            data_range = data_max - data_min
        else:
            data_min = np.min(X, axis=0)
            data_range = np.max(X, axis=0) - data_min

        # Do not scale constant features
        if isinstance(data_range, np.ndarray):
            constant = data_range == 0.0
            if sparse.issparse(X):
                # For a sparse matrix, constant features will be set to one:
                # lowering the recorded minimum by 1 (with a range of 1)
                # maps the stored values onto the upper end of the range.
                data_min[constant] = data_min[constant] - 1
            data_range[constant] = 1.0
        elif data_range == 0.:
            # Scalar result (no per-feature array): same guard as above.
            data_range = 1.

        self.scale_ = (feature_range[1] - feature_range[0]) / data_range
        self.min_ = feature_range[0] - data_min * self.scale_
        self.data_range = data_range
        self.data_min = data_min
        return self
Exemplo n.º 2
0
    def transform(self, X, y=None, copy=None):
        """Perform standardization by centering and scaling

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data used to scale along the features axis.

        y : ignored

        copy : boolean or None, optional (None by default)
            Forwarded to ``check_arrays``. ``None`` falls back to
            ``self.copy``.

        Returns
        -------
        X
            The validated (and possibly copied) input after centering
            and/or scaling.

        Raises
        ------
        ValueError
            If X is sparse, ``center_sparse`` is false and ``with_mean``
            is true.
        """
        if copy is None:
            copy = self.copy
        X = check_arrays(X, copy=copy, sparse_format="csc")[0]
        if warn_if_not_float(X, estimator=self):
            # ``np.float`` was only an alias of the builtin float (float64)
            # and no longer exists in current NumPy; name the dtype
            # explicitly.
            X = X.astype(np.float64)

        if sparse.issparse(X):
            if self.center_sparse:
                # Shift only the stored entries of each CSC column.
                for i in range(X.shape[1]):
                    X.data[X.indptr[i]:X.indptr[i + 1]] -= self.mean_[i]
            elif self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")

            if self.std_ is not None:
                inplace_column_scale(X, 1 / self.std_)
        else:
            if self.with_mean:
                X -= self.mean_
            if self.with_std:
                X /= self.std_
        return X
Exemplo n.º 3
0
    def fit(self, X, y=None):
        """Compute the mean and std to be used for later scaling.

        Parameters
        ----------
        X : array-like or sparse matrix with shape [n_samples, n_features]
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis. Sparse input
            is passed through ``check_arrays`` with ``sparse_format="csc"``.

        y : ignored

        Returns
        -------
        self
            With ``mean_`` and ``std_`` set (either may be ``None`` on the
            sparse, non-centering path).

        Raises
        ------
        ValueError
            If X is sparse, ``center_sparse`` is false and ``with_mean``
            is true.

        Notes
        -----
        With ``center_sparse`` the statistics of a sparse column are taken
        over its explicitly stored entries only. A column whose variance is
        (numerically) zero gets ``std_ = 1`` and its mean lowered by one, so
        that centering maps its stored entries to 1.
        """
        X = check_arrays(X, copy=self.copy, sparse_format="csc")[0]
        if warn_if_not_float(X, estimator=self):
            # ``np.float`` was only an alias of the builtin float (float64)
            # and no longer exists in current NumPy.
            X = X.astype(np.float64)

        if sparse.issparse(X):
            if self.center_sparse:
                means = []
                variances = []

                # Slicing X.data with X.indptr walks columns only for CSC.
                for i in range(X.shape[1]):
                    start, stop = X.indptr[i], X.indptr[i + 1]
                    if start == stop:
                        # No stored entries: neutral mean / variance.
                        means.append(0)
                        variances.append(1)
                    else:
                        stored = X.data[start:stop]
                        variances.append(stored.var())
                        means.append(stored.mean())

                means = np.array(means)
                variances = np.array(variances)

                # If the variance is (numerically) 0, set all occurrences of
                # this feature to 1. The same tolerance must drive both the
                # mean shift and the std reset: shifting the mean while
                # keeping a tiny non-zero std would blow the stored values
                # up to about 1 / std instead of 1.
                constant = np.abs(variances) <= 0.0000001
                means[constant] = means[constant] - 1
                std = np.sqrt(variances)
                std[constant] = 1.0

                self.std_ = std
                self.mean_ = means

                return self
            elif self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")
            else:
                self.mean_ = None

            if self.with_std:
                var = mean_variance_axis0(X)[1]
                self.std_ = np.sqrt(var)
                # Do not scale constant features
                self.std_[var == 0.0] = 1.0
            else:
                self.std_ = None
            return self
        else:
            self.mean_, self.std_ = _mean_and_std(X,
                                                  axis=0,
                                                  with_mean=self.with_mean,
                                                  with_std=self.with_std)
        return self
Exemplo n.º 4
0
    def fit(self, X, y=None):
        """Compute the minimum and maximum to be used for later scaling.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data used to compute the per-feature minimum and maximum
            used for later scaling along the features axis.

        y : ignored

        Returns
        -------
        self
            With ``scale_``, ``min_``, ``data_range`` and ``data_min`` set.
            When ``fit_feature_range`` is given, ``scale_`` and ``min_``
            are derived from it instead of from ``feature_range``.

        Raises
        ------
        ValueError
            If ``feature_range`` or ``fit_feature_range`` is not strictly
            increasing, or if ``fit_feature_range`` is not contained in
            ``feature_range``.
        """
        X = check_array(X, copy=self.copy)
        warn_if_not_float(X, estimator=self)

        feature_range = self.feature_range
        if feature_range[0] >= feature_range[1]:
            raise ValueError("Minimum of desired feature range must be smaller"
                             " than maximum. Got %s." % str(feature_range))
        if self.fit_feature_range is not None:
            fit_feature_range = self.fit_feature_range
            if fit_feature_range[0] >= fit_feature_range[1]:
                # Report the range that actually failed the check.
                raise ValueError("Minimum of desired (fit) feature range must "
                                 "be smaller than maximum. Got %s."
                                 % str(fit_feature_range))
            if (fit_feature_range[0] < feature_range[0] or
                    fit_feature_range[1] > feature_range[1]):
                raise ValueError("fit_feature_range must be a subset of "
                                 "feature_range. Got %s, fit %s."
                                 % (str(feature_range),
                                    str(fit_feature_range)))
            # From here on the training data is mapped onto the fit range.
            feature_range = fit_feature_range

        data_min = np.min(X, axis=0)
        data_range = np.max(X, axis=0) - data_min
        # Do not scale constant features
        data_range[data_range == 0.0] = 1.0
        self.scale_ = (feature_range[1] - feature_range[0]) / data_range
        self.min_ = feature_range[0] - data_min * self.scale_
        self.data_range = data_range
        self.data_min = data_min
        return self
Exemplo n.º 5
0
 def normalize(self, X, norm='l2', axis=1, copy=True):
     """Normalize a dataset along any axis

     Parameters
     ----------
     X : array or scipy.sparse matrix with shape [n_samples, n_features]
         The data to normalize, element by element.
         scipy.sparse matrices should be in CSR format to avoid an
         un-necessary copy.

     norm : 'l1' or 'l2', optional ('l2' by default)
         The norm to use to normalize each non zero sample (or each non-zero
         feature if axis is 0).

     axis : 0 or 1, optional (1 by default)
         axis used to normalize the data along. If 1, independently normalize
         each sample, otherwise (if 0) normalize each feature.

     copy : boolean, optional, default is True
         set to False to perform inplace row normalization and avoid a
         copy (if the input is already a numpy array or a scipy.sparse
         CSR matrix and if axis is 1).

     Returns
     -------
     X
         The normalized data, in the orientation of the input.

     Raises
     ------
     ValueError
         If ``norm`` is not 'l1' or 'l2', or ``axis`` is not 0 or 1.

     See also
     --------
     :class:`sklearn.preprocessing.Normalizer` to perform normalization
     using the ``Transformer`` API (e.g. as part of a preprocessing
     :class:`sklearn.pipeline.Pipeline`)
     """
     if norm not in ('l1', 'l2'):
         raise ValueError("'%s' is not a supported norm" % norm)

     if axis == 0:
         sparse_format = 'csc'
     elif axis == 1:
         sparse_format = 'csr'
     else:
         # '%s' rather than '%d': a non-numeric axis (e.g. None) must end up
         # as this ValueError, not as a TypeError from the formatting itself.
         raise ValueError("'%s' is not a supported axis" % axis)

     X = check_arrays(X, sparse_format=sparse_format, copy=copy)[0]
     warn_if_not_float(X, 'The normalize function')
     if axis == 0:
         # Work on the transpose so that the row-wise code below
         # normalizes features.
         X = X.T

     if sparse.issparse(X):
         if norm == 'l1':
             inplace_csr_row_normalize_l1(X)
         elif norm == 'l2':
             inplace_csr_row_normalize_l2(X)
     else:
         if norm == 'l1':
             norms = np.abs(X).sum(axis=1)
         elif norm == 'l2':
             norms = row_norms(X)
         # All-zero rows are left untouched instead of being divided by 0.
         norms[norms == 0.0] = 1.0
         X /= norms[:, np.newaxis]

     if axis == 0:
         X = X.T

     return X
Exemplo n.º 6
0
 def fit(self, X, y=None):
     """Fit the per-feature offsets and Box-Cox lambda values.

     Parameters
     ----------
     X : array-like, shape [n_samples, n_features]
         The data from which each feature's offset and Box-Cox lambda
         are estimated.

     y : optional
         Only forwarded to the standardiser's ``fit`` when
         ``self.standardise`` is set.

     Returns
     -------
     self
         With ``offset_`` and ``lambda_`` set, plus a fitted
         ``standardiser`` when ``self.standardise`` is set.

     Raises
     ------
     Warning
         If any feature minimum lies below ``self.known_min``.
     ValueError
         If a feature that varies in the input becomes constant after
         its Box-Cox transform.
     """
     X = check_array(X, copy=self.copy, ensure_2d=True)
     warn_if_not_float(X, estimator=self)

     # Per-feature minimum, kept as a row (keepdims=True)
     lowest = np.min(X, axis=0, keepdims=True)

     if self.known_min is None:
         # The user gave no lower bound, so err on the side of caution and
         # double the observed minimum. This has no effect if the data is
         # non-negative: positive minima yield a zero offset below.
         lowest = lowest * 2
     else:
         # Sanity check
         # NOTE(review): this *raises* the Warning, which aborts the fit
         # rather than merely reporting -- confirm that is intended.
         if np.any(self.known_min > lowest):
             raise Warning("The minimum of the data is less than the supplied"
                           " 'known' minimum value.")
         lowest = np.minimum(lowest, self.known_min)

     # Offset by the negative of the minima so that all values are positive
     shift = -lowest
     # ... with one extra unit so that a minimum of 0 is mapped above zero
     shift[shift >= 0] += 1
     # Features that are already strictly positive are left unchanged
     shift = np.maximum(shift, 0)

     # Store this array of feature offsets and apply it to the raw data
     self.offset_ = shift
     X += self.offset_

     # Find the optimal Box-Cox transform for each feature
     n_samples, n_features = X.shape[0], X.shape[1]
     fitted_lambdas = np.zeros((n_features))
     Xt = np.zeros(X.shape)

     for col in range(n_features):
         # Fit the Box-Cox transform to the data in this column
         Xt[:, col], fitted_lambdas[col] = stats.boxcox(X[:, col])

         # Sanity check: the transformed column must not collapse to a
         # single value unless the original column was constant as well
         # (this can happen if lambda was chosen badly by scipy).
         # TODO: recover by searching for a better lambda ourselves.
         if not np.allclose(X[:, col], X[0, col]):
             if np.allclose(Xt[:, col], Xt[0, col]):
                 raise ValueError("Lambda was badly chosen for feature {}. Values became singular!".format(col))

     # Store the lambda values
     self.lambda_ = fitted_lambdas

     # Fit the z-score step on the transformed data
     if self.standardise:
         self.standardiser = StandardScaler()
         self.standardiser.fit(Xt, y)

     return self
Exemplo n.º 7
0
def normalize(X, norm='l2', axis=1, copy=True):
    """Scale input vectors individually to unit norm (vector length).

    Parameters
    ----------
    X : array or scipy.sparse matrix with shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        un-necessary copy.

    norm : 'l1' or 'l2', optional ('l2' by default)
        The norm to use to normalize each non zero sample (or each non-zero
        feature if axis is 0).

    axis : 0 or 1, optional (1 by default)
        axis used to normalize the data along. If 1, independently normalize
        each sample, otherwise (if 0) normalize each feature.

    copy : boolean, optional, default True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    Returns
    -------
    X
        The normalized data, in the orientation of the input.

    Raises
    ------
    ValueError
        If ``norm`` is not 'l1' or 'l2', or ``axis`` is not 0 or 1.

    See also
    --------
    :class:`sklearn.preprocessing.Normalizer` to perform normalization
    using the ``Transformer`` API (e.g. as part of a preprocessing
    :class:`sklearn.pipeline.Pipeline`)
    """
    if norm not in ('l1', 'l2'):
        raise ValueError("'%s' is not a supported norm" % norm)

    if axis == 0:
        sparse_format = 'csc'
    elif axis == 1:
        sparse_format = 'csr'
    else:
        # '%s' rather than '%d': a non-numeric axis (e.g. None) must end up
        # as this ValueError, not as a TypeError from the formatting itself.
        raise ValueError("'%s' is not a supported axis" % axis)

    X = check_array(X, sparse_format, copy=copy)
    warn_if_not_float(X, 'The normalize function')
    if axis == 0:
        # Work on the transpose so that the row-wise code below normalizes
        # features; the transpose of a CSC matrix is a CSR matrix.
        X = X.T

    if sparse.issparse(X):
        # The in-place kernels below are row-wise CSR routines, so after the
        # optional transpose the matrix must be (kept as) CSR. Requesting
        # ``sparse_format`` here would ask for CSC again when axis == 0.
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        if norm == 'l1':
            inplace_csr_row_normalize_l1(X)
        elif norm == 'l2':
            inplace_csr_row_normalize_l2(X)
    else:
        if norm == 'l1':
            norms = np.abs(X).sum(axis=1)
        elif norm == 'l2':
            norms = row_norms(X)
        # All-zero rows are left untouched instead of being divided by 0.
        norms[norms == 0.0] = 1.0
        X /= norms[:, np.newaxis]

    if axis == 0:
        X = X.T

    return X