Exemplo n.º 1
0
def test_hash_X_y():
    """Compare ``hash_X_y`` with joblib hashes of the expected sub-arrays."""
    rng = check_random_state(0)

    # Large input: with the arguments (10, 10) the result must equal the
    # hashes of every 200th row / every 2nd column of X and every 200th
    # entry of y.
    X = rng.randn(2000, 20)
    y = np.array([0] * 500 + [1] * 1500)
    expected_large = (joblib.hash(X[::200, ::2]), joblib.hash(y[::200]))
    assert hash_X_y(X, y, 10, 10) == expected_large

    # Small input with default arguments: all data will be used in this case.
    X = rng.randn(5, 2)
    y = np.array([0] * 2 + [1] * 3)
    expected_small = (joblib.hash(X), joblib.hash(y))
    assert hash_X_y(X, y) == expected_small
def test_hash_X_y():
    """Check that ``hash_X_y`` returns the (X hash, y hash) pair expected.

    Two scenarios are covered: a 2000 x 20 input hashed with the arguments
    ``(10, 10)``, and a 5 x 2 input hashed with the default arguments.
    """
    random_state = check_random_state(0)

    X = random_state.randn(2000, 20)
    y = np.array([0] * 500 + [1] * 1500)
    # Reference hashes are computed on a strided view of the data.
    X_reference = joblib.hash(X[::200, ::2])
    y_reference = joblib.hash(y[::200])
    assert hash_X_y(X, y, 10, 10) == (X_reference, y_reference)

    X = random_state.randn(5, 2)
    y = np.array([0] * 2 + [1] * 3)
    # all data will be used in this case
    X_reference = joblib.hash(X)
    y_reference = joblib.hash(y)
    assert hash_X_y(X, y) == (X_reference, y_reference)
def test_hash_X_y_pandas():
    """Check ``hash_X_y`` on pandas inputs; skipped if pandas is missing."""
    pd = pytest.importorskip("pandas")
    rng = check_random_state(0)

    # Large DataFrame / Series: the expected hashes are those of the
    # positionally strided selections (every 200th row, every 2nd column).
    X = pd.DataFrame(rng.randn(2000, 20))
    y = pd.Series([0] * 500 + [1] * 1500)
    expected_large = (
        joblib.hash(X.iloc[::200, ::2]),
        joblib.hash(y.iloc[::200]),
    )
    assert hash_X_y(X, y, 10, 10) == expected_large

    # Small DataFrame / Series: all data will be used in this case.
    X = pd.DataFrame(rng.randn(5, 2))
    y = pd.Series([0] * 2 + [1] * 3)
    expected_small = (joblib.hash(X), joblib.hash(y))
    assert hash_X_y(X, y) == expected_small
Exemplo n.º 4
0
    def fit(self, X, y):
        """Validate the data, record its hashes and cluster the input space.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        Notes
        -----
        Sets ``X_hash_``, ``y_hash_``, ``ratio_`` and ``clustering_labels_``.
        An ``IndexError`` is raised when no entry of ``ratio_`` equals 0,
        because the majority label is looked up as the first such entry.

        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        y = check_target_type(y)
        self.X_hash_, self.y_hash_ = hash_X_y(X, y)
        self.ratio_ = check_ratio(self.ratio, y, self._sampling_type)

        # Cluster the input space with the estimator stored at
        # ``self.clusterer[0][1]``.
        clustering_estimator = self.clusterer[0][1]
        self.clustering_labels_ = clustering_estimator.fit_predict(X, y)

        # The majority class is taken to be the first label whose ``ratio_``
        # entry is 0; every other label is treated as a minority class.
        zero_ratio_labels = [
            label
            for label, n_samples in self.ratio_.items()
            if n_samples == 0
        ]
        majority_label = zero_ratio_labels[0]
        minority_labels = [
            label
            for label in self.ratio_.keys()
            if label != majority_label
        ]

        # Placeholder for the per-cluster imbalance ratios.
        # NOTE(review): ``weights`` and ``minority_labels`` are not used yet;
        # this part looks unfinished.
        weights = pd.DataFrame()

        return self
Exemplo n.º 5
0
    def fit(self, X, y=None):
        """Validate the input, store its hashes and derive the sample count.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.
            NOTE(review): the default is ``None`` but the value is passed
            straight to ``check_X_y`` -- confirm that ``None`` is accepted.

        Returns
        -------
        self : object,
            Return self.

        Notes
        -----
        ``n_samples_`` is only assigned when ``ratio`` is neither ``None``
        nor ``1.0``; otherwise ``ratio_`` is set to ``None`` and
        ``n_samples_`` is left untouched.
        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        if self.ratio is not None and self.ratio != 1.0:
            self.ratio_ = self.ratio
            # Use ``shape[0]`` rather than ``len(X)``: sparse input is
            # accepted above and scipy sparse matrices raise ``TypeError``
            # on ``len()``.
            self.n_samples_ = int(self.ratio_ * X.shape[0])
        else:
            self.ratio_ = None
        self.X_hash_, self.y_hash_ = hash_X_y(X, y)
        return self
Exemplo n.º 6
0
    def fit(self, X, y):
        """Validate the data, store its hashes and delegate to ``_fit``.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data; CSR and CSC sparse formats are accepted.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.
        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        y = check_target_type(y)

        # Record the hashes of the validated data on the instance.
        data_hashes = hash_X_y(X, y)
        self.X_hash_, self.y_hash_ = data_hashes

        # The estimator-specific fitting happens in ``_fit``.
        self._fit(X, y)
        return self
    def fit(self, X, y):
        """Validate the data, then store its hashes and the checked ratio.

        Parameters
        ----------
        X : 2d ndarray or scipy sparse matrix, shape [n_samples, n_features]
            Matrix containing the data which have to be sampled.

        y : 1d ndarray, shape [n_samples]
            Corresponding label for each sample in X.

        Returns
        -------
        self
            The fitted instance, with ``X_hash_``, ``y_hash_`` and
            ``ratio_`` set.
        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        y = check_target_type(y)

        # Record the hashes of the validated data on the instance.
        data_hashes = hash_X_y(X, y)
        self.X_hash_, self.y_hash_ = data_hashes

        # ``ratio_`` is whatever ``check_ratio`` derives from the
        # user-supplied ``ratio`` and the validated target.
        self.ratio_ = check_ratio(self.ratio, y)
        return self
Exemplo n.º 8
0
    def fit(self, X, y):
        """Compute per-class sample targets and build the two samplers.

        Each class count is compared with ``min_freq`` and ``max_freq``:

        * below ``min_freq``: the under-sampling target is the current
          count and the over-sampling target is ``min_freq``;
        * above ``max_freq``: both targets are ``max_freq``;
        * otherwise: both targets are the current count.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.
        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self, with ``ratio_``, ``X_hash_``, ``y_hash_``,
            ``under_sampler`` and ``over_sampler`` set.
        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
        y = check_target_type(y)
        self.ratio_ = self.ratio
        self.X_hash_, self.y_hash_ = hash_X_y(X, y)

        # ``np.unique(..., return_counts=True)`` pairs each label with its
        # count directly. Indexing ``np.bincount(y)`` by label only works
        # for non-negative integer labels and fails for negative or string
        # labels.
        labels, counts = np.unique(y, return_counts=True)
        under_dict = {}
        over_dict = {}
        for lbl, count in zip(labels, counts):
            if count < self.min_freq:
                under_dict[lbl] = count
                over_dict[lbl] = self.min_freq
            elif count > self.max_freq:
                under_dict[lbl] = self.max_freq
                over_dict[lbl] = self.max_freq
            else:
                under_dict[lbl] = count
                over_dict[lbl] = count

        self.under_sampler = RandomUnderSampler(
            ratio=under_dict, random_state=self.random_state)
        self.over_sampler = RandomOverSampler(
            ratio=over_dict, random_state=self.random_state)
        return self
Exemplo n.º 9
0
 def fit(self, X, y):
     """Set ``ratio_`` to 1, store the ``hash_X_y`` result and return self.

     The whole value returned by ``hash_X_y(X, y)`` is stored in
     ``X_hash_``; no ``y_hash_`` attribute is set and the inputs are not
     validated.
     """
     self.ratio_ = 1
     hashed = hash_X_y(X, y)
     self.X_hash_ = hashed
     return self
 def fit(self, X, y):
     """Minimal fit: fix ``ratio_`` at 1 and record the data hash.

     ``X_hash_`` receives the complete return value of ``hash_X_y(X, y)``.
     Nothing else is validated or stored.
     """
     # Constant ratio, independent of the data.
     self.ratio_ = 1
     data_hash = hash_X_y(X, y)
     self.X_hash_ = data_hash
     return self