예제 #1
0
def test_random_choice_csc(n_samples=10000, random_state=24):
    """Check the per-column class frequencies drawn by random_choice_csc.

    Four scenarios are exercised: explicit class probabilities, implicit
    probabilities (none passed), degenerate probabilities of exactly 1.0 and
    0.0, and targets with a single class per column.  In each scenario the
    result must be reported as sparse by ``sp.issparse`` and the observed
    frequency of every class value in every column must match the expected
    probability to one decimal place.
    """

    def check_frequencies(got, expected_probabilities):
        assert_true(sp.issparse(got))

        for k, expected in enumerate(expected_probabilities):
            column = got.getcol(k).toarray().ravel()
            # minlength pads the histogram with zeros, so trailing class
            # values that were never drawn still get a frequency of 0
            # instead of producing a shorter array.
            counts = np.bincount(column, minlength=len(expected))
            # float() keeps this a true division even when the module does
            # not enable ``from __future__ import division``.
            p = counts / float(n_samples)
            assert_array_almost_equal(expected, p, decimal=1)

    # Explicit class probabilities
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilites = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]

    got = random_choice_csc(n_samples, classes, class_probabilites,
                            random_state)
    check_frequencies(got, class_probabilites)

    # Implicit class probabilities: nothing is passed to the sampler, and the
    # listed classes are expected to come out equally often.
    classes = [[0, 1], [1, 2]]  # test for array-like support
    class_probabilites = [np.array([0.5, 0.5]), np.array([0, 0.5, 0.5])]

    got = random_choice_csc(n_samples=n_samples,
                            classes=classes,
                            random_state=random_state)
    check_frequencies(got, class_probabilites)

    # Edge case probabilities 1.0 and 0.0
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilites = [np.array([1.0, 0.0]), np.array([0.0, 1.0, 0.0])]

    got = random_choice_csc(n_samples, classes, class_probabilites,
                            random_state)
    check_frequencies(got, class_probabilites)

    # One class target data
    classes = [[1], [0]]  # test for array-like support
    class_probabilites = [np.array([0.0, 1.0]), np.array([1.0])]

    got = random_choice_csc(n_samples=n_samples,
                            classes=classes,
                            random_state=random_state)
    check_frequencies(got, class_probabilites)
예제 #2
0
def test_random_choice_csc(n_samples=10000, random_state=24):
    """Verify random_choice_csc output against expected class frequencies.

    The scenarios are listed in a table below.  For every scenario the
    returned matrix must satisfy ``sp.issparse`` and, column by column, the
    observed frequency of each class value must agree with the expected
    probability to one decimal place.
    """
    explicit = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
    degenerate = [np.array([1.0, 0.0]), np.array([0.0, 1.0, 0.0])]

    # Each entry: (classes, probabilities handed to the sampler or None to
    # omit the argument, expected per-column class frequencies).
    scenarios = [
        # Explicit class probabilities
        ([np.array([0, 1]), np.array([0, 1, 2])], explicit, explicit),
        # Implicit class probabilities (also tests array-like support)
        ([[0, 1], [1, 2]], None,
         [np.array([0.5, 0.5]), np.array([0, 0.5, 0.5])]),
        # Edge case probabilities 1.0 and 0.0
        ([np.array([0, 1]), np.array([0, 1, 2])], degenerate, degenerate),
        # One class target data (also tests array-like support)
        ([[1], [0]], None, [np.array([0.0, 1.0]), np.array([1.0])]),
    ]

    # Float divisor: true division regardless of the module's division mode.
    total = float(n_samples)

    for classes, given_probabilities, expected_probabilities in scenarios:
        if given_probabilities is None:
            # Exercise the default-argument path with keyword arguments.
            got = random_choice_csc(n_samples=n_samples,
                                    classes=classes,
                                    random_state=random_state)
        else:
            got = random_choice_csc(n_samples, classes, given_probabilities,
                                    random_state)
        assert_true(sp.issparse(got))

        for k, expected in enumerate(expected_probabilities):
            # minlength guarantees one bin per expected class even when the
            # highest class values were never sampled.
            counts = np.bincount(got.getcol(k).toarray().ravel(),
                                 minlength=len(expected))
            assert_array_almost_equal(expected, counts / total, decimal=1)
예제 #3
0
    def predict(self, X):
        """
        Perform classification on test vectors X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Input vectors, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        y : array, shape = [n_samples] or [n_samples, n_outputs]
            Predicted target values for X.  When ``self.sparse_output_`` is
            set, the value returned by ``random_choice_csc`` is returned
            instead.

        Raises
        ------
        ValueError
            If the estimator has no ``classes_`` attribute (not fitted), or
            if sparse output is combined with the "uniform" strategy.
        """
        if not hasattr(self, "classes_"):
            raise ValueError("DummyClassifier not fitted.")

        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        # numpy random_state expects Python int and not long as size argument
        # under Windows
        n_samples = int(X.shape[0])
        rs = check_random_state(self.random_state)

        n_classes_ = self.n_classes_
        classes_ = self.classes_
        class_prior_ = self.class_prior_
        constant = self.constant
        if self.n_outputs_ == 1:
            # Get same type even for self.n_outputs_ == 1
            n_classes_ = [n_classes_]
            classes_ = [classes_]
            class_prior_ = [class_prior_]
            constant = [constant]
        # Compute probability only once
        if self.strategy == "stratified":
            proba = self.predict_proba(X)
            if self.n_outputs_ == 1:
                proba = [proba]

        if self.sparse_output_:
            # class_prob stays None unless the strategy supplies per-output
            # probabilities; classes_ may be narrowed to a single candidate
            # per output.
            class_prob = None
            if self.strategy == "most_frequent":
                classes_ = [np.array([cp.argmax()]) for cp in class_prior_]

            elif self.strategy == "stratified":
                class_prob = class_prior_

            elif self.strategy == "uniform":
                raise ValueError("Sparse target prediction is not "
                                 "supported with the uniform strategy")

            elif self.strategy == "constant":
                classes_ = [np.array([c]) for c in constant]

            y = random_choice_csc(n_samples, classes_, class_prob,
                                  self.random_state)
        else:
            if self.strategy == "most_frequent":
                y = np.tile([classes_[k][class_prior_[k].argmax()] for
                             k in range(self.n_outputs_)], [n_samples, 1])

            elif self.strategy == "stratified":
                # np.vstack needs a real sequence: passing a generator is
                # deprecated since NumPy 1.16 and rejected by newer versions.
                y = np.vstack([classes_[k][proba[k].argmax(axis=1)] for
                               k in range(self.n_outputs_)]).T

            elif self.strategy == "uniform":
                ret = [classes_[k][rs.randint(n_classes_[k], size=n_samples)]
                       for k in range(self.n_outputs_)]
                y = np.vstack(ret).T

            elif self.strategy == "constant":
                y = np.tile(self.constant, (n_samples, 1))

            # Flatten to 1-D for a single output unless 2-D output was
            # recorded in ``self.output_2d_``.
            if self.n_outputs_ == 1 and not self.output_2d_:
                y = np.ravel(y)

        return y
def test_random_choice_csc():
    """Calling random_choice_csc must emit the expected DeprecationWarning."""
    expected_message = "removed in version 0.24"
    single_class_column = [[2]]
    with pytest.warns(DeprecationWarning, match=expected_message):
        random_choice_csc(10, single_class_column)
def streaming_file_projections(
        train_dir='train', test_dir='test', file_ext='bytes', dim=256,
        percentile=75, n_jobs=-1):
    """Project every train/test file through a shared sparse random matrix.

    The function collects the file paths of both directories, derives a
    common byte-length ``cutoff`` from the training file sizes, builds a
    sparse random projection matrix ``R`` of shape ``(cutoff, dim * dim)``
    with entries drawn from {-1, 0, 1}, hands every file to a conversion
    helper (``mappable_convert_and_project`` in-process, or
    ``convert_project_shared`` in a worker pool), and finally stores ``R``
    via ``save_csr_matrix``.

    NOTE(review): the log messages speak of converting files to png; what the
    helpers actually write is not visible from this function -- confirm there.

    Parameters
    ----------
    train_dir, test_dir : str
        Directories handed to ``file_paths`` to list the input files.
    file_ext : str
        File extension handed to ``file_paths`` and to the helpers.
    dim : int
        The projection has ``dim * dim`` components, and ``cutoff`` is
        rounded to a multiple of ``dim``.
    percentile : number
        Percentile of the training file sizes used to pick ``cutoff``.
    n_jobs : int
        Number of worker processes.  Non-positive values mean "CPU count
        minus two", but never fewer than one.
    """
    # Declared up front: the multi-process branch publishes R to the worker
    # processes through these module-level names.
    global shared_data
    global shared_indices
    global shared_indptr
    global shared_shape

    train_paths = file_paths(train_dir, file_ext)
    test_paths = file_paths(test_dir, file_ext)
    logging.info('converting %d %s training files to png files',
                 len(train_paths), file_ext)
    logging.info('converting %d %s testing files to png files',
                 len(test_paths), file_ext)

    # Determine normalized file length; balance trade-off on padding vs. loss
    # of info from file trimming.  An explicit 64-bit dtype replaces the
    # removed ``np.int`` alias and keeps the sums below from overflowing.
    sizes = np.array([os.path.getsize(path) for path in train_paths],
                     dtype=np.int64)
    cutoff = int(dim * np.round(np.percentile(sizes, percentile) / dim))
    logging.info('using cutoff of %d', cutoff)

    # Log some info on what kind of tradeoff is being made.
    MB = 1024 ** 2  # bytes per (binary) megabyte
    diff = sizes - cutoff
    trimmed = float(diff[diff > 0].sum())
    padded = float(abs(diff[diff < 0].sum()))

    logging.info('%.2fMB will be trimmed', trimmed / MB)
    logging.info('%.2fMB will be padded', padded / MB)
    if padded > 0:
        logging.info('trim-to-pad ratio: %d / 1000',
                     1000 * (trimmed / padded))
    else:
        # No training file is shorter than the cutoff, so the ratio is
        # undefined; avoid dividing by zero.
        logging.info('trim-to-pad ratio: undefined (nothing will be padded)')

    # Build random projection matrix R.
    # We interpret the cutoff as the number of features ("pixels").
    # s = 1 / density, where density = 1 / sqrt(n_features).
    n_components = dim * dim  # reduced dimension/rank after projection
    logging.info('constructing random projection matrix R (%d x %d)',
                 cutoff, n_components)

    s = np.sqrt(cutoff)
    # NOTE(review): the original also computed sqrt(s / n_components) into an
    # unused local; that scaling is not applied to R anywhere in this
    # function -- confirm whether the helpers are expected to apply it.
    vals = np.array([-1, 0, 1], dtype=np.int8)
    probs = np.array([1 / (2 * s), 1 - (1 / s), 1 / (2 * s)])
    probs = probs / probs.sum()  # remove rounding error

    # Now create R as a sparse csc matrix. This function from sklearn requires
    # "classes" for each column and class probability distributions for each.
    # https://github.com/scikit-learn/scikit-learn/blob/51a765acfa4c5d1ec05fc4b406968ad233c75162/sklearn/utils/random.py#L205
    k = vals.shape[0]
    classes = np.tile(vals, n_components).reshape(n_components, k)
    class_probs = np.tile(probs, n_components).reshape(n_components, k)
    R = skrandom.random_choice_csc(
        n_samples=cutoff, classes=classes, class_probability=class_probs)
    R = R.tocsr()  # especially suitable for fast matrix vector products

    R_nbytes = R.data.nbytes + R.indices.nbytes + R.indptr.nbytes
    R_mb = float(R_nbytes) / MB
    logging.info('done building projection matrix R (%.2fMB)', R_mb)

    # Map work across all files, distributed based on n_jobs.  Clamp the
    # automatic choice so small machines still get one worker.
    if n_jobs <= 0:
        n_jobs = max(1, mp.cpu_count() - 2)
    logging.info('converting files using %d processes', n_jobs)

    # Convert both train and test sets.
    all_paths = itertools.chain(train_paths, test_paths)

    if n_jobs == 1:
        # An explicit loop runs eagerly on both Python 2 and Python 3; a bare
        # ``map`` would be lazy (and therefore a no-op) on Python 3.
        for path in all_paths:
            mappable_convert_and_project((path, cutoff, dim, R, file_ext))

        save_csr_matrix(R, file_ext, percentile, dim)
        return

    # If we have more than one job, we'll want to share R.
    # Create shared memory space for the projection matrix R.
    shared_data = mp.Array(ctypes.c_double, R.data.shape[0], lock=False)
    shared_indices = mp.Array(ctypes.c_int32, R.indices.shape[0], lock=False)
    shared_indptr = mp.Array(ctypes.c_int32, R.indptr.shape[0], lock=False)
    shared_shape = mp.Array(ctypes.c_int32, len(R.shape), lock=False)

    # Fill shared memory with R data.
    shared_data[:] = R.data
    shared_indices[:] = R.indices
    shared_indptr[:] = R.indptr
    shared_shape[:] = R.shape

    # One argument tuple per file; R itself travels via the shared arrays.
    work_items = [(path, cutoff, dim, file_ext) for path in all_paths]
    pool = mp.Pool(processes=n_jobs)
    try:
        pool.map(convert_project_shared, work_items)
    finally:
        # Always release the worker processes, even if a task failed.
        pool.close()
        pool.join()

    save_csr_matrix(R, file_ext, percentile, dim)