Example no. 1
# Imports assumed by this benchmark snippet; helpers such as
# handle_missing_dataset, unpickle, CIFAR_FOLDER, SVHN_FOLDER and
# random_state are defined elsewhere in the surrounding script.
import numpy as np
import scipy as sp
import scipy.io
import scipy.sparse

from mrex.datasets import (fetch_20newsgroups_vectorized, fetch_lfw_people,
                           fetch_olivetti_faces, fetch_openml, fetch_rcv1,
                           make_low_rank_matrix, make_sparse_uncorrelated)


def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        # handle_missing_dataset is assumed to return 0 when the folder is
        # missing (as in the SVHN branch below); the original "skip"
        # comparison would never match that sentinel.
        if handle_missing_dataset(CIFAR_FOLDER) == 0:
            return
        X1 = [
            unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
            for i in range(5)
        ]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        X = make_low_rank_matrix(n_samples=500,
                                 n_features=int(1e4),
                                 effective_rank=100,
                                 tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500,
                                        n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        data = np.random.normal(0, 1, sparsity // 10)
        data = np.repeat(data, 10)
        # csr_matrix needs integer indices; randint replaces the original
        # float-valued np.random.uniform draw.
        row = np.random.randint(0, small_size, sparsity)
        col = np.random.randint(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_openml(dataset_name).data
    return X
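
A hypothetical call site, to show the contract: branches that skip a missing dataset return None, everything else returns a (possibly sparse) matrix.

X = get_data('olivetti_faces')
if X is not None:
    print("Loaded a matrix of shape %s" % (X.shape,))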
Example no. 2
# Imports assumed by this test snippet; check_return_X_y is a helper
# shared by the surrounding test module.
import numpy as np
from numpy.testing import assert_array_equal

from mrex import datasets
from mrex.utils import Bunch


def test_olivetti_faces():
    data = datasets.fetch_olivetti_faces(shuffle=True, random_state=0)

    assert isinstance(data, Bunch)
    for expected_key in ('data', 'images', 'target', 'DESCR'):
        assert expected_key in data.keys()

    assert data.data.shape == (400, 4096)
    assert data.images.shape == (400, 64, 64)
    assert data.target.shape == (400, )
    assert_array_equal(np.unique(data.target), np.arange(40))  # unique() already sorts

    # test the return_X_y option
    check_return_X_y(data, datasets.fetch_olivetti_faces)
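
For reference, the shared helper roughly performs the following checks (a sketch of its contract, not the exact implementation):

def check_return_X_y(bunch, fetch_func):
    # Fetching with return_X_y=True must yield a (data, target) tuple
    # whose shapes match what is stored in the Bunch.
    X_y_tuple = fetch_func(return_X_y=True)
    assert isinstance(X_y_tuple, tuple)
    assert X_y_tuple[0].shape == bunch.data.shape
    assert X_y_tuple[1].shape == bunch.target.shape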
Example no. 3
The code below also illustrates how the construction of the forest and the
computation of the predictions can be parallelized across multiple jobs.
"""
print(__doc__)

from time import time
import matplotlib.pyplot as plt

from mrex.datasets import fetch_olivetti_faces
from mrex.ensemble import ExtraTreesClassifier

# Number of cores to use to perform parallel fitting of the forest model
n_jobs = 1

# Load the faces dataset
data = fetch_olivetti_faces()
X, y = data.data, data.target

mask = y < 5  # Limit to 5 classes
X = X[mask]
y = y[mask]

# Build a forest and compute the pixel importances
print("Fitting ExtraTreesClassifier on faces data with %d cores..." % n_jobs)
t0 = time()
forest = ExtraTreesClassifier(n_estimators=1000,
                              max_features=128,
                              n_jobs=n_jobs,
                              random_state=0)

forest.fit(X, y)
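
The excerpt stops right after the fit; a plausible continuation, timing the fit and re-fitting with all available cores (a sketch, not the original example's exact code):

print("done in %0.3fs" % (time() - t0))

# Re-fit using every available core: the n_estimators trees are
# independent, so joblib distributes their construction across workers.
forest.set_params(n_jobs=-1)
t0 = time()
forest.fit(X, y)
print("done in %0.3fs with n_jobs=-1" % (time() - t0))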
Example no. 4
from mrex import datasets  # import assumed by this snippet


def _is_olivetti_faces_not_available():
    try:
        datasets.fetch_olivetti_faces(download_if_missing=False)
        return False
    except IOError:
        return True
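
Such a helper is typically wired into a skip marker; a minimal sketch assuming pytest (the test name is hypothetical):

import pytest

@pytest.mark.skipif(_is_olivetti_faces_not_available(),
                    reason="Olivetti faces dataset is not available")
def test_faces_shape():
    data = datasets.fetch_olivetti_faces(download_if_missing=False)
    assert data.data.shape == (400, 4096)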
Example no. 5
# Imports assumed by this snippet
import logging

import matplotlib.pyplot as plt
from numpy.random import RandomState

from mrex.datasets import fetch_olivetti_faces
from mrex.cluster import MiniBatchKMeans
from mrex import decomposition

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
n_row, n_col = 2, 3
n_components = n_row * n_col
image_shape = (64, 64)
rng = RandomState(0)

# #############################################################################
# Load faces data
faces, _ = fetch_olivetti_faces(return_X_y=True,
                                shuffle=True,
                                random_state=rng)
n_samples, n_features = faces.shape

# global centering: subtract the per-pixel mean computed over all faces
faces_centered = faces - faces.mean(axis=0)

# local centering: subtract each image's own mean so that per-face
# lighting differences do not dominate the decomposition
faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1)

print("Dataset consists of %d faces" % n_samples)


def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray):
    plt.figure(figsize=(2. * n_col, 2.26 * n_row))
    plt.suptitle(title, size=16)
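    # The excerpt ends here; a plausible completion of the gallery loop,
    # reconstructed from the upstream example this snippet derives from
    # (symmetric diverging color scale assumed):
    for i, comp in enumerate(images):
        plt.subplot(n_row, n_col, i + 1)
        vmax = max(comp.max(), -comp.min())
        plt.imshow(comp.reshape(image_shape), cmap=cmap,
                   interpolation='nearest', vmin=-vmax, vmax=vmax)
        plt.xticks(())
        plt.yticks(())
    plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)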
Example no. 6
"""
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from mrex.datasets import fetch_olivetti_faces
from mrex.utils.validation import check_random_state

from mrex.ensemble import ExtraTreesRegressor
from mrex.neighbors import KNeighborsRegressor
from mrex.linear_model import LinearRegression
from mrex.linear_model import RidgeCV

# Load the faces dataset
data, targets = fetch_olivetti_faces(return_X_y=True)

train = data[targets < 30]
test = data[targets >= 30]  # Test on independent people

# Test on a subset of people
n_faces = 5
rng = check_random_state(4)
face_ids = rng.randint(test.shape[0], size=(n_faces, ))
test = test[face_ids, :]

n_pixels = data.shape[1]
# Upper half of the faces
X_train = train[:, :(n_pixels + 1) // 2]
# Lower half of the faces
y_train = train[:, n_pixels // 2:]
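
The excerpt cuts off before the matching test split; completing it with the same upper/lower convention (a sketch):

# Upper half of the test faces (model input)
X_test = test[:, :(n_pixels + 1) // 2]
# Lower half of the test faces (ground truth to reconstruct)
y_test = test[:, n_pixels // 2:]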
Example no. 7
def load_faces():
    print("Loading Olivetti face dataset")
    print("-----------------------------")
    from mrex.datasets import fetch_olivetti_faces
    faces = fetch_olivetti_faces(shuffle=True)
    return faces.data
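
A hypothetical call site, for completeness:

X = load_faces()
print("Faces matrix: %d samples x %d pixels" % X.shape)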
Example no. 8
partial-fit. This is because the number of patches that they represent
has become too low, and it is better to choose a random new
cluster.
"""
print(__doc__)

import time

import matplotlib.pyplot as plt
import numpy as np

from mrex import datasets
from mrex.cluster import MiniBatchKMeans
from mrex.feature_extraction.image import extract_patches_2d

faces = datasets.fetch_olivetti_faces()

# #############################################################################
# Learn the dictionary of images

print('Learning the dictionary... ')
rng = np.random.RandomState(0)
kmeans = MiniBatchKMeans(n_clusters=81, random_state=rng, verbose=True)
patch_size = (20, 20)

buffer = []
t0 = time.time()

# The online learning part: cycle over the whole dataset 6 times
index = 0
for _ in range(6):
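    # The excerpt ends here; a plausible loop body, reconstructed from the
    # upstream example this snippet derives from: patches are buffered and
    # handed to partial_fit in mini-batches.
    for img in faces.images:
        data = extract_patches_2d(img, patch_size, max_patches=50,
                                  random_state=rng)
        data = np.reshape(data, (len(data), -1))
        buffer.append(data)
        index += 1
        if index % 10 == 0:
            data = np.concatenate(buffer, axis=0)
            data -= np.mean(data, axis=0)
            data /= np.std(data, axis=0)
            kmeans.partial_fit(data)
            buffer = []

dt = time.time() - t0
print('done in %.2fs.' % dt)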