Example #1
def data():
    """Load CIFAR-10; return ((Xtr, ytr), (Xva, yva), (Xte, yte))."""
    fname = _download('http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz')
    with _taropen(fname, 'r') as f:
        # The first four batches are used as training set...
        datas, labels = [], []
        for i in range(1, 5):
            with f.extractfile('cifar-10-batches-py/data_batch_' + str(i)) as b:
                batch = _pickle.load(b, encoding='latin1')
                datas.append(_np.array(batch['data'], dtype=_np.float32))
                labels.append(_np.array(batch['labels']))
        Xtr = _np.concatenate(datas)
        ytr = _np.concatenate(labels)
        Xtr /= 255

        # ... and the fifth as validation set as described in cuda-convnet:
        # https://code.google.com/p/cuda-convnet/wiki/Methodology
        with f.extractfile('cifar-10-batches-py/data_batch_5') as b:
            batch = _pickle.load(b, encoding='latin1')
        Xva = _np.array(batch['data'], dtype=_np.float32)
        yva = _np.array(batch['labels'])
        Xva /= 255

        with f.extractfile('cifar-10-batches-py/test_batch') as b:
            batch = _pickle.load(b, encoding='latin1')
        Xte = _np.array(batch['data'], dtype=_np.float32)
        yte = _np.array(batch['labels'])
        Xte /= 255

    return (Xtr, ytr), (Xva, yva), (Xte, yte)
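The snippet relies on module-level aliases that are not shown here: plausibly 'import numpy as _np', 'import pickle as _pickle', 'from tarfile import open as _taropen', and a _download helper that caches the URL locally and returns the file path. A minimal usage sketch under those assumptions, reshaping the flat 3072-value rows into channel-first images (CIFAR-10 stores each row as 1024 red, then 1024 green, then 1024 blue values):

# Usage sketch (assumes the aliases above are in scope).
(Xtr, ytr), (Xva, yva), (Xte, yte) = data()
images = Xtr.reshape(-1, 3, 32, 32)  # channel-first 32x32 RGB
print(images.shape)                  # (40000, 3, 32, 32): four 10k batches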
Example #2
def data(fold=False):
    """Load the Stanford Background (iccv09) dataset; optionally split into folds."""
    fname = df.zoo.download('http://dags.stanford.edu/data/iccv09Data.tar.gz')

    # extracting files one-by-one in memory is unfortunately WAY too slow
    # for this dataset. So we bite the bullet and extract the full tgz.

    where = _p.dirname(fname)
    imgdir = 'iccv09Data/images/'

    with _taropen(fname, 'r') as f:
        f.extractall(where)
        ids = [_p.basename(n)[:-4] for n in f.getnames() if n.startswith(imgdir)]

    X = [imread(_p.join(where, imgdir, i) + '.jpg') for i in ids]
    y = [_np.loadtxt(_p.join(where, 'iccv09Data/labels', i) + '.regions.txt', dtype=_np.int32) for i in ids]
    # I personally don't believe in the other label types.

    # Region class names; wrap them in a LabelEncoder when scikit-learn is
    # available, otherwise return the plain array of names.
    classes = _np.array(['sky', 'tree', 'road', 'grass', 'water', 'building',
                         'mountain', 'foreground', 'object'])
    try:
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        le.classes_ = classes
    except ImportError:
        le = classes

    # Identity check against False (not truthiness): fold=0 is a valid fold index.
    if fold is False:
        return X, y, le

    lo, hi = fold*ntest(), (fold+1)*ntest()
    Xtr = X[:lo] + X[hi:]
    ytr = y[:lo] + y[hi:]
    Xte = X[lo:hi]
    yte = y[lo:hi]
    return (Xtr, ytr), (Xte, yte), le
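Here imread, ntest and _p are assumed to come from the surrounding module (plausibly an image reader such as scipy's or skimage's imread, os.path as _p, and a helper returning the per-fold test count). A hedged usage sketch: the Stanford Background Dataset contains 715 images, so fold=False returns all of them, while an integer fold selects a contiguous test slice:

# Usage sketch for the fold parameter.
X, y, le = data()              # all 715 images plus the label encoder
(Xtr, ytr), (Xte, yte), le = data(fold=0)
print(len(Xtr), len(Xte))      # train/test sizes for the first fold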
Example #3
def data():
    """Load CIFAR-100; return train/val/test triples plus (coarse, fine) label encoders."""
    fname = _download(
        'http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz')
    with _taropen(fname, 'r') as f:
        with f.extractfile('cifar-100-python/train') as train:
            train = _pickle.load(train, encoding='latin1')
        Xtr = _np.array(train['data'], dtype=_np.float32)
        ytr_c = _np.array(train['coarse_labels'])
        ytr_f = _np.array(train['fine_labels'])
        Xtr /= 255

        # There is no "official" validation set here that I know of!

        # But the maxout paper uses the last 10k samples as validation.
        Xtr, Xva = Xtr[:-10000], Xtr[-10000:]
        ytr_c, yva_c = ytr_c[:-10000], ytr_c[-10000:]
        ytr_f, yva_f = ytr_f[:-10000], ytr_f[-10000:]

        with f.extractfile('cifar-100-python/test') as test:
            test = _pickle.load(test, encoding='latin1')
        Xte = _np.array(test['data'], dtype=_np.float32)
        yte_c = _np.array(test['coarse_labels'])
        yte_f = _np.array(test['fine_labels'])
        Xte /= 255

        # Get the label names additionally.
        with f.extractfile('cifar-100-python/meta') as m:
            m = _pickle.load(m, encoding='latin1')

        try:
            from sklearn.preprocessing import LabelEncoder
            le_c = LabelEncoder()
            le_c.classes_ = _np.array(m['coarse_label_names'])
            le_f = LabelEncoder()
            le_f.classes_ = _np.array(m['fine_label_names'])
        except ImportError:
            le_c = _np.array(m['coarse_label_names'])
            le_f = _np.array(m['fine_label_names'])

    return ((Xtr, ytr_c, ytr_f),
            (Xva, yva_c, yva_f),
            (Xte, yte_c, yte_f),
            (le_c, le_f))
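When scikit-learn is available, the returned le_c/le_f are LabelEncoder instances whose inverse_transform maps integer labels back to names; otherwise they are plain arrays indexable by label. A small sketch handling both cases:

(Xtr, ytr_c, ytr_f), (Xva, yva_c, yva_f), (Xte, yte_c, yte_f), (le_c, le_f) = data()
try:
    names = le_f.inverse_transform(yte_f[:5])  # sklearn LabelEncoder path
except AttributeError:
    names = le_f[yte_f[:5]]                    # plain-array fallback
print(names)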