import os
import time as t

import theano

import mlpython.datasets.store as dataset_store
import mlpython.mlproblems.generic as mlpb

# Note: get_done_text() and Dataset._clean() are assumed to be defined
# elsewhere in the original module; they are not part of MLPython.


def get(dataset_name):
        # Datasets known to work with the current model
        datasets = ['adult',
                    'binarized_mnist',
                    'connect4',
                    'dna',
                    'mushrooms',
                    'nips',
                    'ocr_letters',
                    'rcv1',
                    'rcv2_russ',
                    'web']

        # Setup dataset env
        if dataset_name not in datasets:
            raise ValueError('Dataset unknown: ' + dataset_name)
        mldataset = __import__('mlpython.datasets.' + dataset_name, globals(), locals(), [dataset_name], -1)
        datadir = os.path.join(os.getenv("MLPYTHON_DATASET_REPO"), dataset_name)

        # Verify that the dataset exists locally and, if not, download it
        if not os.path.exists(datadir):
            dataset_store.download(dataset_name)

        print('### Loading dataset [{0}] ...'.format(dataset_name))
        start_time = t.time()

        all_data = mldataset.load(datadir, load_to_memory=True)
        train_data, train_metadata = all_data['train']

        if dataset_name in ('binarized_mnist', 'nips'):
            trainset = mlpb.MLProblem(train_data, train_metadata)
        else:
            trainset = mlpb.SubsetFieldsProblem(train_data, train_metadata)

        trainset.setup()

        valid_data, valid_metadata = all_data['valid']

        validset = trainset.apply_on(valid_data, valid_metadata)

        test_data, test_metadata = all_data['test']
        testset = trainset.apply_on(test_data, test_metadata)

        # Clean up, package, and convert to Theano shared variables
        full_dataset = {'input_size': trainset.metadata['input_size']}

        trainset_theano = theano.shared(value=Dataset._clean(trainset), borrow=True)
        validset_theano = theano.shared(value=Dataset._clean(validset), borrow=True)
        testset_theano = theano.shared(value=Dataset._clean(testset), borrow=True)

        full_dataset['train'] = {'data': trainset_theano, 'length': all_data['train'][1]['length']}
        full_dataset['valid'] = {'data': validset_theano, 'length': all_data['valid'][1]['length']}
        full_dataset['test'] = {'data': testset_theano, 'length': all_data['test'][1]['length']}

        print("(Dim:{0} Train:{1} Valid:{2} Test:{3})".format(trainset.metadata['input_size'], full_dataset['train']['length'], full_dataset['valid']['length'], full_dataset['test']['length']))
        print(get_done_text(start_time) + " ###")
        return full_dataset
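
A minimal driver sketch for the loader above (hypothetical code: it assumes MLPYTHON_DATASET_REPO is set, Theano is installed, and `get` is callable as shown, even though the `Dataset._clean` call suggests it originally lived on a `Dataset` class):

import os
os.environ.setdefault('MLPYTHON_DATASET_REPO', '/tmp/mlpython_datasets')  # hypothetical repo path

full_dataset = get('binarized_mnist')
train_shared = full_dataset['train']['data']   # Theano shared variable holding the design matrix
n_train = full_dataset['train']['length']      # number of training examples
print('input_size={0}, n_train={1}'.format(full_dataset['input_size'], n_train))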
Example #2
import mlpython.mlproblems.generic as mlpb


def load_data(dataset_name):
    # 'root' is assumed to be defined at module level (path to the project root).
    datadir = root + '/data/' + dataset_name + '/'
    # Import the dataset module dynamically instead of building exec strings.
    mldataset = __import__('mlpython.datasets.' + dataset_name,
                           globals(), locals(), [dataset_name], -1)
    all_data = mldataset.load(datadir, load_to_memory=True)
    train_data, train_metadata = all_data['train']
    if dataset_name in ('binarized_mnist', 'nips'):
        trainset = mlpb.MLProblem(train_data, train_metadata)
    else:
        trainset = mlpb.SubsetFieldsProblem(train_data, train_metadata)
    trainset.setup()
    valid_data, valid_metadata = all_data['valid']
    validset = trainset.apply_on(valid_data, valid_metadata)
    test_data, test_metadata = all_data['test']
    testset = trainset.apply_on(test_data, test_metadata)

    # Pull the raw in-memory input matrices out of the MLProblem wrappers.
    train_X = trainset.data.mem_data[0]
    valid_X = validset.data.mem_data[0]
    test_X = testset.data.mem_data[0]
    return train_X, valid_X, test_X
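
A usage sketch (hypothetical; `root` must be defined in the same module as `load_data`, and `root + '/data/' + dataset_name` must already contain the downloaded dataset):

root = '/path/to/project'  # hypothetical project root containing a data/ subfolder
train_X, valid_X, test_X = load_data('binarized_mnist')
print train_X.shape, valid_X.shape, test_X.shape  # NumPy arrays of the raw inputs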
Example #3
import os
import itertools
import numpy as np
import fcntl
import copy
from string import Template
import mlpython.datasets.store as dataset_store
import mlpython.mlproblems.generic as mlpb
from rbm import RBM
#from autoencoder import Autoencoder

print "Loading dataset..."
trainset,validset,testset = dataset_store.get_classification_problem('ocr_letters')
print "Train RBM for 10 iterations... (this might take a few minutes)"
rbm = RBM(n_epochs=10,
          hidden_size=200,
          lr=0.01,
          CDk=1,
          seed=1234)

rbm.train(mlpb.SubsetFieldsProblem(trainset))
rbm.show_filters()
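
The script trains a single configuration; a small sketch of sweeping the hidden-layer size, reusing exactly the constructor arguments and training call shown above:

# Hypothetical sweep over hidden-layer sizes using the same RBM API as above.
for hidden_size in [50, 100, 200]:
    rbm = RBM(n_epochs=10, hidden_size=hidden_size, lr=0.01, CDk=1, seed=1234)
    rbm.train(mlpb.SubsetFieldsProblem(trainset))
    rbm.show_filters()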

Example #4
# Continuation of an MLProblem pipeline example: pb4 and the module aliases
# mlpbgen/mlpbclass are defined earlier in the script (see Example #5 below).
pb5 = mlpbclass.ClassificationProblem(pb4)
print 'pb5:'
for example in pb5:
    print example
print 'metadata:', pb5.metadata
print ''

pb6 = mlpbclass.ClassSubsetProblem(pb5, subset=set(['A', 'C']))
print 'pb6:'
for example in pb6:
    print example
print 'metadata:', pb6.metadata
print ''

pb7 = mlpbgen.SubsetFieldsProblem(pb6, fields=[0, 0, 1])
print 'pb7 (final):'
for example in pb7:
    print example
print 'metadata:', pb7.metadata
print ''

print 'What is expected:'
final_data = zip([(1, 1), (3, 3), (0, 0), (1, 1), (3, 3), (4, 4), (5, 5)],
                 [(1, 1), (3, 3), (0, 0), (1, 1), (3, 3), (4, 4), (5, 5)],
                 [0, 1, 0, 0, 1, 0])
for example in final_data:
    print example
print ''

raw_data2 = zip(range(6, 10), ['C', 'B', 'A', 'C'])
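
The snippet ends abruptly at `raw_data2`; a minimal sketch of how it would typically be consumed (mirroring Example #5 below), replaying the trained pb7 pipeline on the new data with `apply_on`:

metadata2 = {'length': 4, 'targets': ['A', 'B', 'C'], 'input_size': 1}
pbtest = pb7.apply_on(raw_data2, metadata2)  # reuse the fitted pipeline on new data
print 'pbtest:'
for example in pbtest:
    print example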
Example #5
import mlpython.mlproblems.generic as mlpbgen
# Assumed module path for the classification MLProblems used below.
import mlpython.mlproblems.classification as mlpbclass


def test_mlproblem_combinations():
    """
    Test a combination of many different MLProblems.
    """

    raw_data = zip(range(6), ['A', 'A', 'B', 'C', 'A', 'B'])
    metadata = {'length': 6, 'targets': ['A', 'B', 'C'], 'input_size': 1}

    def features(example, metadata):
        metadata['input_size'] = 2
        return ((example[0], example[0]), example[1])

    pb1 = mlpbgen.MLProblem(raw_data, metadata)
    print 'pb1', pb1.metadata
    pb2 = mlpbgen.SubsetProblem(pb1, subset=set([1, 3, 5]))
    print 'pb2', pb2.metadata
    pb3 = mlpbgen.MergedProblem([pb2, pb1])
    print 'pb3', pb3.metadata
    pb4 = mlpbgen.PreprocessedProblem(pb3, preprocess=features)
    print 'pb4', pb4.metadata
    pb5 = mlpbclass.ClassificationProblem(pb4)
    print 'pb5', pb5.metadata
    pb6 = mlpbclass.ClassSubsetProblem(pb5, subset=set(['A', 'C']))
    print 'pb6', pb6.metadata
    pb7 = mlpbgen.SubsetFieldsProblem(pb6, fields=[0, 0, 1])
    print 'pb7', pb7.metadata

    final_data = [[(1, 1), (1, 1), 0], [(3, 3), (3, 3), 1], [(0, 0), (0, 0), 0],
                  [(1, 1), (1, 1), 0], [(3, 3), (3, 3), 1], [(4, 4), (4, 4), 0]]
    final_metadata = {
        'input_size': 2,
        'targets': set(['A', 'C']),
        'class_to_id': {
            'A': 0,
            'C': 1
        }
    }

    for ex1, ex2 in zip(pb7, final_data):
        assert cmp(ex1, ex2) == 0
    print pb7.metadata, final_metadata
    assert cmp(pb7.metadata, final_metadata) == 0

    raw_data2 = zip(range(6, 10), ['C', 'B', 'A', 'C'])
    metadata2 = {'length': 4, 'targets': ['A', 'B', 'C'], 'input_size': 1}

    pbtest = pb7.apply_on(raw_data2, metadata2)
    final_test_data = [[(6, 6), (6, 6), 1], [(8, 8), (8, 8), 0],
                       [(9, 9), (9, 9), 1]]
    final_test_metadata = {
        'input_size': 2,
        'targets': set(['A', 'C']),
        'class_to_id': {
            'A': 0,
            'C': 1
        }
    }

    for ex1, ex2 in zip(pbtest, final_test_data):
        assert cmp(ex1, ex2) == 0
    assert cmp(pbtest.metadata, final_test_metadata) == 0
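
A minimal entry point for running the test standalone (a sketch; it assumes the imports above):

if __name__ == '__main__':
    test_mlproblem_combinations()
    print 'All assertions passed.'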