def get(dataset_name):
    """Load a named dataset, wrap it in MLProblems and package it for Theano.

    Parameters
    ----------
    dataset_name : str
        One of the dataset names supported by the current model (see the
        `datasets` list below).

    Returns
    -------
    dict
        {'input_size': int,
         'train'|'valid'|'test': {'data': theano shared variable,
                                  'length': number of examples}}

    Raises
    ------
    ValueError
        If `dataset_name` is unknown, or the MLPYTHON_DATASET_REPO
        environment variable is not set.
    """
    import importlib

    # List of datasets that works with the current model
    datasets = ['adult',
                'binarized_mnist',
                'connect4',
                'dna',
                'mushrooms',
                'nips',
                'ocr_letters',
                'rcv1',
                'rcv2_russ',
                'web']

    # Setup dataset env
    if dataset_name not in datasets:
        raise ValueError('Dataset unknown: ' + dataset_name)

    # BUG FIX: the dynamic import below was commented out, leaving
    # `mldataset` undefined when used a few lines later.
    # importlib.import_module is the supported equivalent of the old
    # __import__('mlpython.datasets.' + name, ..., -1) call.
    mldataset = importlib.import_module('mlpython.datasets.' + dataset_name)

    # Fail early with a clear message instead of a TypeError inside
    # os.path.join when the environment variable is missing.
    repo = os.getenv("MLPYTHON_DATASET_REPO")
    if repo is None:
        raise ValueError('MLPYTHON_DATASET_REPO environment variable is not set')
    datadir = os.path.join(repo, dataset_name)

    # Verify if dataset exist and if not, download it
    if not os.path.exists(datadir):
        dataset_store.download(dataset_name)

    print('### Loading dataset [{0}] ...'.format(dataset_name))
    start_time = t.time()

    all_data = mldataset.load(datadir, load_to_memory=True)
    train_data, train_metadata = all_data['train']
    # binarized_mnist and nips are used as-is; the other datasets go through
    # SubsetFieldsProblem (presumably to keep only the input field — confirm
    # against the mlpython dataset definitions).
    if dataset_name == 'binarized_mnist' or dataset_name == 'nips':
        trainset = mlpb.MLProblem(train_data, train_metadata)
    else:
        trainset = mlpb.SubsetFieldsProblem(train_data, train_metadata)
    trainset.setup()

    # The train-set pipeline is replayed on the other splits via apply_on.
    valid_data, valid_metadata = all_data['valid']
    validset = trainset.apply_on(valid_data, valid_metadata)
    test_data, test_metadata = all_data['test']
    testset = trainset.apply_on(test_data, test_metadata)

    # Cleaning up, packaging and theanized
    full_dataset = {'input_size': trainset.metadata['input_size']}
    trainset_theano = theano.shared(value=Dataset._clean(trainset), borrow=True)
    validset_theano = theano.shared(value=Dataset._clean(validset), borrow=True)
    testset_theano = theano.shared(value=Dataset._clean(testset), borrow=True)
    full_dataset['train'] = {'data': trainset_theano, 'length': all_data['train'][1]['length']}
    full_dataset['valid'] = {'data': validset_theano, 'length': all_data['valid'][1]['length']}
    full_dataset['test'] = {'data': testset_theano, 'length': all_data['test'][1]['length']}

    print("(Dim:{0} Train:{1} Valid:{2} Test:{3})".format(trainset.metadata['input_size'],
                                                          full_dataset['train']['length'],
                                                          full_dataset['valid']['length'],
                                                          full_dataset['test']['length']))
    print(get_done_text(start_time), "###")
    return full_dataset
def load_data(dataset_name):
    """Load the raw train/valid/test input arrays for `dataset_name`.

    Expects a module-level `root` path; data is read from
    <root>/data/<dataset_name>/.

    Returns
    -------
    (train_X, valid_X, test_X)
        The in-memory data arrays extracted from the MLProblem wrappers.
    """
    import importlib

    # The original code assembled Python source strings and ran them with
    # `exec` just to perform a dynamic import and a path concatenation.
    # A plain importlib call and string concat do the same job without
    # executing generated code.
    mldataset = importlib.import_module('mlpython.datasets.' + dataset_name)
    datadir = root + '/data/' + dataset_name + '/'

    all_data = mldataset.load(datadir, load_to_memory=True)
    train_data, train_metadata = all_data['train']
    # binarized_mnist and nips are used as-is; the other datasets go through
    # SubsetFieldsProblem (presumably to keep only the input field — confirm
    # against the mlpython dataset definitions).
    if dataset_name == 'binarized_mnist' or dataset_name == 'nips':
        trainset = mlpb.MLProblem(train_data, train_metadata)
    else:
        trainset = mlpb.SubsetFieldsProblem(train_data, train_metadata)
    trainset.setup()

    # Replay the train-set pipeline on the other splits.
    valid_data, valid_metadata = all_data['valid']
    validset = trainset.apply_on(valid_data, valid_metadata)
    test_data, test_metadata = all_data['test']
    testset = trainset.apply_on(test_data, test_metadata)

    # Pull the raw input arrays out of the MLProblem wrappers.
    train_X = trainset.data.mem_data[0]
    valid_X = validset.data.mem_data[0]
    test_X = testset.data.mem_data[0]
    return train_X, valid_X, test_X
# Demo script: fetch the OCR letters classification problem and train an RBM
# on it for a few epochs, then display the learned filters.
import os
import itertools
import numpy as np
import fcntl
import copy
from string import Template

import mlpython.datasets.store as dataset_store
import mlpython.mlproblems.generic as mlpb
from rbm import RBM
#from autoencoder import Autoencoder

print "Loading dataset..."
trainset, validset, testset = dataset_store.get_classification_problem('ocr_letters')

print "Train RBM for 10 iterations... (this might take a few minutes)"
rbm = RBM(n_epochs = 10,
          hidden_size = 200,
          lr = 0.01,
          CDk = 1,
          seed=1234 )
# NOTE(review): SubsetFieldsProblem presumably strips the class labels so the
# RBM trains unsupervised on the inputs only — confirm against its defaults.
rbm.train(mlpb.SubsetFieldsProblem(trainset))
rbm.show_filters()
# NOTE(review): this chunk continues a script whose earlier part (pb1-pb4 and
# the mlpbgen/mlpbclass imports) is not visible here.

# Map target labels to class ids.
pb5 = mlpbclass.ClassificationProblem(pb4)
print 'pb5:'
for example in pb5:
    print example
print 'metadata:', pb5.metadata
print ''

# Keep only the examples whose class is 'A' or 'C'.
pb6 = mlpbclass.ClassSubsetProblem(pb5, subset=set(['A', 'C']))
print 'pb6 (final):'
for example in pb6:
    print example
print 'metadata:', pb6.metadata
print ''

# Duplicate field 0 and append field 1: examples become (input, input, target).
pb7 = mlpbgen.SubsetFieldsProblem(pb6, fields=[0, 0, 1])
print 'pb7 (final):'
for example in pb7:
    print example
print 'metadata:', pb7.metadata
print ''

print 'What is expected:'
# NOTE(review): the three zipped lists have lengths 7, 7 and 6, so zip
# silently truncates to 6 tuples — confirm the label list is not missing
# an entry.
final_data = zip([(1, 1), (3, 3), (0, 0), (1, 1), (3, 3), (4, 4), (5, 5)],
                 [(1, 1), (3, 3), (0, 0), (1, 1), (3, 3), (4, 4), (5, 5)],
                 [0, 1, 0, 0, 1, 0])
for example in final_data:
    print example
print ''

# Fresh raw data, presumably fed to apply_on further down (not visible here).
raw_data2 = zip(range(6, 10), ['C', 'B', 'A', 'C'])
def test_mlproblem_combinations(): """ Test a combination of many different MLProblems. """ raw_data = zip(range(6), ['A', 'A', 'B', 'C', 'A', 'B']) metadata = {'length': 6, 'targets': ['A', 'B', 'C'], 'input_size': 1} def features(example, metadata): metadata['input_size'] = 2 return ((example[0], example[0]), example[1]) pb1 = mlpbgen.MLProblem(raw_data, metadata) print 'pb1', pb1.metadata pb2 = mlpbgen.SubsetProblem(pb1, subset=set([1, 3, 5])) print 'pb2', pb2.metadata pb3 = mlpbgen.MergedProblem([pb2, pb1]) print 'pb3', pb3.metadata pb4 = mlpbgen.PreprocessedProblem(pb3, preprocess=features) print 'pb4', pb4.metadata pb5 = mlpbclass.ClassificationProblem(pb4) print 'pb5', pb5.metadata pb6 = mlpbclass.ClassSubsetProblem(pb5, subset=set(['A', 'C'])) print 'pb6', pb6.metadata pb7 = mlpbgen.SubsetFieldsProblem(pb6, fields=[0, 0, 1]) print 'pb7', pb7.metadata final_data = [[(1, 1), (1, 1), 0], [(3, 3), (3, 3), 1], [(0, 0), (0, 0), 0], [(1, 1), (1, 1), 0], [(3, 3), (3, 3), 1], [(4, 4), (4, 4), 0]] final_metadata = { 'input_size': 2, 'targets': set(['A', 'C']), 'class_to_id': { 'A': 0, 'C': 1 } } for ex1, ex2 in zip(pb7, final_data): assert cmp(ex1, ex2) == 0 print pb7.metadata, final_metadata assert cmp(pb7.metadata, final_metadata) == 0 raw_data2 = zip(range(6, 10), ['C', 'B', 'A', 'C']) metadata2 = {'length': 4, 'targets': ['A', 'B', 'C'], 'input_size': 1} pbtest = pb7.apply_on(raw_data2, metadata2) final_test_data = [[(6, 6), (6, 6), 1], [(8, 8), (8, 8), 0], [(9, 9), (9, 9), 1]] final_test_metadata = { 'input_size': 2, 'targets': set(['A', 'C']), 'class_to_id': { 'A': 0, 'C': 1 } } for ex1, ex2 in zip(pbtest, final_test_data): assert cmp(ex1, ex2) == 0 assert cmp(pbtest.metadata, final_test_metadata) == 0