def load(dir_path, load_to_memory=False):
    """
    Loads the Rectangles dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 784
    dir_path = os.path.expanduser(dir_path)
    targets = set(range(2))

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), int(float(tokens[-1])))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'rectangles_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [1000, 200, 50000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
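# Usage sketch (illustrative, not part of the original module). It assumes `load` above is
# in scope and that the rectangles_*.amat files live under the hypothetical path below.
def _example_rectangles_usage(dir_path='~/data/rectangles'):
    datasets = load(dir_path, load_to_memory=True)
    train_data, train_meta = datasets['train']
    # Metadata exposes the input dimensionality, the set of class labels and the
    # number of examples in each split.
    assert train_meta['input_size'] == 784
    assert train_meta['targets'] == set([0, 1])
    for example in train_data:
        # example is expected to unpack as (input_vector, target): a length-784
        # float array and an int in {0, 1}.
        break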
def load(dir_path,load_to_memory=False):
    """
    Loads the CAData (California housing prices) dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``

    """

    input_size = 8
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        return mlio.libsvm_load_line(line, float, float, sparse=False, input_size=input_size)

    train_file,valid_file,test_file = [os.path.join(dir_path, 'cadata_' + ds + '.libsvm') for ds in ['train','valid','test']]

    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [16512, 2064, 2064]
    if load_to_memory:
        # Targets are real-valued house prices, so keep them as floats.
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,)],[np.float64,np.float64],l) for d,l in zip([train,valid,test],lengths)]

    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size, 'length':l} for l in lengths]

    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
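# Usage sketch (illustrative, not part of the original module). It assumes `load` above is
# in scope and that the cadata_*.libsvm files live under the hypothetical path below.
def _example_cadata_usage(dir_path='~/data/cadata'):
    datasets = load(dir_path, load_to_memory=True)
    valid_data, valid_meta = datasets['valid']
    # This is a regression dataset, so the metadata defines no 'targets' set.
    assert valid_meta == {'input_size': 8, 'length': 2064}
    for example in valid_data:
        # example is expected to unpack as (features, price): a length-8 float
        # array and a real-valued median house price.
        break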
def load(dir_path,load_to_memory=False,fold=1):
    """
    Loads the LETOR 4.0 MQ2007 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    This dataset comes with 5 predefined folds, which can be specified with
    option ``fold`` (default = 1).

    **Defined metadata:**

    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'n_pairs'``
    * ``'length'``

    """

    input_size = 46
    dir_path = os.path.expanduser(dir_path)
    sparse = False

    if fold not in [1,2,3,4,5]:
        raise ValueError('There are 5 predefined folds. Option fold should be an integer between 1 and 5')

    def convert(feature,value):
        if feature != 'qid':
            raise ValueError('Unexpected feature')
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line,convert,int,sparse,input_size)

    n_queries = [
        [1017, 339, 336],
        [1017, 336, 339],
        [1014, 339, 339],
        [1014, 339, 339],
        [1014, 339, 339]
    ]

    lengths = [
        [42158, 13813, 13652],
        [41958, 13652, 14013],
        [41320, 14013, 14290],
        [41478, 14290, 13855],
        [41955, 13855, 13813]
    ]

    # Get data file paths
    train_file,valid_file,test_file = [os.path.join(dir_path, 'MQ2007/Fold' + str(fold) + '/' + ds + '.txt') for ds in ['train','vali','test']]

    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,),(1,)],[np.float64,int,int],l) for d,l in zip([train,valid,test],lengths[fold-1])]

    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size, 'scores':range(3), 'n_queries':nq, 'length':l, 'n_pairs':l} for nq,l in zip(n_queries[fold-1],lengths[fold-1])]

    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
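# Usage sketch (illustrative, not part of the original module). It assumes `load` above is
# in scope and that the MQ2007/Fold* directories live under the hypothetical path below.
# The per-example layout is inferred from the MemoryDataset specification above.
def _example_mq2007_usage(dir_path='~/data/letor', fold=2):
    datasets = load(dir_path, load_to_memory=True, fold=fold)
    train_data, train_meta = datasets['train']
    # The metadata is fold-specific; for fold 2 the training split covers
    # 1017 queries and 41958 query-document pairs.
    assert (train_meta['n_queries'], train_meta['n_pairs']) == (1017, 41958)
    for example in train_data:
        # example is expected to unpack as (features, relevance, query_id): a
        # 46-dimensional float array, an int in {0, 1, 2} and an int query id.
        break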
def load(dir_path, load_to_memory=False):
    """
    Loads the DNA dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 180
    dir_path = os.path.expanduser(dir_path)
    targets = set([0, 1, 2])
    target_mapping = {'1': 0, '2': 1, '3': 2}

    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        return mlio.libsvm_load_line(line, convert_target=convert_target, sparse=False, input_size=input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'dna_scale_' + ds + '.libsvm')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [1400, 600, 1186]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
def load(dir_path, load_to_memory=False):
    """
    Loads the RCV1 dataset. This is actually a smaller version of it, with
    150 inputs and binary targets.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 150
    dir_path = os.path.expanduser(dir_path)
    targets = set([0, 1])
    target_mapping = {'0': 0, '1': 1}

    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        tokens = line.split()
        return (np.array([int(i) for i in tokens[:-1]]), int(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'rcv1_all_subset.binary_' + ds + '_voc_150.amat')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [40000, 10000, 150000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
def load(dir_path, load_to_memory=False):
    """
    Loads the Housing dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``

    """

    input_size = 13
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        return mlio.libsvm_load_line(line, float, float, sparse=False, input_size=input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'housing_' + ds + '.libsvm')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [404, 51, 51]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads the OCR letters dataset.

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.

    Defined metadata:
    - 'input_size'
    - 'targets'
    - 'length'

    References:
      Tractable Multivariate Binary Density Estimation and the Restricted Boltzmann Forest
      Larochelle, Bengio and Turian
      link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

      OCR dataset (web page)
      link: http://www.seas.upenn.edu/~taskar/ocr/
    """

    input_size = 128
    targets = set(range(26))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), int(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'ocr_letters_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [32152, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [dtype, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
def load(dir_path, load_to_memory=False):
    """
    Loads the 20-newsgroups dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs have been put in binary format, and the vocabulary has been
    restricted to 5000 words.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 5000
    targets = set(range(20))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), int(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, '20newsgroups_' + ds + '_binary_5000_voc.txt')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [9578, 1691, 7505]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
def load(dir_path, load_to_memory=False):
    """
    Loads the MNIST dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs have been normalized between 0 and 1.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 784
    targets = set(range(10))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), int(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'mnist_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [50000, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads the NIPS 0-12 dataset.

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.

    Defined metadata:
    - 'input_size'
    - 'length'

    References:
      Tractable Multivariate Binary Density Estimation and the Restricted Boltzmann Forest
      Larochelle, Bengio and Turian
      link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

      LIBSVM Data: Classification, Regression, and Multi-label (web page)
      link: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
    """

    input_size = 500
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        # The last element is bogus (don't ask...)
        return np.array([int(i) for i in tokens[:-1]])

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'nips-0-12_all_shuffled_bidon_target_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [400, 100, 1240]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,)], [dtype], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
def load(dir_path, load_to_memory=False):
    """
    Loads the occluded MNIST dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs and targets have been converted to a binary format.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """

    input_size = 784
    target_size = 784
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([int(i) for i in tokens[:input_size]]),
                np.array([int(i) for i in tokens[input_size:]]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'occluded_mnist_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [50000, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (target_size,)], [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'target_size': target_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads a binarized version of MNIST.

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.

    Defined metadata:
    - 'input_size'
    - 'length'

    Reference:
      On the Quantitative Analysis of Deep Belief Networks
      Salakhutdinov and Murray
      link: http://www.mit.edu/~rsalakhu/papers/dbn_ais.pdf

      The MNIST database of handwritten digits (web page)
      Yann LeCun and Corinna Cortes
      link: http://yann.lecun.com/exdb/mnist/
    """

    input_size = 784
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return np.array([int(i) for i in tokens])

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'binarized_mnist_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [50000, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,)], [dtype], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
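# Usage sketch (illustrative, not part of the original module). It assumes `load` above is
# in scope and that the binarized_mnist_*.amat files live under the hypothetical path
# below. This loader defines no targets: each example carries only a binary input vector,
# which is why the MemoryDataset specification above has a single field.
def _example_binarized_mnist_usage(dir_path='~/data/binarized_mnist'):
    datasets = load(dir_path, load_to_memory=True)
    train_data, train_meta = datasets['train']
    assert train_meta == {'input_size': 784, 'length': 50000}
    for example in train_data:
        # example is expected to be a length-784 array of 0/1 values.
        break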
def load(dir_path, load_to_memory=False):
    """
    Loads the SARCOS inverse dynamics dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """

    input_size = 21
    target_size = 7
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:input_size]]),
                np.array([float(i) for i in tokens[input_size:]]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'sarcos_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [40036, 4448, 4449]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (target_size,)], [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'target_size': target_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
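# Usage sketch (illustrative, not part of the original module). It assumes `load` above is
# in scope and that the sarcos_*.txt files live under the hypothetical path below. Unlike
# the classification loaders, the target here is itself a vector of 7 joint torques.
def _example_sarcos_usage(dir_path='~/data/sarcos'):
    datasets = load(dir_path, load_to_memory=True)
    test_data, test_meta = datasets['test']
    assert test_meta == {'input_size': 21, 'target_size': 7, 'length': 4449}
    for example in test_data:
        # example is expected to unpack as (inputs, torques): a length-21 float
        # array and a length-7 float array.
        break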
def load(dir_path, load_to_memory=False):
    """
    Loads the NIPS 0-12 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``

    """

    input_size = 500
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        # The last element is bogus (don't ask...)
        return np.array([int(i) for i in tokens[:-1]])

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'nips-0-12_all_shuffled_bidon_target_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [400, 100, 1240]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,)], [np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
def load(dir_path,load_to_memory=False,dtype=np.float64):
    """
    Loads the DNA dataset.

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.

    Defined metadata:
    - 'input_size'
    - 'targets'
    - 'length'

    References:
      Tractable Multivariate Binary Density Estimation and the Restricted Boltzmann Forest
      Larochelle, Bengio and Turian
      link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

      LIBSVM Data: Classification, Regression, and Multi-label (web page)
      link: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
    """

    input_size = 180
    dir_path = os.path.expanduser(dir_path)
    targets = set([0,1,2])
    target_mapping = {'1':0,'2':1,'3':2}

    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        return mlio.libsvm_load_line(line,convert_target=convert_target,sparse=False,input_size=input_size)

    train_file,valid_file,test_file = [os.path.join(dir_path, 'dna_scale_' + ds + '.libsvm') for ds in ['train','valid','test']]

    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [1400,600,1186]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,)],[dtype,int],l) for d,l in zip([train,valid,test],lengths)]

    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size, 'length':l,'targets':targets} for l in lengths]

    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
def load(dir_path,load_to_memory=False):
    """
    Loads the MajMin dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """

    input_size = 389
    target_size = 96
    dir_path = os.path.expanduser(dir_path)

    def convert_target(target_str):
        targets = np.zeros(target_size)
        if target_str != '':
            for label in target_str.split(','):
                targets[int(label)] = 1
        return targets

    def load_line(line):
        return mlio.libsvm_load_line(line,convert_target=convert_target,sparse=False,input_size=input_size)

    train_file,valid_file,test_file = [os.path.join(dir_path, 'majmin_' + ds + '.libsvm') for ds in ['train','valid','test']]

    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [1587,471,480]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(target_size,)],[np.float64,bool],l) for d,l in zip([train,valid,test],lengths)]

    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,'target_size':target_size, 'length':l} for l in lengths]

    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
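# Usage sketch (illustrative, not part of the original module). It assumes `load` above is
# in scope and that the majmin_*.libsvm files live under the hypothetical path below. This
# is a multi-label dataset: convert_target above turns the comma-separated label list into
# a length-96 binary indicator vector.
def _example_majmin_usage(dir_path='~/data/majmin'):
    datasets = load(dir_path, load_to_memory=True)
    train_data, train_meta = datasets['train']
    assert train_meta == {'input_size': 389, 'target_size': 96, 'length': 1587}
    for example in train_data:
        # example is expected to unpack as (features, labels), where labels[i] == 1
        # marks label i as active for this example.
        break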
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads the RCV1 dataset. This is actually a smaller version of it, with
    150 inputs and binary targets.

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.

    Defined metadata:
    - 'input_size'
    - 'targets'
    - 'length'

    References:
      Tractable Multivariate Binary Density Estimation and the Restricted Boltzmann Forest
      Larochelle, Bengio and Turian
      link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

      LIBSVM Data: Classification, Regression, and Multi-label (web page)
      link: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
    """

    input_size = 150
    dir_path = os.path.expanduser(dir_path)
    targets = set([0, 1])
    target_mapping = {'0': 0, '1': 1}

    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        tokens = line.split()
        return (np.array([int(i) for i in tokens[:-1]]), int(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'rcv1_all_subset.binary_' + ds + '_voc_150.amat')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [40000, 10000, 150000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [dtype, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
def load(dir_path, load_to_memory=False, home_made_valid_split=False):
    """
    Loads the Yahoo! Learning to Rank Challenge, Set 2 data.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    Option ``home_made_valid_split`` determines whether the original training set
    should be further split into a "home made" train/valid split (default: False).
    If True, the dictionary mapping will contain 4 keys instead of 3:
    ``'train'`` (home made training set), ``'valid'`` (home made validation set),
    ``'test'`` (original validation set) and ``'test2'`` (original test set).

    **Defined metadata:**

    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'n_pairs'``
    * ``'length'``

    """

    input_size = 700
    dir_path = os.path.expanduser(dir_path)
    sparse = False

    def convert(feature, value):
        if feature != 'qid':
            raise ValueError('Unexpected feature')
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line, convert, int, sparse, input_size)

    if home_made_valid_split:
        n_queries = [1000, 266, 1266, 3798]
        lengths = [27244, 7571, 34881, 103174]

        # Get data file paths
        train_file, valid_file, test_file, test2_file = [
            os.path.join(dir_path, 'set2.' + ds + '.txt')
            for ds in ['in_house_train', 'in_house_valid', 'valid', 'test']
        ]

        # Get data
        train, valid, test, test2 = [
            mlio.load_from_file(f, load_line)
            for f in [train_file, valid_file, test_file, test2_file]
        ]

        if load_to_memory:
            train, valid, test, test2 = [
                mlio.MemoryDataset(d, [(input_size,), (1,), (1,)], [np.float64, int, int], l)
                for d, l in zip([train, valid, test, test2], lengths)
            ]

        # Get metadata
        train_meta, valid_meta, test_meta, test2_meta = [{
            'input_size': input_size,
            'scores': range(5),
            'n_queries': nq,
            'length': l,
            'n_pairs': l
        } for nq, l in zip(n_queries, lengths)]

        return {
            'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta),
            'test2': (test2, test2_meta)
        }

    else:
        n_queries = [1266, 1266, 3798]
        lengths = [34815, 34881, 103174]

        # Get data file paths
        train_file, valid_file, test_file = [
            os.path.join(dir_path, 'set2.' + ds + '.txt')
            for ds in ['train', 'valid', 'test']
        ]

        # Get data
        train, valid, test = [
            mlio.load_from_file(f, load_line)
            for f in [train_file, valid_file, test_file]
        ]

        if load_to_memory:
            train, valid, test = [
                mlio.MemoryDataset(d, [(input_size,), (1,), (1,)], [np.float64, int, int], l)
                for d, l in zip([train, valid, test], lengths)
            ]

        # Get metadata
        train_meta, valid_meta, test_meta = [{
            'input_size': input_size,
            'scores': range(5),
            'n_queries': nq,
            'length': l,
            'n_pairs': l
        } for nq, l in zip(n_queries, lengths)]

        return {
            'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)
        }
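# Usage sketch (illustrative, not part of the original module). It assumes `load` above is
# in scope and that the set2.*.txt files, including the in-house splits, live under the
# hypothetical path below. It shows how home_made_valid_split changes the returned keys.
def _example_yahoo_set2_usage(dir_path='~/data/yahoo_ltrc'):
    datasets = load(dir_path, home_made_valid_split=True)
    # With the home-made split, the original validation and test sets are exposed as
    # 'test' and 'test2', while 'train' and 'valid' come from splitting the original
    # training set.
    assert sorted(datasets.keys()) == ['test', 'test2', 'train', 'valid']
    train_data, train_meta = datasets['train']
    assert (train_meta['n_queries'], train_meta['n_pairs']) == (1000, 27244)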