Example #1
def load(dir_path, load_to_memory=False):
    """
    Loads the Rectangles dataset. 

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 784
    dir_path = os.path.expanduser(dir_path)
    targets = set(range(2))

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i)
                          for i in tokens[:-1]]), int(float(tokens[-1])))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'rectangles_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [1000, 200, 50000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [np.float64, int],
                               l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
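A minimal usage sketch of the dictionary these loaders return, assuming the function above lives in a module importable as ``rectangles`` and that the ``.amat`` files sit under ``~/data/rectangles`` (both names are hypothetical); only the metadata fields constructed in the code above are accessed:

# Hypothetical module and path names; the returned structure follows the docstring above.
import rectangles

datasets = rectangles.load('~/data/rectangles', load_to_memory=True)
train_data, train_meta = datasets['train']     # (data, metadata) pair
print(train_meta['input_size'])                # 784
print(train_meta['length'])                    # 1000 training examples
print(sorted(train_meta['targets']))           # [0, 1]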
Example #2
def load(dir_path,load_to_memory=False):
    """
    Loads the CAData (California housing prices) dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``

    """
    
    input_size = 8
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        return mlio.libsvm_load_line(line, float, float, sparse=False, input_size=input_size)

    train_file,valid_file,test_file = [os.path.join(dir_path, 'cadata_' + ds + '.libsvm') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [16512, 2064, 2064]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,)],[np.float64,int],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size, 'length':l} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #3
def load(dir_path,load_to_memory=False,fold=1):
    """
    Loads the LETOR 4.0 MQ2007 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    This dataset comes with 5 predefined folds, which can be specified
    with option ``fold`` (default = 1). 
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'length'``

    """
    
    input_size=46
    dir_path = os.path.expanduser(dir_path)
    sparse=False

    if fold not in [1,2,3,4,5]:
        raise ValueError('There are 5 predefined folds. Option fold should be an integer between 1 and 5')

    def convert(feature,value):
        if feature != 'qid':
            raise ValueError('Unexpected feature')
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line,convert,int,sparse,input_size)

    n_queries = [ [ 1017, 339, 336 ],
                  [ 1017, 336, 339 ],
                  [ 1014, 339, 339 ],
                  [ 1014, 339, 339 ],
                  [ 1014, 339, 339 ] ]

    lengths = [ [42158, 13813, 13652],
                [41958, 13652, 14013],
                [41320, 14013, 14290],
                [41478, 14290, 13855],
                [41955, 13855, 13813] ]
    
    # Get data file paths
    train_file,valid_file,test_file = [os.path.join(dir_path, 'MQ2007/Fold' + str(fold) + '/' + ds + '.txt') for ds in ['train','vali','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,),(1,)],[np.float64,int,int],l) for d,l in zip([train,valid,test],lengths[fold-1])]
        
    train_meta,valid_meta,test_meta = [{'input_size':input_size,
                                        'scores':range(3),
                                        'n_queries':nq,
                                        'length':l,
                                        'n_pairs':l} for nq,l in zip(n_queries[fold-1],lengths[fold-1])]

    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
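A short sketch of the ``fold`` option, assuming the loader above is importable as ``mq2007`` and the data lives under ``~/data/letor`` (both hypothetical); each fold selects one of the five predefined train/valid/test splits, and the per-fold query counts and lengths end up in the metadata:

# Hypothetical module and path names; fold must be an integer in 1..5.
import mq2007

fold3 = mq2007.load('~/data/letor', load_to_memory=False, fold=3)
_, train_meta = fold3['train']
print(train_meta['n_queries'], train_meta['length'])   # 1014 41320 (third row of the tables above)

mq2007.load('~/data/letor', fold=7)   # raises ValueError: only folds 1-5 are predefined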
Example #4
File: dna.py  Project: yuyunli2/projects
def load(dir_path, load_to_memory=False):
    """
    Loads the DNA dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 180
    dir_path = os.path.expanduser(dir_path)
    targets = set([0, 1, 2])
    target_mapping = {'1': 0, '2': 1, '3': 2}

    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        return mlio.libsvm_load_line(line,
                                     convert_target=convert_target,
                                     sparse=False,
                                     input_size=input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'dna_scale_' + ds + '.libsvm')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [1400, 600, 1186]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [np.float64, int],
                               l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #5
File: rcv1.py  Project: yuyunli2/projects
def load(dir_path, load_to_memory=False):
    """
    Loads the RCV1 dataset. This is actually a smaller version of it, with 150 inputs
    and binary targets.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 150
    dir_path = os.path.expanduser(dir_path)
    targets = set([0, 1])
    target_mapping = {'0': 0, '1': 1}

    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        tokens = line.split()
        return (np.array([int(i) for i in tokens[:-1]]), int(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path,
                     'rcv1_all_subset.binary_' + ds + '_voc_150.amat')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [40000, 10000, 150000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [np.float64, int],
                               l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #6
def load(dir_path, load_to_memory=False):
    """
    Loads the Housing dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``

    """

    input_size = 13
    #targets = set(range(2))
    #targets = set([0,1])
    #target_mapping = {'-1':0,'+1':1}
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        return mlio.libsvm_load_line(line,
                                     float,
                                     float,
                                     sparse=False,
                                     input_size=input_size)
        #return mlio.libsvm_load_line(line,convert_target=convert_target,sparse=False,input_size=input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'housing_' + ds + '.libsvm')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [404, 51, 51]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )],
                               [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    #train_meta,valid_meta,test_meta = [{'input_size':input_size, 'length':l,'targets':targets} for l in lengths]
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #7
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads the OCR letters dataset.

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.
    
    Defined metadata: 
    - 'input_size'
    - 'targets'
    - 'length'

    References: Tractable Multivariate Binary Density Estimation and the Restricted Boltzmann Forest
                Larochelle, Bengio and Turian
                link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

                OCR dataset (web page)
                link: http://www.seas.upenn.edu/~taskar/ocr/
    """

    input_size = 128
    targets = set(range(26))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), int(tokens[-1]))
        #return mlio.libsvm_load_line(line,float,int,sparse,input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'ocr_letters_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [32152, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [dtype, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
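The ``dtype`` argument only matters when ``load_to_memory=True``: it sets the storage type of the input vectors in the resulting ``MemoryDataset``. A tiny sketch, assuming the loader is importable as ``ocr_letters`` (hypothetical name):

# Hypothetical module and path names; dtype controls the in-memory input precision.
import numpy as np
import ocr_letters

d64 = ocr_letters.load('~/data/ocr', load_to_memory=True)                     # float64 inputs (default)
d32 = ocr_letters.load('~/data/ocr', load_to_memory=True, dtype=np.float32)   # smaller memory footprint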
Example #8
def load(dir_path, load_to_memory=False):
    """
    Loads the 20-newsgroups dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs have been put in binary format, and the vocabulary has been
    restricted to 5000 words.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 5000
    targets = set(range(20))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), int(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, '20newsgroups_' + ds + '_binary_5000_voc.txt')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [9578, 1691, 7505]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [np.float64, int],
                               l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #9
def load(dir_path, load_to_memory=False):
    """
    Loads the MNIST dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs have been normalized between 0 and 1.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 784
    targets = set(range(10))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), int(tokens[-1]))
        #return mlio.libsvm_load_line(line,float,int,sparse,input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'mnist_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [50000, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [np.float64, int],
                               l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #10
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads the NIPS 0-12 dataset.

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.
    
    Defined metadata: 
    - 'input_size'
    - 'length'

    References: Tractable Multivariate Binary Density Estimation and the Restricted Boltzmann Forest
                Larochelle, Bengio and Turian
                link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

                LIBSVM Data: Classification, Regression, and Multi-label (web page)
                link: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
    """

    input_size = 500
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return np.array([int(i) for i in tokens[:-1]
                         ])  #The last element is bogus (don't ask...)

    train_file, valid_file, test_file = [
        os.path.join(dir_path,
                     'nips-0-12_all_shuffled_bidon_target_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [400, 100, 1240]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, )], [dtype], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #11
def load(dir_path, load_to_memory=False):
    """
    Loads the occluded MNIST dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs and targets have been converted to a binary format.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """

    input_size = 784
    target_size = 784
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([int(i) for i in tokens[:input_size]]),
                np.array([int(i) for i in tokens[input_size:]]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'occluded_mnist_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [50000, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (target_size, )],
                               [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'target_size': target_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #12
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads a binarized version of MNIST. 

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.
    
    Defined metadata: 
    - 'input_size'
    - 'length'

    Reference: On the Quantitative Analysis of Deep Belief Networks
               Salakhutdinov and Murray
               link: http://www.mit.edu/~rsalakhu/papers/dbn_ais.pdf

               The MNIST database of handwritten digits (web page)
               Yann LeCun and Corinna Cortes
               link: http://yann.lecun.com/exdb/mnist/
    """

    input_size = 784
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return np.array([int(i) for i in tokens])

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'binarized_mnist_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [50000, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, )], [dtype], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #13
def load(dir_path, load_to_memory=False):
    """
    Loads the SARCOS inverse dynamics dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """

    input_size = 21
    target_size = 7
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:input_size]]),
                np.array([float(i) for i in tokens[input_size:]]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'sarcos_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [40036, 4448, 4449]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (target_size, )],
                               [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'target_size': target_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #14
def load(dir_path, load_to_memory=False):
    """
    Loads the NIPS 0-12 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``

    """

    input_size = 500
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return np.array([int(i) for i in tokens[:-1]
                         ])  #The last element is bogus (don't ask...)

    train_file, valid_file, test_file = [
        os.path.join(dir_path,
                     'nips-0-12_all_shuffled_bidon_target_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [400, 100, 1240]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, )], [np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #15
def load(dir_path,load_to_memory=False,dtype=np.float64):
    """
    Loads the DNA dataset.

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.
    
    Defined metadata: 
    - 'input_size'
    - 'targets'
    - 'length'

    References: Tractable Multivariate Binary Density Estimation and the Restricted Boltzmann Forest
                Larochelle, Bengio and Turian
                link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

                LIBSVM Data: Classification, Regression, and Multi-label (web page)
                link: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
    """
    
    input_size=180
    dir_path = os.path.expanduser(dir_path)
    targets = set([0,1,2])
    target_mapping = {'1':0,'2':1,'3':2}
    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        return mlio.libsvm_load_line(line,convert_target=convert_target,sparse=False,input_size=input_size)

    train_file,valid_file,test_file = [os.path.join(dir_path, 'dna_scale_' + ds + '.libsvm') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [1400,600,1186]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,)],[dtype,int],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,
                              'length':l,'targets':targets} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #16
def load(dir_path,load_to_memory=False):
    """
    Loads the MajMin dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """
    
    input_size=389
    target_size=96
    dir_path = os.path.expanduser(dir_path)
    
    def convert_target(target_str):
        targets = np.zeros((target_size))
        if target_str != '':
            for l in target_str.split(','):
                id = int(l)
                targets[id] = 1
        return targets

    def load_line(line):
        return mlio.libsvm_load_line(line,convert_target=convert_target,sparse=False,input_size=input_size)

    train_file,valid_file,test_file = [os.path.join(dir_path, 'majmin_' + ds + '.libsvm') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [1587,471,480]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(target_size,)],[np.float64,bool],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,'target_size':target_size,
                                        'length':l} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
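Unlike the single-label loaders above, ``convert_target`` here turns a comma-separated string of label indices into a 96-dimensional binary indicator vector. The same logic, reproduced outside the loader as a standalone illustration:

import numpy as np

target_size = 96

def convert_target(target_str):
    # Empty string means no active labels; otherwise set one indicator per index.
    targets = np.zeros(target_size)
    if target_str != '':
        for l in target_str.split(','):
            targets[int(l)] = 1
    return targets

vec = convert_target('3,17,42')
print(vec.sum())               # 3.0
print(np.flatnonzero(vec))     # [ 3 17 42]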
Example #17
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads the RCV1 dataset. This is actually a smaller version of it, with 150 inputs
    and binary targets.

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.
    
    Defined metadata: 
    - 'input_size'
    - 'targets'
    - 'length'

    References: Tractable Multivariate Binary Density Estimation and the Restricted Boltzmann Forest
                Larochelle, Bengio and Turian
                link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

                LIBSVM Data: Classification, Regression, and Multi-label (web page)
                link: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
    """

    input_size = 150
    dir_path = os.path.expanduser(dir_path)
    targets = set([0, 1])
    target_mapping = {'0': 0, '1': 1}

    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        tokens = line.split()
        return (np.array([int(i) for i in tokens[:-1]]), int(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path,
                     'rcv1_all_subset.binary_' + ds + '_voc_150.amat')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [40000, 10000, 150000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [dtype, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #18
def load(dir_path, load_to_memory=False, home_made_valid_split=False):
    """
    Loads the Yahoo! Learning to Rank Challenge, Set 2 data.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    Option ``home_made_valid_split`` determines whether the original
    training set should be further split into a "home made"
    train/valid split (default: False). If True, the dictionary mapping
    will contain 4 keys instead of 3: ``'train'`` (home made training set), 
    ``'valid'`` (home made validation set), ``'test'`` (original validation set)
    and ``'test2'`` (original test set).

    **Defined metadata:**

    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'n_pairs'``
    * ``'length'``

    """

    input_size = 700
    dir_path = os.path.expanduser(dir_path)
    sparse = False

    def convert(feature, value):
        if feature != 'qid':
            raise ValueError('Unexpected feature')
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line, convert, int, sparse, input_size)

    if home_made_valid_split:
        n_queries = [1000, 266, 1266, 3798]
        lengths = [27244, 7571, 34881, 103174]

        train_file, valid_file, test_file, test2_file = [
            os.path.join(dir_path, 'set2.' + ds + '.txt')
            for ds in ['in_house_train', 'in_house_valid', 'valid', 'test']
        ]
        # Get data
        train, valid, test, test2 = [
            mlio.load_from_file(f, load_line)
            for f in [train_file, valid_file, test_file, test2_file]
        ]

        if load_to_memory:
            train, valid, test, test2 = [
                mlio.MemoryDataset(d, [(input_size, ), (1, ), (1, )],
                                   [np.float64, int, int], l)
                for d, l in zip([train, valid, test, test2], lengths)
            ]

        # Get metadata
        train_meta, valid_meta, test_meta, test2_meta = [{
            'input_size': input_size,
            'scores': range(5),
            'n_queries': nq,
            'length': l,
            'n_pairs': l
        } for nq, l in zip(n_queries, lengths)]

        return {
            'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta),
            'test2': (test2, test2_meta)
        }
    else:
        n_queries = [1266, 1266, 3798]
        lengths = [34815, 34881, 103174]

        # Get data file paths
        train_file, valid_file, test_file = [
            os.path.join(dir_path, 'set2.' + ds + '.txt')
            for ds in ['train', 'valid', 'test']
        ]
        # Get data
        train, valid, test = [
            mlio.load_from_file(f, load_line)
            for f in [train_file, valid_file, test_file]
        ]
        if load_to_memory:
            train, valid, test = [
                mlio.MemoryDataset(d, [(input_size, ), (1, ), (1, )],
                                   [np.float64, int, int], l)
                for d, l in zip([train, valid, test], lengths)
            ]

        train_meta, valid_meta, test_meta = [{
            'input_size': input_size,
            'scores': range(5),
            'n_queries': nq,
            'length': l,
            'n_pairs': l
        } for nq, l in zip(n_queries, lengths)]

        return {
            'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)
        }
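A sketch of how ``home_made_valid_split`` changes the returned mapping, assuming the loader is importable as ``yahoo_ltrc2`` and the data sits under ``~/data/yahoo_set2`` (both hypothetical): with the flag off you get the original train/valid/test splits; with it on, the original training set is re-split and the original validation and test sets become ``'test'`` and ``'test2'``:

# Hypothetical module and path names.
import yahoo_ltrc2

official = yahoo_ltrc2.load('~/data/yahoo_set2')
print(sorted(official.keys()))                 # ['test', 'train', 'valid']

in_house = yahoo_ltrc2.load('~/data/yahoo_set2', home_made_valid_split=True)
print(sorted(in_house.keys()))                 # ['test', 'test2', 'train', 'valid']
print(in_house['train'][1]['length'])          # 27244 (home made training set)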