Example #1
def main(params_, **kwargs):
    print 'Params: ', params_
    dataset = skdata.mnist.dataset.MNIST()
    dataset.fetch(True)
    convex_inputs = skdata.mnist.dataset.read(
        gfile(os.path.join(dataset.home(), "train-images-idx3-ubyte.gz")))
    convex_labels = skdata.mnist.dataset.read(
        gfile(os.path.join(dataset.home(), "train-labels-idx1-ubyte.gz")))
    convex_test = skdata.mnist.dataset.read(
        gfile(os.path.join(dataset.home(), "t10k-images-idx3-ubyte.gz")))
    convex_test_labels = skdata.mnist.dataset.read(
        gfile(os.path.join(dataset.home(), "t10k-labels-idx1-ubyte.gz")))
    convex_inputs = numpy.array(convex_inputs.reshape((-1, 784)),
                                dtype=numpy.float32)
    convex_test = numpy.array(convex_test.reshape((-1, 784)),
                              dtype=numpy.float32)

    fold = kwargs['fold']
    folds = kwargs['folds']
    downsize = kwargs['downsize']
    if folds == 1:
        train = convex_inputs[:int(50000 * downsize)]
        valid = convex_inputs[50000:60000]
        test = convex_test
        train_targets = convex_labels[:int(50000 * downsize)]
        valid_targets = convex_labels[50000:60000]
        test_targets = convex_test_labels
    elif folds > 1:
        cv_data = numpy.copy(convex_inputs[:60000])  # all 60,000 training points
        train, valid = data_util.prepare_cv_for_fold(cv_data, fold, folds)
        cv_labels = numpy.copy(convex_labels[:60000])
        train_targets, valid_targets = data_util.prepare_cv_for_fold(
            cv_labels, fold, folds)
        test = convex_test
        test_targets = convex_test_labels
    else:
        raise ValueError("Folds cannot be less than 1")

    # Divide by 256 (rather than the usual 255); this matches the data
    # obtained by Jasper Snoek.
    train /= 256
    valid /= 256

    kwargs['train'] = train
    kwargs['train_targets'] = train_targets
    kwargs['valid'] = valid
    kwargs['valid_targets'] = valid_targets
    y = logistic(params_, **kwargs)
    print 'Result: ', y
    return y
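
The MNIST example above delegates the cross-validation split to data_util.prepare_cv_for_fold, whose source is not shown here. As a rough illustration only, here is a minimal sketch of such a helper, assuming it treats fold as 0-based and holds out one contiguous chunk per fold (a hypothetical implementation, not the one in data_util):

import numpy

def prepare_cv_for_fold(data, fold, folds):
    # Hypothetical sketch: chunk number `fold` (0-based) becomes the
    # validation set; the remaining chunks form the training set.
    chunk = len(data) // folds
    start, end = fold * chunk, (fold + 1) * chunk
    valid = data[start:end]
    train = numpy.concatenate((data[:start], data[end:]))
    return train, valid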
Example #2
def gsave(p, filename):
    """
    Same as ``save(p, filename)``, but saves into a gzipped file.
    """
    f = gfile(filename, 'wb')
    cPickle.dump(p, f, cPickle.HIGHEST_PROTOCOL)
    f.close()
Example #3
def gload(filename):
    """
    Same as ``load(filename)``, but loads from a gzipped file.
    """
    f = gfile(filename, 'rb')
    y = cPickle.load(f)
    f.close()
    return y
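
Here gfile is assumed to be gzip.GzipFile, which is how the unit test in Example #10 below imports it. A quick round-trip check that uses the two helpers above (the payload is hypothetical):

from gzip import GzipFile as gfile
import cPickle

params = {'learning_rate': 0.1, 'epochs': 20}  # hypothetical payload
gsave(params, 'params.pkl.gz')                 # writes a gzip-compressed pickle
assert gload('params.pkl.gz') == params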
Example #4
def obtain(dir_path):
    """
    Downloads the dataset to ``dir_path``.
    """

    dir_path = os.path.expanduser(dir_path)
    print 'Downloading the dataset'
    import urllib
    urllib.urlretrieve('http://ai.stanford.edu/~btaskar/ocr/letter.data.gz',
                       os.path.join(dir_path, 'letter.data.gz'))

    print 'Splitting dataset into training/validation/test sets'
    data_file = gfile(os.path.join(dir_path, 'letter.data.gz'))
    train_file, valid_file, test_file = [
        open(os.path.join(dir_path, 'ocr_letters_' + ds + '.txt'), 'w')
        for ds in ['train', 'valid', 'test']
    ]
    letters = 'abcdefghijklmnopqrstuvwxyz'

    all_data = []
    s = ''
    # Putting all data in memory
    for line in data_file:
        tokens = line.strip('\n').strip('\t').split('\t')
        s += ' '.join(tokens[6:])
        target = letters.find(tokens[1])
        if target < 0:
            print 'Target ' + tokens[1] + ' not found!'
        s = s + ' ' + str(target)

        if int(tokens[2]) == -1:  # new word starts next
            s = s + '\n'
            all_data += [s]
            s = ''
        else:
            s = s + ' '

    # Shuffle data
    import random
    random.seed(12345)
    perm = range(len(all_data))
    random.shuffle(perm)
    line_id = 0
    train_valid_split = 5502
    valid_test_split = 5502 + 688
    for i in perm:
        s = all_data[i]
        if line_id < train_valid_split:
            train_file.write(s)
        elif line_id < valid_test_split:
            valid_file.write(s)
        else:
            test_file.write(s)
        line_id += 1
    train_file.close()
    valid_file.close()
    test_file.close()
    print 'Done'
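
The files written above store one word per line: each character contributes its pixel values followed by its 0-25 target index, all separated by spaces. A minimal sketch of a reader for that layout, assuming 16x8 = 128 pixels per character (the pixel count is an assumption, not taken from the code above):

def load_ocr_words(path, pixels_per_char=128):
    # Each line: [p1 ... p128 target] repeated once per character of a word.
    words = []
    for line in open(path):
        fields = line.split()
        chars = []
        for i in range(0, len(fields), pixels_per_char + 1):
            chunk = fields[i:i + pixels_per_char + 1]
            chars.append(([int(v) for v in chunk[:-1]], int(chunk[-1])))
        words.append(chars)
    return words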
Example #5
def obtain(dir_path):
    """
    Downloads the dataset to ``dir_path``.
    """

    dir_path = os.path.expanduser(dir_path)
    print 'Downloading the dataset'
    import urllib
    urllib.urlretrieve(
        'http://info.usherbrooke.ca/hlarochelle/public/letter.data.gz',
        os.path.join(dir_path, 'letter.data.gz'))

    print 'Splitting dataset into training/validation/test sets'
    data_file = gfile(os.path.join(dir_path, 'letter.data.gz'))
    train_file, valid_file, test_file = [
        open(os.path.join(dir_path, 'ocr_letters_' + ds + '.txt'), 'w')
        for ds in ['train', 'valid', 'test']
    ]
    letters = 'abcdefghijklmnopqrstuvwxyz'
    all_data = []
    # Putting all data in memory
    for line in data_file:
        tokens = line.strip('\n').strip('\t').split('\t')
        s = ''
        for t in range(6, len(tokens)):
            s = s + tokens[t] + ' '
        target = letters.find(tokens[1])
        if target < 0:
            print 'Target ' + tokens[1] + ' not found!'
        s = s + str(target) + '\n'
        all_data += [s]

    # Shuffle data
    import random
    random.seed(12345)
    perm = range(len(all_data))
    random.shuffle(perm)
    line_id = 0
    train_valid_split = 32152
    valid_test_split = 42152
    for i in perm:
        s = all_data[i]
        if line_id < train_valid_split:
            train_file.write(s)
        elif line_id < valid_test_split:
            valid_file.write(s)
        else:
            test_file.write(s)
        line_id += 1
    train_file.close()
    valid_file.close()
    test_file.close()
    print 'Done'
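
Unlike Example #4, this variant writes one character per line (pixel values followed by the target index), so reading it back is a single pass. A minimal sketch of a matching reader:

def load_ocr_letters(path):
    # Each line: whitespace-separated pixel values, then the 0-25 target.
    inputs, targets = [], []
    for line in open(path):
        fields = line.split()
        inputs.append([int(v) for v in fields[:-1]])
        targets.append(int(fields[-1]))
    return inputs, targets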
Example #6
def obtain(dir_path):
    """
    Downloads the dataset to ``dir_path``.
    """

    dir_path = os.path.expanduser(dir_path)
    print 'Downloading the dataset'
    import urllib
    urllib.urlretrieve('http://ai.stanford.edu/~btaskar/ocr/letter.data.gz',
                       os.path.join(dir_path, 'letter.data.gz'))

    print 'Splitting dataset into training/validation/test sets'
    data_file = gfile(os.path.join(dir_path, 'letter.data.gz'))
    train_file, valid_file, test_file = [
        open(os.path.join(dir_path, 'ocr_letters_' + ds + '.txt'), 'w')
        for ds in ['train', 'valid', 'test']
    ]
    letters = 'abcdefghijklmnopqrstuvwxyz'
    all_data = []
    # Putting all data in memory
    for line in data_file:
        tokens = line.strip('\n').strip('\t').split('\t')
        s = ''
        for t in range(6, len(tokens)):
            s = s + tokens[t] + ' '
        target = letters.find(tokens[1])
        if target < 0:
            print 'Target ' + tokens[1] + ' not found!'
        s = s + str(target) + '\n'
        all_data += [s]

    # Shuffle data
    import random
    random.seed(12345)
    perm = range(len(all_data))
    random.shuffle(perm)
    line_id = 0
    train_valid_split = 32152
    valid_test_split = 42152
    for i in perm:
        s = all_data[i]
        if line_id < train_valid_split:
            train_file.write(s)
        elif line_id < valid_test_split:
            valid_file.write(s)
        else:
            test_file.write(s)
        line_id += 1
    train_file.close()
    valid_file.close()
    test_file.close()
    print 'Done'
Example #7
def load_file(filename, file_format, use_percentage):
    if not os.path.exists(filename):
        raise IOError("File %s not found" % filename)

    if file_format == "gfile":
        logger.info("Loading file: %s", filename)
        fh = gfile(filename, "rb")
        data = pickle.load(fh)
        if use_percentage < 100.:
            # Keep only the first use_percentage percent of the datapoints.
            max_data = int(len(data) / 100. * use_percentage)
            data = data[:max_data]
        fh.close()
        logger.info("Done loading file: %s has %d datapoints", filename,
                    len(data))

    elif file_format == "pickle":
        logger.info("Loading file: %s", filename)
        fh = open(filename, "rb")
        data = pickle.load(fh)
        if use_percentage < 100.:
            max_data = int(len(data) / 100. * use_percentage)
            data = data[:max_data]
        fh.close()
        logger.info("Done loading file: %s has %d datapoints", filename,
                    len(data))

    elif file_format == "numpy":
        logger.info("Loading file: %s", filename)
        fh = open(filename, "rb")
        data = np.load(fh)
        if use_percentage < 100.:
            max_data = int(len(data) / 100. * use_percentage)
            data = data[:max_data]
        fh.close()
        logger.info("Done loading file: %s has %d datapoints", filename,
                    len(data))

    else:
        raise ValueError("%s is an unknown training_data_format" % file_format)

    return data
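
A quick usage sketch for the loader above (file name hypothetical), mirroring the gzipped-pickle round trip that the unit test in Example #10 performs:

from gzip import GzipFile as gfile
import pickle

fh = gfile('train_data.pkl.gz', 'wb')
pickle.dump(list(range(1000)), fh)
fh.close()
data = load_file('train_data.pkl.gz', 'gfile', 10)  # keep the first 10%
assert len(data) == 100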
Example #8
def load_file(filename, file_format, use_percentage):
    if not os.path.exists(filename):
        raise IOError("File %s not found" % filename)

    if file_format == "gfile":
        print "Loading file:", filename
        fh = gfile(filename, "rb")
        data = cPickle.load(fh)
        if use_percentage < 100.:
            # Keep only the first use_percentage percent of the datapoints.
            max_data = int(len(data) / 100. * use_percentage)
            data = data[:max_data]
        fh.close()
        print "Done loading file:", filename, "has", len(data), "datapoints"
        sys.stdout.flush()

    elif file_format == "pickle":
        print "Loading file:", filename
        fh = open(filename, "rb")
        data = cPickle.load(fh)
        if use_percentage < 100.:
            max_data = int(len(data) / 100. * use_percentage)
            data = data[:max_data]
        fh.close()
        print "Done loading file:", filename, "has", len(data), "datapoints"

    elif file_format == "numpy":
        print "Loading file:", filename
        fh = open(filename, "rb")
        data = np.load(fh)
        if use_percentage < 100.:
            max_data = int(len(data) / 100. * use_percentage)
            data = data[:max_data]
        fh.close()
        print "Done loading file:", filename, "has", len(data), "datapoints"

    else:
        raise ValueError("%s is an unknown training_data_format" % file_format)

    return data
Example #9
def load_file(filename, file_format, use_percentage):
    if not os.path.exists(filename):
        raise IOError("File %s not found" % filename)

    if file_format == "gfile":
        logger.info("Loading file: %s", filename)
        fh = gfile(filename, "rb")
        data = cPickle.load(fh)
        if use_percentage < 100.0:
            # Keep only the first use_percentage percent of the datapoints.
            max_data = int(len(data) / 100.0 * use_percentage)
            data = data[:max_data]
        fh.close()
        logger.info("Done loading file: %s has %d datapoints", filename, len(data))

    elif file_format == "pickle":
        logger.info("Loading file: %s", filename)
        fh = open(filename, "rb")
        data = cPickle.load(fh)
        if use_percentage < 100.0:
            max_data = int(len(data) / 100.0 * use_percentage)
            data = data[:max_data]
        fh.close()
        logger.info("Done loading file: %s has %d datapoints", filename, len(data))

    elif file_format == "numpy":
        logger.info("Loading file: %s", filename)
        fh = open(filename, "rb")
        data = np.load(fh)
        if use_percentage < 100.0:
            max_data = int(len(data) / 100.0 * use_percentage)
            data = data[:max_data]
        fh.close()
        logger.info("Done loading file: %s has %d datapoints", filename, len(data))

    else:
        raise ValueError("%s is an unknown training_data_format" % file_format)

    return data
Example #10
    def test_load_file(self):
        # Test numpy arrays
        train_data = np.zeros((100, 100))
        np.save("train_data.npy", train_data)
        data = data_util.load_file("train_data.npy", "numpy", 100)
        self.assertTrue((train_data == data).all())
        data = data_util.load_file("train_data.npy", "numpy", 10)
        self.assertTrue((train_data[:10] == data).all())
        os.remove("train_data.npy")

        # Test pickle files
        train_data = np.zeros((100, 100))
        fh = open("train_data.pkl", "wb")
        cPickle.dump(train_data, fh)
        fh.close()
        data = data_util.load_file("train_data.pkl", "pickle", 100)
        self.assertTrue((train_data == data).all())
        data = data_util.load_file("train_data.pkl", "pickle", 10)
        self.assertTrue((train_data[:10] == data).all())
        os.remove("train_data.pkl")

        # Test zipped pickle files
        train_data = np.zeros((100, 100))
        from gzip import GzipFile as gfile
        fh = gfile("train_data.pkl.gz", "w")
        cPickle.dump(train_data, fh)
        fh.close()
        data = data_util.load_file("train_data.pkl.gz", "gfile", 100)
        self.assertTrue((train_data == data).all())
        data = data_util.load_file("train_data.pkl.gz", "gfile", 10)
        self.assertTrue((train_data[:10] == data).all())
        os.remove("train_data.pkl.gz")

        # Nonexistent files raise IOError (existence is checked before format)
        self.assertRaises(IOError, data_util.load_file,
                          "test.sh", "uditare", 1)
        self.assertRaises(IOError, data_util.load_file,
                          "", "uditare", 1)
Example #11
def gsave(p, filename):
    f = gfile(filename, 'wb')
    cPickle.dump(p, f, cPickle.HIGHEST_PROTOCOL)
    f.close()
Example #12
def gload(filename):
    f = gfile(filename, 'rb')
    y = cPickle.load(f)
    f.close()
    return y