Example #1
    def prepare(self):
        '''Set num_samples, num_features, num_classes, num_minibatches, etc.

        Reads one data file to determine num_features and the per-file sample count.
        This is a bit under-optimized: we end up loading the first set of files twice -
        once here and once at the start of pretraining.
        '''
        with open(self.disk_loader.datafiles[0], 'rb') as fobj:
            # The first record gives the feature count; counting the remaining records
            # (plus the one already consumed) gives the per-file sample count.
            self.num_features = len(next(nn_util.load_pickle_file(fobj)))
            self.num_samples = 1 + sum(1 for x in nn_util.load_pickle_file(fobj))
        # All data files are assumed to hold the same number of samples.
        num_files = len(self.disk_loader.datafiles)
        self.num_samples *= num_files
        with open(self.label_file) as fobj:
            labels = [int(x.strip()) for x in fobj]
        self.num_classes = len(labels)
        # Map each label value to its index in the sorted label order.
        self._label2idx = {ll: ii for ii, ll in enumerate(sorted(labels))}
        logging.info('num_samples={}, num_features={}, num_classes={}'.format(
            self.num_samples, self.num_features, self.num_classes))

        # Sanity check: the label files must contain one label per data row.
        with open(self.disk_loader.labelfiles[0]) as fobj:
            num_y_samples = sum(1 for x in fobj)
        num_y_samples *= len(self.disk_loader.labelfiles)
        assert self.num_samples == num_y_samples, "Num rows in X and Y don't match"

        self.num_minibatches = self.num_samples // self.batch_size
        self.num_minibatches_in_mega = self.num_samples // self.batch_size // self.num_mega_batches
        logging.info('num_minibatches={}, num_minibatches_in_mega={}'.format(
            self.num_minibatches, self.num_minibatches_in_mega))
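
prepare() iterates over nn_util.load_pickle_file(fobj) as if it were a generator that yields one sample per pickled record. That helper is not shown on this page; the following is a minimal sketch, assuming the data files simply contain back-to-back pickle records, of what such a loader could look like (load_pickle_file here is a hypothetical stand-in, not the original implementation).

import pickle

def load_pickle_file(fobj):
    '''Yield successive records from an open binary file of concatenated pickles.

    Hypothetical stand-in for nn_util.load_pickle_file: assumed to yield one
    feature vector per pickle record until the end of the file is reached.
    '''
    while True:
        try:
            yield pickle.load(fobj)
        except EOFError:
            return
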
Example #2
    def load_shared_from_disk(self, mega_batch_index, load_y):
        '''Load one mega batch from disk and copy it into the Theano shared variables.'''
        datafile = self.disk_loader.datafiles[mega_batch_index]
        with open(datafile, 'rb') as fobj:
            xx = np.array(list(nn_util.load_pickle_file(fobj)), dtype=theano.config.floatX)
        if self.global_logging_level <= logging.DEBUG:
            logging.debug('Loaded {}'.format(datafile))

        self.th_train_set.x.set_value(xx)

        if load_y:
            labelfile = self.disk_loader.labelfiles[mega_batch_index]
            with open(labelfile, 'r') as ff:
                # Labels are remapped to [0, num_classes) via _label2idx and stored
                # as floatX in the shared variable yf.
                yy = np.array([self._label2idx[int(line.strip())] for line in ff], dtype=theano.config.floatX)
            if self.global_logging_level <= logging.DEBUG:
                logging.debug('Loaded {}'.format(labelfile))

            self.th_train_set.yf.set_value(yy)
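
load_shared_from_disk() copies each mega batch into th_train_set.x and th_train_set.yf, which are presumably Theano shared variables sized for one mega batch. Their construction is not part of this example; the snippet below is a sketch, under that assumption, of how such a container could be set up, using the common Theano idiom of storing labels as floatX and casting them to int32 inside the graph. The name SharedTrainSet and the constructor arguments are illustrative, not from the original code.

import numpy as np
import theano
import theano.tensor as T

class SharedTrainSet(object):
    '''Hypothetical container for one GPU-resident mega batch.'''

    def __init__(self, num_rows, num_features):
        # Features and labels live in floatX shared variables so they can be
        # stored on the GPU; set_value() later overwrites them in place.
        self.x = theano.shared(
            np.zeros((num_rows, num_features), dtype=theano.config.floatX),
            borrow=True)
        self.yf = theano.shared(
            np.zeros((num_rows,), dtype=theano.config.floatX),
            borrow=True)
        # Integer view of the labels for use in cost/error expressions.
        self.y = T.cast(self.yf, 'int32')
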
Example #3
    def load_data_mm(self):
        '''Load all of the input data into main memory.

        Set up Theano shared variables so that data can be moved from main memory
        to GPU memory in mega batches.
        '''
        logging.info('Loading file {}'.format(self.dataset_file))
        with open(self.dataset_file, 'rb') as fobj:
            self.mm_train_set.x = np.array(list(nn_util.load_pickle_file(fobj)), dtype=theano.config.floatX)

        logging.info('Loading file {}'.format(self.label_file))
        with open(self.label_file, 'r') as ff:
            self.mm_train_set.y = np.array([int(line.strip()) for line in ff], dtype=theano.config.floatX)

        # Theano requires that the labels be in the range [0, L), where L is the
        # number of unique labels.
        unique_labels = set(self.mm_train_set.y)
        if sorted(unique_labels) == list(range(len(unique_labels))):
            logging.info('keeping the labels as is')
            label2idx = {ii: ii for ii in unique_labels}
        else:
            logging.info('translating labels to range(N)')
            label2idx = {ll: ii for ii, ll in enumerate(sorted(unique_labels))}
        self.mm_train_set.y = np.array([label2idx[ll] for ll in self.mm_train_set.y], dtype=theano.config.floatX)

        self.num_samples = self.mm_train_set.x.shape[0]
        self.num_features = self.mm_train_set.x.shape[1]
        self.num_classes = len(set(self.mm_train_set.y))
        logging.info('num_samples={}, num_features={}, num_classes={}'.format(
            self.num_samples, self.num_features, self.num_classes))

        assert self.num_samples == self.mm_train_set.y.shape[0], "Num rows in X and Y don't match"

        assert self.num_samples % self.batch_size == 0, 'num_samples not an int multiple of batch_size'

        self.num_minibatches = self.num_samples // self.batch_size
        self.num_minibatches_in_mega = self.num_samples // self.batch_size // self.num_mega_batches
        logging.info('num_minibatches={}, num_minibatches_in_mega={}'.format(
            self.num_minibatches, self.num_minibatches_in_mega))

        msg = 'num_samples must be an integer multiple of num_mega_batches'
        assert self.num_samples % self.num_mega_batches == 0, msg
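
The label handling in load_data_mm() amounts to remapping arbitrary label values onto the contiguous range [0, L) that Theano expects. The standalone function below isolates that step so it can be exercised on its own; it is a sketch of the same logic (the name remap_labels is illustrative), not part of the original class.

import numpy as np

def remap_labels(y):
    '''Map arbitrary label values onto the contiguous range [0, L).

    Mirrors the remapping step in load_data_mm(): if the sorted unique labels
    already form 0..L-1 they are kept as is, otherwise each label is replaced
    by its rank among the sorted unique values.
    '''
    unique_labels = sorted(set(y))
    if unique_labels == list(range(len(unique_labels))):
        return np.asarray(y)
    label2idx = {ll: ii for ii, ll in enumerate(unique_labels)}
    return np.array([label2idx[ll] for ll in y])

# Example: remap_labels([10, 20, 10, 30]) -> array([0, 1, 0, 2])
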