def prepare(self):
    '''Set num_samples, num_features, num_classes, and minibatch counts.

    Reads one data file to determine the feature count and per-file
    sample count. This is a bit underoptimized: we end up loading the
    first set of files twice, once here and once at the start of
    pretraining.
    '''
    with open(self.disk_loader.datafiles[0], 'rb') as fobj:
        rows = nn_util.load_pickle_file(fobj)
        # The first row gives the feature count; the rows remaining in
        # the stream (plus the one already consumed) give the per-file
        # sample count.
        self.num_features = len(next(rows))
        samples_per_file = 1 + sum(1 for x in rows)
    num_files = len(self.disk_loader.datafiles)
    self.num_samples = samples_per_file * num_files
    # label_file lists the distinct class labels, one per line.
    with open(self.label_file) as fobj:
        labels = [int(x.strip()) for x in fobj]
    self.num_classes = len(labels)
    self._label2idx = dict((ll, ii) for ii, ll in enumerate(sorted(labels)))
    logging.info('num_samples={}, num_features={}, num_classes={}'.format(
        self.num_samples, self.num_features, self.num_classes))
    with open(self.disk_loader.labelfiles[0]) as fobj:
        num_y_samples = sum(1 for x in fobj)
    num_y_samples *= len(self.disk_loader.labelfiles)
    assert self.num_samples == num_y_samples, "Num rows in X and Y don't match"
    self.num_minibatches = self.num_samples // self.batch_size
    self.num_minibatches_in_mega = (
        self.num_samples // self.batch_size // self.num_mega_batches)
    logging.info('num_minibatches={}, num_minibatches_in_mega={}'.format(
        self.num_minibatches, self.num_minibatches_in_mega))
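
# prepare() relies on nn_util.load_pickle_file yielding one row per pickled
# record. That helper is not shown in this section; the sketch below is only
# an illustration of the contract it would need to satisfy (a stream of
# pickle records read until end of file), not the project's implementation.
#
# import pickle
#
# def load_pickle_file(fobj):
#     '''Yield one pickled record at a time until end of file.'''
#     while True:
#         try:
#             yield pickle.load(fobj)
#         except EOFError:
#             return
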
def load_shared_from_disk(self, mega_batch_index, load_y):
    '''Load one mega batch from disk into the Theano shared variables.'''
    datafile = self.disk_loader.datafiles[mega_batch_index]
    with open(datafile, 'rb') as fobj:
        xx = np.array([x for x in nn_util.load_pickle_file(fobj)],
                      dtype=theano.config.floatX)
    if self.global_logging_level <= logging.DEBUG:
        logging.debug('Loaded {}'.format(datafile))
    self.th_train_set.x.set_value(xx)
    if load_y:
        labelfile = self.disk_loader.labelfiles[mega_batch_index]
        with open(labelfile, 'r') as ff:
            yy = np.array([self._label2idx[int(line.strip())] for line in ff],
                          dtype=theano.config.floatX)
        if self.global_logging_level <= logging.DEBUG:
            logging.debug('Loaded {}'.format(labelfile))
        self.th_train_set.yf.set_value(yy)
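
# load_shared_from_disk assumes th_train_set.x and th_train_set.yf already
# exist as Theano shared variables sized for one mega batch. A minimal sketch
# of how such a container could be built follows; the class name and the
# zero-initialized shapes are illustrative assumptions, not code from this
# project. Storing labels as floatX and casting to int32 at use time is the
# standard trick for keeping them GPU-resident in older Theano.
#
# import numpy as np
# import theano
# import theano.tensor as T
#
# class SharedTrainSet(object):
#     '''Hypothetical container for one GPU-resident mega batch.'''
#
#     def __init__(self, rows_per_mega_batch, num_features):
#         self.x = theano.shared(
#             np.zeros((rows_per_mega_batch, num_features),
#                      dtype=theano.config.floatX),
#             borrow=True)
#         # Labels live in a floatX shared variable (yf) so they can sit
#         # on the GPU; computations use the int32 view (y).
#         self.yf = theano.shared(
#             np.zeros((rows_per_mega_batch,), dtype=theano.config.floatX),
#             borrow=True)
#         self.y = T.cast(self.yf, 'int32')
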
def load_data_mm(self):
    '''Load all of the input data into main memory.

    Set up Theano shared variables so that data can be moved from main
    memory to GPU memory in mega batches.
    '''
    logging.info('Loading file {}'.format(self.dataset_file))
    with open(self.dataset_file, 'rb') as fobj:
        self.mm_train_set.x = np.array(
            [x for x in nn_util.load_pickle_file(fobj)],
            dtype=theano.config.floatX)
    logging.info('Loading file {}'.format(self.label_file))
    with open(self.label_file, 'r') as ff:
        self.mm_train_set.y = np.array([int(line.strip()) for line in ff],
                                       dtype=theano.config.floatX)
    # Theano requires that the labels be in the range [0, L), where L is
    # the number of unique labels.
    unique_labels = set(self.mm_train_set.y)
    if sorted(unique_labels) == list(range(len(unique_labels))):
        logging.info('keeping the labels as is')
        label2idx = dict((ii, ii) for ii in unique_labels)
    else:
        logging.info('translating labels to range(N)')
        label2idx = dict((ll, ii) for ii, ll in enumerate(sorted(unique_labels)))
    self.mm_train_set.y = np.array(
        [label2idx[ll] for ll in self.mm_train_set.y],
        dtype=theano.config.floatX)
    self.num_samples = self.mm_train_set.x.shape[0]
    self.num_features = self.mm_train_set.x.shape[1]
    self.num_classes = len(set(self.mm_train_set.y))
    logging.info('num_samples={}, num_features={}, num_classes={}'.format(
        self.num_samples, self.num_features, self.num_classes))
    assert self.num_samples == self.mm_train_set.y.shape[0], \
        "Num rows in X and Y don't match"
    assert self.num_samples % self.batch_size == 0, \
        'num_samples not an int multiple of batch_size'
    assert self.num_samples % self.num_mega_batches == 0, \
        'num_samples must be an integer multiple of num_mega_batches'
    self.num_minibatches = self.num_samples // self.batch_size
    self.num_minibatches_in_mega = (
        self.num_samples // self.batch_size // self.num_mega_batches)
    logging.info('num_minibatches={}, num_minibatches_in_mega={}'.format(
        self.num_minibatches, self.num_minibatches_in_mega))
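
# The counters set above imply a two-level indexing scheme: a global
# minibatch index picks out both the mega batch that must be resident in
# GPU memory and the row offset inside it. A small illustrative helper
# (not part of this class) makes the arithmetic concrete:
#
# def locate_minibatch(minibatch_index, num_minibatches_in_mega, batch_size):
#     '''Map a global minibatch index to (mega_batch_index, row offset).'''
#     mega_batch_index = minibatch_index // num_minibatches_in_mega
#     offset = (minibatch_index % num_minibatches_in_mega) * batch_size
#     return mega_batch_index, offset
#
# # Example: with batch_size=100 and 10 minibatches per mega batch,
# # global minibatch 23 lives in mega batch 2 at rows [300, 400).
# assert locate_minibatch(23, 10, 100) == (2, 300)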