def read_data(self, max_train_size, max_dev_size):
    """Load the training and development corpora and build batch iterators.

    Side effects:
      - ``self.batch_iterator``: read-ahead iterator over training batches.
      - ``self.dev_batches``: one list of batches per development corpus.

    :param max_train_size: cap on the number of training examples read
    :param max_dev_size: cap on the number of examples read per dev corpus
    """
    utils.debug('reading training data')
    training_data = utils.read_dataset(
        self.filenames.train, self.extensions, self.vocabs,
        max_size=max_train_size, binary_input=self.binary_input,
        character_level=self.character_level)
    # read_ahead=10: pre-fetch several batches at a time from the training set
    self.batch_iterator = utils.read_ahead_batch_iterator(
        training_data, self.batch_size, read_ahead=10)

    utils.debug('reading development data')
    self.dev_batches = []
    for dev_filenames in self.filenames.dev:
        dev_data = utils.read_dataset(
            dev_filenames, self.extensions, self.vocabs, max_size=max_dev_size,
            binary_input=self.binary_input, character_level=self.character_level)
        # subset of the dev set whose perplexity is periodically evaluated
        self.dev_batches.append(
            utils.get_batches(dev_data, batch_size=self.batch_size, batches=-1))
def read_data(self, max_train_size, max_dev_size, read_ahead=10, batch_mode='standard',
              shuffle=True, **kwargs):
    """Load the training and development corpora and build batch iterators.

    Side effects:
      - ``self.train_size``: number of training examples actually read.
      - ``self.batch_iterator``: read-ahead iterator over training batches.
      - ``self.dev_batches``: one list of batches per development corpus.

    :param max_train_size: cap on the number of training examples read
    :param max_dev_size: cap on the number of examples read per dev corpus
    :param read_ahead: how many batches the iterator pre-fetches at a time
    :param batch_mode: batching strategy forwarded to the iterator as ``mode``
    :param shuffle: whether the iterator shuffles the training data
    :param kwargs: extra parameters, accepted and ignored here
    """
    # keyword arguments shared by the train and dev calls to read_dataset
    common = dict(binary_input=self.binary_input, character_level=self.character_level)

    utils.debug('reading training data')
    training_data = utils.read_dataset(
        self.filenames.train, self.extensions, self.vocabs,
        max_size=max_train_size, max_seq_len=self.max_input_len, **common)
    self.train_size = len(training_data)
    self.batch_iterator = utils.read_ahead_batch_iterator(
        training_data, self.batch_size, read_ahead=read_ahead,
        mode=batch_mode, shuffle=shuffle)

    utils.debug('reading development data')
    self.dev_batches = []
    for dev_filenames in self.filenames.dev:
        dev_data = utils.read_dataset(
            dev_filenames, self.extensions, self.vocabs,
            max_size=max_dev_size, **common)
        # subset of the dev set whose perplexity is periodically evaluated
        self.dev_batches.append(
            utils.get_batches(dev_data, batch_size=self.batch_size))
def read_data(self, max_train_size, max_dev_size, read_ahead=10, batch_mode='standard',
              shuffle=True, crash_test=False, **kwargs):
    """Load the training and development corpora and build batch iterators.

    Side effects:
      - ``self.batch_iterator`` and ``self.train_size``: returned together by
        ``utils.get_batch_iterator`` for the training corpus.
      - ``self.dev_batches``: one list of batches per development corpus.

    :param max_train_size: cap on the number of training examples read
    :param max_dev_size: cap on the number of examples read per dev corpus
    :param read_ahead: how many batches the iterator pre-fetches at a time
    :param batch_mode: batching strategy forwarded to the iterator as ``mode``
    :param shuffle: whether the iterator shuffles the training data
    :param crash_test: flag forwarded to ``utils.get_batch_iterator``
    :param kwargs: extra parameters, accepted and ignored here
    """
    utils.debug('reading training data')
    iterator, size = utils.get_batch_iterator(
        self.filenames.train, self.extensions, self.vocabs, self.batch_size,
        max_size=max_train_size, character_level=self.character_level,
        max_seq_len=self.max_len, read_ahead=read_ahead, mode=batch_mode,
        shuffle=shuffle, binary=self.binary, crash_test=crash_test)
    self.batch_iterator = iterator
    self.train_size = size

    utils.debug('reading development data')
    self.dev_batches = []
    for dev_filenames in self.filenames.dev:
        # read_dataset returns a tuple; only its first element (the data) is kept
        dev_data = utils.read_dataset(
            dev_filenames, self.extensions, self.vocabs, max_size=max_dev_size,
            character_level=self.character_level, binary=self.binary)[0]
        # subset of the dev set whose loss is periodically evaluated
        self.dev_batches.append(
            utils.get_batches(dev_data, batch_size=self.batch_size))