Example #1
    def read_data(self, max_train_size, max_dev_size):
        utils.debug('reading training data')
        train_set = utils.read_dataset(self.filenames.train,
                                       self.extensions,
                                       self.vocabs,
                                       max_size=max_train_size,
                                       binary_input=self.binary_input,
                                       character_level=self.character_level)
        self.batch_iterator = utils.read_ahead_batch_iterator(train_set,
                                                              self.batch_size,
                                                              read_ahead=10)

        utils.debug('reading development data')
        dev_sets = [
            utils.read_dataset(dev,
                               self.extensions,
                               self.vocabs,
                               max_size=max_dev_size,
                               binary_input=self.binary_input,
                               character_level=self.character_level)
            for dev in self.filenames.dev
        ]
        # subset of the dev set whose perplexity is periodically evaluated
        self.dev_batches = [
            utils.get_batches(dev_set, batch_size=self.batch_size, batches=-1)
            for dev_set in dev_sets
        ]
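
All three variants start by calling utils.read_dataset. Below is a minimal sketch of what such a helper could look like, assuming the vocabularies are plain token-to-id dicts and the data lives in parallel text files named prefix.ext for each extension; the project's actual utility evidently also handles binary inputs (binary_input) and, judging by the [0] indexing in Example #2, returns a tuple rather than a bare list:

def read_dataset(prefix, extensions, vocabs, max_size=None, max_seq_len=None,
                 character_level=False, unk_id=0):
    """Illustrative stand-in: read parallel files and map tokens to ids."""
    files = [open('{}.{}'.format(prefix, ext)) for ext in extensions]
    dataset = []
    for lines in zip(*files):  # one parallel example per set of aligned lines
        example = []
        for line, vocab in zip(lines, vocabs):
            # character-level models split lines into characters instead of words
            tokens = list(line.strip()) if character_level else line.split()
            # unk_id is a placeholder for whatever unknown-token id the vocabulary uses
            example.append([vocab.get(token, unk_id) for token in tokens])
        # drop examples whose longest side exceeds the length limit
        if max_seq_len is not None and max(len(seq) for seq in example) > max_seq_len:
            continue
        dataset.append(example)
        if max_size is not None and len(dataset) == max_size:
            break
    for f in files:
        f.close()
    return dataset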
Example #2
    def read_data(self, max_train_size, max_dev_size, read_ahead=10, batch_mode='standard', shuffle=True,
                  crash_test=False, **kwargs):
        utils.debug('reading training data')
        self.batch_iterator, self.train_size = utils.get_batch_iterator(
            self.filenames.train, self.extensions, self.vocabs, self.batch_size,
            max_size=max_train_size, character_level=self.character_level, max_seq_len=self.max_len,
            read_ahead=read_ahead, mode=batch_mode, shuffle=shuffle, binary=self.binary, crash_test=crash_test
        )

        utils.debug('reading development data')

        dev_sets = [
            utils.read_dataset(dev, self.extensions, self.vocabs, max_size=max_dev_size,
                               character_level=self.character_level, binary=self.binary)[0]
            for dev in self.filenames.dev
        ]
        # subset of the dev set whose loss is periodically evaluated
        self.dev_batches = [utils.get_batches(dev_set, batch_size=self.batch_size) for dev_set in dev_sets]
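
Unlike the training side, the development data is not consumed through a lazy iterator: the batches are built once and reused at every periodic evaluation. A rough sketch of what utils.get_batches might amount to, assuming dataset is a plain list of examples (padding and the batches=-1 argument seen in Example #1 are left out):

def get_batches(dataset, batch_size):
    """Illustrative stand-in: cut the whole dev set into fixed-size batches."""
    return [dataset[i:i + batch_size]
            for i in range(0, len(dataset), batch_size)]

Materializing the dev batches up front keeps the recurring loss evaluation cheap and deterministic, since the dev files only need to be read once.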
Example #3
    def read_data(self,
                  max_train_size,
                  max_dev_size,
                  read_ahead=10,
                  batch_mode='standard',
                  shuffle=True,
                  **kwargs):
        utils.debug('reading training data')
        train_set = utils.read_dataset(self.filenames.train,
                                       self.extensions,
                                       self.vocabs,
                                       max_size=max_train_size,
                                       binary_input=self.binary_input,
                                       character_level=self.character_level,
                                       max_seq_len=self.max_input_len)
        self.train_size = len(train_set)
        self.batch_iterator = utils.read_ahead_batch_iterator(
            train_set,
            self.batch_size,
            read_ahead=read_ahead,
            mode=batch_mode,
            shuffle=shuffle)

        utils.debug('reading development data')
        dev_sets = [
            utils.read_dataset(dev,
                               self.extensions,
                               self.vocabs,
                               max_size=max_dev_size,
                               binary_input=self.binary_input,
                               character_level=self.character_level)
            for dev in self.filenames.dev
        ]
        # subset of the dev set whose perplexity is periodically evaluated
        self.dev_batches = [
            utils.get_batches(dev_set, batch_size=self.batch_size)
            for dev_set in dev_sets
        ]
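
The read_ahead and shuffle arguments control how the endless training iterator groups examples. A simplified sketch of a read-ahead batch iterator under those assumptions: shuffle the corpus, take read_ahead batches' worth of examples at a time, sort that window by source length to limit padding, and yield fixed-size batches. The mode=batch_mode switch used in Examples #2 and #3 is omitted here, and whether the real implementation sorts within the window at all is an assumption:

import random

def read_ahead_batch_iterator(dataset, batch_size, read_ahead=10, shuffle=True):
    """Illustrative stand-in: endless iterator over shuffled, length-sorted batches."""
    while True:  # cycle over the training set indefinitely
        order = list(range(len(dataset)))
        if shuffle:
            random.shuffle(order)
        window_size = batch_size * read_ahead
        for start in range(0, len(order), window_size):
            window = [dataset[i] for i in order[start:start + window_size]]
            # sort the window by source length so padding inside a batch stays small
            window.sort(key=lambda example: len(example[0]))
            for j in range(0, len(window), batch_size):
                yield window[j:j + batch_size]

Sorting only within a window of read_ahead batches, rather than over the whole corpus, keeps some randomness between epochs while still reducing wasted padding inside each batch.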