def make_batches(self, lines):
    """Encode ``lines`` and wrap them in a non-shuffled batch iterator.

    Every line is encoded with the task's source dictionary (without adding
    new symbols), the encoded lines are grouped by a ``TokenBlockDataset``
    using ``break_mode='eos'``, and the result is exposed through the task's
    batch iterator.

    Args:
        lines: iterable of inputs accepted by
            ``self.task.source_dictionary.encode_line`` (presumably raw
            text lines -- confirm against the caller).

    Returns:
        The iterator produced by ``next_epoch_itr(shuffle=False)``.
    """
    source_dict = self.task.source_dictionary
    vocab = self.task.dictionary

    # Encode without growing the dictionary; cast each result with .long().
    encoded = []
    for line in lines:
        encoded.append(source_dict.encode_line(line, add_if_not_exist=False).long())
    sizes = torch.LongTensor([item.numel() for item in encoded])

    block_ds = data.TokenBlockDataset(
        encoded,
        sizes,
        self.args.tokens_per_sample,
        pad=vocab.pad(),
        eos=vocab.eos(),
        break_mode='eos',
        include_targets=True,
    )

    # True unless the configured sample break mode is None or the string 'none'.
    break_mode = self.args.sample_break_mode
    add_eos_for_other_targets = not (break_mode is None or break_mode == 'none')

    mono_ds = data.MonolingualDataset(
        block_ds,
        block_ds.sizes,
        vocab,
        self.task.target_dictionary,
        add_eos_for_other_targets,
        shuffle=False,
        targets=self.task.targets,
    )

    # The position limit is resolved jointly across every model.
    position_limits = [model.max_positions() for model in self.models]
    batch_iterator = self.task.get_batch_iterator(
        dataset=mono_ds,
        max_tokens=self.args.max_tokens or 3000,
        max_sentences=self.args.max_sentences,
        max_positions=utils.resolve_max_positions(*position_limits),
        num_shards=self.args.num_shards,
        shard_id=self.args.shard_id,
        ignore_invalid_inputs=True,
        num_workers=self.args.num_workers,
    )
    return batch_iterator.next_epoch_itr(shuffle=False)
def score_sentence(self, line):
    """Score a single sentence and return its fluency score.

    The sentence is tokenized with the task dictionary, wrapped in a batch
    iterator, and scored with ``self.scorer``. The positional scores of the
    first hypothesis produced are passed to ``self._fluency_score``.

    Args:
        line: the sentence to score (passed to
            ``tokenizer.Tokenizer.tokenize``).

    Returns:
        The value of ``self._fluency_score`` for the first hypothesis, or
        ``0.0`` when scoring yields no hypothesis at all.
    """
    vocab = self.task.dictionary

    # A single tokenized sentence, so the dataset holds exactly one length.
    tokens = tokenizer.Tokenizer.tokenize(line, vocab, add_if_not_exist=False).long()
    lengths = np.array([tokens.numel()])
    break_mode = self.args.sample_break_mode
    block_ds = data.TokenBlockDataset(
        tokens,
        lengths,
        self.args.tokens_per_sample,
        pad=vocab.pad(),
        eos=vocab.eos(),
        break_mode=break_mode,
        include_targets=True,
    )

    # Wrap the dataset in a non-shuffled batch iterator.
    add_eos = not (break_mode is None or break_mode == 'none')
    mono_ds = data.MonolingualDataset(
        block_ds,
        block_ds.sizes,
        vocab,
        self.task.target_dictionary,
        add_eos_for_other_targets=add_eos,
        shuffle=False,
        targets=self.task.targets,
    )
    position_limits = [model.max_positions() for model in self.models]
    batch_iterator = self.task.get_batch_iterator(
        dataset=mono_ds,
        max_tokens=self.args.max_tokens or 3000,
        max_sentences=self.args.max_sentences,
        max_positions=utils.resolve_max_positions(*position_limits),
        num_shards=self.args.num_shards,
        shard_id=self.args.shard_id,
        ignore_invalid_inputs=True,
    ).next_epoch_itr(shuffle=False)

    # Only the first hypothesis is scored; the lookup stops as soon as one
    # is found and leaves the remaining results unconsumed.
    scored = self.scorer.score_batched_itr(batch_iterator, cuda=self.use_cuda)
    nothing = object()
    first_hypo = next(
        (hypo for _, _, _, hypos in scored for hypo in hypos),
        nothing,
    )
    if first_hypo is nothing:
        return 0.0

    # Drop infinite scores, which can show up with low-precision GPU
    # inference.
    neg_inf = float('-inf')
    pos_inf = float('inf')
    word_prob = [
        score
        for score in first_hypo['positional_scores']
        if score != neg_inf and score != pos_inf
    ]
    return self._fluency_score(word_prob)
def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
    """Build a mock trainer plus an epoch iterator yielding one sample per batch.

    Args:
        epoch: forwarded to ``mock_trainer``.
        epoch_size: number of tokens generated, and number of single-index
            batches handed to the iterator.
        num_updates: forwarded to ``mock_trainer``.
        iterations_in_epoch: forwarded to ``mock_trainer``.

    Returns:
        A ``(trainer, epoch_itr)`` tuple.
    """
    # Token ids 0..epoch_size-1 laid out as a single row.
    token_row = torch.LongTensor(list(range(epoch_size))).view(1, -1)
    block_ds = data.TokenBlockDataset(
        token_row,
        sizes=[token_row.size(-1)],
        block_size=1,
        pad=0,
        eos=1,
        include_targets=False,
    )

    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)

    pair_ds = data.LanguagePairDataset(
        block_ds, block_ds.sizes, mock_dict(), shuffle=False,
    )
    # Each batch holds exactly one dataset index.
    single_index_batches = [[idx] for idx in range(epoch_size)]
    epoch_itr = data.EpochBatchIterator(
        dataset=pair_ds,
        collate_fn=pair_ds.collater,
        batch_sampler=single_index_batches,
    )
    return trainer, epoch_itr
def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
    """Build a mock trainer plus an epoch iterator capped at ``max_tokens=1``.

    Args:
        epoch: forwarded to ``mock_trainer``.
        epoch_size: number of tokens generated for the underlying dataset.
        num_updates: forwarded to ``mock_trainer``.
        iterations_in_epoch: forwarded to ``mock_trainer``.

    Returns:
        A ``(trainer, epoch_itr)`` tuple.
    """
    # Token ids 0..epoch_size-1 as a flat tensor, grouped with block size 1.
    token_ids = torch.LongTensor(list(range(epoch_size)))
    block_ds = data.TokenBlockDataset(
        token_ids, [len(token_ids)], 1, include_targets=False,
    )

    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)

    pair_ds = data.LanguagePairDataset(
        block_ds, block_ds.sizes, mock_dict(), shuffle=False,
    )
    epoch_itr = data.EpochBatchIterator(dataset=pair_ds, max_tokens=1)
    return trainer, epoch_itr