def test_char_rnn_generate(self):
    test_args = test_utils.ModelParamsDict(sequence_lstm=True)
    test_args.arch = "char_source"
    test_args.char_source_dict_size = 126
    test_args.char_embed_dim = 8
    test_args.char_rnn_units = 12
    test_args.char_rnn_layers = 2

    _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
    task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
    model = task.build_model(test_args)
    translator = beam_decode.SequenceGenerator(
        [model], task.target_dictionary, use_char_source=True
    )
    src_tokens = torch.LongTensor([[0, 0, 0], [0, 0, 0]])
    src_lengths = torch.LongTensor([3, 3])
    char_inds = torch.LongTensor(np.zeros((2, 3, 5)))
    word_lengths = torch.LongTensor([[5, 5, 5], [5, 5, 5]])
    encoder_input = {
        "src_tokens": src_tokens,
        "src_lengths": src_lengths,
        "char_inds": char_inds,
        "word_lengths": word_lengths,
    }
    translator.generate(encoder_input, maxlen=7)
def test_basic_generate(self):
    test_args = test_utils.ModelParamsDict()
    _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
    model = models.build_model(test_args, src_dict, tgt_dict)
    translator = beam_decode.SequenceGenerator([model])
    src_tokens = torch.LongTensor([[0, 0, 0], [0, 0, 0]])
    src_lengths = torch.LongTensor([3, 3])
    translator.generate(src_tokens=src_tokens, src_lengths=src_lengths)
def test_basic_generate(self):
    test_args = test_utils.ModelParamsDict()
    _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
    task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
    model = task.build_model(test_args)
    translator = beam_decode.SequenceGenerator([model], task.target_dictionary)
    src_tokens = torch.LongTensor([[0, 0, 0], [0, 0, 0]])
    src_lengths = torch.LongTensor([3, 3])
    encoder_input = (src_tokens, src_lengths)
    translator.generate(encoder_input, maxlen=7)
def test_char_rnn_generate(self):
    test_args = test_utils.ModelParamsDict(sequence_lstm=True)
    test_args.arch = "char_source"
    test_args.char_source_dict_size = 126
    test_args.char_embed_dim = 8
    test_args.char_rnn_units = 12
    test_args.char_rnn_layers = 2

    _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
    model = models.build_model(test_args, src_dict, tgt_dict)
    translator = beam_decode.SequenceGenerator([model])
    src_tokens = torch.LongTensor([[0, 0, 0], [0, 0, 0]])
    src_lengths = torch.LongTensor([3, 3])
    char_inds = torch.LongTensor(np.zeros((2, 3, 5)))
    word_lengths = torch.LongTensor([[5, 5, 5], [5, 5, 5]])
    encoder_input = (src_tokens, src_lengths, char_inds, word_lengths)
    translator.generate(encoder_input, maxlen=7)
def test_diversity_sibling_rank(self):
    """Test the calculation of the diversity_sibling_rank() function."""
    test_args = test_utils.ModelParamsDict()
    _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
    task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
    model = task.build_model(test_args)
    translator = beam_decode.SequenceGenerator([model], task.target_dictionary)
    logprobs = torch.FloatTensor(
        [
            [[2, 1, 3, 5, 6], [0, 1, 3, 2, 4]],
            [[2, 3, 1, 5, 0], [3, 1, 5, 2, 0]],
        ]
    )
    logprobs_out = torch.FloatTensor(
        [
            [[-1, -3, 1, 4, 6], [-4, -2, 2, 0, 4]],
            [[0, 2, -2, 5, -4], [2, -2, 5, 0, -4]],
        ]
    )
    logprobs = translator.diversity_sibling_rank(logprobs, 1)
    np.testing.assert_allclose(
        actual=logprobs.numpy(),
        desired=logprobs_out.view(-1, 5).numpy(),
        atol=1e-5,
    )
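# A minimal sketch of the computation the test above checks, assuming
# diversity_sibling_rank() subtracts gamma times each candidate's 0-based
# rank (best first) within its row; `sibling_rank_sketch` is a hypothetical
# stand-in for illustration, not the library function.
import torch


def sibling_rank_sketch(logprobs, gamma):
    # logprobs: (bsz, beam, vocab) -> flattened to (bsz * beam, vocab).
    vocab_size = logprobs.size(-1)
    flat = logprobs.view(-1, vocab_size)
    # Rank each vocab entry within its row, descending by log-probability.
    _, sorted_idx = flat.sort(dim=1, descending=True)
    ranks = torch.empty_like(sorted_idx)
    ranks.scatter_(1, sorted_idx, torch.arange(vocab_size).expand_as(sorted_idx))
    # Penalize each candidate by gamma * its rank among its siblings.
    return flat - gamma * ranks.float()


# For the first row above, [2, 1, 3, 5, 6] with gamma=1 becomes
# [-1, -3, 1, 4, 6], matching the expected logprobs_out.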
def _generate_score(models, args, dataset, dataset_split):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(args.path)))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam
        )

    # Initialize generator
    model_weights = None
    if args.model_weights:
        model_weights = [float(w.strip()) for w in args.model_weights.split(",")]
    translator = beam_decode.SequenceGenerator(
        models,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        word_reward=args.word_reward,
        model_weights=model_weights,
    )
    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Generate and compute BLEU score
    scorer = bleu.Scorer(
        dataset.dst_dict.pad(), dataset.dst_dict.eos(), dataset.dst_dict.unk()
    )
    max_positions = min(model.max_encoder_positions() for model in models)
    itr = dataset.eval_dataloader(
        dataset_split,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        skip_invalid_size_inputs_valid_test=(
            args.skip_invalid_size_inputs_valid_test
        ),
    )
    if args.num_shards > 1:
        if args.shard_id < 0 or args.shard_id >= args.num_shards:
            raise ValueError("--shard-id must be between 0 and num_shards - 1")
        itr = data.sharded_iterator(itr, args.num_shards, args.shard_id)

    num_sentences = 0
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
        )
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            target_tokens = target_tokens.int().cpu()
            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = dataset.splits[dataset_split].src.get_original_text(
                    sample_id
                )
                target_str = dataset.splits[dataset_split].dst.get_original_text(
                    sample_id
                )
            else:
                src_str = dataset.src_dict.string(src_tokens, args.remove_bpe)
                target_str = dataset.dst_dict.string(
                    target_tokens, args.remove_bpe, escape_unk=True
                )

            if not args.quiet:
                print(f"S-{sample_id}\t{src_str}")
                print(f"T-{sample_id}\t{target_str}")

            # Process top predictions
            for i, hypo in enumerate(hypos[: min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo["tokens"].int().cpu(),
                    src_str=src_str,
                    alignment=hypo["alignment"].int().cpu(),
                    align_dict=align_dict,
                    dst_dict=dataset.dst_dict,
                    remove_bpe=args.remove_bpe,
                )

                if not args.quiet:
                    print(f"H-{sample_id}\t{hypo['score']}\t{hypo_str}")
                    print(
                        "A-{}\t{}".format(
                            sample_id,
                            " ".join(map(lambda x: str(utils.item(x)), alignment)),
                        )
                    )

                # Score only the top hypothesis
                if i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement
                        # and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str, dataset.dst_dict, add_if_not_exist=True
                        )
                    scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    return scorer, num_sentences, gen_timer
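# A hedged usage sketch for _generate_score, assuming `models`, `args`, and
# `dataset` were loaded elsewhere by the surrounding generate script; the
# helper name and the "test" split are illustrative, not the script's actual
# entry point.
def report_generation(models, args, dataset):
    scorer, num_sentences, gen_timer = _generate_score(
        models=models, args=args, dataset=dataset, dataset_split="test"
    )
    print(
        f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
        f"in {gen_timer.sum:.1f}s ({1.0 / gen_timer.avg:.2f} tokens/s)"
    )
    print(f"| Generate with beam={args.beam}: {scorer.result_string()}")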
def load_dataset(
    self, split, src_bin_path, tgt_bin_path, forward_model=None, backward_model=None
):
    """Load a dataset split."""
    corpus = ptt_data.ParallelCorpusConfig(
        source=ptt_data.CorpusConfig(
            dialect=self.source_lang, data_file=src_bin_path
        ),
        target=ptt_data.CorpusConfig(
            dialect=self.target_lang, data_file=tgt_bin_path
        ),
        weights_file=None,
    )

    if self.args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    data_utils.validate_corpus_exists(corpus=corpus, split=split)

    forward_tgt_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
        corpus.target.data_file
    )
    backward_tgt_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
        corpus.source.data_file
    )
    forward_src_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
        corpus.source.data_file
    )
    backward_src_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
        corpus.target.data_file
    )

    forward_parallel_dataset = weighted_data.WeightedLanguagePairDataset(
        src=forward_src_dataset,
        src_sizes=forward_src_dataset.sizes,
        src_dict=self.source_dictionary,
        tgt=forward_tgt_dataset,
        tgt_sizes=forward_tgt_dataset.sizes,
        tgt_dict=self.target_dictionary,
        remove_eos_from_source=self.remove_eos_from_source,
        append_eos_to_target=True,
    )
    backward_parallel_dataset = weighted_data.WeightedLanguagePairDataset(
        src=backward_src_dataset,
        src_sizes=backward_src_dataset.sizes,
        src_dict=self.target_dictionary,
        tgt=backward_tgt_dataset,
        tgt_sizes=backward_tgt_dataset.sizes,
        tgt_dict=self.source_dictionary,
        remove_eos_from_source=self.remove_eos_from_source,
        append_eos_to_target=True,
    )

    dataset_map = OrderedDict(
        [
            (f"{self.source_lang}-{self.target_lang}", forward_parallel_dataset),
            (f"{self.target_lang}-{self.source_lang}", backward_parallel_dataset),
        ]
    )

    assert (forward_model and backward_model) or (
        forward_model is None and backward_model is None
    ), "Forward and backward models must either both be specified or both be None."

    if forward_model and backward_model:
        fwd_generator = beam_decode.SequenceGenerator(
            models=[forward_model], tgt_dict=self.source_dictionary
        )
        bwd_generator = beam_decode.SequenceGenerator(
            models=[backward_model], tgt_dict=self.target_dictionary
        )

        def monolingual_dataset(
            path,
            dictionary,
            is_source=False,
            num_examples_limit: Optional[int] = None,
        ):
            dataset = self.load_monolingual_dataset(
                path, is_source=is_source, num_examples_limit=num_examples_limit
            )
            return LanguagePairDataset(
                src=dataset,
                src_sizes=dataset.sizes,
                src_dict=dictionary,
                tgt=None,
                tgt_sizes=None,
                tgt_dict=None,
            )

        monolingual_num_examples_limit = None
        if self.args.monolingual_ratio is not None:
            monolingual_num_examples_limit = int(
                self.args.monolingual_ratio * len(forward_parallel_dataset)
            )

        src_dataset = monolingual_dataset(
            path=self.args.train_mono_source_binary_path,
            dictionary=self.source_dictionary,
            is_source=True,
            num_examples_limit=monolingual_num_examples_limit,
        )
        tgt_dataset = monolingual_dataset(
            path=self.args.train_mono_target_binary_path,
            dictionary=self.target_dictionary,
            is_source=False,
            num_examples_limit=monolingual_num_examples_limit,
        )
        dataset_map[
            f"{self.source_lang}-"
            f"{self.target_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
        ] = BacktranslationDataset(
            tgt_dataset=TransformEosDataset(
                dataset=tgt_dataset,
                eos=self.target_dictionary.eos(),
                # Remove EOS from the input before backtranslation.
                remove_eos_from_src=True,
            ),
            backtranslation_fn=bwd_generator.generate,
            max_len_a=self.args.max_len_a,
            max_len_b=self.args.max_len_b,
            output_collater=TransformEosDataset(
                dataset=tgt_dataset,
                eos=self.target_dictionary.eos(),
                # The original input (now the target) doesn't have an EOS,
                # so we need to add one. The generated backtranslation
                # (now the source) will have an EOS, so we want to remove it.
                append_eos_to_tgt=True,
                remove_eos_from_src=True,
            ).collater,
        )
        dataset_map[
            f"{self.target_lang}-"
            f"{self.source_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
        ] = BacktranslationDataset(
            tgt_dataset=src_dataset,
            backtranslation_fn=fwd_generator.generate,
            max_len_a=self.args.max_len_a,
            max_len_b=self.args.max_len_b,
            output_collater=TransformEosDataset(
                dataset=src_dataset,
                eos=self.source_dictionary.eos(),
                # The original input (now the target) doesn't have an EOS,
                # so we need to add one. The generated backtranslation
                # (now the source) will have an EOS, so we want to remove it.
                append_eos_to_tgt=True,
                remove_eos_from_src=True,
            ).collater,
        )

    # Print before loading RoundRobinZipDatasets to help catch any bugs.
    for dataset_key, dataset in dataset_map.items():
        print(f"| {split}: {dataset_key} {len(dataset)} examples in dataset")

    self.datasets[split] = RoundRobinZipDatasets(dataset_map)
    print(
        f"| {split} {len(self.datasets[split])} examples in RoundRobinZipDatasets"
    )

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)
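# A minimal sketch of the EOS bookkeeping that the TransformEosDataset
# wrappers above perform, assuming `eos` is the dictionary's integer EOS
# index; these helpers are illustrative, not the library API.
import torch


def remove_eos_from_src(tokens, eos):
    # The monolingual corpus was binarized with a trailing EOS; strip it
    # before the sentence is fed to the backtranslation generator.
    return tokens[:-1] if len(tokens) > 0 and tokens[-1] == eos else tokens


def append_eos_to_tgt(tokens, eos):
    # Once the original sentence becomes the target of the synthetic pair,
    # it needs its EOS back so the loss sees a properly terminated sequence.
    return torch.cat([tokens, tokens.new_tensor([eos])])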