Example #1
 def setUp(self):
     self.src_txt, self.trg_txt = test_utils.create_test_text_files()
     self.vocab_file_path = test_utils.make_temp_file()
     self.d = dictionary.Dictionary.build_vocab_file(
         corpus_files=[self.src_txt, self.trg_txt],
         vocab_file=self.vocab_file_path,
         max_vocab_size=0,
         padding_factor=1,  # don't add extra padding symbols
     )
     # src_ref is reversed
     self.src_ref = [
         [106, 104, 102, 100],
         [104, 104, 102, 102, 100, 100],
         [102, 102, 102, 102, 100, 100, 100, 100],
         [100, 100, 100, 100, 100, 100, 100, 100, 100, 100],
     ]
     self.trg_ref = [
         [101, 101, 101, 101, 101, 101, 101, 101, 101, 101],
         [101, 101, 101, 101, 103, 103, 103, 103],
         [101, 101, 103, 103, 105, 105],
         [101, 103, 105, 107],
     ]
     self.src_txt_numberized, self.trg_txt_numberized = test_utils.create_test_numberized_data_files(
         self.src_ref, self.trg_ref, reverse_source=True
     )
     self.num_sentences = 4
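The reversed source references can be sanity-checked directly; a minimal standalone sketch using the literal ids from this setUp:

 # With reverse_source=True the source ids are written back-to-front, so
 # un-reversing the first reference recovers the ascending vocab ids.
 src_ref_reversed = [106, 104, 102, 100]
 assert list(reversed(src_ref_reversed)) == [100, 102, 104, 106]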
Example #2
 def test_load_data_single_path(self):
     test_args = test_utils.ModelParamsDict()
     test_args.source_lang = "en"
     test_args.target_lang = "fr"
     test_args.log_verbose = False
     src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
     src_text_file, tgt_text_file = test_utils.create_test_text_files()
     src_bin_path = preprocess.binarize_text_file(
         text_file=src_text_file,
         dictionary=src_dict,
         output_path=tempfile.NamedTemporaryFile().name,
         append_eos=True,
         reverse_order=False,
     )
     tgt_bin_path = preprocess.binarize_text_file(
         text_file=tgt_text_file,
         dictionary=tgt_dict,
         output_path=tempfile.NamedTemporaryFile().name,
         append_eos=True,
         reverse_order=False,
     )
     task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
     split = "0"
     task.load_dataset(split, src_bin_path, tgt_bin_path)
     self.assertEqual(len(task.datasets[split]), 4)  # the test corpus has 4 sentence pairs
     self.assertIsInstance(task.datasets[split], LanguagePairDataset)
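The `tempfile.NamedTemporaryFile().name` pattern above only borrows a unique filesystem path; a sketch of a more explicit helper for the same purpose (hypothetical name, standard-library calls only):

 import os
 import tempfile

 def fresh_temp_path(suffix=""):
     # mkstemp creates the file and returns an open descriptor; closing it
     # immediately leaves behind a unique path for binarize_text_file to
     # overwrite, without relying on NamedTemporaryFile's deletion timing.
     fd, path = tempfile.mkstemp(suffix=suffix)
     os.close(fd)
     return path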
Example #3
 def test_load_data_multi_path(self):
     test_args = test_utils.ModelParamsDict()
     test_args.source_lang = "en"
     test_args.target_lang = "fr"
     test_args.log_verbose = False
     src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
     num_paths = 4
     src_bin_path, tgt_bin_path = {}, {}
     for i in range(num_paths):
         src_text_file, tgt_text_file = test_utils.create_test_text_files()
         src_bin_path[i] = preprocess.binarize_text_file(
             text_file=src_text_file,
             dictionary=src_dict,
             output_path=tempfile.NamedTemporaryFile().name,
             append_eos=True,
             reverse_order=False,
         )
         tgt_bin_path[i] = preprocess.binarize_text_file(
             text_file=tgt_text_file,
             dictionary=tgt_dict,
             output_path=tempfile.NamedTemporaryFile().name,
             append_eos=True,
             reverse_order=False,
         )
     task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
     split = "1"
     task.load_dataset(split, src_bin_path, tgt_bin_path)
     self.assertEqual(len(task.datasets[split]), 16)  # 4 corpora x 4 sentences each
     self.assertIsInstance(task.datasets[split], MultiCorpusSampledDataset)
Example #4
 def _prepare_data_multi_path(self, num_paths):
     test_args = test_utils.ModelParamsDict()
     test_args.source_lang = "en"
     test_args.target_lang = "fr"
     test_args.log_verbose = False
     test_args.dataset_upsampling = None
     test_args.dataset_relative_ratio = None
     src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
     src_bin_path, tgt_bin_path = {}, {}
     for i in range(num_paths):
         src_text_file, tgt_text_file = test_utils.create_test_text_files()
         src_bin_path[i] = preprocess.binarize_text_file(
             text_file=src_text_file,
             dictionary=src_dict,
             output_path=tempfile.NamedTemporaryFile().name,
             append_eos=True,
             reverse_order=False,
         )
         tgt_bin_path[i] = preprocess.binarize_text_file(
             text_file=tgt_text_file,
             dictionary=tgt_dict,
             output_path=tempfile.NamedTemporaryFile().name,
             append_eos=True,
             reverse_order=False,
         )
     return test_args, src_dict, tgt_dict, src_bin_path, tgt_bin_path
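A hedged usage sketch for the helper above, following the load pattern of Example #3 (hypothetical test body, assumes the same `tasks` import and test-class context):

 def test_load_data_multi_path_via_helper(self):
     test_args, src_dict, tgt_dict, src_bin, tgt_bin = self._prepare_data_multi_path(4)
     task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
     task.load_dataset("1", src_bin, tgt_bin)
     self.assertEqual(len(task.datasets["1"]), 16)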
Example #5
 def setUp(self):
     self.src_txt, self.trg_txt = test_utils.create_test_text_files()
     self.vocab_file_path = test_utils.make_temp_file()
     self.d = dictionary.Dictionary.build_vocab_file(
         corpus_files=[self.src_txt, self.trg_txt],
         vocab_file=self.vocab_file_path,
         max_vocab_size=0,
     )
     # src_ref is reversed; ids are +1 for Lua/Torch's 1-based indexing
     self.src_ref = [
         [107, 105, 103, 101],
         [105, 105, 103, 103, 101, 101],
         [103, 103, 103, 103, 101, 101, 101, 101],
         [101, 101, 101, 101, 101, 101, 101, 101, 101, 101],
     ]
     self.trg_ref = [
         [102, 102, 102, 102, 102, 102, 102, 102, 102, 102],
         [102, 102, 102, 102, 104, 104, 104, 104],
         [102, 102, 104, 104, 106, 106],
         [102, 104, 106, 108],
     ]
     self.src_txt_numberized, self.trg_txt_numberized = test_utils.create_test_numberized_data_files(
         self.src_ref, self.trg_ref, reverse_source=True)
     self.lua_eos = self.d.eos_index + 1
     self.num_sentences = 4
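The +1 offset can be checked against Example #1's references; a minimal sketch using the literal values from both setUps:

 # Example #5's ids are Example #1's ids shifted by one for Lua/Torch's
 # 1-based indexing.
 ref_zero_based = [106, 104, 102, 100]   # Example #1
 ref_one_based = [107, 105, 103, 101]    # Example #5
 assert [v + 1 for v in ref_zero_based] == ref_one_based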
Example #6
 def test_build_vocab_file_max_vocab(self):
     src_txt, trg_txt = test_utils.create_test_text_files()
     tmp_prefix = test_utils.make_temp_file()
     src_dict1 = dictionary.Dictionary.build_vocab_file(
         corpus_files=[src_txt],
         vocab_file=f"{tmp_prefix}.src1",
         max_vocab_size=1)
     src_dict2 = dictionary.Dictionary.build_vocab_file(
         corpus_files=[src_txt],
         vocab_file=f"{tmp_prefix}.src2",
         max_vocab_size=2)
     src_dict3 = dictionary.Dictionary.build_vocab_file(
         corpus_files=[src_txt],
         vocab_file=f"{tmp_prefix}.src3",
         max_vocab_size=104)
     src_dict4 = dictionary.Dictionary.build_vocab_file(
         corpus_files=[src_txt],
         vocab_file=f"{tmp_prefix}.src4",
         max_vocab_size=0)
     self.assertEqual(src_dict1.nspecial + 1, len(src_dict1))
     self.assertEqual(src_dict2.nspecial + 2, len(src_dict2))
     self.assertEqual(src_dict3.nspecial + 4, len(src_dict3))
     self._assert_vocab_equal(src_dict3, src_dict4)
     os.remove(f"{tmp_prefix}.src1")
     os.remove(f"{tmp_prefix}.src2")
     os.remove(f"{tmp_prefix}.src3")
     os.remove(f"{tmp_prefix}.src4")
     os.remove(src_txt)
     os.remove(trg_txt)
Example #7
 def test_build_vocab_file(self):
     src_txt, trg_txt = test_utils.create_test_text_files()
     tmp_prefix = test_utils.make_temp_file()
     src_dict1 = dictionary.Dictionary.build_vocab_file(
         corpus_files=[src_txt],
         vocab_file=f"{tmp_prefix}.src1",
         max_vocab_size=1000)
     src_dict2 = dictionary.Dictionary.build_vocab_file(
         corpus_files=[src_txt, src_txt, src_txt],
         vocab_file=f"{tmp_prefix}.src2",
         max_vocab_size=1000,
         padding_factor=1,
     )
     trg_dict1 = dictionary.Dictionary.build_vocab_file(
         corpus_files=[trg_txt],
         vocab_file=f"{tmp_prefix}.trg1",
         max_vocab_size=1000)
     trg_dict2 = dictionary.Dictionary.build_vocab_file(
         corpus_files=[trg_txt, trg_txt, trg_txt],
         vocab_file=f"{tmp_prefix}.trg2",
         max_vocab_size=1000,
         padding_factor=1,
     )
     srctrg_dict = dictionary.Dictionary.build_vocab_file(
         corpus_files=[src_txt, trg_txt],
         vocab_file=f"{tmp_prefix}.srctrg",
         max_vocab_size=1000,
         padding_factor=1,
     )
     nspecial = src_dict1.nspecial
     self.assertEqual(len(src_dict1), nspecial + 4)
     self.assertEqual(len(trg_dict1), nspecial + 4)
     self.assertEqual(len(srctrg_dict), nspecial + 8)
     for s in src_dict1.symbols:
         self.assertIn(s, srctrg_dict.symbols)
     for s in trg_dict1.symbols:
         self.assertIn(s, srctrg_dict.symbols)
     src_dict1_loaded = dictionary.Dictionary.load(f"{tmp_prefix}.src1")
     src_dict2_loaded = dictionary.Dictionary.load(f"{tmp_prefix}.src2")
     trg_dict1_loaded = dictionary.Dictionary.load(f"{tmp_prefix}.trg1")
     trg_dict2_loaded = dictionary.Dictionary.load(f"{tmp_prefix}.trg2")
     self._assert_vocab_equal(src_dict1, src_dict2)
     self._assert_vocab_equal(src_dict1, src_dict1_loaded)
     self._assert_vocab_equal(src_dict1, src_dict2_loaded)
     self._assert_vocab_equal(trg_dict1, trg_dict2)
     self._assert_vocab_equal(trg_dict1, trg_dict1_loaded)
     self._assert_vocab_equal(trg_dict1, trg_dict2_loaded)
     for c in range(nspecial, nspecial + 4):
         self.assertEqual(src_dict1.count[c], src_dict1_loaded.count[c])
         self.assertEqual(src_dict2.count[c], src_dict2_loaded.count[c])
         self.assertEqual(src_dict1.count[c] * 3, src_dict2.count[c])
         self.assertEqual(trg_dict1.count[c], trg_dict1_loaded.count[c])
         self.assertEqual(trg_dict2.count[c], trg_dict2_loaded.count[c])
         self.assertEqual(trg_dict1.count[c] * 3, trg_dict2.count[c])
     os.remove(f"{tmp_prefix}.src1")
     os.remove(f"{tmp_prefix}.src2")
     os.remove(f"{tmp_prefix}.trg1")
     os.remove(f"{tmp_prefix}.trg2")
     os.remove(src_txt)
     os.remove(trg_txt)
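Examples #6 and #7 lean on an `_assert_vocab_equal` helper that is not shown; a minimal sketch of what it plausibly checks, assuming a fairseq-style Dictionary with `symbols` and `indices` attributes (an assumption, not the repository's implementation):

 def _assert_vocab_equal(self, d1, d2):
     # Two vocabularies match when they hold the same symbols at the same
     # indices; token counts are compared separately by the tests above.
     self.assertEqual(d1.symbols, d2.symbols)
     self.assertEqual(d1.indices, d2.indices)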
Example #8
 def test_load_data_single_path_idx_bin(self):
     test_args = test_utils.ModelParamsDict()
     test_args.source_lang = "en"
     test_args.target_lang = "fr"
     test_args.log_verbose = False
     src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
     src_text_file, tgt_text_file = test_utils.create_test_text_files()
     task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
     with tempfile.TemporaryDirectory() as destdir:
         preprocess_args = [
             "--source-lang",
             test_args.source_lang,
             "--target-lang",
             test_args.target_lang,
             "--destdir",
             destdir,
         ]
         preproc_parser = preprocess_options.get_preprocessing_parser()
         preproc_args = preproc_parser.parse_args(preprocess_args)
         preproc_args.dataset_impl = "mmap"
         split = "train"
         binarize(
             preproc_args,
             src_text_file,
             src_dict,
             split,
             test_args.source_lang,
             offset=0,
             end=-1,
         )
         binarize(
             preproc_args,
             tgt_text_file,
             tgt_dict,
             split,
             test_args.target_lang,
             offset=0,
             end=-1,
         )
         src_path = dataset_dest_prefix(preproc_args, split,
                                        test_args.source_lang)
         tgt_path = dataset_dest_prefix(preproc_args, split,
                                        test_args.target_lang)
         task.load_dataset(split, src_path, tgt_path, is_npz=False)
         self.assertEqual(len(task.datasets[split]), 4)
         self.assertIsInstance(task.datasets[split], LanguagePairDataset)
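For the `mmap` dataset_impl used here, fairseq's binarizer writes an index/data pair next to the returned prefix; a hedged follow-up check (assumes the conventional `.idx`/`.bin` suffixes):

 import os
 # Hypothetical existence check on the binarized output files.
 for prefix in (src_path, tgt_path):
     assert os.path.exists(f"{prefix}.idx")
     assert os.path.exists(f"{prefix}.bin")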
Example #9
 def test_load_data_noising(self):
     test_args = test_utils.ModelParamsDict()
     test_args.source_lang = "en"
     test_args.target_lang = "fr"
     test_args.log_verbose = False
     src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
     num_paths = 4
     src_bin_path, tgt_bin_path = {}, {}
     for i in range(num_paths):
         src_text_file, tgt_text_file = test_utils.create_test_text_files()
         src_bin_path[i] = preprocess.binarize_text_file(
             text_file=src_text_file,
             dictionary=src_dict,
             output_path=tempfile.NamedTemporaryFile().name,
             append_eos=True,
             reverse_order=False,
         )
         tgt_bin_path[i] = preprocess.binarize_text_file(
             text_file=tgt_text_file,
             dictionary=tgt_dict,
             output_path=tempfile.NamedTemporaryFile().name,
             append_eos=True,
             reverse_order=False,
         )
     task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
     split = "1"
     task.load_dataset(
         split,
         src_bin_path,
         tgt_bin_path,
         noiser={
             0:
             UnsupervisedMTNoising(
                 dictionary=src_dict,
                 max_word_shuffle_distance=3,
                 word_dropout_prob=0.2,
                 word_blanking_prob=0.2,
             )
         },
     )
     self.assertEqual(len(task.datasets[split]), 16)
     self.assertIsInstance(task.datasets[split].datasets[0].src,
                           NoisingDataset)
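A standalone sketch of applying the same noiser to a batch, assuming fairseq's `UnsupervisedMTNoising.noising(x, lengths)` interface on a time-major token tensor (the method name and return value are assumptions, not confirmed by this test):

 import torch
 # Hypothetical direct invocation of the same noiser configuration.
 noiser = UnsupervisedMTNoising(
     dictionary=src_dict,
     max_word_shuffle_distance=3,
     word_dropout_prob=0.2,
     word_blanking_prob=0.2,
 )
 x = torch.LongTensor([[100], [102], [104], [src_dict.eos()]])  # (T x B)
 noised = noiser.noising(x, torch.LongTensor([x.size(0)]))  # assumed return: noised tokens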
Example #10
 def setUp(self):
     self.src_txt, self.trg_txt = test_utils.create_test_text_files()
     self.vocab_file_path = test_utils.make_temp_file()
     self.word_dict = Dictionary.build_vocab_file(
         corpus_files=[self.src_txt, self.trg_txt],
         vocab_file=self.vocab_file_path,
         max_vocab_size=0,
         padding_factor=1,  # don't add extra padding symbols
     )
     self.char_dict = Dictionary.build_vocab_file(
         corpus_files=[self.src_txt, self.trg_txt],
         vocab_file=self.vocab_file_path,
         max_vocab_size=0,
         is_char_vocab=True,
         padding_factor=1,  # don't add extra padding symbols
     )
     self.sample = self._dummy_char_data_sample(
         src_dict=self.word_dict,
         dst_dict=self.word_dict,
         src_char_dict=self.char_dict,
         dst_char_dict=self.char_dict,
     )
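A conceptual sketch of the word-level vs. character-level vocabularies built above (the tokenization shown is an assumed reading of `is_char_vocab`, not the library's code):

 line = "the quick fox"
 word_tokens = line.split()                              # counted by word_dict
 char_tokens = [c for tok in word_tokens for c in tok]   # counted by char_dict
 assert word_tokens == ["the", "quick", "fox"]
 assert char_tokens[:3] == ["t", "h", "e"]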
Example #11
 def test_push(self):
     max_vocab_dict = dictionary.MaxVocabDictionary()
     src_txt, trg_txt = test_utils.create_test_text_files()
     tmp_prefix = test_utils.make_temp_file()
     src_dict = dictionary.Dictionary.build_vocab_file(
         corpus_files=[src_txt], vocab_file=f"{tmp_prefix}.src", max_vocab_size=1000
     )
     srctrg_dict = dictionary.Dictionary.build_vocab_file(
         corpus_files=[src_txt, trg_txt],
         vocab_file=f"{tmp_prefix}.srctrg",
         max_vocab_size=1000,
     )
     self.assertEqual(len(max_vocab_dict), max_vocab_dict.nspecial)
     max_vocab_dict.push(src_dict)
     self.assertEqual(len(max_vocab_dict), len(src_dict))
     max_vocab_dict.push(srctrg_dict)
     self.assertEqual(len(max_vocab_dict), len(srctrg_dict))
     max_vocab_dict.push(src_dict)
     self.assertEqual(len(max_vocab_dict), len(srctrg_dict))
     os.remove(f"{tmp_prefix}.src")
     os.remove(f"{tmp_prefix}.srctrg")
     os.remove(src_txt)
     os.remove(trg_txt)
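The assertions above pin down the `push` semantics: the dictionary adopts whichever pushed vocabulary is larger and ignores smaller ones. A behavioral sketch of that rule (a stand-in class, not the repository implementation):

 class MaxVocabSketch:
     def __init__(self):
         self.current = []          # stands in for the initial special symbols
     def push(self, other):
         # Keep the larger vocabulary; a smaller push is a no-op.
         if len(other) > len(self.current):
             self.current = list(other)
     def __len__(self):
         return len(self.current)

 d = MaxVocabSketch()
 d.push(["a", "b"]); assert len(d) == 2
 d.push(["a"]); assert len(d) == 2   # smaller push ignored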
Example #12
 def setUp(self):
     (
         self.source_text_file,
         self.target_text_file,
     ) = test_utils.create_test_text_files()
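Example #12's setUp creates temp files but the matching cleanup is not shown; a tearDown sketch mirroring the explicit `os.remove` cleanup used in Examples #6 and #11 (hypothetical, assumes only the attributes set above):

 import os

 def tearDown(self):
     # Remove the files created in setUp.
     for path in (self.source_text_file, self.target_text_file):
         if os.path.exists(path):
             os.remove(path)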