def load_dataset(
    self,
    split: str,
    src_bin_path: str,
    tgt_bin_path: str,
    weights_file=None,
    is_npz=True,
):
    """Load a dataset split from a single corpus or a collection of corpora."""
    src_bin_path = pytorch_translate_utils.maybe_parse_collection_argument(
        src_bin_path
    )
    tgt_bin_path = pytorch_translate_utils.maybe_parse_collection_argument(
        tgt_bin_path
    )
    # At most one of dataset_upsampling / dataset_relative_ratio may be
    # specified.
    if type(src_bin_path) is str:
        assert type(tgt_bin_path) is str
        self._load_dataset_single_path(
            split=split,
            src_bin_path=src_bin_path,
            tgt_bin_path=tgt_bin_path,
            weights_file=weights_file,
            is_npz=is_npz,
        )
    else:
        self._load_dataset_multi_path(
            split, src_bin_path, tgt_bin_path, is_npz=is_npz
        )

    if getattr(self.args, "log_verbose", False):
        print("Finished loading dataset", flush=True)

    print(f"| {split} {len(self.datasets[split])} examples")
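# Illustration (hypothetical, not part of pytorch_translate): the str-vs-dict
# dispatch above in miniature. A plain string selects the single-corpus
# loader; the collection form that maybe_parse_collection_argument can
# produce (assumed here to map corpus name -> path) selects the multi-corpus
# loader.
def _dispatch_sketch(src_bin_path, tgt_bin_path):
    if type(src_bin_path) is str:
        assert type(tgt_bin_path) is str
        return "single"
    return "multi"

assert _dispatch_sketch("train.src.npz", "train.tgt.npz") == "single"
assert _dispatch_sketch({"a": "a.src.npz"}, {"a": "a.tgt.npz"}) == "multi"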
def preprocess_corpora(args, dictionary_cls=Dictionary):
    """Build vocabularies as needed, then binarize the configured corpora."""
    if (
        args.train_source_binary_path is not None
        and args.train_target_binary_path is not None
    ):
        if isinstance(
            utils.maybe_parse_collection_argument(args.train_source_binary_path),
            str,
        ) and isinstance(
            utils.maybe_parse_collection_argument(args.train_target_binary_path),
            str,
        ):
            args.train_source_binary_path = maybe_generate_temp_file_path(
                args.train_source_binary_path
            )
            args.train_target_binary_path = maybe_generate_temp_file_path(
                args.train_target_binary_path
            )
    args.eval_source_binary_path = maybe_generate_temp_file_path(
        args.eval_source_binary_path
    )
    args.eval_target_binary_path = maybe_generate_temp_file_path(
        args.eval_target_binary_path
    )

    # Additional text preprocessing options could be added here before
    # binarizing.
    if pytorch_translate_data.is_multilingual(args):
        preprocess_corpora_multilingual(args)
    elif pytorch_translate_data.is_multilingual_many_to_one(args):
        preprocess_corpora_multilingual_many_to_one(args, dictionary_cls)
    elif pytorch_translate_data.is_latent_variable(args):
        preprocess_corpora_latent_variable(args)
    else:
        # Vocabs are built before preprocessing because we might need to use
        # both monolingual and bilingual corpora sources to build the vocab
        # (in the case of semisupervised training).
        source_dict, char_source_dict, target_dict = build_vocabs(
            args=args, dictionary_cls=dictionary_cls
        )
        preprocess_bilingual_corpora(
            args=args,
            source_dict=source_dict,
            char_source_dict=char_source_dict,
            target_dict=target_dict,
        )
        # Binarize additional monolingual corpora for the semisupervised
        # translation task. This must stay inside the else branch, since it
        # uses the dictionaries built just above.
        if args.task in (
            constants.SEMI_SUPERVISED_TASK,
            constants.DENOISING_AUTOENCODER_TASK,
        ):
            args.train_mono_source_binary_path = maybe_generate_temp_file_path(
                output_path=getattr(args, "train_mono_source_binary_path", None)
            )
            args.train_mono_target_binary_path = maybe_generate_temp_file_path(
                output_path=getattr(args, "train_mono_target_binary_path", None)
            )
            preprocess_monolingual_corpora(
                args,
                source_dict=source_dict,
                char_source_dict=char_source_dict,
                target_dict=target_dict,
            )
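# Illustration (an assumption, not the library's implementation): the
# repeated maybe_generate_temp_file_path calls above imply a helper that
# keeps an explicit output path as-is and otherwise allocates a fresh temp
# file so binarization always has somewhere to write. A minimal sketch of
# that assumed contract:
import tempfile

def maybe_generate_temp_file_path_sketch(output_path=None):
    if output_path:
        return output_path
    with tempfile.NamedTemporaryFile(delete=False, suffix=".npz") as f:
        return f.name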
def _load_dataset_multi_path(
    self, split: str, src_bin_path: dict, tgt_bin_path: dict, is_npz: bool = True
):
    """Load one split from multiple corpora keyed by corpus name, with
    optional per-corpus upsampling / relative-ratio sampling and noising."""
    assert type(tgt_bin_path) is not str
    assert set(src_bin_path.keys()) == set(tgt_bin_path.keys())
    source_lang = self.args.source_lang or "src"
    target_lang = self.args.target_lang or "tgt"
    direction = source_lang + "-" + target_lang
    dataset_upsampling = (
        pytorch_translate_utils.maybe_parse_collection_argument(
            self.args.dataset_upsampling
        )[direction]
        if self.args.dataset_upsampling
        else None
    )
    dataset_relative_ratio = (
        pytorch_translate_utils.maybe_parse_collection_argument(
            self.args.dataset_relative_ratio
        )[direction]
        if self.args.dataset_relative_ratio
        else None
    )
    noiser = {}
    noise_options = [
        "word_dropout_prob",
        "max_word_shuffle_distance",
        "word_blanking_prob",
    ]
    for option in noise_options:
        option_map = getattr(self.args, option + "_map", None)
        if option_map:
            option_map = pytorch_translate_utils.maybe_parse_collection_argument(
                option_map
            )[direction]
            for key in option_map:
                if key not in noiser:
                    noiser[key] = {
                        noise_option: None for noise_option in noise_options
                    }
                noiser[key][option] = option_map[key]
    for key in noiser:
        noiser[key] = UnsupervisedMTNoising(
            dictionary=self.src_dict,
            max_word_shuffle_distance=noiser[key]["max_word_shuffle_distance"]
            or 0,
            word_dropout_prob=noiser[key]["word_dropout_prob"] or 0,
            word_blanking_prob=noiser[key]["word_blanking_prob"] or 0,
        )

    if dataset_relative_ratio is not None:
        assert dataset_upsampling is None, (
            "dataset_upsampling and dataset_relative_ratio cannot be "
            "specified together."
        )
        assert dataset_relative_ratio[0] in src_bin_path.keys()
        self._load_dataset_multi_path_helper(
            split=split,
            src_multiple_bin_paths=src_bin_path,
            tgt_multiple_bin_paths=tgt_bin_path,
            dataset_relative_ratio=dataset_relative_ratio,
            seed=self.args.seed,
            noiser=noiser,
            is_npz=is_npz,
        )
    elif dataset_upsampling is not None:
        for key in dataset_upsampling.keys():
            assert key in src_bin_path.keys()
        self._load_dataset_multi_path_helper(
            split=split,
            src_multiple_bin_paths=src_bin_path,
            tgt_multiple_bin_paths=tgt_bin_path,
            dataset_upsampling=dataset_upsampling,
            seed=self.args.seed,
            noiser=noiser,
            is_npz=is_npz,
        )
    else:
        self._load_dataset_multi_path_helper(
            split=split,
            src_multiple_bin_paths=src_bin_path,
            tgt_multiple_bin_paths=tgt_bin_path,
            seed=self.args.seed,
            noiser=noiser,
            is_npz=is_npz,
        )
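# Illustration (toy data, hypothetical corpus names): the option-map fold
# above, isolated. Each per-option map (e.g. a parsed --word-dropout-prob-map
# value for the current direction) is merged into one per-corpus dict, with
# unseen options left as None so the later `or 0` fallback applies.
noise_options = [
    "word_dropout_prob",
    "max_word_shuffle_distance",
    "word_blanking_prob",
]
option_maps = {
    "word_dropout_prob": {"corpus_a": 0.1},
    "max_word_shuffle_distance": {"corpus_a": 3, "corpus_b": 2},
}
noiser = {}
for option in noise_options:
    for key, value in option_maps.get(option, {}).items():
        noiser.setdefault(key, {o: None for o in noise_options})[option] = value

assert noiser["corpus_a"]["word_dropout_prob"] == 0.1
assert noiser["corpus_b"]["word_blanking_prob"] is None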
def setup_training_model(args):
    """Parse args, load dataset, and build model with criterion."""
    if not torch.cuda.is_available():
        print("Warning: training without CUDA is likely to be slow!")
    else:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task and load dataset
    task = tasks.setup_task(args)

    # Build model and criterion
    model = task.build_model(args)
    print("| building criterion")
    criterion = task.build_criterion(args)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(f"| num. model params: {sum(p.numel() for p in model.parameters())}")

    if args.task == constants.SEMI_SUPERVISED_TASK:
        # TODO(T35638969): hide this inside the task itself, just use self.args
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=args.train_source_binary_path,
            tgt_bin_path=args.train_target_binary_path,
            forward_model=task.forward_model,
            backward_model=task.backward_model,
        )
    elif args.task == "pytorch_translate_denoising_autoencoder":
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=args.train_source_binary_path,
            tgt_bin_path=args.train_target_binary_path,
            seed=args.seed,
            use_noiser=True,
        )
    elif args.task == "dual_learning_task":
        task.load_dataset(split=args.train_subset, seed=args.seed)
    elif args.task == "pytorch_translate_knowledge_distillation":
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=args.train_source_binary_path,
            tgt_bin_path=args.train_target_binary_path,
            weights_file=getattr(args, "train_weights_path", None),
            is_train=True,
        )
    else:
        # Support both single and multi path loading for now
        dataset_upsampling = getattr(args, "dataset_upsampling", None)
        dataset_relative_ratio = getattr(args, "dataset_relative_ratio", None)
        source_lang = getattr(args, "source_lang", "src")
        target_lang = getattr(args, "target_lang", "tgt")
        direction = source_lang + "-" + target_lang
        if dataset_upsampling:
            dataset_upsampling = pytorch_translate_utils.maybe_parse_collection_argument(
                dataset_upsampling
            )[direction]
        if dataset_relative_ratio:
            dataset_relative_ratio = pytorch_translate_utils.maybe_parse_collection_argument(
                dataset_relative_ratio
            )[direction]
        noiser = {}
        noise_options = [
            "word_dropout_prob",
            "max_word_shuffle_distance",
            "word_blanking_prob",
        ]
        for option in noise_options:
            option_map = getattr(args, option + "_map", None)
            if option_map:
                option_map = pytorch_translate_utils.maybe_parse_collection_argument(
                    option_map
                )[direction]
                for key in option_map:
                    if key not in noiser:
                        noiser[key] = {
                            noise_option: None for noise_option in noise_options
                        }
                    noiser[key][option] = option_map[key]
        for key in noiser:
            noiser[key] = UnsupervisedMTNoising(
                dictionary=task.src_dict,
                max_word_shuffle_distance=noiser[key]["max_word_shuffle_distance"]
                or 0,
                word_dropout_prob=noiser[key]["word_dropout_prob"] or 0,
                word_blanking_prob=noiser[key]["word_blanking_prob"] or 0,
            )
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=pytorch_translate_utils.maybe_parse_collection_argument(
                args.train_source_binary_path
            ),
            tgt_bin_path=pytorch_translate_utils.maybe_parse_collection_argument(
                args.train_target_binary_path
            ),
            weights_file=getattr(args, "train_weights_path", None),
            dataset_upsampling=dataset_upsampling,
            dataset_relative_ratio=dataset_relative_ratio,
            seed=args.seed,
            noiser=noiser,
        )

    if args.task == "dual_learning_task":
        task.load_dataset(split=args.valid_subset, seed=args.seed)
    else:
        task.load_dataset(
            split=args.valid_subset,
            src_bin_path=args.eval_source_binary_path,
            tgt_bin_path=args.eval_target_binary_path,
        )
    return task, model, criterion
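# Illustration (made-up Namespace values): collection arguments such as
# dataset_upsampling are indexed by a direction key built from the language
# pair, falling back to "src"/"tgt" when the languages are unset.
from argparse import Namespace

args_sketch = Namespace(source_lang="en", target_lang="de")
direction = (
    getattr(args_sketch, "source_lang", "src")
    + "-"
    + getattr(args_sketch, "target_lang", "tgt")
)
assert direction == "en-de"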