def setup_training_state(args, trainer, task):
    """Set up the directory for saving checkpoints.
    Load pretrained model if specified."""
    os.makedirs(args.save_dir, exist_ok=True)

    # If --restore-file is already present under --save-dir, use that one
    # instead of --pretrained-checkpoint-file. The idea is that
    # --pretrained-checkpoint-file allows the user to specify restoring from a
    # different run's checkpoint (possibly with different training params),
    # while not polluting the previous run's checkpoint directory
    # with new checkpoints. However, if training gets interrupted
    # and the user restarts training, we want to resume from
    # the checkpoints under --save-dir, instead of
    # restarting again from the old run's checkpoint at
    # --pretrained-checkpoint-file.
    #
    # Note that if args.restore_file is an absolute path, os.path.join() will
    # ignore previous directory args and just use the absolute path as is.
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    restore_state = True
    if os.path.isfile(checkpoint_path):
        print(
            f"| Using --save-dir={args.save_dir}, --restore-file={args.restore_file}."
        )
    elif args.pretrained_checkpoint_file and os.path.isfile(
        args.pretrained_checkpoint_file
    ):
        checkpoint_path = args.pretrained_checkpoint_file
        restore_state = args.load_pretrained_checkpoint_state
        print(
            f"| Using --pretrained-checkpoint-file={args.pretrained_checkpoint_file}, "
            f"--load-pretrained-checkpoint-state={args.load_pretrained_checkpoint_state}."
        )

    extra_state = default_extra_state(args)
    if not os.path.isfile(checkpoint_path) and args.multi_model_restore_files:
        print(f"| Restoring individual models from {args.multi_model_restore_files}")
        multi_model.import_individual_models(args.multi_model_restore_files, trainer)
    else:
        loaded, loaded_extra_state = load_existing_checkpoint(
            checkpoint_path=checkpoint_path,
            trainer=trainer,
            restore_state=restore_state,
        )
        if loaded_extra_state:
            extra_state.update(loaded_extra_state)
        if loaded and distributed_utils.is_master(args):
            args.path = checkpoint_path
            calculate_bleu_on_subset(
                args=args,
                task=task,
                epoch_str="initial loaded checkpoint",
                offset=None,
                dataset_split=args.valid_subset,
            )
    print(f"| extra_state: {extra_state}")
    return extra_state


def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError("Training on CPU is not supported")
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task and load dataset
    task = tasks.setup_task(args)
    task.load_dataset(
        args.train_subset,
        args.train_source_binary_path,
        args.train_target_binary_path,
        weights_file=getattr(args, "train_weights_path", None),
    )
    task.load_dataset(
        args.valid_subset,
        args.eval_source_binary_path,
        args.eval_target_binary_path,
    )

    # Build model and criterion
    model = task.build_model(args)
    print("| building criterion")
    criterion = task.build_criterion(args)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(f"| num. model params: {sum(p.numel() for p in model.parameters())}")

    # Create adversarial criterion
    adv_criterion = task.build_adversarial_criterion(args)

    # Adversary
    adversary = adversaries.build_adversary(args, model, task)

    # Print a bit of info
    print(
        f"| model {args.arch}, "
        f"adversarial criterion {adv_criterion.__class__.__name__}, "
        f"adversary {adversary.__class__.__name__}"
    )

    # Build trainer
    if args.fp16:
        print("| WARNING: 16 bit training is not supported yet.")
    trainer = adversarial_trainer.AdversarialTrainer(
        args=args,
        task=task,
        model=model,
        criterion=criterion,
        adversarial_criterion=adv_criterion,
        adversary=adversary,
    )
    print(f"| training on {args.distributed_world_size} GPUs")
    print(
        f"| max tokens per GPU = {args.max_tokens} and "
        f"max sentences per GPU = {args.max_sentences}",
        flush=True,
    )

    os.makedirs(args.save_dir, exist_ok=True)

    # If --restore-file is already present under --save-dir, use that one
    # instead of the --restore-file that may be present under
    # --restore-checkpoint-dir. The idea is that --restore-checkpoint-dir
    # allows the user to specify restoring from a different run's
    # checkpoint (possibly with different training params), while not
    # polluting the previous run's checkpoint directory with new checkpoints.
    # However, if training gets interrupted and the user restarts training,
    # we want to resume from the checkpoints under --save-dir, instead of
    # restarting again from the old run's checkpoint under
    # --restore-checkpoint-dir.
    #
    # Note that if args.restore_file is an absolute path, os.path.join() will
    # ignore previous directory args and just use the absolute path as is.
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    if os.path.exists(checkpoint_path):
        print(
            f"| Using --save-dir={args.save_dir}, --restore-file={args.restore_file}."
        )
    elif args.restore_checkpoint_dir:
        checkpoint_path = os.path.join(
            args.restore_checkpoint_dir, args.restore_file
        )
        print(
            f"| Using --restore-checkpoint-dir={args.restore_checkpoint_dir}, "
            f"--restore-file={args.restore_file}."
        )

    extra_state = default_extra_state(args)
    if not os.path.isfile(checkpoint_path) and args.multi_model_restore_files:
        print(f"| Restoring individual models from {args.multi_model_restore_files}")
        multi_model.import_individual_models(args.multi_model_restore_files, trainer)
    else:
        loaded, loaded_extra_state = load_existing_checkpoint(
            checkpoint_path=checkpoint_path,
            trainer=trainer,
            restore_state=args.restore_checkpoint_state,
        )
        if loaded_extra_state:
            extra_state.update(loaded_extra_state)
        if loaded:
            args.path = checkpoint_path
            calculate_bleu_on_subset(
                args=args,
                task=task,
                epoch_str="initial loaded checkpoint",
                offset=None,
                dataset_split=args.valid_subset,
            )
    print(f"| extra_state: {extra_state}")

    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences_valid,
        max_positions=trainer.get_model().max_positions(),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )
    epoch = extra_state["epoch"]
    if extra_state["batch_offset"] == 0:
        epoch -= 1  # this will be incremented when we call epoch_itr.next_epoch_itr()
    epoch_itr.load_state_dict(
        {"epoch": epoch, "iterations_in_epoch": extra_state["batch_offset"]}
    )

    return extra_state, trainer, task, epoch_itr


def setup_training_state(args, trainer, task, epoch_itr):
    """Set up the directory for saving checkpoints.
    Load pretrained model if specified."""
    os.makedirs(args.save_dir, exist_ok=True)

    # If --restore-file is already present under --save-dir, use that one
    # instead of --pretrained-checkpoint-file. The idea is that
    # --pretrained-checkpoint-file allows the user to specify restoring from a
    # different run's checkpoint (possibly with different training params),
    # while not polluting the previous run's checkpoint directory
    # with new checkpoints. However, if training gets interrupted
    # and the user restarts training, we want to resume from
    # the checkpoints under --save-dir, instead of
    # restarting again from the old run's checkpoint at
    # --pretrained-checkpoint-file.
    #
    # Note that if args.restore_file is an absolute path, os.path.join() will
    # ignore previous directory args and just use the absolute path as is.
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    restore_state = True
    if os.path.isfile(checkpoint_path):
        print(
            f"| Using --save-dir={args.save_dir}, --restore-file={args.restore_file}."
        )
    elif args.pretrained_checkpoint_file and os.path.isfile(
        args.pretrained_checkpoint_file
    ):
        checkpoint_path = args.pretrained_checkpoint_file
        restore_state = args.load_pretrained_checkpoint_state
        print(
            f"| Using --pretrained-checkpoint-file={args.pretrained_checkpoint_file}, "
            f"--load-pretrained-checkpoint-state={args.load_pretrained_checkpoint_state}."
        )

    extra_state = default_extra_state(args)
    if not os.path.isfile(checkpoint_path) and args.multi_model_restore_files:
        print(f"| Restoring individual models from {args.multi_model_restore_files}")
        multi_model.import_individual_models(args.multi_model_restore_files, trainer)
    else:
        loaded, loaded_extra_state = checkpoint.load_existing_checkpoint(
            checkpoint_path=checkpoint_path,
            trainer=trainer,
            restore_state=restore_state,
        )
        if loaded_extra_state:
            extra_state.update(loaded_extra_state)

    # Reset the start time for the current training run.
    extra_state["start_time"] = time.time()

    # Skips printing all training progress to prevent log spam.
    training_progress = extra_state["training_progress"]
    extra_state["training_progress"] = (
        ["...truncated...", training_progress[-1]] if len(training_progress) > 0 else []
    )
    print(f"| extra_state: {extra_state}")
    extra_state["training_progress"] = training_progress

    epoch = extra_state["epoch"]
    if extra_state["batch_offset"] == 0:
        epoch -= 1  # this will be incremented when we call epoch_itr.next_epoch_itr()
    epoch_itr.load_state_dict(
        {"epoch": epoch, "iterations_in_epoch": extra_state["batch_offset"]}
    )

    checkpoint_manager = None
    if distributed_utils.is_master(args):
        checkpoint_manager = checkpoint.CheckpointManager(
            num_avg_checkpoints=args.num_avg_checkpoints,
            auto_clear_checkpoints=args.auto_clear_checkpoints,
            log_verbose=args.log_verbose,
            checkpoint_files=extra_state["checkpoint_files"],
        )

    return extra_state, epoch_itr, checkpoint_manager


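# A standalone sketch (not part of this module) of the checkpoint-path
# precedence that the comments in setup_training_state() describe: an existing
# --restore-file under --save-dir wins, and --pretrained-checkpoint-file is
# only used as a fallback for warm-starting from a different run. The helper
# name, its signature, and the example paths below are hypothetical.
import os


def resolve_checkpoint_path(save_dir, restore_file, pretrained_checkpoint_file=None):
    """Return (checkpoint_path, restore_state) following the precedence above."""
    # os.path.join() ignores save_dir when restore_file is an absolute path.
    checkpoint_path = os.path.join(save_dir, restore_file)
    if os.path.isfile(checkpoint_path):
        # Resume the current run: also restore optimizer/extra state.
        return checkpoint_path, True
    if pretrained_checkpoint_file and os.path.isfile(pretrained_checkpoint_file):
        # Warm-start from another run's checkpoint. In the real function the
        # second value comes from --load-pretrained-checkpoint-state rather
        # than being hard-coded to False.
        return pretrained_checkpoint_file, False
    # Nothing to load yet; the caller will start training from scratch.
    return checkpoint_path, True


# Example: prefer ./checkpoints/checkpoint_last.pt if it exists, otherwise
# fall back to a previous run's checkpoint (paths here are made up).
path, restore_state = resolve_checkpoint_path(
    "./checkpoints", "checkpoint_last.pt", "/old_run/checkpoint_best.pt"
)

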
def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError("Training on CPU is not supported")
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]
    validate_and_set_default_args(args)

    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.train_source_binary_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.train_target_binary_path
        ),
        weights_file=args.train_weights_path
        if hasattr(args, "train_weights_path")
        else None,
    )
    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.eval_source_binary_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.eval_target_binary_path
        ),
        weights_file=None,
    )

    if args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    use_char_source = args.arch == "char_source"
    dataset = pytorch_translate_data.load_binarized_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
        use_char_source=use_char_source,
    )
    if args.log_verbose:
        print("Finished loading dataset", flush=True)

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    for split in splits:
        print(f"| {split} {len(dataset.splits[split])} examples")

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    print("building criterion")
    criterion = criterions.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(f"| num. model params: {sum(p.numel() for p in model.parameters())}")

    # Load pretrained model weights if applicable
    if args.pretrained_weights_file:
        utils.load_model_state(
            args.pretrained_weights_file,
            model,
            cuda_device=torch.cuda.current_device(),
        )

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print(f"| training on {args.distributed_world_size} GPUs")
    print(
        f"| max tokens per GPU = {args.max_tokens} and "
        f"max sentences per GPU = {args.max_sentences}",
        flush=True,
    )

    os.makedirs(args.save_dir, exist_ok=True)
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    if not os.path.isfile(checkpoint_path) and args.multi_model_restore_files:
        print(f"| Restoring individual models from {args.multi_model_restore_files}")
        extra_state = multi_model.import_individual_models(
            args.multi_model_restore_files, trainer
        )
    else:
        extra_state = load_existing_checkpoint(checkpoint_path, trainer)

    return extra_state, trainer, dataset


def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError("Training on CPU is not supported")
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]
    validate_and_set_default_args(args)

    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.train_source_binary_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.train_target_binary_path
        ),
        weights_file=args.train_weights_path
        if hasattr(args, "train_weights_path")
        else None,
    )
    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.eval_source_binary_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.eval_target_binary_path
        ),
        weights_file=None,
    )

    if args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    use_char_source = args.arch == "char_source"
    dataset = pytorch_translate_data.load_binarized_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
        use_char_source=use_char_source,
    )
    if args.log_verbose:
        print("Finished loading dataset", flush=True)

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    for split in splits:
        print(f"| {split} {len(dataset.splits[split])} examples")

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    print("building criterion")
    criterion = criterions.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(f"| num. model params: {sum(p.numel() for p in model.parameters())}")

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print(f"| training on {args.distributed_world_size} GPUs")
    print(
        f"| max tokens per GPU = {args.max_tokens} and "
        f"max sentences per GPU = {args.max_sentences}",
        flush=True,
    )

    os.makedirs(args.save_dir, exist_ok=True)

    # If --restore-file is already present under --save-dir, use that one
    # instead of the --restore-file that may be present under
    # --restore-checkpoint-dir. The idea is that --restore-checkpoint-dir
    # allows the user to specify restoring from a different run's
    # checkpoint (possibly with different training params), while not
    # polluting the previous run's checkpoint directory with new checkpoints.
    # However, if training gets interrupted and the user restarts training,
    # we want to resume from the checkpoints under --save-dir, instead of
    # restarting again from the old run's checkpoint under
    # --restore-checkpoint-dir.
    #
    # Note that if args.restore_file is an absolute path, os.path.join() will
    # ignore previous directory args and just use the absolute path as is.
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    if os.path.exists(checkpoint_path):
        print(
            f"Using --save-dir={args.save_dir}, "
            f"--restore-file={args.restore_file}."
        )
    elif args.restore_checkpoint_dir:
        checkpoint_path = os.path.join(
            args.restore_checkpoint_dir, args.restore_file
        )
        print(
            f"Using --restore-checkpoint-dir={args.restore_checkpoint_dir}, "
            f"--restore-file={args.restore_file}."
        )

    if not os.path.isfile(checkpoint_path) and args.multi_model_restore_files:
        print(f"| Restoring individual models from {args.multi_model_restore_files}")
        extra_state = multi_model.import_individual_models(
            args.multi_model_restore_files, trainer
        )
    else:
        extra_state = load_existing_checkpoint(
            checkpoint_path=checkpoint_path,
            trainer=trainer,
            restore_state=args.restore_checkpoint_state,
        )

    return extra_state, trainer, dataset