def main(args):
    dummy_batch_size = args.max_tokens
    if args.max_tokens is None:
        args.max_tokens = 4096
        dummy_batch_size = 1024
    # wandb.init(config=args, project="SbanBert", name="Try_1")

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(task, ['train'] + args.valid_subset.split(','))

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters())))

    # Make a dummy batch to (i) warm the caching allocator and (ii) act as a
    # placeholder for DistributedDataParallel when there's an uneven number of
    # batches per worker.
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )
    dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions)

    # Build trainer
    trainer = Trainer(args, task, model, criterion, dummy_batch)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Load the latest checkpoint if one is available
    if not load_checkpoint(args, trainer, epoch_itr):
        trainer.dummy_train_step([dummy_batch])

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
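# --- Illustrative entry point (not part of the original file) ---
# A minimal sketch of how a fairseq-style `main(args)` like the one above is
# usually driven; `options.get_training_parser` and `options.parse_args_and_arch`
# are the stock fairseq helpers and are assumed to be importable here.
#
# if __name__ == '__main__':
#     parser = options.get_training_parser()
#     args = options.parse_args_and_arch(parser)
#     main(args)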
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=0, type=int, metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
                              help='batch size')
    dataset_args.add_argument('--test-batch-size', default=32, type=int, metavar='N',
                              help='batch size for test set')
    dataset_args.add_argument('--valid-batch-size', default=32, type=int, metavar='N',
                              help='batch size for validation set')
    dataset_args.add_argument('--train-subset', default='train', metavar='SPLIT',
                              choices=['train', 'valid', 'test'],
                              help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument('--valid-subset', default='valid', metavar='SPLIT',
                              help='comma separated list of data subsets '
                                   'to use for validation (train, valid, valid1, test, test1)')
    dataset_args.add_argument('--test-subset', default='test', metavar='SPLIT',
                              help='comma separated list of data subsets '
                                   'to use for testing (train, valid, test)')
    dataset_args.add_argument('--valid-script', nargs='+', metavar='PATH',
                              help='path to external validation script (optional).')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)
    print(args)

    if args.no_progress_bar:
        progress_bar.enabled = False
        progress_bar.print_interval = args.log_interval

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Setting args.max_tokens to infinity (same as setting it to None)
    if args.max_tokens == 0:
        args.max_tokens = None

    # Load dataset
    dataset = data.load_with_check(args.data, args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst
    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in dataset.splits:
        print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()
    print('| using {} GPUs (with max tokens per GPU = {})'.format(num_gpus, args.max_tokens))

    # Build model
    print('| model {}'.format(args.arch))
    model = utils.build_model(args, dataset)
    criterion = utils.build_criterion(args, dataset)

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model)

    # Load the latest checkpoint if one is available
    epoch, batch_offset = trainer.load_checkpoint(os.path.join(args.save_dir, args.restore_file))

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, criterion, dataset, num_gpus)

        # evaluate on validation set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, criterion, dataset, subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    trainer.save_checkpoint(args, epoch, 0, val_loss,
                                            validation_script=args.valid_script)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Generate on test set and compute BLEU score
    for beam in [1, 5, 10, 20]:
        for subset in args.test_subset.split(','):
            scorer = score_test(args, trainer.get_model(), dataset, subset, beam,
                                cuda_device=(0 if num_gpus > 0 else None))
            print('| Test on {} with beam={}: {}'.format(subset, beam, scorer.result_string()))

    # Stop multiprocessing
    trainer.stop()
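# --- Illustrative entry point (not part of the original file) ---
# This legacy trainer parses its own arguments, so the usual launch is simply:
#
# if __name__ == '__main__':
#     main()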
def main(args, init_distributed=False):
    import_user_module(args)

    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Initialize distributed training (after data loading)
    if init_distributed:
        import socket
        args.distributed_rank = distributed_utils.distributed_init(args)
        print('| initialized host {} as rank {}'.format(
            socket.gethostname(), args.distributed_rank))

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Make a dummy batch to (i) warm the caching allocator and (ii) act as a
    # placeholder for DistributedDataParallel when there's an uneven number of
    # batches per worker.
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )
    dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions)
    oom_batch = task.dataset('train').get_dummy_batch(1, max_positions)

    # Build trainer
    trainer = Trainer(args, task, model, criterion, dummy_batch, oom_batch)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=args.required_batch_size_multiple,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        num_workers=args.num_workers,
    )

    # Load the latest checkpoint if one is available
    if not load_checkpoint(args, trainer, epoch_itr):
        trainer.dummy_train_step([dummy_batch])

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    task.load_dataset(args.train_subset, combine=True, epoch=0)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=True, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )

    # Initialize dataloader
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=args.required_batch_size_multiple,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        num_workers=args.num_workers,
    )

    # Load the latest checkpoint if one is available
    load_checkpoint(args, trainer, epoch_itr, max_positions, task)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        epoch_itr = reload_train(args, epoch_itr, max_positions, task)
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
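# --- Illustrative distributed launcher (not part of the original file) ---
# Sketch of the per-GPU wrapper that fairseq's train.py uses to drive
# `main(args, init_distributed=True)` on each device via
# torch.multiprocessing.spawn; the flag names follow the args consumed above.
def distributed_main(i, args):
    args.device_id = i
    if args.distributed_rank is None:  # set by torch.multiprocessing.spawn
        args.distributed_rank = i
    main(args, init_distributed=True)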
def main(args):
    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters())))

    # Make a dummy batch to (i) warm the caching allocator and (ii) act as a
    # placeholder for DistributedDataParallel when there's an uneven number of
    # batches per worker.
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )
    dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions)

    # Build trainer
    trainer = Trainer(args, task, model, criterion, dummy_batch)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    ## "Prune" heads (actually mask but shh...)
    # if len(args.transformer_mask_heads) > 0:
    #     # Determine which heads to prune
    #     to_prune = parse_head_pruning_descriptors(
    #         args.transformer_mask_heads,
    #         reverse_descriptors=args.transformer_mask_all_but_one_head,
    #         n_heads=model.encoder.layers[0].self_attn.num_heads
    #     )
    #     print(to_prune)
    #     # Apply pruning
    #     mask_heads(model, to_prune, args.transformer_mask_rescale)

    # Save initial model
    initial = os.path.join(args.save_dir, "checkpoint_initial.pt")
    trainer.save_checkpoint(initial, {})

    # Initialize dataloader
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Load the latest checkpoint if one is available
    if not load_checkpoint(args, trainer, epoch_itr):
        trainer.dummy_train_step([dummy_batch])

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # if epoch_itr.epoch % args.save_interval == 0:
        #     save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        # *********************************** Below Changed ******************************
        # save checkpoint
        # if epoch_itr.epoch % args.save_interval == 0:
        save_interval = 5  # prune and save a checkpoint every five epochs
        if epoch_itr.epoch % save_interval == 0:  # ****** changed
            # do pruning before saving
            prune2(args, task, model, trainer, epoch_itr)  # ****** changed2
            # save checkpoint
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

    # ****** changed2: do a last pruning on the last checkpoint saved
    prune2(args, task, model, trainer, epoch_itr)
    # *********************************** Above Changed ******************************
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def prune2(args, task, model, trainer, epoch_itr):  # changed2
    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    # avoid aliasing
    task = copy.deepcopy(task)
    model = copy.deepcopy(model)
    trainer = copy.deepcopy(trainer)
    epoch_itr = copy.deepcopy(epoch_itr)  # was deepcopy(trainer); copy the iterator, not the trainer

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Make a dummy batch to (i) warm the caching allocator and (ii) act as a
    # placeholder for DistributedDataParallel when there's an uneven number of
    # batches per worker.
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )
    dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions)

    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))
    print('| Optimizer {}'.format(trainer.optimizer.__class__.__name__))

    # Estimate head importance scores
    prune_meter = StopwatchMeter()
    prune_meter.start()
    head_importance, head_stats = estimate_head_importance(args, trainer, task, epoch_itr)
    prune_meter.stop()
    print('| done estimating head importance in {:.1f} seconds'.format(prune_meter.sum))
    torch.save(head_stats, f"{os.path.dirname(args.restore_file)}/heads_stats.bin")

    # Print head importances
    print("Head importances")
    print("Encoder self attention")
    for layer in range(head_importance["encoder_self"].size(0)):
        print("\t".join(f"{x:.5f}" for x in head_importance["encoder_self"][layer]))
    print("Encoder decoder attention")
    for layer in range(head_importance["encoder_decoder"].size(0)):
        print("\t".join(f"{x:.5f}" for x in head_importance["encoder_decoder"][layer]))
    print("Decoder self attention")
    for layer in range(head_importance["decoder_self"].size(0)):
        print("\t".join(f"{x:.5f}" for x in head_importance["decoder_self"][layer]))

    # Print sorted pruning profile
    encoder_self_profile = get_profile(head_importance["encoder_self"], prefix="E")
    encoder_decoder_profile = get_profile(head_importance["encoder_decoder"], prefix="A")
    decoder_self_profile = get_profile(head_importance["decoder_self"], prefix="D")

    # Join all profiles
    all_profiles = {}
    if not (args.decoder_self_only or args.encoder_decoder_only):
        all_profiles.update(encoder_self_profile)
    if not (args.encoder_self_only or args.decoder_self_only):
        all_profiles.update(encoder_decoder_profile)
    if not (args.encoder_self_only or args.encoder_decoder_only):
        all_profiles.update(decoder_self_profile)
    sorted_profiles = sorted(all_profiles.items(), key=lambda x: x[1], reverse=args.one_minus)
    print("Heads sorted by importance:")
    print(" ".join(p for p, _ in sorted_profiles))
    print("Sorted head importance scores:")
    print(" ".join(f"{v.data:.5f}" for _, v in sorted_profiles))

    tot_n_heads = len(sorted_profiles)
    for i in range(0, 10):
        n_to_prune = int(ceil(tot_n_heads * i / 10))
        to_prune_profile = [p for p, _ in sorted_profiles[:n_to_prune]]
        to_prune = parse_head_pruning_descriptors(to_prune_profile, reverse_descriptors=False)
        print(f"Evaluating following profile: \t{' '.join(to_prune_profile)}")
        # Apply pruning
        mask_heads(model, to_prune, args.transformer_mask_rescale)
        bleu = eval_bleu_score(
            model,
            task,
            task.dataset(args.valid_subset),
            beam=args.beam,
            replace_unk=args.replace_unk,
            lenpen=args.lenpen,
            buffer_size=100,
            use_cuda=torch.cuda.is_available() and not args.cpu,
            remove_bpe=args.remove_bpe,
            max_sentences=args.max_sentences,
            max_tokens=args.max_tokens,
            stop_early=not args.no_early_stop,
            normalize_scores=not args.unnormalized,
            min_len=args.min_len,
        )
        print(f"BLEU score: \t{bleu.score:.2f}")
        sys.stdout.flush()
def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    # translation using the cstm requires access to all of the datasets
    # during generation, since the test set has nearest neighbors in
    # both valid and train
    if args.task == "cstm_translation":
        task.load_dataset("train")
        task.load_dataset("valid")
        task.load_dataset("test")
    else:
        task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset, len(task.dataset(args.gen_subset))))

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = utils.load_ensemble_for_inference(
        args.path.split(':'), task, model_arg_overrides=eval(args.model_overrides),
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample, prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                # Process top predictions
                # (distinct loop variable `j`: the original reused `i`, shadowing
                # the sample index used above)
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                        print('P-{}\t{}'.format(
                            sample_id,
                            ' '.join(map(
                                lambda x: '{:.4f}'.format(x),
                                hypo['positional_scores'].tolist(),
                            ))
                        ))

                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id,
                                ' '.join(map(lambda x: str(utils.item(x)), alignment))
                            ))

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))
    return scorer
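# --- Illustrative entry point (not part of the original file) ---
# The generation `main(args)` above is normally driven by fairseq's stock
# generation parser; a minimal sketch, assuming `options` is importable:
def cli_main():
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser)
    main(args)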
def main(args):
    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(args, task, ['train', 'valid'])

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        trainer = FP16Trainer(args, task, model, criterion)
    else:
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    max_positions = trainer.get_model().max_positions()
    epoch_itr = data.EpochBatchIterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences_valid,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Load the latest checkpoint if one is available
    load_checkpoint(args, trainer, epoch_itr)

    # Send a dummy batch to warm the caching allocator
    dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions)
    trainer.dummy_train_step(dummy_batch)

    # print parameter summary
    print("-" * 80)
    print("| model parameters:")
    total_param, total_size = 0, 0
    for idx, (_name, _param) in enumerate(model.named_parameters()):
        print("| var {:3}: {:60} shape {:16} size {}".format(
            idx, _name, str(list(_param.data.shape)), _param.data.numel()))
        total_param += 1
        total_size += _param.data.numel()
    print("| total parameters: {}, total size: {}".format(total_param, total_size))
    print("-" * 80)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        print('Done %d Epochs' % epoch_itr.epoch)
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def _main(args, output_file):
    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
        stream=output_file,
    )
    logger = logging.getLogger('fairnr_cli.render')

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(os.pathsep),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()
    logging.info(model)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        seed=args.seed,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    shard_id, world_size = args.distributed_rank, args.distributed_world_size
    output_files = []
    if generator.test_poses is not None:
        total_frames = generator.test_poses.shape[0]
        _frames = int(np.floor(total_frames / world_size))
        step = shard_id * _frames
        frames = _frames if shard_id < (world_size - 1) else total_frames - step
    else:
        step = shard_id * args.render_num_frames
        frames = args.render_num_frames

    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for i, sample in enumerate(t):
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            gen_timer.start()
            step, _output_files = task.inference_step(generator, models, [sample, step, frames])
            output_files += _output_files
            gen_timer.stop(500)
            wps_meter.update(500)
            t.log({'wps': round(wps_meter.avg)})

    timestamp = generator.save_images(
        output_files, steps='shard{}'.format(shard_id),
        combine_output=args.render_combine_output)

    # join videos from all GPUs and delete temp files
    try:
        timestamps = distributed_utils.all_gather_list(timestamp)
    except Exception:
        timestamps = [timestamp]

    if shard_id == 0:
        generator.merge_videos(timestamps)
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--train-subset', default='train', metavar='SPLIT',
                              choices=['train', 'valid', 'test'],
                              help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument('--valid-subset', default='valid', metavar='SPLIT',
                              help='comma separated list of data subsets '
                                   'to use for validation (train, valid, valid1, test, test1)')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)
    print(args)

    if args.no_progress_bar:
        progress_bar.enabled = False
        progress_bar.print_interval = args.log_interval

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Load dataset
    dataset = data.load_with_check(args.data, ['train', 'valid'], args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst
    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in ['train', 'valid']:
        print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()
    print('| using {} GPUs (with max tokens per GPU = {})'.format(num_gpus, args.max_tokens))

    # Build model and criterion
    model = utils.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model, criterion)

    # Load the latest checkpoint if one is available
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch))
        if batch_offset == 0:
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, dataset, num_gpus)

        # evaluate on validation set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, dataset, subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    save_checkpoint(trainer, args, epoch, 0, val_loss)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Stop multiprocessing
    trainer.stop()
def main(args, init_distributed=False):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_init_hvd(args)

    # Print args
    print(args)

    # if not HAS_NSML:
    #     args.data[0] = args.data[0].replace("/train", "")

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    if args.train_decoder_only:
        for name, param in model.named_parameters():
            if "decoder" not in name:
                param.requires_grad_(False)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Setup session
    if HAS_WANDB and distributed_utils.is_master(args):
        wandb.init(project="cmlm", config=args)
        wandb.watch(model)

    # Load pre-trained model
    data_token = args.data[0].split("/")[-1]
    if "bert" in args.arch:
        pretrained_path = "{}/train/pretrained_models/maskPredict_{}/checkpoint_best.pt".format(
            DATASET_PATH, data_token.split(".")[-1].replace("-", "_"))
        if not HAS_NSML:
            pretrained_path = pretrained_path.replace("/train", "")
        print("| loading", pretrained_path)
        state = checkpoint_utils.load_checkpoint_to_cpu(pretrained_path)
        model.load_state_dict(state["model"], strict=True)
        baseline_model = task.build_model(args)
        baseline_model.load_state_dict(state["model"], strict=True)
        if torch.cuda.is_available():
            baseline_model.cuda()
        task.set_baseline_model(baseline_model)

    if not args.masking and HAS_NSML:
        def nsml_bind(model):
            def save(dir_path):
                state = {
                    'model': model.state_dict(),
                }
                torch.save(state, os.path.join(dir_path, 'best.pt'))

            def load(dir_path):
                state = torch.load(os.path.join(dir_path, 'best.pt'), map_location="cpu")
                model.load_state_dict(state['model'], strict=False)
                model.cuda()
                print('model loaded!')

            nsml.bind(save=save, load=load)

        nsml_bind(model)

    if args.load:
        print("loading model from session", args.load)
        if args.load.startswith("nsml://"):
            session = args.load.replace("nsml://", "")
            if ".pt" in session:
                session = session.replace(".pt", "")
                session, checkpoint_name = session.rsplit("/", 1)
            else:
                checkpoint_name = "best"
        if "-" in checkpoint_name:
            start, end = checkpoint_name.replace("epoch", "").split("-")
            checkpoints = ["epoch{}".format(i) for i in range(int(start), int(end) + 1)]
            print("| checkpoint average:", checkpoints)
            state_dict = None

            def load(dir_path):
                nonlocal state_dict, checkpoints
                state = torch.load(os.path.join(dir_path, 'best.pt'))
                model_state = state["model"]
                for k in model_state:
                    model_state[k] = model_state[k] / float(len(checkpoints))
                if state_dict is None:
                    state_dict = model_state
                else:
                    for k in state_dict:
                        state_dict[k] += model_state[k]
                print("checkpoint loaded")

            for checkpoint_name in checkpoints:
                nsml.load(checkpoint_name, load_fn=load, session=session)
            model.load_state_dict(state_dict)
        else:
            def load(dir_path):
                state = torch.load(os.path.join(dir_path, 'best.pt'))
                state_dict = state["model"]
                model.load_state_dict(state_dict)
                print("loaded")

            nsml.load(checkpoint_name, load_fn=load, session=session)

    # Prepare for decoder-wise training
    if args.decoder_wise_training:
        print("| Decoder wise training, start refinement step 0")
        progressive_training_step = 0
        assert args.ddp_backend == "c10d"
    else:
        progressive_training_step = None

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    if hasattr(args, "progressive") and args.progressive:
        for i in range(args.refinetot if not getattr(args, "pnet", False) else args.refinetot - 1):
            print("validating for refine step", i)
            validate(args, trainer, task, epoch_itr, valid_subsets, force_refine_step=i)
        print("---")
        validate(args, trainer, task, epoch_itr, valid_subsets)
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr, force_refine_step=progressive_training_step)
        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets,
                                    force_refine_step=progressive_training_step)
        else:
            valid_losses = [None]
        if args.decoder_wise_training:
            progressive_training_step = update_num_to_refine_step(trainer.get_num_updates())

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            if HAS_NSML:
                if distributed_utils.is_master(args):
                    print("nsml save for epoch", epoch_itr.epoch)
                    nsml.save("epoch{}".format(epoch_itr.epoch))
            else:
                torch.save({"model": trainer.get_model().state_dict()},
                           "/tmp/epoch{}.pt".format(epoch_itr.epoch))
                if HAS_WANDB:
                    wandb.save("/tmp/epoch{}.pt".format(epoch_itr.epoch))
            # checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        if ':' in getattr(args, 'data', ''):
            # sharded data: get train iterator for next epoch
            epoch_itr = trainer.get_train_iterator(epoch_itr.epoch)
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    torch.manual_seed(args.seed)

    # Optimize ensemble for generation
    for model in models:
        if use_cuda:
            model.cuda()
        config = utils.get_subtransformer_config(args)
        model.set_sample_config(config)
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()
        print(model, file=sys.stderr)
        print(args.path, file=sys.stderr)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    num_sentences = 0
    has_target = True
    decoder_times_all = []
    input_len_all = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos, decoder_times = task.inference_step(generator, models, sample, prefix_tokens)
            input_len_all.append(np.mean(sample['net_input']['src_lengths'].cpu().numpy()))

            print(decoder_times)
            decoder_times_all.append(decoder_times)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                        print('P-{}\t{}'.format(
                            sample_id,
                            ' '.join(map(
                                lambda x: '{:.4f}'.format(x),
                                hypo['positional_scores'].tolist(),
                            ))
                        ))

                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id,
                                ' '.join(map(lambda x: str(utils.item(x)), alignment))
                            ))

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']
def train(
    args,
    extra_state: Dict[str, Any],
    trainer,
    task,
    epoch_itr,
    output_queue: Optional[mp_queues.Queue] = None,
    **train_step_kwargs,
):
    # offset for current epoch (may be different from checkpoint offset)
    starting_offset = extra_state["batch_offset"]

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()

    train_meter = StopwatchMeter()
    train_meter.start()

    stop_training_mid_epoch = False
    stop_training_end_of_epoch = False

    do_prune = args.pruning_percentile > 0
    if do_prune:
        prune_masks = create_prune_masks(args, trainer)
        apply_prune_masks(prune_masks, trainer)

    while lr > args.min_lr and extra_state["epoch"] <= max_epoch:
        # Train the model for one epoch.
        itr, progress, extra_meters = setup_epoch(
            args=args, epoch_itr=epoch_itr, trainer=trainer
        )

        for i, samples in enumerate(progress, start=starting_offset):
            clear_per_step_extra_state(extra_state)
            extra_state["num_iterations"] = extra_state.get("num_iterations", 0) + 1
            if (
                train_step_kwargs is not None
                and "augment_adv" in train_step_kwargs.keys()
            ):
                train_step_kwargs["augment_adv"] = (
                    extra_state["num_iterations"] > args.warmup_steps
                )
            try:
                log_output = trainer.train_step(samples, **train_step_kwargs)
            # Fairseq's fp16_trainer raises this uncommon error to indicate
            # that we should stop training.
            except FloatingPointError as e:
                print(f"Stopping training due to: {e}.")
                stop_training_mid_epoch = True
                break

            if do_prune:
                apply_prune_masks(prune_masks, trainer)

            if i == starting_offset:
                # ignore the first mini-batch in words-per-second calculation
                trainer.get_meter("wps").reset()

            num_iterations = extra_state["num_iterations"]
            do_eval_tune_loss = (
                args.subepoch_validate_interval > 0
                and num_iterations % args.subepoch_validate_interval == 0
            )
            do_save = (
                not args.no_save
                and args.save_interval_updates > 0
                and num_iterations % args.save_interval_updates == 0
            )
            do_eval_bleu = (
                # We can only do BLEU eval when we have a new checkpoint to load.
                do_save
                and args.generate_bleu_eval_interval > 0
                and num_iterations - extra_state["tune_bleu"]["last_eval_step"]
                >= args.generate_bleu_eval_interval
            )
            if do_eval_bleu:
                extra_state["tune_bleu"]["last_eval_step"] = num_iterations

            extra_state["batch_offset"] = i + 1
            (extra_state, stop_training_mid_epoch, translation_samples) = save_and_eval(
                args=args,
                trainer=trainer,
                task=task,
                extra_state=extra_state,
                do_eval_tune_loss=do_eval_tune_loss,
                do_save=do_save,
                do_eval_bleu=do_eval_bleu,
            )

            # This should come after save_and_eval. Even if log_output is None,
            # meaning that there was an overflow, we should still run
            # save_and_eval to sync all_reduce and then skip the batch.
            if log_output is None:
                # This indicates that the batch was skipped, typically
                # because of OOM or FP16 overflow.
                continue

            train_stats = log_mid_epoch_stats(
                trainer=trainer,
                progress=progress,
                extra_meters=extra_meters,
                log_output=log_output,
            )

            if distributed_utils.is_master(args) and output_queue is not None:
                output_queue.put_nowait(
                    (
                        trainer.get_num_updates(),
                        {
                            "train_ppl": train_stats["ppl"],
                            "tune_ppl": extra_state["tune_eval"]["perplexity"],
                            "tune_bleu": extra_state["tune_bleu"]["current"],
                            "translation_samples": translation_samples,
                        },
                    )
                )

            if (
                do_eval_bleu
                and args.shrink_lr_no_best_bleu_eval > 0
                and extra_state["tune_bleu"]["num_since_best"]
                > args.shrink_lr_no_best_bleu_eval
            ):
                current_lr = trainer.optimizer.get_lr()
                trainer.optimizer.set_lr(current_lr * args.lr_shrink)
                lr = trainer.optimizer.get_lr()
                print(f"Decayed lr from {current_lr} to {lr}.")

            stop_training_mid_epoch = (
                stop_training_mid_epoch
                or is_training_over_time_limit(extra_state["start_time"], args.stop_time_hr)
            )
            if stop_training_mid_epoch:
                break

        # log end-of-epoch stats
        train_stats = log_end_epoch_stats(
            trainer=trainer, progress=progress, extra_meters=extra_meters
        )

        # Run a final save/eval step if not stopping mid-epoch.
        if not stop_training_mid_epoch:
            # batch_offset being None denotes the end of an epoch.
            extra_state["batch_offset"] = None
            (extra_state, stop_training_end_of_epoch, translation_samples) = save_and_eval(
                args=args,
                trainer=trainer,
                task=task,
                extra_state=extra_state,
                do_eval_tune_loss=True,
                do_save=not args.no_save and not args.no_end_of_epoch_checkpoints,
                do_eval_bleu=args.generate_bleu_eval_per_epoch,
            )
            if distributed_utils.is_master(args) and output_queue is not None:
                output_queue.put_nowait(
                    (
                        trainer.get_num_updates(),
                        {
                            "train_ppl": train_stats["ppl"],
                            "tune_ppl": extra_state["tune_eval"]["perplexity"],
                            "tune_bleu": extra_state["tune_bleu"]["current"],
                            "translation_samples": translation_samples,
                        },
                    )
                )

        if stop_training_mid_epoch or stop_training_end_of_epoch:
            break

        lr = trainer.lr_step(extra_state["epoch"], extra_state["tune_eval"]["loss"])
        extra_state["epoch"] += 1
        extra_state["batch_offset"] = 0
        starting_offset = 0

    train_meter.stop()
    print(f"| done training in {train_meter.sum:.1f} seconds")
    print(
        f"| Best BLEU score of {extra_state['tune_bleu']['best']} was from "
        f"epoch {extra_state['tune_bleu']['best_epoch']}"
    )
def main(parsed_args):
    assert parsed_args.path is not None, '--path required for evaluation!'

    utils.import_user_module(parsed_args)

    logger.info(parsed_args)

    use_cuda = torch.cuda.is_available() and not parsed_args.cpu

    task = tasks.setup_task(parsed_args)

    # Load ensemble
    logger.info('loading model(s) from {}'.format(parsed_args.path))
    models, args = checkpoint_utils.load_model_ensemble(
        parsed_args.path.split(os.pathsep),
        arg_overrides=eval(parsed_args.model_overrides),
        task=task,
    )

    for arg in vars(parsed_args).keys():
        if arg not in {
            'self_target', 'future_target', 'past_target',
            'tokens_per_sample', 'output_size_dictionary', 'add_bos_token',
        }:
            setattr(args, arg, getattr(parsed_args, arg))

    # reduce tokens per sample by the required context window size
    args.tokens_per_sample -= args.context_window
    task = tasks.setup_task(args)

    # Load dataset splits
    task.load_dataset(args.gen_subset)
    dataset = task.dataset(args.gen_subset)
    if args.context_window > 0:
        dataset = LMContextWindowDataset(
            dataset=dataset,
            tokens_per_sample=args.tokens_per_sample,
            context_window=args.context_window,
            pad_idx=task.source_dictionary.pad(),
        )
    logger.info('{} {} {} examples'.format(args.data, args.gen_subset, len(dataset)))

    # Optimize ensemble for generation and set the source and dest dicts
    # on the model (required by scorer)
    for model in models:
        model.make_generation_fast_()
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    assert len(models) > 0

    logger.info('num. model params: {}'.format(sum(p.numel() for p in models[0].parameters())))

    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens or 36000,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=True,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    gen_timer = StopwatchMeter()
    scorer = SequenceScorer(task.target_dictionary, args.softmax_batch, args=args)

    score_sum = 0.
    count = 0

    if args.remove_bpe is not None:
        if args.remove_bpe == 'sentencepiece':
            raise NotImplementedError
        else:
            bpe_cont = args.remove_bpe.rstrip()
            bpe_toks = {
                i for i in range(len(task.source_dictionary))
                if task.source_dictionary[i].endswith(bpe_cont)
            }
            bpe_len = len(bpe_cont)
    else:
        bpe_toks = None
        bpe_len = 0

    word_stats = dict()

    if args.knnlm and args.save_knnlm_dstore:
        raise ValueError("Cannot use knnlm while trying to build the datastore!")

    if args.knnlm:
        knn_dstore = KNN_Dstore(args)

    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()

        if args.save_knnlm_dstore:
            print('keytype being saved:', args.knn_keytype)
            if args.dstore_fp16:
                print('Saving fp16')
                dstore_keys = np.memmap(args.dstore_mmap + '_keys.npy', dtype=np.float16,
                                        mode='w+', shape=(args.dstore_size, args.decoder_embed_dim))
                dstore_vals = np.memmap(args.dstore_mmap + '_vals.npy', dtype=np.int16,
                                        mode='w+', shape=(args.dstore_size, 1))
            else:
                print('Saving fp32')
                dstore_keys = np.memmap(args.dstore_mmap + '_keys.npy', dtype=np.float32,
                                        mode='w+', shape=(args.dstore_size, args.decoder_embed_dim))
                dstore_vals = np.memmap(args.dstore_mmap + '_vals.npy', dtype=np.int,
                                        mode='w+', shape=(args.dstore_size, 1))
        dstore_idx = 0

        for ex_i, sample in enumerate(t):
            if 'net_input' not in sample:
                continue

            sample = utils.move_to_cuda(sample) if use_cuda else sample

            gen_timer.start()
            if args.knnlm:
                hypos = scorer.generate(models, sample, knn_dstore=knn_dstore)
            else:
                hypos = scorer.generate(models, sample)
            gen_timer.stop(sample['ntokens'])

            for i, hypos_i in enumerate(hypos):
                hypo = hypos_i[0]
                if args.save_knnlm_dstore:
                    shape = hypo['dstore_keys'].shape
                    if shape[0] == args.tokens_per_sample:
                        if dstore_idx + shape[0] > args.dstore_size:
                            shape = [args.dstore_size - dstore_idx]
                            hypo['dstore_keys'] = hypo['dstore_keys'][:shape[0]]
                        if args.dstore_fp16:
                            dstore_keys[dstore_idx:shape[0] + dstore_idx] = hypo['dstore_keys'].view(
                                -1, args.decoder_embed_dim).cpu().numpy().astype(np.float16)
                            dstore_vals[dstore_idx:shape[0] + dstore_idx] = hypo['tokens'].view(
                                -1, 1).cpu().numpy().astype(np.int16)
                        else:
                            dstore_keys[dstore_idx:shape[0] + dstore_idx] = hypo['dstore_keys'].view(
                                -1, args.decoder_embed_dim).cpu().numpy().astype(np.float32)
                            dstore_vals[dstore_idx:shape[0] + dstore_idx] = hypo['tokens'].view(
                                -1, 1).cpu().numpy().astype(np.int)
                        dstore_idx += shape[0]
                    else:
                        print('Skipping this one with shape', shape)

                sample_id = sample['id'][i]

                tokens = hypo['tokens']
                tgt_len = tokens.numel()
                pos_scores = hypo['positional_scores'].float()

                if args.add_bos_token:
                    assert hypo['tokens'][0].item() == task.target_dictionary.bos()
                    tokens = tokens[1:]
                    pos_scores = pos_scores[1:]

                skipped_toks = 0
                if bpe_toks is not None:
                    for i in range(tgt_len - 1):
                        if tokens[i].item() in bpe_toks:
                            skipped_toks += 1
                            pos_scores[i + 1] += pos_scores[i]
                            pos_scores[i] = 0

                # inf_scores = pos_scores.eq(float('inf')) | pos_scores.eq(float('-inf'))
                # if inf_scores.any():
                #     logger.info(
                #         'skipping tokens with inf scores:',
                #         task.target_dictionary.string(tokens[inf_scores.nonzero()])
                #     )
                #     pos_scores = pos_scores[(~inf_scores).nonzero()]
                score_sum += pos_scores.sum().cpu()
                count += pos_scores.numel() - skipped_toks

                if args.output_word_probs or args.output_word_stats:
                    w = ''
                    word_prob = []
                    is_bpe = False
                    for i in range(len(tokens)):
                        w_ind = tokens[i].item()
                        w += task.source_dictionary[w_ind]
                        if bpe_toks is not None and w_ind in bpe_toks:
                            w = w[:-bpe_len]
                            is_bpe = True
                        else:
                            word_prob.append((w, pos_scores[i].item()))

                            next_prob = None
                            ind = i + 1
                            while ind < len(tokens):
                                if pos_scores[ind].item() != 0:
                                    next_prob = pos_scores[ind]
                                    break
                                ind += 1

                            word_stats.setdefault(w, WordStat(w, is_bpe)).add(
                                pos_scores[i].item(), next_prob)
                            is_bpe = False
                            w = ''
                    if args.output_word_probs:
                        logger.info(
                            str(int(sample_id)) + " "
                            + ('\t'.join('{} [{:2f}]'.format(x[0], x[1]) for x in word_prob)))

            wps_meter.update(sample['ntokens'])
            t.log({'wps': round(wps_meter.avg)})

    if args.save_knnlm_dstore:
        print("dstore_idx", dstore_idx, "final shape", shape)
        print("Keys", dstore_keys.shape, dstore_keys.dtype)
        print("Vals", dstore_vals.shape, dstore_vals.dtype)

    avg_nll_loss = -score_sum / count / math.log(2)  # convert to base 2
    logger.info('Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)'.format(
        gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
    logger.info('Loss (base 2): {:.4f}, Perplexity: {:.2f}'.format(
        avg_nll_loss, 2**avg_nll_loss))

    if args.output_word_stats:
        for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True):
            logger.info(ws)
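# --- Illustrative entry point (not part of the original file) ---
# Minimal sketch of driving the LM-evaluation `main(parsed_args)` above;
# `options.get_eval_lm_parser` is fairseq's stock parser for eval_lm.
def cli_main():
    parser = options.get_eval_lm_parser()
    parsed_args = options.parse_args_and_arch(parser)
    main(parsed_args)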
def main_tpu(args):

    def log_step(step_type, device, step, tracker=None, metrics_debug=False):
        msg = '{}/ {}, device {}, step {}'.format(step_type, utils.now(),
                                                  device, step)
        if tracker:
            rates = tracker.rate(), tracker.global_rate()
            msg += ', Rate={:.2f}, Global Rate={:.2f}'.format(*rates)
        return msg

    def train_loop_fn(model, loader, device, context):
        trainer = trainers[str(device)]
        stats = None
        tracker = xm.RateTracker()
        for i, samples in loader:
            if i and not (i % args.log_steps):
                print(
                    log_step('training',
                             device,
                             i,
                             tracker=tracker,
                             metrics_debug=args.metrics_debug))
            _log_output = trainer.train_step(samples)
            xm.optimizer_step(trainer.optimizer)
            tracker.add(len(samples) * args.max_sentences)  # n_batches * batch_size
        stats = fairseq_train.get_training_stats(trainer)
        return tracker, stats

    def valid_loop_fn(model, loader, device, context):
        trainer = trainers[str(device)]
        # reset validation loss meters
        for k in ['valid_loss', 'valid_nll_loss']:
            meter = trainer.get_meter(k)
            if meter is not None:
                meter.reset()
        extra_meters = collections.defaultdict(lambda: AverageMeter())
        for i, sample in loader:
            if not (i % args.log_steps):
                print(
                    log_step('validation',
                             device,
                             i,
                             tracker=None,
                             metrics_debug=args.metrics_debug))
            log_output = trainer.valid_step(sample)
            for k, v in log_output.items():
                if k in ['loss', 'nll_loss', 'ntokens', 'nsentences', 'sample_size']:
                    continue
                extra_meters[k].update(v)
        stats = fairseq_train.get_valid_stats(trainer)
        for k, meter in extra_meters.items():
            stats[k] = meter.avg
        return stats

    def validate_subset(args, trainers, task, epoch_itr, subset):
        print('Validating the subset "{}"'.format(subset))
        # Initialize data iterator
        itr = task.get_batch_iterator(
            dataset=task.dataset(subset),
            max_tokens=args.max_tokens,
            max_sentences=args.max_sentences_valid,
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                list(trainers.values())[0].get_model().max_positions(),
            ),
            ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=args.required_batch_size_multiple,
            seed=args.seed,
            num_workers=args.num_workers).next_epoch_itr(shuffle=False)
        progress = progress_bar.build_progress_bar(
            args, itr, epoch_itr.epoch,
            prefix='valid on \'{}\' subset'.format(subset),
            no_progress_bar='simple')
        stats_per_device = model_parallel(valid_loop_fn, progress)
        valid_losses = [stats['loss'].avg for stats in stats_per_device]
        print('validation stats on subset "{}" - {}'.format(subset, utils.now()))
        # the update count is shared across devices; read it from the first trainer
        num_updates = list(trainers.values())[0].get_num_updates()
        for stats in stats_per_device:
            progress.print(stats, tag=subset, step=num_updates)
        return valid_losses

    def validate(args, trainers, task, epoch_itr, subsets):
        valid_losses = {
            subset: validate_subset(args, trainers, task, epoch_itr, subset)
            for subset in subsets
        }
        return valid_losses

    def initialize_loader_for_epoch(args, epoch_itr):
        if epoch_itr.epoch <= len(args.update_freq):
            update_freq = args.update_freq[epoch_itr.epoch - 1]
        else:
            update_freq = args.update_freq[-1]
        # Initialize data iterator
        itr = epoch_itr.next_epoch_itr(
            fix_batches_to_gpus=False,
            shuffle=(epoch_itr.epoch >= args.curriculum))
        itr = iterators.GroupedIterator(itr, update_freq)
        progress = progress_bar.build_progress_bar(
            args, itr, epoch_itr.epoch, no_progress_bar='simple')
        return progress

    def keep_training(lr, epoch_itr, trainers):
        # Train until the learning rate gets too small
        max_epoch = args.max_epoch or math.inf
        max_update = args.max_update or math.inf
        lr = min(trainer.get_lr() for trainer in trainers.values())
        n_updates = max(trainer.get_num_updates()
                        for trainer in trainers.values())
        return ((lr > args.min_lr) and (epoch_itr.epoch < max_epoch)
                and (n_updates < max_update))

    xu.eprint('Args')
    for key, val in args.__dict__.items():
        xu.eprint('\t{} {}'.format(key, val))
    xu.eprint('---------')

    devices = xm.get_xla_supported_devices(max_devices=args.num_cores)
    task, trainers, model_parallel, epoch_itr, lr, valid_subsets = prepare_task(
        args, devices)

    train_meter = StopwatchMeter()
    train_meter.start()
    vloss = None  # stays None while validation is disabled
    while keep_training(lr, epoch_itr, trainers):
        # TRAINING
        print('Epoch {} begin {}'.format(epoch_itr.epoch + 1, utils.now()))
        progress = initialize_loader_for_epoch(args, epoch_itr)
        out = model_parallel(train_loop_fn, progress)
        trackers, stats_ = zip(*out)
        print('Epoch {} Training stats:'.format(epoch_itr.epoch))
        for device, trainer in trainers.items():
            stats = fairseq_train.get_training_stats(trainer)
            print('device {}'.format(device))
            progress.print(stats, tag=device)
        print('Epoch {} Tracker Rates:'.format(epoch_itr.epoch))
        for tracker in trackers:
            rates = tracker.rate(), tracker.global_rate()
            print('\tRate={:.2f}, Global Rate={:.2f}'.format(*rates))
        print('Epoch {} end {}'.format(epoch_itr.epoch, utils.now()))
        if args.metrics_debug:
            print(torch_xla._XLAC._xla_metrics_report())

        # VALIDATION
        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainers, task, epoch_itr,
                                    valid_subsets)

            # only use average first validation loss from the first device
            # to update the learning rate
            vloss = valid_losses[valid_subsets[0]][0]
            print('old learning rate: {}'.format(lr))
            lr = trainers[devices[0]].lr_step(epoch_itr.epoch, vloss)
            print('new learning rate: {}'.format(lr))

        # save checkpoint from the first device's trainer, the same one whose
        # lr schedule is stepped above
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainers[devices[0]],
                                             epoch_itr, vloss)
        if args.metrics_debug:
            print(torch_xla._XLAC._xla_metrics_report())

    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
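
# The stopping rule above folds three limits into a single predicate. A
# minimal, dependency-free restatement (hypothetical names, not part of the
# original code) makes the semantics easy to unit-test in isolation:
import math

def should_keep_training(lr, epoch, n_updates,
                         min_lr=0.0, max_epoch=None, max_update=None):
    """True while the lr is above its floor and neither budget is exhausted."""
    max_epoch = max_epoch or math.inf
    max_update = max_update or math.inf
    return lr > min_lr and epoch < max_epoch and n_updates < max_update

assert should_keep_training(lr=1e-3, epoch=3, n_updates=100, max_epoch=10)
assert not should_keep_training(lr=0.0, epoch=3, n_updates=100)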
def main(args): if args.max_tokens is None: args.max_tokens = 6000 print(args) if not torch.cuda.is_available(): raise NotImplementedError('Training on CPU is not supported') torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits load_dataset_splits(task, ['train', 'valid']) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters()))) # Make a dummy batch to (i) warm the caching allocator and (ii) as a # placeholder DistributedDataParallel when there's an uneven number of # batches per worker. max_positions = utils.resolve_max_positions( task.max_positions(), model.max_positions(), ) dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions) # Build trainer trainer = Trainer(args, task, model, criterion, dummy_batch) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Initialize dataloader epoch_itr = task.get_batch_iterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, ) # Load the latest checkpoint if one is available if not load_checkpoint(args, trainer, epoch_itr): trainer.dummy_train_step([dummy_batch]) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_subsets = args.valid_subset.split(',') while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update: # train for one epoch train(args, trainer, task, epoch_itr) if epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
def train(args, extra_state, trainer, task, epoch_itr): # offset for current epoch (may be different from checkpoint offset) starting_offset = extra_state["batch_offset"] # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() stop_training_mid_epoch = False stop_training_end_of_epoch = False do_prune = args.pruning_percentile > 0 if do_prune: prune_masks = create_prune_masks(args, trainer) apply_prune_masks(prune_masks, trainer) while lr > args.min_lr and extra_state["epoch"] <= max_epoch: """Train the model for one epoch.""" itr, progress, extra_meters = setup_epoch(args=args, epoch_itr=epoch_itr, trainer=trainer) for i, sample in enumerate(progress, start=starting_offset): log_output = trainer.train_step( sample, augment_adv=extra_state["epoch"] > args.warmup_epochs) if do_prune: apply_prune_masks(prune_masks, trainer) train_stats = log_mid_epoch_stats( trainer=trainer, progress=progress, extra_meters=extra_meters, log_output=log_output, ) if i == starting_offset: # ignore the first mini-batch in words-per-second calculation trainer.get_meter("wps").reset() num_updates = trainer.get_num_updates() do_validate = (args.subepoch_validate_interval > 0 and num_updates % args.subepoch_validate_interval == 0) do_save = (not args.no_save and args.save_interval_updates > 0 and num_updates % args.save_interval_updates == 0) do_eval_bleu = ( # We can only do BLEU eval when we have a new checkpoint to load. do_save and args.generate_bleu_eval_interval > 0 and num_updates - extra_state["last_bleu_eval"] >= args.generate_bleu_eval_interval) if do_eval_bleu: extra_state["last_bleu_eval"] = num_updates extra_state["batch_offset"] = i + 1 ( _, val_ppl, val_bleu, stop_training_mid_epoch, translation_samples, lr, ) = validate_save_and_evaluate_bleu( args=args, trainer=trainer, task=task, extra_state=extra_state, do_validate=do_validate, do_save=do_save, do_eval_bleu=do_eval_bleu, ) yield ( trainer.get_num_updates(), { "train_ppl": train_stats["ppl"], "tune_ppl": val_ppl, "tune_bleu": val_bleu, "translation_samples": translation_samples, }, ) stop_training_mid_epoch = (stop_training_mid_epoch or is_training_over_time_limit( extra_state["start_time"], args.stop_time_hr)) if stop_training_mid_epoch: break # log end-of-epoch stats train_stats = log_end_epoch_stats(trainer=trainer, progress=progress, extra_meters=extra_meters) # Run a training step if not stopping mid-epoch. if not stop_training_mid_epoch: # batch_offset being None denotes the end of an epoch. extra_state["batch_offset"] = None ( val_loss, val_ppl, val_bleu, stop_training_end_of_epoch, translation_samples, lr, ) = validate_save_and_evaluate_bleu( args=args, trainer=trainer, task=task, extra_state=extra_state, do_validate=True, do_save=not args.no_save and not args.no_end_of_epoch_checkpoints, do_eval_bleu=args.generate_bleu_eval_per_epoch, ) extra_state["val_loss"] = val_loss yield ( trainer.get_num_updates(), { "train_ppl": train_stats["ppl"], "tune_ppl": val_ppl, "tune_bleu": val_bleu, "translation_samples": translation_samples, }, ) if stop_training_mid_epoch or stop_training_end_of_epoch: break lr = trainer.lr_step(extra_state["epoch"], val_loss) extra_state["epoch"] += 1 extra_state["batch_offset"] = 0 starting_offset = 0 train_meter.stop() print(f"| done training in {train_meter.sum:.1f} seconds") print( f"| Best BLEU score of {extra_state['evaluate_bleu']['best']} was from " f"epoch {extra_state['evaluate_bleu']['best_epoch']}")
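
# train() above is a generator: it yields (num_updates, metrics) at every
# validation/save point rather than returning once at the end. A minimal,
# hypothetical driver loop (the print sink is an assumption; a real caller
# might forward these dicts to a metrics logger):
def drive_training(args, extra_state, trainer, task, epoch_itr):
    for num_updates, metrics in train(args, extra_state, trainer, task, epoch_itr):
        # metrics holds train_ppl / tune_ppl / tune_bleu / translation_samples;
        # values can be None when the corresponding eval did not run this step.
        printable = {k: v for k, v in metrics.items()
                     if v is not None and k != 'translation_samples'}
        print(f"update {num_updates}: {printable}")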
def main(args, init_distributed=False): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) # Print args print(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=False, epoch=0) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print(model) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Build trainer trainer = Trainer(args, task, model, criterion) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr, filtered_maxpos_indices = checkpoint_utils.load_checkpoint( args, trainer) # pretrain data actor # only the language actor model can be pretrained if args.pretrain_laser and args.pretrain_data_actor and args.data_actor == 'ave': # pretrain the agent with LASER score # epoch_itr, indices = trainer.get_train_iterator(1) path = '/home/wtan12/multiDDS/' trainer.pretrain_LASER('en-ps.laser-score', epoch_itr) if args.compare_laser: epoch_itr, indices = trainer.get_train_iterator(1) print('Number of Indices: ', len(indices)) scores = collections.defaultdict(float) # compare with laser label using R^2 Score, only used after model is trained # itr = epoch_itr.next_epoch_itr(fix_batches_to_gpus=False, shuffle=False) data_actor = trainer.data_actor itr = epoch_itr.next_epoch_itr( fix_batches_to_gpus=args.fix_batches_to_gpus, shuffle=False, offset=0, datasize=-1, ) for i, sample in enumerate(itr): sample = trainer._prepare_sample(sample) sample = list(sample.values())[0] score = data_actor(sample).cpu().detach().numpy().tolist() indices = sample['id'].data.cpu().numpy().ravel().tolist() for k, v in zip(indices, score): scores[k] = float(v[0]) scores = sorted(scores.items(), key=lambda x: x[0]) print('Number of Indices in Scoring file: ', len(scores)) path = '/home/wtan12/multiDDS/' with open(path + 'en-ps.laser-score', 'r') as r: data = r.read() laser_score = [] for i, item in enumerate(data.split('\n')): laser_score.append(item) laser_score.pop() r2 = 0.0 with open(path + 'en-ps.dds_score', 'w') as f: for k, v in scores: f.write(str(v) + '\n') truth = float(laser_score[k]) r2 += (truth - v)**2 print('R2 Score compared to LASER file: ', r2) return # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_subsets = args.valid_subset.split(',') if args.eval_bleu: generator = 
task.build_generator(args) args.maximize_best_checkpoint_metric = True else: generator = None while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates( ) < max_update: # train for one epoch epoch_itr = train(args, trainer, task, epoch_itr, generator, filtered_maxpos_indices) if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets, generator) else: valid_losses = [None] # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) if ':' in getattr(args, 'data', ''): # sharded data: get train iterator for next epoch epoch_itr = trainer.get_train_iterator(epoch_itr.epoch)[0] train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
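
# The --compare-laser branch above accumulates sum((truth - pred)^2), i.e. a
# residual sum of squares (SSE), even though it is printed as "R2 Score". A
# minimal sketch of the usual normalized R^2 = 1 - SS_res / SS_tot, assuming
# two equal-length score lists (hypothetical helper, not part of this code):
import numpy as np

def r_squared(truth, pred):
    truth = np.asarray(truth, dtype=np.float64)
    pred = np.asarray(pred, dtype=np.float64)
    ss_res = np.sum((truth - pred) ** 2)
    ss_tot = np.sum((truth - truth.mean()) ** 2)
    return 1.0 - ss_res / ss_tot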
def main(args, init_distributed=False):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    logger.info(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info('model {}, criterion {}'.format(args.arch,
                                                criterion.__class__.__name__))
    logger.info('num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    logger.info('training on {} GPUs'.format(args.distributed_world_size))
    logger.info(
        'max tokens per GPU = {} and max sentences per GPU = {}'.format(
            args.max_tokens,
            args.max_sentences,
        ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')
    tokenize = sacrebleu.DEFAULT_TOKENIZER if not args.eval_tokenized_bleu else 'none'
    hyps, refs = validate(args, trainer, task, epoch_itr, valid_subsets)
    for h, r, split in zip(hyps, refs, args.valid_subset.split(',')):
        assert len(h) == len(r)
        sacrebleu_score = sacrebleu.corpus_bleu(h, [r], tokenize=tokenize)
        bleu = compute_cvpr_bleu(h, r)
        rouge_score = rouge.rouge(h, r)
        print('{} set has {} samples,\n'
              'sacrebleu: {},\n'
              'CVPR BLEU scripts: {}\n'
              'CVPR ROUGE: {}'.format(split, len(h), sacrebleu_score, bleu,
                                      rouge_score))
        print('performance: {:.2f} {}'.format(
            rouge_score['rouge_l/f_score'] * 100,
            ' '.join([str(b) for b in bleu])))
def main(args):
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)

    mlperf_compliance.mlperf_log.LOGGER.propagate = False

    # framework = f'Pytorch NGC {os.environ["NVIDIA_PYTORCH_VERSION"]}'
    # mlperf_submission_log(
    #     benchmark=mlperf_compliance.constants.TRANSFORMER,
    #     framework=framework)

    mlperf_compliance.mlperf_log.setdefault(
        root_dir=os.path.dirname(os.path.abspath(__file__)),
        benchmark=mlperf_compliance.constants.TRANSFORMER,
        stack_offset=1,
        extra_print=False)

    mlperf_print(key=mlperf_compliance.constants.INIT_START,
                 log_all_ranks=True)

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # preinit and warmup streams/groups for allreduce communicators
    allreduce_communicators = None
    if args.distributed_world_size > 1 and args.enable_parallel_backward_allred_opt:
        allreduce_groups = [
            torch.distributed.new_group()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        allreduce_streams = [
            torch.cuda.Stream()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        for group, stream in zip(allreduce_groups, allreduce_streams):
            with torch.cuda.stream(stream):
                torch.distributed.all_reduce(torch.cuda.FloatTensor(1),
                                             group=group)
        allreduce_communicators = (allreduce_groups, allreduce_streams)

    if args.max_tokens is None:
        args.max_tokens = 6000

    print(args)

    mlperf_print(key=mlperf_compliance.constants.GLOBAL_BATCH_SIZE,
                 value=args.max_tokens * args.distributed_world_size)
    mlperf_print(key=mlperf_compliance.constants.OPT_NAME,
                 value=args.optimizer)
    assert (len(args.lr) == 1)
    mlperf_print(key=mlperf_compliance.constants.OPT_BASE_LR,
                 value=args.lr[0] if len(args.lr) == 1 else args.lr)
    mlperf_print(key=mlperf_compliance.constants.OPT_LR_WARMUP_STEPS,
                 value=args.warmup_updates)
    assert (args.max_source_positions == args.max_target_positions)
    mlperf_print(key=mlperf_compliance.constants.MAX_SEQUENCE_LENGTH,
                 value=args.max_target_positions)
    mlperf_print(key=mlperf_compliance.constants.OPT_ADAM_BETA_1,
                 value=eval(args.adam_betas)[0])
    mlperf_print(key=mlperf_compliance.constants.OPT_ADAM_BETA_2,
                 value=eval(args.adam_betas)[1])
    mlperf_print(key=mlperf_compliance.constants.OPT_ADAM_EPSILON,
                 value=args.adam_eps)

    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    result = torch.cuda.cudart().cudaDeviceSetLimit(ctypes.c_int(0x05),
                                                    ctypes.c_int(128))
    result = torch.cuda.cudart().cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
    # torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        trainer = FP16Trainer(args, task, model, criterion,
                              allreduce_communicators=allreduce_communicators)
    else:
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        trainer = Trainer(args, task, model, criterion,
                          allreduce_communicators=None)

    #if (args.online_eval or args.target_bleu) and not args.remove_bpe:
    #    args.remove_bpe='@@ '

    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    max_positions = trainer.get_model().max_positions()

    # Send a dummy batch to warm the caching allocator
    dummy_batch = language_pair_dataset.get_dummy_batch_isolated(
        args.max_tokens, max_positions, 8)
    trainer.dummy_train_step(dummy_batch)

    # Train until the learning rate gets too small or model reaches target score
    max_epoch = args.max_epoch if args.max_epoch >= 0 else math.inf
    max_update = args.max_update or math.inf
    tgt_bleu = args.target_bleu or math.inf
    current_bleu = 0.0
    lr = trainer.get_lr()

    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')

    # mlperf compliance synchronization
    if args.distributed_world_size > 1:
        assert (torch.distributed.is_initialized())
        torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
        torch.cuda.synchronize()

    mlperf_print(key=mlperf_compliance.constants.INIT_STOP, sync=True)
    mlperf_print(key=mlperf_compliance.constants.RUN_START, sync=True)
    # second sync after RUN_START tag is printed.
    # this ensures no rank touches data until after RUN_START tag is printed.
    barrier()

    # Load dataset splits
    load_dataset_splits(task, ['train', 'test'])

    ctr = 0

    class DummyEpochBatchIterator:
        def __init__(self, epoch=0):
            self.epoch = epoch

    epoch_itr = DummyEpochBatchIterator(0)

    # Main training loop
    while lr >= args.min_lr and epoch_itr.epoch < max_epoch and \
            trainer.get_num_updates() < max_update and current_bleu < tgt_bleu:
        first_epoch = epoch_itr.epoch + 1
        mlperf_print(key=mlperf_compliance.constants.BLOCK_START,
                     metadata={
                         'first_epoch_num': first_epoch,
                         'epoch_count': 1
                     },
                     sync=True)
        mlperf_print(key=mlperf_compliance.constants.EPOCH_START,
                     metadata={'epoch_num': first_epoch},
                     sync=True)
        start = time.time()

        gc.disable()

        epoch_itr = data.EpochBatchIterator(
            dataset=task.dataset(args.train_subset),
            dataloader_num_workers=args.dataloader_num_workers,
            dataloader_pin_memory=args.enable_dataloader_pin_memory,
            max_tokens=args.max_tokens,
            max_sentences=args.max_sentences_valid,
            max_positions=max_positions,
            ignore_invalid_inputs=True,
            required_batch_size_multiple=8,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            epoch=epoch_itr.epoch if ctr != 0 else 0,  # value comparison; identity checks against int literals are unreliable
            bucket_growth_factor=args.bucket_growth_factor,
            seq_len_multiple=args.seq_len_multiple,
            batching_scheme=args.batching_scheme,
            batch_multiple_strategy=args.batch_multiple_strategy,
        )
        print("got epoch iterator", time.time() - start)

        # Load the latest checkpoint if one is available
        if ctr == 0:
            load_checkpoint(args, trainer, epoch_itr)

        # train for one epoch
        start = time.time()
        #exit(1)
        train(args, trainer, task, epoch_itr)
        print("epoch time ", time.time() - start)

        start = time.time()
        mlperf_print(key=mlperf_compliance.constants.EPOCH_STOP,
                     metadata={'epoch_num': first_epoch},
                     sync=True)

        #if epoch_itr.epoch % args.validate_interval == 0:
        #    valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)

        # Eval BLEU score
        if args.online_eval or (tgt_bleu is not math.inf):
            current_bleu = score(args, trainer, task, epoch_itr,
                                 args.gen_subset)
            mlperf_print(key=mlperf_compliance.tags.EVAL_ACCURACY,
                         value=str(current_bleu),
                         metadata={'epoch_num': first_epoch})

        gc.enable()

        # Only use first validation loss to update the learning rate
        #lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Save checkpoint
        #if epoch_itr.epoch % args.save_interval == 0:
        #    save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        ctr = ctr + 1
        print("validation and scoring ", time.time() - start)
        mlperf_print(key=mlperf_compliance.constants.BLOCK_STOP,
                     metadata={'first_epoch_num': first_epoch},
                     sync=True)

    train_meter.stop()
    status = 'success' if current_bleu >= tgt_bleu else 'aborted'
    mlperf_print(key=mlperf_compliance.constants.RUN_STOP,
                 metadata={'status': status})
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
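
# The START/STOP bracketing above is easy to leave unbalanced when a loop
# exits early. A hypothetical helper (not part of this code base) that takes
# the logging function as a parameter and guarantees the STOP tag is emitted:
from contextlib import contextmanager

@contextmanager
def compliance_block(log_fn, start_key, stop_key, metadata):
    log_fn(key=start_key, metadata=metadata, sync=True)
    try:
        yield
    finally:
        log_fn(key=stop_key, metadata=metadata, sync=True)

# Possible usage, sketched with the names from the loop above:
# with compliance_block(mlperf_print,
#                       mlperf_compliance.constants.EPOCH_START,
#                       mlperf_compliance.constants.EPOCH_STOP,
#                       metadata={'epoch_num': first_epoch}):
#     train(args, trainer, task, epoch_itr)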
def main(args, init_distributed=False): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) # Print args print(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=False, epoch=0) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print(model) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Build trainer trainer = Trainer(args, task, model, criterion) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_subsets = args.valid_subset.split(',') while (lr > args.min_lr and (epoch_itr.epoch < max_epoch or (epoch_itr.epoch == max_epoch and epoch_itr._next_epoch_itr is not None)) and trainer.get_num_updates() < max_update): # train for one epoch train(args, trainer, task, epoch_itr) if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) else: valid_losses = [None] # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) reload_dataset = ':' in getattr(args, 'data', '') # sharded data: get train iterator for next epoch epoch_itr = trainer.get_train_iterator(epoch_itr.epoch, load_dataset=reload_dataset) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(parsed_args): assert parsed_args.path is not None, '--path required for evaluation!' utils.import_user_module(parsed_args) print(parsed_args) use_cuda = torch.cuda.is_available() and not parsed_args.cpu task = tasks.setup_task(parsed_args) # Load ensemble print('| loading model(s) from {}'.format(parsed_args.path)) models, args = checkpoint_utils.load_model_ensemble( parsed_args.path.split(':'), arg_overrides=eval(parsed_args.model_overrides), task=task, ) for arg in vars(parsed_args).keys(): if arg not in { 'self_target', 'future_target', 'past_target', 'tokens_per_sample', 'output_size_dictionary', 'add_bos_token', }: setattr(args, arg, getattr(parsed_args, arg)) # reduce tokens per sample by the required context window size args.tokens_per_sample -= args.context_window task = tasks.setup_task(args) # Load dataset splits task.load_dataset(args.gen_subset) dataset = task.dataset(args.gen_subset) if args.context_window > 0: dataset = LMContextWindowDataset( dataset=dataset, tokens_per_sample=args.tokens_per_sample, context_window=args.context_window, pad_idx=task.source_dictionary.pad(), ) print('| {} {} {} examples'.format(args.data, args.gen_subset, len(dataset))) # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer) for model in models: model.make_generation_fast_() if args.fp16: model.half() if use_cuda: model.cuda() assert len(models) > 0 print('num. model params: {}'.format( sum(p.numel() for p in models[0].parameters()))) itr = task.get_batch_iterator( dataset=dataset, max_tokens=args.max_tokens or 36000, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( *[model.max_positions() for model in models]), ignore_invalid_inputs=True, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) gen_timer = StopwatchMeter() scorer = SequenceScorer(task.target_dictionary, args.softmax_batch) score_sum = 0. 
count = 0 if args.remove_bpe is not None: if args.remove_bpe == 'sentencepiece': raise NotImplementedError else: bpe_cont = args.remove_bpe.rstrip() bpe_toks = set(i for i in range(len(task.source_dictionary)) if task.source_dictionary[i].endswith(bpe_cont)) bpe_len = len(bpe_cont) else: bpe_toks = None bpe_len = 0 word_stats = dict() with progress_bar.build_progress_bar(args, itr) as t: wps_meter = TimeMeter() for sample in t: if 'net_input' not in sample: continue sample = utils.move_to_cuda(sample) if use_cuda else sample gen_timer.start() hypos = scorer.generate(models, sample) gen_timer.stop(sample['ntokens']) for hypos_i in hypos: hypo = hypos_i[0] tokens = hypo['tokens'] tgt_len = tokens.numel() pos_scores = hypo['positional_scores'].float() if args.add_bos_token: assert hypo['tokens'][0].item( ) == task.target_dictionary.bos() tokens = tokens[1:] pos_scores = pos_scores[1:] skipped_toks = 0 if bpe_toks is not None: for i in range(tgt_len - 1): if tokens[i].item() in bpe_toks: skipped_toks += 1 pos_scores[i + 1] += pos_scores[i] pos_scores[i] = 0 inf_scores = pos_scores.eq(float('inf')) | pos_scores.eq( float('-inf')) if inf_scores.any(): print( '| Skipping tokens with inf scores:', task.target_dictionary.string( tokens[inf_scores.nonzero()])) pos_scores = pos_scores[(~inf_scores).nonzero()] score_sum += pos_scores.sum().cpu() count += pos_scores.numel() - skipped_toks if args.output_word_probs or args.output_word_stats: w = '' word_prob = [] is_bpe = False for i in range(len(tokens)): w_ind = tokens[i].item() w += task.source_dictionary[w_ind] if bpe_toks is not None and w_ind in bpe_toks: w = w[:-bpe_len] is_bpe = True else: word_prob.append((w, pos_scores[i].item())) next_prob = None ind = i + 1 while ind < len(tokens): if pos_scores[ind].item() != 0: next_prob = pos_scores[ind] break ind += 1 word_stats.setdefault(w, WordStat(w, is_bpe)).add( pos_scores[i].item(), next_prob) is_bpe = False w = '' if args.output_word_probs: print('\t'.join('{} [{:2f}]'.format(x[0], x[1]) for x in word_prob)) wps_meter.update(sample['ntokens']) t.log({'wps': round(wps_meter.avg)}) avg_nll_loss = -score_sum / count print('| Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)'.format( gen_timer.n, gen_timer.sum, 1. / gen_timer.avg)) print('| Loss: {:.4f}, Perplexity: {:.2f}'.format(avg_nll_loss, np.exp(avg_nll_loss))) if args.output_word_stats: for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True): print(ws)
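
# Both eval loops above reduce summed token log-probabilities to a perplexity;
# the only difference between them is the log base (base 2 vs base e). A
# minimal restatement of that arithmetic (pure Python, illustrative values):
import math

def perplexity(sum_logprob, n_tokens, base=math.e):
    """Average negative log-likelihood -> perplexity in the given base."""
    avg_nll = -sum_logprob / n_tokens / math.log(base)
    return base ** avg_nll

# base-e and base-2 perplexities of the same data agree:
assert abs(perplexity(-230.2, 100) - perplexity(-230.2, 100, base=2)) < 1e-9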
def prune(args): if args.max_tokens is None: args.max_tokens = 6000 print(args) if not torch.cuda.is_available(): raise NotImplementedError('Training on CPU is not supported') torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits load_dataset_splits(task, ['train', "valid"]) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print('| model {}, criterion {},'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {}'.format( sum(p.numel() for p in model.parameters()))) # Make a dummy batch to (i) warm the caching allocator and (ii) as a # placeholder DistributedDataParallel when there's an uneven number of # batches per worker. max_positions = utils.resolve_max_positions( task.max_positions(), model.max_positions(), ) dummy_batch = task.dataset('train').get_dummy_batch( args.max_tokens, max_positions) # Build trainer trainer = Trainer(args, task, model, criterion, dummy_batch) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) print('| Optimizer {}'.format(trainer.optimizer.__class__.__name__)) # Initialize dataloader epoch_itr = task.get_batch_iterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, ) # Load the latest checkpoint if one is available if not load_checkpoint(args, trainer, epoch_itr): trainer.dummy_train_step([dummy_batch]) # Train until the learning rate gets too small prune_meter = StopwatchMeter() prune_meter.start() # Estimate head importance scores head_importance, head_stats = estimate_head_importance( args, trainer, task, epoch_itr) prune_meter.stop() print('| done estimating head importance in {:.1f} seconds'.format( prune_meter.sum)) torch.save(head_stats, f"{os.path.dirname(args.restore_file)}/heads_stats.bin") # Print print("Head importances") print("Encoder self attention") for layer in range(head_importance["encoder_self"].size(0)): print("\t".join(f"{x:.5f}" for x in head_importance["encoder_self"][layer])) print("Encoder decoder attention") for layer in range(head_importance["encoder_decoder"].size(0)): print("\t".join(f"{x:.5f}" for x in head_importance["encoder_decoder"][layer])) print("Decoder self attention") for layer in range(head_importance["decoder_self"].size(0)): print("\t".join(f"{x:.5f}" for x in head_importance["decoder_self"][layer])) # Print sorted pruning profile encoder_self_profile = get_profile(head_importance["encoder_self"], prefix="E") encoder_decoder_profile = get_profile(head_importance["encoder_decoder"], prefix="A") decoder_self_profile = get_profile(head_importance["decoder_self"], prefix="D") # Join all all_profiles = {} if not (args.decoder_self_only or args.encoder_decoder_only): all_profiles.update(encoder_self_profile) if not (args.encoder_self_only or args.decoder_self_only): all_profiles.update(encoder_decoder_profile) if not (args.encoder_self_only or args.encoder_decoder_only): all_profiles.update(decoder_self_profile) sorted_profiles = sorted(all_profiles.items(), key=lambda x: x[1], reverse=args.one_minus) print("Heads sorted by importance:") print(" ".join(p for p, _ in 
sorted_profiles)) print("Sorted head importance scores:") print(" ".join(f"{v.data:.5f}" for _, v in sorted_profiles)) if args.only_importance: return tot_n_heads = len(sorted_profiles) # Eval pruning if args.one_head: kept_layers = set() to_prune_profile = [] for p, _ in reversed(sorted_profiles): layer_name = ":".join(p.split(":")[:-1]) if layer_name not in kept_layers: kept_layers.add(layer_name) continue else: to_prune_profile.insert(0, p) to_prune = parse_head_pruning_descriptors(to_prune_profile, reverse_descriptors=False) print(f"Evaluating following profile: \t{' '.join(to_prune_profile)}") # Apply pruning mask_heads(model, to_prune, args.transformer_mask_rescale) bleu = eval_bleu_score( model, task, task.dataset(args.valid_subset), beam=args.beam, replace_unk=args.replace_unk, lenpen=args.lenpen, buffer_size=100, use_cuda=torch.cuda.is_available() and not args.cpu, remove_bpe=args.remove_bpe, max_sentences=args.max_sentences, max_tokens=args.max_tokens, stop_early=not args.no_early_stop, normalize_scores=not args.unnormalized, min_len=args.min_len, ) print(f"BLEU score: \t{bleu.score:.2f}") sys.stdout.flush() return for i in range(0, 10): n_to_prune = int(ceil(tot_n_heads * i / 10)) to_prune_profile = [p for p, _ in sorted_profiles[:n_to_prune]] to_prune = parse_head_pruning_descriptors(to_prune_profile, reverse_descriptors=False) print(f"Evaluating following profile: \t{' '.join(to_prune_profile)}") # Apply pruning mask_heads(model, to_prune, args.transformer_mask_rescale) bleu = eval_bleu_score( model, task, task.dataset(args.valid_subset), beam=args.beam, replace_unk=args.replace_unk, lenpen=args.lenpen, buffer_size=100, use_cuda=torch.cuda.is_available() and not args.cpu, remove_bpe=args.remove_bpe, max_sentences=args.max_sentences, max_tokens=args.max_tokens, stop_early=not args.no_early_stop, normalize_scores=not args.unnormalized, min_len=args.min_len, ) print(f"BLEU score: \t{bleu.score:.2f}") sys.stdout.flush()
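
# Judging by how profile keys are split above (":".join(p.split(":")[:-1])
# recovers the layer), get_profile appears to emit "<type>:<layer>:<head>"
# descriptors such as "E:3:5". A hypothetical parser under that assumption:
def parse_descriptor(p):
    kind, layer, head = p.split(':')
    return kind, int(layer), int(head)

assert parse_descriptor('E:3:5') == ('E', 3, 5)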
def _main(args, output_file): logging.basicConfig( format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, stream=output_file, ) logger = logging.getLogger('fairseq_cli.generate') utils.import_user_module(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) # Set dictionaries try: src_dict = getattr(task, 'source_dictionary', None) except NotImplementedError: src_dict = None tgt_dict = task.target_dictionary # Load ensemble logger.info('loading model(s) from {}'.format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble( args.path.split(':'), arg_overrides=eval(args.model_overrides), task=task, ) # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if use_cuda: model.cuda() # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator(args) # Generate and compute BLEU score if args.sacrebleu: scorer = bleu.SacrebleuScorer() else: scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) num_sentences = 0 has_target = True with progress_bar.build_progress_bar(args, itr) as t: wps_meter = TimeMeter() for sample in t: sample = utils.move_to_cuda(sample) if use_cuda else sample if 'net_input' not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample['target'][:, :args.prefix_size] gen_timer.start() hypos = task.inference_step(generator, models, sample, prefix_tokens) num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample['id'].tolist()): has_target = sample['target'] is not None # Remove padding src_tokens = utils.strip_pad( sample['net_input']['src_tokens'][i, :], tgt_dict.pad()) target_tokens = None if has_target: target_tokens = utils.strip_pad( sample['target'][i, :], tgt_dict.pad()).int().cpu() # Either retrieve the original sentences or regenerate them from tokens. 
if align_dict is not None: src_str = task.dataset( args.gen_subset).src.get_original_text(sample_id) target_str = task.dataset( args.gen_subset).tgt.get_original_text(sample_id) else: if src_dict is not None: src_str = src_dict.string(src_tokens, args.remove_bpe) else: src_str = "" if has_target: target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True) if not args.quiet: if src_dict is not None: print('S-{}\t{}'.format(sample_id, src_str), file=output_file) if has_target: print('T-{}\t{}'.format(sample_id, target_str), file=output_file) # Process top predictions for j, hypo in enumerate(hypos[i][:args.nbest]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) if not args.quiet: score = hypo['score'] / math.log( 2) # convert to base 2 print('H-{}\t{}\t{}'.format(sample_id, score, hypo_str), file=output_file) print( 'P-{}\t{}'.format( sample_id, ' '.join( map( lambda x: '{:.4f}'.format(x), # convert from base e to base 2 hypo['positional_scores'].div_( math.log(2)).tolist(), ))), file=output_file) if args.print_alignment: print('A-{}\t{}'.format( sample_id, ' '.join([ '{}-{}'.format(src_idx, tgt_idx) for src_idx, tgt_idx in alignment ])), file=output_file) if args.print_step: print('I-{}\t{}'.format(sample_id, hypo['steps']), file=output_file) if getattr(args, 'retain_iter_history', False): for step, h in enumerate(hypo['history']): _, h_str, _ = utils.post_process_prediction( hypo_tokens=h['tokens'].int().cpu(), src_str=src_str, alignment=None, align_dict=None, tgt_dict=tgt_dict, remove_bpe=None, ) print('E-{}_{}\t{}'.format( sample_id, step, h_str), file=output_file) # Score only the top hypothesis if has_target and j == 0: if align_dict is not None or args.remove_bpe is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tgt_dict.encode_line( target_str, add_if_not_exist=True) if hasattr(scorer, 'add_string'): scorer.add_string(target_str, hypo_str) else: scorer.add(target_tokens, hypo_tokens) wps_meter.update(num_generated_tokens) t.log({'wps': round(wps_meter.avg)}) num_sentences += sample['nsentences'] logger.info('NOTE: hypothesis and token scores are output in base 2') logger.info( 'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) if has_target: logger.info('Generate {} with beam={}: {}'.format( args.gen_subset, args.beam, scorer.result_string())) return scorer
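
# The generate loop above writes S-/T-/H-/P- prefixed lines. A small,
# hypothetical post-processing sketch that pulls the best hypothesis per
# sentence id back out of such a log (H- lines are "H-<id>\t<score>\t<text>",
# with the best hypothesis emitted first):
def read_hypotheses(lines):
    hyps = {}
    for line in lines:
        if line.startswith('H-'):
            tag, score, text = line.rstrip('\n').split('\t')
            sent_id = int(tag[2:])
            hyps.setdefault(sent_id, (float(score), text))  # keep the first (best)
    return hyps

lines = ['S-0\tsrc text', 'H-0\t-0.52\thyp text']
assert read_hypotheses(lines)[0] == (-0.52, 'hyp text')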
def train(args, extra_state, trainer, dataset): # offset for current epoch (may be different from checkpoint offset) starting_offset = extra_state["batch_offset"] # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() while lr > args.min_lr and extra_state["epoch"] <= max_epoch: """Train the model for one epoch.""" itr, progress, extra_meters = setup_epoch( args=args, epoch=extra_state["epoch"], batch_offset=starting_offset, trainer=trainer, dataset=dataset, ) for i, sample in enumerate(itr, start=starting_offset): log_output = trainer.train_step(sample) train_stats = log_mid_epoch_stats( trainer=trainer, progress=progress, extra_meters=extra_meters, log_output=log_output, ) if (args.continuous_averaging_after_epochs >= 0 and extra_state["epoch"] > args.continuous_averaging_after_epochs): model_param_dict = trainer.model.state_dict() if "param_totals" not in extra_state: extra_state["param_totals"] = {} for name, value in model_param_dict.items(): extra_state["param_totals"][name] = value.clone() extra_state["param_accum_count"] = 1 else: for name, value in model_param_dict.items(): extra_state["param_totals"][name] += value extra_state["param_accum_count"] += 1 if i == starting_offset: # ignore the first mini-batch in words-per-second calculation trainer.get_meter("wps").reset() num_updates = trainer.get_num_updates() do_validate = (args.subepoch_validate_interval > 0 and num_updates % args.subepoch_validate_interval == 0) do_save = (not args.no_save and args.save_interval > 0 and num_updates % args.save_interval == 0) do_eval_bleu = ( # We can only do BLEU eval when we have a new checkpoint to load. do_save and args.generate_bleu_eval_interval > 0 and num_updates - extra_state["last_bleu_eval"] >= args.generate_bleu_eval_interval) if do_eval_bleu: extra_state["last_bleu_eval"] = num_updates extra_state["batch_offset"] = i + 1 ( _, val_ppl, val_bleu, stop_training_mid_epoch, ) = validate_save_and_evaluate_bleu( args=args, trainer=trainer, dataset=dataset, extra_state=extra_state, do_validate=do_validate, do_save=do_save, do_eval_bleu=do_eval_bleu, ) yield ( trainer.get_num_updates(), { "train_ppl": train_stats["ppl"], "tune_ppl": val_ppl, "tune_bleu": val_bleu, }, ) if stop_training_mid_epoch: break # log end-of-epoch stats train_stats = log_end_epoch_stats(trainer=trainer, progress=progress, extra_meters=extra_meters) if stop_training_mid_epoch: break # batch_offset being None denotes the end of an epoch. extra_state["batch_offset"] = None ( val_loss, val_ppl, val_bleu, stop_training_end_of_epoch, ) = validate_save_and_evaluate_bleu( args=args, trainer=trainer, dataset=dataset, extra_state=extra_state, do_validate=True, do_save=not args.no_save and not args.no_end_of_epoch_checkpoints, do_eval_bleu=args.generate_bleu_eval_per_epoch, ) extra_state["val_loss"] = val_loss yield ( trainer.get_num_updates(), { "train_ppl": train_stats["ppl"], "tune_ppl": val_ppl, "tune_bleu": val_bleu, }, ) if stop_training_end_of_epoch: break lr = trainer.lr_step(extra_state["epoch"], val_loss) extra_state["epoch"] += 1 extra_state["batch_offset"] = 0 starting_offset = 0 if is_training_over_time_limit(extra_state["start_time"], args.stop_time_hr): break train_meter.stop() print(f"| done training in {train_meter.sum:.1f} seconds") if "evaluate_bleu" in extra_state: print( f"| Best BLEU score of {extra_state['evaluate_bleu']['best']} was from " f"epoch {extra_state['evaluate_bleu']['best_epoch']}")
def train( args, extra_state: Dict[str, Any], trainer, task, epoch_itr, checkpoint_manager: Optional[checkpoint.CheckpointManager], output_queue: Optional[mp_queues.Queue] = None, **train_step_kwargs, ): # offset for current epoch (may be different from checkpoint offset) starting_offset = extra_state["batch_offset"] # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() stop_training_mid_epoch = False stop_training_end_of_epoch = False do_prune = args.pruning_percentile > 0 if do_prune: prune_masks = create_prune_masks(args, trainer) apply_prune_masks(prune_masks, trainer) while lr > args.min_lr and extra_state["epoch"] <= max_epoch: """Train the model for one epoch.""" itr, progress, extra_meters = setup_epoch( args=args, epoch_itr=epoch_itr, trainer=trainer ) for i, samples in enumerate(progress, start=starting_offset): clear_per_step_extra_state(extra_state) extra_state["num_iterations"] = extra_state.get("num_iterations", 0) + 1 if ( train_step_kwargs is not None and "augment_adv" in train_step_kwargs.keys() ): train_step_kwargs["augment_adv"] = ( extra_state["num_iterations"] > args.warmup_steps ) try: log_output = trainer.train_step(samples, **train_step_kwargs) # Fairseq's fp16_trainer raises this uncommon error to indicate # that we should stop training. except FloatingPointError as e: print(f"Stopping training due to: {e}.") stop_training_mid_epoch = True break if do_prune: apply_prune_masks(prune_masks, trainer) if i == starting_offset: # ignore the first mini-batch in words-per-second calculation trainer.get_meter("wps").reset() # Clear any remaining metrics from previous steps. This should already # have been done before, but just in case - to make sure we catch # any case where extra_case does not get populated correctly. extra_state = clear_per_step_extra_state(extra_state) extra_state["batch_offset"] = i + 1 ( extra_state, stop_training_mid_epoch, translation_samples, ) = evals.save_and_eval( args=args, trainer=trainer, task=task, extra_state=extra_state, checkpoint_manager=checkpoint_manager, ) # This should come after save_and_eval. Even if log_output is None, # meaning that there was an overflow, We should still run # save_and_eval to sync all_reduce and then skip the batch. if log_output is None: # This indicates that the batch was skipped, typically # because of OOM or FP16 overflow. continue train_stats = evals.log_mid_epoch_stats( trainer=trainer, progress=progress, extra_meters=extra_meters, log_output=log_output, ) extra_state = update_output( args=args, extra_state=extra_state, output_queue=output_queue, num_updates=trainer.get_num_updates(), train_ppl=train_stats["ppl"], # We only report wps at the end of an epoch, since # the meter gets reset at the start of every epoch. wps=None, ) if ( args.save_interval_updates > 0 and extra_state["num_iterations"] % args.save_interval_updates == 0 and args.shrink_lr_no_best_bleu_eval > 0 and extra_state["tune_bleu"]["num_since_best"] > args.shrink_lr_no_best_bleu_eval ): current_lr = trainer.optimizer.get_lr() trainer.optimizer.set_lr(current_lr * args.lr_shrink) lr = trainer.optimizer.get_lr() print(f"Decayed lr from {current_lr} to {lr}.") if stop_training_mid_epoch: break # log end-of-epoch stats train_stats = evals.log_end_epoch_stats( trainer=trainer, progress=progress, extra_meters=extra_meters ) # batch_offset being None denotes the end of an epoch. 
extra_state["batch_offset"] = None ( extra_state, stop_training_end_of_epoch, translation_samples, ) = evals.save_and_eval( args=args, trainer=trainer, task=task, extra_state=extra_state, end_of_epoch=True, checkpoint_manager=checkpoint_manager, ) extra_state = update_output( args=args, extra_state=extra_state, output_queue=output_queue, num_updates=trainer.get_num_updates(), train_ppl=train_stats["ppl"], wps=train_stats["wps"], ) if stop_training_mid_epoch or stop_training_end_of_epoch: break lr = trainer.lr_step(extra_state["epoch"], extra_state["tune_eval"]["loss"]) extra_state["epoch"] += 1 extra_state["batch_offset"] = 0 starting_offset = 0 train_meter.stop() print(f"| done training in {train_meter.sum:.1f} seconds") print( f"| Best BLEU score of {extra_state['tune_bleu']['best']} was from " f"epoch {extra_state['tune_bleu']['best_epoch']}" )
def save_checkpoint(args, trainer, epoch_itr, val_loss): if args.no_save or not distributed_utils.is_master(args): return write_timer = StopwatchMeter() write_timer.start() epoch = epoch_itr.epoch end_of_epoch = epoch_itr.end_of_epoch() updates = trainer.get_num_updates() checkpoint_conds = collections.OrderedDict() checkpoint_conds['checkpoint{}.pt'.format(epoch)] = ( end_of_epoch and not args.no_epoch_checkpoints and epoch % args.save_interval == 0) checkpoint_conds['checkpoint_{}_{}.pt'.format( epoch, updates)] = (not end_of_epoch and args.save_interval_updates > 0 and updates % args.save_interval_updates == 0) checkpoint_conds['checkpoint_best.pt'] = ( val_loss is not None and (not hasattr(save_checkpoint, 'best') or val_loss < save_checkpoint.best)) checkpoint_conds[ 'checkpoint_last.pt'] = True # keep this last so that it's a symlink prev_best = getattr(save_checkpoint, 'best', val_loss) if val_loss is not None: save_checkpoint.best = min(val_loss, prev_best) extra_state = { 'train_iterator': epoch_itr.state_dict(), 'val_loss': val_loss, } if hasattr(save_checkpoint, 'best'): extra_state.update({'best': save_checkpoint.best}) checkpoints = [ os.path.join(args.save_dir, fn) for fn, cond in checkpoint_conds.items() if cond ] if len(checkpoints) > 0: trainer.save_checkpoint(checkpoints[0], extra_state) for cp in checkpoints[1:]: shutil.copyfile(checkpoints[0], cp) write_timer.stop() print( '| saved checkpoint {} (epoch {} @ {} updates) (writing took {} seconds)' .format(checkpoints[0], epoch, updates, write_timer.sum)) if not end_of_epoch and args.keep_interval_updates > 0: # remove old checkpoints; checkpoints are sorted in descending order checkpoints = checkpoint_utils.checkpoint_paths( args.save_dir, pattern=r'checkpoint_\d+_(\d+)\.pt', ) for old_chk in checkpoints[args.keep_interval_updates:]: if os.path.lexists(old_chk): os.remove(old_chk) if args.keep_last_epochs > 0: # remove old epoch checkpoints; checkpoints are sorted in descending order checkpoints = checkpoint_utils.checkpoint_paths( args.save_dir, pattern=r'checkpoint(\d+)\.pt', ) for old_chk in checkpoints[args.keep_last_epochs:]: if os.path.lexists(old_chk): os.remove(old_chk)
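
# save_checkpoint above tracks the best validation loss as an attribute on the
# function object itself, which survives across calls within a process but not
# across restarts unless re-seeded from extra_state. A stripped-down sketch of
# that pattern (hypothetical name):
def track_best(val_loss):
    prev_best = getattr(track_best, 'best', val_loss)
    if val_loss is not None:
        track_best.best = min(val_loss, prev_best)
    return getattr(track_best, 'best', None)

track_best(2.0)
assert track_best(1.5) == 1.5
assert track_best(3.0) == 1.5  # the best value is kept, not overwritten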
def main(args):
    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters())))

    # Make a dummy batch to (i) warm the caching allocator and (ii) as a
    # placeholder DistributedDataParallel when there's an uneven number of
    # batches per worker.
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )
    dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions)

    # Build trainer
    trainer = Trainer(args, task, model, criterion, dummy_batch)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    summary_writer = SummaryWriter(log_dir=args.save_dir,
                                   enable=args.distributed_rank == 0)

    # Initialize dataloader
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    first_train = True
    # Load the latest checkpoint if one is available
    if not load_checkpoint(args, trainer, epoch_itr):
        trainer.dummy_train_step([dummy_batch])
        first_train = False

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')

    if not hasattr(save_checkpoint, 'not_best'):
        save_checkpoint.not_best = 0

    if not args.no_first_valid and first_train:
        valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets,
                                True, summary_writer)

    if args.finetune_params != '':
        print("| train parameters.")
        for name, param in trainer.model.named_parameters():
            if trainer.should_train(name):
                print(name)
        print("| fixed parameters.")
        for name, param in trainer.model.named_parameters():
            if not trainer.should_train(name):
                print(name)

    if args.start_ckpt != '':
        save_checkpoint.not_best = 0
        save_checkpoint.best = 9999

    print("| train begin.")
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and \
            trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr, summary_writer)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(
                args, trainer, task, epoch_itr, valid_subsets,
                epoch_itr.epoch % args.test_bleu_interval == 0, summary_writer)
            if args.early_stop > 0:
                if hasattr(save_checkpoint, 'best') and valid_losses[0] > save_checkpoint.best:
                    save_checkpoint.not_best += 1
                    print("| Not the best ckpt... not best:", save_checkpoint.not_best)
                    if save_checkpoint.not_best > args.early_stop:
                        print("| Early stop...")
                        break
                else:
                    save_checkpoint.not_best = 0

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
    # kill leftover redis-server processes used by this run
    os.system("ps aux | grep redis-server | awk '{print $2}' | xargs kill")

    if args.save_output:
        save_expert_outputs(args, task, trainer)
def infer_onebyone(args, models, task, input):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Wrap the raw input string as a single-sentence fairseq sample
    input = ' '.join([i for i in input])
    src_tokens = tgt_dict.encode_line(input).type(torch.LongTensor)
    input_sample = {
        'id': torch.Tensor([0]),
        'nsentences': 1,
        'ntokens': len(src_tokens),
        'net_input': {
            'src_tokens': src_tokens.unsqueeze(0),
            'src_lengths': torch.tensor([len(src_tokens)]),
            'prev_output_tokens': torch.tensor([[tgt_dict.eos()]]),
        },
        'target': src_tokens.unsqueeze(0),
    }

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    wps_meter = TimeMeter()

    sample = input_sample
    sample = utils.move_to_cuda(sample) if use_cuda else sample
    prefix_tokens = None
    if args.prefix_size > 0:
        prefix_tokens = sample['target'][:, :args.prefix_size]

    gen_timer.start()
    hypos = task.inference_step(generator, models, sample, prefix_tokens)
    num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
    gen_timer.stop(num_generated_tokens)

    output = ''
    for i, sample_id in enumerate(sample['id'].tolist()):
        has_target = sample['target'] is not None

        # Remove padding
        src_tokens = utils.strip_pad(sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
        target_tokens = None
        if has_target:
            target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu()

        # Either retrieve the original sentences or regenerate them from tokens.
        if align_dict is not None:
            src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
            target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id)
        else:
            if src_dict is not None:
                src_str = src_dict.string(src_tokens, args.remove_bpe)
            else:
                src_str = ""
            if has_target:
                target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True)

        if not args.quiet:
            if src_dict is not None:
                print('S-{}\t{}'.format(sample_id, src_str))
            if has_target:
                print('T-{}\t{}'.format(sample_id, target_str))

        # Process top predictions
        for j, hypo in enumerate(hypos[i][:args.nbest]):
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'],
                align_dict=align_dict,
                tgt_dict=tgt_dict,
                remove_bpe=args.remove_bpe,
            )
            # Compute the concatenated hypothesis unconditionally (not only when
            # verbose), since it is returned to the caller below.
            output = ''.join(hypo_str.split(' '))
            if not args.quiet:
                print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                print('P-{}\t{}'.format(
                    sample_id,
                    ' '.join(map(lambda x: '{:.4f}'.format(x),
                                 hypo['positional_scores'].tolist()))))
                if args.print_alignment:
                    print('A-{}\t{}'.format(
                        sample_id,
                        ' '.join(['{}-{}'.format(src_idx, tgt_idx)
                                  for src_idx, tgt_idx in alignment])))
                if args.print_step:
                    print('I-{}\t{}'.format(sample_id, hypo['steps']))

            # Score only the top hypothesis
            if has_target and j == 0:
                if align_dict is not None or args.remove_bpe is not None:
                    # Convert back to tokens for evaluation with unk replacement and/or without BPE
                    target_tokens = tgt_dict.encode_line(target_str, add_if_not_exist=True)
                if hasattr(scorer, 'add_string'):
                    scorer.add_string(target_str, hypo_str)
                else:
                    scorer.add(target_tokens, hypo_tokens)

        wps_meter.update(num_generated_tokens)
        num_sentences += sample['nsentences']

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum,
        num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(
            args.gen_subset, args.beam, scorer.result_string()))

    return scorer, output
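# Hypothetical usage sketch for infer_onebyone (argument values are
# illustrative; loading an ensemble via checkpoint_utils.load_model_ensemble
# is assumed to be available as in upstream fairseq, since the function above
# only prints the checkpoint path and expects `models` to be passed in):
#
#     models, _ = checkpoint_utils.load_model_ensemble(
#         args.path.split(':'), task=task)
#     scorer, output = infer_onebyone(args, models, task, 'sentence to translate')
#     print(output)  # best hypothesis with spaces stripped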
def main(args):
    print(args)
    setup_logger(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)

    if args.distributed_world_size > 1:
        assert torch.distributed.is_initialized()
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()

    if args.max_tokens is None:
        args.max_tokens = 6000

    # Raise cudaLimitMaxL2FetchGranularity (enum value 0x05) to 128 bytes
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    ctypes.CDLL('libcudart.so').cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
    ctypes.CDLL('libcudart.so').cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))

    torch.manual_seed(args.seed)

    src_dict, tgt_dict = data_utils.load_dictionaries(args)
    add_extra_items_to_checkpoint({'src_dict': src_dict, 'tgt_dict': tgt_dict})
    datasets = load_dataset_splits(args, ['train', 'valid', 'test'], src_dict, tgt_dict)

    model = build_model(args)
    print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters())))

    # Build trainer
    if torch.cuda.get_device_capability(0)[0] >= 7 and not args.amp:
        print('| NOTICE: your device may support faster training with --amp')
    trainer = DDPTrainer(args, model)
    print('| model {}, criterion {}'.format(args.arch, trainer.criterion.__class__.__name__))

    if (args.online_eval or args.target_bleu) and not args.remove_bpe:
        args.remove_bpe = '@@ '

    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    epoch_itr = data.EpochBatchIterator(
        dataset=datasets[args.train_subset],
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences_valid,
        max_positions=args.max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Load the latest checkpoint if one is available
    load_checkpoint(args, trainer, epoch_itr)

    # Send a dummy batch to warm the caching allocator
    dummy_batch = data_utils.get_dummy_batch(args.max_tokens, src_dict, tgt_dict)
    trainer.dummy_train_step(dummy_batch)

    # Sanity check
    if args.do_sanity_check:
        print('Performing sanity check...')
        sanity_score = score(args, trainer, datasets['test'], src_dict, tgt_dict, 'test.raw.de')
        DLLogger.log(step='SANITY_CHECK',
                     data={'sanity_check_score': sanity_score}, verbosity=1)

    # Train until the learning rate gets too small or model reaches target score
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    tgt_bleu = args.target_bleu or math.inf
    current_bleu = 0.0
    best_bleu = -1.0
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    run_summary = {
        'loss': float('inf'),
        'val_loss': float('inf'),
        'speed': 0,
        'accuracy': 0,
    }

    while lr >= args.min_lr and epoch_itr.epoch < max_epoch and \
            trainer.get_num_updates() < max_update and current_bleu < tgt_bleu:
        DLLogger.log(step=trainer.get_num_updates(),
                     data={'epoch': epoch_itr.epoch}, verbosity=0)

        # train for one epoch
        with torch.autograd.profiler.profile(enabled=args.profile, use_cuda=True) as prof:
            train(args, trainer, datasets, epoch_itr)
        if args.profile:
            print(prof.key_averages().table(sort_by="cuda_time_total"))
            if args.profiler_file:
                with open(os.path.join(args.save_dir, args.profiler_file), 'w') as f:
                    f.write(prof.key_averages().table(sort_by="cuda_time_total"))
            exit(0)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, datasets, valid_subsets)
            valid_bleu = score(args, trainer, datasets[valid_subsets[0]],
                               src_dict, tgt_dict, 'valid.raw.de')
            DLLogger.log(step=trainer.get_num_updates(),
                         data={'val_loss': valid_losses[0], 'val_bleu': valid_bleu},
                         verbosity=1)

        # Eval BLEU score
        if args.online_eval or (tgt_bleu is not math.inf):
            current_bleu = score(args, trainer, datasets[args.gen_subset],
                                 src_dict, tgt_dict, 'test.raw.de')
            DLLogger.log(step=trainer.get_num_updates(),
                         data={'test_bleu': current_bleu}, verbosity=1)
            if current_bleu > best_bleu:
                best_bleu = current_bleu
                DLLogger.log(step='RUN', data={'BLEU': best_bleu}, verbosity=0)
                save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        if valid_losses[0] < run_summary['val_loss']:
            run_summary['val_loss'] = valid_losses[0]
            if best_bleu < 0:
                run_summary['accuracy'] = valid_bleu
            else:
                run_summary['accuracy'] = best_bleu
        run_summary['loss'] = valid_losses[0]
        run_summary['speed'] = trainer.throughput_meter.u_avg

        # Only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

    train_meter.stop()
    DLLogger.log(step=[], data=run_summary, verbosity=0)
    DLLogger.log(step='RUN', data={'walltime': train_meter.sum}, verbosity=0)
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
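# Illustrative (non-original) predicate bundling the four stopping criteria
# used in the loop above: minimum learning rate, epoch budget, update budget,
# and an optional target BLEU (math.inf when no target is set).
def keep_training(lr, min_lr, epoch, max_epoch, updates, max_update,
                  current_bleu, tgt_bleu):
    return (lr >= min_lr and epoch < max_epoch and updates < max_update
            and current_bleu < tgt_bleu)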
def main(args, init_distributed=False):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    logger.info(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    logger.info('num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    logger.info('training on {} GPUs'.format(args.distributed_world_size))
    logger.info('max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')
    print(args.multi_views)

    def generate_and_score(bart, src_path, src2_path, hyp_path, ref_path, label, bsz=8):
        # Batched generation over a whole split, then ROUGE scoring against the
        # reference file. Shared by the validation and test passes below.
        with open(src_path) as source, open(src2_path) as source2, \
                open(hyp_path, 'wt', encoding='utf-8') as fout:
            s1 = source.readlines()
            s2 = source2.readlines()
            slines = [s1[0].strip()]
            slines2 = [s2[0].strip()]
            count = 1
            for i in tqdm(range(1, len(s1))):
                if count % bsz == 0:
                    with torch.no_grad():
                        if args.multi_views:
                            hypotheses_batch = bart.sample(
                                slines, sentences2=slines2, balance=True,
                                beam=4, lenpen=2.0, max_len_b=100, min_len=5,
                                no_repeat_ngram_size=3)
                        else:
                            hypotheses_batch = bart.sample(
                                slines, beam=4, lenpen=2.0, max_len_b=100,
                                min_len=5, no_repeat_ngram_size=3)
                    for hypothesis in hypotheses_batch:
                        fout.write(hypothesis + '\n')
                        fout.flush()
                    slines = []
                    slines2 = []
                slines.append(s1[i].strip())
                slines2.append(s2[i].strip())
                count += 1
            if slines != []:
                if args.multi_views:
                    hypotheses_batch = bart.sample(
                        slines, sentences2=slines2, balance=True, beam=4,
                        lenpen=2.0, max_len_b=100, min_len=5,
                        no_repeat_ngram_size=3)
                else:
                    hypotheses_batch = bart.sample(
                        slines, beam=4, lenpen=2.0, max_len_b=100, min_len=5,
                        no_repeat_ngram_size=3)
                for hypothesis in hypotheses_batch:
                    fout.write(hypothesis + '\n')
                    fout.flush()

        with open(hyp_path, 'r') as f:
            hypothesis = [l[:-1] for l in f.readlines()]
        with open(ref_path, 'r') as f:
            reference = [l[:-1] for l in f.readlines()]
        rouge = Rouge()
        print(label, rouge.get_scores(hypothesis, reference, avg=True))

    while (
        lr > args.min_lr
        and (
            epoch_itr.epoch < max_epoch
            # allow resuming training from the final checkpoint
            or epoch_itr._next_epoch_itr is not None
        )
        and trainer.get_num_updates() < max_update
    ):
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        bart = BARTHubInterface(args, task, trainer.model).cuda()
        bart.eval()

        print("Test on val set: ")
        generate_and_score(
            bart,
            src_path='../data/val_sent_trans_cons_label.source',
            src2_path='../data/val_sent_c99_label.source',
            hyp_path='./val_best_multi_attn_' + str(args.lr_weight) + '_.hypo',
            ref_path='../data/val_sent_trans_cons_label.target',
            label='Val')

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        print("Test on testing set: ")
        generate_and_score(
            bart,
            src_path='../data/test_sent_trans_cons_label.source',
            src2_path='../data/test_sent_c99_label.source',
            hyp_path='./test_best_multi_attn_' + str(args.lr_weight) + '_.hypo',
            ref_path='../data/test_sent_trans_cons_label.target',
            label='Test')

        # early stop
        if should_stop_early(args, valid_losses[0]):
            logger.info('early stop since valid performance hasn\'t improved for last {} runs'.format(args.patience))
            break

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.epoch,
            # sharded data: get train iterator for next epoch
            load_dataset=(os.pathsep in getattr(args, 'data', '')),
        )

    train_meter.stop()
    logger.info('done training in {:.1f} seconds'.format(train_meter.sum))
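# Minimal usage sketch for the batched-generation pattern above (inputs and
# hyperparameters are illustrative; `bart` is a BARTHubInterface wrapping the
# trained model, as constructed in the loop; only the standard sample()
# keyword arguments are shown, not the fork-specific sentences2/balance ones):
#
#     bart = BARTHubInterface(args, task, trainer.model).cuda()
#     bart.eval()
#     with torch.no_grad():
#         hyps = bart.sample(['first source document', 'second source document'],
#                            beam=4, lenpen=2.0, max_len_b=100, min_len=5,
#                            no_repeat_ngram_size=3)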
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--max-sentences', type=int, metavar='N',
                              help='maximum number of sentences in a batch')
    dataset_args.add_argument('--train-subset', default='train', metavar='SPLIT',
                              choices=['train', 'valid', 'test'],
                              help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument('--valid-subset', default='valid', metavar='SPLIT',
                              help='comma separated list of data subsets '
                                   'to use for validation (train, valid, valid1, test, test1)')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)
    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'simple'

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(args.data, splits, args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, splits,
                                             args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(args)
    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()

    print('| using {} GPUs (with max tokens per GPU = {} and max sentences per GPU = {})'.format(
        num_gpus, args.max_tokens, args.max_sentences))

    # Build model and criterion
    model = utils.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))

    # The max number of positions can be different for train and valid
    # e.g., RNNs may support more positions at test time than seen in training
    max_positions_train = (args.max_source_positions, args.max_target_positions)
    max_positions_valid = (
        min(args.max_source_positions, model.max_encoder_positions()),
        min(args.max_target_positions, model.max_decoder_positions()),
    )

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model, criterion)

    # Load the latest checkpoint if one is available
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch))
        if batch_offset == 0:
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, dataset, max_positions_train, num_gpus)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, dataset, max_positions_valid,
                                subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    save_checkpoint(trainer, args, epoch, 0, val_loss)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0

    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Stop multiprocessing
    trainer.stop()
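# Assumed entry point for argument-parsing scripts like the one above (the
# original excerpt does not show it, but this is the conventional pattern):
if __name__ == '__main__':
    main()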