def main(args, init_distributed=False):
    """Build task, model and trainer, restore a checkpoint, then validate once.

    This variant contains no training loop: after the checkpoint (and its
    epoch iterator) is restored, at most one validation pass is run.

    Args:
        args: parsed command-line namespace. Either ``max_tokens`` or
            ``max_sentences`` must be set.
        init_distributed: when true, initialise distributed training and
            store the resulting rank on ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Device selection, seeding and (optionally) the distributed backend.
    gpu_requested = torch.cuda.is_available() and not args.cpu
    if gpu_requested:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    print(args)

    # Task first (e.g. translation, language modeling), then its validation
    # splits; training data is loaded later, based on the latest checkpoint.
    task = tasks.setup_task(args)
    for subset_name in args.valid_subset.split(','):
        task.load_dataset(subset_name, combine=False, epoch=0)

    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(
        p.numel() for p in model.parameters() if p.requires_grad)
    print('| num. model params: {} (num. trained: {})'.format(
        total_params,
        trainable_params,
    ))

    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Restore the latest checkpoint, if any, together with the matching
    # train iterator.
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Validate unless disabled or the restored epoch is off-interval.
    valid_subsets = args.valid_subset.split(',')
    if args.disable_validation or epoch_itr.epoch % args.validate_interval != 0:
        valid_losses = [None]
    else:
        valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
def main(args, init_distributed=False):
    """Train (or only evaluate) an estimator on top of a frozen predictor.

    A pretrained "xml" model and its dictionary are reloaded from
    ``args.xml_model_path`` and handed to the task, the trainer and the
    estimator checkpoint loader. Every predictor parameter whose name does
    not start with ``'estimator'`` is frozen before training.

    Args:
        args: parsed command-line namespace. Either ``max_tokens`` or
            ``max_sentences`` must be set.
        init_distributed: when true, initialise distributed training and
            store the resulting rank on ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    # ---- xml initialization ----
    # Example path from the original author: xml_model/mlm_tlm_xnli15_1024.pth
    checkpoint = torch.load(args.xml_model_path)
    params = AttrDict(checkpoint['params'])
    print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

    # Rebuild the dictionary and refresh the vocabulary-dependent parameters.
    xml_dico = Dictionary(checkpoint['dico_id2word'],
                          checkpoint['dico_word2id'],
                          checkpoint['dico_counts'])
    params.n_words = len(xml_dico)
    special_words = (
        ('bos_index', BOS_WORD),
        ('eos_index', EOS_WORD),
        ('pad_index', PAD_WORD),
        ('unk_index', UNK_WORD),
        ('mask_index', MASK_WORD),
    )
    for index_attr, word in special_words:
        setattr(params, index_attr, xml_dico.index(word))

    xml_model = TransformerModel(params, xml_dico, True, True)
    xml_model.eval()

    def first_position_weight():
        # One scalar of the position embeddings, printed before and after the
        # state dict is loaded (presumably a sanity check that it changed).
        weight = xml_model._modules['position_embeddings']._parameters['weight']
        return weight[0, 0].item()

    print(first_position_weight())
    xml_model.load_state_dict(checkpoint['model'])
    print(first_position_weight())
    # ---- end of xml initialization ----

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Device selection, seeding and (optionally) the distributed backend.
    gpu_requested = torch.cuda.is_available() and not args.cpu
    if gpu_requested:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    print(args)

    # Task first, then its validation splits; training data is loaded later,
    # based on the latest checkpoint.
    task = tasks.setup_task(args)
    for subset_name in args.valid_subset.split(','):
        task.load_dataset(subset_name,
                          combine=False,
                          shuffle=False,
                          epoch=0,
                          xml_dico=xml_dico,
                          xml_params=params,
                          task=args.task)

    # build_model returns three modules in this variant.
    predictor, estimator, xml_estmimator = task.build_model(args)

    # Freeze everything in the predictor except parameters named 'estimator*'.
    for param_name, param in predictor.named_parameters():
        if param_name.startswith('estimator'):
            continue
        param.requires_grad = False

    criterion = task.build_criterion(args)
    print(predictor)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    total_params = sum(p.numel() for p in predictor.parameters())
    trainable_params = sum(
        p.numel() for p in predictor.parameters() if p.requires_grad)
    print('| num. model params: {} (num. trained: {})'.format(
        total_params,
        trainable_params,
    ))

    trainer = Trainer(args,
                      task,
                      predictor,
                      estimator,
                      criterion,
                      xml_model=xml_model,
                      xml_estmimator=xml_estmimator)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    def first_embedding_weight():
        # One scalar of the predictor's encoder token embeddings, printed
        # around the predictor checkpoint load.
        embed_tokens = predictor._modules['encoder']._modules['embed_tokens']
        return embed_tokens._parameters['weight'].data[0, 0].item()

    # Predictor checkpoint first, then the estimator checkpoint, which also
    # yields the train iterator.
    print(first_embedding_weight())
    checkpoint_utils.load_predictor_checkpoint(args, trainer)
    print(first_embedding_weight())
    extra_state, epoch_itr = checkpoint_utils.load_estimator_checkpoint(
        args, trainer, xml_dico=xml_dico, xml_params=params)

    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    # Evaluation-only mode: one validation pass, no training.
    if args.evaluate > 0:
        validate(args, trainer, task, epoch_itr, valid_subsets, evaluate=True)
        return

    # Train until the learning rate gets too small or a budget is exhausted.
    while (lr > args.min_lr
           and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update):
        train_qe(args, trainer, task, epoch_itr)

        # Validation runs after every epoch here; validate_interval is not
        # consulted in this variant.
        if args.disable_validation:
            valid_losses = [None]
        else:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)

        # Only the first validation loss drives the learning-rate schedule.
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Checkpoint saving is disabled in this variant.

        if ':' in getattr(args, 'data', ''):
            # Sharded data: fetch the train iterator for the next epoch.
            epoch_itr = trainer.get_train_iterator(epoch_itr.epoch)

    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False):
    """Evaluate every checkpoint configured in ``args.checkpoint_dict``.

    For each ``(ckpt_id, ckpt_item)`` entry a fresh ``Trainer`` is built and
    each checkpoint path is passed to ``evaluate_checkpoint``. The paths come
    from ``ckpt_item['checkpoints']`` when that list is present and
    non-empty; otherwise they are globbed from
    ``<base_path>/<ckpt_id>/checkpoints/*.pt``.

    Args:
        args: parsed command-line namespace.
        init_distributed: when true, initialise distributed training and
            store the resulting rank on ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    # Initialize CUDA and distributed training
    use_cuda = torch.cuda.is_available() and not args.cpu
    if use_cuda:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    logger.info(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    logger.info('num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    training_name = get_training_name(args)
    base_path = os.path.join(args.save_dir, get_training_name(args, False))

    for ckpt_id, ckpt_item in args.checkpoint_dict.items():
        trainer = Trainer(args, task, model, criterion)

        # Initialize the neptune experiment (master process only, not in debug).
        setattr(args, 'training_name', os.path.join(training_name, 'ft', ckpt_id))
        if distributed_utils.is_master(args) and not args.debug:
            initialize_neptune(trainer, None, args)

        # A missing or empty 'checkpoints' entry means "discover on disk".
        if 'checkpoints' not in ckpt_item or len(ckpt_item['checkpoints']) == 0:
            pattern = os.path.join(base_path, ckpt_id, 'checkpoints/*.pt')
            ckpt_list = glob.glob(pattern, recursive=True)
        else:
            ckpt_list = ckpt_item['checkpoints']

        # Optionally drop files whose stem ends in 'best' or 'last'
        # (characters [-7:-3] of the file name, i.e. just before '.pt').
        if args.filter_best_last_ckpts:
            ckpt_list = [
                path for path in ckpt_list
                if path.split('/')[-1][-7:-3] not in ('best', 'last')
            ]

        for ckpt in ckpt_list:
            # Skip the first 10 characters of the file name (the length of
            # 'checkpoint') and parse what precedes the first '.' as an int.
            index_text = ckpt.split('/')[-1][10:].split('.')[0]
            try:
                ckpt_idx = int(index_text)
            except ValueError:
                # The name carries no integer index; ask the helper instead.
                ckpt_idx = get_checkpoint_epoch(ckpt)

            evaluate_checkpoint(args, ckpt, ckpt_idx, trainer)
def main(cfg: FairseqConfig) -> None:
    """Train a model as described by *cfg*.

    A legacy ``argparse.Namespace`` is converted to an omegaconf config
    first. The function builds the task, model, criterion and trainer,
    restores the latest checkpoint, and trains epoch by epoch until
    ``max_epoch`` is passed, the learning rate drops to ``stop_min_lr`` or
    below, or ``train`` signals that training should stop.

    Args:
        cfg: the run configuration (or an ``argparse.Namespace`` to convert).
    """
    if isinstance(cfg, argparse.Namespace):
        cfg = convert_namespace_to_omegaconf(cfg)

    utils.import_user_module(cfg.common)

    if is_master(cfg.distributed_training) and "job_logging_cfg" in cfg:
        # make hydra logging work with ddp
        # (see https://github.com/facebookresearch/hydra/issues/1126)
        logging.config.dictConfig(OmegaConf.to_container(cfg.job_logging_cfg))

    assert (
        cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None
    ), "Must specify batch size either with --max-tokens or --batch-size"
    metrics.reset()

    seed = cfg.common.seed
    np.random.seed(seed)
    utils.set_torch_seed(seed)

    if distributed_utils.is_master(cfg.distributed_training):
        checkpoint_utils.verify_checkpoint_directory(cfg.checkpoint.save_dir)

    logger.info(cfg)

    # Asynchronous checkpoint writing needs iopath; give up early without it.
    if cfg.checkpoint.write_checkpoints_asynchronously:
        try:
            import iopath  # noqa: F401
        except ImportError:
            logging.exception(
                "Asynchronous checkpoint writing is specified but iopath is "
                "not installed: `pip install iopath`")
            return

    # Task first (e.g. translation, language modeling), then its validation
    # splits; training data is loaded later, based on the latest checkpoint.
    task = tasks.setup_task(cfg.task)
    for subset_name in cfg.dataset.valid_subset.split(","):
        task.load_dataset(subset_name, combine=False, epoch=1)

    assert cfg.criterion, "Please specify criterion to train a model"

    model = task.build_model(cfg.model)
    criterion = task.build_criterion(cfg.criterion)
    logger.info(model)
    logger.info("task: {}".format(task.__class__.__name__))
    logger.info("model: {}".format(model.__class__.__name__))
    logger.info("criterion: {}".format(criterion.__class__.__name__))
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(
        p.numel() for p in model.parameters() if p.requires_grad)
    logger.info("num. model params: {:,} (num. trained: {:,})".format(
        total_params,
        trainable_params,
    ))

    # Quantization is optional and only configured when a config path is set.
    quantizer = None
    if cfg.common.quantization_config_path is not None:
        quantizer = quantization_utils.Quantizer(
            config_path=cfg.common.quantization_config_path,
            max_epoch=cfg.optimization.max_epoch,
            max_update=cfg.optimization.max_update,
        )

    # The Megatron trainer is used for any model-parallel size other than 1;
    # it is constructed without the quantizer.
    if cfg.common.model_parallel_size == 1:
        trainer = Trainer(cfg, task, model, criterion, quantizer)
    else:
        trainer = MegatronTrainer(cfg, task, model, criterion)

    logger.info("training on {} devices (GPUs/TPUs)".format(
        cfg.distributed_training.distributed_world_size))
    logger.info("max tokens per GPU = {} and batch size per GPU = {}".format(
        cfg.dataset.max_tokens,
        cfg.dataset.batch_size,
    ))

    # Restore the latest checkpoint, if any, together with the matching
    # train iterator.
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(
        cfg.checkpoint,
        trainer,
        # don't cache epoch iterators for sharded datasets
        disable_iterator_cache=task.has_sharded_data("train"),
    )

    max_epoch = cfg.optimization.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()

    while epoch_itr.next_epoch_idx <= max_epoch:
        if lr <= cfg.optimization.stop_min_lr:
            logger.info(
                f"stopping training because current learning rate ({lr}) is smaller "
                "than or equal to minimum learning rate "
                f"(--stop-min-lr={cfg.optimization.stop_min_lr})")
            break

        # One epoch of training; `train` also returns the validation losses.
        valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
        if should_stop:
            break

        # Only the first validation loss drives the learning-rate schedule.
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            # sharded data: get train iterator for next epoch
            load_dataset=task.has_sharded_data("train"),
            # don't cache epoch iterators for sharded datasets
            disable_iterator_cache=task.has_sharded_data("train"),
        )

    train_meter.stop()
    logger.info("done training in {:.1f} seconds".format(train_meter.sum))

    # ioPath implementation to wait for all asynchronous file writes to
    # complete.
    if cfg.checkpoint.write_checkpoints_asynchronously:
        logger.info(
            "ioPath PathManager waiting for all asynchronous checkpoint "
            "writes to finish.")
        PathManager.async_close()
        logger.info("ioPath PathManager finished waiting.")
def _read_lines(path, encoding=None):
    """Return the lines of *path* with only their trailing newline removed."""
    with open(path, 'r', encoding=encoding) as f:
        # rstrip('\n') instead of line[:-1]: a final line without a newline
        # must not lose its last character.
        return [line.rstrip('\n') for line in f]


def _generate_hypotheses(bart, args, split, bsz=8):
    """Decode the *split* source file in batches and write one hypothesis per line.

    Returns the path of the hypothesis file that was written.
    """
    src_path = '../data/' + split + '_sent_trans_cons_label.source'
    src2_path = '../data/' + split + '_sent_c99_label.source'
    hyp_path = './' + split + '_best_multi_attn_' + str(args.lr_weight) + '_.hypo'
    sample_kwargs = dict(beam=4, lenpen=2.0, max_len_b=100, min_len=5,
                         no_repeat_ngram_size=3)

    with open(src_path) as source:
        sources = [line.strip() for line in source]

    # The second view is only needed (and only read) in multi-view mode.
    sources2 = None
    if args.multi_views:
        with open(src2_path) as source2:
            sources2 = [line.strip() for line in source2]
        if len(sources2) < len(sources):
            raise ValueError(
                '{} has fewer lines ({}) than {} ({})'.format(
                    src2_path, len(sources2), src_path, len(sources)))

    with open(hyp_path, 'wt', encoding='utf-8') as fout:
        for start in tqdm(range(0, len(sources), bsz)):
            batch = sources[start:start + bsz]
            # Every batch, including the last partial one, is decoded
            # without gradient tracking.
            with torch.no_grad():
                if args.multi_views:
                    hypotheses_batch = bart.sample(
                        batch,
                        sentences2=sources2[start:start + bsz],
                        balance=True,
                        **sample_kwargs)
                else:
                    hypotheses_batch = bart.sample(batch, **sample_kwargs)
            fout.writelines(hypothesis + '\n' for hypothesis in hypotheses_batch)
            fout.flush()

    return hyp_path


def _rouge_scores(hyp_path, ref_path):
    """Return averaged ROUGE scores of the hypothesis file against the reference file."""
    # The hypothesis file was written as UTF-8, so it is read back as UTF-8.
    hypothesis = _read_lines(hyp_path, encoding='utf-8')
    reference = _read_lines(ref_path)
    return Rouge().get_scores(hypothesis, reference, avg=True)


def main(args, init_distributed=False):
    """Train a model and, after every epoch, decode and ROUGE-score val and test.

    After each epoch's validation and learning-rate step, the current model
    is wrapped in a ``BARTHubInterface``. Hypotheses for the validation and
    test sources under ``../data`` are written to ``./{val,test}_best_multi_
    attn_<lr_weight>_.hypo`` and their averaged ROUGE scores are printed.

    Args:
        args: parsed command-line namespace. Either ``max_tokens`` or
            ``max_sentences`` must be set.
        init_distributed: when true, initialise distributed training and
            store the resulting rank on ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    use_cuda = torch.cuda.is_available() and not args.cpu
    if use_cuda:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    logger.info(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (training data is loaded below, based on the latest
    # checkpoint).
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    logger.info('num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    logger.info('training on {} GPUs'.format(args.distributed_world_size))
    logger.info('max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')
    print(args.multi_views)

    while (
        lr > args.min_lr
        and (
            epoch_itr.epoch < max_epoch
            # allow resuming training from the final checkpoint
            or epoch_itr._next_epoch_itr is not None
        )
        and trainer.get_num_updates() < max_update
    ):
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Wrap the current model for generation. It is moved to the GPU only
        # when CUDA is in use, so --cpu and GPU-less hosts keep working.
        bart = BARTHubInterface(args, task, trainer.model)
        if use_cuda:
            bart = bart.cuda()
        bart.eval()

        print("Test on val set: ")
        val_hyp_path = _generate_hypotheses(bart, args, 'val')
        print("Val", _rouge_scores(
            val_hyp_path, '../data/val_sent_trans_cons_label.target'))

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        # NOTE(review): the test-set pass is placed at loop level (every
        # epoch), not inside the save_interval check — confirm against the
        # original layout.
        print("Test on testing set: ")
        test_hyp_path = _generate_hypotheses(bart, args, 'test')
        print('Test', _rouge_scores(
            test_hyp_path, '../data/test_sent_trans_cons_label.target'))

        # early stop
        if should_stop_early(args, valid_losses[0]):
            logger.info('early stop since valid performance hasn\'t improved for last {} runs'.format(args.patience))
            break

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.epoch,
            # sharded data: get train iterator for next epoch
            load_dataset=(os.pathsep in getattr(args, 'data', '')),
        )

    train_meter.stop()
    logger.info('done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False):
    """Restore a checkpoint and report BLEU/ROUGE on the validation subsets.

    No training happens in this variant. ``validate`` returns hypotheses and
    references per subset, and each subset is scored with sacrebleu,
    ``compute_cvpr_bleu`` and ``rouge.rouge``.

    Args:
        args: parsed command-line namespace. Either ``max_tokens`` or
            ``max_sentences`` must be set.
        init_distributed: when true, initialise distributed training and
            store the resulting rank on ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Device selection, seeding and (optionally) the distributed backend.
    gpu_requested = torch.cuda.is_available() and not args.cpu
    if gpu_requested:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    logger.info(args)

    # Task first, then its validation splits.
    task = tasks.setup_task(args)
    for subset_name in args.valid_subset.split(','):
        task.load_dataset(subset_name, combine=False, epoch=0)

    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(
        p.numel() for p in model.parameters() if p.requires_grad)
    logger.info('num. model params: {} (num. trained: {})'.format(
        total_params,
        trainable_params,
    ))

    trainer = Trainer(args, task, model, criterion)
    logger.info('training on {} GPUs'.format(args.distributed_world_size))
    logger.info(
        'max tokens per GPU = {} and max sentences per GPU = {}'.format(
            args.max_tokens,
            args.max_sentences,
        ))

    # Restore the latest checkpoint, if any, together with the matching
    # train iterator.
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    # With --eval-tokenized-bleu the text is scored as-is ('none' tokenizer).
    if args.eval_tokenized_bleu:
        tokenize = 'none'
    else:
        tokenize = sacrebleu.DEFAULT_TOKENIZER

    hyps, refs = validate(args, trainer, task, epoch_itr, valid_subsets)

    # NOTE(review): both prints are placed inside the per-subset loop —
    # confirm the 'performance' line was not meant to run once after it.
    for subset_hyps, subset_refs, split in zip(hyps, refs,
                                               args.valid_subset.split(',')):
        assert len(subset_hyps) == len(subset_refs)

        sacrebleu_score = sacrebleu.corpus_bleu(
            subset_hyps, [subset_refs], tokenize=tokenize)
        bleu = compute_cvpr_bleu(subset_hyps, subset_refs)
        rouge_score = rouge.rouge(subset_hyps, subset_refs)

        print('{} set has {} samples,\n'
              'sacrebleu: {},\n'
              'CVPR BLEU scripts: {}\n'
              'CVPR ROUGE: {}'.format(split, len(subset_hyps),
                                      sacrebleu_score, bleu, rouge_score))

        bleu_text = ' '.join([str(b) for b in bleu])
        print('performance: {:.2f} {}'.format(
            rouge_score['rouge_l/f_score'] * 100, bleu_text))
def _log_banner(message):
    """Log *message* framed by ten dashes on each side."""
    logger.info('-' * 10 + message + '-' * 10)


def _drop_keys_with_prefix(state_dict, prefix):
    """Delete, in place, every entry of *state_dict* whose key starts with *prefix*."""
    for key in [k for k in state_dict if k.startswith(prefix)]:
        del state_dict[key]


def _prune_and_load_bart_state(args, model, state_dict):
    """Remove the encoder/decoder entries the flags exclude, then load the rest non-strictly."""
    if not args.initialize_with_bart_enc:
        _log_banner(' do not initialize with BART encoder parameters ')
        _drop_keys_with_prefix(state_dict, 'encoder')

    if not args.initialize_with_bart_dec:
        _log_banner(' do not initialize with BART decoder parameters ')
        _drop_keys_with_prefix(state_dict, 'decoder')

    model.load_state_dict(state_dict, strict=False, args=args)


def _copy_composite_embeddings(args, model, weight):
    """Copy *weight* into the decoder output projection, and the decoder input embeddings unless those use the BART dictionary."""
    with torch.no_grad():
        if not args.bart_emb_decoder_input:
            model.decoder.embed_tokens.weight.copy_(weight)
        model.decoder.output_projection.weight.copy_(weight)


def _initialize_bartsv(args, task, model):
    """Initialise a 'bartsv' model: shared embeddings, BART vocabulary first, extended symbols after."""
    if args.initialize_with_bart:
        _log_banner(' initializing model parameters with pretrained BART model ')
        new_state_dict = copy.deepcopy(task.bart.model.state_dict())

        # The embedding matrices differ in size, so they are handled
        # separately below rather than through load_state_dict.
        _log_banner(' delay encoder embeddings, decoder input and output embeddings initialization ')
        for key in ('encoder.embed_tokens.weight',
                    'decoder.embed_tokens.weight',
                    'decoder.output_projection.weight'):
            del new_state_dict[key]

        _prune_and_load_bart_state(args, model, new_state_dict)

        # Initialize the BART part of the embeddings.
        bart_vocab_size = task.target_dictionary.bart_vocab_size
        # NOTE we need to prune the pretrained BART embeddings, especially
        # for bart.base
        bart_embed_weight = task.bart.model.encoder.embed_tokens.weight.data[:bart_vocab_size]
        assert len(bart_embed_weight) == bart_vocab_size

        with torch.no_grad():
            model.encoder.embed_tokens.weight[:bart_vocab_size].copy_(bart_embed_weight)
            model.decoder.embed_tokens.weight[:bart_vocab_size].copy_(bart_embed_weight)
            model.decoder.output_projection.weight[:bart_vocab_size].copy_(bart_embed_weight)

    if args.bart_emb_init_composition:
        _log_banner(' initialize extended target embeddings with compositional embeddings '
                    'from BART vocabulary ')
        # Read the vocabulary size here as well, so this step does not
        # depend on the initialize_with_bart branch having run.
        bart_vocab_size = task.target_dictionary.bart_vocab_size
        symbols = [
            task.target_dictionary[idx]
            for idx in range(bart_vocab_size, len(task.target_dictionary))
        ]
        mapper = MapAvgEmbeddingBART(task.bart, task.bart.model.decoder.embed_tokens)
        comp_embed_weight, _ = mapper.map_avg_embeddings(
            symbols, transform=transform_action_symbol, add_noise=False)
        assert len(comp_embed_weight) == len(symbols)

        with torch.no_grad():
            model.encoder.embed_tokens.weight[bart_vocab_size:].copy_(comp_embed_weight)
            model.decoder.embed_tokens.weight[bart_vocab_size:].copy_(comp_embed_weight)
            model.decoder.output_projection.weight[bart_vocab_size:].copy_(comp_embed_weight)


def _initialize_bart(args, task, model):
    """Initialise a 'bart' model from pretrained BART, optionally with a separate decoder dictionary."""
    if args.initialize_with_bart:
        _log_banner(' initializing model parameters with pretrained BART model ')
        new_state_dict = copy.deepcopy(task.bart.model.state_dict())

        if not args.bart_emb_decoder:
            _log_banner(' build a separate decoder dictionary embedding ')
            if not args.bart_emb_decoder_input:
                ignore_keys = ('decoder.embed_tokens.weight',
                               'decoder.output_projection.weight')
            else:
                _log_banner(' use BART dictionary embedding for target input ')
                ignore_keys = ('decoder.output_projection.weight',)
            for key in ignore_keys:
                del new_state_dict[key]

        _prune_and_load_bart_state(args, model, new_state_dict)

    # Initialize the target embeddings with the average of subtoken
    # embeddings in the BART vocabulary.
    if args.bart_emb_init_composition:
        assert not args.bart_emb_decoder, 'should not use the compositional embeddings on top of BART vocabulary here'
        _log_banner(' initialize target embeddings with compositional embeddings from BART vocabulary ')
        composite_embed = CompositeEmbeddingBART(
            task.bart, task.bart.model.decoder.embed_tokens, task.target_dictionary)
        _copy_composite_embeddings(args, model, composite_embed.embedding_weight)


def _initialize_roberta(args, task, model):
    """Initialise target embeddings of a 'roberta' model from RoBERTa subtoken embeddings."""
    if args.bart_emb_init_composition:
        assert not args.bart_emb_decoder, 'should not use the compositional embeddings on top of RoBERTa vocabulary here'
        _log_banner(' initialize target embeddings with compositional embeddings from RoBERTa vocabulary ')
        composite_embed = CompositeEmbeddingBART(
            task.bart,  # NOTE here "bart" means roberta
            task.bart.model.encoder.sentence_encoder.embed_tokens,
            task.target_dictionary)
        _copy_composite_embeddings(args, model, composite_embed.embedding_weight)


def main(args):
    """Train a model whose parameters may be initialised from pretrained BART/RoBERTa.

    ``args.arch`` must contain ``'bartsv'``, ``'bart'`` or ``'roberta'``; the
    matching initialisation runs before the trainer is built.

    NOTE(review): the nesting of the initialisation steps was reconstructed
    from a source whose indentation was lost; the embedding-composition step
    is treated as independent of ``initialize_with_bart`` — confirm.

    Args:
        args: parsed command-line namespace. Either ``max_tokens`` or
            ``batch_size`` must be set.

    Raises:
        ValueError: if ``args.arch`` names none of the supported families.
    """
    import_user_module(args)

    assert (
        args.max_tokens is not None or args.batch_size is not None
    ), "Must specify batch size either with --max-tokens or --batch-size"
    metrics.reset()

    np.random.seed(args.seed)
    utils.set_torch_seed(args.seed)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    logger.info(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (training data is loaded below, based on the latest
    # checkpoint).
    for valid_sub_split in args.valid_subset.split(","):
        task.load_dataset(valid_sub_split, combine=False, epoch=1)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info("task: {} ({})".format(args.task, task.__class__.__name__))
    logger.info("model: {} ({})".format(args.arch, model.__class__.__name__))
    logger.info("criterion: {} ({})".format(args.criterion,
                                            criterion.__class__.__name__))
    logger.info("num. model params: {} (num. trained: {})".format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # ===== initialize the model with pretrained BART parameters =====
    # 'bartsv' must be tested before 'bart': the latter is a substring of
    # the former.
    if 'bartsv' in args.arch:
        _initialize_bartsv(args, task, model)
    elif 'bart' in args.arch:
        _initialize_bart(args, task, model)
    elif 'roberta' in args.arch:
        _initialize_roberta(args, task, model)
    else:
        raise ValueError(
            'unsupported --arch {!r}: expected a name containing '
            '"bartsv", "bart" or "roberta"'.format(args.arch))

    # (optionally) Configure quantization
    if args.quantization_config_path is not None:
        quantizer = quantization_utils.Quantizer(
            config_path=args.quantization_config_path,
            max_epoch=args.max_epoch,
            max_update=args.max_update,
        )
    else:
        quantizer = None

    # Build trainer
    if args.model_parallel_size == 1:
        trainer = Trainer(args, task, model, criterion, quantizer)
    else:
        trainer = MegatronTrainer(args, task, model, criterion)

    logger.info("training on {} devices (GPUs/TPUs)".format(
        args.distributed_world_size))
    logger.info(
        "max tokens per GPU = {} and max sentences per GPU = {}".format(
            args.max_tokens, args.batch_size))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(
        args,
        trainer,
        # don't cache epoch iterators for sharded datasets
        disable_iterator_cache=task.has_sharded_data("train"),
    )

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()

    while lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch:
        # train for one epoch
        valid_losses, should_stop = train(args, trainer, task, epoch_itr)
        if should_stop:
            break

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            # sharded data: get train iterator for next epoch
            load_dataset=task.has_sharded_data("train"),
            # don't cache epoch iterators for sharded datasets
            disable_iterator_cache=task.has_sharded_data("train"),
        )

    train_meter.stop()
    logger.info("done training in {:.1f} seconds".format(train_meter.sum))
def main(args, init_distributed=False):
    """Train a model, or only evaluate it when ``args.eval_mode`` is not ``'none'``.

    In evaluation mode the function runs ``validate`` (skipped for the
    ``'entropy'`` mode) and ``validate_iw`` without gradient tracking and
    returns. Otherwise it trains epoch by epoch with periodic validation,
    checkpointing and early stopping.

    Args:
        args: parsed command-line namespace. Either ``max_tokens`` or
            ``max_sentences`` must be set.
        init_distributed: when true, initialise distributed training and
            store the resulting rank on ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    logger.info(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (training data is loaded below, based on the latest
    # checkpoint).
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=1)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    logger.info('num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    logger.info('training on {} GPUs'.format(args.distributed_world_size))
    logger.info(
        'max tokens per GPU = {} and max sentences per GPU = {}'.format(
            args.max_tokens,
            args.max_sentences,
        ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    # Evaluation-only mode: no training, return when done.
    if args.eval_mode != 'none':
        start_val_time = time.time()
        with torch.no_grad():
            if args.eval_mode != 'entropy':
                _ = validate(args, trainer, task, epoch_itr, valid_subsets,
                             args.prune_num)

            # NOTE(review): placed outside the 'entropy' check, so it is
            # printed in every eval mode — confirm against the original
            # layout.
            print('elapsed time (seconds): {}'.format(time.time() - start_val_time))

            _ = validate_iw(args, trainer, task, epoch_itr, valid_subsets,
                            args.prune_num, mode=args.eval_mode)
        return

    while (lr > args.min_lr
           and epoch_itr.next_epoch_idx <= max_epoch
           and trainer.get_num_updates() < max_update):
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        # early stop
        if should_stop_early(args, valid_losses[0]):
            logger.info(
                'early stop since valid performance hasn\'t improved for last {} runs'
                .format(args.patience))
            break

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            # sharded data: get train iterator for next epoch
            load_dataset=(os.pathsep in getattr(args, 'data', '')),
        )

    # Stop the stopwatch before reading its total, so the elapsed training
    # time is included in the logged figure.
    train_meter.stop()
    logger.info('done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False):
    """Run the full training procedure described by ``args``.

    Seeds NumPy and PyTorch, optionally initialises distributed state, builds
    the task, model, criterion and ``Trainer``, restores the latest checkpoint
    through ``checkpoint_utils.load_checkpoint`` and then repeats
    train / validate / LR-step / save until a stop condition is met.

    Args:
        args: parsed command-line namespace; at least one of ``max_tokens``
            and ``max_sentences`` must be set.
        init_distributed: when true, ``distributed_utils.distributed_init`` is
            called and its result is stored in ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    has_batch_size = args.max_tokens is not None or args.max_sentences is not None
    assert has_batch_size, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    use_cuda = torch.cuda.is_available() and not args.cpu
    if use_cuda:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        # Original author's notes (translated): both single-node multi-GPU and
        # multi-node multi-GPU training go through this call, which invokes
        # init_process_group. No data has been loaded yet at this point, so the
        # timeouts that earlier versions hit in multi-node training because of
        # uneven data-loading speed should no longer occur.
        args.distributed_rank = distributed_utils.distributed_init(args)

    # Only the master worker (per the original notes: distributed_rank == 0)
    # verifies the directory checkpoints will be written to.
    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    # Original author's notes (translated): for translation this creates the
    # TranslationTask, reads the two dictionaries (self.src_dict and
    # self.tgt_dict) and decides between left and right padding.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint).
    # Original author's notes (translated): these are the dev sets used for
    # validation; once loaded, each is stored in task.datasets under its name.
    for subset_name in args.valid_subset.split(','):
        task.load_dataset(subset_name, combine=False, epoch=0)

    # Build model and criterion.
    # Original author's notes (translated): for translation the model is a
    # TransformerModel (a FairseqEncoderDecoderModel subclass) and the loss is
    # LabelSmoothedCrossEntropyCriterion.
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    total_params = sum(p.numel() for p in model.parameters())
    trained_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('| num. model params: {} (num. trained: {})'.format(total_params, trained_params))

    # Debugging aid kept from the original: print the size of every parameter.
    # for name, param in model.named_parameters(recurse=True):
    #     print(name, param.numel())
    # exit(0)

    # Build trainer.
    # Original author's notes (translated): when distributed_world_size > 1 the
    # model and criterion get wrapped with models.DistributedFairseqModel.
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator (epoch_itr).
    _extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    epoch_limit = args.max_epoch or math.inf
    update_limit = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    while True:
        if not lr > args.min_lr:
            break
        # Continue while epochs remain, or when we are at the last epoch but an
        # epoch iterator is still pending.
        epochs_remain = epoch_itr.epoch < epoch_limit or (
            epoch_itr.epoch == epoch_limit and epoch_itr._next_epoch_itr is not None
        )
        if not epochs_remain:
            break
        if not trainer.get_num_updates() < update_limit:
            break

        # train for one epoch
        train(args, trainer, task, epoch_itr)

        run_validation = (
            not args.disable_validation
            and epoch_itr.epoch % args.validate_interval == 0
        )
        valid_losses = (
            validate(args, trainer, task, epoch_itr, valid_subsets)
            if run_validation
            else [None]
        )

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        # A new epoch iterator is created every epoch to walk the training data;
        # a ':' in args.data marks sharded data, which has to be reloaded.
        epoch_itr = trainer.get_train_iterator(
            epoch_itr.epoch,
            load_dataset=':' in getattr(args, 'data', ''),
        )

    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False):
    """Train a model together with an adversary using ``AdversarialTrainer``.

    Builds the task, model, criterion, adversarial criterion and adversary,
    restores the latest checkpoint, optionally pre-trains the data actor and
    builds a generator for BLEU evaluation, then loops over
    train / validate / LR-step / save. After training, the contents of
    ``trainer.idx_to_dev_grad_dotprod`` are printed.

    Args:
        args: parsed command-line namespace; at least one of ``max_tokens``
            and ``max_sentences`` must be set.
        init_distributed: when true, ``distributed_utils.distributed_init`` is
            called and its result is stored in ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    has_batch_size = args.max_tokens is not None or args.max_sentences is not None
    assert has_batch_size, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    use_cuda = torch.cuda.is_available() and not args.cpu
    if use_cuda:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for subset_name in args.valid_subset.split(','):
        task.load_dataset(subset_name, combine=False, epoch=0)

    # Build model, criterion and their adversarial counterparts
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    adv_criterion = task.build_adversarial_criterion(args)
    adv = task.build_adversary(args, model)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    total_params = sum(p.numel() for p in model.parameters())
    trained_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('| num. model params: {} (num. trained: {})'.format(total_params, trained_params))

    # Build trainer
    trainer = AdversarialTrainer(args, task, model, criterion, adv_criterion, adv)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    _extra_state, epoch_itr, filtered_maxpos_indices = checkpoint_utils.load_checkpoint(
        args, trainer)

    # pretrain data actor
    wants_pretraining = (
        args.pretrain_data_actor
        and args.data_actor == 'lan'
        and args.data_actor_step_update
    )
    if wants_pretraining:
        trainer.pretrain_data_actor()

    # Train until the learning rate gets too small
    epoch_limit = args.max_epoch or math.inf
    update_limit = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    # A generator is only needed when BLEU is evaluated; it is built from a
    # private copy of args so the decoding settings do not leak into training.
    generator = None
    if args.eval_bleu:
        gen_args = copy.deepcopy(args)
        gen_args.sample = False
        gen_args.beam = 5
        gen_args.batch_size = 32
        generator = task.build_generator(gen_args)
        args.maximize_best_checkpoint_metric = True

    while True:
        if not lr > args.min_lr:
            break
        if not epoch_itr.epoch < epoch_limit:
            break
        if not trainer.get_num_updates() < update_limit:
            break

        # train for one epoch
        epoch_itr = train(args, trainer, task, epoch_itr, generator,
                          filtered_maxpos_indices)
        # trainer.update_language_sampler(args)

        run_validation = (
            not args.disable_validation
            and epoch_itr.epoch % args.validate_interval == 0
        )
        valid_losses = (
            validate(args, trainer, task, epoch_itr, valid_subsets, generator)
            if run_validation
            else [None]
        )

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        if ':' in getattr(args, 'data', ''):
            # sharded data: get train iterator for next epoch
            epoch_itr = trainer.get_train_iterator(epoch_itr.epoch)[0]

    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Dump the recorded values, one key per line followed by its values.
    for idx in sorted(trainer.idx_to_dev_grad_dotprod.keys()):
        print(idx)
        print(" ".join(str(value) for value in trainer.idx_to_dev_grad_dotprod[idx]))
def main(parsed_args, **unused_kwargs):
    """Score a dataset split with a language-model ensemble and report loss.

    Loads the model(s) named by ``parsed_args.path``, scores every sample of
    ``args.gen_subset`` with ``SequenceScorer``, writes one base-2 negative
    log-likelihood per scored position to ``args.jason_test_output`` (one value
    per line) and logs the average loss and perplexity. Optionally logs
    per-word probabilities and aggregated word statistics.

    Args:
        parsed_args: parsed command-line namespace; ``path`` is required.
        **unused_kwargs: accepted for call compatibility and ignored.

    Raises:
        NotImplementedError: if ``--remove-bpe sentencepiece`` is requested.
    """
    assert parsed_args.path is not None, '--path required for evaluation!'

    if torch.cuda.is_available() and not parsed_args.cpu:
        torch.cuda.set_device(parsed_args.device_id)

    utils.import_user_module(parsed_args)

    logger.info(parsed_args)

    use_cuda = torch.cuda.is_available() and not parsed_args.cpu

    task = tasks.setup_task(parsed_args)

    # Load ensemble
    logger.info('loading model(s) from {}'.format(parsed_args.path))
    # NOTE(security): eval() executes arbitrary Python taken from
    # --model-overrides. That is only acceptable for trusted command-line
    # input; never feed it a value that originates from an untrusted source.
    models, args = checkpoint_utils.load_model_ensemble(
        parsed_args.path.split(os.pathsep),
        arg_overrides=eval(parsed_args.model_overrides),
        task=task,
        suffix=getattr(parsed_args, "checkpoint_suffix", ""),
    )

    # Copy the evaluation-time settings onto the checkpoint's args, except the
    # ones that must keep the value the model was trained with.
    keep_from_checkpoint = {
        'self_target', 'future_target', 'past_target', 'tokens_per_sample',
        'output_size_dictionary', 'add_bos_token',
    }
    for arg in vars(parsed_args).keys():
        if arg not in keep_from_checkpoint:
            setattr(args, arg, getattr(parsed_args, arg))

    # reduce tokens per sample by the required context window size
    args.tokens_per_sample -= args.context_window
    task = tasks.setup_task(args)

    # Load dataset splits
    task.load_dataset(args.gen_subset)
    dataset = task.dataset(args.gen_subset)
    if args.context_window > 0:
        dataset = LMContextWindowDataset(
            dataset=dataset,
            tokens_per_sample=args.tokens_per_sample,
            context_window=args.context_window,
            pad_idx=task.source_dictionary.pad(),
        )
    logger.info('{} {} {} examples'.format(args.data, args.gen_subset, len(dataset)))

    # Optimize ensemble for generation and set the source and dest dicts on
    # the model (required by scorer)
    for model in models:
        model.prepare_for_inference_(args)
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    assert len(models) > 0

    logger.info('num. model params: {}'.format(
        sum(p.numel() for p in models[0].parameters())))

    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens or 36000,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(*[
            model.max_positions() for model in models
        ]),
        ignore_invalid_inputs=True,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
        data_buffer_size=args.data_buffer_size,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        default_log_format=('tqdm' if not args.no_progress_bar else 'none'),
    )

    gen_timer = StopwatchMeter()
    scorer = SequenceScorer(task.target_dictionary, args.softmax_batch)

    score_sum = 0.
    count = 0

    if args.remove_bpe is not None:
        if args.remove_bpe == 'sentencepiece':
            raise NotImplementedError
        else:
            # Dictionary indices of tokens that end with the BPE continuation
            # marker, i.e. tokens that are glued to the following one.
            bpe_cont = args.remove_bpe.rstrip()
            bpe_toks = {
                i
                for i in range(len(task.source_dictionary))
                if task.source_dictionary[i].endswith(bpe_cont)
            }
        bpe_len = len(bpe_cont)
    else:
        bpe_toks = None
        bpe_len = 0

    word_stats = dict()
    log2 = math.log(2)

    output_dir = os.path.dirname(args.jason_test_output)
    print(output_dir)
    checkpoint_utils.verify_checkpoint_directory(output_dir)

    # The context manager guarantees the loss file is flushed and closed even
    # when scoring raises part-way through.
    with open(args.jason_test_output, 'w') as test_loss_writer:
        wps_meter = TimeMeter()

        for sample in progress:
            if 'net_input' not in sample:
                continue

            sample = utils.move_to_cuda(sample) if use_cuda else sample

            gen_timer.start()
            hypos = scorer.generate(models, sample)
            gen_timer.stop(sample['ntokens'])

            for i, hypos_i in enumerate(hypos):
                hypo = hypos_i[0]
                sample_id = sample['id'][i]

                tokens = hypo['tokens']
                tgt_len = tokens.numel()
                pos_scores = hypo['positional_scores'].float()

                if getattr(args, 'add_bos_token', False):
                    assert hypo['tokens'][0].item() == task.target_dictionary.bos()
                    tokens = tokens[1:]
                    pos_scores = pos_scores[1:]

                # Fold the score of every BPE continuation piece into the next
                # position so each word is counted once.
                skipped_toks = 0
                if bpe_toks is not None:
                    for pos in range(tgt_len - 1):
                        if tokens[pos].item() in bpe_toks:
                            skipped_toks += 1
                            pos_scores[pos + 1] += pos_scores[pos]
                            pos_scores[pos] = 0

                inf_scores = pos_scores.eq(float('inf')) | pos_scores.eq(float('-inf'))
                if inf_scores.any():
                    # The token string is passed as a logging argument, so the
                    # message needs a %s placeholder to be rendered.
                    logger.info(
                        'skipping tokens with inf scores: %s',
                        task.target_dictionary.string(tokens[inf_scores.nonzero()])
                    )
                    pos_scores = pos_scores[(~inf_scores).nonzero()]
                score_sum += pos_scores.sum().cpu()
                count += pos_scores.numel() - skipped_toks

                # One base-2 NLL value per line for every remaining position.
                for score in pos_scores.cpu():
                    nll_loss_base2 = - score.item() / log2
                    test_loss_writer.write(f"{nll_loss_base2}\n")

                if args.output_word_probs or args.output_word_stats:
                    w = ''
                    word_prob = []
                    is_bpe = False
                    for pos in range(len(tokens)):
                        w_ind = tokens[pos].item()
                        w += task.source_dictionary[w_ind]
                        if bpe_toks is not None and w_ind in bpe_toks:
                            # Strip the continuation marker and keep building
                            # the current word.
                            w = w[:-bpe_len]
                            is_bpe = True
                        else:
                            word_prob.append((w, pos_scores[pos].item()))

                            # Score of the next position with a non-zero score.
                            next_prob = None
                            ind = pos + 1
                            while ind < len(tokens):
                                if pos_scores[ind].item() != 0:
                                    next_prob = pos_scores[ind]
                                    break
                                ind += 1

                            word_stats.setdefault(w, WordStat(w, is_bpe)).add(
                                pos_scores[pos].item(), next_prob)
                            is_bpe = False
                            w = ''
                    if args.output_word_probs:
                        logger.info(
                            str(int(sample_id)) + " "
                            + ('\t'.join('{} [{:2f}]'.format(x[0], x[1]) for x in word_prob))
                        )

            wps_meter.update(sample['ntokens'])
            progress.log({'wps': round(wps_meter.avg)})

    avg_nll_loss = -score_sum / count / log2  # convert to base 2
    logger.info('Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)'.format(
        gen_timer.n, gen_timer.sum, 1. / gen_timer.avg
    ))
    logger.info('Loss (base 2): {:.4f}, Perplexity: {:.2f}'.format(
        avg_nll_loss, 2**avg_nll_loss
    ))

    if args.output_word_stats:
        for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True):
            logger.info(ws)
def main(cfg: DictConfig) -> None:
    """Train a model from a structured (omegaconf) configuration.

    A plain ``argparse.Namespace`` is first converted with
    ``convert_namespace_to_omegaconf``. The function then seeds the RNGs,
    builds task, model, criterion, optional quantizer and trainer, restores the
    latest checkpoint and trains epoch by epoch until ``train`` asks to stop,
    the learning rate reaches ``stop_min_lr`` or ``max_epoch`` is passed.

    Args:
        cfg: configuration object (or legacy namespace) with the ``common``,
            ``dataset``, ``task``, ``model``, ``criterion``, ``checkpoint``,
            ``optimization`` and ``distributed_training`` groups read below.
    """
    if isinstance(cfg, argparse.Namespace):
        cfg = convert_namespace_to_omegaconf(cfg)

    utils.import_user_module(cfg.common)

    has_batch_size = (
        cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None
    )
    assert has_batch_size, "Must specify batch size either with --max-tokens or --batch-size"
    metrics.reset()

    seed = cfg.common.seed
    np.random.seed(seed)
    utils.set_torch_seed(cfg.common.seed)

    if distributed_utils.is_master(cfg.distributed_training):
        checkpoint_utils.verify_checkpoint_directory(cfg.checkpoint.save_dir)

    # Print args
    logger.info(cfg)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(cfg.task)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for subset_name in cfg.dataset.valid_subset.split(","):
        task.load_dataset(subset_name, combine=False, epoch=1)

    assert cfg.criterion, "Please specify criterion to train a model"

    # Build model and criterion
    model = task.build_model(cfg.model)
    criterion = task.build_criterion(cfg.criterion)
    logger.info(model)
    logger.info("task: {}".format(task.__class__.__name__))
    logger.info("model: {}".format(model.__class__.__name__))
    logger.info("criterion: {}".format(criterion.__class__.__name__))
    total_params = sum(p.numel() for p in model.parameters())
    trained_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(
        "num. model params: {:,} (num. trained: {:,})".format(total_params, trained_params)
    )

    # Sample of the log output produced up to this point (kept from the
    # original as a reference):
    #   2021-01-15 12:02:31 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False,
    #   2021-01-15 12:02:33 | INFO | fairseq.tasks.translation | [zh] dictionary: 45384 types
    #   2021-01-15 12:02:33 | INFO | fairseq.tasks.translation | [en] dictionary: 33624 types
    #   2021-01-15 12:02:35 | INFO | fairseq.data.data_utils | loaded 4,999 examples from: /content/drive/MyDrive/Colab/zh-en/valid.zh-en.zh
    #   2021-01-15 12:02:37 | INFO | fairseq.data.data_utils | loaded 4,999 examples from: /content/drive/MyDrive/Colab/zh-en/valid.zh-en.en
    #   2021-01-15 12:02:37 | INFO | fairseq.tasks.translation | /content/drive/MyDrive/Colab/zh-en valid zh-en 4999 examples
    #   2021-01-15 12:02:39 | INFO | fairseq_cli.train | TransformerModel(

    # (optionally) Configure quantization
    quantizer = None
    if cfg.common.quantization_config_path is not None:
        quantizer = quantization_utils.Quantizer(
            config_path=cfg.common.quantization_config_path,
            max_epoch=cfg.optimization.max_epoch,
            max_update=cfg.optimization.max_update,
        )

    # Build trainer; anything other than model_parallel_size == 1 uses the
    # Megatron trainer, which is not handed the quantizer.
    if cfg.common.model_parallel_size != 1:
        trainer = MegatronTrainer(cfg, task, model, criterion)
    else:
        trainer = Trainer(cfg, task, model, criterion, quantizer)

    logger.info(
        "training on {} devices (GPUs/TPUs)".format(
            cfg.distributed_training.distributed_world_size
        )
    )
    logger.info(
        "max tokens per GPU = {} and batch size per GPU = {}".format(
            cfg.dataset.max_tokens,
            cfg.dataset.batch_size,
        )
    )

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    _extra_state, epoch_itr = checkpoint_utils.load_checkpoint(
        cfg.checkpoint,
        trainer,
        # don't cache epoch iterators for sharded datasets
        disable_iterator_cache=task.has_sharded_data("train"),
    )

    epoch_limit = cfg.optimization.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()

    while True:
        if not epoch_itr.next_epoch_idx <= epoch_limit:
            break

        if lr <= cfg.optimization.stop_min_lr:
            logger.info(
                f"stopping training because current learning rate ({lr}) is smaller "
                "than or equal to minimum learning rate "
                f"(--stop-min-lr={cfg.optimization.stop_min_lr})"
            )
            break

        # train for one epoch
        valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
        if should_stop:
            break

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            # sharded data: get train iterator for next epoch
            load_dataset=task.has_sharded_data("train"),
            # don't cache epoch iterators for sharded datasets
            disable_iterator_cache=task.has_sharded_data("train"),
        )

    train_meter.stop()
    logger.info("done training in {:.1f} seconds".format(train_meter.sum))
def main(args, init_distributed=False):
    """Train a model, logging progress through the module ``logger``.

    Builds task, model, criterion and ``Trainer``, restores the latest
    checkpoint and loops over train / validate / LR-step / save, stopping on
    the learning-rate, epoch or update limits or when ``should_stop_early``
    says so.

    Args:
        args: parsed command-line namespace; at least one of ``max_tokens``
            and ``max_sentences`` must be set.
        init_distributed: when true, ``distributed_utils.distributed_init`` is
            called and its result is stored in ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    has_batch_size = args.max_tokens is not None or args.max_sentences is not None
    assert has_batch_size, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    use_cuda = torch.cuda.is_available() and not args.cpu
    if use_cuda:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    logger.info(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for subset_name in args.valid_subset.split(','):
        task.load_dataset(subset_name, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)

    # TODO: Enable MACS
    # from fairseq.models.define_transformer import DeFINETransformerModel
    # if isinstance(model, DeFINETransformerModel):
    #     model.compute_macs_params(args)
    # exit()

    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    # Parameter counts are reported in millions.
    total_millions = sum(p.numel() for p in model.parameters()) / 1e6
    trained_millions = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6
    logger.info('num. model params: {:.2f} M (num. trained: {:.2f} M)'.format(
        total_millions, trained_millions))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    logger.info('LR scheduler: {}'.format(trainer.lr_scheduler))
    logger.info('training on {} GPUs with update freq of {}'.format(
        args.distributed_world_size, args.update_freq))
    logger.info(
        'max tokens per GPU = {} and max sentences per GPU = {}'.format(
            args.max_tokens,
            args.max_sentences,
        ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    _extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    epoch_limit = args.max_epoch or math.inf
    update_limit = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    def _keep_training():
        """Return True while none of the LR / epoch / update limits is hit."""
        if not lr > args.min_lr:
            return False
        epochs_remain = (
            epoch_itr.epoch < epoch_limit
            # allow resuming training from the final checkpoint
            or epoch_itr._next_epoch_itr is not None
        )
        if not epochs_remain:
            return False
        return trainer.get_num_updates() < update_limit

    while _keep_training():
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        run_validation = (
            not args.disable_validation
            and epoch_itr.epoch % args.validate_interval == 0
        )
        valid_losses = (
            validate(args, trainer, task, epoch_itr, valid_subsets)
            if run_validation
            else [None]
        )
        first_loss = valid_losses[0]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, first_loss)

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, first_loss)

        # early stop
        if should_stop_early(args, first_loss):
            logger.info(
                'early stop since valid performance hasn\'t improved for last {} runs'
                .format(args.patience))
            break

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.epoch,
            # sharded data: get train iterator for next epoch
            load_dataset=(os.pathsep in getattr(args, 'data', '')),
        )

    train_meter.stop()
    logger.info('done training in {:.1f} seconds'.format(train_meter.sum))
def main(cfg: DictConfig) -> None:
    """Train a model with the head-pruning options found in ``cfg.pruning``.

    Same overall flow as the plain cfg-based trainer: seed, build task, model,
    criterion, optional quantizer and trainer, restore the latest checkpoint
    and train epoch by epoch. In addition, ``model.apply_dropout`` is called
    with the pruning settings, a running ``global_step`` is threaded through
    ``train`` and a warning is issued when training ends before
    ``cfg.pruning.cooldown_steps`` steps were taken.

    Most of the startup logging of the original script is disabled in this
    variant; only the pruning settings are logged.

    Args:
        cfg: configuration object (or legacy ``argparse.Namespace``, which is
            converted with ``convert_namespace_to_omegaconf``).
    """
    if isinstance(cfg, argparse.Namespace):
        cfg = convert_namespace_to_omegaconf(cfg)

    utils.import_user_module(cfg.common)

    has_batch_size = (
        cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None
    )
    assert has_batch_size, "Must specify batch size either with --max-tokens or --batch-size"
    metrics.reset()

    seed = cfg.common.seed
    np.random.seed(seed)
    utils.set_torch_seed(cfg.common.seed)

    if distributed_utils.is_master(cfg.distributed_training):
        checkpoint_utils.verify_checkpoint_directory(cfg.checkpoint.save_dir)

    # Print args (disabled)
    # logger.info(cfg)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(cfg.task)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for subset_name in cfg.dataset.valid_subset.split(","):
        task.load_dataset(subset_name, combine=False, epoch=1)

    assert cfg.criterion, "Please specify criterion to train a model"

    # Build model and criterion
    pruning = cfg.pruning
    model = task.build_model(cfg.model)
    model.apply_dropout(pruning.num_of_heads, pruning.temperature)
    criterion = task.build_criterion(cfg.criterion)
    # Model / task / criterion / parameter-count logging is disabled here.

    # (optionally) Configure quantization
    quantizer = None
    if cfg.common.quantization_config_path is not None:
        quantizer = quantization_utils.Quantizer(
            config_path=cfg.common.quantization_config_path,
            max_epoch=cfg.optimization.max_epoch,
            max_update=cfg.optimization.max_update,
        )

    # Build trainer; only the non-Megatron trainer receives the quantizer and
    # the pruning options.
    if cfg.common.model_parallel_size != 1:
        trainer = MegatronTrainer(cfg, task, model, criterion)
    else:
        trainer = Trainer(cfg, task, model, criterion, quantizer,
                          cfg.pruning.dropout_lr, cfg.pruning.post)
    # Device-count and batch-size logging is disabled here.

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    _extra_state, epoch_itr = checkpoint_utils.load_checkpoint(
        cfg.checkpoint,
        trainer,
        # don't cache epoch iterators for sharded datasets
        disable_iterator_cache=task.has_sharded_data("train"),
    )
    # print(model.get_w())

    epoch_limit = cfg.optimization.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()

    global_step = 0
    # Settings that only matter for an enabled feature are shown as "N.A."
    # when that feature is off.
    logger.info(
        "tempereature: {}, num_of_heads: {}, cooldown_steps: {}, starting_temperature: {}, "\
        "starting_num_of_heads: {}, dropout_lr: {}".format(
            cfg.pruning.temperature,
            cfg.pruning.num_of_heads,
            cfg.pruning.cooldown_steps if cfg.pruning.annealing or cfg.pruning.reducing_heads else "N.A.",
            cfg.pruning.starting_temperature if cfg.pruning.annealing else "N.A.",
            cfg.pruning.starting_num_of_heads if cfg.pruning.reducing_heads else "N.A.",
            cfg.pruning.dropout_lr,
        ))

    while True:
        if not epoch_itr.next_epoch_idx <= epoch_limit:
            break

        if lr <= cfg.optimization.stop_min_lr:
            logger.info(
                f"stopping training because current learning rate ({lr}) is smaller "
                "than or equal to minimum learning rate "
                f"(--stop-min-lr={cfg.optimization.stop_min_lr})"
            )
            break

        # train for one epoch
        valid_losses, should_stop, global_step = train(
            cfg, trainer, task, epoch_itr, global_step)
        # print(model.get_w())
        if should_stop:
            break

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            # sharded data: get train iterator for next epoch
            load_dataset=task.has_sharded_data("train"),
            # don't cache epoch iterators for sharded datasets
            disable_iterator_cache=task.has_sharded_data("train"),
        )

    train_meter.stop()
    # logger.info("done training in {:.1f} seconds".format(train_meter.sum))

    # Training ended before the cooldown schedule could finish.
    schedule_active = cfg.pruning.annealing or cfg.pruning.reducing_heads
    if schedule_active and global_step < cfg.pruning.cooldown_steps:
        warnings.warn("It never cools down!!!")
def main(args, config=None, init_distributed=False):
    """Train a model, optionally reporting to an existing experiment tracker.

    When ``config`` is given, an ``ExistingExperiment`` is attached using
    ``config["api_key"]`` and ``config["experiment_key"]``; the run's
    arguments, parameter counts and final validation loss / learning rate are
    then logged to it in addition to the usual console output.

    Args:
        args: parsed command-line namespace; at least one of ``max_tokens``
            and ``max_sentences`` must be set.
        config: optional mapping with ``"api_key"`` and ``"experiment_key"``;
            falsy values disable experiment logging.
        init_distributed: when true, ``distributed_utils.distributed_init`` is
            called and its result is stored in ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    experiment = None
    if config:
        experiment = ExistingExperiment(
            api_key=config["api_key"],
            previous_experiment=config["experiment_key"],
            auto_output_logging=None,
        )

    assert (
        args.max_tokens is not None or args.max_sentences is not None
    ), "Must specify batch size either with --max-tokens or --max-sentences"

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    print(args)
    if experiment:
        experiment.log_parameters(
            vars(args), prefix="Device {} :: ".format(args.device_id))

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(","):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print("| model {}, criterion {}".format(args.arch, criterion.__class__.__name__))

    # Count once and reuse for both the console and the experiment log.
    num_params = sum(p.numel() for p in model.parameters())
    num_trained_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("| num. model params: {} (num. trained: {})".format(
        num_params, num_trained_params))
    if experiment:
        experiment.log_parameters(
            {
                "criterion": criterion.__class__.__name__,
                "num. model params": num_params,
                "num. trained params": num_trained_params,
            },
            prefix="Device {} :: ".format(args.device_id),
        )

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print("| training on {} GPUs".format(args.distributed_world_size))
    print("| max tokens per GPU = {} and max sentences per GPU = {}".format(
        args.max_tokens, args.max_sentences))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(",")

    # Defined up front because it is read after the loop: if the loop body
    # never runs (e.g. the restored checkpoint already meets a stop condition)
    # the final log_metrics call would otherwise raise NameError.
    valid_losses = [None]

    while (lr > args.min_lr and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update):
        # train for one epoch
        train(args, trainer, task, epoch_itr, experiment)

        if (not args.disable_validation
                and epoch_itr.epoch % args.validate_interval == 0):
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets, experiment)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                             valid_losses[0])

        # sharded data: get train iterator for next epoch
        reload_dataset = ":" in getattr(args, "data", "")
        epoch_itr = trainer.get_train_iterator(epoch_itr.epoch,
                                               load_dataset=reload_dataset)

    train_meter.stop()
    print("| done training in {:.1f} seconds".format(train_meter.sum))

    if experiment:
        experiment.log_metrics(
            {
                "valid_loss": valid_losses[0],
                "lr": lr
            },
            prefix="Device {} ".format(args.device_id),
        )
def main(args, init_distributed=False):
    """Jointly train an actor and a critic model.

    Both sides (args, task, model, criterion, trainer, epoch iterator, extra
    state) are obtained from ``get_ready``; each epoch is run by ``train_ac``
    and followed by per-side validation, LR stepping and checkpointing. Early
    stopping is decided from the actor's first validation loss only.

    Args:
        args: parsed command-line namespace; at least one of ``max_tokens``
            and ``max_sentences`` must be set, and ``actor_path`` /
            ``critic_path`` name the checkpoint directories that are verified.
        init_distributed: when true, ``distributed_utils.distributed_init`` is
            called and its result is stored in ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    has_batch_size = args.max_tokens is not None or args.max_sentences is not None
    assert has_batch_size, \
        'Must specify batch size either with --max-tokens or --max-sentences'
    metrics.reset()

    # Initialize CUDA and distributed training
    use_cuda = torch.cuda.is_available() and not args.cpu
    if use_cuda:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        # args.save_dir is not verified in this variant; the actor and critic
        # each have their own directory.
        checkpoint_utils.verify_checkpoint_directory(args.actor_path)
        checkpoint_utils.verify_checkpoint_directory(args.critic_path)

    # Print args
    logger.info(args)

    # Task, model, criterion, trainer and restored train iterator are prepared
    # per side by get_ready instead of being built inline.
    (actor_args, actor_task, _actor_model, _actor_criterion, actor_trainer,
     actor_epoch_itr, _actor_extra_state) = get_ready(args, 'a')
    # NOTE(review): the critic is also requested with 'a'. This looks like a
    # copy-paste of the actor line; confirm which tag get_ready expects for
    # the critic before changing it.
    (critic_args, critic_task, _critic_model, _critic_criterion, critic_trainer,
     critic_epoch_itr, _critic_extra_state) = get_ready(args, 'a')

    # Load valid dataset (we load training data below, based on the latest checkpoint).
    # NOTE(review): only actor_task loads the validation subsets here, yet
    # critic_task is validated below; presumably get_ready or a shared task
    # covers that. TODO confirm.
    for subset_name in args.valid_subset.split(','):
        actor_task.load_dataset(subset_name, combine=False, epoch=1)

    # Train until the learning rate gets too small
    epoch_limit = args.max_epoch or math.inf
    update_limit = args.max_update or math.inf
    actor_lr = actor_trainer.get_lr()
    critic_lr = critic_trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    while True:
        # Stop as soon as either side hits one of the limits.
        if not min(actor_lr, critic_lr) > args.min_lr:
            break
        next_epoch = max(actor_epoch_itr.next_epoch_idx,
                         critic_epoch_itr.next_epoch_idx)
        if not next_epoch <= epoch_limit:
            break
        updates_done = max(actor_trainer.get_num_updates(),
                           critic_trainer.get_num_updates())
        if not updates_done < update_limit:
            break

        # train for one epoch
        (actor_args, actor_trainer, actor_task, actor_epoch_itr,
         critic_args, critic_trainer, critic_task, critic_epoch_itr) = train_ac(
            actor_args, actor_trainer, actor_task, actor_epoch_itr,
            critic_args, critic_trainer, critic_task, critic_epoch_itr)

        validate_actor = (
            not args.disable_validation
            and actor_epoch_itr.epoch % args.validate_interval == 0
        )
        actor_valid_losses = (
            validate(actor_args, actor_trainer, actor_task, actor_epoch_itr,
                     valid_subsets)
            if validate_actor
            else [None]
        )

        validate_critic = (
            not args.disable_validation
            and critic_epoch_itr.epoch % args.validate_interval == 0
        )
        critic_valid_losses = (
            validate(critic_args, critic_trainer, critic_task, critic_epoch_itr,
                     valid_subsets)
            if validate_critic
            else [None]
        )

        # only use first validation loss to update the learning rate
        actor_lr = actor_trainer.lr_step(actor_epoch_itr.epoch, actor_valid_losses[0])
        critic_lr = critic_trainer.lr_step(critic_epoch_itr.epoch, critic_valid_losses[0])

        # save checkpoint
        if actor_epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(actor_args, actor_trainer,
                                             actor_epoch_itr, actor_valid_losses[0])
        if critic_epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(critic_args, critic_trainer,
                                             critic_epoch_itr, critic_valid_losses[0])

        # early stop (decided on the actor's validation loss)
        if should_stop_early(args, actor_valid_losses[0]):
            logger.info(
                'early stop since valid performance hasn\'t improved for last {} runs'
                .format(args.patience))
            break

        actor_epoch_itr = actor_trainer.get_train_iterator(
            actor_epoch_itr.next_epoch_idx,
            # sharded data: get train iterator for next epoch
            load_dataset=(os.pathsep in getattr(args, 'data', '')),
        )
        critic_epoch_itr = critic_trainer.get_train_iterator(
            critic_epoch_itr.next_epoch_idx,
            # sharded data: get train iterator for next epoch
            load_dataset=(os.pathsep in getattr(args, 'data', '')),
        )

    train_meter.stop()
    logger.info('done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False):
    """Train a model, optionally pretraining or evaluating the data actor.

    Depending on the flags on ``args`` this entry point will:

    * pretrain the data actor via ``trainer.pretrain_LASER`` (when
      ``pretrain_laser`` and ``pretrain_data_actor`` are set and
      ``data_actor == 'ave'``);
    * when ``compare_laser`` is set, score every training sample with the
      data actor, write the scores to disk, print the summed squared
      difference against the LASER score file, and return without training;
    * otherwise run the epoch loop until the learning rate, epoch count or
      update count reaches its limit.

    Args:
        args: parsed command-line namespace.
        init_distributed: when true, initialise distributed training and
            store the resulting rank on ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr, filtered_maxpos_indices = checkpoint_utils.load_checkpoint(
        args, trainer)

    # Pretrain the data actor with LASER scores; only the 'ave' (language)
    # data actor can be pretrained.
    if args.pretrain_laser and args.pretrain_data_actor and args.data_actor == 'ave':
        trainer.pretrain_LASER('en-ps.laser-score', epoch_itr)

    if args.compare_laser:
        # Compare the trained data actor's scores against the LASER labels.
        # Intended to be used only after the model has been trained.
        epoch_itr, indices = trainer.get_train_iterator(1)
        print('Number of Indices: ', len(indices))
        scores = collections.defaultdict(float)
        data_actor = trainer.data_actor
        itr = epoch_itr.next_epoch_itr(
            fix_batches_to_gpus=args.fix_batches_to_gpus,
            shuffle=False,
            offset=0,
            datasize=-1,
        )
        for sample in itr:
            sample = trainer._prepare_sample(sample)
            # Only the first entry of the prepared sample mapping is scored.
            sample = list(sample.values())[0]
            score = data_actor(sample).cpu().detach().numpy().tolist()
            sample_ids = sample['id'].data.cpu().numpy().ravel().tolist()
            for k, v in zip(sample_ids, score):
                scores[k] = float(v[0])

        scores = sorted(scores.items(), key=lambda x: x[0])
        print('Number of Indices in Scoring file: ', len(scores))

        # NOTE(review): hard-coded, machine-specific location of the score files.
        path = '/home/wtan12/multiDDS/'
        with open(path + 'en-ps.laser-score', 'r') as r:
            # splitlines() yields one entry per score whether or not the file
            # ends with a newline, so no real score is ever discarded.
            laser_score = r.read().splitlines()

        # NOTE: despite the label printed below, this accumulates the sum of
        # squared differences, not a coefficient of determination.
        r2 = 0.0
        with open(path + 'en-ps.dds_score', 'w') as f:
            for k, v in scores:
                f.write(str(v) + '\n')
                truth = float(laser_score[k])
                r2 += (truth - v)**2
        print('R2 Score compared to LASER file: ', r2)
        return

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')
    if args.eval_bleu:
        generator = task.build_generator(args)
        args.maximize_best_checkpoint_metric = True
    else:
        generator = None
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        epoch_itr = train(args, trainer, task, epoch_itr, generator, filtered_maxpos_indices)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets, generator)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        if ':' in getattr(args, 'data', ''):
            # sharded data: get train iterator for next epoch
            epoch_itr = trainer.get_train_iterator(epoch_itr.epoch)[0]
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False):
    """Load a trained model and print per-token gradient norms w.r.t. the input.

    For every sample of the training iterator this prints three
    tab-separated lines keyed by the sample id: ``S-`` (source string),
    ``T-`` (target string) and ``N-`` (the values returned by
    ``task.get_grad_wrt_input`` for the source positions between the first
    and last token).

    Args:
        args: parsed command-line namespace.
        init_distributed: when true, initialise distributed training and
            store the resulting rank on ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Device selection and seeding.
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    print(args)

    # Task and its two dictionaries (input tokens / labels).
    task = tasks.setup_task(args)
    src_dict = task.dictionary
    tgt_dict = task.label_dictionary

    # Validation splits are loaded up front.
    for split_name in args.valid_subset.split(','):
        task.load_dataset(split_name, combine=False, epoch=0)

    # NOTE(review): eval() runs arbitrary code taken from the
    # --model-overrides command-line value; only use with trusted input.
    overrides = eval(args.model_overrides)
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=overrides,
        task=task,
    )
    # Only the first model of the ensemble is used.
    model = models[0]
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    n_params = sum(p.numel() for p in model.parameters())
    n_trained = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('| num. model params: {} (num. trained: {})'.format(n_params, n_trained))

    trainer = Trainer(args, task, model, criterion)
    epoch_itr, filtered_indices = trainer.get_train_iterator(epoch=0)

    # Batches are grouped one at a time; a single pass over the data is made.
    update_freq = 1
    num_reset = 1
    datasize = -1
    for reset_idx in range(num_reset):
        print("resetting at step", reset_idx)
        # Initialize data iterator
        start_offset = reset_idx * (args.update_language_sampling * args.update_freq[0] + 1)
        itr = epoch_itr.next_epoch_itr(
            fix_batches_to_gpus=args.fix_batches_to_gpus,
            shuffle=(epoch_itr.epoch >= args.curriculum),
            offset=start_offset,
            datasize=datasize,
        )
        itr = iterators.GroupedIterator(itr, update_freq)
        progress = progress_bar.build_progress_bar(
            args, itr, epoch_itr.epoch, no_progress_bar='simple',
        )
        for _, samples in enumerate(progress, start=epoch_itr.iterations_in_epoch):
            for raw_sample in samples:
                sample = trainer._prepare_sample(raw_sample)
                grad_norm = task.get_grad_wrt_input(sample, model, criterion)
                for row, sample_id in enumerate(sample['id'].tolist()):
                    # Labels are shifted by the number of special symbols
                    # before being mapped back to strings.
                    target_tokens = sample['target'][row, :].int().cpu() + tgt_dict.nspecial
                    src_tokens = utils.strip_pad(
                        sample['net_input']['src_tokens'][row, :], src_dict.pad())
                    # The first source token is skipped in the printed string.
                    src_str = src_dict.string(src_tokens[1:])
                    target_str = tgt_dict.string(target_tokens)
                    print('S-{}\t{}'.format(sample_id, src_str))
                    print('T-{}\t{}'.format(sample_id, target_str))
                    row_norms = grad_norm[row, :].data.float().cpu().numpy()
                    # Drop the first and last source positions from the output.
                    inner_norms = row_norms[1:len(src_tokens) - 1]
                    print('N-{}\t{}'.format(sample_id, " ".join(map(str, inner_norms))))
def main(args):
    """Train a model and record per-epoch train/validation statistics.

    Besides the usual training loop, every epoch that yields both train and
    validation statistics is appended to ``train_logs.csv`` in
    ``args.save_dir`` and to ``train_logs_backup.csv`` in
    ``args.jason_log_dir``, and loss / perplexity line plots are rewritten
    in ``args.jason_log_dir``.

    Args:
        args: parsed command-line namespace.
    """
    utils.import_user_module(args)

    assert (
        args.max_tokens is not None or args.max_sentences is not None
    ), "Must specify batch size either with --max-tokens or --max-sentences"
    metrics.reset()

    np.random.seed(args.seed)
    utils.set_torch_seed(args.seed)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)
        checkpoint_utils.verify_checkpoint_directory(args.jason_log_dir)

    # Print args
    logger.info(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(","):
        task.load_dataset(valid_sub_split, combine=False, epoch=1)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info("task: {} ({})".format(args.task, task.__class__.__name__))
    logger.info("model: {} ({})".format(args.arch, model.__class__.__name__))
    logger.info(
        "criterion: {} ({})".format(args.criterion, criterion.__class__.__name__)
    )
    logger.info(
        "num. model params: {} (num. trained: {})".format(
            sum(p.numel() for p in model.parameters()),
            sum(p.numel() for p in model.parameters() if p.requires_grad),
        )
    )

    # (optionally) Configure quantization
    if args.quantization_config_path is not None:
        quantizer = quantization_utils.Quantizer(
            config_path=args.quantization_config_path,
            max_epoch=args.max_epoch,
            max_update=args.max_update,
        )
    else:
        quantizer = None

    # Build trainer
    if args.model_parallel_size == 1:
        trainer = Trainer(args, task, model, criterion, quantizer)
    else:
        trainer = MegatronTrainer(args, task, model, criterion)

    logger.info(
        "training on {} devices (GPUs/TPUs)".format(args.distributed_world_size)
    )
    logger.info(
        "max tokens per GPU = {} and max sentences per GPU = {}".format(
            args.max_tokens, args.max_sentences
        )
    )

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(
        args,
        trainer,
        # don't cache epoch iterators for sharded datasets
        disable_iterator_cache=task.has_sharded_data("train"),
    )

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()

    # Per-epoch history used for the CSV logs and the plots.
    updates_list = []
    train_ppl_list = []
    train_loss_list = []
    val_ppl_list = []
    val_loss_list = []
    train_uid_loss_list = []
    val_uid_loss_list = []

    # One header for both CSV files so it always matches the 7-field rows.
    csv_header = "updates,train_loss,train_ppl,val_loss,val_ppl,train_uid_loss,val_uid_loss"
    backup_writefile = os.path.join(args.jason_log_dir, 'train_logs_backup.csv')
    run_name = args.jason_log_dir.split('/')[-1]

    # The context manager guarantees the main log is flushed and closed even
    # if training raises.
    with open(os.path.join(args.save_dir, 'train_logs.csv'), 'w') as log_writer:
        log_writer.write(f"{csv_header}\n")
        # The backup file is appended to (never truncated), so logs from
        # earlier runs in the same directory are preserved.
        with open(backup_writefile, 'a') as backup_writer:
            backup_writer.write(f"{csv_header}\n")

        while lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch:
            # train for one epoch
            valid_losses, should_stop, train_stats, valid_stats = train(args, trainer, task, epoch_itr)
            logger.debug("epoch stats: valid=%s train=%s", valid_stats, train_stats)

            if train_stats and valid_stats:
                updates_list.append(train_stats['num_updates'])
                train_loss_list.append(train_stats['loss'])
                train_ppl_list.append(train_stats['ppl'])
                val_loss_list.append(valid_stats['loss'])
                val_ppl_list.append(valid_stats['ppl'])
                # -1 is the placeholder when no uid loss was reported.
                if 'uid_loss' not in train_stats:
                    train_stats['uid_loss'] = -1
                    valid_stats['uid_loss'] = -1
                train_uid_loss_list.append(train_stats['uid_loss'])
                val_uid_loss_list.append(valid_stats['uid_loss'])

                log_line = f"{train_stats['num_updates']},{train_stats['loss']},{train_stats['ppl']},{valid_stats['loss']},{valid_stats['ppl']},{train_stats['uid_loss']},{valid_stats['uid_loss']}"
                log_writer.write(f"{log_line}\n")
                # Reopen-and-append so each row reaches disk immediately.
                with open(backup_writefile, 'a') as backup_writer:
                    backup_writer.write(f"{log_line}\n")

                best_val_loss = min(val_loss_list)
                best_val_loss_idx = val_loss_list.index(best_val_loss)
                updates_to_best_val_loss = updates_list[best_val_loss_idx]
                train_loss_at_best_val_loss = train_loss_list[best_val_loss_idx]
                best_val_ppl = val_ppl_list[best_val_loss_idx]

                jasons_vis.plot_jasons_lineplot(
                    x_list=updates_list,
                    y_list_list=[train_loss_list, val_loss_list, train_uid_loss_list, val_uid_loss_list],
                    y_labels_list=['train', 'dev', 'train uid', 'dev uid'],
                    x_ax_label="Updates",
                    y_ax_label="Loss",
                    title=f"dev_l={best_val_loss} updates={updates_to_best_val_loss} train_l={train_loss_at_best_val_loss}",
                    output_png_path=os.path.join(args.jason_log_dir, f"{run_name}_loss.png"),
                )
                jasons_vis.plot_jasons_lineplot(
                    x_list=updates_list,
                    y_list_list=[train_ppl_list, val_ppl_list],
                    y_labels_list=['train', 'dev'],
                    x_ax_label="Updates",
                    y_ax_label="Perplexity",
                    # Report the perplexity (not the loss) at the best epoch.
                    title=f" best_val_ppl={best_val_ppl} " + args.jason_log_dir[:20],
                    output_png_path=os.path.join(args.jason_log_dir, f"{run_name}_perplexity.png"),
                )

            if should_stop:
                break

            # only use first validation loss to update the learning rate
            lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

            epoch_itr = trainer.get_train_iterator(
                epoch_itr.next_epoch_idx,
                # sharded data: get train iterator for next epoch
                load_dataset=task.has_sharded_data("train"),
                # don't cache epoch iterators for sharded datasets
                disable_iterator_cache=task.has_sharded_data("train"),
            )

    train_meter.stop()
    logger.info("done training in {:.1f} seconds".format(train_meter.sum))
def main(args, init_distributed=False):
    """Train a SuperTransformer or a single SubTransformer.

    Besides training, the function can exit early after profiling FLOPs
    (``args.profile_flops``) or measuring latency (``args.latcpu`` /
    ``args.latgpu``), and can validate one SubTransformer configuration
    before the training loop (``args.validate_subtransformer``).

    Args:
        args: parsed command-line namespace.
        init_distributed: when true, initialise distributed training and
            store the resulting rank on ``args.distributed_rank``.

    Raises:
        NotImplementedError: if ``args.arch`` contains neither ``'iwslt'``
            nor ``'wmt'``.
    """
    utils.import_user_module(args)
    utils.handle_save_path(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Explicit CUDA device selection is disabled here; only seeding and
    # (optional) distributed initialisation are performed.
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    print(f"| Configs: {args}")

    # Task, e.g. translation or language modeling.
    task = tasks.setup_task(args)

    # Validation splits are loaded up front; training data is loaded later,
    # based on the latest checkpoint.
    for split_name in args.valid_subset.split(','):
        task.load_dataset(split_name, combine=False, epoch=0)

    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(
        f"| Model: {args.arch} \n| Criterion: {criterion.__class__.__name__}")

    training_sub = args.train_subtransformer

    # Log architecture
    if training_sub:
        print(" \n\n\t\tWARNING!!! Training one single SubTransformer\n\n")
        print(
            f"| SubTransformer Arch: {utils.get_subtransformer_config(args)} \n"
        )
    else:
        print(" \n\n\t\tWARNING!!! Training SuperTransformer\n\n")
        print(f"| SuperTransformer Arch: {model} \n")

    # Log model size
    if training_sub:
        print(
            f"| SubTransformer size (without embedding weights): {model.get_sampled_params_numel(utils.get_subtransformer_config(args))}"
        )
        embed_size = args.decoder_embed_dim_subtransformer * len(task.tgt_dict)
        print(f"| Embedding layer size: {embed_size} \n")
    else:
        # named_parameters() is used instead of state_dict() because the
        # latter would add encoder.version and decoder.version, which should
        # not be counted.
        model_s = sum(
            param.numel()
            for name, param in model.named_parameters()
            if 'embed' not in name
        )
        print(
            f"| SuperTransofmer model size (without embedding weights): {model_s}"
        )
        print(
            f"| Embedding layer size: {sum(p.numel() for p in model.parameters() if p.requires_grad) - model_s} \n"
        )

    # Length of the dummy input used for profiling: the average sentence
    # length is 23 for iwslt and 30 for wmt.
    dummy_sentence_length_dict = {'iwslt': 23, 'wmt': 30}
    for corpus in ('iwslt', 'wmt'):
        if corpus in args.arch:
            dummy_sentence_length = dummy_sentence_length_dict[corpus]
            break
    else:
        raise NotImplementedError

    filler = [7] * (dummy_sentence_length - 1)
    dummy_src_tokens = [2] + filler
    dummy_prev = filler + [2]

    # Profile the overall FLOPs number, then exit.
    if args.profile_flops:
        import torchprofile
        config_subtransformer = utils.get_subtransformer_config(args)
        model.set_sample_config(config_subtransformer)
        model.profile(mode=True)
        profile_inputs = (
            torch.tensor([dummy_src_tokens], dtype=torch.long),
            torch.tensor([30]),
            torch.tensor([dummy_prev], dtype=torch.long),
        )
        macs = torchprofile.profile_macs(model, args=profile_inputs)
        model.profile(mode=False)

        decoder_dim = config_subtransformer['decoder']['decoder_embed_dim']
        last_layer_macs = decoder_dim * dummy_sentence_length * len(task.tgt_dict)

        print(f"| Total FLOPs: {macs * 2}")
        print(f"| Last layer FLOPs: {last_layer_macs * 2}")
        print(
            f"| Total FLOPs without last layer: {(macs - last_layer_macs) * 2} \n"
        )
        exit(0)

    # NOTE(review): the extent of this context is reconstructed from
    # whitespace-mangled source; it is assumed to cover the rest of the run.
    with torch.autograd.set_detect_anomaly(True):
        # Build trainer
        trainer = Trainer(args, task, model, criterion)
        print(f"| Training on {args.distributed_world_size} GPUs")
        print(
            f"| Max tokens per GPU = {args.max_tokens} and max sentences per GPU = {None} \n"
        )

        # Measure model latency; the program exits after profiling latency.
        if args.latcpu or args.latgpu:
            utils.measure_latency(args, model, dummy_src_tokens, dummy_prev)
            exit(0)

        # Load the latest checkpoint if one is available and restore the
        # corresponding train iterator.
        extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

        # Evaluate the SubTransformer
        if args.validate_subtransformer:
            config = utils.get_subtransformer_config(args)
            trainer.set_sample_config(config)
            valid_loss = validate(args, trainer, task, epoch_itr, ['valid'],
                                  'SubTransformer')
            print(f"| SubTransformer validation loss:{valid_loss}")

        # Loop boundaries
        max_epoch = args.max_epoch or math.inf
        max_update = args.max_update or math.inf
        lr = trainer.get_lr()

        train_meter = StopwatchMeter()
        train_meter.start()
        valid_subsets = args.valid_subset.split(',')

        represent_configs = utils.get_represent_configs(args)

        # Main training loop
        while (lr > args.stop_min_lr
               and epoch_itr.epoch < max_epoch
               and trainer.get_num_updates() < max_update):
            # train for one epoch
            train(args, trainer, task, epoch_itr)

            validate_now = (not args.disable_validation
                            and epoch_itr.epoch % args.validate_interval == 0)
            if validate_now:
                # Each representative config is validated in turn; the losses
                # of the last one are the ones used below.
                for arch_name, arch_config in represent_configs.items():
                    trainer.set_sample_config(config=arch_config)
                    valid_losses = validate(args, trainer, task, epoch_itr,
                                            valid_subsets,
                                            sampled_arch_name=arch_name)
            else:
                valid_losses = [None]

            # Update the best loss and get the current lr; the real lr
            # scheduling is done in trainer.train_step().
            lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

            # save checkpoint epoch level
            if epoch_itr.epoch % args.save_interval == 0:
                checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                                 valid_losses[0])

        train_meter.stop()
        print('| Done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False):
    """Run the training loop with optional quantization and model parallelism.

    Training proceeds epoch by epoch until the learning rate drops to
    ``args.min_lr``, the epoch limit is passed, ``should_stop_early``
    fires, or the number of updates reaches ``args.max_update``.

    Args:
        args: parsed command-line namespace.
        init_distributed: when true, initialise distributed training and
            store the resulting rank on ``args.distributed_rank``.
    """
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'
    metrics.reset()

    # Device selection, seeding and distributed setup.
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    logger.info(args)

    # Task, e.g. translation or language modeling.
    task = tasks.setup_task(args)

    # Validation splits are loaded now; training data is loaded later, based
    # on the latest checkpoint.
    for split_name in args.valid_subset.split(','):
        task.load_dataset(split_name, combine=False, epoch=1)

    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    total_params = sum(p.numel() for p in model.parameters())
    trained_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info('num. model params: {} (num. trained: {})'.format(total_params, trained_params))

    # Quantization is optional and only configured when a config path is given.
    quantizer = None
    if args.quantization_config_path is not None:
        quantizer = quantization_utils.Quantizer(
            config_path=args.quantization_config_path,
            max_epoch=args.max_epoch,
            max_update=args.max_update,
        )

    # The Megatron trainer is used whenever model parallelism is requested.
    if args.model_parallel_size != 1:
        trainer = MegatronTrainer(args, task, model, criterion)
    else:
        trainer = Trainer(args, task, model, criterion, quantizer)

    logger.info('training on {} GPUs'.format(args.distributed_world_size))
    logger.info(
        'max tokens per GPU = {} and max sentences per GPU = {}'.format(
            args.max_tokens,
            args.max_sentences,
        ))

    # Restore the latest checkpoint, if any, along with its train iterator.
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # A falsy limit (0 / None) means "no limit".
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()

    while lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch:
        # One epoch of training; validation losses come back from train().
        valid_losses = train(args, trainer, task, epoch_itr, max_update)

        if should_stop_early(args, valid_losses[0]):
            break
        if trainer.get_num_updates() >= max_update:
            break

        # Only the first validation loss drives the learning-rate schedule.
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Sharded data (several paths in args.data): reload for the next epoch.
        is_sharded = os.pathsep in getattr(args, 'data', '')
        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            load_dataset=is_sharded,
        )

    train_meter.stop()
    logger.info('done training in {:.1f} seconds'.format(train_meter.sum))
def main(cfg: DictConfig) -> None:
    """Build task, model and trainer, then run Gibbs sampling for several K.

    No training loop is run here: after the checkpoint is loaded,
    ``gibbs_sampling`` is invoked once for each K in (28, 32, 36) with
    ``early_stop_step=12``.

    Args:
        cfg: configuration; an ``argparse.Namespace`` is converted to an
            OmegaConf config first.
    """
    if isinstance(cfg, argparse.Namespace):
        cfg = convert_namespace_to_omegaconf(cfg)

    utils.import_user_module(cfg.common)

    dataset_cfg = cfg.dataset
    assert (
        dataset_cfg.max_tokens is not None or dataset_cfg.batch_size is not None
    ), "Must specify batch size either with --max-tokens or --batch-size"
    metrics.reset()

    seed = cfg.common.seed
    np.random.seed(seed)
    utils.set_torch_seed(seed)

    if distributed_utils.is_master(cfg.distributed_training):
        checkpoint_utils.verify_checkpoint_directory(cfg.checkpoint.save_dir)

    logger.info(cfg)

    # Task, e.g. translation or language modeling.
    task = tasks.setup_task(cfg.task)

    # Validation splits are loaded now; training data is loaded later, based
    # on the latest checkpoint.
    for split_name in cfg.dataset.valid_subset.split(","):
        task.load_dataset(split_name, combine=False, epoch=1)

    assert cfg.criterion, "Please specify criterion to train a model"

    model = task.build_model(cfg.model)
    criterion = task.build_criterion(cfg.criterion)
    logger.info(model)
    logger.info("task: {}".format(task.__class__.__name__))
    logger.info("model: {}".format(model.__class__.__name__))
    logger.info("criterion: {}".format(criterion.__class__.__name__))
    total_params = sum(p.numel() for p in model.parameters())
    trained_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info("num. model params: {} (num. trained: {})".format(
        total_params, trained_params))

    # Quantization is optional and only configured when a config path is given.
    quantizer = None
    if cfg.common.quantization_config_path is not None:
        quantizer = quantization_utils.Quantizer(
            config_path=cfg.common.quantization_config_path,
            max_epoch=cfg.optimization.max_epoch,
            max_update=cfg.optimization.max_update,
        )

    # The Megatron trainer is used whenever model parallelism is requested.
    if cfg.common.model_parallel_size != 1:
        trainer = MegatronTrainer(cfg, task, model, criterion)
    else:
        trainer = Trainer(cfg, task, model, criterion, quantizer)

    logger.info("training on {} devices (GPUs/TPUs)".format(
        cfg.distributed_training.distributed_world_size))
    logger.info("max tokens per GPU = {} and batch size per GPU = {}".format(
        cfg.dataset.max_tokens,
        cfg.dataset.batch_size,
    ))

    # Restore the latest checkpoint, if any, along with its train iterator.
    # Epoch iterators are not cached for sharded datasets.
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(
        cfg.checkpoint,
        trainer,
        disable_iterator_cache=task.has_sharded_data("train"),
    )

    # Alternative analysis that is currently disabled:
    # scores, sparsities, all_head_masks = mask_heads(cfg, task, trainer, epoch_itr, model, exact_pruning=True)
    for k in (28, 32, 36):
        gibbs_sampling(cfg, task, trainer, epoch_itr, model,
                       early_stop_step=12, K=k)
def main(cfg: DictConfig) -> None:
    """Train a model described by an OmegaConf configuration.

    Sets up the task, model, criterion and trainer, restores the latest
    checkpoint if one exists, and then trains epoch by epoch until the
    learning rate drops to ``cfg.optimization.min_lr``, the epoch limit is
    passed, or ``train`` signals that training should stop.

    Args:
        cfg: configuration; an ``argparse.Namespace`` is converted to an
            OmegaConf config first.
    """
    if isinstance(cfg, argparse.Namespace):
        cfg = convert_namespace_to_omegaconf(cfg)

    utils.import_user_module(cfg.common)

    assert (
        cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None
    ), "Must specify batch size either with --max-tokens or --batch-size"
    metrics.reset()

    np.random.seed(cfg.common.seed)
    utils.set_torch_seed(cfg.common.seed)

    if distributed_utils.is_master(cfg.distributed_training):
        checkpoint_utils.verify_checkpoint_directory(cfg.checkpoint.save_dir)

    # Print args
    logger.info(cfg)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(cfg.task)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in cfg.dataset.valid_subset.split(","):
        task.load_dataset(valid_sub_split, combine=False, epoch=1)

    assert cfg.criterion, "Please specify criterion to train a model"

    # Build model and criterion
    model = task.build_model(cfg.model)
    criterion = task.build_criterion(cfg.criterion)
    logger.info(model)
    logger.info("task: {}".format(task.__class__.__name__))
    logger.info("model: {}".format(model.__class__.__name__))
    # The format string previously carried a stray ")" after the placeholder.
    logger.info("criterion: {}".format(criterion.__class__.__name__))
    logger.info("num. model params: {} (num. trained: {})".format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # (optionally) Configure quantization
    if cfg.common.quantization_config_path is not None:
        quantizer = quantization_utils.Quantizer(
            config_path=cfg.common.quantization_config_path,
            max_epoch=cfg.optimization.max_epoch,
            max_update=cfg.optimization.max_update,
        )
    else:
        quantizer = None

    # Build trainer
    if cfg.common.model_parallel_size == 1:
        trainer = Trainer(cfg, task, model, criterion, quantizer)
    else:
        trainer = MegatronTrainer(cfg, task, model, criterion)

    logger.info("training on {} devices (GPUs/TPUs)".format(
        cfg.distributed_training.distributed_world_size))
    logger.info("max tokens per GPU = {} and batch size per GPU = {}".format(
        cfg.dataset.max_tokens,
        cfg.dataset.batch_size,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(
        cfg.checkpoint,
        trainer,
        # don't cache epoch iterators for sharded datasets
        disable_iterator_cache=task.has_sharded_data("train"),
    )

    # A falsy epoch limit (0 / None) means "no limit".
    max_epoch = cfg.optimization.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()
    while lr > cfg.optimization.min_lr and epoch_itr.next_epoch_idx <= max_epoch:
        # train for one epoch
        valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
        if should_stop:
            break

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            # sharded data: get train iterator for next epoch
            load_dataset=task.has_sharded_data("train"),
            # don't cache epoch iterators for sharded datasets
            disable_iterator_cache=task.has_sharded_data("train"),
        )
    train_meter.stop()
    logger.info("done training in {:.1f} seconds".format(train_meter.sum))
def main(args, init_distributed=False):
    """Fine-tune a model with ``lm_head`` parameters frozen and early stopping.

    Training runs until the epoch or update limit is reached, or until the
    first validation loss has been worse than the best recorded one more
    than ``args.early_stop`` consecutive times (when ``early_stop > 0``).

    Args:
        args: parsed command-line namespace.
        init_distributed: when true, initialise distributed training and
            store the resulting rank on ``args.distributed_rank``.
    """
    global fb_pathmgr_registerd

    utils.import_user_module(args)

    # Optional path manager: registered at most once, skipped when the
    # module is not importable.
    try:
        from fairseq.fb_pathmgr import fb_pathmgr
        if not fb_pathmgr_registerd:
            fb_pathmgr.register()
            fb_pathmgr_registerd = True
    except ImportError:
        pass

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Device selection, seeding and distributed setup.
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    print(args)

    # Task, e.g. translation or language modeling.
    task = tasks.setup_task(args)

    # Validation splits are loaded now; training data is loaded later, based
    # on the latest checkpoint.
    for split_name in args.valid_subset.split(','):
        task.load_dataset(split_name, combine=False, epoch=0)

    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    total_params = sum(p.numel() for p in model.parameters())
    trained_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('| num. model params: {} (num. trained: {})'.format(total_params, trained_params))

    # Freeze parameters that are unused during fine-tuning. This is ad hoc
    # for fine-tuning and should be turned off for BERT pretraining.
    for param_name, param in model.named_parameters():
        if "lm_head" in param_name:
            param.requires_grad = False

    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Restore the latest checkpoint, if any, along with its train iterator.
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # A falsy limit (0 / None) means "no limit".
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    # The counter of consecutive non-improving validations is stored as an
    # attribute on the save_checkpoint function itself.
    save_ckpt = checkpoint_utils.save_checkpoint
    if not hasattr(save_ckpt, 'not_best'):
        save_ckpt.not_best = 0

    while epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        print('Start training')
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        validate_now = (not args.disable_validation
                        and epoch_itr.epoch % args.validate_interval == 0)
        if validate_now:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
            if args.early_stop > 0:
                got_worse = (hasattr(save_ckpt, 'best')
                             and valid_losses[0] > save_ckpt.best)
                if got_worse:
                    save_ckpt.not_best += 1
                    print("| Not the best ckpt... not best:", save_ckpt.not_best)
                    if save_ckpt.not_best > args.early_stop:
                        print("| Early stop...")
                        break
                else:
                    save_ckpt.not_best = 0
        else:
            valid_losses = [None]

        # Only the first validation loss drives the learning-rate schedule.
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        # Sharded data (':' in args.data): reload the dataset for the next epoch.
        reload_dataset = ':' in getattr(args, 'data', '')
        epoch_itr = trainer.get_train_iterator(epoch_itr.epoch, load_dataset=reload_dataset)

    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args):
    """Run the complete training procedure configured by ``args``.

    Seeds NumPy and torch, sets up the task, model, criterion, optional
    quantizer and trainer, restores the latest checkpoint through
    ``checkpoint_utils.load_checkpoint`` and then trains one epoch at a
    time. The loop ends when the learning rate is no longer above
    ``args.min_lr``, when the next epoch index exceeds ``args.max_epoch``
    (if one is set), or when ``train`` reports that training should stop.
    """
    utils.import_user_module(args)

    assert (
        args.max_tokens is not None or args.max_sentences is not None
    ), "Must specify batch size either with --max-tokens or --max-sentences"
    metrics.reset()

    np.random.seed(args.seed)
    utils.set_torch_seed(args.seed)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Log the full configuration
    logger.info(args)

    # Set up the task (e.g. translation, language modeling, ...)
    task = tasks.setup_task(args)

    # Only the validation splits are loaded here; the training data is
    # loaded below, based on the latest checkpoint.
    for split_name in args.valid_subset.split(","):
        task.load_dataset(split_name, combine=False, epoch=1)

    # Model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info("task: {} ({})".format(args.task, task.__class__.__name__))
    logger.info("model: {} ({})".format(args.arch, model.__class__.__name__))
    logger.info(
        "criterion: {} ({})".format(args.criterion, criterion.__class__.__name__)
    )
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(
        p.numel() for p in model.parameters() if p.requires_grad
    )
    logger.info(
        "num. model params: {} (num. trained: {})".format(
            total_params, trainable_params
        )
    )

    # A quantizer is only created when a quantization config path was given
    quantizer = (
        quantization_utils.Quantizer(
            config_path=args.quantization_config_path,
            max_epoch=args.max_epoch,
            max_update=args.max_update,
        )
        if args.quantization_config_path is not None
        else None
    )

    # Pick the trainer implementation from the model-parallel size
    if args.model_parallel_size != 1:
        trainer = MegatronTrainer(args, task, model, criterion)
    else:
        trainer = Trainer(args, task, model, criterion, quantizer)

    logger.info(
        "training on {} devices (GPUs/TPUs)".format(args.distributed_world_size)
    )
    logger.info(
        "max tokens per GPU = {} and max sentences per GPU = {}".format(
            args.max_tokens, args.max_sentences
        )
    )

    # Restore the latest checkpoint (if any) together with its train iterator
    _extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    stopwatch = meters.StopwatchMeter()
    stopwatch.start()
    while lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch:
        # One full epoch of training
        valid_losses, should_stop = train(args, trainer, task, epoch_itr)
        if should_stop:
            break

        # Only the first validation loss drives the learning-rate update
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            # sharded data: get train iterator for next epoch
            load_dataset=task.has_sharded_data("train"),
        )
    stopwatch.stop()
    logger.info("done training in {:.1f} seconds".format(stopwatch.sum))
def main(
    args,
    init_distributed=False,
    after_distributed_init_fn: Optional[
        Callable[[argparse.Namespace], argparse.Namespace]
    ] = None,
):
    """Run the complete training procedure configured by ``args``.

    Args:
        args: parsed command-line namespace. Either ``max_tokens`` or
            ``max_sentences`` must be set.
        init_distributed: when true, distributed training is initialized via
            ``distributed_utils.distributed_init`` and the resulting rank is
            stored in ``args.distributed_rank``.
        after_distributed_init_fn: optional callback invoked with ``args``
            right after distributed initialization; its return value
            replaces ``args``.

    The loop ends when the learning rate is no longer above ``args.min_lr``,
    when the next epoch index exceeds ``args.max_epoch`` (if one is set), or
    when ``train`` reports that training should stop.
    """
    utils.import_user_module(args)

    assert (
        args.max_tokens is not None or args.max_sentences is not None
    ), "Must specify batch size either with --max-tokens or --max-sentences"
    metrics.reset()

    # Initialize CUDA and distributed training
    if (
        torch.cuda.is_available()
        and not args.cpu
        and not getattr(args, "tpu", False)
    ):
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    utils.set_torch_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)
        # NOTE(review): the callback is assumed to belong to the
        # init_distributed branch (original indentation was lost) - confirm.
        if after_distributed_init_fn:
            args = after_distributed_init_fn(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    logger.info(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(","):
        task.load_dataset(valid_sub_split, combine=False, epoch=1)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info(
        "model {}, criterion {}".format(args.arch, criterion.__class__.__name__)
    )
    logger.info(
        "num. model params: {} (num. trained: {})".format(
            sum(p.numel() for p in model.parameters()),
            sum(p.numel() for p in model.parameters() if p.requires_grad),
        )
    )

    # (optionally) Configure quantization
    if args.quantization_config_path is not None:
        quantizer = quantization_utils.Quantizer(
            config_path=args.quantization_config_path,
            max_epoch=args.max_epoch,
            max_update=args.max_update,
        )
    else:
        quantizer = None

    # Build trainer
    if args.model_parallel_size == 1:
        trainer = Trainer(args, task, model, criterion, quantizer)
    else:
        trainer = MegatronTrainer(args, task, model, criterion)

    # Logged once; the device-count message used to be emitted twice in a row.
    logger.info(
        "training on {} devices (GPUs/TPUs)".format(args.distributed_world_size)
    )
    logger.info(
        "max tokens per GPU = {} and max sentences per GPU = {}".format(
            args.max_tokens, args.max_sentences
        )
    )

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)
    # Read the flag with getattr, like the CUDA guard above, so a namespace
    # without a ``tpu`` attribute is treated as non-TPU instead of raising.
    if getattr(args, "tpu", False):
        import torch_xla.core.xla_model as xm

        xm.rendezvous("load_checkpoint")  # wait for all workers
        xm.mark_step()

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()

    experiment_path = args.mhr_experiment  # path for experiment configuration
    # State threaded through every call to train() and updated from its
    # return value, so it carries over from one epoch to the next.
    total_samples = 0
    restore = {
        'enc_self_attn': None,
        'dec_self_attn': None,
        'dec_enc_attn': None,
    }
    last_epoch_num = {
        'enc_self_attn': 0,
        'dec_self_attn': 0,
        'dec_enc_attn': 0,
    }
    while lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch:
        # train for one epoch
        (
            valid_losses,
            should_stop,
            total_samples,
            restore,
            last_epoch_num,
        ) = train(
            args,
            trainer,
            task,
            epoch_itr,
            model,
            experiment_path,
            total_samples=total_samples,
            restore=restore,
            last_epoch_num=last_epoch_num,
        )
        if should_stop:
            break

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            # sharded data: get train iterator for next epoch
            load_dataset=(os.pathsep in getattr(args, "data", "")),
        )
    train_meter.stop()
    logger.info("done training in {:.1f} seconds".format(train_meter.sum))
def main(args, init_distributed=False):
    """Train a model from start to finish as configured by ``args``.

    Selects the CUDA device when available, seeds NumPy and torch,
    optionally initializes distributed training, builds the task, model,
    criterion and trainer, restores the latest checkpoint and then runs the
    epoch loop. The loop continues while the learning rate is above
    ``args.min_lr``, the epoch is below ``args.max_epoch`` and the number of
    updates is below ``args.max_update`` (each limit is unbounded when
    unset). Validation and checkpoint saving happen at the intervals given
    by ``args.validate_interval`` and ``args.save_interval``.
    """
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # CUDA device selection, seeding and (optional) distributed setup
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Dump the configuration
    print(args)

    # Set up the task (e.g. translation, language modeling, ...)
    task = tasks.setup_task(args)

    # Only the validation splits are loaded here; the training data is
    # loaded below, based on the latest checkpoint.
    for split_name in args.valid_subset.split(','):
        task.load_dataset(split_name, combine=False, epoch=0)

    # Model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(
        args.arch, criterion.__class__.__name__))
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(
        p.numel() for p in model.parameters() if p.requires_grad
    )
    print('| num. model params: {} (num. trained: {})'.format(
        total_params, trainable_params))

    # Trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Restore the latest checkpoint (if any) together with its train iterator
    _extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    stopwatch = StopwatchMeter()
    stopwatch.start()
    valid_subsets = args.valid_subset.split(',')
    while (
        lr > args.min_lr
        and epoch_itr.epoch < max_epoch
        and trainer.get_num_updates() < max_update
    ):
        # One full epoch of training
        train(args, trainer, task, epoch_itr)

        # Validate only on the configured interval; otherwise there is no loss
        validate_now = (
            not args.disable_validation
            and epoch_itr.epoch % args.validate_interval == 0
        )
        valid_losses = (
            validate(args, trainer, task, epoch_itr, valid_subsets)
            if validate_now
            else [None]
        )

        # Only the first validation loss drives the learning-rate update
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Save a checkpoint on the configured interval
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(
                args, trainer, epoch_itr, valid_losses[0])

        # sharded data: get train iterator for next epoch
        is_sharded = ':' in getattr(args, 'data', '')
        epoch_itr = trainer.get_train_iterator(
            epoch_itr.epoch, load_dataset=is_sharded)
    stopwatch.stop()
    print('| done training in {:.1f} seconds'.format(stopwatch.sum))
def main(args, init_distributed=False):
    """Train a model as configured by ``args``, with optional downstream evaluation.

    Args:
        args: parsed command-line namespace.
        init_distributed: when true, distributed training is initialized via
            ``distributed_utils.distributed_init`` and the resulting rank is
            stored in ``args.distributed_rank``.

    Builds the task, model, criterion and trainer, restores the latest
    checkpoint, optionally validates once before training, then runs the
    epoch loop with periodic validation, downstream evaluation, checkpoint
    saving and early stopping.
    """
    utils.import_user_module(args)

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    logger.info(args)

    # Setup tasks, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid datasets (we load training data below, based on the latest checkpoint)
    valid_subsets = args.valid_subset.split(',')
    for valid_sub_split in valid_subsets:
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build models
    model = task.build_model(args)

    # Build criterions
    criterion = task.build_criterion(args)

    logger.info(model)
    logger.info('model {}, criterion {}'.format(
        args.arch, criterion.__class__.__name__))
    logger.info('num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    logger.info('training on {} GPUs'.format(args.distributed_world_size))
    logger.info(
        'max tokens per GPU = {} and max sentences per GPU = {}'.format(
            args.max_tokens,
            args.max_sentences,
        ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    if distributed_utils.is_master(args) and not args.debug:
        initialize_neptune(trainer, extra_state, args)

    # Always bind downstream_dict. It used to be created only when downstream
    # evaluation was enabled AND args.downstream_dict was non-empty, so an
    # enabled-but-empty configuration hit a NameError at run_downstream().
    downstream_dict = {}
    if getattr(args, 'eval_downstream', None) and len(args.downstream_dict) > 0:
        for downstream_name, downstream_kwargs in args.downstream_dict.items():
            downstream_dict[downstream_name] = create_downstream_dict(
                args, downstream_name, downstream_kwargs, model)

    # Move model and criterion to gpu
    # NOTE(review): assumed to run unconditionally rather than only when
    # downstream evaluation is enabled (original indentation was lost) - confirm.
    if torch.cuda.is_available() and not args.cpu:
        model.to('cuda:{}'.format(args.device_id))
        criterion.to('cuda:{}'.format(args.device_id))

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()

    if args.validate_before_training and extra_state is None:
        # We want to make sure we do validate_before_training
        # only when we start the training from scratch (thus, extra_state is None).
        # Here, we assert that indeed the training has just started
        # and training epoch is equal to one.
        assert epoch_itr.epoch == 1
        valid_losses = validate(args, trainer, task, 0, valid_subsets)
        # downstream_dict is non-empty only when downstream evaluation is
        # enabled, so this check also covers a namespace that has no
        # ``eval_downstream`` attribute.
        if downstream_dict:
            run_downstream(args, downstream_dict, model, criterion, 0,
                           trainer.get_num_updates())

    # lr may be a NumPy array, in which case every entry must exceed
    # args.min_lr for training to continue.
    while (not args.disable_training
           and ((isinstance(lr, np.ndarray) and all(lr > args.min_lr))
                or (not isinstance(lr, np.ndarray) and lr > args.min_lr))
           and epoch_itr.next_epoch_idx <= max_epoch
           and trainer.get_num_updates() < max_update):
        train(args, trainer, task, epoch_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            # validate on task validation set
            valid_losses = validate(args, trainer, task, epoch_itr.epoch,
                                    valid_subsets)

            # evaluate on downstream tasks
            if downstream_dict:
                run_downstream(args, downstream_dict, model, criterion,
                               epoch_itr.epoch, trainer.get_num_updates())
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_extra_state = {}
            if get_experiment_id():
                save_extra_state['neptune_id'] = get_experiment_id()
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0],
                            save_extra_state)

        # early stop
        if should_stop_early(args, valid_losses[0]):
            logger.info(
                'early stop since valid performance hasn\'t improved for last {} runs'
                .format(args.patience))
            break

        reload_dataset = getattr(args, 'reload', False)

        # sharded data: get train iterator for next epoch
        epoch_itr = trainer.get_train_iterator(epoch_itr.next_epoch_idx,
                                               load_dataset=reload_dataset)
    train_meter.stop()
    logger.info('done training in {:.1f} seconds'.format(train_meter.sum))