def main(params):
    """Train an LSTM encoder / attention decoder on the MT steps.

    Initializes distributed mode, the experiment logger and the SLURM
    signal handler, loads the data, builds both networks on the GPU, then
    runs ``params.max_epoch`` epochs. Each epoch calls ``trainer.try_lstm``
    until ``trainer.epoch_size`` sentences are consumed, evaluates, logs the
    scores and checkpoints.
    """
    # multi-GPU / multi-node training, experiment logger, SLURM handler
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    data = load_data(params)

    # encoder and decoder share one hard-coded hidden size
    hidden_size = 1024
    encoder = EncoderRNN.EncoderRNN(params.n_words, hidden_size).cuda()
    decoder = Attention_decoder.Attention_decoder(
        hidden_size, params.n_words, dropout_p=0.1
    ).cuda()

    trainer = LSTM_Trainer(encoder, decoder, data, params)
    evaluator = LSTM_Evaluator(trainer, data, params)

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            for lang_a, lang_b in shuf_order(params.mt_steps, params):
                trainer.try_lstm(lang_a, lang_b, params.lambda_mt)

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate, then print the scores (JSON line on the master only)
        scores = evaluator.run_all_evals(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)

    # save the output of softmax
    # NOTE(review): clm_temp / ml_temp / bt_temp are not defined in this
    # function; presumably module-level globals -- confirm. The flattened
    # source does not show whether this dump ran once after training (as
    # written here) or at the end of every epoch -- verify.
    trainer.save_softmax_output(clm_temp, 'clm_temp')
    trainer.save_softmax_output(ml_temp, 'ml_temp')
    trainer.save_softmax_output(bt_temp, 'bt_temp')
def seq2seq_main(params):
    """Train or evaluate a seq2seq model with separate src / tgt vocabularies.

    Uses a different vocabulary/dictionary for the source and the target.
    With ``params.eval_only`` set, runs one evaluation pass, logs the scores
    and exits. Otherwise runs ``params.max_epoch`` epochs of auto-encoding,
    machine-translation and back-translation steps, each followed by an
    evaluation and checkpointing pass.
    """
    # multi-GPU / multi-node training, experiment logger, SLURM handler
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    data = seq2seq_load_data(params)

    # Build the model. Language pairs are re-sorted in ascending order
    # ((zh-en) --> (en-zh)), so "en" becomes the source and "zh" the target;
    # the dictionaries are therefore passed swapped.
    encoder, decoder = build_seq2seq_model(
        params, data['tgt_dico'], data['src_dico'])

    # build trainer (reloads potential checkpoints) and evaluator
    trainer = EncDecTrainer(encoder, decoder, data, params)
    evaluator = MyEncDecEvaluator(trainer, data, params)

    # evaluation-only mode: score once and stop
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang_a, lang_b in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang_a, lang_b, params.lambda_mt)

            # back-translation steps
            for lang_a, lang_b, lang_c in shuf_order(params.bt_steps):
                trainer.bt_step(lang_a, lang_b, lang_c, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate, then print the scores (JSON line on the master only)
        scores = evaluator.run_all_evals(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Train or evaluate a single model or an encoder/decoder pair.

    Builds either one model (``params.encoder_only``) or an encoder and a
    decoder, optionally converts them to half precision, and wraps them in
    the matching trainer / evaluator. With ``params.eval_only`` the scores
    are logged once and the process exits; otherwise every epoch cycles
    through the CLM, MLM, PC, AE, MASS, MT, BT and BMT steps before
    evaluating and checkpointing.
    """
    # multi-GPU / multi-node training, experiment logger, SLURM handler
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    data = load_data(params)
    single_model = params.encoder_only

    # build model
    if single_model:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        if single_model:
            model = network_to_half(model)
        else:
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

    # NOTE: wrapping the networks in apex.parallel.DistributedDataParallel
    # (delay_allreduce=True) for params.multi_gpu is commented out in this
    # variant, so nothing is wrapped here.

    # build trainer (reloads potential checkpoints) and evaluator
    if single_model:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: score once and stop
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang_a, lang_b in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang_a, lang_b, params.lambda_clm)

            # MLM steps (also includes TLM if the second language is not None)
            for lang_a, lang_b in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang_a, lang_b, params.lambda_mlm)

            # parallel classification steps
            for lang_a, lang_b in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang_a, lang_b, params.lambda_pc)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # mass prediction steps
            for lang in shuf_order(params.mass_steps):
                trainer.mass_step(lang, params.lambda_mass)

            # machine translation steps
            for lang_a, lang_b in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang_a, lang_b, params.lambda_mt)

            # back-translation steps
            for lang_a, lang_b, lang_c in shuf_order(params.bt_steps):
                trainer.bt_step(lang_a, lang_b, lang_c, params.lambda_bt)

            # back-parallel steps
            for lang_a, lang_b in shuf_order(params.bmt_steps, params):
                trainer.bmt_step(lang_a, lang_b, params.lambda_bmt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate, then print the scores (JSON line on the master only)
        scores = evaluator.run_all_evals(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Train or evaluate with (meta-)MLM steps only.

    Builds a single model or an encoder/decoder pair together with its
    trainer and evaluator. With ``params.eval_only`` the scores are logged
    once and the process exits. Otherwise ``params.lgs`` is split on ``"-"``
    into a list (a lone language is duplicated) and each epoch runs either
    ``trainer.meta_mlm_step`` or ``trainer.mlm_step`` over
    ``params.mlm_steps``, then evaluates and checkpoints.
    """
    # multi-GPU / multi-node training, experiment logger, SLURM handler
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    data = load_data(params)

    # build the model, then the trainer (reloads potential checkpoints)
    # and the evaluator that match it
    if params.encoder_only:
        model = build_model(params, data['dico'])
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        encoder, decoder = build_model(params, data['dico'])
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: score once and stop
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # params.lgs becomes a list; a single language is listed twice.
    # The list object is shared with params.lgs, so the append is visible there.
    lgs = params.lgs.split("-")
    params.lgs = lgs
    if len(lgs) == 1:
        lgs.append(lgs[0])

    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:

            # replaces the original MLM steps
            for lang_a, lang_b in shuf_order(params.mlm_steps, params):
                if not params.do_meta_update:
                    trainer.mlm_step(lang_a, lang_b, params.lambda_mlm)
                else:
                    trainer.meta_mlm_step(lang_a)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate, then print the scores (JSON line on the master only)
        scores = evaluator.run_all_evals(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Train or evaluate a model, optionally training adapters only.

    In encoder-only mode with ``params.use_adapters`` set, every parameter
    whose name does not start with ``"adapters"`` is frozen, after which the
    embeddings, position embeddings, prediction layer and embedding layer
    norm are re-enabled. The number of trainable parameters is logged once
    the trainer is built. The rest follows the usual loop: optional
    evaluation-only exit, then per-epoch CLM / MLM / PC / AE / MT / BT steps
    with evaluation and checkpointing.
    """
    # multi-GPU / multi-node training, experiment logger, SLURM handler
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    data = load_data(params)

    # build model (reload-model options are handled in build_model)
    if params.encoder_only:
        model = build_model(params, data['dico'])
        # NOTE(review): the flattened source does not show how far the
        # `use_adapters` block extends; everything up to the per-parameter
        # log is assumed to be inside it -- confirm.
        if params.use_adapters:
            logger.info("Using adapters")

            # freeze everything that is not an adapter parameter
            for name, tensor in model.named_parameters():
                if not name.startswith("adapters"):
                    tensor.requires_grad = False

            # re-enable the layers that stay trainable next to the adapters
            for _, tensor in model.embeddings.named_parameters():
                tensor.requires_grad = True
            for _, tensor in model.position_embeddings.named_parameters():
                tensor.requires_grad = True
            for _, tensor in model.pred_layer.named_parameters():
                tensor.requires_grad = True
            for tensor in model.layer_norm_emb.parameters():
                tensor.requires_grad = True

            for name, tensor in model.named_parameters():
                logger.info(name + ' required grad = ' + str(tensor.requires_grad))
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer (reloads potential checkpoints) and evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
        n_trainable = sum(
            p.numel() for p in trainer.model.parameters() if p.requires_grad)
        logger.info("Number of trainable parameters (encoder): %i" % n_trainable)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)
        n_trainable_enc = sum(
            p.numel() for p in encoder.parameters() if p.requires_grad)
        logger.info("Number of trainable parameters (encoder): %i" % n_trainable_enc)
        n_trainable_dec = sum(
            p.numel() for p in decoder.parameters() if p.requires_grad)
        logger.info("Number of trainable parameters (decoder): %i" % n_trainable_dec)

    # evaluation-only mode: score once and stop
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang_a, lang_b in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang_a, lang_b, params.lambda_clm)

            # MLM steps (also includes TLM if the second language is not None)
            for lang_a, lang_b in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang_a, lang_b, params.lambda_mlm)

            # parallel classification steps
            for lang_a, lang_b in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang_a, lang_b, params.lambda_pc)

            # denoising auto-encoder
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation
            for lang_a, lang_b in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang_a, lang_b, params.lambda_mt)

            # back-translation
            for lang_a, lang_b, lang_c in shuf_order(params.bt_steps):
                trainer.bt_step(lang_a, lang_b, lang_c, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate, then print the scores (JSON line on the master only)
        scores = evaluator.run_all_evals(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Run the joint text / image pre-training loop.

    Builds a single model or an encoder/decoder pair with its trainer and
    evaluator, then runs ``params.max_epoch`` epochs. An epoch lasts until
    both ``trainer.n_sentences`` and ``trainer.n_images`` reach
    ``trainer.epoch_size``. In this variant the SLURM signal handler, the
    fp16 conversion, the distributed wrapping and the evaluation-only branch
    are commented out, and no evaluation or checkpointing follows an epoch.
    """
    # multi-GPU / multi-node training and experiment logger
    # (the SLURM signal handler call is disabled in this variant)
    init_distributed_mode(params)
    logger = initialize_exp(params)

    data = load_data(params)

    # Build the model, then the trainer (reloads potential checkpoints) and
    # the evaluator. The fp16 conversion and the DistributedDataParallel
    # wrapping that used to sit between these two steps are disabled.
    if params.encoder_only:
        model = build_model(params, data['dico'])
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        encoder, decoder = build_model(params, data['dico'])
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # NOTE: the `params.eval_only` branch (run_all_evals + exit) is disabled
    # here, so `evaluator` is constructed but not used below.

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        trainer.n_images = 0

        # keep going while either counter is still below the epoch size
        while trainer.n_sentences < trainer.epoch_size or trainer.n_images < trainer.epoch_size:

            # CLM steps
            for lang_a, lang_b in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang_a, lang_b, params.lambda_clm)

            # MLM steps (also includes TLM if the second language is not None)
            # shuf_order's result could be: ['fr', 'fr'] or ['en', 'fr']
            # or ['fr', 'en'] or ['en', 'en']
            for lang_a, lang_b in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang_a, lang_b, params.lambda_mlm)

            # parallel classification steps
            for lang_a, lang_b in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang_a, lang_b, params.lambda_pc)

            # image-language pretraining step (dataset name is hard-coded)
            trainer.ipm_step("coco36", params.lambda_ipm)

            # CMLM steps
            for mod_a, mod_b in shuf_order(params.cmlm_steps, params):
                trainer.cmlm_step(mod_a, mod_b, params.lambda_cmlm)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang_a, lang_b in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang_a, lang_b, params.lambda_mt)

            # back-translation steps
            for lang_a, lang_b, lang_c in shuf_order(params.bt_steps):
                trainer.bt_step(lang_a, lang_b, lang_c, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)
def main(params):
    """Train or evaluate a MASS model with matching-score evaluation.

    With ``params.eval_only`` the evaluator's ``run_all_evals_match`` scores
    are logged, the forward scores and per-match sentence likelihoods are
    written as text files under ``params.dump_path``, a pickled DataFrame of
    labels / targets / predictions is saved, and the process exits.
    Otherwise each epoch runs the MASS steps, evaluates with
    ``run_epoch_evals_match`` and checkpoints.
    """
    # multi-GPU / multi-node training, experiment logger, SLURM handler
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    def log_scores(scores):
        # "likelihood" entries are logged as their mean, "scores" entries by
        # shape, everything else as a float
        for metric, value in scores.items():
            if 'likelihood' in metric:
                logger.info("%s -> %.6f" % (metric, np.mean(value)))
            elif 'scores' in metric:
                logger.info("%s -> %s" % (metric, value.shape))
            else:
                logger.info("%s -> %.6f" % (metric, value))

    data = load_data(params)
    single_model = params.encoder_only

    # build model
    if single_model:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        if single_model:
            model = network_to_half(model)
        else:
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

    # distributed
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        if single_model:
            model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
        else:
            encoder = apex.parallel.DistributedDataParallel(encoder, delay_allreduce=True)
            decoder = apex.parallel.DistributedDataParallel(decoder, delay_allreduce=True)

    # build trainer (reloads potential checkpoints) and evaluator
    if single_model:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: score, dump prediction files and stop
    if params.eval_only:
        logger.info('Evaluating and saving new result file')
        scores = evaluator.run_all_evals_match(trainer)
        log_scores(scores)

        # score keys are built from the first MASS step
        first_step = params.mass_steps[0]

        fwd_path = os.path.join(params.dump_path, 'best-fwd-prediction.txt')
        np.savetxt(fwd_path, scores['%s_%s_fwd_scores' % ('test', first_step)], fmt='%f')

        # one likelihood file per match file, suffixed with the text after
        # the file name's last "."
        for match in params.match_files.split(','):
            match_path = os.path.join(
                params.dump_path,
                'best-match-prediction{}.txt'.format(match.split('.')[-1]))
            np.savetxt(
                match_path,
                scores['%s_%s_sentence_likelihood' % (match, first_step)],
                fmt='%f')

        labels = np.loadtxt(os.path.join(params.data_path, 'labels'))
        targets = np.loadtxt(os.path.join(params.data_path, 'suffix'))
        preds = scores['%s_%s_sentence_likelihood' % ('match', first_step)]
        results = pd.DataFrame({'label': labels, 'target': targets, 'pred': preds})
        results.to_pickle(os.path.join(params.dump_path, 'best-matching-prediction.pkl'))
        # (the "__log__" JSON dump of the scores is disabled in this variant)
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:

            # mass prediction steps
            for lang in shuf_order(params.mass_steps):
                trainer.mass_step(lang, params.lambda_mass)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate, then print the scores
        # (the master-only "__log__" JSON dump is disabled in this variant)
        scores = evaluator.run_epoch_evals_match(trainer)
        log_scores(scores)

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Train or evaluate a model, optionally reloading a saved encoder/decoder.

    When ``params.model_path`` is non-empty, the checkpoint at that path is
    loaded and an encoder / decoder ``TransformerModel`` pair is rebuilt
    from the checkpoint's own parameters and dictionary. Otherwise the model
    is built from ``params`` and ``data['dico']``. With ``params.eval_only``
    the scores are logged once and the process exits; otherwise each epoch
    runs the CLM / MLM / PC / AE / MT / BT steps, evaluates and checkpoints.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    if params.model_path != "":
        # load checkpoint
        reloaded = torch.load(params.model_path)
        model_params = AttrDict(reloaded['params'])
        dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])

        # Each network is built exactly once. The original constructed both
        # twice and moved both copies to the GPU, discarding the first pair
        # before the weights were loaded.
        encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
        decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
        encoder.load_state_dict(reloaded['encoder'])
        decoder.load_state_dict(reloaded['decoder'])
        logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))
        # NOTE(review): this branch only binds `encoder` / `decoder`. With
        # params.encoder_only also set, `model` is never bound and the
        # SingleTrainer(model, ...) call below raises NameError -- confirm
        # that combination is unsupported.
    else:
        # build model
        if params.encoder_only:
            model = build_model(params, data['dico'])
        else:
            encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Train or evaluate a (visual) language model with TensorBoard logging.

    Optionally seeds torch / numpy / random from ``params.other_seed``,
    builds the model with its trainer and evaluator, and either evaluates
    once and exits (``params.eval_only``) or trains for ``params.max_epoch``
    epochs. Every evaluation score is logged and also written to a
    ``SummaryWriter`` under ``params.dump_path``, keyed by the running
    iteration count.

    The writer is closed on both exit paths (before ``sys.exit()`` in
    evaluation-only mode, and in a ``finally`` around the training loop) so
    buffered scalars are flushed to disk even if training is interrupted.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    if params.other_seed > -1:
        # deterministic
        torch.manual_seed(params.other_seed)
        torch.cuda.manual_seed(params.other_seed)
        np.random.seed(params.other_seed)
        random.seed(params.other_seed)

    # NOTE(review): assumed to be independent of the `other_seed` check
    # above; the flattened source does not show the nesting -- confirm.
    if params.iter_seed == -1:
        # non-deterministic
        params.iter_seed = None

    # load data
    data = load_data(params)

    writer = SummaryWriter(params.dump_path + "/" + params.exp_name + "_log")

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        # release the event file before leaving
        writer.close()
        sys.exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # running count of trainer iterations; used as the TensorBoard step
    _iter = 0

    # dump initial weights
    if params.save_initial:
        trainer.save_checkpoint('initial', include_optimizers=False)

    try:
        # language model training
        for _ in range(params.max_epoch):

            logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

            trainer.n_sentences = 0

            while trainer.n_sentences < trainer.epoch_size:

                # MLM steps (also includes TLM if lang2 is not None)
                for lang1, lang2 in shuf_order(params.mlm_steps, params):
                    if params.only_vlm:
                        # with visual features
                        trainer.vlm_step(lang1, lang2, params.lambda_mlm, _iter)
                    else:
                        trainer.mlm_step(lang1, lang2, params.lambda_mlm, _iter)

                # parallel classification steps
                for lang1, lang2 in shuf_order(params.pc_steps, params):
                    trainer.pc_step(lang1, lang2, params.lambda_pc)

                # denoising auto-encoder steps
                for lang in shuf_order(params.ae_steps):
                    trainer.mt_step(lang, lang, params.lambda_ae)

                # back-translation steps
                for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                    trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

                # machine translation steps
                for lang1, lang2 in shuf_order(params.mt_steps, params):
                    trainer.mt_step(lang1, lang2, params.lambda_mt)

                # multimodal machine translation steps (weighted by lambda_mt)
                for lang1, lang2 in shuf_order(params.mmt_steps, params):
                    trainer.mmt_step(lang1, lang2, params.lambda_mt)

                trainer.iter()
                _iter += 1

            logger.info("============ End of epoch %i ============" % trainer.epoch)

            # evaluate perplexity
            scores = evaluator.run_all_evals(trainer)

            # print / JSON log
            for k, v in scores.items():
                writer.add_scalar(k, v, _iter)
                logger.info("%s -> %.6f" % (k, v))
            if params.is_master:
                logger.info("__log__:%s" % json.dumps(scores))

            # end of epoch
            trainer.save_best_model(scores)
            trainer.save_periodic()
            trainer.end_epoch(scores)
    finally:
        # flush pending scalars and close the event file, even when training
        # stops on an exception
        writer.close()
def main(params):
    """Run cross-modal pre-training / fine-tuning with ``XTrainer``.

    Builds the model, wraps it in ``XTrainer`` / ``XEvaluator`` and either
    evaluates once and exits (``params.eval_only``) or trains for
    ``params.max_epoch`` epochs. Which steps run in an epoch is selected by
    the ``params.is_*`` flags. After each epoch the scores are logged, the
    master process writes them to ``epoch_<n>.eval_log`` under
    ``params.dump_path``, and checkpoints are saved.
    """
    import os

    # multi-GPU / multi-node training, experiment logger, SLURM handler
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    data = load_data(params)
    print(data)

    # build model
    if params.encoder_only:
        model = build_model(params)
    else:
        encoder, decoder = build_model(params)

    # build trainer (reloads potential checkpoints) and evaluator
    # NOTE(review): `model` is only bound in the encoder_only branch above;
    # otherwise this line raises NameError -- confirm only encoder_only runs
    # are expected here.
    trainer = XTrainer(model, data, params)
    evaluator = XEvaluator(trainer, data, params)

    # evaluation-only mode: score once and stop
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:

            # MLM steps (also includes TLM if the second language is not None)
            for lang_a, lang_b in shuf_order(params.mlm_steps, params):
                if params.is_understanding:
                    trainer.mlm_step(lang_a, lang_b, params.lambda_mlm)

            # text generation steps (the second language is ignored)
            for lang_a, lang_b in shuf_order(params.text_steps, params):
                if params.is_ntg:
                    trainer.ntg_step(lang_a, None, params.lambda_mlm)

            # cross-modal caption steps
            # NOTE(review): the free-LB step is assumed to be checked
            # independently of the is_mt branch -- confirm.
            for lang_a, lang_b in shuf_order(params.cross_modal_steps, params):
                if params.is_mt:
                    trainer.mt_ic_step(lang_a, lang_b, params.lambda_ic)
                else:
                    trainer.ic_step(lang_a, lang_b, params.lambda_ic)
                if params.is_freelb:
                    trainer.free_lb_ic_step(lang_a, lang_b, params.lambda_ic)

            # generation-style MLM / MASS steps
            for lang_a, lang_b in shuf_order(params.mlm_steps, params, n=3):
                if params.is_generation:
                    trainer.bart_mlm_step(lang_a, lang_b, params.lambda_imlm)
                    trainer.bart_mass_step(lang_a, lang_b, params.lambda_imlm)

            for lang_a, lang_b in shuf_order(params.cross_ae_steps, params):
                trainer.bart_img_step(lang_a, lang_b, params.lambda_ida)

            for lang_a, lang_b in shuf_order(params.cross_rel_steps, params):
                if params.is_pretrain:
                    trainer.pretrain_rel_step(lang_a, lang_b)
                elif params.is_slide:
                    trainer.slide_step(lang_a, lang_b, params.lambda_t2i)
                else:
                    # support multi languages
                    trainer.rel_step(lang_a, lang_b, params.lambda_t2i, params.lambda_i2t)

            # (the cross_mlm / cross_mrm / cross_mrfr step loops are
            # commented out in this variant)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate, then print the scores
        scores = evaluator.run_all_evals(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))

        # the master logs the JSON line and writes it to a per-epoch file
        # NOTE(review): the file write is assumed to be master-only -- confirm.
        if params.is_master:
            scores_json = json.dumps(scores)
            logger.info("__log__:%s" % scores_json)
            log_path = os.path.join(
                params.dump_path, "epoch_{0}.eval_log".format(trainer.epoch))
            with open(log_path, 'w') as writer:
                writer.write(scores_json + '\n')

        # end of epoch
        trainer.save_best_model(scores)
        if trainer.epoch % params.save_every_epoch == 0 and params.is_master:
            trainer.save_model('model_pretrain_%i' % trainer.epoch)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Train or evaluate the Wikisum summarization model.

    Loads the Wikisum data with a fixed vocabulary path, builds the model,
    and wraps the global encoder, local encoder and decoder in
    ``WikisumTrainer`` / ``WikisumEvaluator`` (validation set). With
    ``params.eval_only`` the scores are logged once and the process exits.
    Otherwise each epoch iterates over ``trainer.dataloader``, runs
    ``trainer.step`` for every pair in ``params.ws_steps`` on each batch,
    then evaluates and checkpoints.
    """
    # multi-GPU / multi-node training, experiment logger, SLURM handler
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    # NOTE(review): machine-specific absolute path -- consider making it
    # configurable.
    voc_path = "/home/zchen/XLM/data/processed/XLM_en_zh/50k/vocab"
    data = load_wikisum_data(voc_path, params.data_path, params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    elif params.WS:
        global_encoder, local_encoder, decoder = build_model(
            params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer (reloads potential checkpoints) and evaluator
    # NOTE(review): global_encoder / local_encoder are only bound in the
    # params.WS branch above; any other configuration raises NameError here
    # -- confirm only WS runs are expected.
    trainer = WikisumTrainer(global_encoder, local_encoder, decoder, data,
                             params)
    evaluator = WikisumEvaluator(trainer, data, params, data_set="valid")

    # evaluation-only mode: score once and stop
    if params.eval_only:
        scores = evaluator.evaluate(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    for _ in range(params.max_epoch):
        # the epoch counter is advanced here, before the banner is logged
        trainer.epoch += 1
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0

        # one pass over the data loader; a batch-count cap for debugging
        # (break after 1000 batches) is commented out in this variant
        for batch in trainer.dataloader:
            for lang_a, lang_b in shuf_order(params.ws_steps, params):
                trainer.step(lang_a, lang_b, batch)
            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate, then print the scores (JSON line on the master only)
        scores = evaluator.evaluate(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Entry point for multi-domain / curriculum / dual-encoder training.

    Depending on ``params`` this function does one of three things:

    * ``build_nmt_domain_feature`` / ``build_nlm_domain_feature``: compute
      domain features for the ('de', 'en') parallel training set, save them
      with ``torch.save`` and return;
    * ``eval_only``: build model, trainer and evaluator, run all
      evaluations once, log the scores and exit;
    * otherwise: train for ``params.max_epoch`` epochs, evaluating and
      checkpointing after each epoch.

    Args:
        params: experiment configuration namespace.
    """
    init_distributed_mode(params)      # multi-GPU / multi-node training
    logger = initialize_exp(params)    # experiment + logger
    init_signal_handler()              # SLURM time limit / pre-emption
    data = load_data(params)

    def ordered_train_batches():
        """Return the ('de', 'en') train set and its unshuffled iterator output."""
        corpus = data['para'][('de', 'en')]['train']
        stream, order = corpus.get_iterator(
            shuffle=False,
            group_by_size=params.group_by_size,
            n_sentences=-1,
        )
        return corpus, stream, order

    def log_metrics(metrics):
        """Log every score as ``name -> value``."""
        for name, value in metrics.items():
            logger.info("%s -> %.6f" % (name, value))

    # ---- feature extraction mode: NMT-based domain features ----
    if params.build_nmt_domain_feature:
        import torch
        from src.curriculum import build_nmt_domain_feature

        corpus, stream, order = ordered_train_batches()
        nmt_features = build_nmt_domain_feature(data, params, stream, corpus)
        torch.save({'indices': order, 'domain_feature': nmt_features},
                   params.build_output_path)
        return

    # ---- feature extraction mode: LM-based domain features ----
    if params.build_nlm_domain_feature:
        import torch
        from src.curriculum import build_nlm_domain_feature

        corpus, stream, order = ordered_train_batches()
        nlm_features, nlm_scores, sentences = build_nlm_domain_feature(
            data, params, stream, corpus)
        payload = {
            'indices': order,
            'domain_feature': nlm_features,
            'domain_score': nlm_scores,
            'sents': sentences,
        }
        # a path not ending in 'pth' is treated as a directory
        target = params.build_output_path
        if not target.endswith('pth'):
            target = f'{params.build_output_path}/final.pth'
        torch.save(payload, target)
        return

    # ---- model, trainer (reloads potential checkpoints) and evaluator ----
    if params.encoder_only:
        model = build_model(params, data['dico'])
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        if params.dual_encoder:
            encoder1, encoder2 = build_model(params, data['dico'])
        else:
            encoder, decoder = build_model(params, data['dico'])

        if not params.domains:
            trainer = EncDecTrainer(encoder, decoder, data, params)
            evaluator = EncDecEvaluator(trainer, data, params)
        else:
            # trainer: curriculum takes precedence over dual-encoder
            if params.curriculum_learning:
                trainer = CurriculumTrainer(encoder, decoder, data, params)
            elif params.dual_encoder:
                trainer = DualEncoderTrainer(encoder1, encoder2, data, params)
            else:
                trainer = MultiDomainTrainer(encoder, decoder, data, params)

            # evaluator: local adaptation takes precedence over the rest
            if params.local_adapt:
                evaluator = MetaMultiDomainEvaluator(trainer, data, params)
            elif params.curriculum_learning:
                evaluator = EncDecEvaluator(trainer, data, params)
            elif params.dual_encoder:
                evaluator = DualEncoderEvaluator(trainer, data, params)
            else:
                evaluator = MultiDomainEvaluator(trainer, data, params)

    # ---- evaluation-only mode ----
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        log_metrics(scores)
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # ---- training ----
    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)
        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:
            # causal LM
            for src, tgt in shuf_order(params.clm_steps, params):
                trainer.clm_step(src, tgt, params.lambda_clm)

            # masked LM (TLM as well when the second language is set)
            for src, tgt in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(src, tgt, params.lambda_mlm)

            # parallel classification
            for src, tgt in shuf_order(params.pc_steps, params):
                trainer.pc_step(src, tgt, params.lambda_pc)

            # denoising auto-encoding: translate a language into itself
            for single in shuf_order(params.ae_steps):
                trainer.mt_step(single, single, params.lambda_ae)

            # machine translation
            for src, tgt in shuf_order(params.mt_steps, params):
                trainer.mt_step(src, tgt, params.lambda_mt)

            # back-translation
            for src, pivot, tgt in shuf_order(params.bt_steps):
                trainer.bt_step(src, pivot, tgt, params.lambda_bt)

            trainer.iter()

            # periodic domain re-weighting / reset (never for dual encoders)
            if not params.dual_encoder:
                if (params.domains
                        and params.domain_ratio_update_freq > 0
                        and trainer.n_total_iter
                        % params.domain_ratio_update_freq == 0
                        and not params.sampling_uniform):
                    evaluator.update_language_sampler_multidomain()
                    evaluator.update_dataset_ratio(trainer)
                if (params.domain_reset_freq > 0
                        and trainer.n_total_iter
                        % params.domain_reset_freq == 0):
                    evaluator.reset_dataset_ratio(trainer)

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate, then print / JSON log
        scores = evaluator.run_all_evals(trainer)
        log_metrics(scores)
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Train, or only evaluate, a model with optional adversarial setup.

    When ``params.adv`` is set, language embeddings are switched off
    (``params.use_lang_emb = False``) before anything else is initialized.
    The function then builds the model, trainer and evaluator and either
    runs one evaluation and exits (``params.eval_only``) or trains for
    ``params.max_epoch`` epochs, evaluating and checkpointing after each.

    Args:
        params: experiment configuration namespace.
    """
    if params.adv:
        params.use_lang_emb = False
        # printed, not logged: the logger does not exist yet at this point
        print("Language embeddings are not used...\n \n \n \n")

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    # reload-model options are in here
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        # These three calls were missing here although every sibling entry
        # point performs them after evaluation; without them no checkpoint
        # is written and the trainer is never told the epoch has finished.
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def clts_elmo_main(params):
    """Train, or only evaluate, the cross-lingual text summarization model.

    Builds the cross-lingual encoder (``elmo``) together with the
    summarization encoder and decoder, wraps them in an
    ``XLMCLTSEncDecTrainer`` / ``XLMCLTSEncDecEvaluator`` pair, and then
    either evaluates once and exits (``params.eval_only``) or runs
    ``params.max_epoch`` epochs of machine-translation steps, evaluating and
    checkpointing after each epoch.

    Args:
        params: experiment configuration namespace.
    """
    init_distributed_mode(params)      # multi-GPU / multi-node training
    logger = initialize_exp(params)    # experiment + logger
    init_signal_handler()              # SLURM time limit / pre-emption
    data = load_data(params)

    def log_metrics(metrics):
        """Log every score as ``name -> value``."""
        for name, value in metrics.items():
            logger.info("%s -> %.6f" % (name, value))

    # cross-lingual encoder, text summarization encoder and decoder
    elmo, ts_encoder, ts_decoder = build_clts_elmo_model(params, data['dico'])
    trainer = XLMCLTSEncDecTrainer(elmo, ts_encoder, ts_decoder, data, params)
    evaluator = XLMCLTSEncDecEvaluator(trainer, data, params)

    # evaluation-only mode: score once, log, stop
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        log_metrics(scores)
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)
        trainer.n_sentences = 0

        # keep stepping until the epoch's sentence budget is consumed
        while trainer.n_sentences < trainer.epoch_size:
            for src, tgt in shuf_order(params.mt_steps, params):
                trainer.mt_step(src, tgt, params.lambda_mt)
            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate, then print / JSON log (JSON line on the master only)
        scores = evaluator.run_all_evals(trainer)
        log_metrics(scores)
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Entry point for standard or meta-learning (multi-task) training.

    ``params.meta_params`` maps a task key to that task's own parameters.
    The model is built from the parameters of the task named by
    ``data['key']``, and that task's ``pad_index`` / ``eos_index`` are
    copied onto ``params``. With ``params.meta_learning`` unset the usual
    per-objective loop runs; with it set, the language lists of all tasks
    are gathered per objective and each trainer step receives the whole
    lists plus the owning task keys.

    Args:
        params: experiment configuration namespace.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment; the per-task parameters are replaced by a
    # short placeholder while the experiment is initialized (too long to be
    # logged) and restored right afterwards
    task_params = copy.deepcopy(params).meta_params
    params.meta_params = "..."
    logger = initialize_exp(params)
    params.meta_params = task_params

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    def log_scores(scores):
        """Log scores flat, or per task followed by the global ones."""
        if not params.meta_learning:
            for name, value in scores.items():
                logger.info("%s -> %.6f" % (name, value))
            return
        for task_key in params.meta_params.keys():
            logger.info("============ task : %s " % task_key)
            for name, value in scores[task_key].items():
                if name != "epoch":
                    logger.info("%s -> %.6f" % (name, value))
        logger.info("============ all")
        # per-task sub-dicts and the epoch entry are not printed here
        skipped = list(params.meta_params.keys()) + ['epoch']
        for name, value in scores.items():
            if name not in skipped:
                logger.info("%s -> %.6f" % (name, value))

    # TODO: pick a proper params.n_words. For now the parameters of one task
    # (the one named by data['key']) are used; if all tasks share the same
    # vocabulary these values are identical across tasks anyway.
    p = params.meta_params[data['key']]

    # build model
    if params.encoder_only:
        model = build_model(params=p, dico=data['dico'])
    else:
        encoder, decoder = build_model(params=p, dico=data['dico'])

    # TODO: same remark for pad_index / eos_index (taken from that task too)
    params.pad_index = p.pad_index
    params.eos_index = p.eos_index

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        log_scores(scores)
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        if not params.meta_learning:
            trainer.n_sentences = 0

            while trainer.n_sentences < trainer.epoch_size:
                # CLM steps
                for src, tgt in shuf_order(params.clm_steps, params):
                    trainer.clm_step(src, tgt, params.lambda_clm)

                # MLM steps (also includes TLM if the second language is set)
                for src, tgt in shuf_order(params.mlm_steps, params):
                    trainer.mlm_step(src, tgt, params.lambda_mlm)

                # parallel classification steps
                for src, tgt in shuf_order(params.pc_steps, params):
                    trainer.pc_step(src, tgt, params.lambda_pc)

                # denoising auto-encoder steps
                for single in shuf_order(params.ae_steps):
                    trainer.mt_step(single, single, params.lambda_ae)

                # machine translation steps
                for src, tgt in shuf_order(params.mt_steps, params):
                    trainer.mt_step(src, tgt, params.lambda_mt)

                # back-translation steps
                for src, pivot, tgt in shuf_order(params.bt_steps):
                    trainer.bt_step(src, pivot, tgt, params.lambda_bt)

                trainer.iter()
        else:
            # meta-learning: one sentence counter per task
            trainer.n_sentences = {}

            # Language lists per objective: for two lists l1 and l2 the
            # objective is applied to l1[i] and l2[i] for every index i.
            first_langs, second_langs, third_langs = {}, {}, {}
            # For every list position, the key of the task (and hence of
            # its data dictionary) the entry belongs to.
            step_tasks = {}

            # Equivalent to "for task in list of tasks" in the original
            # algorithm, except that all tasks are prepared beforehand.
            # Lists are created lazily on first use, one objective at a
            # time, and shuf_order is called in the order CLM, MLM, PC, AE,
            # MT, BT for each task.
            for lgs in params.meta_params.keys():
                trainer.n_sentences[lgs] = 0
                task = params.meta_params[lgs]

                # CLM
                firsts = first_langs.setdefault('clm_step', [])
                seconds = second_langs.setdefault('clm_step', [])
                owners = step_tasks.setdefault('clm_step', [])
                for src, tgt in shuf_order(task.clm_steps, params):
                    firsts.append(src)
                    seconds.append(tgt)
                    owners.append(lgs)

                # MLM
                firsts = first_langs.setdefault('mlm_step', [])
                seconds = second_langs.setdefault('mlm_step', [])
                owners = step_tasks.setdefault('mlm_step', [])
                for src, tgt in shuf_order(task.mlm_steps, params):
                    firsts.append(src)
                    seconds.append(tgt)
                    owners.append(lgs)

                # parallel classification
                firsts = first_langs.setdefault('pc_step', [])
                seconds = second_langs.setdefault('pc_step', [])
                owners = step_tasks.setdefault('pc_step', [])
                for src, tgt in shuf_order(task.pc_steps, params):
                    firsts.append(src)
                    seconds.append(tgt)
                    owners.append(lgs)

                # denoising auto-encoder (single language per step)
                firsts = first_langs.setdefault('ae_step', [])
                owners = step_tasks.setdefault('ae_step', [])
                for single in shuf_order(task.ae_steps):
                    firsts.append(single)
                    owners.append(lgs)

                # machine translation
                firsts = first_langs.setdefault('mt_step', [])
                seconds = second_langs.setdefault('mt_step', [])
                owners = step_tasks.setdefault('mt_step', [])
                for src, tgt in shuf_order(task.mt_steps, params):
                    firsts.append(src)
                    seconds.append(tgt)
                    owners.append(lgs)

                # back-translation
                firsts = first_langs.setdefault('bt_step', [])
                seconds = second_langs.setdefault('bt_step', [])
                thirds = third_langs.setdefault('bt_step', [])
                owners = step_tasks.setdefault('bt_step', [])
                for src, pivot, tgt in shuf_order(task.bt_steps):
                    firsts.append(src)
                    seconds.append(pivot)
                    thirds.append(tgt)
                    owners.append(lgs)

            # Equivalent to "while not done do" in the original algorithm:
            # keep going as long as at least one step reports a truthy
            # result; the epoch ends once all of them are falsy.
            keep_going = True
            while keep_going:
                # CLM steps
                a = trainer.clm_step(first_langs['clm_step'],
                                     second_langs['clm_step'],
                                     params.lambda_clm,
                                     step_tasks['clm_step'])

                # MLM steps (also includes TLM if the second language is set)
                b = trainer.mlm_step(first_langs['mlm_step'],
                                     second_langs['mlm_step'],
                                     params.lambda_mlm,
                                     step_tasks['mlm_step'])

                # parallel classification steps
                c = trainer.pc_step(first_langs['pc_step'],
                                    second_langs['pc_step'],
                                    params.lambda_pc,
                                    step_tasks['pc_step'])

                if isinstance(trainer, EncDecTrainer):
                    # denoising auto-encoder steps
                    d = trainer.mt_step(first_langs['ae_step'],
                                        first_langs['ae_step'],
                                        params.lambda_ae,
                                        step_tasks['ae_step'])

                    # machine translation steps
                    e = trainer.mt_step(first_langs['mt_step'],
                                        second_langs['mt_step'],
                                        params.lambda_mt,
                                        step_tasks['mt_step'])

                    # back-translation steps
                    f = trainer.bt_step(first_langs['bt_step'],
                                        second_langs['bt_step'],
                                        third_langs['bt_step'],
                                        params.lambda_bt,
                                        step_tasks['bt_step'])

                    keep_going = any((a, b, c, d, e, f))
                else:
                    keep_going = any((a, b, c))

                trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        log_scores(scores)
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)

        # run the garbage collector after each epoch and log its result
        logger.info("============ garbage collector collecting %d ..." %
                    gc.collect())
def main(params):
    """Train, or only evaluate, a language / translation model.

    Builds either a single (encoder-only) model or an encoder-decoder pair,
    then evaluates once and exits when ``params.eval_only`` is set, or
    trains for ``params.max_epoch`` epochs otherwise. Back-translation steps
    receive ``params.bt_sample_temperature``, and the best model is only
    saved when ``params.validation_metrics`` is not the empty string.

    Args:
        params: experiment configuration namespace.
    """
    init_distributed_mode(params)      # multi-GPU / multi-node training
    logger = initialize_exp(params)    # experiment + logger
    init_signal_handler()              # SLURM time limit / pre-emption
    data = load_data(params)

    def log_metrics(metrics):
        """Log every score as ``name -> value``."""
        for name, value in metrics.items():
            logger.info("%s -> %.6f" % (name, value))

    # model, trainer (reloads potential checkpoints) and evaluator
    if params.encoder_only:
        model = build_model(params, data['dico'])
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        encoder, decoder = build_model(params, data['dico'])
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: score once, log, stop
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        log_metrics(scores)
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)
        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:
            # causal language model
            for src, tgt in shuf_order(params.clm_steps, params):
                trainer.clm_step(src, tgt, params.lambda_clm)

            # masked LM (TLM as well when the second language is set)
            for src, tgt in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(src, tgt, params.lambda_mlm)

            # denoising auto-encoding: translate a language into itself
            for single in shuf_order(params.ae_steps):
                trainer.mt_step(single, single, params.lambda_ae)

            # machine translation
            for src, tgt in shuf_order(params.mt_steps, params):
                trainer.mt_step(src, tgt, params.lambda_mt)

            # back-translation, with the configured sampling temperature
            for src, pivot, tgt in shuf_order(params.bt_steps):
                trainer.bt_step(src, pivot, tgt, params.lambda_bt,
                                params.bt_sample_temperature)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate, then print / JSON log (JSON line on the master only)
        scores = evaluator.run_all_evals(trainer)
        log_metrics(scores)
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch; without validation metrics there is no "best" model
        if params.validation_metrics != '':
            trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Train a small (layer-cut) model with the help of a big one.

    A big model is built with ``cut=False`` and wrapped in a regular
    trainer; a small model is built with ``cut=True`` and wrapped in a
    ``small_*`` trainer, which is the one that is stepped, evaluated and
    checkpointed. The big trainer is passed to the small trainer's CLM, MLM,
    auto-encoding and translation steps.

    Args:
        params: experiment configuration namespace. ``params.cut_layer``
            must be set. The ``lambda_*`` attributes are overwritten with
            the string "1" after the big trainer has been built.

    Raises:
        ValueError: if ``params.cut_layer`` is not set. The small model is
            only built in that case, and everything after it depends on it.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # Fail early with a clear message: previously a run without cut_layer
    # built the big model and trainer and then crashed with
    # UnboundLocalError on the first use of the small model.
    if not params.cut_layer:
        raise ValueError(
            "params.cut_layer must be set: this entry point trains the "
            "small (cut) model and has nothing to train otherwise"
        )

    # load data
    data = load_data(params)

    # build the big model
    if params.encoder_only:
        big_model = build_model(params, data['dico'], cut=False)
    else:
        # modification point 1
        big_encoder, big_decoder = build_model(params, data['dico'],
                                               cut=False)

    # layers are cut, so a small model must be built as well
    if params.encoder_only:
        small_model = build_model(params, data['dico'], cut=True)
    else:
        # modification point 1
        small_encoder, small_decoder = build_model(params, data['dico'],
                                                   cut=True)

    # build the big trainer, reload potential checkpoints
    # (it is never evaluated on its own, so it gets no evaluator)
    if params.encoder_only:
        big_trainer = SingleTrainer(big_model, data, params)
    else:
        big_trainer = EncDecTrainer(big_encoder, big_decoder, data, params)

    # Reset every objective coefficient to "1" before the small trainer is
    # built, so only the small trainer sees these values.
    # NOTE(review): presumably the trainer constructors parse the lambda_*
    # strings - confirm against the trainer implementation.
    params.lambda_mlm = "1"
    params.lambda_clm = "1"
    params.lambda_pc = "1"
    params.lambda_ae = "1"
    params.lambda_mt = "1"
    params.lambda_bt = "1"

    # build the small trainer, and use it for the evaluator
    if params.encoder_only:
        small_trainer = small_SingleTrainer(small_model, data, params)
        evaluator = SingleEvaluator(small_trainer, data, params)
    else:
        small_trainer = small_EncDecTrainer(small_encoder, small_decoder,
                                            data, params)
        evaluator = EncDecEvaluator(small_trainer, data, params)

    # evaluation only for the small trainer
    if params.eval_only:
        scores = evaluator.run_all_evals(small_trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    small_trainer.epoch)

        small_trainer.n_sentences = 0

        while small_trainer.n_sentences < small_trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                small_trainer.clm_step(lang1, lang2, params.lambda_clm,
                                       big_trainer)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                small_trainer.mlm_step(lang1, lang2, params.lambda_mlm,
                                       big_trainer)

            # parallel classification steps (no big trainer involved)
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                small_trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                small_trainer.mt_step(lang, lang, params.lambda_ae,
                                      big_trainer)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                small_trainer.mt_step(lang1, lang2, params.lambda_mt,
                                      big_trainer)

            # back-translation steps (no big trainer involved)
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                small_trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            small_trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    small_trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(small_trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        small_trainer.save_best_model(scores)
        small_trainer.save_periodic()
        small_trainer.end_epoch(scores)