def main(params):
    """Standard training entry point with fp16 / DistributedDataParallel support.

    Sets up distributed training, builds the model(s), optionally converts
    them to half precision and wraps them for multi-GPU training, then runs
    the training loop over every enabled objective (CLM, MLM, PC, AE, MASS,
    MT, BT).

    Args:
        params: parsed experiment configuration; attributes such as
            `encoder_only`, `fp16`, `multi_gpu`, the `*_steps` lists and the
            `lambda_*` coefficients drive the branches below.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # float16: apex-style half conversion requires cuDNN
    if params.fp16:
        assert torch.backends.cudnn.enabled
        if params.encoder_only:
            model = network_to_half(model)
        else:
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

    # distributed wrappers: apex DDP when fp16, torch DDP otherwise
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        if params.fp16:
            if params.encoder_only:
                model = apex.parallel.DistributedDataParallel(
                    model, delay_allreduce=True)
            else:
                encoder = apex.parallel.DistributedDataParallel(
                    encoder, delay_allreduce=True)
                decoder = apex.parallel.DistributedDataParallel(
                    decoder, delay_allreduce=True)
        else:
            if params.encoder_only:
                model = nn.parallel.DistributedDataParallel(
                    model, device_ids=[params.local_rank],
                    output_device=params.local_rank, broadcast_buffers=True)
            else:
                encoder = nn.parallel.DistributedDataParallel(
                    encoder, device_ids=[params.local_rank],
                    output_device=params.local_rank, broadcast_buffers=True)
                decoder = nn.parallel.DistributedDataParallel(
                    decoder, device_ids=[params.local_rank],
                    output_device=params.local_rank, broadcast_buffers=True)

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: log all scores and exit
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps (an MT step with lang1 == lang2)
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # MASS prediction steps
            # (fix: removed an unused `p = np.random.random()` that was
            # computed and immediately discarded here)
            for lang in shuf_order(params.ms_steps):
                trainer.ms_step(lang, params.lambda_ms)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Single-process training entry point (no fp16 / DDP wrapping).

    Builds the model(s), the trainer and the evaluator, then runs the
    training loop over every enabled objective (CLM, MLM, PC, AE, MT, BT).

    Args:
        params: parsed experiment configuration.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: log all scores and exit
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # (fix: removed commented-out debug statements `print(params)` / `input()`)

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps (an MT step with lang1 == lang2)
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Adapter-aware training entry point.

    When `params.use_adapters` is set, all encoder parameters are frozen
    except the adapter modules, the embeddings, the prediction layer and the
    embedding layer norm; trainable parameter counts are logged so the
    freezing can be verified.

    Args:
        params: parsed experiment configuration.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model (reload-model options are handled inside build_model)
    if params.encoder_only:
        model = build_model(params, data['dico'])
        if params.use_adapters:
            logger.info("Using adapters")
            # freeze everything whose name does not start with "adapters" ...
            for name, param in model.named_parameters():
                if not name.startswith("adapters"):
                    param.requires_grad = False
            # ... then re-enable embeddings, prediction layer and layer norm
            for _, param in model.embeddings.named_parameters():
                param.requires_grad = True
            for _, param in model.position_embeddings.named_parameters():
                param.requires_grad = True
            for _, param in model.pred_layer.named_parameters():
                param.requires_grad = True
            for param in model.layer_norm_emb.parameters():
                param.requires_grad = True
            # log the final requires_grad state of every parameter
            for name, param in model.named_parameters():
                logger.info(name + ' required grad = ' +
                            str(param.requires_grad))
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
        logger.info("Number of trainable parameters (encoder): %i" % sum(
            p.numel() for p in trainer.model.parameters() if p.requires_grad))
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)
        logger.info(
            "Number of trainable parameters (encoder): %i" %
            sum(p.numel() for p in encoder.parameters() if p.requires_grad))
        logger.info(
            "Number of trainable parameters (decoder): %i" %
            sum(p.numel() for p in decoder.parameters() if p.requires_grad))

    # evaluation-only mode: log all scores and exit
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps (an MT step with lang1 == lang2)
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Image-language pre-training entry point (text + visual objectives).

    Compared to the text-only variants, this adds an image-language
    pre-training step (`ipm_step`) and cross-modal MLM steps (`cmlm_step`),
    and tracks `trainer.n_images` alongside `trainer.n_sentences` so an
    epoch runs until both budgets are exhausted.

    Args:
        params: parsed experiment configuration.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # # initialize SLURM signal handler for time limit / pre-emption
    # init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # NOTE(review): the fp16 / distributed wrapping present in sibling
    # variants is commented out here — presumably this fork runs
    # single-GPU full-precision only; confirm before re-enabling.
    # # float16
    # if params.fp16:
    #     assert torch.backends.cudnn.enabled
    #     if params.encoder_only:
    #         model = network_to_half(model)
    #     else:
    #         encoder = network_to_half(encoder)
    #         decoder = network_to_half(decoder)

    # # distributed
    # if params.multi_gpu:
    #     logger.info("Using nn.parallel.DistributedDataParallel ...")
    #     if params.fp16:
    #         if params.encoder_only:
    #             model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
    #         else:
    #             encoder = apex.parallel.DistributedDataParallel(encoder, delay_allreduce=True)
    #             decoder = apex.parallel.DistributedDataParallel(decoder, delay_allreduce=True)
    #     else:
    #         if params.encoder_only:
    #             model = nn.parallel.DistributedDataParallel(model, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True)
    #         else:
    #             encoder = nn.parallel.DistributedDataParallel(encoder, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True)
    #             decoder = nn.parallel.DistributedDataParallel(decoder, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True)

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # NOTE(review): the eval-only block (including its exit()) is commented
    # out in the original — confirm that eval-only mode is intentionally
    # unsupported in this variant.
    # # evaluation
    # if params.eval_only:
    #     scores = evaluator.run_all_evals(trainer)
    #     for k, v in scores.items():
    #         logger.info("%s -> %.6f" % (k, v))
    #     logger.info("__log__:%s" % json.dumps(scores))
    #     exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        trainer.n_images = 0

        # run until both the text and the image budget are exhausted
        while trainer.n_sentences < trainer.epoch_size or trainer.n_images < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            # shuf_order's result could be: ['fr', 'fr'] or ['en', 'fr'] or ['fr', 'en'] or ['en', 'en']
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # image-language pre-training steps
            # NOTE(review): the dataset name "coco36" is hard-coded here —
            # consider making it a parameter.
            trainer.ipm_step("coco36", params.lambda_ipm)

            # CMLM (cross-modal MLM) steps
            for m1, m2 in shuf_order(params.cmlm_steps, params):
                trainer.cmlm_step(m1, m2, params.lambda_cmlm)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)
        # NOTE(review): unlike the sibling variants, there is no per-epoch
        # evaluation / checkpoint saving here — confirm this is intentional.
def main(params):
    """Meta-learning training entry point.

    `params.meta_params` maps each (meta-)task key (a language-set string)
    to that task's own parameters and step lists. When
    `params.meta_learning` is set, one combined step per objective is run
    across all tasks at once (the per-task language lists are prepared
    beforehand); otherwise this behaves like the standard training loop.

    Args:
        params: parsed experiment configuration with a `meta_params` dict.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    # (temporarily replace meta_params with a placeholder: it is too long
    # to be logged by initialize_exp, then restore it afterwards)
    meta_params = copy.deepcopy(params).meta_params
    params.meta_params = "..."
    logger = initialize_exp(params)
    params.meta_params = meta_params

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # TODO: choose a proper params.n_words (we take the one from the first
    # task that has this parameter for the moment)
    """
    But we think that if all the task data are based on the same vocabulary, all these parameters will be the same, and therefore no problem if we choose one at random.
    """
    p = params.meta_params[data['key']]

    # build model
    if params.encoder_only:
        model = build_model(params=p, dico=data['dico'])
    else:
        encoder, decoder = build_model(params=p, dico=data['dico'])

    # TODO: choose proper pad_index and eos_index and ... (taken from the
    # first task for the moment)
    """
    But we think that if all the task data are based on the same vocabulary, all these parameters will be the same, and therefore no problem if we choose one at random.
    """
    params.pad_index = p.pad_index
    params.eos_index = p.eos_index

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        if not params.meta_learning:
            for k, v in scores.items():
                logger.info("%s -> %.6f" % (k, v))
        else:
            # per-task scores first, then the aggregated ones
            for lgs in params.meta_params.keys():
                logger.info("============ task : %s " % lgs)
                for k, v in scores[lgs].items():
                    if k != "epoch":
                        logger.info("%s -> %.6f" % (k, v))
            logger.info("============ all")
            for k, v in scores.items():
                if not (k in (list(params.meta_params.keys()) + ['epoch'])):
                    logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        if not params.meta_learning:

            trainer.n_sentences = 0

            while trainer.n_sentences < trainer.epoch_size:

                # CLM steps
                for lang1, lang2 in shuf_order(params.clm_steps, params):
                    trainer.clm_step(lang1, lang2, params.lambda_clm)

                # MLM steps (also includes TLM if lang2 is not None)
                for lang1, lang2 in shuf_order(params.mlm_steps, params):
                    trainer.mlm_step(lang1, lang2, params.lambda_mlm)

                # parallel classification steps
                for lang1, lang2 in shuf_order(params.pc_steps, params):
                    trainer.pc_step(lang1, lang2, params.lambda_pc)

                # denoising auto-encoder steps
                for lang in shuf_order(params.ae_steps):
                    trainer.mt_step(lang, lang, params.lambda_ae)

                # machine translation steps
                for lang1, lang2 in shuf_order(params.mt_steps, params):
                    trainer.mt_step(lang1, lang2, params.lambda_mt)

                # back-translation steps
                for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                    trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

                trainer.iter()
        else:
            # our: meta-learning epoch — one sentence counter per task
            trainer.n_sentences = {}

            """
            Here we build language lists for each of our meta-taks. Indeed, for two language lists l1 and l2, the objective will be done with l1[i] and l2[i] respectively, this for each index i of the two lists.
            """
            lang1_dic, lang2_dic, lang3_dic = {}, {}, {}
            """
            In the case of meta-learning, we have a (meta-)data dictionary for each (meta-)task, so the keys are the languages conserved by the task.
            """
            data_keys_dic = {}

            # equivalent to "for task in list of tasks" in the original
            # algorithm, except here we prepare all the tasks beforehand
            for lgs in params.meta_params.keys():

                trainer.n_sentences[lgs] = 0

                # CLM: initialize the per-objective lists on first task,
                # then append this task's (lang1, lang2, task-key) triples
                try:
                    lang1_dic['clm_step']
                except KeyError:
                    lang1_dic['clm_step'], lang2_dic['clm_step'], data_keys_dic['clm_step'] = [], [], []
                for lang1, lang2 in shuf_order(params.meta_params[lgs].clm_steps, params):
                    lang1_dic['clm_step'].append(lang1)
                    lang2_dic['clm_step'].append(lang2)
                    data_keys_dic['clm_step'].append(lgs)

                # MLM
                try:
                    lang1_dic['mlm_step']
                except KeyError:
                    lang1_dic['mlm_step'], lang2_dic['mlm_step'], data_keys_dic['mlm_step'] = [], [], []
                for lang1, lang2 in shuf_order(params.meta_params[lgs].mlm_steps, params):
                    lang1_dic['mlm_step'].append(lang1)
                    lang2_dic['mlm_step'].append(lang2)
                    data_keys_dic['mlm_step'].append(lgs)

                # parallel classification
                try:
                    lang1_dic['pc_step']
                except KeyError:
                    lang1_dic['pc_step'], lang2_dic['pc_step'], data_keys_dic['pc_step'] = [], [], []
                for lang1, lang2 in shuf_order(params.meta_params[lgs].pc_steps, params):
                    lang1_dic['pc_step'].append(lang1)
                    lang2_dic['pc_step'].append(lang2)
                    data_keys_dic['pc_step'].append(lgs)

                # denoising auto-encoder (single-language steps)
                try:
                    lang1_dic['ae_step']
                except KeyError:
                    lang1_dic['ae_step'], data_keys_dic['ae_step'] = [], []
                for lang1 in shuf_order(params.meta_params[lgs].ae_steps):
                    lang1_dic['ae_step'].append(lang1)
                    data_keys_dic['ae_step'].append(lgs)

                # machine translation
                try:
                    lang1_dic['mt_step']
                except KeyError:
                    lang1_dic['mt_step'], lang2_dic['mt_step'], data_keys_dic['mt_step'] = [], [], []
                for lang1, lang2 in shuf_order(params.meta_params[lgs].mt_steps, params):
                    lang1_dic['mt_step'].append(lang1)
                    lang2_dic['mt_step'].append(lang2)
                    data_keys_dic['mt_step'].append(lgs)

                # back-translation (language triples)
                try:
                    lang1_dic['bt_step']
                except KeyError:
                    lang1_dic['bt_step'], lang2_dic['bt_step'], lang3_dic['bt_step'], data_keys_dic['bt_step'] = [], [], [], []
                for lang1, lang2, lang3 in shuf_order(params.meta_params[lgs].bt_steps):
                    lang1_dic['bt_step'].append(lang1)
                    lang2_dic['bt_step'].append(lang2)
                    lang3_dic['bt_step'].append(lang3)
                    data_keys_dic['bt_step'].append(lgs)

            flag = True

            # equivalent to "while not done do" in the original algorithm;
            # each *_step presumably returns a truthy value while it still
            # has work left for this epoch — TODO confirm in the trainer
            while flag:

                # CLM steps
                a = trainer.clm_step(lang1_dic['clm_step'], lang2_dic['clm_step'], params.lambda_clm, data_keys_dic['clm_step'])

                # MLM steps (also includes TLM if lang2 is not None)
                b = trainer.mlm_step(lang1_dic['mlm_step'], lang2_dic['mlm_step'], params.lambda_mlm, data_keys_dic['mlm_step'])

                # parallel classification steps
                c = trainer.pc_step(lang1_dic['pc_step'], lang2_dic['pc_step'], params.lambda_pc, data_keys_dic['pc_step'])

                if isinstance(trainer, EncDecTrainer):

                    # denoising auto-encoder steps
                    d = trainer.mt_step(lang1_dic['ae_step'], lang1_dic['ae_step'], params.lambda_ae, data_keys_dic['ae_step'])

                    # machine translation steps
                    e = trainer.mt_step(lang1_dic['mt_step'], lang2_dic['mt_step'], params.lambda_mt, data_keys_dic['mt_step'])

                    # back-translation steps
                    f = trainer.bt_step(lang1_dic['bt_step'], lang2_dic['bt_step'], lang3_dic['bt_step'], params.lambda_bt, data_keys_dic['bt_step'])

                    # stop once every objective reports it is done
                    if (not a) and (not b) and (not c) and (not d) and (not e) and (not f):
                        flag = False  # End of epoch
                    else:
                        flag = True
                else:
                    # encoder-only: only the first three objectives apply
                    if (not a) and (not b) and (not c):
                        flag = False  # End of epoch
                    else:
                        flag = True

                trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        if not params.meta_learning:
            for k, v in scores.items():
                logger.info("%s -> %.6f" % (k, v))
        else:
            # per-task scores first, then the aggregated ones
            for lgs in params.meta_params.keys():
                logger.info("============ task : %s " % lgs)
                for k, v in scores[lgs].items():
                    if k != "epoch":
                        logger.info("%s -> %.6f" % (k, v))
            logger.info("============ all")
            for k, v in scores.items():
                if not (k in (list(params.meta_params.keys()) + ['epoch'])):
                    logger.info("%s -> %.6f" % (k, v))

        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)

        # our: force a garbage collection between epochs and log the count
        logger.info("============ garbage collector collecting %d ..." % gc.collect())
def main(params):
    """Training entry point with optional deterministic seeding, visual
    language modeling (VLM) steps and TensorBoard score logging.

    Args:
        params: parsed experiment configuration; `other_seed` / `iter_seed`
            control determinism, `only_vlm` switches MLM steps to VLM steps,
            `save_initial` dumps the untrained weights.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    if params.other_seed > -1:
        # deterministic: seed every RNG used during training
        torch.manual_seed(params.other_seed)
        torch.cuda.manual_seed(params.other_seed)
        np.random.seed(params.other_seed)
        random.seed(params.other_seed)

    # NOTE(review): reconstructed as a top-level check (independent of
    # other_seed) — confirm against the original layout.
    if params.iter_seed == -1:
        # non-deterministic iteration seed
        params.iter_seed = None

    # load data
    data = load_data(params)

    # TensorBoard writer for per-evaluation scores
    writer = SummaryWriter(params.dump_path + "/" + params.exp_name + "_log")

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: log all scores and exit
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        sys.exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    _iter = 0  # global step counter passed to the steps and TensorBoard

    # dump initial weights before any update
    if params.save_initial:
        trainer.save_checkpoint('initial', include_optimizers=False)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                if params.only_vlm:
                    # with visual features
                    trainer.vlm_step(lang1, lang2, params.lambda_mlm, _iter)
                else:
                    trainer.mlm_step(lang1, lang2, params.lambda_mlm, _iter)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps (an MT step with lang1 == lang2)
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # multimodal machine translation steps
            for lang1, lang2 in shuf_order(params.mmt_steps, params):
                trainer.mmt_step(lang1, lang2, params.lambda_mt)

            trainer.iter()
            _iter += 1

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log (scores also go to TensorBoard)
        for k, v in scores.items():
            writer.add_scalar(k, v, _iter)
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)

    # fix: flush and release the TensorBoard writer (it was never closed,
    # so pending events could be lost on exit)
    writer.close()
def main(params):
    """Multi-domain / curriculum NMT entry point.

    Besides standard training, this variant supports two one-shot feature
    extraction modes (`build_nmt_domain_feature` / `build_nlm_domain_feature`)
    that dump domain features to disk and return early, plus curriculum,
    dual-encoder and multi-domain trainer/evaluator combinations with
    periodic per-domain sampling-ratio updates during training.

    Args:
        params: parsed experiment configuration.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # one-shot mode: compute NMT-based domain features, save them and exit
    if params.build_nmt_domain_feature:
        import torch
        from src.curriculum import build_nmt_domain_feature
        # NOTE(review): the ('de', 'en') training pair is hard-coded here
        dataset = data['para'][('de', 'en')]['train']
        batches, indices = dataset.get_iterator(
            shuffle=False,
            group_by_size=params.group_by_size,
            n_sentences=-1,
        )
        features = build_nmt_domain_feature(data, params, batches, dataset)
        result = {'indices': indices, 'domain_feature': features}
        torch.save(result, params.build_output_path)
        return

    # one-shot mode: compute NLM-based domain features/scores and exit
    if params.build_nlm_domain_feature:
        import torch
        from src.curriculum import build_nlm_domain_feature
        dataset = data['para'][('de', 'en')]['train']
        batches, indices = dataset.get_iterator(
            shuffle=False,
            group_by_size=params.group_by_size,
            n_sentences=-1,
        )
        # dataset = data['mono_stream']['en']['train']
        # indices = dataset.get_iterator(
        #     shuffle=False
        # )
        features, domain_score, sents = build_nlm_domain_feature(
            data, params, batches, dataset)
        result = {
            'indices': indices,
            'domain_feature': features,
            'domain_score': domain_score,
            'sents': sents
        }
        # use the given path if it already names a .pth file, otherwise
        # treat it as a directory and append a default file name
        if params.build_output_path.endswith('pth'):
            build_output_path = params.build_output_path
        else:
            build_output_path = f'{params.build_output_path}/final.pth'
        torch.save(result, build_output_path)
        return

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    elif params.dual_encoder:
        encoder1, encoder2 = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    elif params.domains:
        # domain-aware training: pick the trainer by mode
        if params.curriculum_learning:
            trainer = CurriculumTrainer(encoder, decoder, data, params)
        elif params.dual_encoder:
            trainer = DualEncoderTrainer(encoder1, encoder2, data, params)
        else:
            trainer = MultiDomainTrainer(encoder, decoder, data, params)
        # local adaptation uses the meta evaluator regardless of trainer
        if params.local_adapt:
            evaluator = MetaMultiDomainEvaluator(trainer, data, params)
        else:
            if params.curriculum_learning:
                evaluator = EncDecEvaluator(trainer, data, params)
            elif params.dual_encoder:
                evaluator = DualEncoderEvaluator(trainer, data, params)
            else:
                evaluator = MultiDomainEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: log all scores and exit
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

            # periodically re-estimate the per-domain sampling ratio
            # (skipped for dual-encoder training and uniform sampling)
            if not params.dual_encoder and params.domains and params.domain_ratio_update_freq > 0 and trainer.n_total_iter % params.domain_ratio_update_freq == 0 and not params.sampling_uniform:
                evaluator.update_language_sampler_multidomain()
                evaluator.update_dataset_ratio(trainer)

            # periodically reset the per-domain sampling ratio
            if not params.dual_encoder and params.domain_reset_freq > 0 and trainer.n_total_iter % params.domain_reset_freq == 0:
                evaluator.reset_dataset_ratio(trainer)

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)