def main(params):
    """Entry point: set up distributed training, build the model and trainer,
    and (in eval-only mode) run generation then stop."""
    # distributed setup / experiment bookkeeping / SLURM pre-emption handler
    init_distributed_mode(params)
    logger = initialize_exp(params)  # kept for its side effects (dump dirs, logging)
    init_signal_handler()

    # data
    data = load_data(params)

    # model + trainer + evaluator (single branch instead of two parallel ifs)
    if params.encoder_only:
        model = build_model(params, data['dico'])
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        encoder, decoder = build_model(params, data['dico'])
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecGenerator(trainer, data, params)

    # generation-only mode: produce outputs and stop
    if params.eval_only:
        evaluator.generate(trainer)
        exit()
def main(params):
    """Train an LSTM encoder / attention-decoder MT model, evaluating every epoch."""
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    # load data
    data = load_data(params)

    # build the seq2seq model; hidden size is fixed here, vocabulary size
    # comes from the loaded data params
    hidden_size = 1024
    encoder = EncoderRNN.EncoderRNN(params.n_words, hidden_size).cuda()
    decoder = Attention_decoder.Attention_decoder(hidden_size, params.n_words, dropout_p=0.1).cuda()

    trainer = LSTM_Trainer(encoder, decoder, data, params)
    evaluator = LSTM_Evaluator(trainer, data, params)

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # training loop
    for count in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)
        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            # one MT step per (lang1, lang2) direction, in shuffled order
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.try_lstm(lang1, lang2, params.lambda_mt)
        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch bookkeeping
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)

        # save the output of softmax
        # NOTE(review): `clm_temp`, `ml_temp` and `bt_temp` are not defined
        # anywhere in this function — these three calls raise NameError at
        # runtime. Confirm where these values were supposed to come from.
        trainer.save_softmax_output(clm_temp, 'clm_temp')
        trainer.save_softmax_output(ml_temp, 'ml_temp')
        trainer.save_softmax_output(bt_temp, 'bt_temp')
def main(params, params_pretrain, trainer_class):
    """Build the (optionally pre-trained) model, its datasets and optimizers,
    then hand everything to `trainer_class` for training or evaluation."""
    # distributed setup / experiment bookkeeping / SLURM pre-emption handler
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    # Model: when pre-training is enabled, mirror a few distributed-training
    # attributes onto the pre-training params, then build the pre-trainer
    # and its evaluator; otherwise run without them.
    if params.pretrain:
        for shared_attr in ('n_gpu_per_node', 'multi_gpu', 'is_master'):
            setattr(params_pretrain, shared_attr, getattr(params, shared_attr))
        pre_trainer, evaluator, _ = get_trainer_evaluator(params_pretrain, logger)
    else:
        pre_trainer, evaluator = None, None

    model = build_model(params, logger, pre_trainer=pre_trainer)

    # Data
    train_dataset, val_dataset = load_dataset(params, logger, model)

    # Optimizers (none are needed when only evaluating)
    if params.eval_only:
        optimizers = []
    else:
        optimizers = model.get_optimizers(params)

    # Trainer
    trainer = trainer_class(params_pretrain, params, model, optimizers,
                            train_dataset, val_dataset, logger, pre_trainer,
                            evaluator)
    if params.pretrain:
        # the embedder must be the exact same object held by the pre-trainer
        assert trainer.model.embedder.model is getattr(pre_trainer, params.reload_key)

    # Run train / evaluation
    logger.info("")
    if params.eval_only:
        trainer.eval(get_loss, end_of_epoch)
    else:
        trainer.train(get_loss, end_of_epoch)
def main(params):
    """Debug script: iterate over the parallel test set and sanity-check the
    prediction-mask construction used for teacher forcing.

    For each batch it rebuilds `pred_mask` (everything except the last target
    word) and asserts the number of selected target tokens matches
    ``sum(len2 - 1)``.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    # load data
    data = load_data(params)

    # language pair is stored under its lexicographically sorted key
    _lang1, _lang2 = (
        params.langs[0], params.langs[1]) if params.langs[0] < params.langs[1] else (
        params.langs[1], params.langs[0])
    dataset = data['para'][(_lang1, _lang2)]['test']
    print(params.n_words)
    print("ref_paths" + str(params.ref_paths))

    # iterate deterministically (no shuffle) over size-grouped batches
    for i, ((x1, len1, id1, lenid1), (x2, len2, id2, lenid2)) in enumerate(
            dataset.get_iterator(shuffle=False,
                                 group_by_size=True,
                                 n_sentences=-1,
                                 tokens_per_batch=2000)):
        print('x2' + str(x2.size()))
        print("len2[None] - 1" + str(len2[None] - 1) + " " + str(len2[None]))
        print(str(len2[0]))
        print('len2' + str(len2))
        # positions 0..max_len-1, on the same device as the lengths
        alen = torch.arange(len2.max(), dtype=torch.long, device=len2.device)
        # do not predict anything given the last target word
        pred_mask = alen[:, None] < len2[None] - 1
        print("pred_mask" + str(pred_mask))
        print(str(pred_mask.size()))
        # targets are the words shifted by one position
        y = x2[1:].masked_select(pred_mask[:-1])
        print("yyyy" + str(y))
        print(str(y.size()))
        # one prediction per non-final token of each sentence
        assert len(y) == (len2 - 1).sum().item()
def main(params):
    """XLM entry point with an optional meta-learning mode.

    Standard mode follows the usual XLM training loop; when
    ``params.meta_learning`` is set, each (meta-)task carries its own
    parameters and data under ``params.meta_params`` and the per-objective
    steps are batched across tasks.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment.
    # `meta_params` is temporarily replaced while logging the config because
    # it is too long to be logged, then restored afterwards.
    meta_params = copy.deepcopy(params).meta_params
    params.meta_params = "..."  # too long to be logged
    logger = initialize_exp(params)
    params.meta_params = meta_params

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # todo : good params.n_words (We take the one from the first task have this parameter for the moment.)
    """
    But we think that if all the task data are based on the same vocabulary,
    all these parameters will be the same, and therefore no problem if we
    choose one at random.
    """
    p = params.meta_params[data['key']]

    # build model
    if params.encoder_only:
        model = build_model(params=p, dico=data['dico'])
    else:
        encoder, decoder = build_model(params=p, dico=data['dico'])

    # todo : good pad_index and eos_index and ... (I'll take the one from the first task for the moment.)
    """
    But we think that if all the task data are based on the same vocabulary,
    all these parameters will be the same, and therefore no problem if we
    choose one at random.
    """
    params.pad_index = p.pad_index
    params.eos_index = p.eos_index

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        if not params.meta_learning:
            for k, v in scores.items():
                logger.info("%s -> %.6f" % (k, v))
        else:
            # per-task scores first, then the aggregated (non-task) keys
            for lgs in params.meta_params.keys():
                logger.info("============ task : %s " % lgs)
                for k, v in scores[lgs].items():
                    if k != "epoch":
                        logger.info("%s -> %.6f" % (k, v))
            logger.info("============ all")
            for k, v in scores.items():
                if not (k in (list(params.meta_params.keys()) + ['epoch'])):
                    logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)
        if not params.meta_learning:
            trainer.n_sentences = 0
            while trainer.n_sentences < trainer.epoch_size:
                # CLM steps
                for lang1, lang2 in shuf_order(params.clm_steps, params):
                    trainer.clm_step(lang1, lang2, params.lambda_clm)
                # MLM steps (also includes TLM if lang2 is not None)
                for lang1, lang2 in shuf_order(params.mlm_steps, params):
                    trainer.mlm_step(lang1, lang2, params.lambda_mlm)
                # parallel classification steps
                for lang1, lang2 in shuf_order(params.pc_steps, params):
                    trainer.pc_step(lang1, lang2, params.lambda_pc)
                # denoising auto-encoder steps
                for lang in shuf_order(params.ae_steps):
                    trainer.mt_step(lang, lang, params.lambda_ae)
                # machine translation steps
                for lang1, lang2 in shuf_order(params.mt_steps, params):
                    trainer.mt_step(lang1, lang2, params.lambda_mt)
                # back-translation steps
                for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                    trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)
                trainer.iter()
        else:
            # our (meta-learning) branch: one sentence counter per task
            trainer.n_sentences = {}
            """
            Here we build language lists for each of our meta-taks. Indeed,
            for two language lists l1 and l2, the objective will be done with
            l1[i] and l2[i] respectively, this for each index i of the two
            lists.
            """
            lang1_dic, lang2_dic, lang3_dic = {}, {}, {}
            """
            In the case of meta-learning, we have a (meta-)data dictionary
            for each (meta-)task, so the keys are the languages conserved by
            the task.
            """
            data_keys_dic = {}

            # equivalent to "for task in list of task" in the original
            # algorithm, except here we prepare all the tasks beforehand.
            for lgs in params.meta_params.keys():
                trainer.n_sentences[lgs] = 0

                # CLM
                try:
                    lang1_dic['clm_step']
                except KeyError:
                    lang1_dic['clm_step'], lang2_dic['clm_step'], data_keys_dic['clm_step'] = [], [], []
                for lang1, lang2 in shuf_order(params.meta_params[lgs].clm_steps, params):
                    lang1_dic['clm_step'].append(lang1)
                    lang2_dic['clm_step'].append(lang2)
                    data_keys_dic['clm_step'].append(lgs)

                # MLM
                try:
                    lang1_dic['mlm_step']
                except KeyError:
                    lang1_dic['mlm_step'], lang2_dic['mlm_step'], data_keys_dic['mlm_step'] = [], [], []
                for lang1, lang2 in shuf_order(params.meta_params[lgs].mlm_steps, params):
                    lang1_dic['mlm_step'].append(lang1)
                    lang2_dic['mlm_step'].append(lang2)
                    data_keys_dic['mlm_step'].append(lgs)

                # parallel classification
                try:
                    lang1_dic['pc_step']
                except KeyError:
                    lang1_dic['pc_step'], lang2_dic['pc_step'], data_keys_dic['pc_step'] = [], [], []
                for lang1, lang2 in shuf_order(params.meta_params[lgs].pc_steps, params):
                    lang1_dic['pc_step'].append(lang1)
                    lang2_dic['pc_step'].append(lang2)
                    data_keys_dic['pc_step'].append(lgs)

                # denoising auto-encoder
                try:
                    lang1_dic['ae_step']
                except KeyError:
                    lang1_dic['ae_step'], data_keys_dic['ae_step'] = [], []
                for lang1 in shuf_order(params.meta_params[lgs].ae_steps):
                    lang1_dic['ae_step'].append(lang1)
                    data_keys_dic['ae_step'].append(lgs)

                # machine translation
                try:
                    lang1_dic['mt_step']
                except KeyError:
                    lang1_dic['mt_step'], lang2_dic['mt_step'], data_keys_dic['mt_step'] = [], [], []
                for lang1, lang2 in shuf_order(params.meta_params[lgs].mt_steps, params):
                    lang1_dic['mt_step'].append(lang1)
                    lang2_dic['mt_step'].append(lang2)
                    data_keys_dic['mt_step'].append(lgs)

                # back-translation
                try:
                    lang1_dic['bt_step']
                except KeyError:
                    lang1_dic['bt_step'], lang2_dic['bt_step'], lang3_dic['bt_step'], data_keys_dic['bt_step'] = [], [], [], []
                for lang1, lang2, lang3 in shuf_order(params.meta_params[lgs].bt_steps):
                    lang1_dic['bt_step'].append(lang1)
                    lang2_dic['bt_step'].append(lang2)
                    lang3_dic['bt_step'].append(lang3)
                    data_keys_dic['bt_step'].append(lgs)

            flag = True

            # equivalent to "while not done do" in the original algorithm;
            # presumably each *_step returns a falsy value once its task data
            # is exhausted — TODO confirm against the trainer implementation.
            while flag:
                # CLM steps
                #print("clm_step", flag)
                a = trainer.clm_step(lang1_dic['clm_step'], lang2_dic['clm_step'],
                                     params.lambda_clm, data_keys_dic['clm_step'])
                #print("mlm_step", flag)
                # MLM steps (also includes TLM if lang2 is not None)
                b = trainer.mlm_step(lang1_dic['mlm_step'], lang2_dic['mlm_step'],
                                     params.lambda_mlm, data_keys_dic['mlm_step'])
                # parallel classification steps
                c = trainer.pc_step(lang1_dic['pc_step'], lang2_dic['pc_step'],
                                    params.lambda_pc, data_keys_dic['pc_step'])
                if isinstance(trainer, EncDecTrainer):
                    # denoising auto-encoder steps
                    d = trainer.mt_step(lang1_dic['ae_step'], lang1_dic['ae_step'],
                                        params.lambda_ae, data_keys_dic['ae_step'])
                    # machine translation steps
                    e = trainer.mt_step(lang1_dic['mt_step'], lang2_dic['mt_step'],
                                        params.lambda_mt, data_keys_dic['mt_step'])
                    # back-translation steps
                    f = trainer.bt_step(lang1_dic['bt_step'], lang2_dic['bt_step'],
                                        lang3_dic['bt_step'], params.lambda_bt,
                                        data_keys_dic['bt_step'])
                    # do things better
                    if (not a) and (not b) and (not c) and (not d) and (
                            not e) and (not f):
                        flag = False  # End of epoch
                    else:
                        flag = True
                else:
                    # do things better
                    if (not a) and (not b) and (not c):
                        flag = False  # End of epoch
                    else:
                        flag = True
                trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        if not params.meta_learning:
            for k, v in scores.items():
                logger.info("%s -> %.6f" % (k, v))
        else:
            for lgs in params.meta_params.keys():
                logger.info("============ task : %s " % lgs)
                for k, v in scores[lgs].items():
                    if k != "epoch":
                        logger.info("%s -> %.6f" % (k, v))
            logger.info("============ all")
            for k, v in scores.items():
                if not (k in (list(params.meta_params.keys()) + ['epoch'])):
                    logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
        # our
        logger.info("============ garbage collector collecting %d ..." % gc.collect())
def main(params):
    """Wikisum training entry point.

    Builds the global/local encoder + decoder model, then trains it with the
    WS steps, evaluating every epoch. Only the Wikisum configuration
    (``params.WS`` set, ``params.encoder_only`` unset) is supported.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data; the vocabulary path can now be overridden via
    # params.vocab_path — the default preserves the previous hard-coded path
    voc_path = getattr(params, 'vocab_path',
                       "/home/zchen/XLM/data/processed/XLM_en_zh/50k/vocab")
    data = load_wikisum_data(voc_path, params.data_path, params)

    # build model — the trainer below requires the WS model. The old code
    # fell through to `WikisumTrainer(global_encoder, ...)` for every branch
    # and crashed with a NameError for the other configurations, so fail
    # fast with a clear message instead.
    if params.encoder_only or not params.WS:
        raise ValueError(
            "WikisumTrainer requires params.WS=True and params.encoder_only=False")
    global_encoder, local_encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    trainer = WikisumTrainer(global_encoder, local_encoder, decoder, data, params)
    evaluator = WikisumEvaluator(trainer, data, params, data_set="valid")

    # evaluation-only mode
    if params.eval_only:
        scores = evaluator.evaluate(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # training
    for _ in range(params.max_epoch):
        trainer.epoch += 1
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)
        trainer.n_sentences = 0
        for i, batch in enumerate(trainer.dataloader):
            for lang1, lang2 in shuf_order(params.ws_steps, params):
                trainer.step(lang1, lang2, batch)
            trainer.iter()
        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.evaluate(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Entry point for MLM-only training / evaluation."""
    # distributed setup / experiment bookkeeping / SLURM pre-emption handler
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    # data / model / trainer / evaluator
    data = load_data(params)
    model = build_model(params, data['dico'])
    trainer = Trainer(model, data, params)
    evaluator = Evaluator(trainer, data, params)

    # evaluation-only mode: log all scores and stop
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for name, value in scores.items():
            logger.info("%s -> %.6f" % (name, value))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # sampling probabilities
    set_sampling_probs(data, params)

    # training: repeat MLM steps until the per-epoch sentence budget is spent
    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)
        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            trainer.mlm_step(params.lambda_mlm)
            trainer.iter()
        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for name, value in scores.items():
            logger.info("%s -> %.6f" % (name, value))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end-of-epoch bookkeeping
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """MASS-style training entry point with a matching/likelihood evaluation
    mode that dumps per-sentence scores to text/pickle files."""
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        if params.encoder_only:
            model = network_to_half(model)
        else:
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

    # distributed training (apex DDP)
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        if params.encoder_only:
            model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
        else:
            encoder = apex.parallel.DistributedDataParallel(encoder, delay_allreduce=True)
            decoder = apex.parallel.DistributedDataParallel(decoder, delay_allreduce=True)

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: run matching evals and persist predictions
    if params.eval_only:
        logger.info('Evaluating and saving new result file')
        scores = evaluator.run_all_evals_match(trainer)
        # likelihood entries are arrays (log their mean), score entries are
        # matrices (log their shape), everything else is a scalar
        for k, v in scores.items():
            if 'likelihood' in k:
                logger.info("%s -> %.6f" % (k, np.mean(v)))
            elif 'scores' in k:
                logger.info("%s -> %s" % (k, v.shape))
            else:
                logger.info("%s -> %.6f" % (k, v))
        # forward scores for the test split of the first MASS step
        np.savetxt(os.path.join(params.dump_path, 'best-fwd-prediction.txt'),
                   scores['%s_%s_fwd_scores' % ('test', params.mass_steps[0])],
                   fmt='%f')
        # per-file sentence likelihoods
        for match in params.match_files.split(','):
            np.savetxt(os.path.join(params.dump_path,
                                    'best-match-prediction{}.txt'.format(match.split('.')[-1])),
                       scores['%s_%s_sentence_likelihood' % (match, params.mass_steps[0])],
                       fmt='%f')
        # assemble a labels/targets/predictions frame for the 'match' split
        labels = np.loadtxt(os.path.join(params.data_path, 'labels'))
        targets = np.loadtxt(os.path.join(params.data_path, 'suffix'))
        preds = scores['%s_%s_sentence_likelihood' % ('match', params.mass_steps[0])]
        results = pd.DataFrame({'label': labels, 'target': targets, 'pred': preds})
        results.to_pickle(os.path.join(params.dump_path, 'best-matching-prediction.pkl'))
        #logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)
        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            # mass prediction steps
            for lang in shuf_order(params.mass_steps):
                trainer.mass_step(lang, params.lambda_mass)
            trainer.iter()
        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_epoch_evals_match(trainer)

        # print / JSON log (same formatting rules as the eval-only branch)
        for k, v in scores.items():
            if 'likelihood' in k:
                logger.info("%s -> %.6f" % (k, np.mean(v)))
            elif 'scores' in k:
                logger.info("%s -> %s" % (k, v.shape))
            else:
                logger.info("%s -> %.6f" % (k, v))
        #if params.is_master:
        #logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """XLM training entry point supporting CLM / MLM / PC / AE / MASS / MT /
    BT / BMT objectives, with optional fp16."""
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        if params.encoder_only:
            model = network_to_half(model)
        else:
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

    # distributed — deliberately disabled here (kept for reference)
    # if params.multi_gpu:
    #     logger.info("Using nn.parallel.DistributedDataParallel ...")
    #     if params.encoder_only:
    #         model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
    #     else:
    #         encoder = apex.parallel.DistributedDataParallel(encoder, delay_allreduce=True)
    #         decoder = apex.parallel.DistributedDataParallel(decoder, delay_allreduce=True)

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)
        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)
            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)
            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)
            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)
            # mass prediction steps
            for lang in shuf_order(params.mass_steps):
                trainer.mass_step(lang, params.lambda_mass)
            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)
            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)
            # back-parallel steps
            for lang1, lang2 in shuf_order(params.bmt_steps, params):
                trainer.bmt_step(lang1, lang2, params.lambda_bmt)
            trainer.iter()
        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def clts_elmo_main(params):
    """Train / evaluate the cross-lingual text-summarization model
    (ELMo-style cross-lingual encoder + TS encoder/decoder)."""
    # distributed setup / experiment bookkeeping / SLURM pre-emption handler
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    # data
    data = load_data(params)

    # cross lingual encoder
    # cross lingual text summarization encoder, text summarization decoder
    elmo, ts_encoder, ts_decoder = build_clts_elmo_model(params, data['dico'])
    trainer = XLMCLTSEncDecTrainer(elmo, ts_encoder, ts_decoder, data, params)
    evaluator = XLMCLTSEncDecEvaluator(trainer, data, params)

    # evaluation-only mode: log all scores and stop
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for key, value in scores.items():
            logger.info("%s -> %.6f" % (key, value))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # sampling probabilities
    set_sampling_probs(data, params)

    # training: MT steps only
    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)
        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            for src_lang, tgt_lang in shuf_order(params.mt_steps, params):
                trainer.mt_step(src_lang, tgt_lang, params.lambda_mt)
            trainer.iter()
        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for key, value in scores.items():
            logger.info("%s -> %.6f" % (key, value))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end-of-epoch bookkeeping
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Entry point for symbolic-math training: builds the environment and
    modules, then trains (or exports data, or evaluates only)."""
    # initialize the multi-GPU / multi-node training
    # initialize experiment / SLURM signal handler for time limit / pre-emption
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    # CPU / CUDA sanity checks
    if params.cpu:
        assert not params.multi_gpu
    else:
        assert torch.cuda.is_available()
    src.utils.CUDA = not params.cpu

    # build environment / modules / trainer / evaluator
    env = build_env(params)
    modules = build_modules(env, params)
    trainer = Trainer(modules, env, params)
    evaluator = Evaluator(trainer)

    # evaluation-only mode: log all scores and stop
    if params.eval_only:
        scores = evaluator.run_all_evals()
        for name, value in scores.items():
            logger.info("%s -> %.6f" % (name, value))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # training
    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)
        trainer.n_equations = 0
        while trainer.n_equations < trainer.epoch_size:
            # visit the tasks in a fresh random order on every pass
            for task_id in np.random.permutation(len(params.tasks)):
                task = params.tasks[task_id]
                if params.export_data:
                    trainer.export_data(task)
                else:
                    trainer.enc_dec_step(task)
                trainer.iter()
        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals()

        # print / JSON log
        for name, value in scores.items():
            logger.info("%s -> %.6f" % (name, value))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end-of-epoch bookkeeping
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Linear-probe classification: freeze a pretrained backbone, train a
    single linear layer on top of its features, and evaluate each epoch."""
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment / load data
    logger = initialize_exp(params)

    # Seed
    torch.manual_seed(params.seed)
    torch.cuda.manual_seed_all(params.seed)

    # initialize SLURM signal handler for time limit / pre-emption
    if params.is_slurm_job:
        init_signal_handler()

    # data loaders / samplers
    populate_dataset(params)
    train_data_loader, train_sampler, _ = get_data_loader(
        img_size=params.img_size,
        crop_size=params.crop_size,
        shuffle=True,
        batch_size=params.batch_size,
        num_classes=params.num_classes,
        nb_workers=params.nb_workers,
        distributed_sampler=params.multi_gpu,
        dataset=params.dataset,
        data_path=params.train_path,
        transform=params.train_transform,
        # debug mode trains on the (smaller) validation split
        split='valid' if params.debug_train else 'train',
        seed=params.seed)
    valid_data_loader, _, _ = get_data_loader(
        img_size=params.img_size,
        crop_size=params.crop_size,
        shuffle=False,
        batch_size=params.batch_size,
        num_classes=params.num_classes,
        nb_workers=params.nb_workers,
        distributed_sampler=False,
        dataset=params.dataset,
        transform='center',
        split='valid',
        seed=params.seed)

    # build model / cuda: backbone with its classifier head removed
    logger.info("Building %s model ..." % params.architecture)
    ftmodel = build_model(params)
    ftmodel.fc = nn.Sequential()  # strip the final fc so the model emits features
    ftmodel.eval().cuda()
    linearmodel = nn.Linear(EMBEDDING_SIZE[params.architecture],
                            params.num_classes).cuda()

    # optionally load pretrained backbone weights from a checkpoint,
    # dropping the DataParallel "module." prefix and the old fc weights
    if params.from_ckpt != "":
        ckpt = torch.load(params.from_ckpt)
        state_dict = {
            k.replace("module.", ""): v
            for k, v in ckpt['model'].items()
        }
        del state_dict["fc.weight"]
        if "fc.bias" in state_dict:
            del state_dict["fc.bias"]
        missing_keys, unexcepted_keys = ftmodel.load_state_dict(state_dict,
                                                                strict=False)
        print("Missing keys: ", missing_keys)
        print("Unexcepted keys: ", unexcepted_keys)

    # distributed — only the trainable linear layer is wrapped
    # TODO: check this https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main.py#L142
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        linearmodel = nn.parallel.DistributedDataParallel(
            linearmodel,
            device_ids=[params.local_rank],
            output_device=params.local_rank,
            broadcast_buffers=True)

    # build trainer / reload potential checkpoints / build evaluator
    trainer = Trainer(model=linearmodel, params=params, ftmodel=ftmodel)
    trainer.reload_checkpoint()
    evaluator = Evaluator(trainer, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer,
                                         evals=['classif'],
                                         data_loader=valid_data_loader)
        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # training (resumes from trainer.epoch after a checkpoint reload)
    for epoch in range(trainer.epoch, params.epochs):

        # update epoch / sampler / learning rate
        trainer.epoch = epoch
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)
        if params.multi_gpu:
            train_sampler.set_epoch(epoch)

        # update learning rate
        trainer.update_learning_rate()

        # train
        for i, (images, targets) in enumerate(train_data_loader):
            trainer.classif_step(images, targets)
            trainer.iter()
        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate classification accuracy
        scores = evaluator.run_all_evals(trainer,
                                         evals=['classif'],
                                         data_loader=valid_data_loader)
        # merge the trainer's running scores into the evaluation scores
        for name, val in trainer.get_scores().items():
            scores[name] = val

        # print / JSON log
        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(args):
    """
    This code implements the paper: https://arxiv.org/abs/1905.01278
    The method consists in alternating between a hierachical clustering of
    the features and learning the parameters of a convnet by predicting both
    the angle of the rotation applied to the input data and the cluster
    assignments in a single hierachical loss.
    """
    # initialize communication groups
    training_groups, clustering_groups = init_distributed_mode(args)

    # check parameters
    check_parameters(args)

    # initialize the experiment
    logger, training_stats = initialize_exp(args, 'epoch', 'iter', 'prec',
                                            'loss', 'prec_super_class',
                                            'loss_super_class',
                                            'prec_sub_class',
                                            'loss_sub_class')

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    dataset = YFCC100M_dataset(r'./dataset', size=args.size_dataset)

    # prepare the different data transformations
    tr_cluster, tr_train = get_data_transformations(args.rotation * 90)

    # build model skeleton
    fix_random_seeds()
    model = model_factory(args.sobel)
    logger.info('model created')

    # load pretrained weights
    load_pretrained(model, args)

    # convert batch-norm layers to nvidia wrapper to enable batch stats reduction
    model = apex.parallel.convert_syncbn_model(model)

    # distributed training wrapper
    model = to_cuda(model, args.gpu_to_work_on, apex=False)
    logger.info('model to cuda')

    # set optimizer
    optimizer = sgd_optimizer(model, args.lr, args.wd)

    # load cluster assignments (None when none have been computed yet)
    cluster_assignments = load_cluster_assignments(args, dataset)

    # build prediction layer on the super_class
    pred_layer, optimizer_pred_layer = build_prediction_layer(
        model.module.body.dim_output_space,
        args,
    )

    # prediction layer for the sub-classes of this training group
    nmb_sub_classes = args.k // args.nmb_super_clusters
    sub_class_pred_layer, optimizer_sub_class_pred_layer = build_prediction_layer(
        model.module.body.dim_output_space,
        args,
        num_classes=nmb_sub_classes,
        group=training_groups[args.training_local_world_id],
    )

    # variables to fetch in checkpoint
    to_restore = {'epoch': 0, 'start_iter': 0}

    # re start from checkpoint
    restart_from_checkpoint(
        args,
        run_variables=to_restore,
        state_dict=model,
        optimizer=optimizer,
        pred_layer_state_dict=pred_layer,
        optimizer_pred_layer=optimizer_pred_layer,
    )
    # the sub-class prediction layer has its own per-group checkpoint file
    pred_layer_name = str(args.training_local_world_id) + '-pred_layer.pth.tar'
    restart_from_checkpoint(
        args,
        ckp_path=os.path.join(args.dump_path, pred_layer_name),
        state_dict=sub_class_pred_layer,
        optimizer=optimizer_sub_class_pred_layer,
    )
    args.epoch = to_restore['epoch']
    args.start_iter = to_restore['start_iter']

    for _ in range(args.epoch, args.nepochs):
        logger.info("============ Starting epoch %i ... ============" % args.epoch)
        fix_random_seeds(args.epoch)

        # step 1: Get the final activations for the whole dataset / Cluster them
        if cluster_assignments is None and not args.epoch % args.reassignment:
            logger.info("=> Start clustering step")
            dataset.transform = tr_cluster
            cluster_assignments = get_cluster_assignments(
                args, model, dataset, clustering_groups)

            # reset prediction layers
            if args.nmb_super_clusters > 1:
                pred_layer, optimizer_pred_layer = build_prediction_layer(
                    model.module.body.dim_output_space,
                    args,
                )
            sub_class_pred_layer, optimizer_sub_class_pred_layer = build_prediction_layer(
                model.module.body.dim_output_space,
                args,
                num_classes=nmb_sub_classes,
                group=training_groups[args.training_local_world_id],
            )

        # step 2: Train the network with the cluster assignments as labels

        # prepare dataset
        dataset.transform = tr_train
        dataset.sub_classes = cluster_assignments

        # concatenate models and their corresponding optimizers
        models = [model, pred_layer, sub_class_pred_layer]
        optimizers = [
            optimizer, optimizer_pred_layer, optimizer_sub_class_pred_layer
        ]

        # train the network for one epoch
        scores = train_network(args, models, optimizers, dataset)

        # save training statistics
        logger.info(scores)
        training_stats.update(scores)

        # reassign clusters at the next epoch
        if not args.epoch % args.reassignment:
            cluster_assignments = None
            dataset.subset_indexes = None

        end_of_epoch(args)
        dist.barrier()
def main(params):
    """Entry point for (multi-)domain translation training.

    Depending on ``params`` this either (a) pre-computes domain features for
    curriculum learning and exits, or (b) builds one of several
    trainer/evaluator pairs (single encoder, dual-encoder, curriculum,
    multi-domain, or plain encoder-decoder) and runs the training loop.

    Args:
        params: experiment configuration namespace (argparse-style); also
            mutated by ``init_distributed_mode``.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    # load data
    data = load_data(params)

    # optional one-shot job: dump NMT-based domain features and stop.
    # NOTE(review): the de-en pair is hard-coded here — confirm this is
    # intentional for the curriculum-feature extraction jobs.
    if params.build_nmt_domain_feature:
        import torch
        from src.curriculum import build_nmt_domain_feature
        dataset = data['para'][('de', 'en')]['train']
        batches, indices = dataset.get_iterator(
            shuffle=False,
            group_by_size=params.group_by_size,
            n_sentences=-1,
        )
        features = build_nmt_domain_feature(data, params, batches, dataset)
        result = {'indices': indices, 'domain_feature': features}
        torch.save(result, params.build_output_path)
        return

    # optional one-shot job: dump NLM-based domain features/scores and stop
    if params.build_nlm_domain_feature:
        import torch
        from src.curriculum import build_nlm_domain_feature
        dataset = data['para'][('de', 'en')]['train']
        batches, indices = dataset.get_iterator(
            shuffle=False,
            group_by_size=params.group_by_size,
            n_sentences=-1,
        )
        # dataset = data['mono_stream']['en']['train']
        # indices = dataset.get_iterator(
        #     shuffle=False
        # )
        features, domain_score, sents = build_nlm_domain_feature(
            data, params, batches, dataset)
        result = {
            'indices': indices,
            'domain_feature': features,
            'domain_score': domain_score,
            'sents': sents
        }
        # accept either a full .pth path or a directory as the output target
        if params.build_output_path.endswith('pth'):
            build_output_path = params.build_output_path
        else:
            build_output_path = f'{params.build_output_path}/final.pth'
        torch.save(result, build_output_path)
        return

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    elif params.dual_encoder:
        encoder1, encoder2 = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    elif params.domains:
        # domain-aware training: pick trainer by curriculum/dual-encoder mode
        if params.curriculum_learning:
            trainer = CurriculumTrainer(encoder, decoder, data, params)
        elif params.dual_encoder:
            trainer = DualEncoderTrainer(encoder1, encoder2, data, params)
        else:
            trainer = MultiDomainTrainer(encoder, decoder, data, params)
        if params.local_adapt:
            evaluator = MetaMultiDomainEvaluator(trainer, data, params)
        else:
            if params.curriculum_learning:
                evaluator = EncDecEvaluator(trainer, data, params)
            elif params.dual_encoder:
                evaluator = DualEncoderEvaluator(trainer, data, params)
            else:
                evaluator = MultiDomainEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)
        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)
            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)
            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)
            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)
            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)
            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)
            trainer.iter()
            # periodically re-balance domain sampling ratios (multi-domain
            # runs only; disabled for dual-encoder and uniform sampling)
            if not params.dual_encoder and params.domains and params.domain_ratio_update_freq > 0 and trainer.n_total_iter % params.domain_ratio_update_freq == 0 and not params.sampling_uniform:
                evaluator.update_language_sampler_multidomain()
                evaluator.update_dataset_ratio(trainer)
            if not params.dual_encoder and params.domain_reset_freq > 0 and trainer.n_total_iter % params.domain_reset_freq == 0:
                evaluator.reset_dataset_ratio(trainer)
        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)
        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)
        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))
        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Entry point for supervised image classification training.

    Configures dataset-dependent sizes (ImageNet / CIFAR-10 / CIFAR-100),
    builds data loaders, the model (optionally DDP-wrapped), a trainer and an
    evaluator, then either evaluates only or trains for ``params.epochs``.

    Args:
        params: experiment configuration namespace; several fields
            (``num_classes``, ``img_size``, ``crop_size``) are set here.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment / load data
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    if params.is_slurm_job:
        init_signal_handler()
    # dataset-specific geometry and label counts
    if params.dataset == "imagenet":
        params.num_classes = 1000
        params.img_size = 256
        params.crop_size = 224
    else:
        if params.dataset == "cifar10":
            params.num_classes = 10
        elif params.dataset == "cifar100":
            params.num_classes = 100
        else:
            assert False, "Dataset unbeknownst to me"
        params.img_size = 40
        params.crop_size = 32
    # data loaders / samplers
    # (debug_train swaps the training split for the small 'valid' split)
    train_data_loader, train_sampler = get_data_loader(
        img_size=params.img_size,
        crop_size=params.crop_size,
        shuffle=True,
        batch_size=params.batch_size,
        nb_workers=params.nb_workers,
        distributed_sampler=params.multi_gpu,
        dataset=params.dataset,
        transform=params.transform,
        split='valid' if params.debug_train else params.split_train,
    )
    valid_data_loader, _ = get_data_loader(
        img_size=params.img_size,
        crop_size=params.crop_size,
        shuffle=False,
        batch_size=params.batch_size,
        nb_workers=params.nb_workers,
        distributed_sampler=False,
        dataset=params.dataset,
        transform='center',
        split='valid',
    )
    # build model / cuda
    logger.info("Building %s model ..." % params.architecture)
    model = build_model(params)
    model.cuda()
    # distributed
    # TODO: check this https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main.py#L142
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[params.local_rank],
            output_device=params.local_rank,
            broadcast_buffers=True)
    # build trainer / reload potential checkpoints / build evaluator
    trainer = Trainer(model=model, params=params)
    trainer.reload_checkpoint()
    evaluator = Evaluator(trainer, params)
    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer,
                                         evals=['classif', 'recognition'],
                                         data_loader=valid_data_loader)
        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()
    # training (resumes from trainer.epoch after a checkpoint reload)
    for epoch in range(trainer.epoch, params.epochs):
        # update epoch / sampler / learning rate
        trainer.epoch = epoch
        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)
        if params.multi_gpu:
            # keep per-epoch shuffling consistent across workers
            train_sampler.set_epoch(epoch)
        # update learning rate
        trainer.update_learning_rate()
        # train
        for i, (images, targets) in enumerate(train_data_loader):
            trainer.classif_step(images, targets)
            trainer.iter()
        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)
        # evaluate classification accuracy
        scores = evaluator.run_all_evals(trainer,
                                         evals=['classif'],
                                         data_loader=valid_data_loader)
        for name, val in trainer.get_scores().items():
            scores[name] = val
        # print / JSON log
        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))
        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Entry point for (adversarial-variant) language model / MT training.

    When ``params.adv`` is set, language embeddings are disabled before the
    distributed setup. Otherwise identical in shape to the standard XLM
    training driver.

    Args:
        params: experiment configuration namespace.
    """
    if params.adv:
        # adversarial mode: drop language embeddings entirely
        params.use_lang_emb = False
        print("Language embeddings are not used...\n \n \n \n")
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    # load data
    data = load_data(params)
    # build model
    # reload-model options are in here
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])
    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)
    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()
    # set sampling probabilities for training
    set_sampling_probs(data, params)
    # language model training
    for epoch in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)
        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)
            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)
            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)
            # denoising auto-encoder
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)
            # back-translation
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)
            trainer.iter()
        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)
        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)
        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))
        # NOTE(review): unlike the sibling drivers, this loop never calls
        # save_best_model / save_periodic / end_epoch — confirm checkpointing
        # is intentionally omitted here.
def main(params):
    """Entry point for XLM training, with optional warm-start from a
    pre-trained checkpoint.

    If ``params.model_path`` is set, the encoder/decoder and dictionary are
    reloaded from that checkpoint; otherwise a fresh model is built from
    ``params``. Then the usual trainer/evaluator pair is created and the
    training loop runs for ``params.max_epoch`` epochs.

    Args:
        params: experiment configuration namespace.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    # load data
    data = load_data(params)
    # load checkpoint
    if params.model_path != "":
        reloaded = torch.load(params.model_path)
        model_params = AttrDict(reloaded['params'])
        dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                          reloaded['dico_counts'])
        # build encoder/decoder once (the original code constructed each of
        # them twice back-to-back, discarding the first pair) and restore
        # their weights from the checkpoint
        encoder = TransformerModel(model_params, dico, is_encoder=True,
                                   with_output=True).cuda().eval()
        decoder = TransformerModel(model_params, dico, is_encoder=False,
                                   with_output=True).cuda().eval()
        encoder.load_state_dict(reloaded['encoder'])
        decoder.load_state_dict(reloaded['decoder'])
        logger.info("Supported languages: %s" %
                    ", ".join(model_params.lang2id.keys()))
    else:
        # build model
        if params.encoder_only:
            model = build_model(params, data['dico'])
        else:
            encoder, decoder = build_model(params, data['dico'])
    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)
    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()
    # set sampling probabilities for training
    set_sampling_probs(data, params)
    # language model training
    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)
        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)
            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)
            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)
            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)
            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)
            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)
            trainer.iter()
        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)
        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)
        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))
        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(args):
    """Entry point for linear-probe training on frozen features.

    Trains a regression/logistic head (``RegLog``) on top of a frozen
    pre-trained backbone, on ImageNet/Places-style folders or PASCAL VOC 2007,
    with optional k-fold cross-validation.

    Args:
        args: experiment configuration namespace; ``args.data_path``,
            ``args.epoch`` and ``args.start_iter`` are mutated here.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(args, make_communication_groups=False)
    # initialize the experiment
    logger, training_stats = initialize_exp(args, 'epoch', 'iter', 'prec',
                                            'loss', 'prec_val', 'loss_val')
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    # training set: folder-style dataset vs. PASCAL VOC
    if not 'pascal' in args.data_path:
        main_data_path = args.data_path
        args.data_path = os.path.join(main_data_path, 'train')
        train_dataset = load_data(args)
    else:
        train_dataset = VOC2007_dataset(args.data_path, split=args.split)
        args.test = 'val' if args.split == 'train' else 'test'
    # validation set; when cross-validating, validation folds come from the
    # training data itself, so the path is only redirected to 'val' otherwise
    if not 'pascal' in args.data_path:
        if args.cross_valid is None:
            args.data_path = os.path.join(main_data_path, 'val')
        val_dataset = load_data(args)
    else:
        val_dataset = VOC2007_dataset(args.data_path, split=args.test)
    if args.cross_valid is not None:
        # k-fold samplers over per-class splits of the training images
        kfold = KFold(per_target(train_dataset.imgs), args.cross_valid,
                      args.kfold)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            sampler=kfold.train,
            num_workers=args.workers,
            pin_memory=True)
        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=args.batch_size,
            sampler=kfold.val,
            num_workers=args.workers)
    else:
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            pin_memory=True)
        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=args.workers)
    # prepare the different data transformations
    tr_val, tr_train = get_data_transformations()
    train_dataset.transform = tr_train
    val_dataset.transform = tr_val
    # build model skeleton
    fix_random_seeds(args.seed)
    model = model_factory(args.arch, args.sobel)
    load_pretrained(model, args)
    # keep only conv layers
    model.body.classifier = None
    model.conv = args.conv
    # number of target classes, inferred from the dataset path
    if 'places' in args.data_path:
        nmb_classes = 205
    elif 'pascal' in args.data_path:
        nmb_classes = 20
    else:
        nmb_classes = 1000
    reglog = RegLog(args.arch, nmb_classes, args.conv)
    # distributed training wrapper
    model = to_cuda(model, [args.gpu_to_work_on], apex=False)
    reglog = to_cuda(reglog, [args.gpu_to_work_on], apex=False)
    logger.info('model to cuda')
    # set optimizer (only the linear head is optimized)
    optimizer = sgd_optimizer(reglog, args.lr, args.wd)
    ## variables to reload to fetch in checkpoint
    to_restore = {'epoch': 0, 'start_iter': 0}
    # re start from checkpoint
    restart_from_checkpoint(
        args,
        run_variables=to_restore,
        state_dict=reglog,
        optimizer=optimizer,
    )
    args.epoch = to_restore['epoch']
    args.start_iter = to_restore['start_iter']
    # backbone stays frozen in eval mode; only the head trains
    model.eval()
    reglog.train()
    # Linear training
    for _ in range(args.epoch, args.nepochs):
        logger.info("============ Starting epoch %i ... ============" %
                    args.epoch)
        # train the network for one epoch
        scores = train_network(args, model, reglog, optimizer, train_loader)
        if not 'pascal' in args.data_path:
            scores_val = validate_network(val_loader, [model, reglog], args)
        else:
            scores_val = evaluate_pascal(val_dataset, [model, reglog])
        scores = scores + scores_val
        # save training statistics
        logger.info(scores)
        training_stats.update(scores)
def main(params):
    """Entry point for meta-learning training with auxiliary eval tasks.

    Builds a main trainer/evaluator plus one trainer/evaluator per task in
    ``params.eval_tasks``; each epoch, the main model's weights are deep-copied
    into every eval-task trainer before that task runs its own epoch.

    Args:
        params: experiment configuration namespace; ``params.meta_params`` is
            temporarily replaced while logging the config.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    # meta_params is swapped out around initialize_exp because it is too
    # long to be logged, then restored
    meta_params = copy.deepcopy(params).meta_params
    params.meta_params = "..."
    logger = initialize_exp(params)
    params.meta_params = meta_params
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    trainer, evaluator, eval_trainers = get_trainer_evaluator(params, logger)
    # evaluation
    if params.eval_only:
        end_of_epoch(params=params, logger=logger, trainer=trainer,
                     evaluator=evaluator)
        if params.eval_tasks:
            logger.info("============ Evaluation task ============")
            for eval_task in params.eval_tasks:
                logger.info("============ %s ============" % eval_task)
                end_of_epoch(params=eval_trainers[eval_task]["params"],
                             logger=logger,
                             trainer=eval_trainers[eval_task]['trainer'],
                             evaluator=eval_trainers[eval_task]['evaluator'],
                             eval_task=eval_task)
        exit()
    # language model training
    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)
        one_epoch(trainer, params)
        if params.eval_tasks:
            logger.info("============ Evaluation task ============")
            for eval_task in params.eval_tasks:
                logger.info("============ %s ============" % eval_task)
                # sync the eval-task model with a deep copy of the freshly
                # trained main model before running the task epoch
                if params.encoder_only:
                    eval_trainers[eval_task]['trainer'].model = copy.deepcopy(
                        trainer.model)
                else:
                    eval_trainers[eval_task][
                        'trainer'].encoder = copy.deepcopy(trainer.encoder)
                    eval_trainers[eval_task][
                        'trainer'].decoder = copy.deepcopy(trainer.decoder)
                one_epoch(eval_trainers[eval_task]['trainer'],
                          eval_trainers[eval_task]["params"],
                          eval_task=eval_task)
        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)
        end_of_epoch(params=params, logger=logger, trainer=trainer,
                     evaluator=evaluator)
        if params.eval_tasks:
            logger.info("============ Evaluation task ============")
            for eval_task in params.eval_tasks:
                end_of_epoch(params=eval_trainers[eval_task]["params"],
                             logger=logger,
                             trainer=eval_trainers[eval_task]['trainer'],
                             evaluator=eval_trainers[eval_task]['evaluator'],
                             eval_task=eval_task)
        # our: force a GC pass (the deep copies above create a lot of garbage)
        logger.info("============ garbage collector collecting %d ..." %
                    gc.collect())
def main(params):
    """Entry point for XLM training with optional adapter-only fine-tuning.

    When ``params.use_adapters`` is set (encoder-only models), every
    parameter is frozen except the adapter modules, the (position)
    embeddings, the prediction layer and the embedding layer-norm.

    Args:
        params: experiment configuration namespace.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    # load data
    data = load_data(params)
    # build model
    # reload-model options are in here
    if params.encoder_only:
        model = build_model(params, data['dico'])
        if params.use_adapters:
            logger.info("Using adapters")
            # freeze everything that is not an adapter module
            # (idiomatic startswith instead of the original slice compare)
            for name, param in model.named_parameters():
                if not name.startswith("adapters"):
                    param.requires_grad = False
            # ...then re-enable the embedding / output components
            for param in model.embeddings.parameters():
                param.requires_grad = True
            for param in model.position_embeddings.parameters():
                param.requires_grad = True
            for param in model.pred_layer.parameters():
                param.requires_grad = True
            for param in model.layer_norm_emb.parameters():
                param.requires_grad = True
            # log the final trainable/frozen status of every parameter
            for name, param in model.named_parameters():
                logger.info(name + ' required grad = ' +
                            str(param.requires_grad))
    else:
        encoder, decoder = build_model(params, data['dico'])
    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
        logger.info("Number of trainable parameters (encoder): %i" % sum(
            [p.numel() for p in trainer.model.parameters() if p.requires_grad]))
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)
        logger.info(
            "Number of trainable parameters (encoder): %i" %
            sum([p.numel() for p in encoder.parameters() if p.requires_grad]))
        logger.info(
            "Number of trainable parameters (decoder): %i" %
            sum([p.numel() for p in decoder.parameters() if p.requires_grad]))
    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()
    # set sampling probabilities for training
    set_sampling_probs(data, params)
    # language model training
    for epoch in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)
        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)
            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)
            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)
            # denoising auto-encoder
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)
            # back-translation
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)
            trainer.iter()
        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)
        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)
        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))
        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Entry point for (visual) language model training with TensorBoard logs.

    Supports deterministic seeding via ``params.other_seed`` and a VLM step
    (``vlm_step``) that replaces the plain MLM step when ``params.only_vlm``
    is set. Scalar scores are mirrored to a ``SummaryWriter``.

    Args:
        params: experiment configuration namespace; ``params.iter_seed`` may
            be reset to ``None`` here.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    if params.other_seed > -1:
        # deterministic: seed every RNG source in use
        torch.manual_seed(params.other_seed)
        torch.cuda.manual_seed(params.other_seed)
        np.random.seed(params.other_seed)
        random.seed(params.other_seed)
    if params.iter_seed == -1:
        # non-deterministic
        params.iter_seed = None
    # load data
    data = load_data(params)
    # TensorBoard writer for per-evaluation scalar logging
    writer = SummaryWriter(params.dump_path + "/" + params.exp_name + "_log")
    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])
    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)
    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        sys.exit()
    # set sampling probabilities for training
    set_sampling_probs(data, params)
    # global step counter, passed to the (v)mlm steps and used as the
    # TensorBoard x-axis
    _iter = 0
    # dump initial weights
    if params.save_initial:
        trainer.save_checkpoint('initial', include_optimizers=False)
    # language model training
    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)
        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                if params.only_vlm:
                    # with visual features
                    trainer.vlm_step(lang1, lang2, params.lambda_mlm, _iter)
                else:
                    trainer.mlm_step(lang1, lang2, params.lambda_mlm, _iter)
            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)
            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)
            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)
            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)
            # multimodal machine translation steps
            for lang1, lang2 in shuf_order(params.mmt_steps, params):
                trainer.mmt_step(lang1, lang2, params.lambda_mt)
            trainer.iter()
            _iter += 1
        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)
        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)
        # print / JSON log (mirrored to TensorBoard)
        for k, v in scores.items():
            writer.add_scalar(k, v, _iter)
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))
        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params): # initialize the multi-GPU / multi-node training init_distributed_mode(params) # initialize the experiment logger = initialize_exp(params) # initialize SLURM signal handler for time limit / pre-emption init_signal_handler() # load data data = load_data(params) # build model if params.encoder_only: model = build_model(params, data['dico']) else: encoder, decoder = build_model(params, data['dico']) # build trainer, reload potential checkpoints / build evaluator if params.encoder_only: trainer = SingleTrainer(model, data, params) evaluator = SingleEvaluator(trainer, data, params) else: trainer = EncDecTrainer(encoder, decoder, data, params) evaluator = EncDecEvaluator(trainer, data, params) # evaluation if params.eval_only: scores = evaluator.run_all_evals(trainer) for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) logger.info("__log__:%s" % json.dumps(scores)) exit() # set sampling probabilities for training set_sampling_probs(data, params) params.lgs = lgs = params.lgs.split("-") if len(lgs) == 1: lgs.append(lgs[0]) # language model training for _ in range(params.max_epoch): logger.info("============ Starting epoch %i ... ============" % trainer.epoch) trainer.n_sentences = 0 while trainer.n_sentences < trainer.epoch_size: # Replace the original MLM steps for lang1, lang2 in shuf_order(params.mlm_steps, params): if params.do_meta_update: trainer.meta_mlm_step(lang1) else: trainer.mlm_step(lang1, lang2, params.lambda_mlm) trainer.iter() logger.info("============ End of epoch %i ============" % trainer.epoch) # evaluate perplexity scores = evaluator.run_all_evals(trainer) # print / JSON log for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) if params.is_master: logger.info("__log__:%s" % json.dumps(scores)) # end of epoch trainer.save_best_model(scores) trainer.save_periodic() trainer.end_epoch(scores)
def main(params):
    """Entry point for cross-modal (text/image) pre-training.

    Runs the configured mixture of MLM, NTG, image-caption, BART-style and
    relation steps each epoch, evaluates, writes a per-epoch eval log on the
    master process, and checkpoints.

    Args:
        params: experiment configuration namespace.
    """
    import os  # hoisted: was re-imported inside the epoch loop

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    # load data (removed: stray debug `print(data)`)
    data = load_data(params)
    # build model
    if params.encoder_only:
        model = build_model(params)
    else:
        # NOTE(review): this branch never assigns `model`, so the XTrainer
        # call below would raise NameError — confirm whether non
        # encoder-only configs are ever run through this entry point.
        encoder, decoder = build_model(params)
    # build trainer, reload potential checkpoints / build evaluator
    trainer = XTrainer(model, data, params)
    evaluator = XEvaluator(trainer, data, params)
    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()
    # set sampling probabilities for training
    set_sampling_probs(data, params)
    # language model training
    for _ in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)
        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                if params.is_understanding:
                    trainer.mlm_step(lang1, lang2, params.lambda_mlm)
            # news title generation steps
            for lang1, lang2 in shuf_order(params.text_steps, params):
                if params.is_ntg:
                    trainer.ntg_step(lang1, None, params.lambda_mlm)
            # cross-modal caption steps
            for lang1, lang2 in shuf_order(params.cross_modal_steps, params):
                if params.is_mt:
                    trainer.mt_ic_step(lang1, lang2, params.lambda_ic)
                else:
                    trainer.ic_step(lang1, lang2, params.lambda_ic)
                if params.is_freelb:
                    trainer.free_lb_ic_step(lang1, lang2, params.lambda_ic)
            # BART-style denoising steps for generation
            for lang1, lang2 in shuf_order(params.mlm_steps, params, n=3):
                if params.is_generation:
                    trainer.bart_mlm_step(lang1, lang2, params.lambda_imlm)
                    trainer.bart_mass_step(lang1, lang2, params.lambda_imlm)
            for lang1, lang2 in shuf_order(params.cross_ae_steps, params):
                trainer.bart_img_step(lang1, lang2, params.lambda_ida)
            # cross-modal relation (retrieval) steps
            for lang1, lang2 in shuf_order(params.cross_rel_steps, params):
                if params.is_pretrain:
                    trainer.pretrain_rel_step(lang1, lang2)
                else:
                    if params.is_slide:
                        trainer.slide_step(lang1, lang2, params.lambda_t2i)
                    else:
                        # support multi languages
                        trainer.rel_step(lang1, lang2, params.lambda_t2i,
                                         params.lambda_i2t)
            # (removed: commented-out legacy cross_mlm / mrm / mrfr steps)
            trainer.iter()
        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)
        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)
        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        evaluate_results = []
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))
            evaluate_results.append(json.dumps(scores))
            # persist this epoch's scores next to the experiment dump
            with open(os.path.join(params.dump_path,
                                   "epoch_{0}.eval_log".format(trainer.epoch)),
                      'w') as writer:
                for line in evaluate_results:
                    writer.write(line + '\n')
        # end of epoch
        trainer.save_best_model(scores)
        if trainer.epoch % params.save_every_epoch == 0 and params.is_master:
            trainer.save_model('model_pretrain_%i' % trainer.epoch)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def seq2seq_main(params): ''' Use different vocabulary/dictionary for src and tgt ''' # initialize the multi-GPU / multi-node training init_distributed_mode(params) # initialize the experiment logger = initialize_exp(params) # initialize SLURM signal handler for time limit / pre-emption init_signal_handler() # load data data = seq2seq_load_data(params) # build model # 因為 language pair 會重新升冪排序 (zh-en) --> (en-zh) # 所以 en 變成 src , zh 變成 tgt encoder, decoder = build_seq2seq_model( params, data['tgt_dico'], data['src_dico']) # build trainer, reload potential checkpoints / build evaluator trainer = EncDecTrainer(encoder, decoder, data, params) evaluator = MyEncDecEvaluator(trainer, data, params) # evaluation if params.eval_only: scores = evaluator.run_all_evals(trainer) for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) logger.info("__log__:%s" % json.dumps(scores)) exit() # set sampling probabilities for training set_sampling_probs(data, params) # language model training for _ in range(params.max_epoch): logger.info("============ Starting epoch %i ... ============" % trainer.epoch) trainer.n_sentences = 0 while trainer.n_sentences < trainer.epoch_size: # denoising auto-encoder steps for lang in shuf_order(params.ae_steps): trainer.mt_step(lang, lang, params.lambda_ae) # machine translation steps for lang1, lang2 in shuf_order(params.mt_steps, params): trainer.mt_step(lang1, lang2, params.lambda_mt) # back-translation steps for lang1, lang2, lang3 in shuf_order(params.bt_steps): trainer.bt_step(lang1, lang2, lang3, params.lambda_bt) trainer.iter() logger.info("============ End of epoch %i ============" % trainer.epoch) # evaluate perplexity scores = evaluator.run_all_evals(trainer) # print / JSON log for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) if params.is_master: logger.info("__log__:%s" % json.dumps(scores)) # end of epoch trainer.save_best_model(scores) trainer.save_periodic() trainer.end_epoch(scores)
def main(args):
    """Supervised training / evaluation entry point.

    Sets up distributed training, builds train/val datasets and the model,
    optionally restores a checkpoint, then either evaluates once
    (args.evaluate) or runs the supervised training loop.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(args, make_communication_groups=False)
    # initialize the experiment; training_stats tracks the listed columns
    logger, training_stats = initialize_exp(args, 'epoch', 'iter', 'prec', 'loss', 'prec_val', 'loss_val')
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    # in debug mode, train on the 'val' split instead of 'train'
    # (presumably to iterate faster on a smaller set — confirm)
    main_data_path = args.data_path
    if args.debug:
        args.data_path = os.path.join(main_data_path, 'val')
    else:
        args.data_path = os.path.join(main_data_path, 'train')
    train_dataset = load_data(args)
    # the validation set always comes from the 'val' sub-directory;
    # note load_data reads the split from args.data_path (mutated in place)
    args.data_path = os.path.join(main_data_path, 'val')
    val_dataset = load_data(args)
    # prepare the different data transformations for train and val
    tr_val, tr_train = get_data_transformations()
    train_dataset.transform = tr_train
    val_dataset.transform = tr_val
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        num_workers=args.workers,
        pin_memory=True,
    )
    # build model skeleton: 205 classes when the data path mentions
    # 'places', 1000 otherwise
    fix_random_seeds(args.seed)
    nmb_classes = 205 if 'places' in args.data_path else 1000
    model = model_factory(args, relu=True, num_classes=nmb_classes)
    # load pretrained weights
    load_pretrained(model, args)
    # merge sobel layers with first convolution layer
    if args.sobel2RGB:
        sobel2RGB(model)
    # re-initialize the classifier head so it is trained from scratch
    if hasattr(model.body, 'classifier'):
        for m in model.body.classifier.modules():
            if isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.fill_(0.1)
    # distributed training wrapper (one GPU per process)
    model = to_cuda(model, [args.gpu_to_work_on], apex=True)
    logger.info('model to cuda')
    # set optimizer
    optimizer = sgd_optimizer(model, args.lr, args.wd)
    # variables to fetch from a checkpoint, with defaults for a fresh run
    to_restore = {'epoch': 0, 'start_iter': 0}
    # restart from checkpoint
    restart_from_checkpoint(
        args,
        run_variables=to_restore,
        state_dict=model,
        optimizer=optimizer,
    )
    args.epoch = to_restore['epoch']
    args.start_iter = to_restore['start_iter']
    # evaluation-only mode: validate once and return
    if args.evaluate:
        validate_network(val_loader, [model], args)
        return
    # Supervised training
    # NOTE(review): the loop body reads args.epoch but never increments it
    # here; presumably train_network (or adjust_learning_rate) advances it —
    # confirm, otherwise the per-epoch seed and the log line never change.
    for _ in range(args.epoch, args.nepochs):
        logger.info("============ Starting epoch %i ... ============" % args.epoch)
        # reseed deterministically per epoch
        fix_random_seeds(args.seed + args.epoch)
        # train the network for one epoch
        adjust_learning_rate(optimizer, args)
        scores = train_network(args, model, optimizer, train_dataset)
        scores_val = validate_network(val_loader, [model], args)
        # save training statistics
        logger.info(scores + scores_val)
        training_stats.update(scores + scores_val)
def main(params):
    """Training entry point for language-model pre-training and MT.

    Builds either a single encoder model or an encoder-decoder pair,
    then either evaluates once (params.eval_only) or runs the
    CLM / MLM / AE / MT / BT training loop for params.max_epoch epochs.
    """
    # distributed setup, experiment logging, SLURM pre-emption handling
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    data = load_data(params)

    # build model and the matching trainer / evaluator
    # (single stack vs. encoder-decoder)
    if params.encoder_only:
        model = build_model(params, data['dico'])
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        encoder, decoder = build_model(params, data['dico'])
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: score once, log, and bail out
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    for _epoch in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)
        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:
            # causal language model steps
            for l1, l2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(l1, l2, params.lambda_clm)
            # masked language model steps (TLM when l2 is not None)
            for l1, l2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(l1, l2, params.lambda_mlm)
            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)
            # supervised machine translation steps
            for l1, l2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(l1, l2, params.lambda_mt)
            # back-translation steps
            for l1, l2, l3 in shuf_order(params.bt_steps):
                trainer.bt_step(l1, l2, l3, params.lambda_bt, params.bt_sample_temperature)
            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity and log per-metric scores
        scores = evaluator.run_all_evals(trainer)
        for metric, value in scores.items():
            logger.info("%s -> %.6f" % (metric, value))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end-of-epoch bookkeeping; best-model saving is skipped when
        # no validation metric is configured
        if params.validation_metrics != '':
            trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Distillation-style training entry point.

    Builds a full-depth ("big") model used only to assist training, and a
    layer-cut ("small") model that is actually trained and evaluated; the
    big trainer is passed into the small trainer's step functions.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()
    # load data
    data = load_data(params)
    # build the big (full-depth, cut=False) model
    if params.encoder_only:
        big_model = build_model(params, data['dico'], cut=False)
    else:
        # modification point 1 (translated from original comment)
        big_encoder, big_decoder = build_model(params, data['dico'], cut=False)
    # if we cut some layers, we must build a small model
    # NOTE(review): small_model / small_encoder / small_decoder are only
    # bound when params.cut_layer is true, yet they are used unconditionally
    # below — a config without cut_layer would hit a NameError; confirm
    # cut_layer is always set for this entry point.
    if params.cut_layer:
        if params.encoder_only:
            small_model = build_model(params, data['dico'], cut=True)
        else:
            # modification point 1 (translated from original comment)
            small_encoder, small_decoder = build_model(params, data['dico'], cut=True)
    # build the big trainer, reload potential checkpoints;
    # the big trainer only assists training, so it needs no evaluator
    if params.encoder_only:
        big_trainer = SingleTrainer(big_model, data, params)
    else:
        big_trainer = EncDecTrainer(big_encoder, big_decoder, data, params)
    # force every objective coefficient to "1"; values are strings —
    # presumably parsed downstream by the trainer's lambda-schedule
    # machinery (TODO confirm)
    params.lambda_mlm = "1"
    params.lambda_clm = "1"
    params.lambda_pc = "1"
    params.lambda_ae = "1"
    params.lambda_mt = "1"
    params.lambda_bt = "1"
    # build the small trainer, and attach the evaluator to it
    if params.encoder_only:
        small_trainer = small_SingleTrainer(small_model, data, params)
        evaluator = SingleEvaluator(small_trainer, data, params)
    else:
        small_trainer = small_EncDecTrainer(small_encoder, small_decoder, data, params)
        evaluator = EncDecEvaluator(small_trainer, data, params)
    # evaluation-only mode, applied to the small trainer
    if params.eval_only:
        scores = evaluator.run_all_evals(small_trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()
    # set sampling probabilities for training
    set_sampling_probs(data, params)
    # language model training
    for count in range(params.max_epoch):
        logger.info("============ Starting epoch %i ... ============" % small_trainer.epoch)
        small_trainer.n_sentences = 0
        while small_trainer.n_sentences < small_trainer.epoch_size:
            # CLM steps (big_trainer passed in to assist)
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                small_trainer.clm_step(lang1, lang2, params.lambda_clm, big_trainer)
            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                small_trainer.mlm_step(lang1, lang2, params.lambda_mlm, big_trainer)
            # parallel classification steps (no big-trainer assistance)
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                small_trainer.pc_step(lang1, lang2, params.lambda_pc)
            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                small_trainer.mt_step(lang, lang, params.lambda_ae, big_trainer)
            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                small_trainer.mt_step(lang1, lang2, params.lambda_mt, big_trainer)
            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                small_trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)
            small_trainer.iter()
        logger.info("============ End of epoch %i ============" % small_trainer.epoch)
        # evaluate perplexity
        scores = evaluator.run_all_evals(small_trainer)
        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))
        # end of epoch: checkpointing / bookkeeping on the small trainer only
        small_trainer.save_best_model(scores)
        small_trainer.save_periodic()
        small_trainer.end_epoch(scores)