Example #1
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        if params.encoder_only:
            model = network_to_half(model)
        else:
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

    # distributed
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        if params.fp16:
            if params.encoder_only:
                model = apex.parallel.DistributedDataParallel(
                    model, delay_allreduce=True)
            else:
                encoder = apex.parallel.DistributedDataParallel(
                    encoder, delay_allreduce=True)
                decoder = apex.parallel.DistributedDataParallel(
                    decoder, delay_allreduce=True)
        else:
            if params.encoder_only:
                model = nn.parallel.DistributedDataParallel(
                    model,
                    device_ids=[params.local_rank],
                    output_device=params.local_rank,
                    broadcast_buffers=True)
            else:
                encoder = nn.parallel.DistributedDataParallel(
                    encoder,
                    device_ids=[params.local_rank],
                    output_device=params.local_rank,
                    broadcast_buffers=True)
                decoder = nn.parallel.DistributedDataParallel(
                    decoder,
                    device_ids=[params.local_rank],
                    output_device=params.local_rank,
                    broadcast_buffers=True)

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # mass prediction steps
            for lang in shuf_order(params.ms_steps):
                trainer.ms_step(lang, params.lambda_ms)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
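
All of these examples drive training through the same scheduling pattern: each objective (CLM, MLM, PC, AE, MT, BT) is configured as a list of language pairs, and shuf_order visits those pairs in a randomized order on every pass of the inner loop. Below is a minimal sketch of that pattern with a simplified stand-in for shuf_order; it is not the XLM implementation (which may also weight languages by sampling probabilities), just a plain random shuffle over the configured step definitions.

import random

def shuf_order(steps, params=None):
    # simplified stand-in: return the configured (lang1, lang2) steps
    # in a random order on every call
    steps = list(steps)
    random.shuffle(steps)
    return steps

# toy step lists in the same format the trainers above consume
clm_steps = [("en", None)]                 # causal LM on English only
mlm_steps = [("en", None), ("en", "fr")]   # MLM on English, TLM on en-fr

for lang1, lang2 in shuf_order(clm_steps):
    print("clm_step", lang1, lang2)
for lang1, lang2 in shuf_order(mlm_steps):
    print("mlm_step", lang1, lang2)
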
Example #2
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
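
Every example consumes step lists such as params.mt_steps as sequences of (lang1, lang2) pairs, with lang2 set to None for monolingual objectives. The sketch below shows how a command-line string such as "en,en-fr,fr-en" could be turned into that format; it mirrors the shape the loops above expect, not the project's actual argument parsing, and the helper name parse_steps is illustrative.

def parse_steps(spec):
    """Parse "en,en-fr" into [("en", None), ("en", "fr")] (illustrative only)."""
    steps = []
    for item in spec.split(","):
        item = item.strip()
        if not item:
            continue
        langs = item.split("-")
        if len(langs) == 1:
            steps.append((langs[0], None))      # monolingual step, e.g. "en"
        else:
            steps.append((langs[0], langs[1]))  # cross-lingual step, e.g. "en-fr"
    return steps

print(parse_steps("en,en-fr,fr-en"))
# [('en', None), ('en', 'fr'), ('fr', 'en')]
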
Example #3
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    # reload-model options are handled here
    if params.encoder_only:
        model = build_model(params, data['dico'])

        if params.use_adapters:
            logger.info("Using adapters")

            # freeze everything except the adapter modules ...
            for name, param in model.named_parameters():
                if not name.startswith("adapters"):
                    param.requires_grad = False

            # ... then unfreeze the embeddings, the prediction layer and the
            # embedding layer norm
            for param in model.embeddings.parameters():
                param.requires_grad = True
            for param in model.position_embeddings.parameters():
                param.requires_grad = True
            for param in model.pred_layer.parameters():
                param.requires_grad = True
            for param in model.layer_norm_emb.parameters():
                param.requires_grad = True

            # log which parameters remain trainable
            for name, param in model.named_parameters():
                logger.info("%s requires_grad = %s" %
                            (name, param.requires_grad))

    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
        logger.info("Number of trainable parameters (encoder): %i" % sum(
            [p.numel()
             for p in trainer.model.parameters() if p.requires_grad]))

    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)
        logger.info(
            "Number of trainable parameters (encoder): %i" %
            sum([p.numel() for p in encoder.parameters() if p.requires_grad]))
        logger.info(
            "Number of trainable parameters (decoder): %i" %
            sum([p.numel() for p in decoder.parameters() if p.requires_grad]))

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for epoch in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
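
Example #3 fine-tunes with adapters by freezing every parameter whose name does not start with "adapters" and then re-enabling gradients for the embeddings, the prediction layer and the embedding layer norm. Below is a generic, standalone sketch of that freezing pattern; the helper name and module prefixes are illustrative and not tied to XLM's model classes.

import torch.nn as nn

def freeze_except(model, trainable_prefixes=("adapters",)):
    # keep gradients only for parameters whose name starts with one of the prefixes
    for name, param in model.named_parameters():
        param.requires_grad = name.startswith(tuple(trainable_prefixes))
    return model

# usage on an arbitrary module (prefix "0." keeps only the first layer trainable)
model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
freeze_except(model, trainable_prefixes=("0.",))
for name, param in model.named_parameters():
    print(name, "requires_grad =", param.requires_grad)

# count trainable parameters, as the example logs after building the trainer
print(sum(p.numel() for p in model.parameters() if p.requires_grad))
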
Example #4
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    
    # initialize the experiment
    logger = initialize_exp(params)

#     # initialize SLURM signal handler for time limit / pre-emption
#     init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

#     # float16
#     if params.fp16:
#         assert torch.backends.cudnn.enabled
#         if params.encoder_only:
#             model = network_to_half(model)
#         else:
#             encoder = network_to_half(encoder)
#             decoder = network_to_half(decoder)

#     # distributed
#     if params.multi_gpu:
#         logger.info("Using nn.parallel.DistributedDataParallel ...")
#         if params.fp16:
#             if params.encoder_only:
#                 model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
#             else:
#                 encoder = apex.parallel.DistributedDataParallel(encoder, delay_allreduce=True)
#                 decoder = apex.parallel.DistributedDataParallel(decoder, delay_allreduce=True)
#         else:
#             if params.encoder_only:
#                 model = nn.parallel.DistributedDataParallel(model, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True)
#             else:
#                 encoder = nn.parallel.DistributedDataParallel(encoder, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True)
#                 decoder = nn.parallel.DistributedDataParallel(decoder, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True)

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

#     # evaluation
#     if params.eval_only:
#         scores = evaluator.run_all_evals(trainer)
#         for k, v in scores.items():
#             logger.info("%s -> %.6f" % (k, v))
#         logger.info("__log__:%s" % json.dumps(scores))
#         exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        trainer.n_images = 0

        while trainer.n_sentences < trainer.epoch_size or trainer.n_images < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            # each item from shuf_order is a (lang1, lang2) pair, e.g. ('fr', 'fr'), ('en', 'fr'), ('fr', 'en') or ('en', 'en')
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)
                
            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)
                
            # Image-language pretraining steps 
            trainer.ipm_step("coco36", params.lambda_ipm)

            # CMLM steps
            for m1, m2 in shuf_order(params.cmlm_steps, params):
                trainer.cmlm_step(m1, m2, params.lambda_cmlm)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()


        logger.info("============ End of epoch %i ============" % trainer.epoch)
Example #5
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    meta_params = copy.deepcopy(params).meta_params
    params.meta_params = "..."  # too long to log
    logger = initialize_exp(params)
    params.meta_params = meta_params

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # TODO: set params.n_words properly (for now we take the value from the first task that defines it)
    """
    If all the task data are built on the same vocabulary, this parameter is identical
    across tasks, so picking any one of them is fine.
    """
    p = params.meta_params[data['key']]

    # build model
    if params.encoder_only:
        model = build_model(params=p, dico=data['dico'])
    else:
        encoder, decoder = build_model(params=p, dico=data['dico'])

    # TODO: set pad_index, eos_index, etc. properly (for now we take them from the first task)
    """
    If all the task data are built on the same vocabulary, these parameters are identical
    across tasks, so picking any one of them is fine.
    """
    params.pad_index = p.pad_index
    params.eos_index = p.eos_index

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        if not params.meta_learning:
            for k, v in scores.items():
                logger.info("%s -> %.6f" % (k, v))
        else:
            for lgs in params.meta_params.keys():
                logger.info("============ task : %s " % lgs)
                for k, v in scores[lgs].items():
                    if k != "epoch":
                        logger.info("%s -> %.6f" % (k, v))
            logger.info("============ all")
            for k, v in scores.items():
                if not (k in (list(params.meta_params.keys()) + ['epoch'])):
                    logger.info("%s -> %.6f" % (k, v))

        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        if not params.meta_learning:
            trainer.n_sentences = 0
            while trainer.n_sentences < trainer.epoch_size:
                # CLM steps
                for lang1, lang2 in shuf_order(params.clm_steps, params):
                    trainer.clm_step(lang1, lang2, params.lambda_clm)

                # MLM steps (also includes TLM if lang2 is not None)
                for lang1, lang2 in shuf_order(params.mlm_steps, params):
                    trainer.mlm_step(lang1, lang2, params.lambda_mlm)

                # parallel classification steps
                for lang1, lang2 in shuf_order(params.pc_steps, params):
                    trainer.pc_step(lang1, lang2, params.lambda_pc)

                # denoising auto-encoder steps
                for lang in shuf_order(params.ae_steps):
                    trainer.mt_step(lang, lang, params.lambda_ae)

                # machine translation steps
                for lang1, lang2 in shuf_order(params.mt_steps, params):
                    trainer.mt_step(lang1, lang2, params.lambda_mt)

                # back-translation steps
                for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                    trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

                trainer.iter()
        else:
            # meta-learning case
            trainer.n_sentences = {}
            """
            Here we build the language lists for each of our meta-tasks: for two language
            lists l1 and l2, the objective is run on l1[i] and l2[i] for every index i.
            """
            lang1_dic, lang2_dic, lang3_dic = {}, {}, {}
            """
            In the meta-learning case there is one (meta-)data dictionary per (meta-)task,
            so the keys are the languages handled by that task.
            """
            data_keys_dic = {}

            # equivalent to "for task in list of task" in the original algorithm,  except here we prepare all the tasks beforehand.
            for lgs in params.meta_params.keys():
                trainer.n_sentences[lgs] = 0

                # CLM
                if 'clm_step' not in lang1_dic:
                    lang1_dic['clm_step'], lang2_dic['clm_step'], data_keys_dic['clm_step'] = [], [], []
                for lang1, lang2 in shuf_order(
                        params.meta_params[lgs].clm_steps, params):
                    lang1_dic['clm_step'].append(lang1)
                    lang2_dic['clm_step'].append(lang2)
                    data_keys_dic['clm_step'].append(lgs)

                # MLM
                if 'mlm_step' not in lang1_dic:
                    lang1_dic['mlm_step'], lang2_dic['mlm_step'], data_keys_dic['mlm_step'] = [], [], []
                for lang1, lang2 in shuf_order(
                        params.meta_params[lgs].mlm_steps, params):
                    lang1_dic['mlm_step'].append(lang1)
                    lang2_dic['mlm_step'].append(lang2)
                    data_keys_dic['mlm_step'].append(lgs)

                # parallel classification
                if 'pc_step' not in lang1_dic:
                    lang1_dic['pc_step'], lang2_dic['pc_step'], data_keys_dic['pc_step'] = [], [], []
                for lang1, lang2 in shuf_order(
                        params.meta_params[lgs].pc_steps, params):
                    lang1_dic['pc_step'].append(lang1)
                    lang2_dic['pc_step'].append(lang2)
                    data_keys_dic['pc_step'].append(lgs)

                # denoising auto-encoder
                if 'ae_step' not in lang1_dic:
                    lang1_dic['ae_step'], data_keys_dic['ae_step'] = [], []
                for lang1 in shuf_order(params.meta_params[lgs].ae_steps):
                    lang1_dic['ae_step'].append(lang1)
                    data_keys_dic['ae_step'].append(lgs)

                # machine translation
                if 'mt_step' not in lang1_dic:
                    lang1_dic['mt_step'], lang2_dic['mt_step'], data_keys_dic['mt_step'] = [], [], []
                for lang1, lang2 in shuf_order(
                        params.meta_params[lgs].mt_steps, params):
                    lang1_dic['mt_step'].append(lang1)
                    lang2_dic['mt_step'].append(lang2)
                    data_keys_dic['mt_step'].append(lgs)

                # back-translation
                if 'bt_step' not in lang1_dic:
                    lang1_dic['bt_step'], lang2_dic['bt_step'], lang3_dic['bt_step'], data_keys_dic['bt_step'] = [], [], [], []
                for lang1, lang2, lang3 in shuf_order(
                        params.meta_params[lgs].bt_steps):
                    lang1_dic['bt_step'].append(lang1)
                    lang2_dic['bt_step'].append(lang2)
                    lang3_dic['bt_step'].append(lang3)
                    data_keys_dic['bt_step'].append(lgs)

            flag = True

            # equivalent to "while not done do" in the original algorithm
            while flag:

                # CLM steps
                a = trainer.clm_step(lang1_dic['clm_step'],
                                     lang2_dic['clm_step'], params.lambda_clm,
                                     data_keys_dic['clm_step'])

                # MLM steps (also includes TLM if lang2 is not None)
                b = trainer.mlm_step(lang1_dic['mlm_step'],
                                     lang2_dic['mlm_step'], params.lambda_mlm,
                                     data_keys_dic['mlm_step'])

                # parallel classification steps
                c = trainer.pc_step(lang1_dic['pc_step'], lang2_dic['pc_step'],
                                    params.lambda_pc, data_keys_dic['pc_step'])

                if isinstance(trainer, EncDecTrainer):

                    # denoising auto-encoder steps
                    d = trainer.mt_step(lang1_dic['ae_step'],
                                        lang1_dic['ae_step'], params.lambda_ae,
                                        data_keys_dic['ae_step'])

                    # machine translation steps
                    e = trainer.mt_step(lang1_dic['mt_step'],
                                        lang2_dic['mt_step'], params.lambda_mt,
                                        data_keys_dic['mt_step'])

                    # back-translation steps
                    f = trainer.bt_step(lang1_dic['bt_step'],
                                        lang2_dic['bt_step'],
                                        lang3_dic['bt_step'], params.lambda_bt,
                                        data_keys_dic['bt_step'])

                    # the epoch ends once no objective has any data left
                    flag = any([a, b, c, d, e, f])
                else:
                    # the epoch ends once no objective has any data left
                    flag = any([a, b, c])

                trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        if not params.meta_learning:
            for k, v in scores.items():
                logger.info("%s -> %.6f" % (k, v))
        else:
            for lgs in params.meta_params.keys():
                logger.info("============ task : %s " % lgs)
                for k, v in scores[lgs].items():
                    if k != "epoch":
                        logger.info("%s -> %.6f" % (k, v))
            logger.info("============ all")
            for k, v in scores.items():
                if not (k in (list(params.meta_params.keys()) + ['epoch'])):
                    logger.info("%s -> %.6f" % (k, v))

        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)

        # manually trigger the garbage collector and log how many objects were collected
        logger.info("============ garbage collector collecting %d ..." %
                    gc.collect())
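
The meta-learning loop in Example #5 keeps calling every objective until none of them has data left: each *_step call returns a truthy value while it can still produce a batch and a falsy one once it is exhausted, and the epoch ends when all of them are exhausted. The toy sketch below reproduces just that termination logic with stand-in step functions; the names and batch counts are made up.

def make_step(n_batches):
    # stand-in for a trainer step: returns True while it still has data
    state = {"left": n_batches}
    def step():
        if state["left"] == 0:
            return False
        state["left"] -= 1
        return True
    return step

steps = [make_step(3), make_step(5), make_step(1)]

n_iters = 0
while True:
    results = [step() for step in steps]
    if not any(results):
        break  # end of "epoch": no task produced a batch
    n_iters += 1

print("iterations in this epoch:", n_iters)  # 5, the length of the longest task
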
Example #6
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    if params.other_seed > -1:
        # deterministic
        torch.manual_seed(params.other_seed)
        torch.cuda.manual_seed(params.other_seed)
        np.random.seed(params.other_seed)
        random.seed(params.other_seed)

    if params.iter_seed == -1:
        # non-deterministic
        params.iter_seed = None

    # load data
    data = load_data(params)
    writer = SummaryWriter(params.dump_path + "/" + params.exp_name + "_log")

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        sys.exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)
    _iter = 0

    # dump initial weights
    if params.save_initial:
        trainer.save_checkpoint('initial', include_optimizers=False)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:
            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                if params.only_vlm:
                    # with visual features
                    trainer.vlm_step(lang1, lang2, params.lambda_mlm, _iter)
                else:
                    trainer.mlm_step(lang1, lang2, params.lambda_mlm, _iter)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # multimodal machine translation (MMT) steps
            for lang1, lang2 in shuf_order(params.mmt_steps, params):
                trainer.mmt_step(lang1, lang2, params.lambda_mt)

            trainer.iter()
            _iter += 1

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            writer.add_scalar(k, v, _iter)
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
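
Example #6 additionally mirrors every evaluation metric to TensorBoard through SummaryWriter.add_scalar. A minimal, self-contained sketch of that logging step is shown below; the log directory and the scores dictionary are made up for illustration, whereas run_all_evals in the example returns the real metric names.

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("dumped/example_log")                   # illustrative log directory
scores = {"valid_en_mlm_ppl": 12.3, "valid_en_mlm_acc": 63.1}  # made-up metrics
step = 0

for name, value in scores.items():
    writer.add_scalar(name, value, step)                       # one curve per metric
writer.close()
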
Example #7
def main(params):
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    if params.build_nmt_domain_feature:
        import torch
        from src.curriculum import build_nmt_domain_feature
        dataset = data['para'][('de', 'en')]['train']
        batches, indices = dataset.get_iterator(
            shuffle=False,
            group_by_size=params.group_by_size,
            n_sentences=-1,
        )

        features = build_nmt_domain_feature(data, params, batches, dataset)
        result = {'indices': indices, 'domain_feature': features}
        torch.save(result, params.build_output_path)
        return

    if params.build_nlm_domain_feature:
        import torch
        from src.curriculum import build_nlm_domain_feature
        dataset = data['para'][('de', 'en')]['train']
        batches, indices = dataset.get_iterator(
            shuffle=False,
            group_by_size=params.group_by_size,
            n_sentences=-1,
        )
        # dataset = data['mono_stream']['en']['train']
        # indices = dataset.get_iterator(
        #     shuffle=False
        # )

        features, domain_score, sents = build_nlm_domain_feature(
            data, params, batches, dataset)
        result = {
            'indices': indices,
            'domain_feature': features,
            'domain_score': domain_score,
            'sents': sents
        }
        if params.build_output_path.endswith('pth'):
            build_output_path = params.build_output_path
        else:
            build_output_path = f'{params.build_output_path}/final.pth'
        torch.save(result, build_output_path)
        return

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    elif params.dual_encoder:
        encoder1, encoder2 = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    elif params.domains:
        if params.curriculum_learning:
            trainer = CurriculumTrainer(encoder, decoder, data, params)
        elif params.dual_encoder:
            trainer = DualEncoderTrainer(encoder1, encoder2, data, params)
        else:
            trainer = MultiDomainTrainer(encoder, decoder, data, params)
        if params.local_adapt:
            evaluator = MetaMultiDomainEvaluator(trainer, data, params)
        else:
            if params.curriculum_learning:
                evaluator = EncDecEvaluator(trainer, data, params)
            elif params.dual_encoder:
                evaluator = DualEncoderEvaluator(trainer, data, params)
            else:
                evaluator = MultiDomainEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

            # periodically update the per-domain sampling ratios
            if (not params.dual_encoder and params.domains
                    and params.domain_ratio_update_freq > 0
                    and trainer.n_total_iter % params.domain_ratio_update_freq == 0
                    and not params.sampling_uniform):
                evaluator.update_language_sampler_multidomain()
                evaluator.update_dataset_ratio(trainer)

            # periodically reset the per-domain sampling ratios
            if (not params.dual_encoder and params.domain_reset_freq > 0
                    and trainer.n_total_iter % params.domain_reset_freq == 0):
                evaluator.reset_dataset_ratio(trainer)

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
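
Example #7's build_*_domain_feature branches end by bundling the computed features into a dictionary and writing it with torch.save, so another process can reload them later. A minimal sketch of that dump-and-reload round trip, with a made-up file name and tensor shapes, looks like this:

import torch

# made-up feature tensors standing in for the output of build_nmt_domain_feature
result = {
    "indices": torch.arange(8),
    "domain_feature": torch.randn(8, 4),
}
torch.save(result, "domain_features.pth")            # illustrative output path

loaded = torch.load("domain_features.pth")
print(loaded["domain_feature"].shape)                # torch.Size([8, 4])
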