	forbidden_index=cnfg.forbidden_indexes)

if cnfg.src_emb is not None:
	logger.info("Load source embedding from: " + cnfg.src_emb)
	load_emb(cnfg.src_emb, mymodel.enc.wemb.weight, nwordi, cnfg.scale_down_emb, cnfg.freeze_srcemb)
if cnfg.tgt_emb is not None:
	logger.info("Load target embedding from: " + cnfg.tgt_emb)
	load_emb(cnfg.tgt_emb, mymodel.dec.wemb.weight, nwordt, cnfg.scale_down_emb, cnfg.freeze_tgtemb)

if use_cuda:
	mymodel.to(cuda_device)
	lossf.to(cuda_device)

optimizer = Optimizer(mymodel.parameters(), lr=init_lr, betas=adam_betas_default, eps=ieps_adam_default, weight_decay=cnfg.weight_decay, amsgrad=use_ams)
optimizer.zero_grad(set_to_none=True)

use_amp = cnfg.use_amp and use_cuda
scaler = GradScaler() if use_amp else None

if multi_gpu:
	mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
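# --- Illustrative sketch, not part of the script: how the use_amp flag and the
# GradScaler set up above are typically consumed in a training step, assuming the
# standard torch.cuda.amp API. The batch/target names (seq_batch, seq_o) and the
# forward/loss call shapes are placeholders, not identifiers from this repository.
from torch.cuda.amp import autocast

def amp_train_step(mymodel, lossf, optimizer, scaler, seq_batch, seq_o, use_amp, loss_norm=1.0):
	# run the forward pass and loss under autocast only when AMP is enabled
	with autocast(enabled=use_amp):
		output = mymodel(seq_batch, seq_o)
		loss = lossf(output, seq_o) / loss_norm
	if scaler is None:
		# full-precision path: plain backward + step
		loss.backward()
		optimizer.step()
	else:
		# AMP path: scale the loss before backward, then step/update through the scaler
		scaler.scale(loss).backward()
		scaler.step(optimizer)
		scaler.update()
	optimizer.zero_grad(set_to_none=True)
	return loss.item()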
_emb = _emb.narrow(0, 0, nwordt).contiguous()
if cnfg.scale_down_emb:
	_emb.div_(sqrt(cnfg.isize))
mymodel.dec.wemb.weight.data = _emb
if cnfg.freeze_tgtemb:
	mymodel.dec.wemb.weight.requires_grad_(False)
else:
	mymodel.dec.wemb.weight.requires_grad_(True)
_emb = None

if use_cuda:
	mymodel.to(cuda_device)
	lossf.to(cuda_device)

# lr will be overwritten by GoogleLR before it is used
optimizer = optim.Adam(mymodel.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9, weight_decay=cnfg.weight_decay, amsgrad=use_ams)

if multi_gpu:
	#mymodel = nn.DataParallel(mymodel, device_ids=cuda_devices, output_device=cuda_device.index)
	mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
	lossf = DataParallelCriterion(lossf, device_ids=cuda_devices, output_device=cuda_device.index, replicate_once=True)
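# --- Assumption for clarity: GoogleLR is taken here to be the warm-up schedule of
# Vaswani et al. (2017), which is why the Adam lr above is a throw-away value. The
# function below is an illustrative stand-in, not the scheduler class used in this
# repository; lr = scale * isize**-0.5 * min(step**-0.5, step * warm_step**-1.5).
def google_lr(step, isize, warm_step, scale=1.0):
	step = max(step, 1)
	return scale * isize ** -0.5 * min(step ** -0.5, step * warm_step ** -1.5)

# e.g. google_lr(8000, 512, 8000) gives the peak rate (~4.9e-4); afterwards the rate
# decays with the inverse square root of the step count.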
mymodel = load_model_cpu(fine_tune_m, mymodel)

lossf = LabelSmoothingLoss(nwordt, cnfg.label_smoothing, ignore_index=pad_id, reduction='sum', forbidden_index=cnfg.forbidden_indexes)

if cnfg.src_emb is not None:
	logger.info("Load source embedding from: " + cnfg.src_emb)
	load_emb(cnfg.src_emb, mymodel.enc.wemb.weight, nwordi, cnfg.scale_down_emb, cnfg.freeze_srcemb)
if cnfg.tgt_emb is not None:
	logger.info("Load target embedding from: " + cnfg.tgt_emb)
	load_emb(cnfg.tgt_emb, mymodel.dec.wemb.weight, nwordt, cnfg.scale_down_emb, cnfg.freeze_tgtemb)

if use_cuda:
	mymodel.to(cuda_device)
	lossf.to(cuda_device)

optimizer = optim.Adam(mymodel.parameters(), lr=init_lr, betas=adam_betas_default, eps=ieps_adam_default, weight_decay=cnfg.weight_decay, amsgrad=use_ams)
optimizer.zero_grad()

use_amp = cnfg.use_amp and use_cuda
scaler = GradScaler() if use_amp else None

if multi_gpu:
	mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
	lossf = DataParallelCriterion(lossf, device_ids=cuda_devices, output_device=cuda_device.index, replicate_once=True)

fine_tune_state = cnfg.fine_tune_state
if fine_tune_state is not None:
	logger.info("Load optimizer state from: " + fine_tune_state)
	optimizer.load_state_dict(h5load(fine_tune_state))

lrsch = GoogleLR(optimizer, cnfg.isize, cnfg.warm_step, scale=cnfg.lr_scale)
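# --- Illustrative sketch, not the repository's implementation: load_emb as called
# above presumably wraps the inline logic shown in the earlier excerpt (truncate to
# the vocabulary size, optionally scale down by sqrt(isize), copy into the embedding
# weight, then freeze or unfreeze it). It assumes the embedding file is a torch-saved
# tensor; the signature merely mirrors the call sites above.
import torch
from math import sqrt

def load_emb(emb_file, wemb_weight, nword, scale_down_emb, freeze_emb):
	_emb = torch.load(emb_file, map_location='cpu')
	if _emb.size(0) > nword:
		# keep only the first nword rows to match the model's vocabulary
		_emb = _emb.narrow(0, 0, nword).contiguous()
	if scale_down_emb:
		# counteract the sqrt(isize) scaling applied to embeddings in the forward pass
		_emb = _emb / sqrt(wemb_weight.size(-1))
	# copy into (a prefix of) the embedding matrix and set trainability
	wemb_weight.data.narrow(0, 0, _emb.size(0)).copy_(_emb)
	wemb_weight.requires_grad_(not freeze_emb)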