Example #1
lossf = LabelSmoothingLoss(nwordt, cnfg.label_smoothing,
                           ignore_index=pad_id,
                           reduction='sum',
                           forbidden_index=cnfg.forbidden_indexes)

if cnfg.src_emb is not None:
    logger.info("Load source embedding from: " + cnfg.src_emb)
    load_emb(cnfg.src_emb, mymodel.enc.wemb.weight, nwordi,
             cnfg.scale_down_emb, cnfg.freeze_srcemb)
if cnfg.tgt_emb is not None:
    logger.info("Load target embedding from: " + cnfg.tgt_emb)
    load_emb(cnfg.tgt_emb, mymodel.dec.wemb.weight, nwordt,
             cnfg.scale_down_emb, cnfg.freeze_tgtemb)

if use_cuda:
    mymodel.to(cuda_device)
    lossf.to(cuda_device)

optimizer = Optimizer(mymodel.parameters(),
                      lr=init_lr,
                      betas=adam_betas_default,
                      eps=ieps_adam_default,
                      weight_decay=cnfg.weight_decay,
                      amsgrad=use_ams)
optimizer.zero_grad(set_to_none=True)

use_amp = cnfg.use_amp and use_cuda
scaler = GradScaler() if use_amp else None

if multi_gpu:
    mymodel = DataParallelMT(mymodel,
                             device_ids=cuda_devices,
                             output_device=cuda_device.index,
                             host_replicate=True,
                             gather_output=False)
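
This first excerpt stops before the training loop, but the GradScaler built above is normally driven once per batch roughly as follows. This is a minimal sketch only, reusing mymodel, lossf, optimizer, scaler and use_amp from above; seq_in and seq_out are hypothetical placeholders for a prepared source/target batch, and the repository's actual loop may differ.

# Sketch of one mixed-precision update (hypothetical batch tensors seq_in, seq_out).
from torch.cuda.amp import autocast

def train_step(seq_in, seq_out):
    with autocast(enabled=use_amp):
        output = mymodel(seq_in, seq_out)   # forward pass under autocast
        loss = lossf(output, seq_out)       # label-smoothed, sum-reduced loss
    if scaler is not None:
        scaler.scale(loss).backward()       # backward on the scaled loss
        scaler.step(optimizer)              # unscales grads; skips the step on inf/NaN
        scaler.update()                     # adjust the loss scale for the next step
    else:
        loss.backward()
        optimizer.step()
    optimizer.zero_grad(set_to_none=True)
    return loss.detach()
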
Example #2
        _emb = _emb.narrow(0, 0, nwordt).contiguous()
    if cnfg.scale_down_emb:
        _emb.div_(sqrt(cnfg.isize))
    mymodel.dec.wemb.weight.data = _emb
    if cnfg.freeze_tgtemb:
        mymodel.dec.wemb.weight.requires_grad_(False)
    else:
        mymodel.dec.wemb.weight.requires_grad_(True)
    _emb = None

if use_cuda:
    mymodel.to(cuda_device)
    lossf.to(cuda_device)

# lr will be overwritten by GoogleLR before it is used
optimizer = optim.Adam(mymodel.parameters(),
                       lr=1e-4,
                       betas=(0.9, 0.98),
                       eps=1e-9,
                       weight_decay=cnfg.weight_decay,
                       amsgrad=use_ams)

if multi_gpu:
    #mymodel = nn.DataParallel(mymodel, device_ids=cuda_devices, output_device=cuda_device.index)
    mymodel = DataParallelMT(mymodel,
                             device_ids=cuda_devices,
                             output_device=cuda_device.index,
                             host_replicate=True,
                             gather_output=False)
    lossf = DataParallelCriterion(lossf,
                                  device_ids=cuda_devices,
                                  output_device=cuda_device.index)
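
Examples #1 and #3 call a load_emb helper whose body is not shown; the inline code in this second excerpt (narrow to the vocabulary size, optionally divide by sqrt(isize), assign to wemb.weight, optionally freeze) suggests what it does. The following is a hypothetical reconstruction under that assumption, not the repository's actual implementation.

# Hypothetical load_emb-style helper inferred from the inline logic above.
from math import sqrt
import torch

def load_emb_sketch(emb_file, weight, nword, scale_down, freeze, isize):
    _emb = torch.load(emb_file, map_location="cpu")   # assumed serialized tensor
    if _emb.size(0) > nword:
        _emb = _emb.narrow(0, 0, nword).contiguous()  # keep the first nword rows
    if scale_down:
        _emb.div_(sqrt(isize))                        # scale down by sqrt(isize), as above
    weight.data.copy_(_emb)                           # write into the model embedding
    weight.requires_grad_(not freeze)                 # freeze it if requested
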
Example #3
	mymodel = load_model_cpu(fine_tune_m, mymodel)

lossf = LabelSmoothingLoss(nwordt, cnfg.label_smoothing, ignore_index=pad_id, reduction='sum', forbidden_index=cnfg.forbidden_indexes)

if cnfg.src_emb is not None:
	logger.info("Load source embedding from: " + cnfg.src_emb)
	load_emb(cnfg.src_emb, mymodel.enc.wemb.weight, nwordi, cnfg.scale_down_emb, cnfg.freeze_srcemb)
if cnfg.tgt_emb is not None:
	logger.info("Load target embedding from: " + cnfg.tgt_emb)
	load_emb(cnfg.tgt_emb, mymodel.dec.wemb.weight, nwordt, cnfg.scale_down_emb, cnfg.freeze_tgtemb)

if use_cuda:
	mymodel.to(cuda_device)
	lossf.to(cuda_device)

optimizer = optim.Adam(mymodel.parameters(), lr=init_lr, betas=adam_betas_default, eps=ieps_adam_default, weight_decay=cnfg.weight_decay, amsgrad=use_ams)
optimizer.zero_grad()

use_amp = cnfg.use_amp and use_cuda
scaler = GradScaler() if use_amp else None

if multi_gpu:
	mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
	lossf = DataParallelCriterion(lossf, device_ids=cuda_devices, output_device=cuda_device.index, replicate_once=True)

fine_tune_state = cnfg.fine_tune_state
if fine_tune_state is not None:
	logger.info("Load optimizer state from: " + fine_tune_state)
	optimizer.load_state_dict(h5load(fine_tune_state))

lrsch = GoogleLR(optimizer, cnfg.isize, cnfg.warm_step, scale=cnfg.lr_scale)
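
GoogleLR is the repository's own scheduler; its constructor arguments (isize, warm_step, scale) match the inverse-square-root warmup schedule of "Attention Is All You Need", which the comment in Example #2 ("lr will be overwritten by GoogleLR") also implies. Below is a minimal sketch of that schedule, assuming this correspondence; the class itself may differ in detail.

# Noam-style learning-rate schedule suggested by GoogleLR's constructor arguments.
def noam_lr(step, isize, warm_step, scale=1.0):
    step = max(step, 1)  # avoid 0 ** -0.5 on the first call
    return scale * isize ** -0.5 * min(step ** -0.5, step * warm_step ** -1.5)

# E.g. with isize=512 and warm_step=8000 the rate rises linearly for 8000 steps,
# peaks near 5e-4, then decays proportionally to 1 / sqrt(step).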