def test_memory_efficient(self): model = copy.deepcopy(self.model) params = list(model.parameters()) optimizer = MemoryEfficientFP16Optimizer.build_optimizer( self.cfg_dls, params) self.run_iter(model, params, optimizer)
def test_load_state_dict(self): # define simple FP16 model model = torch.nn.Linear(5, 5).cuda().half() params = list(model.parameters()) # initialize memory efficient FP16 optimizer # with pseudo DictConfigs optimizer = FairseqAdam( cfg=OmegaConf.create( vars( argparse.Namespace( adam_betas="(0.9, 0.999)", adam_eps=1e-8, weight_decay=0.0, lr=[0.00001], ) ) ), params=params, ) me_optimizer = MemoryEfficientFP16Optimizer( cfg=OmegaConf.create( { "common": vars( argparse.Namespace( fp16_init_scale=1, fp16_scale_window=1, fp16_scale_tolerance=1, threshold_loss_scale=1, min_loss_scale=1e-4, ) ) } ), params=params, optimizer=optimizer, ) # optimizer state is created in the first step loss = model(torch.rand(5).cuda().half()).sum() me_optimizer.backward(loss) me_optimizer.step() # reload state state = me_optimizer.state_dict() me_optimizer.load_state_dict(state) for k, v in me_optimizer.optimizer.state.items(): self.assertTrue(k.dtype == torch.float16) for v_i in v.values(): if torch.is_tensor(v_i): self.assertTrue(v_i.dtype == torch.float32)