import pytest
import torch.nn as nn

from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.strategies import DeepSpeedStrategy
from pytorch_lightning.utilities.meta import init_meta_context, is_on_meta_device, materialize_module
from tests.helpers.boring_model import BoringModel


def test_deepspeed_with_meta_device(tmpdir):
    # Requires two GPUs and the deepspeed package to run.
    with init_meta_context():
        model = BoringModel()
    assert model.layer.weight.device.type == "meta"
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        gpus=2,
        fast_dev_run=True,
        precision=16,
    )
    trainer.fit(model)
    # After fit, DeepSpeed stage 3 has materialized the weights; the module
    # handed back to the user lives on CPU.
    assert model.layer.weight.device.type == "cpu"
def test_materialize_module_recursive_child():
    """Test that materialize_module doesn't recursively attach a child to a model instantiated within
    init_meta_context."""
    with init_meta_context():
        model = BoringModel()

    materialize_module(model)

    with pytest.raises(AttributeError, match="'Linear' object has no attribute 'layer'"):
        model.layer.layer
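# The next test uses two small helper modules whose definitions live elsewhere
# in the test module. The sketches below are assumptions consistent with how
# the test exercises them (both expose ``layer`` as an ``nn.Sequential`` of
# ``nn.Linear`` layers, indexable as ``layer[0]``), not the originals.
class MLP(nn.Module):
    def __init__(self, num_layers: int):
        super().__init__()
        self.layer = nn.Sequential(*(nn.Linear(1, 1) for _ in range(num_layers)))


class SimpleBoringModel(LightningModule):
    def __init__(self, num_layers: int):
        super().__init__()
        self.layer = nn.Sequential(*(nn.Linear(1, 1) for _ in range(num_layers)))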
def test_init_meta_context():
    with init_meta_context():
        m = nn.Linear(in_features=1, out_features=1)
        assert isinstance(m, nn.Linear)
        assert m.weight.device.type == "meta"
        assert is_on_meta_device(m)
        mlp = MLP(4)
        assert mlp.layer[0].weight.device.type == "meta"

        mlp = materialize_module(mlp)
        assert mlp.layer[0].weight.device.type == "cpu"

        assert not is_on_meta_device(mlp)
        assert not is_on_meta_device(nn.Module())

        model = SimpleBoringModel(4)
        assert model.layer[0].weight.device.type == "meta"
        materialize_module(model)
        assert model.layer[0].weight.device.type == "cpu"

    mlp = MLP(4)
    assert mlp.layer[0].weight.device.type == "cpu"
    # no-op as already materialized.
    materialize_module(mlp)
    assert mlp.layer[0].weight.device.type == "cpu"

    m = nn.Linear(in_features=1, out_features=1)
    assert m.weight.device.type == "cpu"

    with init_meta_context():
        m = nn.Linear(in_features=1, out_features=1)
        assert m.weight.device.type == "meta"

    m = nn.Linear(in_features=1, out_features=1)
    assert m.weight.device.type == "cpu"
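# For intuition, ``is_on_meta_device`` behaves like the minimal sketch below
# (an assumption inferred from the asserts above, not the library's actual
# implementation): a module is "on meta" when its first parameter lives on the
# meta device, and a parameter-less ``nn.Module()`` is not.
def _is_on_meta_device_sketch(module: nn.Module) -> bool:
    try:
        param = next(module.parameters())
        return param.device.type == "meta"
    except StopIteration:
        # No parameters at all (e.g. a bare ``nn.Module()``): not on meta.
        return False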
args = parser.parse_args()

if not os.path.exists("input.txt"):
    os.system(
        "wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    )

text = open("input.txt", "r").read()  # don't worry we won't run out of file handles
train_dataset = CharDataset(text, args.block_size)  # one line of poem is roughly 50 characters
train_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)

with init_meta_context():
    model = GPT(
        vocab_size=train_dataset.vocab_size,
        block_size=train_dataset.block_size,
        n_layer=args.n_layer,
        n_head=args.n_head,
        n_embd=args.n_embd,
        learning_rate=args.learning_rate,
    )

lr_decay = LearningRateDecayCallback(
    learning_rate=6e-4,
    warmup_tokens=512 * 20,
    final_tokens=2 * len(train_dataset) * args.block_size,
)

trainer = Trainer.from_argparse_args(
    args,
    max_epochs=10,