# Imports assumed for a self-contained snippet (`dist` may be deepspeed.comm
# rather than torch.distributed in the original file).
from typing import Any
from unittest import mock

import torch.distributed as dist
from deepspeed.pipe import PipelineModule
from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology


def get_pipeline_module(*args: Any, **kwargs: Any) -> PipelineModule:
    """Create a pipeline module with the correct topology type."""
    # Patch out .to() so construction does not move parameters to a device.
    with mock.patch.object(PipelineModule, 'to', mock.MagicMock()):
        m = PipelineModule(*args, **kwargs)
    m._topo = PipeModelDataParallelTopology(
        num_pp=m.num_stages,
        num_dp=dist.get_world_size(m.world_group) // m.num_stages,
        num_mp=1,
    )
    return m
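# Hypothetical usage sketch (not from the original file): build a two-stage
# module from a flat layer list, assuming torch.distributed is already
# initialized with a world size divisible by the stage count.
import torch

layers = [torch.nn.Linear(16, 16), torch.nn.ReLU(), torch.nn.Linear(16, 4)]
module = get_pipeline_module(layers=layers, num_stages=2)
assert module.num_stages == 2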
def train_pipe(args, part='parameters'):
    torch.manual_seed(args.seed)
    deepspeed.runtime.utils.set_random_seed(args.seed)

    #
    # Build the model
    #

    # VGG also works :-)
    #net = vgg19(num_classes=10)
    net = AlexNet(num_classes=10)
    net = PipelineModule(layers=join_layers(net),
                         loss_fn=torch.nn.CrossEntropyLoss(),
                         num_stages=args.pipeline_parallel_size,
                         partition_method=part,
                         activation_checkpoint_interval=0)

    # cifar_trainset is defined elsewhere in this script.
    trainset = cifar_trainset(args.local_rank)

    engine, _, _, _ = deepspeed.initialize(
        args=args,
        model=net,
        model_parameters=[p for p in net.parameters() if p.requires_grad],
        training_data=trainset)

    for step in range(args.steps):
        loss = engine.train_batch()
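# join_layers is defined elsewhere in this script; a minimal sketch in the
# style of the DeepSpeed pipeline-parallelism tutorial, which flattens
# torchvision's AlexNet into a flat layer list that PipelineModule can
# partition (the flatten step bridges the conv features and the classifier):
import torch

def join_layers(vision_model):
    return [
        *vision_model.features,
        vision_model.avgpool,
        lambda x: torch.flatten(x, 1),
        *vision_model.classifier,
    ]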
def _helper():
    base_model = copy.deepcopy(sequential_model)
    base_input = batch_input.clone().detach()
    base_output = base_model(base_input)
    base_params = sum(p.numel() for p in base_model.parameters())

    pipe_model = copy.deepcopy(sequential_model)
    pipe_model = PipelineModule(layers=pipe_model, num_stages=4)

    # Ensure all parameters are accounted for.
    my_params = sum(p.numel() for p in pipe_model.parameters())
    total_pipe_params = torch.LongTensor([my_params]).to('cuda')
    dist.all_reduce(total_pipe_params)
    total_pipe_params = total_pipe_params.item()
    assert total_pipe_params == base_params

    pipe_model, _, _, _ = deepspeed.initialize(
        args=simple_args,
        model=pipe_model,
        model_parameters=[p for p in pipe_model.parameters()])

    if pipe_model.is_first_stage or pipe_model.is_last_stage:
        pipe_input = base_input.clone().detach().to('cuda')
        # label 0 is meaningless
        dataset = [(pipe_input, 0)]
        loader = RepeatingLoader(dataset)
        data_iter = iter(loader)
    else:
        data_iter = None

    pipe_output = pipe_model.eval_batch(data_iter=data_iter)

    base_output = base_output.to('cpu')
    pipe_output = pipe_output.to('cpu')
    assert torch.allclose(base_output, pipe_output)
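# Hypothetical fixtures assumed by _helper() (names taken from the snippet;
# shapes and layer sizes are illustrative): a small sequential model that a
# 4-stage PipelineModule can split evenly, plus a matching input batch.
# RepeatingLoader is available as deepspeed.utils.RepeatingLoader.
import torch

sequential_model = torch.nn.Sequential(
    torch.nn.Linear(8, 8),
    torch.nn.ReLU(),
    torch.nn.Linear(8, 8),
    torch.nn.ReLU(),
)
batch_input = torch.randn(4, 8)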
def __init__(self, context: DeepSpeedTrialContext) -> None:
    self.context = context
    self.args = AttrDict(self.context.get_hparams())

    model = AlexNet(10)
    model = PipelineModule(
        layers=join_layers(model),
        loss_fn=torch.nn.CrossEntropyLoss(),
        num_stages=self.args.pipe_parallel_size,
        partition_method=self.args.part,
        activation_checkpoint_interval=0,
    )
    ds_config = overwrite_deepspeed_config(
        self.args.deepspeed_config,
        self.args.get("overwrite_deepspeed_args", {}))
    model_engine, optimizer, _, _ = deepspeed.initialize(
        args=self.args,
        model=model,
        model_parameters=[
            p for p in model.parameters() if p.requires_grad
        ],
        config=ds_config,
    )
    self.model_engine = self.context.wrap_model_engine(model_engine)
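# A minimal hyperparameter dict this trial would read via get_hparams()
# (keys inferred from the attribute accesses above; values are assumptions):
example_hparams = {
    "pipe_parallel_size": 2,
    "part": "parameters",
    "deepspeed_config": "ds_config.json",
    "overwrite_deepspeed_args": {},
}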
else:
    model = nn.Sequential(
        ORTModule(nn.Linear(d_in, d_hidden).to(device)),      # Stage 1
        nn.ReLU().to(device),      # Stage 1. TODO: ORTModule can wrap ReLU once stateless models are supported.
        ORTModule(nn.Linear(d_hidden, d_hidden).to(device)),  # Stage 1
        nn.ReLU().to(device),      # Stage 1. TODO: same as above.
        ORTModule(nn.Linear(d_hidden, d_hidden).to(device)),  # Stage 2
        nn.ReLU().to(device),      # Stage 2. TODO: same as above.
        ORTModule(nn.Linear(d_hidden, d_out).to(device)),     # Stage 2
    )

model = PipelineModule(layers=model,
                       loss_fn=torch.nn.CrossEntropyLoss(),
                       num_stages=args.pipeline_parallel_size,
                       partition_method='uniform',  # or 'parameters'
                       activation_checkpoint_interval=0)
params = [p for p in model.parameters() if p.requires_grad]

# Input.
x = torch.rand((n, d_in))
if args.fp16:
    x = x.half()
# Output.
y = torch.randint(0, d_out, (n,))
ds = SampleData(x, y)

print("Initialize deepspeed")
# The original snippet is truncated after `args=args,`; the remaining
# arguments below are assumed, following the initialize pattern above.
model_engine, optimizer, _, _ = deepspeed.initialize(args=args,
                                                     model=model,
                                                     model_parameters=params,
                                                     training_data=ds)
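# SampleData is defined elsewhere in this script; a minimal Dataset matching
# its usage above (a sketch, not the original definition):
import torch

class SampleData(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return self.x.size(0)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]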
transform = transforms.Compose([
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

trainset = torchvision.datasets.CIFAR10(root='./data',
                                        train=True,
                                        download=True,
                                        transform=transform)

deepspeed.init_distributed()

net = AlexNet(num_classes=10)
net = PipelineModule(layers=join_layers(net),
                     loss_fn=torch.nn.CrossEntropyLoss(),
                     num_stages=2,
                     partition_method="parameters",
                     activation_checkpoint_interval=0)

args = add_argument()
engine, optimizer, trainloader, __ = deepspeed.initialize(
    args=args,
    model=net,
    model_parameters=[p for p in net.parameters() if p.requires_grad],
    training_data=trainset)

for step in range(steps):
    loss = engine.train_batch()
    print(loss)

# deepspeed --hostfile=./hostfile model_parallel/deepspeed/tutorial.py --deepspeed --deepspeed_config model_parallel/deepspeed/ds_config.json
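# The launch command above points at ds_config.json; a minimal config of the
# kind it might contain (values are assumptions, not the tutorial's actual
# settings). deepspeed.initialize also accepts the same structure as a dict
# via its config parameter.
example_ds_config = {
    "train_batch_size": 16,               # global batch size (assumed)
    "train_micro_batch_size_per_gpu": 4,  # per-GPU micro-batch for pipelining (assumed)
    "steps_per_print": 10,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
}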