def run_trainer(args, emb_rref_list):
    """
    Trainer function to be run from each machine. This function:
    1. Performs some basic initialization steps.
    2. Prepares random data for training.
    3. Sanity checks cmd-line args such as embedding sizes and MLP layers.
    4. Sets up the model, loss, and Distributed Optimizer.
    5. Runs the Training Loop.
    """
    ######## BASIC INITIALIZATION ########
    set_rand_seed()
    set_print_options(args.print_precision)
    args.use_gpu = args.use_gpu and torch.cuda.is_available()
    init_gpu(args.use_gpu)
    # print(args)

    ######## PREPARE TRAINING DATA ########
    ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
    # input and target at random
    ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
    m_den = ln_bot[0]
    train_data, train_loader = dp.make_random_data_and_loader(
        args, ln_emb, m_den)
    nbatches = args.num_batches if args.num_batches > 0 else len(train_loader)

    ######## PARSE CMD LINE ARGS ########
    m_spa = args.arch_sparse_feature_size
    num_fea = ln_emb.size + 1  # num sparse + num dense features
    m_den_out = ln_bot[ln_bot.size - 1]
    if args.arch_interaction_op == "dot":
        # approach 1: all
        # num_int = num_fea * num_fea + m_den_out
        # approach 2: unique
        if args.arch_interaction_itself:
            num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out
        else:
            num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out
    elif args.arch_interaction_op == "cat":
        num_int = num_fea * m_den_out
    else:
        sys.exit("ERROR: --arch-interaction-op=" + args.arch_interaction_op +
                 " is not supported")
    arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top
    ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-")

    ######## SANITY CHECKS ########
    # Ensure feature sizes and MLP dimensions match
    if m_den != ln_bot[0]:
        sys.exit("ERROR: arch-dense-feature-size " + str(m_den) +
                 " does not match first dim of bottom mlp " + str(ln_bot[0]))
    if m_spa != m_den_out:
        sys.exit("ERROR: arch-sparse-feature-size " + str(m_spa) +
                 " does not match last dim of bottom mlp " + str(m_den_out))
    if num_int != ln_top[0]:
        sys.exit("ERROR: # of feature interactions " + str(num_int) +
                 " does not match first dimension of top mlp " + str(ln_top[0]))

    # test prints (model arch)
    if args.debug_mode:
        print("model arch:")
        print("mlp top arch " + str(ln_top.size - 1) +
              " layers, with input to output dimensions:")
        print(ln_top)
        print("# of interactions")
        print(num_int)
        print("mlp bot arch " + str(ln_bot.size - 1) +
              " layers, with input to output dimensions:")
        print(ln_bot)
        print("# of features (sparse and dense)")
        print(num_fea)
        print("dense feature size")
        print(m_den)
        print("sparse feature size")
        print(m_spa)
        print("# of embeddings (= # of sparse features) " + str(ln_emb.size) +
              ", with dimensions " + str(m_spa) + "x:")
        print(ln_emb)

        print("data (inputs and targets):")
        for j, (X, offsets, indices, T) in enumerate(train_loader):
            # early exit if nbatches was set by the user and has been exceeded
            if nbatches > 0 and j >= nbatches:
                break
            print("mini-batch: %d" % j)
            print(X.detach().cpu().numpy())
            # transform offsets to lengths when printing
            print([
                np.diff(S_o.detach().cpu().tolist() +
                        list(indices[i].shape)).tolist()
                for i, S_o in enumerate(offsets)
            ])
            print([S_i.detach().cpu().tolist() for S_i in indices])
            print(T.detach().cpu().numpy())

    ######## TRAINING SETUP ########
    # Initialize the model (note we are passing the list of RRefs that point to
    # the remote embeddings).
    dlrm = model.DLRM_RPC(
        emb_rref_list,
        args.distributed_rank,
        args.use_gpu,
        ln_emb,
        ln_bot,
        ln_top,
        arch_interaction_op=args.arch_interaction_op,
        arch_interaction_itself=args.arch_interaction_itself,
        sigmoid_bot=-1,
        sigmoid_top=ln_top.size - 2,
    )

    # Specify the loss function
    loss_fn = torch.nn.MSELoss(reduction="mean")

    model_parameter_rrefs = []
    # RRefs for embeddings from PS
    for ind, emb_rref in enumerate(emb_rref_list):
        ps_name = "ps{}".format(ind)
        model_parameter_rrefs.extend(
            rpc.rpc_sync(ps_name,
                         _retrieve_embedding_parameters,
                         args=(emb_rref, )))
    # RRefs local to the model (MLP)
    for param in dlrm.parameters():
        model_parameter_rrefs.append(RRef(param))

    # Build DistributedOptimizer.
    opt = DistributedOptimizer(
        optim.SGD,
        model_parameter_rrefs,
        lr=args.learning_rate,
    )

    def time_wrap(use_gpu):
        if use_gpu:
            torch.cuda.synchronize()
        return time.time()

    # TODO: uncomment for comp/comms DDP benchmark
    # if args.distributed_rank == 0:
    #     state_dict_top = {}
    #     state_dict_bot = {}
    #     dlrm.top_mlp_ddp.register_comm_hook(state_dict_top, profile_hook)
    #     dlrm.bot_mlp_ddp.register_comm_hook(state_dict_bot, profile_hook)

    # training or inference
    best_gA_test = 0
    best_auc_test = 0
    total_time = 0
    total_loss = 0
    total_accu = 0
    total_iter = 0
    total_samp = 0

    # Lists to track forward and backward times per iteration
    fwd_times = []
    bwd_times = []
    rpc_fwd_times = []
    embedding_lookup_times = []

    ######## RUN TRAINING LOOP ########
    with torch.autograd.profiler.profile(enabled=args.enable_profiling,
                                         use_cuda=args.use_gpu) as prof:
        for epoch in range(args.nepochs):
            accum_time_begin = time_wrap(args.use_gpu)

            if args.mlperf_logging:
                previous_iteration_time = None

            for j, (X, offsets, indices, T) in enumerate(train_loader):
                if args.mlperf_logging:
                    current_time = time_wrap(args.use_gpu)
                    if previous_iteration_time:
                        iteration_time = current_time - previous_iteration_time
                    else:
                        iteration_time = 0
                    previous_iteration_time = current_time
                else:
                    t1 = time_wrap(args.use_gpu)

                # early exit if nbatches was set by the user and has been exceeded
                if nbatches > 0 and j >= nbatches:
                    break

                # create distributed autograd context
                with dist_autograd.context() as context_id:
                    # Run forward pass
                    fwd_start = time_wrap(args.use_gpu)
                    Z, rpc_delays, embed_lookup_delay, rpc_total = dlrm.forward(
                        X, offsets, indices)
                    fwd_end = time_wrap(args.use_gpu)

                    # Compute Loss
                    E = loss_fn(Z, T)

                    # Run distributed backward pass
                    bwd_start = time_wrap(args.use_gpu)
                    dist_autograd.backward(context_id, [E])
                    bwd_end = time_wrap(args.use_gpu)

                    # Run distributed optimizer
                    opt.step(context_id)

                if epoch >= args.warmup_epochs:
                    fwd_times.append(fwd_end - fwd_start)
                    bwd_times.append(bwd_end - bwd_start)
                    rpc_fwd_times.extend(rpc_delays)
                    embedding_lookup_times.append(embed_lookup_delay)

                # compute loss and accuracy
                L = E.detach().cpu().numpy()  # numpy array
                S = Z.detach().cpu().numpy()  # numpy array
                T = T.detach().cpu().numpy()  # numpy array
                mbs = T.shape[0]  # = args.mini_batch_size except maybe for last
                A = np.sum((np.round(S, 0) == T).astype(np.uint8))

                if args.mlperf_logging:
                    total_time += iteration_time
                else:
                    t2 = time_wrap(args.use_gpu)
                    total_time += t2 - t1
                total_accu += A
                total_loss += L * mbs
                total_iter += 1
                total_samp += mbs

                should_print = ((j + 1) % args.print_freq == 0) or (j + 1 == nbatches)
                should_test = ((args.test_freq > 0)
                               and (args.data_generation == "dataset")
                               and (((j + 1) % args.test_freq == 0) or
                                    (j + 1 == nbatches)))

                # print time, loss and accuracy
                if should_print or should_test:
                    gT = 1000.0 * total_time / total_iter if args.print_time else -1
                    total_time = 0

                    gA = total_accu / total_samp
                    total_accu = 0

                    gL = total_loss / total_samp
                    total_loss = 0

                    str_run_type = "inference" if args.inference_only else "training"
                    print(
                        "Finished {} it {}/{} of epoch {}, {:.2f} ms/it, ".format(
                            str_run_type, j + 1, nbatches, epoch, gT) +
                        "loss {:.6f}, accuracy {:3.3f} %".format(gL, gA * 100))

                    log_iter = nbatches * epoch + j + 1
                    # Uncomment the line below to print out the total time with overhead
                    # print("Accumulated time so far: {}" \
                    #       .format(time_wrap(args.use_gpu) - accum_time_begin))

                    total_iter = 0
                    total_samp = 0
            # END TRAIN LOOP

    # TODO: uncomment for comp/comms DDP benchmark
    # TODO: for bottom also
    # torch.cuda.synchronize(args.distributed_rank)
    # if args.distributed_rank == 0:
    #     for bucket_index in range(len(state_dict_top)):
    #         e_bfr = state_dict[bucket_index]["e_bfr"]
    #         e_aft = state_dict[bucket_index]["e_aft"]
    #         print(f"bucket {bucket_index} comm time: {e_bfr.elapsed_time(e_aft)}")

    mean_fwd = 1000.0 * np.mean(fwd_times)
    mean_bwd = 1000.0 * np.mean(bwd_times)
    std_fwd = 1000.0 * np.std(fwd_times)
    std_bwd = 1000.0 * np.std(bwd_times)
    rpc_fwd_mean = 1000.0 * np.mean(rpc_fwd_times)
    rpc_fwd_std = 1000.0 * np.std(rpc_fwd_times)
    embedding_fwd_mean = 1000.0 * np.mean(embedding_lookup_times)
    embedding_fwd_std = 1000.0 * np.std(embedding_lookup_times)

    print("[Trainer {}] Average FWD Time (ms): {}".format(
        args.distributed_rank, mean_fwd))
    print("[Trainer {}] STD DEV FWD Time (ms): {}".format(
        args.distributed_rank, std_fwd))
    print("[Trainer {}] Average BWD Time (ms): {}".format(
        args.distributed_rank, mean_bwd))
    print("[Trainer {}] STD DEV BWD Time (ms): {}".format(
        args.distributed_rank, std_bwd))
    print("[Trainer {}] Average RPC FWD Time (ms): {}".format(
        args.distributed_rank, rpc_fwd_mean))
    print("[Trainer {}] STD DEV RPC FWD Time (ms): {}".format(
        args.distributed_rank, rpc_fwd_std))
    print("[Trainer {}] Average Embedding Lookup Time (ms): {}".format(
        args.distributed_rank, embedding_fwd_mean))
    print("[Trainer {}] STD DEV Embedding Lookup Time (ms): {}".format(
        args.distributed_rank, embedding_fwd_std))

    # profiling
    if args.enable_profiling:
        with open("dlrm_s_pytorch.prof", "w") as prof_f:
            prof_f.write(prof.key_averages().table(sort_by="cpu_time_total"))
        prof.export_chrome_trace("./dlrm_s_pytorch.json")

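# NOTE: run_trainer above fetches embedding parameter RRefs from each parameter
# server via `_retrieve_embedding_parameters`. A minimal sketch of such a
# helper, following the pattern used in the PyTorch RPC parameter-server
# examples (the benchmark's actual implementation may differ):
from torch.distributed.rpc import RRef


def _retrieve_embedding_parameters(emb_rref):
    # Executed on the parameter server that owns emb_rref: wrap every parameter
    # of the locally held embedding module in an RRef so the trainer can hand
    # them to DistributedOptimizer.
    return [RRef(p) for p in emb_rref.local_value().parameters()]
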
def test_dist_optim(self):
    # local version
    module1 = MyModule()
    module2 = MyModule()
    params = [module1.get_w(), module2.get_w()]
    local_optim = optim.SGD(params, lr=0.05)

    old_w1 = module1.w.clone().detach()
    old_w2 = module2.w.clone().detach()

    g_cpu = torch.Generator()
    g_cpu.manual_seed(0)
    t1 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
    t2 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
    output1 = module1.forward(t2)
    output2 = module2.forward(output1)
    loss = torch.add(output2, t1).sum()

    loss.backward()
    local_optim.step()

    # distributed version
    owner1 = "worker%d" % ((self.rank + 1) % self.world_size)
    owner2 = "worker%d" % ((self.rank + 2) % self.world_size)

    remote_module1 = rpc.remote(owner1, MyModule)
    remote_module2 = rpc.remote(owner2, MyModule)
    remote_param1 = remote_method(MyModule.get_w, remote_module1)
    remote_param2 = remote_method(MyModule.get_w, remote_module2)

    old_w1_remote = remote_param1.to_here()

    # sanity check: local and remote initial weights should match
    self.assertEqual(old_w1, remote_param1.to_here())
    self.assertEqual(old_w2, remote_param2.to_here())

    dist_optim = DistributedOptimizer(optim.SGD,
                                      [remote_param1, remote_param2],
                                      lr=0.05)

    with dist_autograd.context() as context_id:
        g_cpu.manual_seed(0)
        t1 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
        t2 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
        output1 = rpc_async_method(MyModule.forward, remote_module1, t2)
        output2 = rpc_async_method(MyModule.forward, remote_module2,
                                   output1.wait())
        loss = torch.add(output2.wait(), t1)

        dist_autograd.backward(context_id, [loss.sum()])
        dist_optim.step(context_id)

        new_w1 = rpc_async_method(MyModule.get_w, remote_module1).wait()
        new_w2 = rpc_async_method(MyModule.get_w, remote_module2).wait()

        # ensure optimizer changed weights
        self.assertNotEqual(old_w1, new_w1)
        self.assertNotEqual(old_w2, new_w2)

        # ensure local equals remote
        self.assertEqual(new_w1, module1.get_w())
        self.assertEqual(new_w2, module2.get_w())

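# NOTE: test_dist_optim relies on a MyModule class and on the remote_method /
# rpc_async_method helpers defined elsewhere in the test file. A rough,
# simplified sketch of what they might look like (names kept; the seeding and
# locking details of the real test are omitted):
import torch
import torch.distributed.rpc as rpc


class MyModule:
    def __init__(self):
        # Seed so every worker builds the same initial weight.
        torch.manual_seed(0)
        self.w = torch.rand((3, 3), requires_grad=True)

    def forward(self, t1):
        return torch.mm(self.w, t1)

    def get_w(self):
        return self.w


def _call_method(method, obj_rref, *args, **kwargs):
    # Runs on the owner of obj_rref and invokes the method on the local object.
    return method(obj_rref.local_value(), *args, **kwargs)


def remote_method(method, obj_rref, *args, **kwargs):
    # Returns an RRef to the result of calling `method` on the remote object.
    return rpc.remote(obj_rref.owner(), _call_method,
                      args=[method, obj_rref] + list(args), kwargs=kwargs)


def rpc_async_method(method, obj_rref, *args, **kwargs):
    # Returns a Future with the result of calling `method` on the remote object.
    return rpc.rpc_async(obj_rref.owner(), _call_method,
                         args=[method, obj_rref] + list(args), kwargs=kwargs)
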
def test_ddp_dist_autograd_local_vs_remote_gpu(self):
    # Each trainer uses a different random seed. Otherwise, they are going
    # to have exactly the same initial model parameters, input, and
    # therefore grads. That means the grads will be the same before and
    # after DDP's all-reduce.
    torch.manual_seed(self.rank)
    dist.init_process_group(
        backend="gloo",
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    remote_layer1 = RemoteModule("worker0",
                                 device="cpu",
                                 module_cls=nn.Linear,
                                 args=(10, 7, False))
    layer1 = nn.Linear(10, 7, False)
    # Start with the same parameters for remote and local
    layer1.weight = remote_layer1.module_rref.to_here().weight

    layer2 = nn.Linear(7, 5).cuda(self.rank)
    ddp_layer2 = DistributedDataParallel(layer2, device_ids=[self.rank])

    remote_layer3 = RemoteModule("worker0",
                                 device="cpu",
                                 module_cls=nn.Linear,
                                 args=(5, 3, False))
    layer3 = nn.Linear(5, 3, False)
    # Start with the same parameters for remote and local
    layer3.weight = remote_layer3.module_rref.to_here().weight

    layer4 = nn.Linear(3, 1).cuda(self.rank)
    ddp_layer4 = DistributedDataParallel(layer4, device_ids=[self.rank])

    # Run local case.
    inputs = torch.rand((10, 10))
    loss = ddp_layer4(
        layer3(ddp_layer2(layer1(inputs).cuda(self.rank)).cpu()).cuda(
            self.rank)).sum()
    loss.backward()

    # Run remote case.
    with dist_autograd.context() as context_id:
        loss = ddp_layer4(
            remote_layer3(
                ddp_layer2(remote_layer1(inputs).cuda(
                    self.rank)).cpu()).cuda(self.rank)).sum()
        dist_autograd.backward(context_id, [loss])
        grads_dict = dist_autograd.get_gradients(context_id)
        dist.barrier()
        self.assertEqual(
            layer1.weight.grad,
            rpc.rpc_sync(
                "worker0",
                DdpComparisonTest.get_remote_grads,
                args=(remote_layer1.module_rref, context_id),
            ),
        )
        self.assertEqual(layer2.weight.grad, grads_dict[layer2.weight])
        self.assertEqual(
            layer3.weight.grad,
            rpc.rpc_sync(
                "worker0",
                DdpComparisonTest.get_remote_grads,
                args=(remote_layer3.module_rref, context_id),
            ),
        )
        self.assertEqual(layer4.weight.grad, grads_dict[layer4.weight])

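# NOTE: the rpc_sync calls in the test above compare local gradients against
# the gradients recorded on worker0. The helper they invoke is defined as a
# @staticmethod on DdpComparisonTest; shown here as a free-function sketch:
import torch.distributed.autograd as dist_autograd


def get_remote_grads(rref, context_id):
    # Runs on the worker that owns the RemoteModule: return the gradient that
    # distributed autograd accumulated for the module's weight in this context.
    return dist_autograd.get_gradients(context_id)[rref.local_value().weight]
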
def _run_trainer(remote_emb_module, rank):
    r"""
    Each trainer runs a forward pass which involves an embedding lookup on the
    parameter server and running nn.Linear locally. During the backward pass,
    DDP is responsible for aggregating the gradients for the dense part
    (nn.Linear) and distributed autograd ensures gradient updates are
    propagated to the parameter server.
    """

    # Setup the model.
    model = HybridModel(remote_emb_module, rank)

    # Retrieve all model parameters as rrefs for DistributedOptimizer.

    # Retrieve parameters for embedding table.
    model_parameter_rrefs = model.remote_emb_module.remote_parameters()

    # model.fc.parameters() only includes local parameters.
    # NOTE: Cannot call model.parameters() here,
    # because this will call remote_emb_module.parameters(),
    # which supports remote_parameters() but not parameters().
    for param in model.fc.parameters():
        model_parameter_rrefs.append(RRef(param))

    # Setup distributed optimizer
    opt = DistributedOptimizer(
        optim.SGD,
        model_parameter_rrefs,
        lr=0.05,
    )

    criterion = torch.nn.CrossEntropyLoss()

    def get_next_batch(rank):
        for _ in range(10):
            num_indices = random.randint(20, 50)
            indices = torch.LongTensor(num_indices).random_(0, NUM_EMBEDDINGS)

            # Generate offsets.
            offsets = []
            start = 0
            batch_size = 0
            while start < num_indices:
                offsets.append(start)
                start += random.randint(1, 10)
                batch_size += 1

            offsets_tensor = torch.LongTensor(offsets)
            target = torch.LongTensor(batch_size).random_(8).cuda(rank)
            yield indices, offsets_tensor, target

    # Train for 100 epochs
    for epoch in range(100):
        # create distributed autograd context
        for indices, offsets, target in get_next_batch(rank):
            with dist_autograd.context() as context_id:
                output = model(indices, offsets)
                loss = criterion(output, target)

                # Run distributed backward pass
                dist_autograd.backward(context_id, [loss])

                # Run distributed optimizer
                opt.step(context_id)

                # Not necessary to zero grads as each iteration creates a different
                # distributed autograd context which hosts different grads
        print("Training done for epoch {}".format(epoch))

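# NOTE: the HybridModel instantiated by _run_trainer combines the remote
# EmbeddingBag lookup on the parameter server with a local, DDP-wrapped
# nn.Linear, as in the PyTorch "Combining DDP with Distributed RPC" tutorial.
# A condensed sketch (the 16-dim embedding / 8-class output are assumptions):
import torch
from torch.nn.parallel import DistributedDataParallel


class HybridModel(torch.nn.Module):
    def __init__(self, remote_emb_module, device):
        super().__init__()
        self.remote_emb_module = remote_emb_module
        # Dense part lives on this trainer's GPU and is replicated via DDP.
        self.fc = DistributedDataParallel(
            torch.nn.Linear(16, 8).cuda(device), device_ids=[device])
        self.device = device

    def forward(self, indices, offsets):
        # Sparse lookup happens on the parameter server over RPC; the result
        # is moved to the trainer's GPU before the dense layer runs.
        emb_lookup = self.remote_emb_module.forward(indices, offsets)
        return self.fc(emb_lookup.cuda(self.device))
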
def dist_backward_script(context_id: int, loss: torch.Tensor):
    dist_autograd.backward(context_id, [loss])

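# NOTE: dist_backward_script simply forwards to dist_autograd.backward. A
# minimal single-process usage sketch (an RPC agent must be initialized first,
# since distributed autograd requires it; worker name and port are
# illustrative):
import os

import torch
import torch.distributed.autograd as dist_autograd
import torch.distributed.rpc as rpc

os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "29500"
rpc.init_rpc("worker0", rank=0, world_size=1)

with dist_autograd.context() as context_id:
    t = torch.rand((3, 3), requires_grad=True)
    loss = (t * 2).sum()
    dist_backward_script(context_id, loss)
    # Gradients are stored in the autograd context, not in t.grad.
    print(dist_autograd.get_gradients(context_id)[t])

rpc.shutdown()
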
def study():
    """
    Async multiplication using two remote modules
    """
    # Start with a local version
    module1 = MyModule()
    module2 = MyModule()
    params = [module1.get_w(), module2.get_w()]
    local_optim = optim.SGD(params, lr=0.05)

    # Keep a copy of the old weights to make sure they change
    old_w1 = module1.w.clone().detach()
    old_w2 = module2.w.clone().detach()

    torch.manual_seed(0)
    t1 = torch.rand((3, 3), requires_grad=True)
    t2 = torch.rand((3, 3), requires_grad=True)
    output1 = module1.forward(t2)
    output2 = module2.forward(output1)
    loss = torch.add(output2, t1).sum()

    loss.backward()
    local_optim.step()

    # distributed version
    owner1 = "worker%d" % ((Env.rank + 1) % Env.world_size)
    owner2 = "worker%d" % ((Env.rank + 2) % Env.world_size)

    remote_module1 = rpc.remote(owner1, MyModule)
    remote_module2 = rpc.remote(owner2, MyModule)
    remote_param1 = remote_method(MyModule.get_w, remote_module1)
    remote_param2 = remote_method(MyModule.get_w, remote_module2)

    old_w1_remote = remote_param1.to_here()

    dist_optim = DistributedOptimizer(
        optim.SGD, [remote_param1, remote_param2], lr=0.05
    )

    with dist_autograd.context() as context_id:
        torch.manual_seed(0)
        t1 = torch.rand((3, 3), requires_grad=True)
        t2 = torch.rand((3, 3), requires_grad=True)
        output1 = remote_async(MyModule.forward, remote_module1, t2)
        output2 = remote_async(MyModule.forward, remote_module2, output1.wait())
        loss = torch.add(output2.wait(), t1)

        # The distributed backward pass and optimizer step both need the
        # autograd context id.
        dist_autograd.backward(context_id, [loss.sum()])
        dist_optim.step(context_id)

        new_w1 = remote_async(MyModule.get_w, remote_module1).wait()
        new_w2 = remote_async(MyModule.get_w, remote_module2).wait()

        # Make sure the weights have been updated
        print(f'Old weight vs new weight: {old_w1 == new_w1}')
        print(f'Old weight vs new weight: {old_w2 == new_w2}')

        # Make sure the weights on the remote module and the local copy are the same
        w1_consistent = (new_w1 == module1.get_w()).all()
        w2_consistent = (new_w2 == module2.get_w()).all()
        print(f'w1 consist: {w1_consistent}')
        print(f'w2 consist: {w2_consistent}')

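# NOTE: study() assumes an Env holder with the current rank/world size and an
# RPC group of at least three workers that was initialized elsewhere. One way
# the surrounding script might bootstrap that (class name kept; worker count
# and port are illustrative):
import os

import torch.distributed.rpc as rpc
import torch.multiprocessing as mp


class Env:
    rank = 0
    world_size = 3


def _bootstrap(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    Env.rank, Env.world_size = rank, world_size
    rpc.init_rpc("worker%d" % rank, rank=rank, world_size=world_size)
    if rank == 0:
        study()  # workers 1 and 2 only serve RPC requests
    rpc.shutdown()


if __name__ == "__main__":
    mp.spawn(_bootstrap, args=(Env.world_size,), nprocs=Env.world_size)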