# Inner wrapper of an argument-tracing decorator; assumes the enclosing scope
# provides `func` and that the module does:
#   from inspect import currentframe, getargvalues, getfullargspec
def wrapper(*args, **kwargs):
    frame = currentframe()
    v = getargvalues(frame)
    argspec = getfullargspec(func)
    formal_arg_names = argspec.args

    s = "{'op':'%s'," % v.locals["func"].__name__
    for idx, val in enumerate(v.locals["args"]):
        name = formal_arg_names[idx]
        if name == "self" and isinstance(val, torch.Tensor):
            s += ", shape = %s" % str(tuple(val.shape))
        if isinstance(val, torch.Tensor):
            name += "_tensor"
            value = {
                'shape': tuple(val.size()),
                'type': str(val.dtype).split(".")[-1]
            }
            val = value
        # name += "'"
        s += "'%s':%s," % (name, str(val))

    # argspec.defaults is None when the function has no defaulted parameters
    defaults = {}
    if argspec.defaults:
        num_def = len(argspec.defaults)
        defaults = dict(zip(argspec.args[-num_def:], argspec.defaults))
    overrides = {k: str(v) for k, v in v.locals["kwargs"].items()}
    defaults.update(overrides)
    s += "%s}" % str(defaults).strip("{}")

    nvtx.range_push(s)
    result = func(*args, **kwargs)
    nvtx.range_pop()
    return result
def push_nvtx_model_config(config):
    """Helper function to dump the passed-in dict config as an nvtx marker
    with a "model_config" key."""
    nvtx_msg = json.dumps({"model_config": config})
    nvtx.range_push(nvtx_msg)
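# Hedged usage sketch for push_nvtx_model_config above; the config dict and
# the matching range_pop are ours, not part of the original snippet.
config = {"batch_size": 64, "precision": "fp16"}
push_nvtx_model_config(config)
# ... build / run the model while the "model_config" marker is open ...
nvtx.range_pop()  # close the marker pushed by push_nvtx_model_config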
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            nvtx.range_push("Copy to device")
            data, target = data.to(device), target.to(device)
            nvtx.range_pop()  # Copy to device

            nvtx.range_push("Test forward pass")
            output = model(data)
            nvtx.range_pop()  # Test forward pass

            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
def wrapper(input, weight, bias=None, stride=1, padding=0, output_padding=0,
            groups=1, dilation=1):
    input_dict = {
        'shape': tuple(input.size()),
        'type': str(input.dtype).split(".")[-1]
    }
    weight_dict = {
        'shape': tuple(weight.size()),
        'type': str(weight.dtype).split(".")[-1]
    }
    # Interpolate numbers as strings because some can be one-elem tuples as well
    nvtx_str = ("{'op':'conv_transpose%sd', 'input_tensor':%s, 'weight_tensor':%s, "
                "'stride':%s, 'padding':%s, 'output_padding':%s, 'groups':%s, "
                "'dilation':%s}" % (dim_count, str(input_dict), str(weight_dict),
                                    str(stride), str(padding), str(output_padding),
                                    str(groups), str(dilation)))
    nvtx.range_push(nvtx_str)
    # Pass arguments through in conv_transpose*d order; the original snippet
    # dropped output_padding and swapped groups/dilation.
    op = fun(input, weight, bias, stride, padding, output_padding, groups, dilation)
    nvtx.range_pop()
    return op
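# A minimal sketch (our assumption, not from the snippet above) of how such a
# wrapper might be installed: capture the original functional op and the
# `dim_count` closure variable, then monkey-patch torch.nn.functional.
import torch.nn.functional as F

def make_conv_transpose_wrapper(fun, dim_count):
    def wrapper(input, weight, bias=None, stride=1, padding=0,
                output_padding=0, groups=1, dilation=1):
        nvtx.range_push("{'op':'conv_transpose%sd'}" % dim_count)  # abbreviated marker
        out = fun(input, weight, bias, stride, padding, output_padding, groups, dilation)
        nvtx.range_pop()
        return out
    return wrapper

F.conv_transpose2d = make_conv_transpose_wrapper(F.conv_transpose2d, 2)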
def __init__(self):
    nvtx.range_push("Toymodel_layer_stack")
    super(ToyModel, self).__init__()
    self.net1 = torch.nn.Linear(100, 100).to('cuda:0')
    self.relu = torch.nn.ReLU()
    self.net2 = torch.nn.Linear(100, 50).to('cpu')
    nvtx.range_pop()
def range_push(msg: str) -> None:
    r"""Annotates the start of a range for profiling. Requires the
    HABITAT_PROFILING environment variable to be set; otherwise the function
    is a no-op. Pushes a range onto a stack of nested ranges. Every
    range_push should have a corresponding range_pop. Attached profilers can
    capture the time spent in ranges.
    """
    if enable_profiling:
        nvtx.range_push(msg)
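# Hedged sketch of the matching pop for the guard above; assumes the same
# module-level `enable_profiling` flag.
def range_pop() -> None:
    r"""Annotates the end of the most recently pushed range. No-op unless
    HABITAT_PROFILING is set."""
    if enable_profiling:
        nvtx.range_pop()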
def define_graph(self):
    nvtx.range_push("Reading JPEG files into host memory")
    jpegs, labels = self.input()  # read in jpeg files
    nvtx.range_pop()

    nvtx.range_push("Start mixed decoding process")
    # images = self.decode(jpegs)  # Do decoding process
    decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
    images = decode(jpegs)
    nvtx.range_pop()
    return (images, labels)
def new_iter(self, *args, **kwargs):
    # Push trace marker
    nvtx.range_push(traceMarker("Dataloader"))

    # First pass is for creating the dataloader + returning the first data
    cadena = argMarker(mod, "DataLoader", args, kwargs)
    nvtx.range_push(cadena)

    for x in old_iter(self, *args, **kwargs):
        # Pop tracemarker
        nvtx.range_pop()

        # Dataloader stop, Model start
        nvtx.range_pop()

        yield x

        # Push trace marker
        nvtx.range_push(traceMarker("DataLoader"))

        # Model stop, dataloader start
        cadena = argMarker(mod, "DataLoader", args, kwargs)
        nvtx.range_push(cadena)

    # Pop the last iteration before returning
    nvtx.range_pop()
    nvtx.range_pop()
def trainStandardMethod(model):
    model.train(True)
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    one_hot_indices = torch.LongTensor(batch_size) \
                           .random_(0, num_classes) \
                           .view(batch_size, 1)

    for i in range(num_batches):
        nvtx.range_push("Batch" + str(i))

        # generate random inputs and labels
        inputs = torch.randn(batch_size, 3, image_w, image_h)
        labels = torch.zeros(batch_size, num_classes) \
                      .scatter_(1, one_hot_indices, 1)

        nvtx.range_push("Copy to device")
        inputs = inputs.to('cuda:0')
        labels = labels.to('cuda:0')  # labels must be on the same device as the outputs
        nvtx.range_pop()

        # run forward pass
        nvtx.range_push("Forward pass")
        optimizer.zero_grad()
        outputs = model(inputs)
        nvtx.range_pop()

        # run backward pass
        nvtx.range_push("Backward pass")
        loss_fn(outputs, labels).backward()
        torch.cuda.synchronize('cuda:0')
        optimizer.step()
        nvtx.range_pop()

        nvtx.range_pop()  # Batch
def range_push(msg: str) -> None:
    r"""Annotates the start of a range for profiling. Requires the
    HABITAT_PROFILING environment variable to be set; otherwise the function
    is a no-op. Pushes a range onto a stack of nested ranges. Every
    range_push should have a corresponding range_pop. Attached profilers can
    capture the time spent in ranges.
    """
    if not _enable_profiling:
        return

    nvtx.range_push(msg)
    _helper.range_depth += 1
    max_depth = 64
    # In practice, there is little need to go deeper than 5 or 10. By asserting
    # here, we'll catch improper range_push/range_pop usage. Specifically,
    # we'll (eventually) catch an unmatched range_push.
    assert _helper.range_depth < max_depth
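# Hedged sketch of the depth-tracked counterpart to the range_push above;
# assumes the same `_enable_profiling` flag and `_helper` state object.
def range_pop() -> None:
    if not _enable_profiling:
        return
    nvtx.range_pop()
    _helper.range_depth -= 1
    # A negative depth means range_pop was called without a matching range_push.
    assert _helper.range_depth >= 0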
def oneStepTrain(model):
    # NOTE: this shadows the `model` argument; the model must live on the same
    # device as the inputs below, hence the .to('cuda:0').
    model = vgg19().to('cuda:0')
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    one_hot_indices = torch.LongTensor(batch_size) \
                           .random_(0, num_classes) \
                           .view(batch_size, 1)

    optimizer.zero_grad()

    nvtx.range_push("Copy to device")
    inputs = torch.randn(batch_size, 3, image_w, image_h).to('cuda:0')
    nvtx.range_pop()

    outputs = model(inputs)
    labels = torch.zeros(batch_size, num_classes).scatter_(1, one_hot_indices, 1)
    labels = labels.to(outputs.device)

    nvtx.range_push("backward pass")
    loss_fn(outputs, labels).backward()
    nvtx.range_pop()
    optimizer.step()
def wrapper_func(*args, **kwargs):
    # Push trace marker
    nvtx.range_push(traceMarker(fn_name))

    # Push module marker
    if s:
        m = modMarker(mod, fn_name, args)
        nvtx.range_push(m)

    # Create and push argument marker
    cadena = argMarker(mod, fn_name, args, kwargs)
    nvtx.range_push(cadena)

    # Call the original function
    result = func(*args, **kwargs)

    # Pop argument marker
    nvtx.range_pop()

    # Pop module marker
    if s:
        nvtx.range_pop()

    # Pop trace marker
    nvtx.range_pop()

    return result
def forward(self, x):
    nvtx.range_push("net1")
    x1 = self.net1(x)
    nvtx.range_pop()

    nvtx.range_push("relu1")
    x2 = self.relu(x1)
    nvtx.range_pop()

    nvtx.range_push("Copy to cpu")
    x2 = x2.to('cpu')
    nvtx.range_pop()

    nvtx.range_push("net2")
    x3 = self.net2(x2)
    # x = self.relu(self.net1(x))
    nvtx.range_pop()

    # return self.net2(x.to('cpu'))
    return x3
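# Hedged helper (ours, not from the snippet above): a small context manager
# that keeps range_push/range_pop balanced even when the wrapped code raises.
from contextlib import contextmanager

@contextmanager
def nvtx_range(msg):
    nvtx.range_push(msg)
    try:
        yield
    finally:
        nvtx.range_pop()

# usage inside a forward():
#     with nvtx_range("net1"):
#         x1 = self.net1(x)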
def wrapper_func(*args, **kwargs):
    global wrappers_enabled
    traceMarker_str = ""
    input_callid_list = []

    if config.capture_input_ops:
        dlprof.capture_inputs(input_callid_list, *args)

    if wrappers_enabled:
        # Push trace marker
        traceMarker_str = traceMarker(fn_name)
        nvtx.range_push(traceMarker_str)

        # Push module marker
        if s:
            m = modMarker(mod, fn_name, args)
            nvtx.range_push(m)

        # Create and push argument marker
        #
        # Disable wrappers while getting the argMarker in case it
        # ends up executing another wrapped function
        wrappers_enabled = False
        if config.capture_input_ops:
            cadena = argMarker(mod, fn_name, args, kwargs, dlprof.call_id,
                               input_callid_list)
        else:
            cadena = argMarker(mod, fn_name, args, kwargs)
        nvtx.range_push(cadena)
        wrappers_enabled = True

    # Call the original function
    result = func(*args, **kwargs)

    if wrappers_enabled:
        # Pop argument marker
        nvtx.range_pop()

        # Pop module marker
        if s:
            nvtx.range_pop()

        # Pop trace marker
        nvtx.range_pop()

        if config.capture_input_ops:
            dlprof.capture_outputs(dlprof.call_id, result)

            # Store the callid -> op_name mapping
            if traceMarker_str != "":
                traceMarker_str = traceMarker_str.replace("\'", "\"")
                traceMarker_dict = json.loads(traceMarker_str)
                dlprof.call_id_to_op_map[dlprof.call_id] = traceMarker_dict['funcStack']
            dlprof.call_id = dlprof.call_id + 1

    return result
def run_step(self):
    """
    Implement the standard training logic described above.
    """
    assert self.model.training, "[SimpleTrainer] model was changed to eval mode!"
    start = time.perf_counter()
    """
    If you want to do something with the data, you can wrap the dataloader.
    """
    nvtx.range_push("Data loading")
    # data = next(self._data_loader_iter)
    dali_data = next(self._dali_data_loader_iter)
    d_data = []
    for i in range(0, 12):
        img = dali_data[0]['image'][i]
        seg = dali_data[0]['sem_seg'][i][0].cpu().long()
        d_data.append({
            'file_name': "",
            'height': 1024,
            'width': 2048,
            'image': img,
            'sem_seg': seg
        })
    nvtx.range_pop()
    data_time = time.perf_counter() - start
    """
    If you want to do something with the losses, you can wrap the model.
    """
    nvtx.range_push("Forward pass")
    loss_dict = self.model(d_data)
    losses = sum(loss_dict.values())
    nvtx.range_pop()
    """
    If you need to accumulate gradients or do something similar, you can
    wrap the optimizer with your custom `zero_grad()` method.
    """
    nvtx.range_push("Backward pass")
    self.optimizer.zero_grad()
    losses.backward()
    nvtx.range_pop()

    # self._write_metrics(loss_dict, data_time)

    """
    If you need gradient clipping/scaling or other processing, you can
    wrap the optimizer with your custom `step()` method. But it is
    suboptimal as explained in https://arxiv.org/abs/2006.15704 Sec 3.2.4
    """
    self.optimizer.step()
def train(self, start_iter: int, max_iter: int):
    """
    Args:
        start_iter, max_iter (int): See docs above
    """
    logger = logging.getLogger(__name__)
    logger.info("Starting training from iteration {}".format(start_iter))

    self.iter = self.start_iter = start_iter
    self.max_iter = max_iter

    with EventStorage(start_iter) as self.storage:
        try:
            self.before_train()
            for self.iter in range(start_iter, max_iter):
                nvtx.range_push("Batch " + str(self.iter))

                nvtx.range_push("Before step")
                self.before_step()
                nvtx.range_pop()

                nvtx.range_push("Run step")
                self.run_step()
                nvtx.range_pop()

                nvtx.range_push("After step")
                self.after_step()
                nvtx.range_pop()

                nvtx.range_pop()  # Batch
            # self.iter == max_iter can be used by `after_train` to
            # tell whether the training successfully finished or failed
            # due to exceptions.
            self.iter += 1
        except Exception:
            logger.exception("Exception during training:")
            raise
        finally:
            self.after_train()
def forward(self, numerical_input, categorical_inputs):
    """
    Args:
        numerical_input (Tensor): with shape [batch_size, num_numerical_features]
        categorical_inputs (Tensor): with shape [batch_size, num_categorical_features]
    """
    batch_size = numerical_input.size()[0]

    # Put indices on the same device as the corresponding embedding
    device_indices = []
    for embedding_id, _ in enumerate(self.embeddings):
        device_indices.append(
            categorical_inputs[:, embedding_id].to(self._embedding_device_map[embedding_id]))

    nvtx.range_push("layer:Bottom_MLP")
    bottom_mlp_output = self.bottom_mlp(numerical_input)
    nvtx.range_pop()

    # embedding_outputs will be a list of (26 in the case of Criteo) fetched
    # embeddings with shape [batch_size, embedding_size]
    embedding_outputs = []
    for embedding_id, embedding in enumerate(self.embeddings):
        if self._hash_indices:
            device_indices[embedding_id] = device_indices[embedding_id] % embedding.num_embeddings
        nvtx.range_push("layer:Embedding_{}".format(embedding_id))
        embedding_outputs.append(embedding(device_indices[embedding_id]).to(self._base_device))
        nvtx.range_pop()

    nvtx.range_push("layer:Interaction")
    interaction_output = self._interaction(bottom_mlp_output, embedding_outputs, batch_size)
    nvtx.range_pop()

    nvtx.range_push("layer:Top_MLP")
    top_mlp_output = self.top_mlp(interaction_output)
    nvtx.range_pop()

    return top_mlp_output
def worker(gpu, ngpus_per_node, args):
    env_device, train_device = args_initialize(gpu, ngpus_per_node, args)
    train_csv_file, train_csv_writer, eval_csv_file, eval_csv_writer, summary_writer = \
        log_initialize(args, train_device)
    train_env, test_env, observation = env_initialize(args, env_device)

    model = ActorCritic(args.num_stack, train_env.action_space,
                        normalize=args.normalize, name=args.env_name)
    model, optimizer = model_initialize(args, model, train_device)

    shape = (args.num_steps + 1, args.num_ales, args.num_stack,
             *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[0, :, -1] = observation.to(device=train_device, dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    logits = torch.zeros((args.num_steps + 1, args.num_ales, train_env.action_space.n),
                         device=train_device, dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    if args.use_gae:
        gae = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    num_frames_per_iter = args.num_ales * args.num_steps
    args.num_minibatches = num_frames_per_iter / args.batch_size
    total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))

    decay = 1.0 / total_steps
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.ppo_epoch, gamma=1.0 - decay)

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
    total_time = 0
    evaluation_offset = 0

    train_stream = torch.cuda.Stream()
    torch.cuda.synchronize()

    for update in iterator:
        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            evaluation_offset += args.evaluation_interval
            eval_lengths, eval_rewards = test(args, model, test_env)

            lmean, lmedian, lmin, lmax, lstd = gen_data(eval_lengths)
            rmean, rmedian, rmin, rmax, rstd = gen_data(eval_rewards)
            length_data = '(length) min/max/mean/median: {lmin:4.1f}/{lmax:4.1f}/{lmean:4.1f}/{lmedian:4.1f}' \
                .format(lmin=lmin, lmax=lmax, lmean=lmean, lmedian=lmedian)
            reward_data = '(reward) min/max/mean/median: {rmin:4.1f}/{rmax:4.1f}/{rmean:4.1f}/{rmedian:4.1f}' \
                .format(rmin=rmin, rmax=rmax, rmean=rmean, rmedian=rmedian)
            print('[training time: {}] {}'.format(format_time(total_time),
                                                  ' --- '.join([length_data, reward_data])))

            if eval_csv_writer and eval_csv_file:
                eval_csv_writer.writerow([T, total_time, rmean, rmedian, rmin, rmax, rstd,
                                          lmean, lmedian, lmin, lmax, lstd])
                eval_csv_file.flush()

            if args.plot:
                summary_writer.add_scalar('eval/rewards_mean', rmean, T, walltime=total_time)
                summary_writer.add_scalar('eval/lengths_mean', lmean, T, walltime=total_time)

        start_time = time.time()

        with torch.no_grad():
            for step in range(args.num_steps):
                nvtx.range_push('train:step')
                value, logit = model(states[step])

                # store values and logits
                values[step], logits[step] = value.squeeze(-1), logit.squeeze(-1)

                # convert actions to numpy and perform next step
                probs = torch.clamp(F.softmax(logit, dim=1), min=0.00001, max=0.99999)
                probs_action = probs.multinomial(1).to(env_device)
                observation, reward, done, info = train_env.step(maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device, dtype=torch.bool)
                probs_action = probs_action.to(device=train_device, dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step].copy_(probs_action.view(-1))
                masks[step].copy_(not_done)
                rewards[step].copy_(reward.sign())

                # update next observations
                states[step + 1, :, :-1].copy_(states[step, :, 1:])
                states[step + 1] *= not_done.view(-1, *[1] * (observation.dim() - 1))
                states[step + 1, :, -1].copy_(observation.view(-1, *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done
                nvtx.range_pop()

        returns[-1] = values[-1] = model(states[-1])[0].data.squeeze(-1)

        if args.use_gae:
            gae.zero_()
            for step in reversed(range(args.num_steps)):
                delta = rewards[step] + (args.gamma * values[step + 1] * masks[step]) - values[step]
                gae = delta + (args.gamma * args.tau * masks[step] * gae)
                returns[step] = gae + values[step]
        else:
            for step in reversed(range(args.num_steps)):
                returns[step] = rewards[step] + (args.gamma * returns[step + 1] * masks[step])

        log_probs = F.log_softmax(logits[:-1].view(-1, train_env.action_space.n), dim=1)
        action_log_probs = log_probs.gather(1, actions.view(-1).unsqueeze(-1))

        advantages = returns[:-1].view(-1).unsqueeze(-1) - values[:-1].view(-1).unsqueeze(-1)
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         float(np.finfo(np.float32).eps))

        total_value_loss = 0.0
        total_policy_loss = 0.0
        total_dist_entropy = 0.0

        nvtx.range_push('train:loader')
        states_view = states[:-1].view(-1, *states.size()[-3:])
        actions_view = actions.view(-1)
        returns_view = returns[:-1].view(-1)
        train_dataset = torch.utils.data.TensorDataset(states_view, actions_view,
                                                       action_log_probs, returns_view,
                                                       advantages)

        train_sampler = None
        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                                   shuffle=(train_sampler is None),
                                                   num_workers=0, pin_memory=False,
                                                   sampler=train_sampler)
        nvtx.range_pop()

        with torch.cuda.stream(train_stream):
            for epoch in range(args.ppo_epoch):
                nvtx.range_push('train:epoch_step')
                if args.distributed:
                    train_sampler.set_epoch(epoch)

                prefetcher = data_prefetcher(train_loader)
                local_states, local_actions, local_action_log_probs, local_returns, local_advantages = prefetcher.next()

                while local_states is not None:
                    batch_values, batch_logits = model(local_states)
                    batch_log_probs = F.log_softmax(batch_logits, dim=1)
                    batch_action_log_probs = batch_log_probs.gather(1, local_actions.unsqueeze(-1))

                    batch_probs = F.softmax(batch_logits, dim=1)
                    batch_dist_entropy = -(batch_log_probs * batch_probs).sum(-1).mean()

                    ratio = torch.exp(batch_action_log_probs - local_action_log_probs)
                    surrogate1 = ratio * local_advantages
                    surrogate2 = torch.clamp(ratio, 1.0 - args.clip_epsilon,
                                             1.0 + args.clip_epsilon) * local_advantages
                    batch_policy_loss = -torch.min(surrogate1, surrogate2).mean()
                    batch_value_loss = F.mse_loss(local_returns.unsqueeze(-1), batch_values) / 2.0

                    loss = batch_value_loss * args.value_loss_coef + batch_policy_loss \
                        - batch_dist_entropy * args.entropy_coef
                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()

                    total_value_loss += batch_value_loss.item()
                    total_policy_loss += batch_policy_loss.item()
                    total_dist_entropy += batch_dist_entropy.item()

                    local_states, local_actions, local_action_log_probs, local_returns, local_advantages = prefetcher.next()

                scheduler.step()
                nvtx.range_pop()

        torch.cuda.synchronize()

        states[0].copy_(states[-1])

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            value_loss = total_value_loss / (args.ppo_epoch * args.num_minibatches)
            policy_loss = total_policy_loss / (args.ppo_epoch * args.num_minibatches)
            dist_entropy = total_dist_entropy / (args.ppo_epoch * args.num_minibatches)

            # NOTE: the original fragment logged to `writer` here, but this worker
            # only receives `summary_writer` from log_initialize.
            if args.plot:
                summary_writer.add_scalar('train/rewards_mean', final_rewards.mean().item(), T, walltime=total_time)
                summary_writer.add_scalar('train/lengths_mean', final_lengths.mean().item(), T, walltime=total_time)
                summary_writer.add_scalar('train/learning_rate', scheduler.get_lr()[0], T, walltime=total_time)
                summary_writer.add_scalar('train/value_loss', value_loss, T, walltime=total_time)
                summary_writer.add_scalar('train/policy_loss', policy_loss, T, walltime=total_time)
                summary_writer.add_scalar('train/entropy', dist_entropy, T, walltime=total_time)

            progress_data = callback(args, model, T, iter_time, final_rewards, final_lengths,
                                     value_loss, policy_loss, dist_entropy,
                                     train_csv_writer, train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        summary_writer.close()

    if args.use_openai:
        train_env.close()
    if args.use_openai_test_env:
        test_env.close()
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    nvtx.range_push("Data loading")
    for batch_idx, (data, target) in enumerate(train_loader):
        nvtx.range_pop()  # Data loading

        nvtx.range_push("Batch " + str(batch_idx))

        nvtx.range_push("Copy to device")
        data, target = data.to(device), target.to(device)
        nvtx.range_pop()  # Copy to device

        nvtx.range_push("Forward pass")
        optimizer.zero_grad()
        # Enables autocasting for the forward pass
        with torch.cuda.amp.autocast(enabled=True):
            output = model(data)
            loss = F.nll_loss(output, target)
        nvtx.range_pop()  # Forward pass

        nvtx.range_push("Backward pass")
        loss.backward()
        optimizer.step()
        nvtx.range_pop()  # Backward pass

        nvtx.range_pop()  # Batch

        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break

        nvtx.range_push("Data loading")
    nvtx.range_pop()  # Data loading
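# Hedged note: torch.cuda.amp.autocast is usually paired with a GradScaler so
# that fp16 gradients do not underflow; a minimal sketch of the scaled variant
# of the forward/backward above (the scaler is ours, not in the snippet).
scaler = torch.cuda.amp.GradScaler()

with torch.cuda.amp.autocast(enabled=True):
    output = model(data)
    loss = F.nll_loss(output, target)
scaler.scale(loss).backward()  # backward on the scaled loss
scaler.step(optimizer)         # unscales gradients, then calls optimizer.step()
scaler.update()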
def main():
    # c10d_frontend = torch.classes.dist_c10d.frontend()
    dist.init_process_group(backend='nccl')
    d_pg = _get_default_group()
    # print(c10d_frontend.get_name_of_process_group(d_pg))
    pg2 = dist.new_group([0, 1], backend='nccl')
    # print(c10d_frontend.get_name_of_process_group(pg2))

    if dist.get_rank() == 0:
        print(dir(d_pg))
        print(type(d_pg))
        print(type(pg2))
        print(dir(dist))
        print(_pg_names)

    local_size = torch.cuda.device_count()
    rank = dist.get_rank()
    torch.cuda.set_device(rank % local_size)
    torch.cuda.synchronize()
    comm_stream = torch.cuda.Stream(rank % local_size)
    device_id = rank % local_size
    # print(f'rank {rank}')

    warm_up = 5
    repeat = 10
    partition_sizes = [
        2457600, 960, 819200, 320, 320, 320, 3276800, 1280, 3276800, 320, 320, 320
    ]

    local_params = []
    for psize in partition_sizes:
        r = torch.rand(psize, dtype=torch.half, device=f'cuda:{device_id}').view(-1)
        local_params.append(r)
        print(f'rank {rank}, psize {psize}, sum {torch.sum(r).item()}')

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    ts = []
    for i in range(repeat + warm_up):
        with torch.cuda.stream(comm_stream):
            nvtx.range_push(f'exp-{i}')
            t1 = time.time()
            # start_event.record(stream=comm_stream)
            benchmark_all_gather(partition_sizes, local_params, comm_stream)
            # end_event.record(stream=comm_stream)
            # end_event.synchronize()
            t2 = time.time()
            nvtx.range_pop()
        if i >= warm_up:
            # ts.append(start_event.elapsed_time(end_event))
            ts.append((t2 - t1) * 1e3)

    if dist.get_rank() == 0:
        avg_t = np.mean(ts)
        bw = (dist.get_world_size() - 1) * np.sum(partition_sizes) * 2 / 1e9 / (avg_t / 1e3)
        print(f'avg time {avg_t} ms, bw {bw} GB/s')
def forward(self, x):
    identity = x

    nvtx.range_push("layer:{}".format(chr(self.id + 97)))  # to print a, b, c, ...

    nvtx.range_push("layer:Conv1")
    out = self.conv1(x)
    nvtx.range_pop()

    nvtx.range_push("layer:BN1")
    out = self.bn1(out)
    nvtx.range_pop()

    nvtx.range_push("layer:ReLU1")
    out = self.relu(out)
    nvtx.range_pop()

    nvtx.range_push("layer:Conv2")
    out = self.conv2(out)
    nvtx.range_pop()

    nvtx.range_push("layer:BN2")
    out = self.bn2(out)
    nvtx.range_pop()

    nvtx.range_push("layer:ReLU2")
    out = self.relu(out)
    nvtx.range_pop()

    nvtx.range_push("layer:Conv3")
    out = self.conv3(out)
    nvtx.range_pop()

    nvtx.range_push("layer:BN3")
    out = self.bn3(out)
    nvtx.range_pop()

    nvtx.range_push("layer:Residual")
    if self.downsample is not None:
        nvtx.range_push("layer:Projection")
        identity = self.downsample(x)
        nvtx.range_pop()
    out += identity
    nvtx.range_pop()

    nvtx.range_push("layer:ReLU3")
    out = self.relu(out)
    nvtx.range_pop()

    nvtx.range_pop()  # outer per-block range
    return out
if args.pipeline:
    train_set = SoftwarePipeline(train_set)

gpu_id = dist.get_rank() % torch.cuda.device_count()
torch.cuda.set_device(gpu_id)

model = models.__dict__["resnet50"]()
model.cuda(torch.cuda.current_device())
model = DDP(model, [gpu_id])

criterion = nn.CrossEntropyLoss().cuda()
optimizer = optim.SGD(model.parameters(), lr=init_learning_rate,
                      momentum=0.875, weight_decay=3.0517578125e-05)

# NOTE: fragment; train_sampler and adjust_learning_rate are defined elsewhere
# in the original script, and the epoch range is popped later in the loop body.
for epoch in range(10):
    nvtx.range_push('epoch')

    nvtx.range_push('set_train')
    model.train()
    nvtx.range_pop()  # set train

    nvtx.range_push('set_epoch')
    train_sampler.set_epoch(epoch)
    nvtx.range_pop()  # set epoch

    nvtx.range_push('adjust_lr')
    adjust_learning_rate(optimizer, epoch, init_learning_rate)
    nvtx.range_pop()  # adjust lr

    time0 = pc()
def benchmark_all_gather(partition_sizes, local_params, comm_stream):
    dtype = torch.half
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    device_id = rank % torch.cuda.device_count()

    t1 = time.time()
    with torch.cuda.stream(comm_stream):
        nvtx.range_push('allocate final params')
        # allocate memories
        allgather_params = []
        for psize in partition_sizes:
            tensor_size = psize * world_size
            tensor = torch.empty(tensor_size, dtype=dtype,
                                 device=f'cuda:{device_id}').view(-1)
            allgather_params.append(tensor)
        nvtx.range_pop()
    comm_stream.synchronize()

    t2 = time.time()
    # print_at_rank0(f'allocate cost {t2 - t1} s')

    with torch.cuda.stream(comm_stream):
        nvtx.range_push('construct all output list')
        # create allgather parameters
        all_gather_list_list = []
        for pidx, psize in enumerate(partition_sizes):
            flat_tensor = allgather_params[pidx]
            partitions = []
            for i in range(world_size):
                partitions.append(flat_tensor.narrow(0, psize * i, psize))
            all_gather_list_list.append(partitions)
        nvtx.range_pop()
    comm_stream.synchronize()
    print_at_rank0(f'construct params cost {time.time() - t2} s')

    with torch.cuda.stream(comm_stream):
        backend = get_backend()
        nvtx.range_push('launch dist all-gather')
        with _batch_p2p_manager(backend):
            handles = []
            for pidx, psize in enumerate(partition_sizes):
                h = all_gather(all_gather_list_list[pidx],
                               all_gather_list_list[pidx][rank],
                               async_op=True)
                # h = dist.all_gather(all_gather_list_list[pidx],
                #                     all_gather_list_list[pidx][rank],
                #                     async_op=True)
                handles.append(h)

        # Alternative (kept from the original, commented out): gather from
        # local_params instead of the in-place partitions.
        # handles = []
        # for pidx, psize in enumerate(partition_sizes):
        #     h = dist.all_gather(all_gather_list_list[pidx],
        #                         local_params[pidx],
        #                         async_op=True)
        #     handles.append(h)

        # torch.cuda.synchronize()
        # handles[-1].wait()  # event enqueued, but not guaranteed complete
        nvtx.range_pop()

    torch.cuda.synchronize()
    # NOTE: end_event is never recorded, so the wait below is effectively a no-op.
    end_event = torch.cuda.Event()
    comm_stream.wait_event(end_event)
    return None
def learn(self, states, actions, returns, next_states, nonterminals, weights):
    tactions = actions.unsqueeze(-1).unsqueeze(-1)
    if self.categorical:
        tactions = tactions.expand(-1, -1, self.atoms)

    # Calculate current state probabilities (online network noise already sampled)
    nvtx.range_push('agent:online (state) probs')
    ps = self.online_net(states, log=True)  # Log probabilities log p(s_t, ·; θonline)
    ps_a = ps.gather(1, tactions)  # log p(s_t, a_t; θonline)
    nvtx.range_pop()

    with torch.no_grad():
        if isinstance(self.target_net, DQN):
            self.target_net.reset_noise()
        else:
            self.target_net.module.reset_noise()  # Sample new target net noise

        nvtx.range_push('agent:target (next state) probs')
        tns = self.target_net(next_states)  # Probabilities p(s_t+n, ·; θtarget)
        nvtx.range_pop()

        if self.double_q:
            # Calculate nth next state probabilities
            nvtx.range_push('agent:online (next state) probs')
            pns = self.online_net(next_states)  # Probabilities p(s_t+n, ·; θonline)
            nvtx.range_pop()
        else:
            pns = tns

        if self.categorical:
            pns = self.support.expand_as(pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))

        # Perform argmax action selection using online network:
        # argmax_a[(z, p(s_t+n, a; θonline))]
        argmax_indices_ns = pns.sum(-1).argmax(-1).unsqueeze(-1).unsqueeze(-1)
        if self.categorical:
            argmax_indices_ns = argmax_indices_ns.expand(-1, -1, self.atoms)
        pns_a = tns.gather(1, argmax_indices_ns)  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

        if self.categorical:
            # Compute Tz (Bellman operator T applied to z)
            # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = returns.unsqueeze(-1) + nonterminals.float().unsqueeze(-1) * \
                (self.discount ** self.n) * self.support.unsqueeze(0)
            Tz = Tz.clamp(min=self.v_min, max=self.v_max)  # Clamp between supported values

            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.v_min) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            batch_size = states.size(0)
            m = states.new_zeros(batch_size, self.atoms)
            offset = torch.linspace(0, ((batch_size - 1) * self.atoms), batch_size) \
                .unsqueeze(1).expand(batch_size, self.atoms).to(actions)
            m.view(-1).index_add_(0, (l + offset).view(-1),
                                  (pns_a.squeeze(1) * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(0, (u + offset).view(-1),
                                  (pns_a.squeeze(1) * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)
        else:
            Tz = returns + nonterminals.float() * (self.discount ** self.n) * \
                pns_a.squeeze(-1).squeeze(-1)

    if self.categorical:
        loss = -torch.sum(m * ps_a.squeeze(1), 1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        weights = weights.unsqueeze(-1)
    else:
        loss = F.mse_loss(ps_a.squeeze(-1).squeeze(-1), Tz, reduction='none')

    nvtx.range_push('agent:loss + step')
    self.optimizer.zero_grad()
    weighted_loss = (weights * loss).mean()
    with amp.scale_loss(weighted_loss, self.optimizer) as scaled_loss:
        scaled_loss.backward()
    torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.max_grad_norm)
    self.optimizer.step()
    nvtx.range_pop()

    return loss.detach()
def wrapper_func(*args, **kwargs):
    global wrappers_enabled
    traceMarker_str = ""
    input_callid_list = []

    if wrappers_enabled:
        if config.capture_input_ops:
            # Stack for callids to work with nested monkey patch function calls
            dlprof.patch_list.append(dlprof.call_id)
            dlprof.capture_inputs(dlprof.call_id, input_callid_list, *args)

        # Push trace marker
        traceMarker_str = traceMarker(fn_name)
        nvtx.range_push(traceMarker_str)

        # Push module marker
        if s:
            m = modMarker(mod, fn_name, args)
            nvtx.range_push(m)

        # Create and push argument marker
        #
        # Disable wrappers while getting the argMarker in case it
        # ends up executing another wrapped function
        wrappers_enabled = False
        if config.capture_input_ops:
            saved_call_id = dlprof.call_id
            # Keeps call_id correct when there are nested monkey patch functions
            if dlprof.call_id != dlprof.patch_list[0]:
                saved_call_id = dlprof.patch_list[0]
            cadena = argMarker(mod, fn_name, args, kwargs, saved_call_id, input_callid_list)
        else:
            cadena = argMarker(mod, fn_name, args, kwargs)
        nvtx.range_push(cadena)
        wrappers_enabled = True

    # Call the original function
    result = func(*args, **kwargs)

    if wrappers_enabled:
        # Pop argument marker
        nvtx.range_pop()

        # Pop module marker
        if s:
            nvtx.range_pop()

        # Pop trace marker
        nvtx.range_pop()

        if config.capture_input_ops:
            # Keeps call_id correct when there are nested monkey patch functions
            saved_call_id = dlprof.call_id
            if dlprof.call_id != dlprof.patch_list[0]:
                saved_call_id = dlprof.patch_list[0]
            dlprof.capture_outputs(saved_call_id, result)

            # Store the callid -> op_name mapping
            if traceMarker_str != "":
                traceMarker_str = traceMarker_str.replace("\'", "\"")
                traceMarker_dict = json.loads(traceMarker_str)
                dlprof.call_id_to_op_map[saved_call_id] = traceMarker_dict['funcStack']

            starting_call_id = dlprof.patch_list[0]
            last_call_id = dlprof.patch_list.pop()
            dlprof.call_id = dlprof.call_id + 1

    return result
def worker(gpu, ngpus_per_node, callback, args):
    args.gpu = gpu

    if args.distributed:
        args.seed += args.gpu
        torch.cuda.set_device(args.gpu)

        args.rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
        if args.multiprocessing_distributed:
            args.rank = args.rank * ngpus_per_node + args.gpu

        torch.distributed.init_process_group(backend='nccl',
                                             init_method='tcp://127.0.0.1:8632',
                                             world_size=args.world_size,
                                             rank=args.rank)
    else:
        args.rank = 0

    if (args.num_ales % args.num_minibatches) != 0:
        raise ValueError('Number of ales({}) size is not even divisible by the minibatch size({})'
                         .format(args.num_ales, args.num_minibatches))

    if args.num_steps_per_update == -1:
        args.num_steps_per_update = args.num_steps

    minibatch_size = int(args.num_ales / args.num_minibatches)
    step0 = args.num_steps - args.num_steps_per_update
    n_minibatch = -1

    args.use_cuda_env = args.use_cuda_env and torch.cuda.is_available()
    args.no_cuda_train = (not args.no_cuda_train) and torch.cuda.is_available()
    args.verbose = args.verbose and (args.rank == 0)

    env_device = torch.device('cuda', args.gpu) if args.use_cuda_env else torch.device('cpu')
    train_device = torch.device('cuda', args.gpu) if (args.no_cuda_train == False) \
        else torch.device('cpu')

    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if args.use_cuda_env or (args.no_cuda_train == False):
        torch.cuda.manual_seed(np.random.randint(1, 10000))

    if args.rank == 0:
        if args.output_filename:
            train_csv_file = open(args.output_filename, 'w', newline='')
            train_csv_file.write(json.dumps(vars(args)))
            train_csv_file.write('\n')
            train_csv_writer = csv.writer(train_csv_file, delimiter=',')
            train_csv_writer.writerow(['frames', 'fps', 'total_time',
                                       'rmean', 'rmedian', 'rmin', 'rmax',
                                       'lmean', 'lmedian', 'lmin', 'lmax',
                                       'entropy', 'value_loss', 'policy_loss'])

            eval_output_filename = '.'.join([''.join(args.output_filename.split('.')[:-1] + ['_test']), 'csv'])
            eval_csv_file = open(eval_output_filename, 'w', newline='')
            eval_csv_file.write(json.dumps(vars(args)))
            eval_csv_file.write('\n')
            eval_csv_writer = csv.writer(eval_csv_file, delimiter=',')
            eval_csv_writer.writerow(['frames', 'total_time',
                                      'rmean', 'rmedian', 'rmin', 'rmax', 'rstd',
                                      'lmean', 'lmedian', 'lmin', 'lmax', 'lstd'])
        else:
            train_csv_file, train_csv_writer = None, None
            eval_csv_file, eval_csv_writer = None, None

        if args.plot:
            from tensorboardX import SummaryWriter
            current_time = datetime.now().strftime('%b%d_%H-%M-%S')
            log_dir = os.path.join(args.log_dir, current_time + '_' + socket.gethostname())
            writer = SummaryWriter(log_dir=log_dir)
            for k, v in vars(args).items():
                writer.add_text(k, str(v))

        print()
        print('PyTorch : {}'.format(torch.__version__))
        print('CUDA    : {}'.format(torch.version.cuda))
        print('CUDNN   : {}'.format(torch.backends.cudnn.version()))
        print('APEX    : {}'.format('.'.join([str(i) for i in apex.amp.__version__.VERSION])))
        print()

    if train_device.type == 'cuda':
        print(cuda_device_str(train_device.index), flush=True)

    if args.use_openai:
        train_env = create_vectorize_atari_env(args.env_name, args.seed, args.num_ales,
                                               episode_life=args.episodic_life,
                                               clip_rewards=False,
                                               max_frames=args.max_episode_length)
        observation = torch.from_numpy(train_env.reset()).squeeze(1)
    else:
        train_env = AtariEnv(args.env_name, args.num_ales, color_mode='gray',
                             repeat_prob=0.0, device=env_device, rescale=True,
                             episodic_life=args.episodic_life, clip_rewards=False,
                             frameskip=4)
        train_env.train()
        observation = train_env.reset(initial_steps=args.ale_start_steps,
                                      verbose=args.verbose).squeeze(-1)

    if args.use_openai_test_env:
        test_env = create_vectorize_atari_env(args.env_name, args.seed,
                                              args.evaluation_episodes,
                                              episode_life=False, clip_rewards=False)
        test_env.reset()
    else:
        test_env = AtariEnv(args.env_name, args.evaluation_episodes, color_mode='gray',
                            repeat_prob=0.0, device='cpu', rescale=True,
                            episodic_life=False, clip_rewards=False, frameskip=4)

    model = ActorCritic(args.num_stack, train_env.action_space,
                        normalize=args.normalize, name=args.env_name)
    model = model.to(train_device).train()

    if args.rank == 0:
        print(model)
        args.model_name = model.name

    if args.use_adam:
        optimizer = optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
    else:
        optimizer = optim.RMSprop(model.parameters(), lr=args.lr, eps=args.eps,
                                  alpha=args.alpha)

    # This is the number of frames GENERATED between two updates
    num_frames_per_iter = args.num_ales * args.num_steps_per_update
    total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))

    model, optimizer = amp.initialize(model, optimizer,
                                      opt_level=args.opt_level,
                                      loss_scale=args.loss_scale)

    if args.distributed:
        model = DDP(model, delay_allreduce=True)

    shape = (args.num_steps + 1, args.num_ales, args.num_stack,
             *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[step0, :, -1] = observation.to(device=train_device, dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    logits = torch.zeros((args.num_steps + 1, args.num_ales, train_env.action_space.n),
                         device=train_device, dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)
    mus = torch.ones(shape, device=train_device, dtype=torch.float32)
    # pis = torch.zeros(shape, device=train_device, dtype=torch.float32)
    rhos = torch.zeros((args.num_steps, minibatch_size), device=train_device,
                       dtype=torch.float32)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    if args.use_gae:
        raise ValueError('GAE is not compatible with VTRACE')

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    torch.cuda.synchronize()

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
    total_time = 0
    evaluation_offset = 0

    for update in iterator:
        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            evaluation_offset += args.evaluation_interval
            eval_lengths, eval_rewards = evaluate(args, T, total_time, model, test_env,
                                                  eval_csv_writer, eval_csv_file)

            if args.plot:
                writer.add_scalar('eval/rewards_mean', eval_rewards.mean().item(), T,
                                  walltime=total_time)
                writer.add_scalar('eval/lengths_mean', eval_lengths.mean().item(), T,
                                  walltime=total_time)

        start_time = time.time()

        with torch.no_grad():
            for step in range(args.num_steps_per_update):
                nvtx.range_push('train:step')
                value, logit = model(states[step0 + step])

                # store values and logits
                values[step0 + step] = value.squeeze(-1)

                # convert actions to numpy and perform next step
                probs = torch.clamp(F.softmax(logit, dim=1), min=0.00001, max=0.99999)
                probs_action = probs.multinomial(1).to(env_device)
                # Check if the multinomial threw an exception
                # https://github.com/pytorch/pytorch/issues/7014
                torch.cuda.current_stream().synchronize()
                observation, reward, done, info = train_env.step(maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device)
                probs_action = probs_action.to(device=train_device, dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step0 + step].copy_(probs_action.view(-1))
                masks[step0 + step].copy_(not_done)
                rewards[step0 + step].copy_(reward.sign())

                # mus[step0 + step] = F.softmax(logit, dim=1).gather(1, actions[step0 + step].view(-1).unsqueeze(-1)).view(-1)
                mus[step0 + step] = torch.clamp(
                    F.softmax(logit, dim=1).gather(
                        1, actions[step0 + step].view(-1).unsqueeze(-1)).view(-1),
                    min=0.00001, max=0.99999)

                # update next observations
                states[step0 + step + 1, :, :-1].copy_(states[step0 + step, :, 1:])
                states[step0 + step + 1] *= not_done.view(-1, *[1] * (observation.dim() - 1))
                states[step0 + step + 1, :, -1].copy_(observation.view(-1, *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done
                nvtx.range_pop()

        n_minibatch = (n_minibatch + 1) % args.num_minibatches
        min_ale_index = int(n_minibatch * minibatch_size)
        max_ale_index = min_ale_index + minibatch_size

        # compute v-trace using the recursive method (remark 1 in IMPALA paper)
        # value_next_step, logit = model(states[-1:, min_ale_index:max_ale_index, :, :, :].contiguous().view(-1, *states.size()[-3:]))
        # returns[-1, min_ale_index:max_ale_index] = value_next_step.squeeze()
        # for step in reversed(range(args.num_steps)):
        #     value, logit = model(states[step, min_ale_index:max_ale_index, :, :, :].contiguous().view(-1, *states.size()[-3:]))
        #     pis = F.softmax(logit, dim=1).gather(1, actions[step, min_ale_index:max_ale_index].view(-1).unsqueeze(-1)).view(-1)
        #     c = torch.clamp(pis / mus[step, min_ale_index:max_ale_index], max=c_)
        #     rhos[step, :] = torch.clamp(pis / mus[step, min_ale_index:max_ale_index], max=rho_)
        #     delta_value = rhos[step, :] * (rewards[step, min_ale_index:max_ale_index] + (args.gamma * value_next_step - value).squeeze())
        #     returns[step, min_ale_index:max_ale_index] = value.squeeze() + delta_value + args.gamma * c * \
        #         (returns[step + 1, min_ale_index:max_ale_index] - value_next_step.squeeze())
        #     value_next_step = value

        nvtx.range_push('train:compute_values')
        value, logit = model(states[:, min_ale_index:max_ale_index, :, :, :]
                             .contiguous().view(-1, *states.size()[-3:]))
        batch_value = value.detach().view((args.num_steps + 1, minibatch_size))
        batch_probs = F.softmax(logit.detach()[:(args.num_steps * minibatch_size), :], dim=1)
        batch_pis = batch_probs.gather(
            1, actions[:, min_ale_index:max_ale_index].contiguous().view(-1).unsqueeze(-1)
        ).view((args.num_steps, minibatch_size))
        returns[-1, min_ale_index:max_ale_index] = batch_value[-1]

        with torch.no_grad():
            for step in reversed(range(args.num_steps)):
                c = torch.clamp(batch_pis[step, :] / mus[step, min_ale_index:max_ale_index],
                                max=args.c_hat)
                rhos[step, :] = torch.clamp(batch_pis[step, :] / mus[step, min_ale_index:max_ale_index],
                                            max=args.rho_hat)
                delta_value = rhos[step, :] * (rewards[step, min_ale_index:max_ale_index] +
                                               (args.gamma * batch_value[step + 1] -
                                                batch_value[step]).squeeze())
                returns[step, min_ale_index:max_ale_index] = \
                    batch_value[step, :].squeeze() + delta_value + args.gamma * c * \
                    (returns[step + 1, min_ale_index:max_ale_index] - batch_value[step + 1, :].squeeze())

        value = value[:args.num_steps * minibatch_size, :]
        logit = logit[:args.num_steps * minibatch_size, :]

        log_probs = F.log_softmax(logit, dim=1)
        probs = F.softmax(logit, dim=1)

        action_log_probs = log_probs.gather(
            1, actions[:, min_ale_index:max_ale_index].contiguous().view(-1).unsqueeze(-1))
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        advantages = returns[:-1, min_ale_index:max_ale_index].contiguous() \
            .view(-1).unsqueeze(-1) - value

        value_loss = advantages.pow(2).mean()
        policy_loss = -(action_log_probs * rhos.view(-1, 1).detach() *
                        (rewards[:, min_ale_index:max_ale_index].contiguous().view(-1, 1) +
                         args.gamma *
                         returns[1:, min_ale_index:max_ale_index].contiguous().view(-1, 1) -
                         value).detach()).mean()
        nvtx.range_pop()

        nvtx.range_push('train:backprop')
        loss = value_loss * args.value_loss_coef + policy_loss - dist_entropy * args.entropy_coef
        optimizer.zero_grad()
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
        optimizer.step()
        nvtx.range_pop()

        nvtx.range_push('train:next_states')
        for step in range(0, args.num_steps_per_update):
            states[:-1, :, :, :, :] = states[1:, :, :, :, :]
            rewards[:-1, :] = rewards[1:, :]
            actions[:-1, :] = actions[1:, :]
            masks[:-1, :] = masks[1:, :]
            mus[:-1, :] = mus[1:, :]
        nvtx.range_pop()

        torch.cuda.synchronize()

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            if args.plot:
                writer.add_scalar('train/rewards_mean', final_rewards.mean().item(), T,
                                  walltime=total_time)
                writer.add_scalar('train/lengths_mean', final_lengths.mean().item(), T,
                                  walltime=total_time)
                writer.add_scalar('train/value_loss', value_loss, T, walltime=total_time)
                writer.add_scalar('train/policy_loss', policy_loss, T, walltime=total_time)
                writer.add_scalar('train/entropy', dist_entropy, T, walltime=total_time)

            progress_data = callback(args, model, T, iter_time, final_rewards, final_lengths,
                                     value_loss, policy_loss, dist_entropy,
                                     train_csv_writer, train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        writer.close()

    if args.use_openai:
        train_env.close()
    if args.use_openai_test_env:
        test_env.close()
# self.input = ops.FileReader(file_root=image_dir, file_list=image_dir + "/file_list.txt")
# self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
def define_graph(self):
    nvtx.range_push("Reading JPEG files into host memory")
    jpegs, labels = self.input()  # read in jpeg files
    nvtx.range_pop()

    nvtx.range_push("Start mixed decoding process")
    # images = self.decode(jpegs)  # Do decoding process
    decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
    images = decode(jpegs)
    nvtx.range_pop()
    return (images, labels)


if __name__ == "__main__":
    pipe = SimplePipeline(batch_size, 1, 0)

    nvtx.range_push("Building pipeline")
    pipe.build()
    nvtx.range_pop()

    nvtx.range_push("Running pipeline")
    ticks = time.time()
    pipe_out = pipe.run()
    images, labels = pipe_out
    nvtx.range_pop()

    # images_cpu = images.as_cpu()
    elapsed = time.time() - ticks
    print("Time elapsed for getting decoded images: ", elapsed)
    # showImages(images)
    # printDirHierarchy(image_dir)
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=100, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': multiprocessing.cpu_count(),
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    scriptPath = os.path.dirname(os.path.realpath(__file__))
    dataDir = os.path.join(scriptPath, 'data')
    dataset1 = datasets.MNIST(dataDir, train=True, download=True, transform=transform)
    dataset2 = datasets.MNIST(dataDir, train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

    for epoch in range(1, args.epochs + 1):
        # Start profiling from 2nd epoch
        if epoch == 2:
            torch.cuda.cudart().cudaProfilerStart()

        nvtx.range_push("Epoch " + str(epoch))

        nvtx.range_push("Train")
        train(args, model, device, train_loader, optimizer, epoch)
        nvtx.range_pop()  # Train

        nvtx.range_push("Test")
        test(model, device, test_loader)
        nvtx.range_pop()  # Test

        scheduler.step()
        nvtx.range_pop()  # Epoch

        # Stop profiling at the end of 2nd epoch
        if epoch == 2:
            torch.cuda.cudart().cudaProfilerStop()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
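# Hedged usage note: since the script brackets the second epoch with
# cudaProfilerStart()/cudaProfilerStop(), Nsight Systems can be told to record
# only that window, e.g.:
#
#   nsys profile --capture-range=cudaProfilerApi -t cuda,nvtx python main.py
#
# (`main.py` is assumed to be this script's filename.)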
def worker(gpu, ngpus_per_node, callback, args):
    args.gpu = gpu

    if (args.num_ales % args.world_size) != 0:
        raise ValueError('The num_ales({}) should be evenly divisible by the world_size({})'
                         .format(args.num_ales, args.world_size))
    args.num_ales = int(args.num_ales / args.world_size)

    if (args.batch_size % args.world_size) != 0:
        raise ValueError('The batch_size({}) should be evenly divisible by the world_size({})'
                         .format(args.batch_size, args.world_size))
    # The original fragment divided num_ales here, which contradicts the check above.
    args.batch_size = int(args.batch_size / args.world_size)

    num_frames_per_iter = args.num_ales * args.num_steps
    args.num_minibatches = num_frames_per_iter / args.batch_size
    total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))

    if args.distributed:
        args.seed += args.gpu
        torch.cuda.set_device(args.gpu)

        args.rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
        if args.multiprocessing_distributed:
            args.rank = args.rank * ngpus_per_node + args.gpu

        torch.distributed.init_process_group(backend='nccl',
                                             init_method='tcp://127.0.0.1:8632',
                                             world_size=args.world_size,
                                             rank=args.rank)
    else:
        args.rank = 0

    if args.lr_scale:
        scaled_lr = args.lr * math.sqrt((args.num_ales * args.world_size) / 16)
        if args.rank == 0:
            print('Scaled learning rate from {:4.4f} to {:4.4f}'.format(args.lr, scaled_lr))
        args.lr = scaled_lr

    args.use_cuda_env = args.use_cuda_env and torch.cuda.is_available()
    args.no_cuda_train = (not args.no_cuda_train) and torch.cuda.is_available()
    args.verbose = args.verbose and (args.rank == 0)

    env_device = torch.device('cuda', args.gpu) if args.use_cuda_env else torch.device('cpu')
    train_device = torch.device('cuda', args.gpu) if (args.no_cuda_train == False) \
        else torch.device('cpu')

    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if args.use_cuda_env or (args.no_cuda_train == False):
        torch.cuda.manual_seed(np.random.randint(1, 10000))

    if args.rank == 0:
        if args.output_filename:
            train_csv_file = open(args.output_filename, 'w', newline='')
            train_csv_file.write(json.dumps(vars(args)))
            train_csv_file.write('\n')
            train_csv_writer = csv.writer(train_csv_file, delimiter=',')
            train_csv_writer.writerow(['frames', 'fps', 'total_time',
                                       'rmean', 'rmedian', 'rmin', 'rmax',
                                       'lmean', 'lmedian', 'lmin', 'lmax',
                                       'entropy', 'value_loss', 'policy_loss'])

            eval_output_filename = '.'.join([''.join(args.output_filename.split('.')[:-1] + ['_test']), 'csv'])
            eval_csv_file = open(eval_output_filename, 'w', newline='')
            eval_csv_file.write(json.dumps(vars(args)))
            eval_csv_file.write('\n')
            eval_csv_writer = csv.writer(eval_csv_file, delimiter=',')
            eval_csv_writer.writerow(['frames', 'total_time',
                                      'rmean', 'rmedian', 'rmin', 'rmax', 'rstd',
                                      'lmean', 'lmedian', 'lmin', 'lmax', 'lstd'])
        else:
            train_csv_file, train_csv_writer = None, None
            eval_csv_file, eval_csv_writer = None, None

        if args.plot:
            from tensorboardX import SummaryWriter
            current_time = datetime.now().strftime('%b%d_%H-%M-%S')
            log_dir = os.path.join(args.log_dir, current_time + '_' + socket.gethostname())
            writer = SummaryWriter(log_dir=log_dir)
            for k, v in vars(args).items():
                writer.add_text(k, str(v))

        print()
        print('PyTorch : {}'.format(torch.__version__))
        print('CUDA    : {}'.format(torch.version.cuda))
        print('CUDNN   : {}'.format(torch.backends.cudnn.version()))
        print('APEX    : {}'.format('.'.join([str(i) for i in apex.amp.__version__.VERSION])))
        print()

    if train_device.type == 'cuda':
        print(cuda_device_str(train_device.index), flush=True)

    if args.use_openai:
        train_env = create_vectorize_atari_env(args.env_name, args.seed, args.num_ales,
                                               episode_life=args.episodic_life,
                                               clip_rewards=False,
                                               max_frames=args.max_episode_length)
        observation = torch.from_numpy(train_env.reset()).squeeze(1)

        test_env = create_vectorize_atari_env(args.env_name, args.seed,
                                              args.evaluation_episodes,
                                              episode_life=False, clip_rewards=False)
        test_env.reset()
    else:
        train_env = AtariEnv(args.env_name, args.num_ales, color_mode='gray',
                             repeat_prob=0.0, device=env_device, rescale=True,
                             episodic_life=args.episodic_life, clip_rewards=False)
        train_env.train()
        observation = train_env.reset(initial_steps=args.ale_start_steps,
                                      verbose=args.verbose).squeeze(-1)

        test_env = AtariEnv(args.env_name, args.evaluation_episodes, color_mode='gray',
                            repeat_prob=0.0, device='cpu', rescale=True,
                            episodic_life=False, clip_rewards=False, frameskip=4)

    model = ActorCritic(args.num_stack, train_env.action_space,
                        normalize=args.normalize, name=args.env_name)
    model = model.to(train_device).train()

    if args.rank == 0:
        print(model)
        args.model_name = model.name

    if args.use_adam:
        optimizer = optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
    else:
        optimizer = optim.RMSprop(model.parameters(), lr=args.lr, eps=args.eps,
                                  alpha=args.alpha)

    decay = 1.0 / total_steps
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.ppo_epoch, gamma=1.0 - decay)

    model, optimizer = amp.initialize(model, optimizer,
                                      opt_level=args.opt_level,
                                      loss_scale=args.loss_scale)

    if args.distributed:
        model = DDP(model, delay_allreduce=True)

    shape = (args.num_steps + 1, args.num_ales, args.num_stack,
             *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[0, :, -1] = observation.to(device=train_device, dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    logits = torch.zeros((args.num_steps + 1, args.num_ales, train_env.action_space.n),
                         device=train_device, dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    if args.use_gae:
        gae = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    torch.cuda.synchronize()

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
    total_time = 0
    evaluation_offset = 0

    train_stream = torch.cuda.Stream()

    for update in iterator:
        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            evaluation_offset += args.evaluation_interval
            eval_lengths, eval_rewards = evaluate(args, T, total_time, model, test_env,
                                                  eval_csv_writer, eval_csv_file)

            if args.plot:
                writer.add_scalar('eval/rewards_mean', eval_rewards.mean().item(), T,
                                  walltime=total_time)
                writer.add_scalar('eval/lengths_mean', eval_lengths.mean().item(), T,
                                  walltime=total_time)

        start_time = time.time()

        with torch.no_grad():
            for step in range(args.num_steps):
                nvtx.range_push('train:step')
                value, logit = model(states[step])

                # store values and logits
                values[step], logits[step] = value.squeeze(-1), logit.squeeze(-1)

                # convert actions to numpy and perform next step
                probs = torch.clamp(F.softmax(logit, dim=1), min=0.00001, max=0.99999)
                probs_action = probs.multinomial(1).to(env_device)
                observation, reward, done, info = train_env.step(maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device)
                probs_action = probs_action.to(device=train_device, dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step].copy_(probs_action.view(-1))
                masks[step].copy_(not_done)
                rewards[step].copy_(reward.sign())

                # update next observations
                states[step + 1, :, :-1].copy_(states[step, :, 1:])
                states[step + 1] *= not_done.view(-1, *[1] * (observation.dim() - 1))
                states[step + 1, :, -1].copy_(observation.view(-1, *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done
                nvtx.range_pop()

        returns[-1] = values[-1] = model(states[-1])[0].data.squeeze(-1)

        if args.use_gae:
            gae.zero_()
            for step in reversed(range(args.num_steps)):
                delta = rewards[step] + (args.gamma * values[step + 1] * masks[step]) - values[step]
                gae = delta + (args.gamma * args.tau * masks[step] * gae)
                returns[step] = gae + values[step]
        else:
            for step in reversed(range(args.num_steps)):
                returns[step] = rewards[step] + (args.gamma * returns[step + 1] * masks[step])

        log_probs = F.log_softmax(logits[:-1].view(-1, train_env.action_space.n), dim=1)
        action_log_probs = log_probs.gather(1, actions.view(-1).unsqueeze(-1))

        advantages = returns[:-1].view(-1).unsqueeze(-1) - values[:-1].view(-1).unsqueeze(-1)
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         float(np.finfo(np.float32).eps))

        total_value_loss = 0.0
        total_policy_loss = 0.0
        total_dist_entropy = 0.0

        nvtx.range_push('train:loader')
        states_view = states[:-1].view(-1, *states.size()[-3:])
        actions_view = actions.view(-1)
        returns_view = returns[:-1].view(-1)
        train_dataset = torch.utils.data.TensorDataset(states_view, actions_view,
                                                       action_log_probs, returns_view,
                                                       advantages)

        train_sampler = None
        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                                   shuffle=(train_sampler is None),
                                                   num_workers=0, pin_memory=False,
                                                   sampler=train_sampler)
        nvtx.range_pop()

        with torch.cuda.stream(train_stream):
            for epoch in range(args.ppo_epoch):
                nvtx.range_push('train:epoch_step')
                if args.distributed:
                    train_sampler.set_epoch(epoch)

                prefetcher = data_prefetcher(train_loader)
                local_states, local_actions, local_action_log_probs, local_returns, local_advantages = prefetcher.next()

                while local_states is not None:
                    batch_values, batch_logits = model(local_states)
                    batch_log_probs = F.log_softmax(batch_logits, dim=1)
                    batch_action_log_probs = batch_log_probs.gather(1, local_actions.unsqueeze(-1))

                    batch_probs = F.softmax(batch_logits, dim=1)
                    batch_dist_entropy = -(batch_log_probs * batch_probs).sum(-1).mean()

                    ratio = torch.exp(batch_action_log_probs - local_action_log_probs)
                    surrogate1 = ratio * local_advantages
                    surrogate2 = torch.clamp(ratio, 1.0 - args.clip_epsilon,
                                             1.0 + args.clip_epsilon) * local_advantages
                    batch_policy_loss = -torch.min(surrogate1, surrogate2).mean()
                    batch_value_loss = F.mse_loss(local_returns.unsqueeze(-1), batch_values) / 2.0

                    loss = batch_value_loss * args.value_loss_coef + batch_policy_loss \
                        - batch_dist_entropy * args.entropy_coef
                    optimizer.zero_grad()
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                   args.max_grad_norm)
                    optimizer.step()

                    total_value_loss += batch_value_loss.item()
                    total_policy_loss += batch_policy_loss.item()
                    total_dist_entropy += batch_dist_entropy.item()

                    local_states, local_actions, local_action_log_probs, local_returns, local_advantages = prefetcher.next()

                scheduler.step()
                nvtx.range_pop()

        torch.cuda.synchronize()

        states[0].copy_(states[-1])

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            value_loss = total_value_loss / (args.ppo_epoch * args.num_minibatches)
            policy_loss = total_policy_loss / (args.ppo_epoch * args.num_minibatches)
            dist_entropy = total_dist_entropy / (args.ppo_epoch * args.num_minibatches)

            if args.plot:
                writer.add_scalar('train/rewards_mean', final_rewards.mean().item(), T,
                                  walltime=total_time)
                writer.add_scalar('train/lengths_mean', final_lengths.mean().item(), T,
                                  walltime=total_time)
                writer.add_scalar('train/learning_rate', scheduler.get_lr()[0], T,
                                  walltime=total_time)
                writer.add_scalar('train/value_loss', value_loss, T, walltime=total_time)
                writer.add_scalar('train/policy_loss', policy_loss, T, walltime=total_time)
                writer.add_scalar('train/entropy', dist_entropy, T, walltime=total_time)

            progress_data = callback(args, model, T, iter_time, final_rewards, final_lengths,
                                     value_loss, policy_loss, dist_entropy,
                                     train_csv_writer, train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot:
        writer.close()

    if args.use_openai:
        train_env.close()
        test_env.close()
def forward(self, x): nvtx.range_push("layer:block_1") x = self.conv1(x) x = self.bn1(x) x = self.relu(x) x = self.maxpool(x) nvtx.range_pop() nvtx.range_push("layer:block_2") x = self.layer1(x) nvtx.range_pop() nvtx.range_push("layer:block_3") x = self.layer2(x) nvtx.range_pop() nvtx.range_push("layer:block_4") x = self.layer3(x) nvtx.range_pop() nvtx.range_push("layer:block_5") x = self.layer4(x) nvtx.range_pop() x = self.avgpool(x) x = torch.flatten(x, 1) nvtx.range_push("layer:FC") x = self.fc(x) nvtx.range_pop() return x