# PPO variant of the training worker: collects rollouts from the vectorized ALEs,
# then runs several clipped-surrogate epochs over minibatches of the collected data.
def worker(gpu, ngpus_per_node, args):
    env_device, train_device = args_initialize(gpu, ngpus_per_node, args)
    train_csv_file, train_csv_writer, eval_csv_file, eval_csv_writer, summary_writer = log_initialize(args, train_device)
    train_env, test_env, observation = env_initialize(args, env_device)

    model = ActorCritic(args.num_stack, train_env.action_space, normalize=args.normalize, name=args.env_name)
    model, optimizer = model_initialize(args, model, train_device)

    shape = (args.num_steps + 1, args.num_ales, args.num_stack, *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[0, :, -1] = observation.to(device=train_device, dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    logits = torch.zeros((args.num_steps + 1, args.num_ales, train_env.action_space.n), device=train_device, dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    if args.use_gae:
        gae = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    num_frames_per_iter = args.num_ales * args.num_steps
    args.num_minibatches = num_frames_per_iter / args.batch_size
    total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))

    decay = 1.0 / total_steps
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.ppo_epoch, gamma=1.0 - decay)

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
        total_time = 0
        evaluation_offset = 0

    train_stream = torch.cuda.Stream()
    torch.cuda.synchronize()

    for update in iterator:

        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            evaluation_offset += args.evaluation_interval
            eval_lengths, eval_rewards = test(args, model, test_env)

            lmean, lmedian, lmin, lmax, lstd = gen_data(eval_lengths)
            rmean, rmedian, rmin, rmax, rstd = gen_data(eval_rewards)
            length_data = '(length) min/max/mean/median: {lmin:4.1f}/{lmax:4.1f}/{lmean:4.1f}/{lmedian:4.1f}'.format(lmin=lmin, lmax=lmax, lmean=lmean, lmedian=lmedian)
            reward_data = '(reward) min/max/mean/median: {rmin:4.1f}/{rmax:4.1f}/{rmean:4.1f}/{rmedian:4.1f}'.format(rmin=rmin, rmax=rmax, rmean=rmean, rmedian=rmedian)
            print('[training time: {}] {}'.format(format_time(total_time), ' --- '.join([length_data, reward_data])))

            if eval_csv_writer and eval_csv_file:
                eval_csv_writer.writerow([T, total_time, rmean, rmedian, rmin, rmax, rstd, lmean, lmedian, lmin, lmax, lstd])
                eval_csv_file.flush()

            if args.plot:
                summary_writer.add_scalar('eval/rewards_mean', rmean, T, walltime=total_time)
                summary_writer.add_scalar('eval/lengths_mean', lmean, T, walltime=total_time)

        start_time = time.time()

        with torch.no_grad():
            for step in range(args.num_steps):
                nvtx.range_push('train:step')
                value, logit = model(states[step])

                # store values and logits
                values[step], logits[step] = value.squeeze(-1), logit.squeeze(-1)

                # convert actions to numpy and perform next step
                probs = torch.clamp(F.softmax(logit, dim=1), min=0.00001, max=0.99999)
                probs_action = probs.multinomial(1).to(env_device)
                observation, reward, done, info = train_env.step(maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device, dtype=torch.bool)
                probs_action = probs_action.to(device=train_device, dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step].copy_(probs_action.view(-1))
                masks[step].copy_(not_done)
                rewards[step].copy_(reward.sign())

                # update next observations
                states[step + 1, :, :-1].copy_(states[step, :, 1:])
                states[step + 1] *= not_done.view(-1, *[1] * (observation.dim() - 1))
                states[step + 1, :, -1].copy_(observation.view(-1, *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done
                nvtx.range_pop()

            returns[-1] = values[-1] = model(states[-1])[0].data.squeeze(-1)

            if args.use_gae:
                gae.zero_()
                for step in reversed(range(args.num_steps)):
                    delta = rewards[step] + (args.gamma * values[step + 1] * masks[step]) - values[step]
                    gae = delta + (args.gamma * args.tau * masks[step] * gae)
                    returns[step] = gae + values[step]
            else:
                for step in reversed(range(args.num_steps)):
                    returns[step] = rewards[step] + (args.gamma * returns[step + 1] * masks[step])

            # log-probabilities of the behavior policy and normalized advantages (constants for the PPO update)
            log_probs = F.log_softmax(logits[:-1].view(-1, train_env.action_space.n), dim=1)
            action_log_probs = log_probs.gather(1, actions.view(-1).unsqueeze(-1))

            advantages = returns[:-1].view(-1).unsqueeze(-1) - values[:-1].view(-1).unsqueeze(-1)
            advantages = (advantages - advantages.mean()) / (advantages.std() + float(np.finfo(np.float32).eps))

        total_value_loss = 0.0
        total_policy_loss = 0.0
        total_dist_entropy = 0.0

        nvtx.range_push('train:loader')
        states_view = states[:-1].view(-1, *states.size()[-3:])
        actions_view = actions.view(-1)
        returns_view = returns[:-1].view(-1)
        train_dataset = torch.utils.data.TensorDataset(states_view, actions_view, action_log_probs, returns_view, advantages)

        train_sampler = None
        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                                   shuffle=(train_sampler is None), num_workers=0,
                                                   pin_memory=False, sampler=train_sampler)
        nvtx.range_pop()

        with torch.cuda.stream(train_stream):
            for epoch in range(args.ppo_epoch):
                nvtx.range_push('train:epoch_step')

                if args.distributed:
                    train_sampler.set_epoch(epoch)

                prefetcher = data_prefetcher(train_loader)
                local_states, local_actions, local_action_log_probs, local_returns, local_advantages = prefetcher.next()

                while local_states is not None:
                    batch_values, batch_logits = model(local_states)
                    batch_log_probs = F.log_softmax(batch_logits, dim=1)
                    batch_action_log_probs = batch_log_probs.gather(1, local_actions.unsqueeze(-1))

                    batch_probs = F.softmax(batch_logits, dim=1)
                    batch_dist_entropy = -(batch_log_probs * batch_probs).sum(-1).mean()

                    # clipped surrogate objective
                    ratio = torch.exp(batch_action_log_probs - local_action_log_probs)
                    surrogate1 = ratio * local_advantages
                    surrogate2 = torch.clamp(ratio, 1.0 - args.clip_epsilon, 1.0 + args.clip_epsilon) * local_advantages
                    batch_policy_loss = -torch.min(surrogate1, surrogate2).mean()
                    batch_value_loss = F.mse_loss(local_returns.unsqueeze(-1), batch_values) / 2.0

                    loss = batch_value_loss * args.value_loss_coef + batch_policy_loss - batch_dist_entropy * args.entropy_coef
                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()

                    total_value_loss += batch_value_loss.item()
                    total_policy_loss += batch_policy_loss.item()
                    total_dist_entropy += batch_dist_entropy.item()

                    local_states, local_actions, local_action_log_probs, local_returns, local_advantages = prefetcher.next()

                scheduler.step()
                nvtx.range_pop()

        torch.cuda.synchronize()

        states[0].copy_(states[-1])

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            value_loss = total_value_loss / (args.ppo_epoch * args.num_minibatches)
            policy_loss = total_policy_loss / (args.ppo_epoch * args.num_minibatches)
            dist_entropy = total_dist_entropy / (args.ppo_epoch * args.num_minibatches)

            if args.plot:
                summary_writer.add_scalar('train/rewards_mean', final_rewards.mean().item(), T, walltime=total_time)
                summary_writer.add_scalar('train/lengths_mean', final_lengths.mean().item(), T, walltime=total_time)
                summary_writer.add_scalar('train/learning_rate', scheduler.get_lr()[0], T, walltime=total_time)
                summary_writer.add_scalar('train/value_loss', value_loss, T, walltime=total_time)
                summary_writer.add_scalar('train/policy_loss', policy_loss, T, walltime=total_time)
                summary_writer.add_scalar('train/entropy', dist_entropy, T, walltime=total_time)

            progress_data = callback(args, model, T, iter_time, final_rewards, final_lengths,
                                     value_loss, policy_loss, dist_entropy, train_csv_writer, train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        summary_writer.close()

    if args.use_openai:
        train_env.close()
    if args.use_openai_test_env:
        test_env.close()
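
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the worker above): the clipped surrogate
# objective that the PPO minibatch loop optimizes, written as a standalone
# function on plain tensors. The names `clip_epsilon`, `value_loss_coef`, and
# `entropy_coef` mirror the options used above; everything else here is a
# hypothetical, self-contained example rather than the project's API.
# ---------------------------------------------------------------------------
import torch
import torch.nn.functional as F


def ppo_loss(new_logits, new_values, actions, old_action_log_probs, returns, advantages,
             clip_epsilon=0.1, value_loss_coef=0.5, entropy_coef=0.01):
    """Compute the PPO loss for one minibatch of flattened (num_steps * num_ales) samples."""
    log_probs = F.log_softmax(new_logits, dim=1)
    probs = F.softmax(new_logits, dim=1)

    # log pi_new(a|s) for the actions actually taken during the rollout
    action_log_probs = log_probs.gather(1, actions.unsqueeze(-1))

    # probability ratio r = pi_new / pi_old and the clipped surrogate
    ratio = torch.exp(action_log_probs - old_action_log_probs)
    surrogate1 = ratio * advantages
    surrogate2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    policy_loss = -torch.min(surrogate1, surrogate2).mean()

    # critic regression toward the bootstrapped returns, plus an entropy bonus
    value_loss = F.mse_loss(returns.unsqueeze(-1), new_values) / 2.0
    entropy = -(log_probs * probs).sum(-1).mean()

    return value_loss * value_loss_coef + policy_loss - entropy * entropy_coef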
# A2C variant of the training worker: a single synchronous actor-critic update
# per rollout, with optional GAE and mixed-precision (apex.amp) training.
def worker(gpu, ngpus_per_node, args):
    env_device, train_device = args_initialize(gpu, ngpus_per_node, args)
    train_env, test_env, observation = env_initialize(args, env_device)
    train_csv_file, train_csv_writer, eval_csv_file, eval_csv_writer, summary_writer = log_initialize(args, train_device)

    model = ActorCritic(args.num_stack, train_env.action_space, normalize=args.normalize, name=args.env_name)
    model, optimizer = model_initialize(args, model, train_device)

    num_frames_per_iter = args.num_ales * args.num_steps
    total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))

    shape = (args.num_steps + 1, args.num_ales, args.num_stack, *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[0, :, -1] = observation.to(device=train_device, dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    if args.use_gae:
        gae = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    torch.cuda.synchronize()

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
        total_time = 0
        evaluation_offset = 0

    for update in iterator:

        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            evaluation_offset += args.evaluation_interval
            eval_lengths, eval_rewards = test(args, model, test_env)

            lmean, lmedian, lmin, lmax, lstd = gen_data(eval_lengths)
            rmean, rmedian, rmin, rmax, rstd = gen_data(eval_rewards)
            length_data = '(length) min/max/mean/median: {lmin:4.1f}/{lmax:4.1f}/{lmean:4.1f}/{lmedian:4.1f}'.format(lmin=lmin, lmax=lmax, lmean=lmean, lmedian=lmedian)
            reward_data = '(reward) min/max/mean/median: {rmin:4.1f}/{rmax:4.1f}/{rmean:4.1f}/{rmedian:4.1f}'.format(rmin=rmin, rmax=rmax, rmean=rmean, rmedian=rmedian)
            print('[training time: {}] {}'.format(format_time(total_time), ' --- '.join([length_data, reward_data])))

            if eval_csv_writer and eval_csv_file:
                eval_csv_writer.writerow([T, total_time, rmean, rmedian, rmin, rmax, rstd, lmean, lmedian, lmin, lmax, lstd])
                eval_csv_file.flush()

            if args.plot:
                summary_writer.add_scalar('eval/rewards_mean', rmean, T, walltime=total_time)
                summary_writer.add_scalar('eval/lengths_mean', lmean, T, walltime=total_time)

        start_time = time.time()

        with torch.no_grad():
            for step in range(args.num_steps):
                value, logit = model(states[step])

                # store values
                values[step] = value.squeeze(-1)

                # convert actions to numpy and perform next step
                probs_action = F.softmax(logit, dim=1).multinomial(1).to(env_device)
                observation, reward, done, info = train_env.step(maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device, dtype=torch.bool)
                probs_action = probs_action.to(device=train_device, dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step].copy_(probs_action.view(-1))
                masks[step].copy_(not_done)
                rewards[step].copy_(reward.sign())

                # update next observations
                states[step + 1, :, :-1].copy_(states[step, :, 1:].clone())
                states[step + 1] *= not_done.view(-1, *[1] * (observation.dim() - 1))
                states[step + 1, :, -1].copy_(observation.view(-1, *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done

            returns[-1] = values[-1] = model(states[-1])[0].data.squeeze(-1)

            if args.use_gae:
                gae.zero_()
                for step in reversed(range(args.num_steps)):
                    delta = rewards[step] + (args.gamma * values[step + 1] * masks[step]) - values[step]
                    gae = delta + (args.gamma * args.tau * masks[step] * gae)
                    returns[step] = gae + values[step]
            else:
                for step in reversed(range(args.num_steps)):
                    returns[step] = rewards[step] + (args.gamma * returns[step + 1] * masks[step])

        value, logit = model(states[:-1].view(-1, *states.size()[-3:]))

        log_probs = F.log_softmax(logit, dim=1)
        probs = F.softmax(logit, dim=1)

        action_log_probs = log_probs.gather(1, actions.view(-1).unsqueeze(-1))
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        advantages = returns[:-1].view(-1).unsqueeze(-1) - value

        value_loss = advantages.pow(2).mean()
        policy_loss = -(advantages.clone().detach() * action_log_probs).mean()

        loss = value_loss * args.value_loss_coef + policy_loss - dist_entropy * args.entropy_coef
        optimizer.zero_grad()

        if args.cpu_train:
            loss.backward()
            master_params = model.parameters()
        else:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            master_params = amp.master_params(optimizer)

        torch.nn.utils.clip_grad_norm_(master_params, args.max_grad_norm)
        optimizer.step()

        states[0].copy_(states[-1])

        torch.cuda.synchronize()

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            if args.plot:
                summary_writer.add_scalar('train/rewards_mean', final_rewards.mean().item(), T, walltime=total_time)
                summary_writer.add_scalar('train/lengths_mean', final_lengths.mean().item(), T, walltime=total_time)
                summary_writer.add_scalar('train/value_loss', value_loss, T, walltime=total_time)
                summary_writer.add_scalar('train/policy_loss', policy_loss, T, walltime=total_time)
                summary_writer.add_scalar('train/entropy', dist_entropy, T, walltime=total_time)

            progress_data = callback(args, model, T, iter_time, final_rewards, final_lengths,
                                     value_loss.item(), policy_loss.item(), dist_entropy.item(),
                                     train_csv_writer, train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        summary_writer.close()

    if args.use_openai:
        train_env.close()
    if args.use_openai_test_env:
        test_env.close()
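
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the worker above): the two return
# estimators computed by the backward loops in the A2C worker, written on
# plain tensors of shape (num_steps [+ 1], num_ales). This is a hypothetical,
# self-contained example; `gamma` and `tau` mirror the options used above.
# ---------------------------------------------------------------------------
import torch


def compute_returns(rewards, masks, values, gamma=0.99, tau=1.0, use_gae=False):
    """Return bootstrapped targets; values[-1] supplies the bootstrap value."""
    num_steps = rewards.size(0)
    returns = torch.zeros_like(values)
    returns[-1] = values[-1]

    if use_gae:
        # Generalized Advantage Estimation: accumulate discounted TD errors
        gae = torch.zeros_like(rewards[0])
        for step in reversed(range(num_steps)):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns[step] = gae + values[step]
    else:
        # plain n-step discounted returns, truncated at episode boundaries by masks
        for step in reversed(range(num_steps)):
            returns[step] = rewards[step] + gamma * returns[step + 1] * masks[step]

    return returns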
# V-trace (IMPALA-style) variant of the training worker: rollouts are generated
# under a behavior policy `mu`, and value targets are corrected with truncated
# importance weights before each actor-critic update on one ALE minibatch.
def worker(gpu, ngpus_per_node, args):
    env_device, train_device = args_initialize(gpu, ngpus_per_node, args)

    double_testing = False  # evaluate on both OpenAI and CuLE test environments
    if double_testing == False:
        train_env, test_env, observation = env_initialize(args, env_device)
    else:
        use_openai_test_env = args.use_openai_test_env
        args.use_openai_test_env = False
        train_env, test_env, observation = env_initialize(args, env_device)
        args.use_openai_test_env = True
        _, test_env_oai, _ = env_initialize(args, env_device)
        args.use_openai_test_env = use_openai_test_env

    train_csv_file, train_csv_writer, eval_csv_file, eval_csv_writer, summary_writer = log_initialize(args, train_device)

    model = ActorCritic(args.num_stack, train_env.action_space, normalize=args.normalize, name=args.env_name)
    model, optimizer = model_initialize(args, model, train_device)

    if (args.num_ales % args.num_minibatches) != 0:
        raise ValueError('Number of ales ({}) is not evenly divisible by the number of minibatches ({})'.format(args.num_ales, args.num_minibatches))

    if args.num_steps_per_update == -1:
        args.num_steps_per_update = args.num_steps

    minibatch_size = int(args.num_ales / args.num_minibatches)
    step0 = args.num_steps - args.num_steps_per_update
    n_minibatch = -1

    # This is the number of frames GENERATED between two updates
    num_frames_per_iter = args.num_ales * args.num_steps_per_update
    total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))

    shape = (args.num_steps + 1, args.num_ales, args.num_stack, *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[step0, :, -1] = observation.to(device=train_device, dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    logits = torch.zeros((args.num_steps + 1, args.num_ales, train_env.action_space.n), device=train_device, dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)
    mus = torch.ones(shape, device=train_device, dtype=torch.float32)
    # pis = torch.zeros(shape, device=train_device, dtype=torch.float32)
    rhos = torch.zeros((args.num_steps, minibatch_size), device=train_device, dtype=torch.float32)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    if args.use_gae:
        raise ValueError('GAE is not compatible with VTRACE')

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    torch.cuda.synchronize()

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
        total_time = 0
        evaluation_offset = 0

    for update in iterator:

        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            evaluation_offset += args.evaluation_interval

            if double_testing == False:
                eval_lengths, eval_rewards = test(args, model, test_env)

                lmean, lmedian, lmin, lmax, lstd = gen_data(eval_lengths)
                rmean, rmedian, rmin, rmax, rstd = gen_data(eval_rewards)
                length_data = '(length) min/max/mean/median: {lmin:4.1f}/{lmax:4.1f}/{lmean:4.1f}/{lmedian:4.1f}'.format(lmin=lmin, lmax=lmax, lmean=lmean, lmedian=lmedian)
                reward_data = '(reward) min/max/mean/median: {rmin:4.1f}/{rmax:4.1f}/{rmean:4.1f}/{rmedian:4.1f}'.format(rmin=rmin, rmax=rmax, rmean=rmean, rmedian=rmedian)
                print('[training time: {}] {}'.format(format_time(total_time), ' --- '.join([length_data, reward_data])))
            else:
                args.use_openai_test_env = False
                eval_lengths, eval_rewards = test(args, model, test_env)

                lmean, lmedian, lmin, lmax, lstd = gen_data(eval_lengths)
                rmean, rmedian, rmin, rmax, rstd = gen_data(eval_rewards)
                length_data = '(length) min/max/mean/median: {lmin:4.1f}/{lmax:4.1f}/{lmean:4.1f}/{lmedian:4.1f}'.format(lmin=lmin, lmax=lmax, lmean=lmean, lmedian=lmedian)
                reward_data = '(reward) min/max/mean/median: {rmin:4.1f}/{rmax:4.1f}/{rmean:4.1f}/{rmedian:4.1f}'.format(rmin=rmin, rmax=rmax, rmean=rmean, rmedian=rmedian)
                print('[CuLE CPU] [training time: {}] {}'.format(format_time(total_time), ' --- '.join([length_data, reward_data])))

                args.use_openai_test_env = True
                eval_lengths, eval_rewards = test(args, model, test_env_oai)

                lmean, lmedian, lmin, lmax, lstd = gen_data(eval_lengths)
                rmean, rmedian, rmin, rmax, rstd = gen_data(eval_rewards)
                length_data = '(length) min/max/mean/median: {lmin:4.1f}/{lmax:4.1f}/{lmean:4.1f}/{lmedian:4.1f}'.format(lmin=lmin, lmax=lmax, lmean=lmean, lmedian=lmedian)
                reward_data = '(reward) min/max/mean/median: {rmin:4.1f}/{rmax:4.1f}/{rmean:4.1f}/{rmedian:4.1f}'.format(rmin=rmin, rmax=rmax, rmean=rmean, rmedian=rmedian)
                print('[OpAI CPU] [training time: {}] {}'.format(format_time(total_time), ' --- '.join([length_data, reward_data])))

                args.use_openai_test_env = use_openai_test_env

            if eval_csv_writer and eval_csv_file:
                eval_csv_writer.writerow([T, total_time, rmean, rmedian, rmin, rmax, rstd, lmean, lmedian, lmin, lmax, lstd])
                eval_csv_file.flush()

            if args.plot:
                summary_writer.add_scalar('eval/rewards_mean', rmean, T, walltime=total_time)
                summary_writer.add_scalar('eval/lengths_mean', lmean, T, walltime=total_time)

        start_time = time.time()

        with torch.no_grad():
            for step in range(args.num_steps_per_update):
                nvtx.range_push('train:step')
                value, logit = model(states[step0 + step])

                # store values and logits
                values[step0 + step] = value.squeeze(-1)

                # convert actions to numpy and perform next step
                probs = torch.clamp(F.softmax(logit, dim=1), min=0.00001, max=0.99999)
                probs_action = probs.multinomial(1).to(env_device)

                # Check if the multinomial threw an exception
                # https://github.com/pytorch/pytorch/issues/7014
                torch.cuda.current_stream().synchronize()

                observation, reward, done, info = train_env.step(maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device, dtype=torch.bool)
                probs_action = probs_action.to(device=train_device, dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step0 + step].copy_(probs_action.view(-1))
                masks[step0 + step].copy_(not_done)
                rewards[step0 + step].copy_(reward.sign())
                # mus[step0 + step] = F.softmax(logit, dim=1).gather(1, actions[step0 + step].view(-1).unsqueeze(-1)).view(-1)
                mus[step0 + step] = torch.clamp(F.softmax(logit, dim=1).gather(1, actions[step0 + step].view(-1).unsqueeze(-1)).view(-1), min=0.00001, max=0.99999)

                # update next observations
                states[step0 + step + 1, :, :-1].copy_(states[step0 + step, :, 1:])
                states[step0 + step + 1] *= not_done.view(-1, *[1] * (observation.dim() - 1))
                states[step0 + step + 1, :, -1].copy_(observation.view(-1, *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done
                nvtx.range_pop()

        n_minibatch = (n_minibatch + 1) % args.num_minibatches
        min_ale_index = int(n_minibatch * minibatch_size)
        max_ale_index = min_ale_index + minibatch_size

        # compute v-trace using the recursive method (remark 1 in IMPALA paper)
        # value_next_step, logit = model(states[-1:, min_ale_index:max_ale_index, :, :, :].contiguous().view(-1, *states.size()[-3:]))
        # returns[-1, min_ale_index:max_ale_index] = value_next_step.squeeze()
        # for step in reversed(range(args.num_steps)):
        #     value, logit = model(states[step, min_ale_index:max_ale_index, :, :, :].contiguous().view(-1, *states.size()[-3:]))
        #     pis = F.softmax(logit, dim=1).gather(1, actions[step, min_ale_index:max_ale_index].view(-1).unsqueeze(-1)).view(-1)
        #     c = torch.clamp(pis / mus[step, min_ale_index:max_ale_index], max=c_)
        #     rhos[step, :] = torch.clamp(pis / mus[step, min_ale_index:max_ale_index], max=rho_)
        #     delta_value = rhos[step, :] * (rewards[step, min_ale_index:max_ale_index] + (args.gamma * value_next_step - value).squeeze())
        #     returns[step, min_ale_index:max_ale_index] = value.squeeze() + delta_value + args.gamma * c * \
        #         (returns[step + 1, min_ale_index:max_ale_index] - value_next_step.squeeze())
        #     value_next_step = value

        nvtx.range_push('train:compute_values')
        value, logit = model(states[:, min_ale_index:max_ale_index, :, :, :].contiguous().view(-1, *states.size()[-3:]))

        batch_value = value.detach().view((args.num_steps + 1, minibatch_size))
        batch_probs = F.softmax(logit.detach()[:(args.num_steps * minibatch_size), :], dim=1)
        batch_pis = batch_probs.gather(1, actions[:, min_ale_index:max_ale_index].contiguous().view(-1).unsqueeze(-1)).view((args.num_steps, minibatch_size))

        returns[-1, min_ale_index:max_ale_index] = batch_value[-1]

        with torch.no_grad():
            for step in reversed(range(args.num_steps)):
                c = torch.clamp(batch_pis[step, :] / mus[step, min_ale_index:max_ale_index], max=args.c_hat)
                rhos[step, :] = torch.clamp(batch_pis[step, :] / mus[step, min_ale_index:max_ale_index], max=args.rho_hat)
                delta_value = rhos[step, :] * (rewards[step, min_ale_index:max_ale_index] + (args.gamma * batch_value[step + 1] - batch_value[step]).squeeze())
                returns[step, min_ale_index:max_ale_index] = \
                    batch_value[step, :].squeeze() + delta_value + args.gamma * c * \
                    (returns[step + 1, min_ale_index:max_ale_index] - batch_value[step + 1, :].squeeze())

        value = value[:args.num_steps * minibatch_size, :]
        logit = logit[:args.num_steps * minibatch_size, :]

        log_probs = F.log_softmax(logit, dim=1)
        probs = F.softmax(logit, dim=1)

        action_log_probs = log_probs.gather(1, actions[:, min_ale_index:max_ale_index].contiguous().view(-1).unsqueeze(-1))
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        advantages = returns[:-1, min_ale_index:max_ale_index].contiguous().view(-1).unsqueeze(-1) - value

        value_loss = advantages.pow(2).mean()
        policy_loss = -(action_log_probs * rhos.view(-1, 1).detach() * \
                        (rewards[:, min_ale_index:max_ale_index].contiguous().view(-1, 1) + args.gamma * \
                         returns[1:, min_ale_index:max_ale_index].contiguous().view(-1, 1) - value).detach()).mean()
        nvtx.range_pop()

        nvtx.range_push('train:backprop')
        loss = value_loss * args.value_loss_coef + policy_loss - dist_entropy * args.entropy_coef
        optimizer.zero_grad()

        if args.cpu_train:
            loss.backward()
            master_params = model.parameters()
        else:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            master_params = amp.master_params(optimizer)

        torch.nn.utils.clip_grad_norm_(master_params, args.max_grad_norm)
        optimizer.step()
        nvtx.range_pop()

        nvtx.range_push('train:next_states')
        for step in range(0, args.num_steps_per_update):
            states[:-1, :, :, :, :] = states[1:, :, :, :, :]
            rewards[:-1, :] = rewards[1:, :]
            actions[:-1, :] = actions[1:, :]
            masks[:-1, :] = masks[1:, :]
            mus[:-1, :] = mus[1:, :]
        nvtx.range_pop()

        torch.cuda.synchronize()

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            if args.plot:
                summary_writer.add_scalar('train/rewards_mean', final_rewards.mean().item(), T, walltime=total_time)
                summary_writer.add_scalar('train/lengths_mean', final_lengths.mean().item(), T, walltime=total_time)
                summary_writer.add_scalar('train/value_loss', value_loss, T, walltime=total_time)
                summary_writer.add_scalar('train/policy_loss', policy_loss, T, walltime=total_time)
                summary_writer.add_scalar('train/entropy', dist_entropy, T, walltime=total_time)

            progress_data = callback(args, model, T, iter_time, final_rewards, final_lengths,
                                     value_loss, policy_loss, dist_entropy, train_csv_writer, train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        summary_writer.close()

    if args.use_openai:
        train_env.close()
    if args.use_openai_test_env:
        test_env.close()
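
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the worker above): the V-trace target
# recursion (remark 1 of the IMPALA paper) that the backward loop in this
# worker implements, written on plain tensors of shape (num_steps [+ 1], batch).
# This is a hypothetical, self-contained example; `c_hat` and `rho_hat`
# mirror the truncation levels used above.
# ---------------------------------------------------------------------------
import torch


def vtrace_targets(rewards, values, pis, mus, gamma=0.99, c_hat=1.0, rho_hat=1.0):
    """Return off-policy corrected value targets v_s for each step."""
    num_steps = rewards.size(0)
    targets = torch.zeros_like(values)
    targets[-1] = values[-1]

    for step in reversed(range(num_steps)):
        # truncated importance weights between target policy pi and behavior policy mu
        ratio = pis[step] / mus[step]
        c = torch.clamp(ratio, max=c_hat)
        rho = torch.clamp(ratio, max=rho_hat)

        # v_s = V(x_s) + delta_s + gamma * c_s * (v_{s+1} - V(x_{s+1}))
        delta = rho * (rewards[step] + gamma * values[step + 1] - values[step])
        targets[step] = values[step] + delta + gamma * c * (targets[step + 1] - values[step + 1])

    return targets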
# Experimental V-trace variant with an LSTM policy and hooks for UNREAL-style
# auxiliary tasks (reward prediction, pixel change); the auxiliary code paths
# and the replay memory are currently disabled (commented out).
def worker(gpu, ngpus_per_node, args):
    env_device, train_device = args_initialize(gpu, ngpus_per_node, args)
    train_env, test_env, observation = env_initialize(args, env_device)
    train_csv_file, train_csv_writer, eval_csv_file, eval_csv_writer, summary_writer = log_initialize(args, train_device)

    model = ActorCritic(args.num_stack, train_env.action_space, BasicBlock, normalize=args.normalize, name=args.env_name)
    model, optimizer = model_initialize(args, model, train_device)

    if (args.num_ales % args.num_minibatches) != 0:
        raise ValueError('Number of ales ({}) is not evenly divisible by the number of minibatches ({})'.format(args.num_ales, args.num_minibatches))

    if args.num_steps_per_update == -1:
        args.num_steps_per_update = args.num_steps

    minibatch_size = int(args.num_ales / args.num_minibatches)
    print("minibatch_size", minibatch_size)
    step0 = args.num_steps - args.num_steps_per_update
    n_minibatch = -1

    # This is the number of frames GENERATED between two updates
    num_frames_per_iter = args.num_ales * args.num_steps_per_update
    total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))  # number of training iterations

    shape = (args.num_steps + 1, args.num_ales, args.num_stack, *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[step0, :, -1] = observation.to(device=train_device, dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    logits = torch.zeros((args.num_steps + 1, args.num_ales, train_env.action_space.n), device=train_device, dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)
    actions_one_hot = torch.zeros((args.num_steps, args.num_ales, 18), device=train_device, dtype=torch.long)
    actions_space = torch.zeros(18, device=train_device, dtype=torch.long)

    # for LSTM
    lstm_hidden_state = torch.zeros((args.num_steps + 1, args.num_ales, 256), device=train_device, dtype=torch.float32)

    mus = torch.ones(shape, device=train_device, dtype=torch.float32)
    # pis = torch.zeros(shape, device=train_device, dtype=torch.float32)
    rhos = torch.zeros((args.num_steps, minibatch_size), device=train_device, dtype=torch.float32)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    # init replay memory
    # mem = ReplayMemory(observation.to(device=train_device, dtype=torch.float32), args, train_device)

    torch.cuda.synchronize()

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
        total_time = 0
        evaluation_offset = 0

    opt_step = 0
    aux_task = False

    for update in iterator:

        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            print("===========evaluating=========")
            evaluation_offset += args.evaluation_interval
            torch.save(model.state_dict(), "./model_save")

            eval_lengths, eval_rewards = test(args, model, test_env)

            lmean, lmedian, lmin, lmax, lstd = gen_data(eval_lengths)
            rmean, rmedian, rmin, rmax, rstd = gen_data(eval_rewards)
            length_data = '(length) min/max/mean/median: {lmin:4.1f}/{lmax:4.1f}/{lmean:4.1f}/{lmedian:4.1f}'.format(lmin=lmin, lmax=lmax, lmean=lmean, lmedian=lmedian)
            reward_data = '(reward) min/max/mean/median: {rmin:4.1f}/{rmax:4.1f}/{rmean:4.1f}/{rmedian:4.1f}'.format(rmin=rmin, rmax=rmax, rmean=rmean, rmedian=rmedian)
            print('[training time: {}] {}'.format(format_time(total_time), ' --- '.join([length_data, reward_data])))

            if eval_csv_writer and eval_csv_file:
                eval_csv_writer.writerow([T, total_time, rmean, rmedian, rmin, rmax, rstd, lmean, lmedian, lmin, lmax, lstd])
                eval_csv_file.flush()

            if args.plot:
                summary_writer.add_scalar('eval/rewards_mean', rmean, T, walltime=total_time)
                summary_writer.add_scalar('eval/lengths_mean', lmean, T, walltime=total_time)

        start_time = time.time()

        with torch.no_grad():
            for step in range(args.num_steps_per_update):
                # nvtx.range_push('train:step')
                # value, logit = model(states[step0 + step])  # , lstm_hidden_state[step0 + step])
                value, logit, lstm_hidden_state[step0 + step] = model(states[step0 + step], args, lstm_hidden_state[step0], actions_one_hot[step], rewards[step])

                # store values and logits
                values[step0 + step] = value.squeeze(-1)

                # convert actions to numpy and perform next step
                probs = torch.clamp(F.softmax(logit, dim=1), min=0.00001, max=0.99999)
                probs_action = probs.multinomial(1).to(env_device)
                actions_space[probs_action] = 1

                torch.cuda.current_stream().synchronize()

                observation, reward, done, info = train_env.step(probs_action)
                observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device, dtype=torch.bool)
                probs_action = probs_action.to(device=train_device, dtype=torch.long)

                not_done = 1.0 - done.float()
                lstm_hidden_state[step0 + step] *= not_done[:, None]

                # update rewards and actions
                actions[step0 + step].copy_(probs_action.view(-1))
                actions_one_hot[step0 + step].copy_(actions_space)
                masks[step0 + step].copy_(not_done)
                rewards[step0 + step].copy_(reward.sign())
                # mus[step0 + step] = F.softmax(logit, dim=1).gather(1, actions[step0 + step].view(-1).unsqueeze(-1)).view(-1)
                mus[step0 + step] = torch.clamp(F.softmax(logit, dim=1).gather(1, actions[step0 + step].view(-1).unsqueeze(-1)).view(-1), min=0.00001, max=0.99999)

                # update next observations
                states[step0 + step + 1, :, :-1].copy_(states[step0 + step, :, 1:])
                states[step0 + step + 1] *= not_done.view(-1, *[1] * (observation.dim() - 1))
                states[step0 + step + 1, :, -1].copy_(observation.view(-1, *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done
                # nvtx.range_pop()  # commented out to match the commented range_push above

                # APPENDING observation
                # mem.append(Experience(obs=observation.to(device=train_device, dtype=torch.float32), action=probs_action.view(-1).unsqueeze(1), reward=reward.unsqueeze(1)))  # , done=done.unsqueeze(1)))
                # mem.append(Experience(obs=observation.to(device=train_device, dtype=torch.float32), reward=reward.unsqueeze(1)))

        # if (opt_step > 100 and opt_step % 50 == 0):
        #     mem.clearMemory()  # clear half of memory every 50 steps

        n_minibatch = (n_minibatch + 1) % args.num_minibatches
        min_ale_index = int(n_minibatch * minibatch_size)
        max_ale_index = min_ale_index + minibatch_size

        # to cat with output from FC and last reward
        # actions_one_hot = torch.cat([torch.zeros(args.num_steps, minibatch_size, 18).to(device=train_device, dtype=torch.long), actions_one_hot[:, min_ale_index:max_ale_index, :]])

        nvtx.range_push('train:compute_values')
        # NOTE: the LSTM input/output handling here is still uncertain
        value, logit, lstm_hidden_state[:, min_ale_index:max_ale_index] = model(
            states[:, min_ale_index:max_ale_index, :, :, :].contiguous().view(-1, *states.size()[-3:]),
            args,
            lstm_hidden_state[:, min_ale_index:max_ale_index].contiguous(),
            actions_one_hot[:, min_ale_index:max_ale_index, :].contiguous().view(-1, 18),
            rewards[:, min_ale_index:max_ale_index].contiguous().view(-1, 1))

        batch_value = value.detach().view((args.num_steps + 1, minibatch_size))
        batch_probs = F.softmax(logit.detach()[:(args.num_steps * minibatch_size), :], dim=1)
        batch_pis = batch_probs.gather(1, actions[:, min_ale_index:max_ale_index].contiguous().view(-1).unsqueeze(-1)).view((args.num_steps, minibatch_size))

        returns[-1, min_ale_index:max_ale_index] = batch_value[-1]

        with torch.no_grad():
            for step in reversed(range(args.num_steps)):
                c = torch.clamp(batch_pis[step, :] / mus[step, min_ale_index:max_ale_index], max=args.c_hat)
                rhos[step, :] = torch.clamp(batch_pis[step, :] / mus[step, min_ale_index:max_ale_index], max=args.rho_hat)
                delta_value = rhos[step, :] * (rewards[step, min_ale_index:max_ale_index] + (args.gamma * batch_value[step + 1] - batch_value[step]).squeeze())
                returns[step, min_ale_index:max_ale_index] = \
                    batch_value[step, :].squeeze() + delta_value + args.gamma * c * \
                    (returns[step + 1, min_ale_index:max_ale_index] - batch_value[step + 1, :].squeeze())

        value = value[:args.num_steps * minibatch_size, :]
        logit = logit[:args.num_steps * minibatch_size, :]

        log_probs = F.log_softmax(logit, dim=1)
        probs = F.softmax(logit, dim=1)

        action_log_probs = log_probs.gather(1, actions[:, min_ale_index:max_ale_index].contiguous().view(-1).unsqueeze(-1))
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        advantages = returns[:-1, min_ale_index:max_ale_index].contiguous().view(-1).unsqueeze(-1) - value

        value_loss = advantages.pow(2).mean()
        policy_loss = -(action_log_probs * rhos.view(-1, 1).detach() * \
                        (rewards[:, min_ale_index:max_ale_index].contiguous().view(-1, 1) + args.gamma * \
                         returns[1:, min_ale_index:max_ale_index].contiguous().view(-1, 1) - value).detach()).mean()
        nvtx.range_pop()

        nvtx.range_push('train:backprop')

        # auxiliary tasks from UNREAL https://arxiv.org/pdf/1611.05397
        # REWARD PREDICTION
        # if (opt_step > 100 and opt_step % 20 == 0):
        #     aux_task = True
        #     obs = []
        #     for i in range(20):
        #         obs.append(mem.rp())
        #     states_, batch_rp_c = process_rp(obs)
        #     rp_c = model(states_, args, aux_task='rp') + 1e-7
        #     rp_loss = -torch.sum(batch_rp_c.to(device=train_device) * torch.log(rp_c)) / 20 / 3
        #
        # PIXEL CHANGE loss (sketch, never enabled)
        # obs_ = []
        # for i in range(32):  # TODO: batch later
        #     obs_.append(mem.pc())
        # states_pc, batch_pc_a, batch_pc_R = process_pc(obs_, model, train_device)
        # states_pc = torch.cat(states_pc).view(-1, 4, 84, 84)
        # pc_q, pc_q_max = model(states_pc, aux_task='pc')
        # pc_a_reshape = batch_pc_a[0].view(-1, train_env.action_space.n, 1, 1)
        # pc_qa_ = torch.mul(pc_q, pc_a_reshape)
        # pc_qa = torch.sum(pc_qa_, dim=1, keepdim=False)
        # pc_loss = torch.sum(((batch_pc_R[0] - pc_qa) ** 2 / 2.))

        loss = value_loss * args.value_loss_coef + policy_loss - dist_entropy * args.entropy_coef
        # if aux_task == True:
        #     loss += rp_loss
        #     aux_task = False

        optimizer.zero_grad()
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward(retain_graph=True)
        master_params = amp.master_params(optimizer)

        torch.nn.utils.clip_grad_norm_(master_params, args.max_grad_norm)
        optimizer.step()
        opt_step += 1
        # nvtx.range_pop()

        # nvtx.range_push('train:next_states')
        for step in range(0, args.num_steps_per_update):
            states[:-1, :, :, :, :] = states[1:, :, :, :, :]
            rewards[:-1, :] = rewards[1:, :]
            actions[:-1, :] = actions[1:, :]
            # actions_one_hot[:-1, :] = actions_one_hot[1:, :]
            # lstm_hidden_state[:-1, :] = lstm_hidden_state[1:, :]
            masks[:-1, :] = masks[1:, :]
            mus[:-1, :] = mus[1:, :]
        # nvtx.range_pop()

        torch.cuda.synchronize()

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            if args.plot:
                summary_writer.add_scalar('train/rewards_mean', final_rewards.mean().item(), T, walltime=total_time)
                summary_writer.add_scalar('train/lengths_mean', final_lengths.mean().item(), T, walltime=total_time)
                summary_writer.add_scalar('train/value_loss', value_loss, T, walltime=total_time)
                summary_writer.add_scalar('train/policy_loss', policy_loss, T, walltime=total_time)
                summary_writer.add_scalar('train/entropy', dist_entropy, T, walltime=total_time)

            progress_data = callback(args, model, T, iter_time, final_rewards, final_lengths,
                                     value_loss, policy_loss, dist_entropy, train_csv_writer, train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        # name = '{}.pth'.format("BO_PC")
        # torch.save(model.module.state_dict(), "Pong_1gpu_1200")
        summary_writer.close()
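
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the worker above): the UNREAL-style reward
# prediction auxiliary loss that the commented-out block above gestures at -- a
# 3-way classification (negative / zero / positive reward) over short frame
# histories sampled from a replay memory. The function and argument names here
# are hypothetical; only the loss form follows the commented code and the
# UNREAL paper (https://arxiv.org/pdf/1611.05397).
# ---------------------------------------------------------------------------
import torch


def reward_prediction_loss(predicted_probs, reward_classes, eps=1e-7):
    """Cross-entropy between predicted reward-class probabilities and one-hot targets.

    predicted_probs: (batch, 3) softmax output of the reward-prediction head
    reward_classes:  (batch, 3) one-hot encoding of sign(reward) in {-1, 0, +1}
    """
    return -(reward_classes * torch.log(predicted_probs + eps)).sum(dim=1).mean()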
# V-trace training worker with an optional benchmark mode: it can time random
# stepping, inference-only rollouts, and full training, and it can evaluate on
# both CuLE and OpenAI test environments ("double testing").
def worker(gpu, ngpus_per_node, args):
    env_device, train_device = args_initialize(gpu, ngpus_per_node, args)

    # benchmark?
    if args.benchmark:
        device_name = torch.cuda.get_device_name(args.gpu).lower().replace(' ', '_')
        backend_name = 'cule_cpu'
        if args.use_openai:
            backend_name = 'openai'
        if args.use_cuda_env:
            backend_name = 'cule_gpu'
        if args.use_cuda_env and args.multiprocessing_distributed:
            backend_name = 'cule_multiples_gpus'
        filename = 'rom_perf_' + device_name + '_' + backend_name + '_' + args.env_name + '_' + str(args.num_ales) + '.csv'
        csv_file = open(filename, 'w', newline='')
        csv_writer = csv.writer(csv_file, delimiter=',')
        csv_writer.writerow(['env_name', 'num_ales', 'step_time', 'step_rate', 'device', 'mode'])
        args.evaluation_interval = args.t_max  # no eval while benchmarking!
        benchmark_steps = 100

    double_testing = True  # evaluate on both OpenAI and CuLE test environments
    if double_testing == False:
        train_csv_file, train_csv_writer, eval_csv_file, eval_csv_writer, summary_writer = log_initialize(args, train_device)
        train_env, test_env, observation = env_initialize(args, env_device)
    else:
        use_openai_test_env = args.use_openai_test_env
        output_filename = args.output_filename
        if args.output_filename is None:
            args.output_filename = 'test.csv'
        args.use_openai_test_env = False
        args.output_filename = args.output_filename[:-4] + '_cule.csv'
        train_csv_file, train_csv_writer, eval_csv_file, eval_csv_writer, summary_writer = log_initialize(args, train_device)
        train_env, test_env, observation = env_initialize(args, env_device)
        args.use_openai_test_env = True
        args.output_filename = args.output_filename[:-4] + '_openai.csv'
        _, test_env_oai, _ = env_initialize(args, env_device)
        train_csv_file_oai, train_csv_writer_oai, eval_csv_file_oai, eval_csv_writer_oai, summary_writer_oai = log_initialize(args, train_device)
        args.use_openai_test_env = use_openai_test_env
        args.output_filename = output_filename

    model = ActorCritic(args.num_stack, train_env.action_space, normalize=args.normalize, name=args.env_name)
    model, optimizer = model_initialize(args, model, train_device)

    if (args.num_ales % args.num_minibatches) != 0:
        raise ValueError('Number of ales ({}) is not evenly divisible by the number of minibatches ({})'.format(args.num_ales, args.num_minibatches))

    if args.num_steps_per_update == -1:
        args.num_steps_per_update = args.num_steps

    minibatch_size = int(args.num_ales / args.num_minibatches)
    step0 = args.num_steps - args.num_steps_per_update
    n_minibatch = -1

    # This is the number of frames GENERATED between two updates
    num_frames_per_iter = args.num_ales * args.num_steps_per_update
    total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))

    shape = (args.num_steps + 1, args.num_ales, args.num_stack, *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[step0, :, -1] = observation.to(device=train_device, dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    logits = torch.zeros((args.num_steps + 1, args.num_ales, train_env.action_space.n), device=train_device, dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)
    mus = torch.ones(shape, device=train_device, dtype=torch.float32)
    # pis = torch.zeros(shape, device=train_device, dtype=torch.float32)
    rhos = torch.zeros((args.num_steps, minibatch_size), device=train_device, dtype=torch.float32)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    if args.use_gae:
        raise ValueError('GAE is not compatible with VTRACE')

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    torch.cuda.synchronize()

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
        total_time = 0
        evaluation_offset = 0

    # benchmark - random
    if args.benchmark:
        # warmup (measure anyway for debug!)
        torch.cuda.current_stream().synchronize()
        benchmark_start_time = time.time()
        for step in range(0, 10):
            if args.use_openai:
                random_actions = np.random.randint(train_env.action_space.n, size=args.num_ales)
                observation, reward, done, info = train_env.step(random_actions)
            else:
                random_actions = train_env.sample_random_actions()
                observation, reward, done, info = train_env.step(maybe_npy(random_actions))
        torch.cuda.current_stream().synchronize()
        elapsed_time = time.time() - benchmark_start_time
        fps = 10 * args.num_ales / elapsed_time
        print('Warmup - random: ' + str(round(fps)) + ' FPS')

        # benchmark
        benchmark_start_time = time.time()
        for step in range(0, benchmark_steps):
            if args.use_openai:
                random_actions = np.random.randint(train_env.action_space.n, size=args.num_ales)
                observation, reward, done, info = train_env.step(random_actions)
            else:
                random_actions = train_env.sample_random_actions()
                observation, reward, done, info = train_env.step(maybe_npy(random_actions))
        torch.cuda.current_stream().synchronize()
        elapsed_time = time.time() - benchmark_start_time
        fps = benchmark_steps * args.num_ales / elapsed_time
        csv_writer.writerow([args.env_name, args.num_ales, elapsed_time / benchmark_steps, fps, backend_name, 'random'])
        print('Benchmark - random: ' + str(round(fps)) + ' FPS')

        benchmark_start_time = time.time()

    for update in iterator:

        if not args.benchmark:
            T = args.world_size * update * num_frames_per_iter
            if (args.rank == 0) and (T >= evaluation_offset):
                evaluation_offset += args.evaluation_interval

                if double_testing == False:
                    eval_lengths, eval_rewards = test(args, model, test_env)

                    lmean, lmedian, lmin, lmax, lstd = gen_data(eval_lengths)
                    rmean, rmedian, rmin, rmax, rstd = gen_data(eval_rewards)
                    length_data = '(length) min/max/mean/median: {lmin:4.1f}/{lmax:4.1f}/{lmean:4.1f}/{lmedian:4.1f}'.format(lmin=lmin, lmax=lmax, lmean=lmean, lmedian=lmedian)
                    reward_data = '(reward) min/max/mean/median: {rmin:4.1f}/{rmax:4.1f}/{rmean:4.1f}/{rmedian:4.1f}'.format(rmin=rmin, rmax=rmax, rmean=rmean, rmedian=rmedian)
                    print('[training time: {}] {}'.format(format_time(total_time), ' --- '.join([length_data, reward_data])))

                    if eval_csv_writer and eval_csv_file:
                        eval_csv_writer.writerow([T, total_time, rmean, rmedian, rmin, rmax, rstd, lmean, lmedian, lmin, lmax, lstd])
                        eval_csv_file.flush()

                    if args.plot:
                        summary_writer.add_scalar('eval/rewards_mean', rmean, T, walltime=total_time)
                        summary_writer.add_scalar('eval/lengths_mean', lmean, T, walltime=total_time)
                else:
                    args.use_openai_test_env = False
                    eval_lengths, eval_rewards = test(args, model, test_env)

                    lmean, lmedian, lmin, lmax, lstd = gen_data(eval_lengths)
                    rmean, rmedian, rmin, rmax, rstd = gen_data(eval_rewards)
                    length_data = '(length) min/max/mean/median: {lmin:4.1f}/{lmax:4.1f}/{lmean:4.1f}/{lmedian:4.1f}'.format(lmin=lmin, lmax=lmax, lmean=lmean, lmedian=lmedian)
                    reward_data = '(reward) min/max/mean/median: {rmin:4.1f}/{rmax:4.1f}/{rmean:4.1f}/{rmedian:4.1f}'.format(rmin=rmin, rmax=rmax, rmean=rmean, rmedian=rmedian)
                    print('[CuLE CPU] [training time: {}] {}'.format(format_time(total_time), ' --- '.join([length_data, reward_data])))

                    if eval_csv_writer and eval_csv_file:
                        eval_csv_writer.writerow([T, total_time, rmean, rmedian, rmin, rmax, rstd, lmean, lmedian, lmin, lmax, lstd])
                        eval_csv_file.flush()

                    if args.plot:
                        summary_writer.add_scalar('eval/rewards_mean', rmean, T, walltime=total_time)
                        summary_writer.add_scalar('eval/lengths_mean', lmean, T, walltime=total_time)

                    args.use_openai_test_env = True
                    eval_lengths, eval_rewards = test(args, model, test_env_oai)

                    lmean, lmedian, lmin, lmax, lstd = gen_data(eval_lengths)
                    rmean, rmedian, rmin, rmax, rstd = gen_data(eval_rewards)
                    length_data = '(length) min/max/mean/median: {lmin:4.1f}/{lmax:4.1f}/{lmean:4.1f}/{lmedian:4.1f}'.format(lmin=lmin, lmax=lmax, lmean=lmean, lmedian=lmedian)
                    reward_data = '(reward) min/max/mean/median: {rmin:4.1f}/{rmax:4.1f}/{rmean:4.1f}/{rmedian:4.1f}'.format(rmin=rmin, rmax=rmax, rmean=rmean, rmedian=rmedian)
                    print('[OpAI CPU] [training time: {}] {}'.format(format_time(total_time), ' --- '.join([length_data, reward_data])))

                    if eval_csv_writer_oai and eval_csv_file_oai:
                        eval_csv_writer_oai.writerow([T, total_time, rmean, rmedian, rmin, rmax, rstd, lmean, lmedian, lmin, lmax, lstd])
                        eval_csv_file_oai.flush()

                    if args.plot:
                        summary_writer_oai.add_scalar('eval/rewards_mean', rmean, T, walltime=total_time)
                        summary_writer_oai.add_scalar('eval/lengths_mean', lmean, T, walltime=total_time)

                    args.use_openai_test_env = use_openai_test_env

        start_time = time.time()

        with torch.no_grad():
            for step in range(args.num_steps_per_update):
                nvtx.range_push('train:step')

                # step
                value, logit = model(states[step0 + step])

                # store values and logits
                values[step0 + step] = value.squeeze(-1)

                # convert actions to numpy and perform next step
                probs = torch.clamp(F.softmax(logit, dim=1), min=0.00001, max=0.99999)
                probs_action = probs.multinomial(1).to(env_device)

                # Check if the multinomial threw an exception
                # https://github.com/pytorch/pytorch/issues/7014
                torch.cuda.current_stream().synchronize()

                observation, reward, done, info = train_env.step(maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device, dtype=torch.bool)
                probs_action = probs_action.to(device=train_device, dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step0 + step].copy_(probs_action.view(-1))
                masks[step0 + step].copy_(not_done)
                rewards[step0 + step].copy_(reward.sign())
                # mus[step0 + step] = F.softmax(logit, dim=1).gather(1, actions[step0 + step].view(-1).unsqueeze(-1)).view(-1)
                mus[step0 + step] = torch.clamp(F.softmax(logit, dim=1).gather(1, actions[step0 + step].view(-1).unsqueeze(-1)).view(-1), min=0.00001, max=0.99999)

                # update next observations
                states[step0 + step + 1, :, :-1].copy_(states[step0 + step, :, 1:])
                states[step0 + step + 1] *= not_done.view(-1, *[1] * (observation.dim() - 1))
                states[step0 + step + 1, :, -1].copy_(observation.view(-1, *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done
                nvtx.range_pop()

        # benchmark - inference
        if args.benchmark:
            if update < (benchmark_steps - 1):
                for step in range(0, args.num_steps_per_update):
                    states[:-1, :, :, :, :] = states[1:, :, :, :, :]
                    rewards[:-1, :] = rewards[1:, :]
                    actions[:-1, :] = actions[1:, :]
                    masks[:-1, :] = masks[1:, :]
                    mus[:-1, :] = mus[1:, :]
                continue
            if update == (benchmark_steps - 1):
                torch.cuda.current_stream().synchronize()
                elapsed_time = time.time() - benchmark_start_time
                fps = benchmark_steps * args.num_ales * args.num_steps_per_update / elapsed_time
                csv_writer.writerow([args.env_name, args.num_ales, elapsed_time / benchmark_steps, fps, backend_name, 'inference'])
                print('Benchmark - inference: ' + str(round(fps)) + ' FPS')

        n_minibatch = (n_minibatch + 1) % args.num_minibatches
        min_ale_index = int(n_minibatch * minibatch_size)
        max_ale_index = min_ale_index + minibatch_size

        nvtx.range_push('train:compute_values')
        value, logit = model(states[:, min_ale_index:max_ale_index, :, :, :].contiguous().view(-1, *states.size()[-3:]))

        batch_value = value.detach().view((args.num_steps + 1, minibatch_size))
        batch_probs = F.softmax(logit.detach()[:(args.num_steps * minibatch_size), :], dim=1)
        batch_pis = batch_probs.gather(1, actions[:, min_ale_index:max_ale_index].contiguous().view(-1).unsqueeze(-1)).view((args.num_steps, minibatch_size))

        returns[-1, min_ale_index:max_ale_index] = batch_value[-1]

        with torch.no_grad():
            for step in reversed(range(args.num_steps)):
                c = torch.clamp(batch_pis[step, :] / mus[step, min_ale_index:max_ale_index], max=args.c_hat)
                rhos[step, :] = torch.clamp(batch_pis[step, :] / mus[step, min_ale_index:max_ale_index], max=args.rho_hat)
                delta_value = rhos[step, :] * (rewards[step, min_ale_index:max_ale_index] + (args.gamma * batch_value[step + 1] - batch_value[step]).squeeze())
                returns[step, min_ale_index:max_ale_index] = \
                    batch_value[step, :].squeeze() + delta_value + args.gamma * c * \
                    (returns[step + 1, min_ale_index:max_ale_index] - batch_value[step + 1, :].squeeze())

        value = value[:args.num_steps * minibatch_size, :]
        logit = logit[:args.num_steps * minibatch_size, :]

        log_probs = F.log_softmax(logit, dim=1)
        probs = F.softmax(logit, dim=1)

        action_log_probs = log_probs.gather(1, actions[:, min_ale_index:max_ale_index].contiguous().view(-1).unsqueeze(-1))
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        advantages = returns[:-1, min_ale_index:max_ale_index].contiguous().view(-1).unsqueeze(-1) - value

        value_loss = advantages.pow(2).mean()
        policy_loss = -(action_log_probs * rhos.view(-1, 1).detach() * \
                        (rewards[:, min_ale_index:max_ale_index].contiguous().view(-1, 1) + args.gamma * \
                         returns[1:, min_ale_index:max_ale_index].contiguous().view(-1, 1) - value).detach()).mean()
        nvtx.range_pop()

        nvtx.range_push('train:backprop')
        loss = value_loss * args.value_loss_coef + policy_loss - dist_entropy * args.entropy_coef
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()
        nvtx.range_pop()

        nvtx.range_push('train:next_states')
        for step in range(0, args.num_steps_per_update):
            states[:-1] = states[1:].clone()
            rewards[:-1] = rewards[1:]
            actions[:-1] = actions[1:]
            masks[:-1] = masks[1:]
            mus[:-1] = mus[1:]
        nvtx.range_pop()

        torch.cuda.synchronize()

        if not args.benchmark:
            if args.rank == 0:
                iter_time = time.time() - start_time
                total_time += iter_time

                if args.plot:
                    summary_writer.add_scalar('train/rewards_mean', final_rewards.mean().item(), T, walltime=total_time)
                    summary_writer.add_scalar('train/lengths_mean', final_lengths.mean().item(), T, walltime=total_time)
                    summary_writer.add_scalar('train/value_loss', value_loss, T, walltime=total_time)
                    summary_writer.add_scalar('train/policy_loss', policy_loss, T, walltime=total_time)
                    summary_writer.add_scalar('train/entropy', dist_entropy, T, walltime=total_time)

                progress_data = callback(args, model, T, iter_time, final_rewards, final_lengths,
                                         value_loss, policy_loss, dist_entropy, train_csv_writer, train_csv_file)
                iterator.set_postfix_str(progress_data)

        # benchmark - training
        if args.benchmark:
            if update == benchmark_steps:
                benchmark_start_time = time.time()
            if update == 2 * benchmark_steps:
                elapsed_time = time.time() - benchmark_start_time
                fps = benchmark_steps * args.num_ales * args.num_steps_per_update / elapsed_time
                csv_writer.writerow([args.env_name, args.num_ales, elapsed_time / benchmark_steps, fps, backend_name, 'training'])
                print('Benchmark - training: ' + str(round(fps)) + ' FPS')
                csv_file.close()
                break

    if args.plot and (args.rank == 0):
        summary_writer.close()

    if args.use_openai:
        train_env.close()
    if args.use_openai_test_env:
        test_env.close()
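
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the benchmark code above): how the reported
# throughput numbers are derived. Each benchmarked iteration generates
# num_ales * num_steps_per_update environment frames, so the frame rate is the
# total frame count divided by wall-clock time. The helper below is a
# hypothetical, self-contained example.
# ---------------------------------------------------------------------------
def benchmark_fps(num_iterations, num_ales, num_steps_per_update, elapsed_time):
    """Frames per second over a benchmarked block of training iterations."""
    total_frames = num_iterations * num_ales * num_steps_per_update
    return total_frames / elapsed_time


# Example: 100 iterations, 1200 ALE instances, 20 steps per update, 60 seconds
# -> 100 * 1200 * 20 / 60 = 40,000 frames per second.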