def log_progress(self):
    episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()

    if len(episode_rewards) > 0:
        self.mean_episode_reward = np.mean(episode_rewards[-100:])

    if len(episode_rewards) > 100:
        self.best_mean_episode_reward = max(self.best_mean_episode_reward,
                                            self.mean_episode_reward)

    if self.t % self.log_every_n_steps == 0 and self.model_initialized:
        logz.log_tabular("TimeStep", self.t)
        logz.log_tabular("MeanReturn", self.mean_episode_reward)
        logz.log_tabular("BestMeanReturn", max(self.best_mean_episode_reward,
                                               self.mean_episode_reward))
        logz.log_tabular("Episodes", len(episode_rewards))
        logz.log_tabular("Exploration", self.exploration.value(self.t))
        logz.log_tabular("LearningRate", self.optimizer_spec.lr_lambda(self.t))
        logz.log_tabular("Time", (time.time() - self.start_time) / 60.)
        logz.dump_tabular()
        logz.save_pytorch_model(self.q_net)
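# `get_wrapper_by_name` is used above but not defined in this excerpt. A minimal
# sketch of what it is assumed to do (walk the gym wrapper chain until a wrapper
# whose class name matches is found); the project's actual helper may differ.
# Assumes `gym` is imported at module level, as in the training scripts below.
def get_wrapper_by_name(env, classname):
    current = env
    while True:
        if classname in current.__class__.__name__:
            return current
        elif isinstance(current, gym.Wrapper):
            current = current.env
        else:
            raise ValueError("Couldn't find wrapper named %s" % classname)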
def train_PG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
             max_path_length, learning_rate, reward_to_go, animate, logdir,
             normalize_advantages, nn_baseline, seed, n_layers, size):

    start = time.time()

    #====================
    # SETUP LOGGER
    #====================
    locals_ = locals()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getfullargspec(train_PG)[0]
    hyperparams = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_hyperparams(hyperparams)

    #====================
    # SETUP ENV
    #====================
    # Make gym env
    env = gym.make(env_name)

    # Set random seeds (torch, numpy and environment)
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Find out if env is continuous or discrete
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #====================
    # INITIALIZE AGENT
    #====================
    neural_network_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline': nn_baseline,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(neural_network_args, sample_trajectory_args, estimate_return_args)

    #====================
    # TRAINING LOOP
    #====================
    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Step 1: sample trajectories from the current policy (no gradients needed)
        with torch.no_grad():
            paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Step 2: compute the returns (Q values, advantages) for this batch.
        # batch_size = sum of all timesteps across all paths.
        ob_no = np.concatenate([path["observation"] for path in paths])  # (batch_size, obs_dim)
        ac_na = np.concatenate([path["action"] for path in paths])       # (batch_size, action_dim)
        re_n = [path["reward"] for path in paths]  # (num_paths,) each entry is the reward array of one path

        with torch.no_grad():
            q_n, adv_n = agent.estimate_return(ob_no, re_n)

        # Step 3: update the policy parameters using the policy gradient
        agent.update_parameters(ob_no, ac_na, q_n, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [len(path["reward"]) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.save_pytorch_model(agent)
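# A minimal, hypothetical invocation of train_PG, only to illustrate the expected
# argument types. The environment name, log directory and hyperparameter values
# below are placeholders, not values taken from the original experiments.
if __name__ == '__main__':
    train_PG(exp_name='vpg_cartpole',
             env_name='CartPole-v0',
             n_iter=100,
             gamma=0.99,
             min_timesteps_per_batch=1000,
             max_path_length=None,      # fall back to env.spec.max_episode_steps
             learning_rate=5e-3,
             reward_to_go=True,
             animate=False,
             logdir='data/vpg_cartpole_example',
             normalize_advantages=True,
             nn_baseline=False,
             seed=1,
             n_layers=2,
             size=64)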
def train(episodes, learning_rate, batch_size, gamma, eps_start, eps_end,
          eps_decay, target_update, max_steps, buffer_size, random_link,
          random_target, repeat_actions, logdir):

    setup_logger(logdir, locals())

    env = environment()
    eval_policy = evaluation(env, logdir)

    env.reset_target_position(random_=True)
    env.reset_robot_position(random_=False)

    # Preprocess rendered frames: grayscale, resize to 64 pixels, convert to tensor
    resize = T.Compose([
        T.ToPILImage(),
        T.Grayscale(num_output_channels=1),
        T.Resize(64, interpolation=Image.BILINEAR),
        T.ToTensor()
    ])

    img = env.render()
    img = torch.from_numpy(img.copy())
    img_height, img_width, _ = img.shape

    policy_net = DQN(img_height, img_width).to(device)
    target_net = DQN(img_height, img_width).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
    memory = Replay_Buffer(buffer_size)

    obs = env.render()
    obs = resize(np.uint8(obs)).unsqueeze(0).to(device)

    steps_ep = 0
    rewards_ep = 0
    successes = 0
    target_upd = 0
    grad_upd = 0
    steps_train = 0
    steps_all = []      # accumulated across episodes between logging points
    rewards_all = []

    for ep in range(1, episodes + 1):
        env.reset_target_position(random_=random_target)
        env.reset_robot_position(random_=random_link)

        rewards_ep = 0
        steps_ep = 0
        sampling_time = 0
        start_time = time.time()

        while True:
            action, eps_threshold = select_actions(obs, eps_start, eps_end,
                                                   eps_decay, steps_train,
                                                   policy_net)
            action = action.to(device)

            reward, done = env.step_(action)
            reward = torch.tensor(reward, dtype=torch.float).view(-1, 1)

            obs_next = env.render()
            obs_next = resize(np.uint8(obs_next)).unsqueeze(0).to(device)

            transition = {'s': obs, 'a': action, 'r': reward, "s'": obs_next}

            steps_ep += 1
            steps_train += 1
            rewards_ep += reward

            memory_state = memory.push(transition)

            # The next observation becomes the current one (reuse the frame
            # already rendered for the transition instead of rendering again)
            obs = obs_next

            if done:
                rewards_all.append(rewards_ep / steps_ep)
                steps_all.append(steps_ep)
                successes += 1
                env.reset_target_position(random_=random_target)
                env.reset_robot_position(random_=random_link)
                break
            elif steps_ep == max_steps:
                rewards_all.append(rewards_ep)
                steps_all.append(steps_ep)
                env.reset_target_position(random_=random_target)
                env.reset_robot_position(random_=random_link)
                break

            status = optimize_model(policy_net, target_net, optimizer, memory,
                                    gamma, batch_size)
            if status is not False:
                grad_upd += 1
                if grad_upd % target_update == 0:
                    # Update target network parameters
                    target_net.load_state_dict(policy_net.state_dict())
                    target_net.eval()
                    target_upd += 1

        end_time = time.time()
        sampling_time += end_time - start_time
        sampling_time /= ep

        if ep % 25 == 0:
            return_val, steps_val = eval_policy.sample_episode(
                policy_net,
                save_video=True if ep % 500 == 0 else False,
                n_episodes=5)
            qvalue_eval = eval_policy.get_qvalue(policy_net)

            logz.log_tabular('Averaged Steps Training',
                             np.around(np.average(steps_all), decimals=0))  # episodes since last log
            logz.log_tabular('Averaged Return Training',
                             np.around(np.average(rewards_all), decimals=2))
            logz.log_tabular('Averaged Steps Validation',
                             np.around(np.average(steps_val), decimals=0))
            logz.log_tabular('Averaged Return Validation',
                             np.around(np.average(return_val), decimals=2))
            logz.log_tabular('Cumulative Successes', successes)
            logz.log_tabular('Number of episodes', ep)
            logz.log_tabular('Sampling time (s)', sampling_time)
            logz.log_tabular('Epsilon threshold', eps_threshold)
            logz.log_tabular('Gradient update', grad_upd)
            logz.log_tabular('Updates target network', target_upd)
            logz.log_tabular('Average q-value evaluation', qvalue_eval)
            logz.dump_tabular()

            steps_all = []
            rewards_all = []

    logz.save_pytorch_model(policy_net.state_dict())
    env.terminate()
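# `select_actions` is called above but not shown. A plausible epsilon-greedy
# sketch with exponential decay, matching the call signature used in train();
# reading the number of discrete actions from the policy network's output is an
# assumption about the DQN architecture, not something confirmed by the source.
import math
import random

def select_actions(obs, eps_start, eps_end, eps_decay, steps_done, policy_net):
    # Exponentially decayed exploration threshold
    eps_threshold = eps_end + (eps_start - eps_end) * math.exp(-1. * steps_done / eps_decay)
    with torch.no_grad():
        q_values = policy_net(obs)              # shape: (1, n_actions)
    if random.random() > eps_threshold:
        action = q_values.max(1)[1].view(1, 1)  # greedy action
    else:
        n_actions = q_values.shape[1]
        action = torch.tensor([[random.randrange(n_actions)]], dtype=torch.long)
    return action, eps_threshold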
def train(BATCH_SIZE, DISCOUNT, ENTROPY_WEIGHT, HIDDEN_SIZE, LEARNING_RATE,
          MAX_STEPS, POLYAK_FACTOR, REPLAY_SIZE, TEST_INTERVAL, UPDATE_INTERVAL,
          UPDATE_START, ENV, OBSERVATION_LOW, VALUE_FNC, FLOW_TYPE, FLOWS,
          DEMONSTRATIONS, PRIORITIZE_REPLAY, BEHAVIOR_CLONING, ARM, BASE, RPA,
          REWARD_DENSE, logdir):

    ALPHA = 0.3
    BETA = 1
    epsilon = 0.0001   # 0.1
    epsilon_d = 0.1    # 0.3
    weights = 1        # 1
    lambda_ac = 0.85   # 0.7
    lambda_bc = 0.3    # 0.4

    setup_logger(logdir, locals())

    ENV = __import__(ENV)
    if ARM and BASE:
        env = ENV.youBotAll('youbot_navig2.ttt', obs_lowdim=OBSERVATION_LOW,
                            rpa=RPA, reward_dense=REWARD_DENSE, boundary=1)
    elif ARM:
        env = ENV.youBotArm('youbot_navig.ttt', obs_lowdim=OBSERVATION_LOW,
                            rpa=RPA, reward_dense=REWARD_DENSE)
    elif BASE:
        env = ENV.youBotBase('youbot_navig.ttt', obs_lowdim=OBSERVATION_LOW,
                             rpa=RPA, reward_dense=REWARD_DENSE, boundary=1)

    action_space = env.action_space
    obs_space = env.observation_space()
    step_limit = env.step_limit()

    if OBSERVATION_LOW:
        actor = SoftActorGated(HIDDEN_SIZE, action_space, obs_space,
                               flow_type=FLOW_TYPE, flows=FLOWS).float().to(device)
        critic_1 = Critic(HIDDEN_SIZE, 1, obs_space, action_space, state_action=True).float().to(device)
        critic_2 = Critic(HIDDEN_SIZE, 1, obs_space, action_space, state_action=True).float().to(device)
    else:
        actor = ActorImageNet(HIDDEN_SIZE, action_space, obs_space,
                              flow_type=FLOW_TYPE, flows=FLOWS).float().to(device)
        critic_1 = Critic(HIDDEN_SIZE, 1, obs_space, action_space, state_action=True).float().to(device)
        critic_2 = Critic(HIDDEN_SIZE, 1, obs_space, action_space, state_action=True).float().to(device)
        # Note: both critics are warm-started from the same saved checkpoint
        critic_1.load_state_dict(torch.load(
            'data/youbot_all_final_21-08-2019_22-32-00/models/critic1_model_473000.pkl'))
        critic_2.load_state_dict(torch.load(
            'data/youbot_all_final_21-08-2019_22-32-00/models/critic1_model_473000.pkl'))

    actor.apply(weights_init)
    # critic_1.apply(weights_init)
    # critic_2.apply(weights_init)

    if VALUE_FNC:
        value_critic = Critic(HIDDEN_SIZE, 1, obs_space, action_space).float().to(device)
        target_value_critic = create_target_network(value_critic).float().to(device)
        value_critic_optimiser = optim.Adam(value_critic.parameters(), lr=LEARNING_RATE)
    else:
        target_critic_1 = create_target_network(critic_1)
        target_critic_2 = create_target_network(critic_2)

    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critics_optimiser = optim.Adam(list(critic_1.parameters()) + list(critic_2.parameters()),
                                   lr=LEARNING_RATE)

    # Replay buffer
    if PRIORITIZE_REPLAY:
        # D = PrioritizedReplayBuffer(REPLAY_SIZE, ALPHA)
        D = ReplayMemory(device, 3, DISCOUNT, 1, BETA, ALPHA, REPLAY_SIZE)
    else:
        D = deque(maxlen=REPLAY_SIZE)

    eval_ = evaluation_sac(env, logdir, device)

    # Automatic entropy tuning init
    target_entropy = -np.prod(action_space).item()
    log_alpha = torch.zeros(1, requires_grad=True, device=device)
    alpha_optimizer = optim.Adam([log_alpha], lr=LEARNING_RATE)

    home = os.path.expanduser('~')
    if DEMONSTRATIONS:
        dir_dem = os.path.join(home, 'robotics_drl/data/demonstrations/', DEMONSTRATIONS)
        D, n_demonstrations = load_buffer_demonstrations(D, dir_dem,
                                                         PRIORITIZE_REPLAY,
                                                         OBSERVATION_LOW)
    else:
        n_demonstrations = 0

    if not BEHAVIOR_CLONING:
        behavior_loss = 0

    os.mkdir(os.path.join(home, 'robotics_drl', logdir, 'models'))
    dir_models = os.path.join(home, 'robotics_drl', logdir, 'models')

    state, done = env.reset(), False
    if OBSERVATION_LOW:
        state = state.float().to(device)
    else:
        state['low'] = state['low'].float()
        state['high'] = state['high'].float()

    pbar = tqdm(range(1, MAX_STEPS + 1), unit_scale=1, smoothing=0)
    steps = 0
    success = 0

    for step in pbar:
        with torch.no_grad():
            if step < UPDATE_START and not DEMONSTRATIONS:
                # To improve exploration, take actions sampled from a uniform
                # random distribution over actions at the start of training
                action = torch.tensor(env.sample_action(), dtype=torch.float32,
                                      device=device).unsqueeze(dim=0)
            else:
                # Observe state s and select action a ~ μ(a|s)
                if not OBSERVATION_LOW:
                    state['low'] = state['low'].float().to(device)
                    state['high'] = state['high'].float().to(device)
                action, _ = actor(state, log_prob=False, deterministic=False)
                if not OBSERVATION_LOW:
                    state['low'] = state['low'].float().cpu()
                    state['high'] = state['high'].float().cpu()
                # if (policy.mean).mean() > 0.4:
                #     print("GOOD VELOCITY")

            # Execute a in the environment and observe next state s', reward r,
            # and done signal d to indicate whether s' is terminal
            next_state, reward, done = env.step(action.squeeze(dim=0).cpu().tolist())
            if OBSERVATION_LOW:
                next_state = next_state.float().to(device)
            else:
                next_state['low'] = next_state['low'].float()
                next_state['high'] = next_state['high'].float()

            # Store (s, a, r, s', d) in replay buffer D
            if PRIORITIZE_REPLAY:
                if OBSERVATION_LOW:
                    D.add(state.cpu().tolist(), action.cpu().squeeze().tolist(),
                          reward, next_state.cpu().tolist(), done)
                else:
                    D.append(state['high'], state['low'],
                             action.cpu().squeeze().tolist(), reward, done)
            else:
                D.append({
                    'state': state.unsqueeze(dim=0) if OBSERVATION_LOW else state,
                    'action': action,
                    'reward': torch.tensor([reward], dtype=torch.float32, device=device),
                    'next_state': next_state.unsqueeze(dim=0) if OBSERVATION_LOW else next_state,
                    'done': torch.tensor([True if reward == 1 else False],
                                         dtype=torch.float32, device=device)
                })
            state = next_state

            # If s' is terminal, reset environment state
            steps += 1
            if done or steps > step_limit:  # TODO: incorporate step limit in the environment
                eval_c2 = True  # TODO: multiprocess pyrep with a session each for testing and training
                steps = 0
                if OBSERVATION_LOW:
                    state = env.reset().float().to(device)
                else:
                    state = env.reset()
                    state['low'] = state['low'].float()
                    state['high'] = state['high'].float()
                if reward == 1:
                    success += 1

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            for _ in range(1):
                # Randomly sample a batch of transitions B = {(s, a, r, s', d)} from D
                if PRIORITIZE_REPLAY:
                    if OBSERVATION_LOW:
                        state_batch, action_batch, reward_batch, state_next_batch, \
                            done_batch, weights_pr, idxes = D.sample(BATCH_SIZE, BETA)
                        state_batch = torch.from_numpy(state_batch).float().to(device)
                        next_state_batch = torch.from_numpy(state_next_batch).float().to(device)
                        action_batch = torch.from_numpy(action_batch).float().to(device)
                        reward_batch = torch.from_numpy(reward_batch).float().to(device)
                        done_batch = torch.from_numpy(done_batch).float().to(device)
                        weights_pr = torch.from_numpy(weights_pr).float().to(device)
                    else:
                        idxes, high_state_batch, low_state_batch, action_batch, \
                            reward_batch, high_state_next_batch, low_state_next_batch, \
                            done_batch, weights_pr = D.sample(BATCH_SIZE)
                        state_batch = {
                            'low': low_state_batch.float().to(device).view(-1, 32),
                            'high': high_state_batch.float().to(device).view(-1, 12, 128, 128)
                        }
                        next_state_batch = {
                            'low': low_state_next_batch.float().to(device).view(-1, 32),
                            'high': high_state_next_batch.float().to(device).view(-1, 12, 128, 128)
                        }
                        action_batch = action_batch.float().to(device)
                        reward_batch = reward_batch.float().to(device)
                        done_batch = done_batch.float().to(device)
                        weights_pr = weights_pr.float().to(device)

                    # for j in range(BATCH_SIZE):
                    #     new_state_batch['high'] = torch.cat((new_state_batch['high'], state_batch[j].tolist()['high'].view(-1, (3+1)*env.frames, 128, 128)), dim=0)
                    #     new_state_batch['low'] = torch.cat((new_state_batch['low'], state_batch[j].tolist()['low'].view(-1, 32)), dim=0)
                    #     new_next_state_batch['high'] = torch.cat((new_next_state_batch['high'], state_next_batch[j].tolist()['high'].view(-1, (3+1)*env.frames, 128, 128)), dim=0)
                    #     new_next_state_batch['low'] = torch.cat((new_next_state_batch['low'], state_next_batch[j].tolist()['low'].view(-1, 32)), dim=0)
                    # new_state_batch['high'] = new_state_batch['high'].to(device)
                    # new_state_batch['low'] = new_state_batch['low'].to(device)
                    # new_next_state_batch['high'] = new_next_state_batch['high'].to(device)
                    # new_next_state_batch['low'] = new_next_state_batch['low'].to(device)

                    batch = {
                        'state': state_batch,
                        'action': action_batch,
                        'reward': reward_batch,
                        'next_state': next_state_batch,
                        'done': done_batch
                    }
                    state_batch = []
                    state_next_batch = []
                else:
                    batch = random.sample(D, BATCH_SIZE)
                    state_batch = []
                    action_batch = []
                    reward_batch = []
                    state_next_batch = []
                    done_batch = []
                    for d in batch:
                        state_batch.append(d['state'])
                        action_batch.append(d['action'])
                        reward_batch.append(d['reward'])
                        state_next_batch.append(d['next_state'])
                        done_batch.append(d['done'])
                    batch = {
                        'state': torch.cat(state_batch, dim=0),
                        'action': torch.cat(action_batch, dim=0),
                        'reward': torch.cat(reward_batch, dim=0),
                        'next_state': torch.cat(state_next_batch, dim=0),
                        'done': torch.cat(done_batch, dim=0)
                    }

                action, log_prob = actor(batch['state'], log_prob=True, deterministic=False)

                # Automatic entropy tuning
                alpha_loss = -(log_alpha.float() *
                               (log_prob + target_entropy).float().detach()).mean()
                alpha_optimizer.zero_grad()
                alpha_loss.backward()
                alpha_optimizer.step()
                alpha = log_alpha.exp()
                weighted_sample_entropy = (alpha.float() * log_prob).view(-1, 1)

                # Compute targets for Q and V functions
                if VALUE_FNC:
                    y_q = batch['reward'] + DISCOUNT * (1 - batch['done']) * \
                        target_value_critic(batch['next_state'])
                    y_v = torch.min(
                        critic_1(batch['state']['low'], action.detach()),
                        critic_2(batch['state']['low'], action.detach())
                    ) - weighted_sample_entropy.detach()
                else:
                    # No value function network
                    with torch.no_grad():
                        next_actions, next_log_prob = actor(batch['next_state'],
                                                            log_prob=True,
                                                            deterministic=False)
                        target_qs = torch.min(
                            target_critic_1(batch['next_state']['low'] if not OBSERVATION_LOW else batch['next_state'], next_actions),
                            target_critic_2(batch['next_state']['low'] if not OBSERVATION_LOW else batch['next_state'], next_actions)
                        ) - alpha * next_log_prob
                    y_q = batch['reward'] + DISCOUNT * (1 - batch['done']) * target_qs.detach()

                td_error_critic1 = critic_1(batch['state']['low'] if not OBSERVATION_LOW else batch['state'],
                                            batch['action']) - y_q
                td_error_critic2 = critic_2(batch['state']['low'] if not OBSERVATION_LOW else batch['state'],
                                            batch['action']) - y_q

                q_loss = td_error_critic1.pow(2).mean() + td_error_critic2.pow(2).mean()
                # q_loss = (F.mse_loss(critic_1(batch['state'], batch['action']), y_q) +
                #           F.mse_loss(critic_2(batch['state'], batch['action']), y_q)).mean()
                critics_optimiser.zero_grad()
                q_loss.backward()
                critics_optimiser.step()

                # Compute priorities, taking demonstrations into account
                if PRIORITIZE_REPLAY:
                    td_error = weights_pr * (td_error_critic1.detach() +
                                             td_error_critic2.detach()).mean()
                    action_dem = torch.tensor([]).to(device)
                    if OBSERVATION_LOW:
                        state_dem = torch.tensor([]).to(device)
                    else:
                        state_dem = {
                            'low': torch.tensor([]).float().to(device),
                            'high': torch.tensor([]).float().to(device)
                        }
                    priorities = torch.abs(td_error).tolist()

                    i = 0
                    count_dem = 0
                    for idx in idxes:
                        priorities[i] += epsilon
                        if idx < n_demonstrations:
                            priorities[i] += epsilon_d
                            count_dem += 1
                            if BEHAVIOR_CLONING:
                                action_dem = torch.cat((action_dem,
                                                        batch['action'][i].view(1, -1)), dim=0)
                                if OBSERVATION_LOW:
                                    state_dem = torch.cat((state_dem,
                                                           batch['state'][i].view(1, -1)), dim=0)
                                else:
                                    state_dem['high'] = torch.cat(
                                        (state_dem['high'],
                                         batch['state']['high'][i, ].view(-1, (3 + 1) * env.frames, 128, 128)),
                                        dim=0)
                                    state_dem['low'] = torch.cat(
                                        (state_dem['low'],
                                         batch['state']['low'][i, ].view(-1, 32)),
                                        dim=0)
                        i += 1

                    if action_dem.nelement() != 0:
                        actual_action_dem, _ = actor(state_dem, log_prob=False, deterministic=True)
                        # q_value_actor = (critic_1(batch['state'][i], batch['action'][i]) + critic_2(batch['state'][i], batch['action'][i])) / 2
                        # q_value_actual = (critic_1(batch['state'][i], actual_action_dem) + critic_2(batch['state'][i], actual_action_dem)) / 2
                        # if q_value_actor > q_value_actual:  # Q filter
                        behavior_loss = F.mse_loss(action_dem, actual_action_dem).unsqueeze(dim=0)
                    else:
                        behavior_loss = 0

                    D.update_priorities(idxes, priorities)
                    lambda_bc = (count_dem / BATCH_SIZE) / 5

                # Update V-function by one step of gradient descent
                if VALUE_FNC:
                    v_loss = (value_critic(batch['state']) - y_v).pow(2).mean().to(device)
                    value_critic_optimiser.zero_grad()
                    v_loss.backward()
                    value_critic_optimiser.step()

                # Update policy by one step of gradient ascent
                with torch.no_grad():
                    new_qs = torch.min(
                        critic_1(batch['state']['low'] if not OBSERVATION_LOW else batch['state'], action),
                        critic_2(batch['state']['low'] if not OBSERVATION_LOW else batch['state'], action))
                policy_loss = lambda_ac * (weighted_sample_entropy.view(-1) - new_qs).mean().to(device) \
                    + lambda_bc * behavior_loss
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Update target networks
                if VALUE_FNC:
                    update_target_network(value_critic, target_value_critic, POLYAK_FACTOR)
                else:
                    update_target_network(critic_1, target_critic_1, POLYAK_FACTOR)
                    update_target_network(critic_2, target_critic_2, POLYAK_FACTOR)
                state_dem = []

        # Continue sampling transitions until the episode is done and evaluation is on
        if step > UPDATE_START and step % TEST_INTERVAL == 0:
            eval_c = True
        else:
            eval_c = False

        if eval_c and eval_c2:
            eval_c = False
            eval_c2 = False
            actor.eval()
            critic_1.eval()
            critic_2.eval()

            q_value_eval = eval_.get_qvalue(critic_1, critic_2)
            return_ep, steps_ep = eval_.sample_episode(actor)

            logz.log_tabular('Training steps', step)
            logz.log_tabular('Cumulative Success', success)
            logz.log_tabular('Validation return', return_ep.mean())
            logz.log_tabular('Validation steps', steps_ep.mean())
            logz.log_tabular('Validation return std', return_ep.std())
            logz.log_tabular('Validation steps std', steps_ep.std())
            logz.log_tabular('Q-value evaluation', q_value_eval)
            logz.log_tabular('Q-network loss', q_loss.detach().cpu().numpy())
            if VALUE_FNC:
                logz.log_tabular('Value-network loss', v_loss.detach().cpu().numpy())
            logz.log_tabular('Policy-network loss', policy_loss.detach().cpu().squeeze().numpy())
            logz.log_tabular('Alpha loss', alpha_loss.detach().cpu().numpy())
            logz.log_tabular('Alpha', alpha.detach().cpu().squeeze().numpy())
            logz.log_tabular('Demonstrations current batch', count_dem)
            logz.dump_tabular()

            logz.save_pytorch_model(actor.state_dict())
            torch.save(actor.state_dict(),
                       os.path.join(dir_models, 'actor_model_%s.pkl' % step))
            torch.save(critic_1.state_dict(),
                       os.path.join(dir_models, 'critic1_model_%s.pkl' % step))
            # Save the second critic under its own name so it does not overwrite critic 1
            torch.save(critic_2.state_dict(),
                       os.path.join(dir_models, 'critic2_model_%s.pkl' % step))
            # pbar.set_description('Step: %i | Reward: %f' % (step, return_ep.mean()))

            actor.train()
            critic_1.train()
            critic_2.train()

    env.terminate()
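# `create_target_network` and `update_target_network` are used above but not
# defined in this excerpt. A common implementation is a deep copy with frozen
# parameters plus Polyak averaging; this sketch assumes that convention (target
# weighted by POLYAK_FACTOR), which may differ from the project's own helpers.
import copy

def create_target_network(network):
    target = copy.deepcopy(network)
    for param in target.parameters():
        param.requires_grad = False  # targets are never updated by an optimiser
    return target

def update_target_network(network, target_network, polyak_factor):
    # target <- polyak * target + (1 - polyak) * online
    with torch.no_grad():
        for param, target_param in zip(network.parameters(), target_network.parameters()):
            target_param.data.mul_(polyak_factor)
            target_param.data.add_((1 - polyak_factor) * param.data)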
def train_PG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
             max_path_length, learning_rate, reward_to_go, animate, logdir,
             normalize_advantages, nn_baseline, seed, n_layers, size):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    neural_network_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline': nn_baseline,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(neural_network_args, sample_trajectory_args, estimate_return_args)

    #========================================================================================#
    # Training Loop
    #========================================================================================#
    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        with torch.no_grad():  # use torch.no_grad to disable gradient calculation
            paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by
        # concatenating across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = [path["reward"] for path in paths]

        with torch.no_grad():
            q_n, adv_n = agent.estimate_return(ob_no, re_n)

        agent.update_parameters(ob_no, ac_na, q_n, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.save_pytorch_model(agent)
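# `pathlength` is not defined in this excerpt; given that the other policy
# gradient script measures episode length as len(path["reward"]), it is
# presumably just the following one-liner:
def pathlength(path):
    return len(path["reward"])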
def train_AC(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
             max_path_length, actor_learning_rate, critic_learning_rate,
             num_target_updates, num_grad_steps_per_target_update, animate,
             logdir, normalize_advantages, seed, actor_n_layers,
             critic_n_layers, size):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    neural_network_args = {
        'actor_n_layers': actor_n_layers,
        'critic_n_layers': critic_n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'actor_learning_rate': actor_learning_rate,
        'critic_learning_rate': critic_learning_rate,
        'num_target_updates': num_target_updates,
        'num_grad_steps_per_target_update': num_grad_steps_per_target_update,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_advantage_args = {
        'gamma': gamma,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(neural_network_args, sample_trajectory_args, estimate_advantage_args)

    #========================================================================================#
    # Training Loop
    #========================================================================================#
    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        with torch.no_grad():  # use torch.no_grad to disable gradient calculation
            paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by
        # concatenating across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate([path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])

        # Use the agent to:
        #   (1) update the critic, by calling agent.update_critic
        #   (2) use the updated critic to compute the advantage, by calling agent.estimate_advantage
        #   (3) use the estimated advantage values to update the actor, by calling agent.update_actor
        # YOUR CODE HERE
        raise NotImplementedError

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.save_pytorch_model(agent)