def on_policy(self):
    """Run ``n_steps`` environment steps on-policy and gather the update data.

    Every transition is also appended to ``self.replay_buffer``; finished
    episodes are recorded in ``self.rewards`` and ``self.episode_lengths``;
    ``self.shared_counter`` is incremented by the number of steps taken.

    Returns:
        training_data (list): A list of TraceTrainingData objects, one for each
            episode run. Only the last object may contain a ``last_state``
            attribute corresponding to the state at which the last episode
            was cut.
    """
    steps_taken = 0
    # An episode carried over from a previous call needs a container right
    # away; a finished one gets its container when it is reset below.
    training_data = [] if self.done else [TraceTrainingData()]

    while steps_taken < self.n_steps:
        if self.done:
            # Re-initialize objects for new episode
            self.cur_state = utils.state_to_tensor(self.env.reset())
            self.done = False
            training_data.append(TraceTrainingData())
            if self.episode_rewards:
                self.rewards.append(sum(self.episode_rewards))
                self.episode_lengths.append(len(self.episode_rewards))
            self.episode_rewards = []

        # Compute policy and q_values. Elements used in training are not
        # detached, as this saves computations in _train()
        policy, q_values = self.model(self.cur_state)
        value = (policy * q_values).sum(dim=1, keepdim=True)
        with torch.no_grad():
            avg_policy, _ = self.shared_avg_model(self.cur_state)

        action = torch.multinomial(policy, num_samples=1)[0, 0]
        action_index = action.item()
        next_state, reward, done, _ = self.env.step(action_index)
        next_state = utils.state_to_tensor(next_state)

        # Save transition in replay buffer. The reward is stored as a float
        # tensor (same as the on-policy data below) so that non-integer
        # rewards are not truncated.
        self.replay_buffer.append_transition((
            self.cur_state,
            torch.LongTensor([[action_index]]),
            policy.detach(),
            torch.Tensor([[reward]]),
            done,
        ))

        # Save data for training (all tensors have first dimension 1)
        training_data[-1].append(
            action=torch.LongTensor([[action_index]]),
            policy=policy,
            q_values=q_values,
            value=value,
            reward=torch.Tensor([[reward]]),
            average_policy=avg_policy,
        )

        # Update loop data
        steps_taken += 1
        self.done = done
        self.cur_state = next_state
        self.episode_rewards.append(reward)

    if not self.done:
        # The rollout stopped mid-episode: remember where it was cut
        training_data[-1].last_state = self.cur_state
        # Notify termination to the replay buffer
        self.replay_buffer.cutoff(self.cur_state)

    self.shared_counter.increment(steps_taken)
    return training_data
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    """Training worker: on-policy rollouts followed by optional replay updates.

    The local model is re-synchronised from ``shared_model`` before every
    rollout of at most ``args.t_max`` steps, and each rollout is handed to
    ``_train``. Unless ``args.on_policy`` is set, transitions are also stored
    in an ``EpisodicReplayMemory`` and, once it holds ``args.replay_start``
    entries, a Poisson-distributed number of replayed batches is trained on
    after each episode.

    Args:
        rank: Worker index; offsets the random seeds.
        args: Parsed options (``seed``, ``use_cuda``, ``env``, ``hidden_size``,
            ``on_policy``, ``memory_capacity``, ``num_processes``,
            ``max_episode_length``, ``T_max``, ``t_max``, ``reward_clip``,
            ``replay_start``, ``replay_ratio``, ``batch_size``).
        T: Shared step counter exposing ``value()`` and ``increment()``.
        shared_model: Model whose weights are copied into the local model.
        shared_average_model: Average-policy model; it is always fed CPU
            tensors here.
        optimiser: Optimiser forwarded to ``_train``.
    """
    torch.manual_seed(args.seed + rank)
    if args.use_cuda:
        torch.cuda.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)

    gpu_id = 0 if args.use_cuda else -1  # TODO: 0 denotes the first GPU
    use_gpu = gpu_id >= 0
    if use_gpu:
        model = model.cuda()
    model.train()

    if not args.on_policy:
        # Normalise memory capacity by number of training processes
        memory = EpisodicReplayMemory(
            args.memory_capacity // args.num_processes,
            args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            if use_gpu:
                with torch.cuda.device(gpu_id):
                    model.load_state_dict(shared_model.state_dict())
            else:
                model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                avg_hx = torch.zeros(1, args.hidden_size)
                avg_cx = torch.zeros(1, args.hidden_size)
                if use_gpu:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(1, args.hidden_size).cuda()
                        cx = torch.zeros(1, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(1, args.hidden_size)
                    cx = torch.zeros(1, args.hidden_size)
                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                if use_gpu:
                    state = state.cuda()
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time (allows
                # freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs = [], [], []
            actions, rewards, average_policies = [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, Q, V, (hx, cx) = model(state, (hx, cx))

                # The shared average model lives on the CPU, so the state
                # has to be moved there first
                to_avg_state = state.cpu() if use_gpu else state
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    to_avg_state, (avg_hx, avg_cx))

                # Sample action
                action = torch.multinomial(policy, 1)[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action.item())
                next_state = state_to_tensor(next_state)
                if use_gpu:
                    next_state = next_state.cuda()
                # Optionally clamp rewards
                if args.reward_clip:
                    reward = min(max(reward, -1), 1)
                # Stop episodes at a max length
                done = done or episode_length >= args.max_episode_length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline
                    # training; save just tensors
                    memory.append(state, action, reward, policy.detach())

                # Save outputs for online training
                policies.append(policy)
                Qs.append(Q)
                Vs.append(V)
                actions.append(torch.LongTensor([[action]]))
                rewards.append(torch.Tensor([[reward]]))
                average_policies.append(average_policy)

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not
            # directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = torch.zeros(1, 1)
                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(state, None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(state, (hx, cx))
                Qret = Qret.detach().cpu()

            # Train the network on-policy; model outputs are moved to the CPU
            # first so they match the CPU-side targets
            if use_gpu:
                Qs = [q.cpu() for q in Qs]
                Vs = [v.cpu() for v in Vs]
                policies = [p.cpu() for p in policies]
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been
        # collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episode
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                avg_hx = torch.zeros(args.batch_size, args.hidden_size)
                avg_cx = torch.zeros(args.batch_size, args.hidden_size)
                if use_gpu:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                        cx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                else:
                    hx = torch.zeros(args.batch_size, args.hidden_size)
                    cx = torch.zeros(args.batch_size, args.hidden_size)

                # Lists of outputs for training
                policies, Qs, Vs, actions = [], [], [], []
                rewards, old_policies, average_policies = [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    current, following = trajectories[i], trajectories[i + 1]

                    # Unpack first half of transition
                    state = torch.cat(
                        tuple(step.state for step in current), 0)
                    action = torch.LongTensor(
                        [step.action for step in current]).unsqueeze(1)
                    reward = torch.Tensor(
                        [step.reward for step in current]).unsqueeze(1)
                    old_policy = torch.cat(
                        tuple(step.policy for step in current), 0)
                    if use_gpu:
                        # Stored policies were produced on the GPU, while
                        # _train is given CPU tensors everywhere else
                        old_policy = old_policy.cpu()

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(state, (hx, cx))
                    # As in the on-policy loop, the average model needs a
                    # CPU copy of the (GPU-resident) replayed states
                    to_avg_state = state.cpu() if use_gpu else state
                    average_policy, _, _, (avg_hx, avg_cx) = (
                        shared_average_model(to_avg_state, (avg_hx, avg_cx)))

                    # Save outputs for offline training
                    policies.append(policy)
                    Qs.append(Q)
                    Vs.append(V)
                    actions.append(action)
                    rewards.append(reward)
                    average_policies.append(average_policy)
                    old_policies.append(old_policy)

                    # Unpack second half of transition
                    next_state = torch.cat(
                        tuple(step.state for step in following), 0)
                    done = torch.Tensor(
                        [step.action is None for step in following]
                    ).unsqueeze(1)

                # Do forward pass for all transitions
                _, _, Qret, _ = model(next_state, (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise. Qret is moved
                # to the CPU before masking because `done` is a CPU tensor.
                Qret = (1 - done) * Qret.detach().cpu()

                # Train the network off-policy
                if use_gpu:
                    Qs = [q.cpu() for q in Qs]
                    Vs = [v.cpu() for v in Vs]
                    policies = [p.cpu() for p in policies]
                _train(args, T, model, shared_model, shared_average_model,
                       optimiser, policies, Qs, Vs, actions, rewards, Qret,
                       average_policies, old_policies=old_policies)
        done = True

    env.close()
def test(rank, args, T, shared_model):
    """Evaluation worker: periodically score the shared model greedily.

    Whenever ``args.evaluation_interval`` steps of the shared counter have
    passed, ``args.evaluation_episodes`` episodes are played with the greedy
    action, the averages are printed, the rewards are plotted via
    ``plot_line`` and the weights are saved to ``model.pth``. With
    ``args.evaluate`` set, the function returns after the first evaluation.

    Args:
        rank: Worker index; offsets the random seed.
        args: Parsed options (``seed``, ``env``, ``hidden_size``, ``T_max``,
            ``evaluation_episodes``, ``render``, ``max_episode_length``,
            ``evaluate``, ``evaluation_interval``).
        T: Shared step counter exposing ``value()``.
        shared_model: Model whose weights are copied before every episode.
    """
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)
    model.eval()

    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards, steps = [], []  # Rewards and steps for plotting
    step_width = str(len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode

    # try/finally so the environment is also released on the early return
    # taken in evaluate-only mode
    try:
        while T.value() <= args.T_max:
            if can_test:
                t_start = T.value()  # Reset counter

                # Evaluate over several episodes and average results
                avg_rewards, avg_episode_lengths = [], []
                for _ in range(args.evaluation_episodes):
                    while True:
                        # Reset or pass on hidden state
                        if done:
                            # Sync with shared model every episode
                            model.load_state_dict(shared_model.state_dict())
                            hx = Variable(torch.zeros(1, args.hidden_size),
                                          volatile=True)
                            cx = Variable(torch.zeros(1, args.hidden_size),
                                          volatile=True)
                            # Reset environment and done flag
                            state = state_to_tensor(env.reset())
                            done, episode_length = False, 0
                            reward_sum = 0

                        # Optionally render validation states
                        if args.render:
                            env.render()

                        # Calculate policy; the hidden state is detached to
                        # break the graph for memory efficiency
                        policy, _, _, (hx, cx) = model(
                            Variable(state, volatile=True),
                            (hx.detach(), cx.detach()))

                        # Choose action greedily
                        action = policy.max(1)[1].data[0, 0]

                        # Step
                        state, reward, done, _ = env.step(action)
                        state = state_to_tensor(state)
                        reward_sum += reward
                        # Stop episodes at a max length
                        done = done or episode_length >= args.max_episode_length
                        episode_length += 1  # Increase episode counter

                        # Log and reset statistics at the end of every episode
                        if done:
                            avg_rewards.append(reward_sum)
                            avg_episode_lengths.append(episode_length)
                            break

                print(('[{}] Step: {:<' + step_width +
                       '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                           datetime.utcnow().strftime(
                               '%Y-%m-%d %H:%M:%S,%f')[:-3],
                           t_start,
                           sum(avg_rewards) / args.evaluation_episodes,
                           sum(avg_episode_lengths) / args.evaluation_episodes))

                if args.evaluate:
                    return

                rewards.append(avg_rewards)  # Keep all evaluations
                steps.append(t_start)
                plot_line(steps, rewards)  # Plot rewards
                torch.save(model.state_dict(), 'model.pth')  # Save model params
                can_test = False  # Finish testing
            elif T.value() - t_start >= args.evaluation_interval:
                can_test = True

            time.sleep(0.001)  # Check if available to test every millisecond
    finally:
        env.close()
def test(rank, args, T, shared_model):
    """Evaluation worker that also logs results under ``results/<args.name>``.

    Whenever ``args.evaluation_interval`` steps of the shared counter have
    passed, ``args.evaluation_episodes`` greedy episodes are played. Each
    evaluation is printed, appended to ``results.csv``, dumped (with all
    earlier evaluations) to ``results.pck``, plotted via ``plot_line`` and
    followed by saving the weights to ``model.pth``. With ``args.evaluate``
    set, the function returns after the first evaluation has been logged.

    Args:
        rank: Worker index; offsets the random seed.
        args: Parsed options (``seed``, ``env``, ``hidden_size``, ``name``,
            ``T_max``, ``evaluation_episodes``, ``render``,
            ``max_episode_length``, ``evaluate``, ``evaluation_interval``).
        T: Shared step counter exposing ``value()``.
        shared_model: Model whose weights are copied before every episode.
    """
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)
    model.eval()

    save_dir = os.path.join('results', args.name)
    # Make sure the output directory exists before the first write
    os.makedirs(save_dir, exist_ok=True)
    pickle_path = os.path.join(save_dir, 'results.pck')
    csv_path = os.path.join(save_dir, 'results.csv')

    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards, steps = [], []  # Rewards and steps for plotting
    step_width = str(len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode
    # stores step, reward, avg_steps and time
    results_dict = {'t': [], 'reward': [], 'avg_steps': [], 'time': []}

    # try/finally so the environment is also released on the early return
    # taken in evaluate-only mode
    try:
        while T.value() <= args.T_max:
            if can_test:
                t_start = T.value()  # Reset counter

                # Evaluate over several episodes and average results
                avg_rewards, avg_episode_lengths = [], []
                for _ in range(args.evaluation_episodes):
                    while True:
                        # Reset or pass on hidden state
                        if done:
                            # Sync with shared model every episode
                            model.load_state_dict(shared_model.state_dict())
                            hx = torch.zeros(1, args.hidden_size)
                            cx = torch.zeros(1, args.hidden_size)
                            # Reset environment and done flag
                            state = state_to_tensor(env.reset())
                            done, episode_length = False, 0
                            reward_sum = 0

                        # Optionally render validation states
                        if args.render:
                            env.render()

                        # Calculate policy
                        with torch.no_grad():
                            policy, _, _, (hx, cx), _ = model(state, (hx, cx))

                        # Choose action greedily
                        action = policy.max(1)[1][0]

                        # Step
                        state, reward, done, _ = env.step(action.item())
                        state = state_to_tensor(state)
                        reward_sum += reward
                        # Stop episodes at a max length
                        done = done or episode_length >= args.max_episode_length
                        episode_length += 1  # Increase episode counter

                        # Log and reset statistics at the end of every episode
                        if done:
                            avg_rewards.append(reward_sum)
                            avg_episode_lengths.append(episode_length)
                            break

                # Compute the summary once so the console, CSV and pickle
                # records all carry exactly the same numbers
                mean_reward = sum(avg_rewards) / args.evaluation_episodes
                mean_length = sum(avg_episode_lengths) / args.evaluation_episodes

                print(('[{}] Step: {:<' + step_width +
                       '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                           datetime.utcnow().strftime(
                               '%Y-%m-%d %H:%M:%S,%f')[:-3],
                           t_start, mean_reward, mean_length))

                logged_at = str(datetime.now())
                fields = [t_start, mean_reward, mean_length, logged_at]

                # storing data in the dictionary.
                results_dict['t'].append(t_start)
                results_dict['reward'].append(mean_reward)
                results_dict['avg_steps'].append(mean_length)
                results_dict['time'].append(logged_at)

                # Dumping the results in pickle format
                with open(pickle_path, 'wb') as f:
                    pickle.dump(results_dict, f)

                # Saving the data in csv format; newline='' lets the csv
                # module control line endings (avoids blank rows on Windows)
                with open(csv_path, 'a', newline='') as f:
                    csv.writer(f).writerow(fields)

                if args.evaluate:
                    return

                rewards.append(avg_rewards)  # Keep all evaluations
                steps.append(t_start)
                plot_line(steps, rewards, save_dir)  # Plot rewards
                # Save model params
                torch.save(model.state_dict(),
                           os.path.join(save_dir, 'model.pth'))
                can_test = False  # Finish testing
            elif T.value() - t_start >= args.evaluation_interval:
                can_test = True

            time.sleep(0.001)  # Check if available to test every millisecond

        # Dumping the results in pickle format
        with open(pickle_path, 'wb') as f:
            pickle.dump(results_dict, f)
    finally:
        env.close()
def train(rank, args, T, shared_model, optimiser):
    """Training worker: on-policy rollouts of at most ``args.t_max`` steps.

    Before each rollout the local model is re-synchronised from
    ``shared_model``; afterwards the collected policies, values, actions and
    rewards are handed to ``_train`` together with the bootstrap value ``R``
    (zero if the episode ended, otherwise the detached value of the state
    reached).

    Args:
        rank: Worker index; offsets the random seed.
        args: Parsed options (``seed``, ``env``, ``hidden_size``, ``T_max``,
            ``t_max``, ``reward_clip``, ``max_episode_length``).
        T: Shared step counter exposing ``value()`` and ``increment()``.
        shared_model: Model whose weights are copied into the local model.
        optimiser: Optimiser forwarded to ``_train``.
    """
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)
    model.train()

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        while True:
            # Sync with shared model
            model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t
            # Values saved for computing gradients
            policies, Vs, actions, rewards = [], [], [], []

            # Reset or pass on hidden state
            if done:
                hx = Variable(torch.zeros(1, args.hidden_size))
                cx = Variable(torch.zeros(1, args.hidden_size))
                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time (allows
                # freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, V, (hx, cx) = model(Variable(state), (hx, cx))

                # Sample action
                action = policy.multinomial().data[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action)
                next_state = state_to_tensor(next_state)
                # Optionally clamp rewards
                if args.reward_clip:
                    reward = min(max(reward, -1), 1)
                # Stop episodes at a max length
                done = done or episode_length >= args.max_episode_length
                episode_length += 1  # Increase episode counter

                # Save outputs for online training
                policies.append(policy)
                Vs.append(V)
                actions.append(Variable(torch.LongTensor([[action]])))
                rewards.append(Variable(torch.Tensor([[reward]])))

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            if done:
                # R = 0 for terminal s
                R = Variable(torch.zeros(1, 1))
            else:
                # R = V(s_i; θ) for non-terminal s
                _, R, _ = model(Variable(state), (hx, cx))
                R = R.detach()

            # Train the network on-policy
            _train(args, T, model, shared_model, optimiser, policies, Vs,
                   actions, rewards, R)

            # Finish episode
            if done:
                break

    # Release the environment once the global step budget is exhausted
    env.close()
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    """Training worker whose model input is the state extended with the
    previous action (one-hot) and previous reward.

    On-policy rollouts of at most ``args.t_max`` steps are handed to
    ``_train``. Unless ``args.on_policy`` is set, transitions are also stored
    in an ``EpisodicReplayMemory`` and, once it holds ``args.replay_start``
    entries, a Poisson-distributed number of replayed batches is trained on
    after each episode.

    Args:
        rank: Worker index; offsets the random seed.
        args: Parsed options (``seed``, ``env``, ``hidden_size``,
            ``on_policy``, ``memory_capacity``, ``max_episode_length``,
            ``T_max``, ``t_max``, ``reward_clip``, ``replay_start``,
            ``replay_ratio``, ``batch_size``).
        T: Shared step counter exposing ``value()`` and ``increment()``.
        shared_model: Model whose weights are copied into the local model.
        shared_average_model: Average-policy model evaluated alongside the
            local model.
        optimiser: Optimiser forwarded to ``_train``.
    """
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    action_size = env.action_space.n
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)
    model.train()

    if not args.on_policy:
        memory = EpisodicReplayMemory(args.memory_capacity,
                                      args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                hx = Variable(torch.zeros(1, args.hidden_size))
                avg_hx = Variable(torch.zeros(1, args.hidden_size))
                cx = Variable(torch.zeros(1, args.hidden_size))
                avg_cx = Variable(torch.zeros(1, args.hidden_size))
                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                action, reward, done, episode_length = 0, 0, False, 0
            else:
                # Perform truncated backpropagation-through-time (allows
                # freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs = [], [], []
            actions, rewards, average_policies = [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                model_input = extend_input(
                    state, action_to_one_hot(action, action_size), reward)
                policy, Q, V, (hx, cx) = model(Variable(model_input), (hx, cx))
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    Variable(model_input), (avg_hx, avg_cx))

                # Sample action (graph broken as loss for stochastic action
                # calculated manually)
                action = policy.multinomial().data[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action)
                next_state = state_to_tensor(next_state)
                # Optionally clamp rewards
                if args.reward_clip:
                    reward = min(max(reward, -1), 1)
                # Stop episodes at a max length
                done = done or episode_length >= args.max_episode_length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline
                    # training; save just tensors
                    memory.append(model_input, action, reward, policy.data)

                # Save outputs for online training
                policies.append(policy)
                Qs.append(Q)
                Vs.append(V)
                actions.append(Variable(torch.LongTensor([[action]])))
                rewards.append(Variable(torch.Tensor([[reward]])))
                average_policies.append(average_policy)

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not
            # directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = Variable(torch.zeros(1, 1))
                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(
                        extend_input(state,
                                     action_to_one_hot(action, action_size),
                                     reward),
                        None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s. The input must be
                # rebuilt from the state just reached; reusing the last
                # rollout input would evaluate the previous state instead.
                next_input = extend_input(
                    state, action_to_one_hot(action, action_size), reward)
                _, _, Qret, _ = model(Variable(next_input), (hx, cx))
                Qret = Qret.detach()

            # Train the network on-policy
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been
        # collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episode
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                hx = Variable(torch.zeros(args.batch_size, args.hidden_size))
                avg_hx = Variable(
                    torch.zeros(args.batch_size, args.hidden_size))
                cx = Variable(torch.zeros(args.batch_size, args.hidden_size))
                avg_cx = Variable(
                    torch.zeros(args.batch_size, args.hidden_size))

                # Lists of outputs for training
                policies, Qs, Vs, actions = [], [], [], []
                rewards, old_policies, average_policies = [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    current, following = trajectories[i], trajectories[i + 1]

                    # Unpack first half of transition. torch.cat needs a
                    # sequence of tensors, so the generators are materialised.
                    batch_input = torch.cat(
                        tuple(step.state for step in current), 0)
                    action = Variable(torch.LongTensor(
                        [step.action for step in current])).unsqueeze(1)
                    reward = Variable(torch.Tensor(
                        [step.reward for step in current])).unsqueeze(1)
                    old_policy = Variable(torch.cat(
                        tuple(step.policy for step in current), 0))

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(Variable(batch_input),
                                                   (hx, cx))
                    average_policy, _, _, (avg_hx, avg_cx) = (
                        shared_average_model(Variable(batch_input),
                                             (avg_hx, avg_cx)))

                    # Save outputs for offline training
                    policies.append(policy)
                    Qs.append(Q)
                    Vs.append(V)
                    actions.append(action)
                    rewards.append(reward)
                    average_policies.append(average_policy)
                    old_policies.append(old_policy)

                    # Unpack second half of transition
                    next_input = torch.cat(
                        tuple(step.state for step in following), 0)
                    done = Variable(torch.Tensor(
                        [step.action is None for step in following]
                    ).unsqueeze(1))

                # Do forward pass for all transitions
                _, _, Qret, _ = model(Variable(next_input), (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach()

                # Train the network off-policy
                _train(args, T, model, shared_model, shared_average_model,
                       optimiser, policies, Qs, Vs, actions, rewards, Qret,
                       average_policies, old_policies=old_policies)
        done = True

    env.close()
def test(rank, args, T, shared_model):
    """Evaluation worker for the JacoEnv environment.

    Each time ``args.evaluation_interval`` steps of the shared counter have
    passed, ``args.evaluation_episodes`` greedy episodes are played. The
    averages are printed, the rewards are plotted via ``plot_line`` and the
    weights are checkpointed under ``results/``. With ``args.evaluate`` set,
    the function returns after the first evaluation. When ``args.render`` is
    set, the first camera view is drawn with matplotlib after every step.
    """
    seed = args.seed + rank
    torch.manual_seed(seed)

    env = JacoEnv(args.width, args.height, args.frame_skip,
                  args.rewarding_distance, args.control_magnitude,
                  args.reward_continuous)
    env.seed(seed)

    if args.render:
        # Open an interactive figure seeded with the second camera view
        (_, _, obs_rgb_view2) = env.reset()
        plt.ion()
        f, ax = plt.subplots()
        im = ax.imshow(obs_rgb_view2)

    model = ActorCritic(None, args.non_rgb_state_size, None, args.hidden_size)
    model.eval()

    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards = []  # Rewards for plotting
    steps = []  # Steps for plotting
    n_digits = str(len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode

    while T.value() <= args.T_max:
        if not can_test:
            # Wait until enough global steps have passed since the last test
            if T.value() - t_start >= args.evaluation_interval:
                can_test = True
        else:
            t_start = T.value()  # Reset counter

            # Evaluate over several episodes and average results
            avg_rewards = []
            avg_episode_lengths = []
            for _ in range(args.evaluation_episodes):
                while True:
                    # Reset or pass on hidden state
                    if done:
                        # Sync with shared model every episode
                        model.load_state_dict(shared_model.state_dict())
                        hx = Variable(torch.zeros(1, args.hidden_size),
                                      volatile=True)
                        cx = Variable(torch.zeros(1, args.hidden_size),
                                      volatile=True)
                        # Reset environment and done flag
                        state = state_to_tensor(env.reset())
                        action = (0, 0, 0, 0, 0, 0)
                        reward = 0
                        done = False
                        episode_length = 0
                        reward_sum = 0

                    # Calculate policy; the hidden state is detached to break
                    # the graph for memory efficiency
                    first_input = Variable(state[0], volatile=True)
                    second_input = Variable(state[1], volatile=True)
                    policy, _, (hx, cx) = model(first_input, second_input,
                                                (hx.detach(), cx.detach()))

                    # Choose action greedily
                    action = [p.max(1)[1].data[0, 0] for p in policy]

                    # Step
                    state, reward, done = env.step(action)
                    obs_rgb_view1 = state[1]
                    obs_rgb_view2 = state[2]
                    state = state_to_tensor(state)
                    reward_sum += reward
                    # Stop episodes at a max length
                    done = done or episode_length >= args.max_episode_length
                    episode_length += 1  # Increase episode counter

                    # Optionally render validation states
                    if args.render:
                        # rendering the first camera view
                        im.set_data(obs_rgb_view1)
                        plt.draw()
                        plt.pause(0.05)

                    # Log and reset statistics at the end of every episode
                    if done:
                        avg_rewards.append(reward_sum)
                        avg_episode_lengths.append(episode_length)
                        break

            template = ('[{}] Step: {:<' + n_digits +
                        '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}')
            timestamp = datetime.utcnow().strftime(
                '%Y-%m-%d %H:%M:%S,%f')[:-3]
            print(template.format(
                timestamp,
                t_start,
                sum(avg_rewards) / args.evaluation_episodes,
                sum(avg_episode_lengths) / args.evaluation_episodes))

            rewards.append(avg_rewards)  # Keep all evaluations
            steps.append(t_start)
            plot_line(steps, rewards)  # Plot rewards
            # Checkpoint model params
            checkpoint_path = os.path.join('results',
                                           str(t_start) + '_model.pth')
            torch.save(model.state_dict(), checkpoint_path)
            can_test = False  # Finish testing
            if args.evaluate:
                return

        time.sleep(0.001)  # Check if available to test every millisecond
def train(rank, args, T, shared_model, optimiser):
    """Training worker with the loss computed inline (no ``_train`` helper).

    Each iteration syncs the local model with ``shared_model``, samples new
    noise via ``model.sample_noise()``, rolls out at most ``args.t_max``
    steps, builds the policy and value losses from n-step returns and a
    generalised advantage estimate, clips the gradient norm, transfers the
    gradients to the shared model and steps the optimiser. The model input is
    the state extended with the previous action (one-hot), the previous
    reward and the episode step count.

    Args:
        rank: Worker index; offsets the random seed.
        args: Parsed options (``seed``, ``env``, ``hidden_size``,
            ``no_noise``, ``noise_entropy``, ``T_max``, ``t_max``,
            ``reward_clip``, ``max_episode_length``, ``discount``,
            ``trace_decay``, ``entropy_weight``, ``no_time_normalisation``,
            ``max_gradient_norm``, ``no_lr_decay``, ``lr``).
        T: Shared step counter exposing ``value()`` and ``increment()``.
        shared_model: Model that receives the gradients and provides weights.
        optimiser: Optimiser that is stepped after every rollout.
    """
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    action_size = env.action_space.n
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size, args.no_noise, args.noise_entropy)
    model.train()

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # Sync with shared model at least every t_max steps
        model.load_state_dict(shared_model.state_dict())
        # Get starting timestep
        t_start = t

        # Reset or pass on hidden state
        if done:
            hx = Variable(torch.zeros(1, args.hidden_size))
            cx = Variable(torch.zeros(1, args.hidden_size))
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            action, reward, done, episode_length = 0, 0, False, 0
        else:
            # Perform truncated backpropagation-through-time (allows freeing
            # buffers after backwards call)
            hx = hx.detach()
            cx = cx.detach()
        # Pick a new noise vector (until next optimisation step)
        model.sample_noise()

        # Lists of outputs for training
        values, log_probs, rewards, entropies = [], [], [], []

        while not done and t - t_start < args.t_max:
            model_input = extend_input(
                state, action_to_one_hot(action, action_size), reward,
                episode_length)
            # Calculate policy and value
            policy, value, (hx, cx) = model(Variable(model_input), (hx, cx))
            log_policy = policy.log()
            entropy = -(log_policy * policy).sum(1)

            # Sample action (graph broken as loss for stochastic action
            # calculated manually)
            sampled = policy.multinomial()
            log_prob = log_policy.gather(1, sampled.detach())
            action = sampled.data[0, 0]

            # Step
            state, reward, done, _ = env.step(action)
            state = state_to_tensor(state)
            # Optionally clamp rewards
            if args.reward_clip:
                reward = min(max(reward, -1), 1)
            done = done or episode_length >= args.max_episode_length
            episode_length += 1  # Increase episode counter

            # Save outputs for training
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            entropies.append(entropy)

            # Increment counters
            t += 1
            T.increment()

        # Return R = 0 for terminal s or V(s_i; θ) for non-terminal s
        if done:
            R = Variable(torch.zeros(1, 1))
        else:
            # The input must be rebuilt from the state just reached; reusing
            # the last rollout input would evaluate the previous state.
            bootstrap_input = extend_input(
                state, action_to_one_hot(action, action_size), reward,
                episode_length)
            _, R, _ = model(Variable(bootstrap_input), (hx, cx))
            R = R.detach()
        values.append(R)

        # Train the network
        policy_loss = 0
        value_loss = 0
        A_GAE = torch.zeros(1, 1)  # Generalised advantage estimator Ψ
        # Calculate n-step returns in forward view, stepping backwards from
        # the last state
        trajectory_length = len(rewards)
        for i in reversed(range(trajectory_length)):
            # R ← r_i + γR
            R = rewards[i] + args.discount * R
            # Advantage A = R - V(s_i; θ)
            A = R - values[i]
            # dθ ← dθ - ∂A^2/∂θ
            value_loss += 0.5 * A**2  # Least squares error

            # TD residual δ = r + γV(s_i+1; θ) - V(s_i; θ)
            td_error = (rewards[i] + args.discount * values[i + 1].data -
                        values[i].data)
            # Generalised advantage estimator Ψ (roughly of form ∑(γλ)^t∙δ)
            A_GAE = A_GAE * args.discount * args.trace_decay + td_error
            # dθ ← dθ + ∇θ∙log(π(a_i|s_i; θ))∙Ψ
            policy_loss -= log_probs[i] * Variable(A_GAE)  # Policy gradient loss
            if args.no_noise or args.noise_entropy:
                # dθ ← dθ + β∙∇θH(π(s_i; θ))
                # Entropy maximisation loss
                policy_loss -= args.entropy_weight * entropies[i]

        # Optionally normalise loss by number of time steps
        if not args.no_time_normalisation:
            policy_loss /= trajectory_length
            value_loss /= trajectory_length

        # Zero shared and local grads
        optimiser.zero_grad()
        # Note that losses were defined as negatives of normal update rules
        # for gradient descent
        (policy_loss + value_loss).backward()
        # Gradient L2 normalisation
        nn.utils.clip_grad_norm(model.parameters(), args.max_gradient_norm, 2)

        # Transfer gradients to shared model and update
        _transfer_grads_to_shared_model(model, shared_model)
        optimiser.step()
        if not args.no_lr_decay:
            # Linearly decay learning rate
            _adjust_learning_rate(
                optimiser,
                max(args.lr * (args.T_max - T.value()) / args.T_max, 1e-32))

    env.close()
def train(rank, args, T, shared_model, optimiser):
    """Training worker for the JacoEnv environment.

    Each iteration syncs the local model with ``shared_model``, rolls out at
    most ``args.t_max`` steps, appends the bootstrap value ``R`` (zero if the
    episode ended, otherwise the detached value of the state reached) to the
    value list and hands everything to ``_train``.
    """
    seed = args.seed + rank
    torch.manual_seed(seed)

    env = JacoEnv(args.width, args.height, args.frame_skip,
                  args.rewarding_distance, args.control_magnitude,
                  args.reward_continuous)
    env.seed(seed)

    # TODO: pass in the observation and action space
    model = ActorCritic(None, args.non_rgb_state_size, None, args.hidden_size)
    model.train()

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # Sync with shared model at least every t_max steps
        model.load_state_dict(shared_model.state_dict())
        # Get starting timestep
        t_start = t

        if not done:
            # Perform truncated backpropagation-through-time (allows freeing
            # buffers after backwards call)
            hx = hx.detach()
            cx = cx.detach()
        else:
            # Fresh hidden state for a new episode
            hx = Variable(torch.zeros(1, args.hidden_size))
            cx = Variable(torch.zeros(1, args.hidden_size))
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            action = (0, 0, 0, 0, 0, 0)
            reward = 0
            done = False
            episode_length = 0

        # Lists of outputs for training
        policies = []
        Vs = []
        actions = []
        rewards = []

        while not done and t - t_start < args.t_max:
            # Calculate policy and value
            policy, V, (hx, cx) = model(Variable(state[0]),
                                        Variable(state[1]), (hx, cx))

            # Sample action (graph broken as loss for stochastic action
            # calculated manually)
            action = [head.multinomial().data[0, 0] for head in policy]

            # Step
            state, reward, done = env.step(action)
            state = state_to_tensor(state)
            # Stop episodes at a max length
            done = done or episode_length >= args.max_episode_length
            episode_length += 1  # Increase episode counter

            # Save outputs for online training
            policies.append(policy)
            Vs.append(V)
            actions.append(Variable(torch.LongTensor(action)))
            rewards.append(reward)

            # Increment counters
            t += 1
            T.increment()

        # Break graph for last values calculated (used for targets, not
        # directly as model outputs)
        if done:
            # R = 0 for terminal s
            R = Variable(torch.zeros(1, 1))
        else:
            # R = V(s_i; θ) for non-terminal s
            _, R, _ = model(Variable(state[0]), Variable(state[1]), (hx, cx))
            R = R.detach()
        Vs.append(R)

        # Train the network
        _train(args, T, model, shared_model, optimiser, policies, Vs, actions,
               rewards, R)
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    """Worker loop that continues training a saved model from replayed data.

    Loads a pickled model checkpoint and, when ``args.on_policy`` is false,
    takes its replay memory from a ``Parser``. Until the shared counter ``T``
    exceeds ``args.T_max`` it samples batches of trajectories from that memory
    and trains on them off-policy through ``_train``, saving a checkpoint
    whenever ``T.value()`` is a multiple of 10000.

    The on-policy rollout loop is still present but disabled
    (``while False``), so within this function no environment is stepped and
    the local model is never re-synced from ``shared_model``.

    Args:
        rank: Worker index; added to ``args.seed`` to seed torch.
        args: Hyperparameter namespace. The reachable code reads ``seed``,
            ``on_policy``, ``T_max``, ``replay_start``, ``replay_ratio``,
            ``batch_size``, ``t_max`` and ``hidden_size``; the disabled loop
            also reads ``reward_clip`` and ``max_episode_length``.
        T: Shared counter exposing ``value()`` and ``increment()``.
        shared_model: Forwarded to ``_train``.
        shared_average_model: Evaluated next to the local model to produce
            the average policies; also forwarded to ``_train``.
        optimiser: Forwarded to ``_train`` unchanged.
    """
    torch.manual_seed(args.seed + rank)

    # A saved checkpoint replaces the gym environment and fresh model:
    # env = gym.make(args.env)
    # env.seed(args.seed + rank)
    # model = ActorCritic(STATE_SPACE, ACTION_SPACE, args.hidden_size, NUM_LAYERS)
    # NOTE: torch.load unpickles a complete model object; only load
    # checkpoints from a trusted source.
    model = torch.load('training_cps/training1_2_layer2_1-0_270000.pt')
    model.train()

    if not args.on_policy:
        # The replay memory is taken from the Parser instead of:
        # memory = EpisodicReplayMemory(args.memory_capacity // args.num_processes,
        #                               args.max_episode_length)
        parser = Parser()
        several_csvs = [
            'initial_csvs/Task1_3.csv',
            'initial_csvs/Task1_4.csv',
            'initial_csvs/Task1_5.csv',
        ]
        parser.parseInit(several_csvs)
        # parser.generateRandomDataset(100)
        # parser.writeToFile('outputs/output1_several_layer{0}_0-4.csv'.format(parser.layer))
        parser.readAMTBatch('AMT_rewards/AMT1_345_layer2_0-8.csv')
        several_outputs = [
            'outputs/output1_3_layer2_0-8.csv',
            'outputs/output1_4_layer2_0-8.csv',
            'outputs/output1_5_layer2_0-8.csv',
        ]
        parser.writeBackMemory(several_outputs)
        memory = parser.memory

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # Periodic checkpoint. 500 iterations around 1 min, 10000 iterations
        # 20 mins. The 270000 offset matches the checkpoint loaded above.
        if T.value() % 10000 == 0:
            torch.save(
                model,
                'training_cps/training1_2_layer2_1-0_{0}.pt'.format(T.value() + 270000))

        # On-policy episode loop.
        # NOTE: intentionally disabled with ``while False``; the body is
        # unreachable and kept only so it can be switched back on.
        while False:
            # Sync with shared model at least every t_max steps
            model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                hx = Variable(torch.zeros(1, args.hidden_size))
                avg_hx = Variable(torch.zeros(1, args.hidden_size))
                cx = Variable(torch.zeros(1, args.hidden_size))
                avg_cx = Variable(torch.zeros(1, args.hidden_size))
                # Reset environment and done flag
                # state = state_to_tensor(env.reset())
                state = state_to_tensor(parser.states[0]).view(1, STATE_SPACE)
                print(state)
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time
                # (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, Q, V, (hx, cx) = model(Variable(state), (hx, cx))
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    Variable(state), (avg_hx, avg_cx))

                # Sample action. Graph broken as loss for stochastic action
                # calculated manually
                action = policy.multinomial().data[0, 0]

                # Step
                # next_state, reward, done, _ = env.step(action)
                next_state = parser.states[1]
                reward = parser.rewards[0]
                done = True
                # NOTE(review): 24 is hard-coded here while the reset path
                # uses STATE_SPACE; confirm they are meant to be equal.
                next_state = state_to_tensor(next_state).view(1, 24)
                # Optionally clamp rewards
                if args.reward_clip:
                    reward = min(max(reward, -1), 1)
                # Stop episodes at a max length
                done = done or episode_length >= args.max_episode_length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(state, action, reward, policy.data)  # Save just tensors
                # Save outputs for online training
                policies.append(policy)
                Qs.append(Q)
                Vs.append(V)
                actions.append(Variable(torch.LongTensor([[action]])))
                rewards.append(Variable(torch.Tensor([[reward]])))
                average_policies.append(average_policy)

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not
            # directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = Variable(torch.zeros(1, 1))

                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(state, None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(Variable(state), (hx, cx))
                Qret = Qret.detach()

            # Train the network on-policy
            _train(args, T, model, shared_model, shared_average_model, optimiser,
                   policies, Qs, Vs, actions, rewards, Qret, average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episode
                trajectories = memory.sample_batch(args.batch_size, maxlen=args.t_max)

                # Reset hidden state
                hidden_shape = (NUM_LAYERS, args.batch_size, args.hidden_size)
                hx = Variable(torch.zeros(*hidden_shape))
                avg_hx = Variable(torch.zeros(*hidden_shape))
                cx = Variable(torch.zeros(*hidden_shape))
                avg_cx = Variable(torch.zeros(*hidden_shape))

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards = [], [], [], [], []
                old_policies, average_policies = [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    step_batch = trajectories[i]
                    next_batch = trajectories[i + 1]

                    # Unpack first half of transition. torch.cat needs a real
                    # sequence of tensors, not a generator expression.
                    state = torch.cat([tr.state for tr in step_batch], 0)
                    action = Variable(
                        torch.LongTensor([tr.action for tr in step_batch])).unsqueeze(1)
                    reward = Variable(
                        torch.Tensor([tr.reward for tr in step_batch])).unsqueeze(1)
                    old_policy = Variable(torch.cat([tr.policy for tr in step_batch], 0))

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(Variable(state), (hx, cx))
                    average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                        Variable(state), (avg_hx, avg_cx))

                    # Save outputs for offline training
                    policies.append(policy)
                    Qs.append(Q)
                    Vs.append(V)
                    actions.append(action)
                    rewards.append(reward)
                    average_policies.append(average_policy)
                    old_policies.append(old_policy)

                    # Unpack second half of transition; an entry whose action
                    # is None marks a terminal state
                    next_state = torch.cat([tr.state for tr in next_batch], 0)
                    done = Variable(
                        torch.Tensor([tr.action is None for tr in next_batch]).unsqueeze(1))

                # Do forward pass for all transitions
                _, _, Qret, _ = model(Variable(next_state), (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach()

                # Train the network off-policy
                _train(args, T, model, shared_model, shared_average_model, optimiser,
                       policies, Qs, Vs, actions, rewards, Qret, average_policies,
                       old_policies=old_policies)
        done = True

        # The disabled on-policy loop no longer advances T, so advance it here
        # to let the outer loop and the checkpoint schedule progress.
        T.increment()
model.train() t = 1 # Thread step counter done = True # Start new episode while True: # TODO: Need to receive kill signal from server # Sync with server model at least every t_max steps _sync_params(socket, model) # Get starting timestep t_start = t # Reset or pass on hidden state if done: hx, cx = Variable(torch.zeros(1, args.hidden_size)), Variable(torch.zeros(1, args.hidden_size)) # Reset environment and done flag state = state_to_tensor(env.reset()) action, reward, done, episode_length = 0, 0, False, 0 else: # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call) hx, cx = hx.detach(), cx.detach() # Lists of outputs for training values, log_probs, rewards, entropies = [], [], [], [] while not done and t - t_start < args.t_max: # Calculate policy and value policy, value, (hx, cx) = model(Variable(state), (hx, cx)) log_policy = policy.log() entropy = -(log_policy * policy).sum(1) # Sample action