def learn_policy(self, epochs, actor_iterations=1, critic_iterations=1,
                 episodes_per_update=1, epsilon_bound=0.2) \
        -> Tuple[Policy, List[float]]:
    policy = StochasticPolicy(self.actor)
    agent = Agent(self.environment, policy)

    r_obs = RewardObserver()
    t_obs = TrajectoryObserver()
    agent.attach_observer(t_obs)
    agent.attach_observer(r_obs)

    for _ in tqdm(range(epochs)):
        # TODO: collecting episodes can be done in parallel with multiple agents
        for _ in range(episodes_per_update):
            agent.perform_episode()

        reward_to_go = t_obs.reward_to_go(self.discount_factor)
        trajectories = t_obs.sampled_trajectories

        # unify trajectories into a single list
        reward_to_go = concatenate(reward_to_go)
        trajectories = concatenate(trajectories)

        # to tensor
        reward_to_go = torch.tensor(reward_to_go)
        trajectories = list_of_tuples_to_tuple_of_tensors(trajectories)

        if self.use_critic:
            state_index = 0
            v = self.critic(trajectories[state_index])
            v = torch.squeeze(v, 1)
            # advantage = empirical return minus critic baseline,
            # detached so the actor update does not backpropagate into the critic
            advantage = reward_to_go - v
            advantage = advantage.detach()

            self.update_actor(trajectories, advantage, actor_iterations, epsilon_bound)
            self.update_critic(trajectories, reward_to_go, critic_iterations)
        else:
            self.update_actor(trajectories, reward_to_go, actor_iterations, epsilon_bound)

        # reset memory for next iteration
        t_obs.clear()

    return policy, r_obs.get_rewards()
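

# The epsilon_bound default of 0.2 suggests update_actor performs a PPO-style
# clipped-surrogate step. The following is a minimal sketch of such an update,
# not the repo's actual update_actor: it assumes the actor maps a batch of
# states to a torch.distributions object, and that states, actions, and
# advantage are already batched tensors.
import torch


def clipped_surrogate_update(actor, optimizer, states, actions, advantage,
                             iterations, epsilon_bound):
    # log-probabilities under the data-collecting policy, held fixed
    old_log_prob = actor(states).log_prob(actions).detach()
    for _ in range(iterations):
        log_prob = actor(states).log_prob(actions)
        ratio = torch.exp(log_prob - old_log_prob)
        clipped = torch.clamp(ratio, 1 - epsilon_bound, 1 + epsilon_bound)
        # pessimistic bound: take the smaller of the two surrogates
        loss = -torch.min(ratio * advantage, clipped * advantage).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
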
def learn_policy(self, epochs=200, episodes_per_update=1):
    self.v_optimizer.zero_grad()
    state_index = 0
    action_index = 1

    policy = StochasticPolicy(self.a_distribution_model)
    agent = Agent(self.environment, policy)

    # utilities to collect agent data
    t_obs = TrajectoryObserver()
    r_obs = RewardObserver()
    agent.attach_observer(t_obs)
    agent.attach_observer(r_obs)

    for _ in tqdm(range(epochs)):
        for _ in range(episodes_per_update):
            # perform complete episode with observers attached
            agent.perform_episode()

            # collect trajectory and calculate reward to go
            trajectory = t_obs.last_trajectory()
            reward_to_go = get_reward_to_go(trajectory, self.discount_factor)

            # convert to pytorch tensors
            trajectory = list_of_tuples_to_tuple_of_tensors(trajectory)
            reward_to_go = torch.tensor(reward_to_go, dtype=torch.float32)

            advantage = self.get_advantage(trajectory, reward_to_go)

            # calculate loss
            policy_loss = self.a_distribution_model(trajectory[state_index])
            policy_loss = -policy_loss.log_prob(trajectory[action_index]) * advantage
            policy_loss = torch.sum(policy_loss)

            # dividing each episode's loss by episodes_per_update makes the
            # accumulated gradient an average over the batch of episodes,
            # i.e. an estimate of the expected per-episode gradient
            policy_loss = policy_loss / episodes_per_update

            # accumulate gradient
            policy_loss.backward()

        # gradient step on the accumulated batch gradient
        self.a_optimizer.step()
        self.a_optimizer.zero_grad()

        self.update_advantage()
        t_obs.clear()

    return policy, r_obs.get_rewards()
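

# get_reward_to_go is used above but not shown. A minimal sketch under the
# assumption (consistent with the transition indices used elsewhere in this
# file) that each transition tuple stores its scalar reward at index 2:
from typing import List


def get_reward_to_go(trajectory, discount_factor: float) -> List[float]:
    reward_index = 2
    rewards = [transition[reward_index] for transition in trajectory]
    reward_to_go = [0.0] * len(rewards)
    running = 0.0
    # accumulate discounted returns backwards through the episode:
    # G_t = r_t + discount * G_{t+1}
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount_factor * running
        reward_to_go[t] = running
    return reward_to_go
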
def learn_policy(self, episodes=200, experience_replay_samples=32,
                 gaussian_noise_variance=1, exponential_average_factor=0.01,
                 noise_bound=None, buffer_size=math.inf):
    pbar = tqdm(total=episodes)

    gaussian_noise = self.gaussian_distribution(gaussian_noise_variance)
    policy = DeterministicPolicy(
        self.a_model,
        additive_noise_distribution=gaussian_noise
    )
    buffer = ReplayBuffer(buffer_size)
    reward_observer = RewardObserver()

    agent = Agent(self.environment, policy)
    agent.attach_observer(reward_observer)

    current_episode = 0
    while current_episode < episodes:
        # collect transition
        state, action, reward, state_next, done = agent.step()
        # add to buffer
        buffer.add_transition(state, action, reward, state_next, done)

        # if enough transitions collected, perform experience replay algorithm
        if buffer.size() >= experience_replay_samples:
            self.experience_replay(
                buffer.sample_transitions(experience_replay_samples),
                exponential_average_factor,
                noise_bound=noise_bound,
                noise_distribution=gaussian_noise
            )

        # if episode ended, update progress
        if done:
            current_episode += 1
            pbar.update(1)

    pbar.close()
    # return the policy without exploration noise
    return DeterministicPolicy(self.a_model), reward_observer.get_rewards()
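

# experience_replay receives exponential_average_factor, which in
# DDPG/TD3-style algorithms is the Polyak coefficient for slow-moving target
# networks. A minimal sketch of that soft update (soft_update is a
# hypothetical helper, not part of this repo):
import torch


def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float):
    # target <- tau * source + (1 - tau) * target, applied parameter-wise
    with torch.no_grad():
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.mul_(1.0 - tau).add_(tau * param)
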
def learn_policy(self, episodes=200, experience_replay_samples=32,
                 exponential_average_factor=0.01, entropy_coefficient=0,
                 buffer_size=math.inf, updates_per_replay=1):
    pbar = tqdm(total=episodes)

    policy = StochasticPolicy(self.a_distribution_model)
    buffer = ReplayBuffer(buffer_size)
    reward_observer = RewardObserver()

    agent = Agent(self.environment, policy)
    agent.attach_observer(reward_observer)

    current_episode = 0
    while current_episode < episodes:
        # collect transition
        state, action, reward, state_next, done = agent.step()
        # add to buffer
        buffer.add_transition(state, action, reward, state_next, done)

        # if enough transitions collected, perform experience replay algorithm
        if buffer.size() >= experience_replay_samples:
            for _ in range(updates_per_replay):
                self.experience_replay(
                    buffer.sample_transitions(experience_replay_samples),
                    exponential_average_factor,
                    entropy_coefficient)

        # if episode ended, update progress
        if done:
            current_episode += 1
            if current_episode % 20 == 0:
                reward_observer.plot()
                reward_observer.plot_moving_average(5)
            pbar.update(1)

    pbar.close()
    return MeanOfStochasticModel(
        self.a_distribution_model), reward_observer.get_rewards()
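

# entropy_coefficient points at a soft-actor-critic-style objective, where the
# actor maximizes Q plus an entropy bonus. A minimal sketch of such an actor
# loss (q_model is hypothetical; a_distribution_model is assumed to return a
# reparameterizable torch.distributions object whose log_prob yields one value
# per state):
import torch


def soft_actor_loss(a_distribution_model, q_model, states, entropy_coefficient):
    distribution = a_distribution_model(states)
    # rsample keeps the sampling step differentiable (reparameterization trick)
    actions = distribution.rsample()
    log_prob = distribution.log_prob(actions)
    # minimizing entropy_coefficient * log pi(a|s) - Q(s, a) maximizes
    # expected Q plus policy entropy
    return (entropy_coefficient * log_prob - q_model(states, actions)).mean()
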
def learn_policy(self, epochs=200, transition_batch_size=2,
                 v_initialization_episodes=20):
    state_index = 0
    action_index = 1
    reward_index = 2
    state_next_index = 3
    done_index = 4

    policy = StochasticPolicy(self.a_distribution_model)
    agent = Agent(self.environment, policy)

    # utilities to collect agent data
    r_obs = RewardObserver()
    agent.attach_observer(r_obs)
    transition_generator = agent.transition_generator()

    # runs Monte Carlo policy gradient, tuning v_model and a_distribution_model
    previous_rewards = self.initialize_v(v_initialization_episodes)
    r_obs.add(previous_rewards)

    while len(r_obs.get_rewards()) < epochs:
        # collect a batch of transitions
        transitions = [
            next(transition_generator)
            for _ in range(transition_batch_size)
        ]
        transitions = list_of_tuples_to_tuple_of_tensors(transitions)

        # the last state_next needs to be appended
        state = torch.zeros(transitions[state_index].size()[0] + 1,
                            transitions[state_index].size()[1])
        state[:-1] = transitions[state_index]
        state[-1] = transitions[state_next_index][-1]

        # separate v into v and v_next
        v = self.v_model(state)
        # flatten the (N + 1, 1) critic output to (N + 1,), mirroring the
        # squeeze applied to the critic elsewhere, so the masking and reward
        # arithmetic below broadcast element-wise
        v = torch.squeeze(v, 1)
        v_next = v[1:]
        v = v[:-1]

        # bootstrapping: target = r_t + discount * V(s_{t+1}),
        # zeroing the V(s_{t+1}) entry if the episode is done
        bootstrapped_v = v_next
        bootstrapped_v = bootstrapped_v * (1 - transitions[done_index])
        bootstrapped_v = transitions[reward_index] + self.discount_factor * bootstrapped_v
        bootstrapped_v = bootstrapped_v.detach()  # set it as constant

        # perform v_model update
        self.v_optimizer.zero_grad()
        v_loss = torch.nn.MSELoss()(v, bootstrapped_v)
        v_loss.backward()
        self.v_optimizer.step()

        # advantage = (r_t + discount * V(s_{t+1})) - V(s_t)
        advantage = bootstrapped_v - v
        # detach to prevent the policy gradient from propagating into v_model
        advantage = advantage.detach()

        policy_loss = self.a_distribution_model(transitions[state_index])
        policy_loss = -policy_loss.log_prob(transitions[action_index]) * advantage
        policy_loss = torch.sum(policy_loss)

        # gradient step
        self.a_optimizer.zero_grad()
        policy_loss.backward()
        self.a_optimizer.step()

    transition_generator.close()
    return policy, r_obs.get_rewards()
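

# list_of_tuples_to_tuple_of_tensors is relied on throughout but not shown.
# A minimal sketch under the assumption that every transition tuple has the
# same layout and that each component converts cleanly to a float tensor:
from typing import List, Tuple

import torch


def list_of_tuples_to_tuple_of_tensors(
        transitions: List[tuple]) -> Tuple[torch.Tensor, ...]:
    # transpose [(s, a, r, s', d), ...] into (states, actions, rewards, ...)
    columns = zip(*transitions)
    return tuple(
        torch.stack([torch.as_tensor(item, dtype=torch.float32) for item in column])
        for column in columns
    )
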