Example #1
    def learn_policy(self,
                     epochs,
                     actor_iterations=1,
                     critic_iterations=1,
                     episodes_per_update=1,
                     epsilon_bound=0.2) \
            -> Tuple[Policy, List[float]]:

        policy = StochasticPolicy(self.actor)

        agent = Agent(self.environment, policy)
        r_obs = RewardObserver()
        t_obs = TrajectoryObserver()
        agent.attach_observer(t_obs)
        agent.attach_observer(r_obs)

        for _ in tqdm(range(epochs)):
            # TODO: episode collection could be parallelized across multiple agents
            for _ in range(episodes_per_update):
                agent.perform_episode()

            reward_to_go = t_obs.reward_to_go(self.discount_factor)
            trajectories = t_obs.sampled_trajectories

            # merge per-episode trajectories into a single flat list
            reward_to_go = concatenate(reward_to_go)
            trajectories = concatenate(trajectories)

            # to tensor
            reward_to_go = torch.tensor(reward_to_go)
            trajectories = list_of_tuples_to_tuple_of_tensors(trajectories)

            if self.use_critic:
                state_index = 0

                v = self.critic(trajectories[state_index])
                v = torch.squeeze(v, 1)

                # baseline-subtracted return; detach so the actor update does
                # not propagate gradients into the critic
                advantage = reward_to_go - v
                advantage = advantage.detach()

                self.update_actor(trajectories, advantage, actor_iterations,
                                  epsilon_bound)
                self.update_critic(trajectories, reward_to_go,
                                   critic_iterations)
            else:
                self.update_actor(trajectories, reward_to_go, actor_iterations,
                                  epsilon_bound)

            # reset memory for next iteration
            t_obs.clear()

        return policy, r_obs.get_rewards()
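
The epsilon_bound parameter suggests that update_actor (not shown here) applies a PPO-style clipped surrogate objective. Below is a minimal standalone sketch of such a loss, assuming the actor maps states to a torch.distributions object as in the other examples; the function name and the old_log_prob argument are illustrative assumptions, not part of the original code.

import torch

def clipped_surrogate_loss(actor, states, actions, advantage, old_log_prob,
                           epsilon_bound=0.2):
    # probability ratio between the current policy and the data-collecting policy
    log_prob = actor(states).log_prob(actions)
    ratio = torch.exp(log_prob - old_log_prob)

    # clip the ratio so a single update cannot move the policy too far
    clipped_ratio = torch.clamp(ratio, 1 - epsilon_bound, 1 + epsilon_bound)

    # pessimistic (minimum) objective, negated because optimizers minimize
    return -torch.min(ratio * advantage, clipped_ratio * advantage).mean()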
Example #2
    def learn_policy(self, epochs=200, episodes_per_update=1):
        self.v_optimizer.zero_grad()
        state_index = 0
        action_index = 1

        policy = StochasticPolicy(self.a_distribution_model)
        agent = Agent(self.environment, policy)

        # utilities to collect agent data
        t_obs = TrajectoryObserver()
        r_obs = RewardObserver()
        agent.attach_observer(t_obs)
        agent.attach_observer(r_obs)

        for _ in tqdm(range(epochs)):
            for _ in range(episodes_per_update):
                # perform complete episode with observers attached
                agent.perform_episode()

                # collect trajectory and calculate reward to go
                trajectory = t_obs.last_trajectory()
                reward_to_go = get_reward_to_go(trajectory,
                                                self.discount_factor)

                # convert to pytorch tensors
                trajectory = list_of_tuples_to_tuple_of_tensors(trajectory)
                reward_to_go = torch.tensor(reward_to_go, dtype=torch.float32)

                advantage = self.get_advantage(trajectory, reward_to_go)

                # REINFORCE loss: -log pi(a|s) * advantage
                action_distribution = self.a_distribution_model(
                    trajectory[state_index])
                policy_loss = -action_distribution.log_prob(
                    trajectory[action_index]) * advantage
                policy_loss = torch.sum(policy_loss)

                # to estimate the expected gradient over episodes_per_update
                # episodes, divide the accumulated loss by episodes_per_update
                policy_loss = policy_loss / episodes_per_update

                # accumulate gradient
                policy_loss.backward()

            # gradient step
            self.a_optimizer.step()
            self.a_optimizer.zero_grad()
            self.update_advantage()

            t_obs.clear()

        return policy, r_obs.get_rewards()
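
get_advantage and update_advantage are not shown. A plausible baseline version subtracts a learned state value from the reward-to-go; the sketch below is an assumption based on the other examples (the v_model argument is hypothetical), not the original helper.

import torch

def get_advantage(v_model, states, reward_to_go):
    # subtract a learned state-value baseline from the Monte Carlo return;
    # detach so the policy-gradient step does not update the baseline
    v = v_model(states).squeeze(-1)
    return (reward_to_go - v).detach()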
Example #3
    def learn_policy(self,
                     episodes=200,
                     experience_replay_samples=32,
                     gaussian_noise_variance=1,
                     exponential_average_factor=0.01,
                     noise_bound=None,
                     buffer_size=math.inf
                     ):

        pbar = tqdm(total=episodes)

        gaussian_noise = self.gaussian_distribution(gaussian_noise_variance)

        policy = DeterministicPolicy(
            self.a_model,
            additive_noise_distribution=gaussian_noise
        )
        buffer = ReplayBuffer(buffer_size)

        reward_observer = RewardObserver()
        agent = Agent(self.environment, policy)
        agent.attach_observer(reward_observer)

        current_episode = 0

        while current_episode < episodes:
            # collect transition
            state, action, reward, state_next, done = agent.step()
            # add to buffer
            buffer.add_transition(state, action, reward, state_next, done)

            # once enough transitions are collected, run the experience replay update
            if buffer.size() >= experience_replay_samples:
                self.experience_replay(
                    buffer.sample_transitions(experience_replay_samples),
                    exponential_average_factor,
                    noise_bound=noise_bound,
                    noise_distribution=gaussian_noise
                )

            # if episode ended, update progress
            if done:
                current_episode += 1
                pbar.update(1)

        pbar.close()
        return DeterministicPolicy(self.a_model), reward_observer.get_rewards()
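
experience_replay is not shown; the exponential_average_factor argument suggests the target networks are tracked with Polyak (exponential moving average) updates. A minimal sketch of such a soft update follows, with an assumed function name.

import torch

def polyak_update(target_model, online_model, exponential_average_factor=0.01):
    # move each target parameter a small step towards the online parameter
    with torch.no_grad():
        for target_param, param in zip(target_model.parameters(),
                                       online_model.parameters()):
            target_param.mul_(1 - exponential_average_factor)
            target_param.add_(exponential_average_factor * param)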
Example #4
    def learn_policy(self,
                     episodes=200,
                     experience_replay_samples=32,
                     exponential_average_factor=0.01,
                     entropy_coefficient=0,
                     buffer_size=math.inf,
                     updates_per_replay=1):

        pbar = tqdm(total=episodes)

        policy = StochasticPolicy(self.a_distribution_model)
        buffer = ReplayBuffer(buffer_size)

        reward_observer = RewardObserver()
        agent = Agent(self.environment, policy)
        agent.attach_observer(reward_observer)

        current_episode = 0

        while current_episode < episodes:
            # collect transition
            state, action, reward, state_next, done = agent.step()
            # add to buffer
            buffer.add_transition(state, action, reward, state_next, done)

            # once enough transitions are collected, run the experience replay updates
            if buffer.size() >= experience_replay_samples:
                for _ in range(updates_per_replay):
                    self.experience_replay(
                        buffer.sample_transitions(experience_replay_samples),
                        exponential_average_factor, entropy_coefficient)

            # if episode ended, update progress
            if done:
                current_episode += 1

                if current_episode % 20 == 0:
                    reward_observer.plot()
                    reward_observer.plot_moving_average(5)

                pbar.update(1)

        pbar.close()
        return MeanOfStochasticModel(
            self.a_distribution_model), reward_observer.get_rewards()
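
The entropy_coefficient argument suggests a soft (entropy-regularized) actor update inside experience_replay. The standalone sketch below illustrates such a loss under several assumptions: the policy returns a reparameterizable torch.distributions object with a scalar log_prob per sample, and a q_model critic taking (state, action) pairs exists; all of these names are assumptions, not the original code.

import torch

def soft_actor_loss(a_distribution_model, q_model, states,
                    entropy_coefficient=0.2):
    # sample actions with the reparameterization trick so gradients flow
    distribution = a_distribution_model(states)
    actions = distribution.rsample()
    log_prob = distribution.log_prob(actions)

    # maximize Q plus entropy bonus, i.e. minimize the negated objective
    q = q_model(states, actions).squeeze(-1)
    return (entropy_coefficient * log_prob - q).mean()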
Example #5
    def learn_policy(self,
                     epochs=200,
                     transition_batch_size=2,
                     v_initialization_episodes=20):
        state_index = 0
        action_index = 1
        reward_index = 2
        state_next_index = 3
        done_index = 4

        policy = StochasticPolicy(self.a_distribution_model)
        agent = Agent(self.environment, policy)

        # utilities to collect agent data
        r_obs = RewardObserver()
        agent.attach_observer(r_obs)
        transition_generator = agent.transition_generator()

        # run Monte Carlo policy gradient to warm-start v_model and a_distribution_model
        previous_rewards = self.initialize_v(v_initialization_episodes)
        r_obs.add(previous_rewards)
        print("heyey")

        while len(r_obs.get_rewards()) < epochs:

            # collect transition
            transitions = [
                next(transition_generator)
                for _ in range(transition_batch_size)
            ]
            transitions = list_of_tuples_to_tuple_of_tensors(transitions)

            # append the last state_next so v and v_next come from one forward pass
            state = torch.zeros(transitions[state_index].size()[0] + 1,
                                transitions[state_index].size()[1])
            state[:-1] = transitions[state_index]
            state[-1] = transitions[state_next_index][-1]

            # split values into v (current states) and v_next (successor states)
            v = self.v_model(state)
            v_next = v[1:]
            v = v[:-1]

            # bootstrapping: r_t + discount * V(s_{t+1}),
            # zeroing the V(s_{t+1}) entry if done
            bootstrapped_v = v_next * (1 - transitions[done_index])
            bootstrapped_v = transitions[
                reward_index] + self.discount_factor * bootstrapped_v

            bootstrapped_v = bootstrapped_v.detach()  # treat the target as a constant

            # perform v_model update
            self.v_optimizer.zero_grad()
            v_loss = torch.nn.MSELoss()(v, bootstrapped_v)
            v_loss.backward()
            self.v_optimizer.step()

            # advantage = (r_t + discount * V(s_{t+1})) - V(s_t)
            advantage = bootstrapped_v - v

            # detach to prevent the policy gradient from propagating into v_model
            advantage = advantage.detach()

            action_distribution = self.a_distribution_model(
                transitions[state_index])
            policy_loss = -action_distribution.log_prob(
                transitions[action_index]) * advantage
            policy_loss = torch.sum(policy_loss)

            # gradient step
            self.a_optimizer.zero_grad()
            policy_loss.backward()
            self.a_optimizer.step()

        transition_generator.close()

        return policy, r_obs.get_rewards()
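
The bootstrapped target built above can be isolated into a small helper for readability; a minimal sketch follows (the helper name is not part of the original code).

import torch

def td_target(reward, v_next, done, discount_factor):
    # r_t + discount * V(s_{t+1}), with V(s_{t+1}) zeroed on terminal transitions
    return reward + discount_factor * v_next * (1 - done)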