Example #1
def train_irl(sess: tf.Session, model: RQModelBase, replay: StateActionReplay,
              epochs: int):
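    """Train the reward model on sampled (state, action) pairs; stop early when the loss plateaus."""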
    avg_delta_loss = IncrementalMean(50)
    loss = 0
    for epoch in range(epochs):
        if len(replay) > IRL_BATCH_SIZE:
            states, actions = replay.sample(IRL_BATCH_SIZE)
            new_loss = model.train_r(sess, states, actions)
            avg_delta_loss.add(new_loss - loss)
            loss = new_loss
            print('IRL: Epoch: {0}/{1} Loss: {2:.3f} AvgLossDelta: {3:.3f}'.
                  format(epoch, epochs, loss, avg_delta_loss.value))
            if avg_delta_loss.value < IRL_LOSS_DELTA_STOP:
                print('No significant change in loss, stopping training')
                return
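
These examples all lean on an IncrementalMean helper (add(), value, size) whose definition is not included. The code below is only a minimal sketch consistent with that usage, assuming a windowed running mean whose value is None until the first add(); the private attribute name is invented.

import collections


class IncrementalMean:
    def __init__(self, window=None):
        # Sketch only: keep the last `window` values (all values if window is None).
        self.size = window
        self._values = collections.deque(maxlen=window)

    def add(self, x):
        self._values.append(x)

    @property
    def value(self):
        # None before the first add(), matching the None checks in the examples.
        if not self._values:
            return None
        return sum(self._values) / len(self._values)
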
Example #2
class DunBroadStrategy(Strategy):
    def __init__(self,
                 env: Environment,
                 model: RQModelBase,
                 memory_capacity=10000,
                 discount_factor=0.96,
                 batch_size=32,
                 epsilon=0.5):
        super(DunBroadStrategy, self).__init__(env, model)
        self.replay = GenericMemory(memory_capacity, [
            ('state', np.float32, env.state_shape),
            ('u', np.float32, env.num_actions),
        ])
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.batch_size = batch_size

        self.actions_to_sample = min(env.num_actions, batch_size)
        self.next_states = np.zeros(
            (self.actions_to_sample, ) + env.state_shape, dtype=np.float32)
        self.mean_reward = IncrementalMean(100)

    def run(self, sess: tf.Session, num_episodes: int, *args, **kwargs):
        for episode in range(num_episodes):
            # Episode start
            state = self.env.reset()
            episode_reward = 0.0

            while True:
                # Check whether the current state is terminal
                is_terminal = self.env.is_terminal()

                # Predict R(s, a), U(s, a), Q(s, a), policy(s, a)  for current state s
                r, u, _, p = self.model.predict_r_u_q_p(sess, [state])
                r = r[0]
                u = u[0]
                p = p[0]

                if not is_terminal:
                    # Select the N best actions w.r.t. the policy
                    # (the selected actions act like a receptive field)
                    #actions = np.random.choice(self.env.num_actions, self.actions_to_sample, p=p, replace=False)

                    # Select N completely random actions (uniformly)
                    actions = np.random.choice(self.env.num_actions,
                                               self.actions_to_sample,
                                               replace=False)

                    # Get the next states for the N actions without updating the internal
                    # environment state (similar to Monte Carlo node expansion)
                    for i, action in enumerate(actions):
                        self.next_states[i] = self.env.do_action(
                            action, update_state=False)

                    # Predict R and U for the next states (Q = R + U)
                    next_r, next_u, _, _ = self.model.predict_r_u_q_p(
                        sess, self.next_states)

                    # Update the U values of the N sampled actions as:
                    # Q(s, a) <- R(s, a) + gamma * max a' [ Q(s', a') ]  -- original DQN update
                    # U(s, a) <- gamma * max a' [ Q(s', a') ]
                    # assuming that we are following the policy
                    u[actions] = self.discount_factor * np.max(next_r + next_u,
                                                               axis=1)

                    # E-greedy policy
                    if np.random.rand() < self.epsilon:
                        action = np.random.choice(self.env.num_actions)
                    else:
                        # Choose best possible action
                        action = np.argmax(u + r)

                    episode_reward += r[action]

                    self.replay.append(state, u)

                    # Make an MDP step
                    state = self.env.do_action(action)
                else:
                    # For a terminal state:
                    # Q(s, a) <- R(s, a)
                    # so U(s, a) <- 0.0: the expected future reward is 0
                    self.replay.append(state, np.zeros(self.env.num_actions))

                if len(self.replay) > self.batch_size:
                    batch_states, batch_u = self.replay.sample(self.batch_size)
                    self.model.train_u(
                        sess,
                        batch_states,
                        batch_u,
                        average_episode_reward=self.mean_reward.value)

                if is_terminal:
                    self.mean_reward.add(episode_reward)
                    print('DQN BROAD: Episode={0}/{1} R={2:.3f} MeanR={3:.3f}'.
                          format(episode, num_episodes, episode_reward,
                                 self.mean_reward.value))
                    break
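
GenericMemory is likewise used here (and in the examples below) without its definition. The following is a minimal sketch, not the original implementation, inferred from the call sites: the constructor takes a capacity and a list of (name, dtype, shape) fields, append() receives values in field order, sample() returns one array per field, and indexing by field name returns the stored rows. All private attribute names are invented.

import numpy as np


class GenericMemory:
    def __init__(self, capacity, fields):
        # Structured ring buffer, e.g. fields=[('state', np.float32, (4,)), ('u', np.float32, 2)]
        self._data = np.zeros(capacity, dtype=fields)
        self._fields = [f[0] for f in fields]
        self._capacity = capacity
        self._next = 0
        self._count = 0

    def __len__(self):
        return self._count

    def __getitem__(self, field):
        # Rows stored so far for a single field, e.g. rollout['rewards']
        return self._data[field][:self._count]

    def append(self, *values):
        for name, value in zip(self._fields, values):
            self._data[name][self._next] = value
        self._next = (self._next + 1) % self._capacity
        self._count = min(self._count + 1, self._capacity)

    def sample(self, batch_size):
        # Uniform sampling without replacement; callers check len(memory) > batch_size first
        indices = np.random.choice(self._count, batch_size, replace=False)
        return tuple(self._data[name][indices] for name in self._fields)
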
Example #3
class Runner(object):
    def __init__(self,
                 env: gym.Env,
                 ppo: ProximalPolicyOptimization,
                 num_steps: int,
                 logger: TfLogger = None):
        self.rollout = GenericMemory(
            num_steps,
            [
                #('observations', np.float32, env.state_shape),
                ('observations', np.float32, env.observation_space.shape),
                ('actions', np.float32, env.action_space.shape),
                ('rewards', np.float32, ()),
                ('values', np.float32, ()),
                ('next_is_terminal', np.bool_, ())
            ])
        self.env = env
        self.ppo = ppo
        self.observation = env.reset()
        self.num_steps = num_steps
        self.running_reward = IncrementalMean(20)
        self.episode_reward = 0.0
        self.episode = 0
        self.logger = logger

    def run(self, sess: tf.Session):
        for step in range(self.num_steps):
            # if self.running_reward.value is not None and self.running_reward.value > 10.0:
            #     self.env.render()
            action, value = self.ppo.sample_single_action_and_value(
                sess, self.observation)
            action = np.clip(action, self.env.action_space.low,
                             self.env.action_space.high)
            new_observation, reward, is_terminal, _ = self.env.step(action)
            self.episode_reward += reward

            # is_terminal indicates whether new_observation is from a terminal state
            self.rollout.append(self.observation, action, reward, value,
                                is_terminal)
            if is_terminal:
                self.running_reward.add(self.episode_reward)
                if self.logger is not None:
                    self.logger.log_scalar(self.episode_reward,
                                           'EpisodeReward')
                    self.logger.log_scalar(self.running_reward.value,
                                           'MeanEpisodeReward')

                print('Episode {0} finished: R:{1:.3f} MeanR:{2:.3f}'.format(
                    self.episode, self.episode_reward,
                    self.running_reward.value))

                self.observation = self.env.reset()
                self.episode += 1
                self.episode_reward = 0.0
            else:
                self.observation = new_observation

        returns, advantages = expectations(
            self.rollout['rewards'][:-1],
            self.rollout['values'][:-1],
            self.rollout['next_is_terminal'][:-1],
            bootstrap_value=self.rollout['values'][-1],
            lam=1.0)

        # Normalize advantages for better gradients
        advantages = (advantages - advantages.mean()) / advantages.std()
        return self.rollout['observations'][:-1], self.rollout[
            'actions'][:-1], returns, advantages
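
The expectations helper used above is also not shown. The sketch below is a guess at its behavior, assuming it computes GAE(lambda) returns and advantages; the gamma=0.99 default is purely an assumption, since the call site only passes lam and bootstrap_value.

import numpy as np


def expectations(rewards, values, next_is_terminal, bootstrap_value, lam=1.0, gamma=0.99):
    # Hypothetical GAE(lambda) sketch; with lam=1.0 this reduces to discounted
    # returns minus the value baseline, restarting at terminal transitions.
    n = len(rewards)
    advantages = np.zeros(n, dtype=np.float32)
    last_gae = 0.0
    for t in reversed(range(n)):
        next_value = bootstrap_value if t == n - 1 else values[t + 1]
        non_terminal = 0.0 if next_is_terminal[t] else 1.0
        delta = rewards[t] + gamma * next_value * non_terminal - values[t]
        last_gae = delta + gamma * lam * non_terminal * last_gae
        advantages[t] = last_gae
    returns = advantages + values
    return returns, advantages
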
Example #4
class DunMcStrategy(Strategy):
    """
        Updates Q values with the accumulated rewards over a whole episode
    """
    def __init__(self,
                 env: Environment,
                 model: RQModelBase,
                 memory_capacity=100000,
                 discount_factor=0.96,
                 batch_size=64,
                 epsilon=0.5):
        super(DunMcStrategy, self).__init__(env, model)
        self.replay = GenericMemory(memory_capacity, [
            ('state', np.float32, env.state_shape),
            ('q', np.float32, env.num_actions),
        ])
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.mean_reward = IncrementalMean()

    def run(self, sess: tf.Session, num_episodes: int, *args, **kwargs):
        for episode in range(num_episodes):
            states, actions, rewards, u = self.play_episode(sess)
            total_reward = rewards.sum()
            self.mean_reward.add(total_reward)

            # DQN target:
            # Q(s, a, w') = r + gamma * max a' [ Q(s', a', w) ]

            # Q value of the last action = R(s, a)
            # U value of the last action = 0.0
            u[-1, actions[-1]] = 0.0
            self.replay.append(states[-1], u[-1])

            # Discount rewards
            # From end to start:
            # Q(s, a, t) <- R(s, a, t) + gamma * Q(s, a, t + 1)  assuming the next Q is max a' [ Q(s', a') ]
            # U(s, a, t) <- gamma * (R(s, a, t + 1) + U(s, a, t + 1))
            for i in reversed(range(len(actions) - 1)):
                u[i, actions[i]] = self.discount_factor * (
                    rewards[i + 1] + u[i + 1, actions[i + 1]])
                # q[i, actions[i]] = rewards[i] + self.discount_factor * np.max(q[i + 1])
                self.replay.append(states[i], u[i])

            if len(self.replay) > self.batch_size:
                batch_states, batch_u = self.replay.sample(self.batch_size)
                loss = self.model.train_u(sess, batch_states, batch_u)

                print(
                    'MC: Episode: {0}/{1} Loss={2:.5f} R: {3:.3f}  Avg R: {4:.3f}'
                    .format(episode, num_episodes, loss, total_reward,
                            self.mean_reward.value))

    def play_episode(self, sess: tf.Session, use_policy: bool = False):
        states = []
        u_values = []
        actions = []
        rewards = []

        last_state = self.env.reset()

        while True:
            predicted_r, predicted_u, _, predicted_policy = self.model.predict_r_u_q_p(
                sess, [last_state])

            if use_policy:
                action = np.random.choice(self.env.num_actions,
                                          p=predicted_policy[0])
            else:
                # E-greedy
                if np.random.rand() < self.epsilon:
                    action = np.random.randint(0, self.env.num_actions)
                else:
                    # Q = R + U
                    action = np.argmax(predicted_r[0] + predicted_u[0])

            new_state = self.env.do_action(action)

            states.append(last_state)
            actions.append(action)
            rewards.append(predicted_r[0][action])
            u_values.append(predicted_u[0])

            if self.env.is_terminal():
                break

            last_state = new_state

        return np.array(states), np.array(actions), np.array(
            rewards), np.array(u_values)
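
To make the backward recursion in run() concrete, here is a tiny standalone illustration with made-up numbers; it is not part of the strategy itself.

import numpy as np

# U_t = gamma * (r_{t+1} + U_{t+1}), with U_T = 0 at the terminal step,
# so Q_t = r_t + U_t recovers the discounted return from step t onwards.
gamma = 0.96
rewards = np.array([1.0, 0.0, 2.0])  # hypothetical per-step rewards
u = np.zeros_like(rewards)
u[-1] = 0.0  # terminal step: no future reward expected
for t in reversed(range(len(rewards) - 1)):
    u[t] = gamma * (rewards[t + 1] + u[t + 1])
print(u)  # approximately [1.843 1.92  0.   ]
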
Example #5
class DunStrategy(Strategy):
    def __init__(self,
                 env: Environment,
                 model: RQModelBase,
                 memory_capacity=1000000,
                 mini_batch_size=32,
                 discount_factor=0.96,
                 epsilon_start=1.0,
                 epsilon_end=0.05,
                 epsilon_decay_over_steps=1000000,
                 mean_reward_for_episodes=100,
                 transfer_target_steps=1000):
        super(DunStrategy, self).__init__(env, model)
        self.env = env
        self.model = model
        self.replay_memory = DqnReplayMemory(memory_capacity,
                                             state_shape=env.state_shape,
                                             state_dtype=np.float32,
                                             action_dtype=np.uint16)

        self.transfer_target_steps = transfer_target_steps
        self.mini_batch_size = mini_batch_size
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay_over_steps = epsilon_decay_over_steps
        self.epsilon = epsilon_start
        self.discount_factor = discount_factor

        # Mean reward for last N episodes
        self.mean_reward = IncrementalMean(mean_reward_for_episodes)

    def run(self,
            sess: tf.Session,
            num_episodes,
            verbose=False,
            *args,
            **kwargs):
        for episode in range(num_episodes):
            episode_reward = 0.0
            state = self.env.reset()

            while True:
                r, _, q, _ = self.model.predict_r_u_q_p(sess, [state])
                r = r[0]
                q = q[0]

                # E-greedy policy
                if np.random.rand() < self.epsilon:
                    action = np.random.choice(self.env.num_actions)
                else:
                    action = np.argmax(q)

                new_state = self.env.do_action(action)
                reward = r[action]
                is_terminal = self.env.is_terminal()

                episode_reward += reward

                self.replay_memory.append(state, action, reward, is_terminal)

                if len(self.replay_memory) > self.mini_batch_size:
                    self.train_on_replay(sess)

                # Decay epsilon exponentially
                self.epsilon = (
                    self.epsilon_start - self.epsilon_end) * np.exp(
                        -5 * self.model.dqn_step /
                        self.epsilon_decay_over_steps) + self.epsilon_end

                if self.model.double and self.model.dqn_step % self.transfer_target_steps == 0:
                    print('DQN: Copying weights from Eval to Target')
                    self.model.update_target(sess)

                if is_terminal:
                    break
                else:
                    state = new_state

            self.mean_reward.add(episode_reward)
            if verbose:
                print(
                    'Episode {0}/{1}, R={2:.3f}  MeanR={3:.3f} i={4} eps={5:.4f}'
                    .format(episode, num_episodes, episode_reward,
                            self.mean_reward.value, self.model.dqn_step,
                            self.epsilon))
        if verbose:
            print('Finished run!')
            print('\tMean reward for last {0} episodes: {1:.3f}'.format(
                self.mean_reward.size, self.mean_reward.value))
        return self.mean_reward.value

    def train_on_replay(self, sess: tf.Session):
        states, actions, rewards, next_states, is_terminal = self.replay_memory.sample(
            self.mini_batch_size)

        if self.model.double:
            # Predict with one call for both current and next states
            r_all, u_all, u_target_all = self.model.predict_vars(
                sess, np.concatenate((states, next_states)),
                [self.model.rewards, self.model.u, self.model.u_target])

            # Split into predictions for the current and next states
            _, u, _ = r_all[:len(states)], u_all[:len(
                states)], u_target_all[:len(states)]
            r_next, u_next, u_target_next = r_all[len(states):], u_all[
                len(states):], u_target_all[len(states):]

            # Double DQN target, with the decomposition Q(s, a) = R(s, a) + U(s, a):
            # Q(s, a) <- R(s, a) + gamma * Q'(s', argmax a' [ Q(s', a') ])
            # a* = argmax a' [ R(s', a') + U(s', a') ]   -- next action chosen by the online head
            # U(s, a) <- gamma * Q'(s', a*)
            #          = gamma * ( R(s', a*) + U'(s', a*) )  -- evaluated with the target head
            next_a = np.argmax(r_next + u_next, axis=1)
            u_targets = self.discount_factor * (
                r_next[range(self.mini_batch_size), next_a] +
                u_target_next[range(self.mini_batch_size), next_a])
        else:
            r_all, u_all, _, _ = self.model.predict_r_u_q_p(
                sess, np.concatenate((states, next_states)))
            _, u = r_all[:len(states)], u_all[:len(states)]
            r_next, u_next = r_all[len(states):], u_all[len(states):]

            # Q(s, a) <- R(s, a) + gamma * max a' [ Q(s', a') ]
            # Q(s, a) <- R(s, a) + U(s, a)
            # U(s, a) <- gamma * max a' [ Q(s', a') ]
            # U(s, a) <- gamma * max a' [ R(s', a') + U(s', a') ]
            u_targets = self.discount_factor * np.max(r_next + u_next, axis=1)

        # Next expected return for terminal states is 0
        u_targets[is_terminal] = 0.0

        u[range(self.mini_batch_size), actions] = u_targets
        return self.model.train_u(
            sess, states, u, average_episode_reward=self.mean_reward.value)
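
The double branch of train_on_replay packs its target into one expression, so here is a small standalone numeric illustration of the same computation with made-up predictions (it is not part of DunStrategy): the online head picks the next action via argmax of R + U, and the target head's U' evaluates it.

import numpy as np

gamma = 0.96
r_next = np.array([[0.1, 0.5], [1.0, 0.2]])         # hypothetical R(s', a') from the online head
u_next = np.array([[0.3, 0.4], [0.0, 0.6]])         # hypothetical U(s', a') from the online head
u_target_next = np.array([[0.2, 0.5], [0.1, 0.4]])  # hypothetical U'(s', a') from the target head

next_a = np.argmax(r_next + u_next, axis=1)         # action selection with the online head
batch = np.arange(len(next_a))
u_targets = gamma * (r_next[batch, next_a] + u_target_next[batch, next_a])
print(next_a, u_targets)  # [1 0] and approximately [0.96 1.056]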