Example #1
    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        with writer.as_default():
            tf.summary.scalar("total reward", log['total_reward'], i_iter)
            tf.summary.scalar("average reward", log['avg_reward'], i_iter)
            tf.summary.scalar("min reward", log['min_episode_reward'], i_iter)
            tf.summary.scalar("max reward", log['max_episode_reward'], i_iter)
            tf.summary.scalar("num steps", log['num_steps'], i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = NDOUBLE(batch.state)
        batch_action = NDOUBLE(batch.action)
        batch_reward = NDOUBLE(batch.reward)
        batch_mask = NDOUBLE(batch.mask)
        batch_value = self.value_net(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)
        v_loss, p_loss = vpg_step(self.policy_net, self.value_net,
                                  self.optimizer_p, self.optimizer_v,
                                  self.vpg_epochs, batch_state, batch_action,
                                  batch_return, batch_advantage)
        return v_loss, p_loss
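
`NDOUBLE`, `estimate_advantages`, and `vpg_step` are repository helpers that are not shown in this listing. As a rough guide only, here is a minimal GAE(λ)-style sketch of what `estimate_advantages` could compute, assuming `gamma` is the discount factor, `tau` the GAE smoothing factor, and that rewards, masks, and values arrive in rollout order; the advantage normalization is an extra assumption, not taken from the example.

import numpy as np
import tensorflow as tf

def estimate_advantages(rewards, masks, values, gamma, tau):
    """Hypothetical GAE(lambda) estimator consistent with the call above.

    mask is 0 at terminal steps and 1 otherwise, which stops bootstrapping
    across episode boundaries.
    """
    rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)
    masks = np.asarray(masks, dtype=np.float64).reshape(-1)
    values = np.asarray(values, dtype=np.float64).reshape(-1)

    advantages = np.zeros_like(rewards)
    prev_value, prev_advantage = 0.0, 0.0
    for t in reversed(range(len(rewards))):
        # TD(0) residual, then the exponentially weighted GAE recursion
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = delta + gamma * tau * prev_advantage * masks[t]
        prev_value, prev_advantage = values[t], advantages[t]

    returns = advantages + values
    # normalizing advantages is a common (assumed) choice
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return tf.constant(advantages[:, None]), tf.constant(returns[:, None])
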
Example #2
    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        with writer.as_default():
            tf.summary.scalar("total reward", log['total_reward'], i_iter)
            tf.summary.scalar("average reward", log['avg_reward'], i_iter)
            tf.summary.scalar("min reward", log['min_episode_reward'], i_iter)
            tf.summary.scalar("max reward", log['max_episode_reward'], i_iter)
            tf.summary.scalar("num steps", log['num_steps'], i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = NDOUBLE(batch.state)
        batch_action = NDOUBLE(batch.action)
        batch_reward = NDOUBLE(batch.reward)
        batch_mask = NDOUBLE(batch.mask)

        log_stats = {}
        for _ in range(self.reinforce_epochs):
            log_stats = reinforce_step(self.policy_net, self.optimizer_p,
                                       batch_state, batch_action, batch_reward,
                                       batch_mask, self.gamma)
        with writer.as_default():
            tf.summary.scalar("policy loss", log_stats["policy_loss"], i_iter)
        return log_stats
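
`reinforce_step` itself is not listed. A minimal sketch of such an update in TF2, under the assumption that the policy exposes a `get_log_prob(states, actions)` method and that returns are Monte-Carlo returns-to-go reset at episode boundaries via the mask:

import numpy as np
import tensorflow as tf

def reinforce_step(policy_net, optimizer_p, states, actions, rewards, masks, gamma):
    """Hypothetical REINFORCE update consistent with the call above."""
    rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)
    masks = np.asarray(masks, dtype=np.float64).reshape(-1)

    # Monte-Carlo return-to-go, computed backwards; mask == 0 resets the
    # running sum at episode boundaries.
    returns = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * masks[t]
        returns[t] = running
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    with tf.GradientTape() as tape:
        # get_log_prob(states, actions) is an assumed policy method
        log_probs = tf.squeeze(policy_net.get_log_prob(states, actions))
        policy_loss = -tf.reduce_mean(log_probs * tf.cast(returns, log_probs.dtype))
    grads = tape.gradient(policy_loss, policy_net.trainable_variables)
    optimizer_p.apply_gradients(zip(grads, policy_net.trainable_variables))
    return {"policy_loss": policy_loss.numpy()}
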
Example #3
    def update(self, batch):
        batch_state = NDOUBLE(batch.state)
        batch_action = NLONG(batch.action)
        batch_reward = NDOUBLE(batch.reward)
        batch_next_state = NDOUBLE(batch.next_state)
        batch_mask = NDOUBLE(batch.mask)

        alg_step_stats = duelingdqn_step(self.value_net, self.optimizer, self.value_net_target, batch_state, batch_action,
                                         batch_reward, batch_next_state, batch_mask, self.gamma)
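
The dueling-specific part lives inside `value_net` and `duelingdqn_step`, neither of which is shown here. For orientation, here is a sketch of a dueling Q-network head in Keras, which splits the representation into a state-value stream and an advantage stream and recombines them as Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a)); this is an illustration, not the repository's `value_net`.

import tensorflow as tf
from tensorflow.keras import layers

class DuelingQNet(tf.keras.Model):
    """Illustrative dueling architecture."""

    def __init__(self, num_actions, hidden=128):
        super().__init__()
        self.feature = layers.Dense(hidden, activation="relu")
        self.value_stream = layers.Dense(1)            # V(s)
        self.adv_stream = layers.Dense(num_actions)    # A(s, a)

    def call(self, states):
        h = self.feature(states)
        v = self.value_stream(h)
        a = self.adv_stream(h)
        # Subtracting the mean advantage keeps the V/A decomposition identifiable.
        return v + (a - tf.reduce_mean(a, axis=1, keepdims=True))
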
Example #4
    def update(self, batch, global_steps):
        batch_state = NDOUBLE(batch.state)
        batch_action = NLONG(batch.action)
        batch_reward = NDOUBLE(batch.reward)
        batch_next_state = NDOUBLE(batch.next_state)
        batch_mask = NDOUBLE(batch.mask)

        doubledqn_step(self.value_net, self.optimizer, self.value_net_target, batch_state, batch_action,
                       batch_reward, batch_next_state, batch_mask, self.gamma, self.polyak,
                       global_steps % self.update_target_gap == 0)
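
`doubledqn_step` is not listed either. A hedged sketch of the double-DQN update it presumably performs, mirroring the call above: the online network selects the next action, the target network evaluates it, and the target weights are refreshed by Polyak averaging whenever the `update_target` flag is true.

import tensorflow as tf

def doubledqn_step(value_net, optimizer, value_net_target, states, actions,
                   rewards, next_states, masks, gamma, polyak, update_target):
    """Hypothetical double-DQN update matching the call above."""
    # Decouple selection and evaluation: the online net picks the next action,
    # the target net scores it.
    next_actions = tf.argmax(value_net(next_states), axis=1)
    next_q = tf.gather(value_net_target(next_states), next_actions,
                       axis=1, batch_dims=1)
    targets = (tf.cast(rewards, next_q.dtype)
               + gamma * tf.cast(masks, next_q.dtype) * next_q)

    with tf.GradientTape() as tape:
        q = tf.gather(value_net(states), tf.cast(actions, tf.int64),
                      axis=1, batch_dims=1)
        loss = tf.reduce_mean(tf.square(q - tf.stop_gradient(targets)))
    grads = tape.gradient(loss, value_net.trainable_variables)
    optimizer.apply_gradients(zip(grads, value_net.trainable_variables))

    if update_target:
        # soft (Polyak) update of the target network
        for w, w_t in zip(value_net.trainable_variables,
                          value_net_target.trainable_variables):
            w_t.assign(polyak * w_t + (1.0 - polyak) * w)
    return {"value_loss": loss.numpy()}
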
Example #5
    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        with writer.as_default():
            tf.summary.scalar("total reward", log["total_reward"], i_iter)
            tf.summary.scalar("average reward", log["avg_reward"], i_iter)
            tf.summary.scalar("min reward", log["min_episode_reward"], i_iter)
            tf.summary.scalar("max reward", log["max_episode_reward"], i_iter)
            tf.summary.scalar("num steps", log["num_steps"], i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = NDOUBLE(batch.state)
        batch_action = NDOUBLE(batch.action)
        batch_reward = NDOUBLE(batch.reward)
        batch_mask = NDOUBLE(batch.mask)
        batch_log_prob = NDOUBLE(batch.log_prob)[:, None]
        batch_value = tf.stop_gradient(self.value_net(batch_state))

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau
        )
        # update by TRPO
        log_stats = trpo_step(
            self.policy_net,
            self.value_net,
            self.optimizer_v,
            batch_state,
            batch_action,
            batch_log_prob,
            batch_advantage,
            batch_return,
            max_kl=self.max_kl,
            cg_damping=self.damping,
            vf_iters=10
        )

        with writer.as_default():
            for k, v in log_stats.items():
                tf.summary.scalar(k, v, i_iter)
        writer.flush()
        return log_stats
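
`trpo_step` is only called here, not defined. One piece worth sketching is the backtracking line search that TRPO-style updates typically run after computing the natural-gradient step; the helper names below (`get_flat_params`, `set_flat_params`, `surrogate_loss`, `mean_kl`) are assumptions, and the acceptance test is a simplified one (surrogate improves and KL stays within `max_kl`).

def backtracking_line_search(policy_net, get_flat_params, set_flat_params,
                             surrogate_loss, mean_kl, full_step, max_kl,
                             max_backtracks=10):
    """Illustrative backtracking line search of the kind trpo_step typically
    performs after solving for the step direction.

    surrogate_loss() and mean_kl() are assumed to evaluate the surrogate
    objective and the mean KL to the old policy at the current parameters.
    """
    old_params = get_flat_params(policy_net)
    old_loss = surrogate_loss()
    for step_frac in (0.5 ** i for i in range(max_backtracks)):
        set_flat_params(policy_net, old_params + step_frac * full_step)
        # accept the shrunken step only if the surrogate improves and the
        # KL trust-region constraint still holds (simplified criterion)
        if surrogate_loss() < old_loss and mean_kl() <= max_kl:
            return True
    set_flat_params(policy_net, old_params)  # no acceptable step found
    return False
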
Example #6
    def choose_action(self, state):
        """select action"""
        state = np.expand_dims(NDOUBLE(state), 0)
        action, log_prob = self.policy_net.get_action_log_prob(state)

        action = action.numpy()[0]
        return action
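
Both this method and `collect_samples` below rely on a `get_action_log_prob(state)` method of the policy. A minimal diagonal-Gaussian policy exposing that interface could look like the sketch below; it illustrates the assumed interface and is not the repository's own network.

import numpy as np
import tensorflow as tf

class GaussianPolicy(tf.keras.Model):
    """Illustrative diagonal-Gaussian policy with get_action_log_prob."""

    def __init__(self, num_actions, hidden=128):
        super().__init__()
        self.body = tf.keras.layers.Dense(hidden, activation="tanh")
        self.mean_head = tf.keras.layers.Dense(num_actions)
        self.log_std = tf.Variable(tf.zeros(num_actions), trainable=True)

    def call(self, states):
        return self.mean_head(self.body(states))

    def get_action_log_prob(self, states):
        mean = self(states)
        std = tf.exp(self.log_std)
        action = mean + std * tf.random.normal(tf.shape(mean))
        # log density of a diagonal Gaussian, summed over action dimensions
        log_prob = tf.reduce_sum(
            -0.5 * tf.square((action - mean) / std)
            - self.log_std
            - 0.5 * np.log(2.0 * np.pi),
            axis=-1)
        return action, log_prob
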
Example #7
    def choose_action(self, state):
        """select action (epsilon-greedy; here epsilon is the probability of exploiting)"""
        state = np.expand_dims(NDOUBLE(state), 0)
        if np.random.uniform() <= self.epsilon:  # exploit: greedy action from the value network
            action = self.value_net.get_action(state)
            action = action.numpy()[0]
        else:  # explore: uniformly random action
            action = np.random.randint(0, self.num_actions)
        return action
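
Note the convention in this variant: `self.epsilon` is the probability of acting greedily, so exploration shrinks as epsilon grows toward 1. A hypothetical linear schedule compatible with that convention (all names below are illustrative):

# epsilon is the probability of *exploiting*, so it is annealed upwards
epsilon_start, epsilon_end, decay_steps = 0.1, 0.95, 50_000

def epsilon_at(step):
    frac = min(step / decay_steps, 1.0)
    return epsilon_start + frac * (epsilon_end - epsilon_start)
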
Example #8
def collect_samples(pid, queue, env, policy, render, running_state,
                    min_batch_size):
    log = dict()
    memory = Memory()
    num_steps = 0
    num_episodes = 0

    min_episode_reward = float('inf')
    max_episode_reward = float('-inf')
    total_reward = 0

    while num_steps < min_batch_size:
        state = env.reset()
        episode_reward = 0
        if running_state:
            state = running_state(state)

        for t in range(10000):
            if render:
                env.render()
            state_tensor = np.expand_dims(NDOUBLE(state), 0)
            action, log_prob = policy.get_action_log_prob(state_tensor)
            action = action.numpy()[0]
            log_prob = log_prob.numpy()[0] if log_prob is not None else None
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward

            if running_state:
                next_state = running_state(next_state)

            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
            memory.push(state, action, reward, next_state, mask, log_prob)
            num_steps += 1
            if done or num_steps >= min_batch_size:
                break

            state = next_state

        # num_steps += (t + 1)
        num_episodes += 1
        total_reward += episode_reward
        min_episode_reward = min(episode_reward, min_episode_reward)
        max_episode_reward = max(episode_reward, max_episode_reward)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_episode_reward'] = max_episode_reward
    log['min_episode_reward'] = min_episode_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
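
`Memory` is only used through `push` and `sample`, and the attribute access (`batch.state`, `batch.action`, ...) in the other examples implies that `sample()` returns a namedtuple of per-field sequences. A minimal buffer matching that usage, offered as an assumption about the interface rather than the actual class:

import random
from collections import namedtuple

Transition = namedtuple(
    'Transition', ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob'))

class Memory:
    """Minimal rollout buffer consistent with the usage above (illustrative)."""

    def __init__(self):
        self.memory = []

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size=None):
        # with no batch_size, return all stored transitions as one batch
        batch = self.memory if batch_size is None else random.sample(self.memory, batch_size)
        # transpose the list of Transitions into a Transition of tuples
        return Transition(*zip(*batch))

    def append(self, other):
        # used when merging per-worker memories collected via the queue
        self.memory += other.memory

    def __len__(self):
        return len(self.memory)
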
Example #9
    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        with writer.as_default():
            tf.summary.scalar("total reward", log['total_reward'], i_iter)
            tf.summary.scalar("average reward", log['avg_reward'], i_iter)
            tf.summary.scalar("min reward", log['min_episode_reward'], i_iter)
            tf.summary.scalar("max reward", log['max_episode_reward'], i_iter)
            tf.summary.scalar("num steps", log['num_steps'], i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = NDOUBLE(batch.state)
        batch_action = NDOUBLE(batch.action)
        batch_reward = NDOUBLE(batch.reward)
        batch_mask = NDOUBLE(batch.mask)
        batch_log_prob = NDOUBLE(batch.log_prob)[:, None]
        batch_value = tf.stop_gradient(self.value_net(batch_state))

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)
        log_stats = {}
        if self.ppo_mini_batch_size:
            batch_size = batch_state.shape[0]
            mini_batch_num = batch_size // self.ppo_mini_batch_size

            for e in range(self.ppo_epochs):
                perm = np.random.permutation(batch_size)
                for i in range(mini_batch_num):
                    ind = perm[slice(
                        i * self.ppo_mini_batch_size,
                        min(batch_size, (i + 1) * self.ppo_mini_batch_size))]
                    log_stats = ppo_step(self.policy_net, self.value_net,
                                         self.optimizer_p, self.optimizer_v, 1,
                                         batch_state[ind], batch_action[ind],
                                         batch_return[ind],
                                         batch_advantage[ind],
                                         batch_log_prob[ind],
                                         self.clip_epsilon)

        else:
            for _ in range(self.ppo_epochs):
                log_stats = ppo_step(self.policy_net, self.value_net,
                                     self.optimizer_p, self.optimizer_v, 1,
                                     batch_state, batch_action, batch_return,
                                     batch_advantage, batch_log_prob,
                                     self.clip_epsilon)

        with writer.as_default():
            tf.summary.histogram("ratio", log_stats["ratio"], i_iter)
            tf.summary.scalar("policy loss", log_stats["policy_loss"], i_iter)
            tf.summary.scalar("critic loss", log_stats["critic_loss"], i_iter)
        writer.flush()
        return log_stats
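
`ppo_step` is not shown. Its policy part is normally the clipped-surrogate objective, which also produces the `ratio` logged above; a minimal sketch, assuming the policy exposes `get_log_prob(states, actions)`:

import tensorflow as tf

def ppo_clipped_policy_loss(policy_net, states, actions, advantages,
                            old_log_probs, clip_epsilon):
    """Clipped-surrogate objective a ppo_step implementation typically
    minimizes (illustrative; get_log_prob is an assumed policy method)."""
    log_probs = policy_net.get_log_prob(states, actions)
    ratio = tf.exp(log_probs - tf.cast(old_log_probs, log_probs.dtype))
    advantages = tf.cast(advantages, ratio.dtype)
    clipped = tf.clip_by_value(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon)
    # pessimistic bound: element-wise minimum of the unclipped and clipped surrogates
    policy_loss = -tf.reduce_mean(
        tf.minimum(ratio * advantages, clipped * advantages))
    return policy_loss, ratio
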