def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)

    print(
        f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
        f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
        f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
    )

    # record reward information
    with writer.as_default():
        tf.summary.scalar("total reward", log['total_reward'], i_iter)
        tf.summary.scalar("average reward", log['avg_reward'], i_iter)
        tf.summary.scalar("min reward", log['min_episode_reward'], i_iter)
        tf.summary.scalar("max reward", log['max_episode_reward'], i_iter)
        tf.summary.scalar("num steps", log['num_steps'], i_iter)

    batch = memory.sample()  # sample all items in memory
    batch_state = NDOUBLE(batch.state)
    batch_action = NDOUBLE(batch.action)
    batch_reward = NDOUBLE(batch.reward)
    batch_mask = NDOUBLE(batch.mask)

    batch_value = self.value_net(batch_state)
    batch_advantage, batch_return = estimate_advantages(
        batch_reward, batch_mask, batch_value, self.gamma, self.tau)

    v_loss, p_loss = vpg_step(self.policy_net, self.value_net,
                              self.optimizer_p, self.optimizer_v,
                              self.vpg_epochs, batch_state, batch_action,
                              batch_return, batch_advantage)
    return v_loss, p_loss
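# --- Hedged sketch, not the repo's actual implementation -------------------
# `estimate_advantages` above is assumed to implement Generalized Advantage
# Estimation, GAE(gamma, tau), over a flat batch in which `masks[t] == 0`
# marks the end of an episode. The final normalization is a common choice,
# not something the call site confirms.
import numpy as np


def estimate_advantages_sketch(rewards, masks, values, gamma, tau):
    """Backward pass: delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)."""
    rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)
    masks = np.asarray(masks, dtype=np.float64).reshape(-1)
    values = np.asarray(values, dtype=np.float64).reshape(-1)

    advantages = np.zeros_like(rewards)
    prev_value, prev_advantage = 0.0, 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = delta + gamma * tau * prev_advantage * masks[t]
        prev_value, prev_advantage = values[t], advantages[t]

    returns = advantages + values  # regression targets for the value net
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages, returns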
def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)

    print(
        f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
        f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
        f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
    )

    # record reward information
    with writer.as_default():
        tf.summary.scalar("total reward", log['total_reward'], i_iter)
        tf.summary.scalar("average reward", log['avg_reward'], i_iter)
        tf.summary.scalar("min reward", log['min_episode_reward'], i_iter)
        tf.summary.scalar("max reward", log['max_episode_reward'], i_iter)
        tf.summary.scalar("num steps", log['num_steps'], i_iter)

    batch = memory.sample()  # sample all items in memory
    batch_state = NDOUBLE(batch.state)
    batch_action = NDOUBLE(batch.action)
    batch_reward = NDOUBLE(batch.reward)
    batch_mask = NDOUBLE(batch.mask)

    log_stats = {}
    for _ in range(self.reinforce_epochs):
        log_stats = reinforce_step(self.policy_net, self.optimizer_p,
                                   batch_state, batch_action, batch_reward,
                                   batch_mask, self.gamma)

    with writer.as_default():
        tf.summary.scalar("policy loss", log_stats["policy_loss"], i_iter)

    return log_stats
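# --- Hedged sketch, not the repo's actual implementation -------------------
# A plausible `reinforce_step`: Monte-Carlo policy gradient on discounted
# returns, consistent with the arguments passed above. The method name
# `policy_net.get_log_prob(states, actions)` is an assumption.
import numpy as np
import tensorflow as tf


def reinforce_step_sketch(policy_net, optimizer, states, actions,
                          rewards, masks, gamma):
    # discounted return-to-go, resetting at episode boundaries (mask == 0)
    rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)
    masks = np.asarray(masks, dtype=np.float64).reshape(-1)
    returns = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * masks[t]
        returns[t] = running
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    with tf.GradientTape() as tape:
        # flatten to [B] so broadcasting cannot silently create [B, B]
        log_probs = tf.reshape(policy_net.get_log_prob(states, actions), [-1])
        policy_loss = -tf.reduce_mean(
            log_probs * tf.cast(returns, log_probs.dtype))
    grads = tape.gradient(policy_loss, policy_net.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy_net.trainable_variables))
    return {"policy_loss": policy_loss}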
def update(self, batch):
    batch_state = NDOUBLE(batch.state)
    batch_action = NLONG(batch.action)
    batch_reward = NDOUBLE(batch.reward)
    batch_next_state = NDOUBLE(batch.next_state)
    batch_mask = NDOUBLE(batch.mask)

    alg_step_stats = duelingdqn_step(self.value_net, self.optimizer,
                                     self.value_net_target, batch_state,
                                     batch_action, batch_reward,
                                     batch_next_state, batch_mask,
                                     self.gamma)
    return alg_step_stats
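# --- Hedged sketch, not the repo's actual implementation -------------------
# In dueling DQN the decomposition Q = V + (A - mean(A)) lives inside
# `value_net`; the training step itself is a standard TD update against a
# frozen target network, which is what this sketch shows. It assumes the
# networks run in float64, matching the NDOUBLE conversions above.
import tensorflow as tf


def duelingdqn_step_sketch(value_net, optimizer, value_net_target,
                           states, actions, rewards, next_states, masks,
                           gamma):
    q_next = value_net_target(next_states)                       # [B, A]
    q_target = rewards + gamma * masks * tf.reduce_max(q_next, axis=-1)
    q_target = tf.stop_gradient(q_target)

    with tf.GradientTape() as tape:
        q_all = value_net(states)                                # [B, A]
        idx = tf.stack([tf.range(tf.shape(q_all)[0]),
                        tf.cast(actions, tf.int32)], axis=1)
        q_taken = tf.gather_nd(q_all, idx)                       # [B]
        value_loss = tf.reduce_mean(tf.square(q_target - q_taken))
    grads = tape.gradient(value_loss, value_net.trainable_variables)
    optimizer.apply_gradients(zip(grads, value_net.trainable_variables))
    return {"value_loss": value_loss}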
def update(self, batch, global_steps):
    batch_state = NDOUBLE(batch.state)
    batch_action = NLONG(batch.action)
    batch_reward = NDOUBLE(batch.reward)
    batch_next_state = NDOUBLE(batch.next_state)
    batch_mask = NDOUBLE(batch.mask)

    # refresh the target network every `update_target_gap` global steps
    doubledqn_step(self.value_net, self.optimizer, self.value_net_target,
                   batch_state, batch_action, batch_reward, batch_next_state,
                   batch_mask, self.gamma, self.polyak,
                   global_steps % self.update_target_gap == 0)
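# --- Hedged sketch, not the repo's actual implementation -------------------
# A plausible `doubledqn_step`: the online net selects the argmax action,
# the target net evaluates it, and the target net is refreshed by polyak
# averaging only when the caller's final boolean flag is set. Float64
# networks are assumed, matching NDOUBLE.
import tensorflow as tf


def doubledqn_step_sketch(value_net, optimizer, value_net_target,
                          states, actions, rewards, next_states, masks,
                          gamma, polyak, update_target):
    # double Q-learning: selection by online net, evaluation by target net
    next_actions = tf.argmax(value_net(next_states), axis=-1)
    q_next_all = value_net_target(next_states)
    idx = tf.stack([tf.range(tf.shape(q_next_all)[0]),
                    tf.cast(next_actions, tf.int32)], axis=1)
    q_next = tf.gather_nd(q_next_all, idx)
    q_target = tf.stop_gradient(rewards + gamma * masks * q_next)

    with tf.GradientTape() as tape:
        q_all = value_net(states)
        idx = tf.stack([tf.range(tf.shape(q_all)[0]),
                        tf.cast(actions, tf.int32)], axis=1)
        q_taken = tf.gather_nd(q_all, idx)
        value_loss = tf.reduce_mean(tf.square(q_target - q_taken))
    grads = tape.gradient(value_loss, value_net.trainable_variables)
    optimizer.apply_gradients(zip(grads, value_net.trainable_variables))

    if update_target:  # soft (polyak) update of the target network
        for w, w_t in zip(value_net.weights, value_net_target.weights):
            w_t.assign(polyak * w_t + (1.0 - polyak) * w)
    return {"value_loss": value_loss}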
def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)

    print(
        f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
        f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
        f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
    )

    # record reward information
    with writer.as_default():
        tf.summary.scalar("total reward", log["total_reward"], i_iter)
        tf.summary.scalar("average reward", log["avg_reward"], i_iter)
        tf.summary.scalar("min reward", log["min_episode_reward"], i_iter)
        tf.summary.scalar("max reward", log["max_episode_reward"], i_iter)
        tf.summary.scalar("num steps", log["num_steps"], i_iter)

    batch = memory.sample()  # sample all items in memory
    batch_state = NDOUBLE(batch.state)
    batch_action = NDOUBLE(batch.action)
    batch_reward = NDOUBLE(batch.reward)
    batch_mask = NDOUBLE(batch.mask)
    batch_log_prob = NDOUBLE(batch.log_prob)[:, None]

    batch_value = tf.stop_gradient(self.value_net(batch_state))
    batch_advantage, batch_return = estimate_advantages(
        batch_reward, batch_mask, batch_value, self.gamma, self.tau
    )

    # update by TRPO
    log_stats = trpo_step(
        self.policy_net, self.value_net, self.optimizer_v,
        batch_state, batch_action, batch_log_prob, batch_advantage,
        batch_return, max_kl=self.max_kl, cg_damping=self.damping,
        vf_iters=10
    )

    with writer.as_default():
        for k, v in log_stats.items():
            tf.summary.scalar(k, v, i_iter)
        writer.flush()

    return log_stats
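# --- Hedged sketch, not the repo's actual implementation -------------------
# Only the two quantities at the heart of `trpo_step` are sketched here:
# the importance-sampled surrogate objective and the mean KL to the old
# policy. The real step additionally solves the KL-constrained ascent with
# conjugate gradient plus a backtracking line search, and fits the value
# net for `vf_iters` iterations; all of that is omitted. The method names
# on `policy_net` are assumptions.
import tensorflow as tf


def trpo_surrogate_sketch(policy_net, states, actions, old_log_probs,
                          advantages):
    """L(theta) = E[ exp(log pi - log pi_old) * A ], to be maximized."""
    log_probs = policy_net.get_log_prob(states, actions)
    ratio = tf.exp(log_probs - old_log_probs)
    return tf.reduce_mean(ratio * advantages)


def trpo_mean_kl_sketch(policy_net, states, old_dist):
    """Mean KL(pi_old || pi_theta) over the batch; constrained <= max_kl."""
    new_dist = policy_net.get_distribution(states)
    return tf.reduce_mean(old_dist.kl_divergence(new_dist))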
def choose_action(self, state):
    """select action"""
    state = np.expand_dims(NDOUBLE(state), 0)
    action, _ = self.policy_net.get_action_log_prob(state)  # log prob unused here
    action = action.numpy()[0]
    return action
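# --- Hedged sketch, not the repo's actual implementation -------------------
# `NDOUBLE` and `NLONG`, used throughout, are assumed to be thin converters
# to fixed-dtype numpy arrays, roughly:
import numpy as np


def NDOUBLE_sketch(x):
    """Assumed helper: nested lists/tuples -> float64 array."""
    return np.asarray(x, dtype=np.float64)


def NLONG_sketch(x):
    """Assumed helper: discrete action ids -> int64 array."""
    return np.asarray(x, dtype=np.int64)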
def choose_action(self, state):
    state = np.expand_dims(NDOUBLE(state), 0)
    # epsilon-greedy: here `self.epsilon` is the probability of exploiting
    if np.random.uniform() <= self.epsilon:
        # exploit: greedy action from the value network
        action = self.value_net.get_action(state)
        action = action.numpy()[0]
    else:
        # explore: uniform random action
        action = np.random.randint(0, self.num_actions)
    return action
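# --- Hedged sketch, not the repo's actual implementation -------------------
# `value_net.get_action` above is assumed to be a greedy argmax over the
# network's Q-values for the given state batch:
import tensorflow as tf


def get_action_sketch(value_net, state):
    """Greedy action: index of the largest Q-value."""
    return tf.argmax(value_net(state), axis=-1)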
def collect_samples(pid, queue, env, policy, render, running_state,
                    min_batch_size):
    log = dict()
    memory = Memory()
    num_steps = 0
    num_episodes = 0
    min_episode_reward = float('inf')
    max_episode_reward = float('-inf')
    total_reward = 0

    while num_steps < min_batch_size:
        state = env.reset()
        episode_reward = 0

        if running_state:
            state = running_state(state)

        for t in range(10000):
            if render:
                env.render()

            state_tensor = np.expand_dims(NDOUBLE(state), 0)
            action, log_prob = policy.get_action_log_prob(state_tensor)
            action = action.numpy()[0]
            log_prob = log_prob.numpy()[0] if log_prob is not None else None
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward

            if running_state:
                next_state = running_state(next_state)

            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
            memory.push(state, action, reward, next_state, mask, log_prob)
            num_steps += 1

            if done or num_steps >= min_batch_size:
                break

            state = next_state

        num_episodes += 1
        total_reward += episode_reward
        min_episode_reward = min(episode_reward, min_episode_reward)
        max_episode_reward = max(episode_reward, max_episode_reward)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_episode_reward'] = max_episode_reward
    log['min_episode_reward'] = min_episode_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
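# --- Hedged sketch, not the repo's actual implementation -------------------
# A `Memory` consistent with how `collect_samples` uses it: push() stores
# one transition, and sample() with no argument returns the whole buffer as
# a single Transition whose fields are column-stacked lists.
import random
from collections import namedtuple

Transition = namedtuple(
    'Transition',
    ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob'))


class MemorySketch:
    def __init__(self):
        self._storage = []

    def push(self, *transition):
        self._storage.append(Transition(*transition))

    def sample(self, batch_size=None):
        if batch_size is None:  # "sample all items in memory"
            batch = self._storage
        else:
            batch = random.sample(self._storage, batch_size)
        # transpose: list of Transitions -> Transition of lists
        return Transition(*zip(*batch))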
def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)

    print(
        f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
        f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
        f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
    )

    # record reward information
    with writer.as_default():
        tf.summary.scalar("total reward", log['total_reward'], i_iter)
        tf.summary.scalar("average reward", log['avg_reward'], i_iter)
        tf.summary.scalar("min reward", log['min_episode_reward'], i_iter)
        tf.summary.scalar("max reward", log['max_episode_reward'], i_iter)
        tf.summary.scalar("num steps", log['num_steps'], i_iter)

    batch = memory.sample()  # sample all items in memory
    batch_state = NDOUBLE(batch.state)
    batch_action = NDOUBLE(batch.action)
    batch_reward = NDOUBLE(batch.reward)
    batch_mask = NDOUBLE(batch.mask)
    batch_log_prob = NDOUBLE(batch.log_prob)[:, None]

    batch_value = tf.stop_gradient(self.value_net(batch_state))
    batch_advantage, batch_return = estimate_advantages(
        batch_reward, batch_mask, batch_value, self.gamma, self.tau)

    log_stats = {}
    if self.ppo_mini_batch_size:
        batch_size = batch_state.shape[0]
        mini_batch_num = batch_size // self.ppo_mini_batch_size

        for e in range(self.ppo_epochs):
            perm = np.random.permutation(batch_size)

            for i in range(mini_batch_num):
                ind = perm[slice(
                    i * self.ppo_mini_batch_size,
                    min(batch_size, (i + 1) * self.ppo_mini_batch_size))]
                log_stats = ppo_step(self.policy_net, self.value_net,
                                     self.optimizer_p, self.optimizer_v, 1,
                                     batch_state[ind], batch_action[ind],
                                     batch_return[ind], batch_advantage[ind],
                                     batch_log_prob[ind], self.clip_epsilon)
    else:
        for _ in range(self.ppo_epochs):
            log_stats = ppo_step(self.policy_net, self.value_net,
                                 self.optimizer_p, self.optimizer_v, 1,
                                 batch_state, batch_action, batch_return,
                                 batch_advantage, batch_log_prob,
                                 self.clip_epsilon)

    with writer.as_default():
        tf.summary.histogram("ratio", log_stats["ratio"], i_iter)
        tf.summary.scalar("policy loss", log_stats["policy_loss"], i_iter)
        tf.summary.scalar("critic loss", log_stats["critic_loss"], i_iter)
        writer.flush()

    return log_stats
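# --- Hedged sketch, not the repo's actual implementation -------------------
# A minimal clipped-surrogate `ppo_step` consistent with the call site
# above: the literal `1` is treated as an inner epoch count, and the
# returned dict carries the "ratio", "policy_loss" and "critic_loss"
# entries the caller logs. `policy_net.get_log_prob` is an assumed name.
import tensorflow as tf


def ppo_step_sketch(policy_net, value_net, optimizer_p, optimizer_v,
                    inner_epochs, states, actions, returns, advantages,
                    old_log_probs, clip_epsilon):
    # flatten to [B] so broadcasting cannot silently create [B, B] tensors
    returns = tf.reshape(returns, [-1])
    advantages = tf.reshape(advantages, [-1])
    old_log_probs = tf.reshape(old_log_probs, [-1])

    for _ in range(inner_epochs):
        # critic: regress V(s) onto the GAE returns
        with tf.GradientTape() as tape_v:
            values = tf.reshape(value_net(states), [-1])
            critic_loss = tf.reduce_mean(tf.square(returns - values))
        grads_v = tape_v.gradient(critic_loss, value_net.trainable_variables)
        optimizer_v.apply_gradients(
            zip(grads_v, value_net.trainable_variables))

        # actor: PPO clipped surrogate objective
        with tf.GradientTape() as tape_p:
            log_probs = tf.reshape(
                policy_net.get_log_prob(states, actions), [-1])
            ratio = tf.exp(log_probs - old_log_probs)
            surr1 = ratio * advantages
            surr2 = tf.clip_by_value(
                ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
            policy_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))
        grads_p = tape_p.gradient(policy_loss, policy_net.trainable_variables)
        optimizer_p.apply_gradients(
            zip(grads_p, policy_net.trainable_variables))

    return {"ratio": ratio, "policy_loss": policy_loss,
            "critic_loss": critic_loss}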