    # Per-step bookkeeping: store the sampled transition and advance the observation.
    actions.append(a)
    dones.append(d)
    rewards.append(r)
    logp_ts.append(logp_t)
    o = n_o

    if d:
        ep += 1
        if ep % record_score_size == 0:
            if int(ep / record_score_size) < 600:
                writer.add_scalar('data/reward',
                                  record_score / record_score_size,
                                  int(ep / record_score_size))
                record_score = 0
        writer.add_scalar('data/reward_per_episode', score, ep)
        print(score, ep)
        score = 0
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

# End of the rollout: bootstrap the value of the last observation, compute GAE
# advantages and value targets, run the update, log diagnostics, and clear the buffers.
a, v_t, logp_t = agent.get_action([o])
values.append(v_t[0])
next_value = values[1:]
value = values[:-1]
adv, target = get_gaes(rewards, dones, value, next_value,
                       agent.gamma, agent.lamda, False)
value_loss, kl, ent = agent.update(states, actions, target, adv, logp_ts)
writer.add_scalar('data/value_loss_per_rollout', value_loss, rollout)
writer.add_scalar('data/kl_per_rollout', kl, rollout)
writer.add_scalar('data/ent_per_rollout', ent, rollout)
writer.add_scalar('data/reward_per_rollout', score_rollout, rollout)
values, states, actions, dones, logp_ts, rewards = [], [], [], [], [], []
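All three scripts in this section call a get_gaes helper whose body is not shown. For reference, here is a minimal sketch of a GAE(lambda) computation with the same call signature (rewards, dones, values, next_values, gamma, lamda, normalize) and the same (advantages, targets) return order; the actual helper in the source code may differ in detail.

import numpy as np

def get_gaes(rewards, dones, values, next_values, gamma, lamda, normalize):
    # One-step TD errors: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    deltas = [r + gamma * nv * (1 - d) - v
              for r, d, v, nv in zip(rewards, dones, values, next_values)]
    gaes = np.array(deltas, dtype=np.float32)
    # Backward recursion: A_t = delta_t + gamma * lamda * (1 - done_t) * A_{t+1}
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] = gaes[t] + gamma * lamda * (1 - dones[t]) * gaes[t + 1]
    # Value targets are advantages plus the value baseline.
    targets = gaes + np.array(values, dtype=np.float32)
    if normalize:
        gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
    return gaes, targets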
        # The tracked worker finished an episode: log its score and reset it.
        print(episode, score)
        score = 0
    # Advance every worker to its next state.
    states = next_states

# After the rollout: flatten the [num_step, num_worker, ...] buffers into a single
# batch in which each worker's num_step transitions are contiguous.
total_state = np.stack(total_state).transpose([1, 0, 2]).reshape([-1, state_size])
total_next_state = np.stack(total_next_state).transpose([1, 0, 2]).reshape([-1, state_size])
total_reward = np.stack(total_reward).transpose().reshape([-1])
total_done = np.stack(total_done).transpose().reshape([-1])
total_action = np.stack(total_action).transpose().reshape([-1])

# Compute GAE advantages and value targets separately for each worker's trajectory.
total_target, total_adv = [], []
for idx in range(num_worker):
    value, next_value = agent.get_value(
        total_state[idx * num_step:(idx + 1) * num_step],
        total_next_state[idx * num_step:(idx + 1) * num_step])
    adv, target = get_gaes(
        total_reward[idx * num_step:(idx + 1) * num_step],
        total_done[idx * num_step:(idx + 1) * num_step],
        value, next_value, agent.gamma, agent.lamda, normalize)
    total_target.append(target)
    total_adv.append(adv)

# One update over the combined batch, then log the average rollout reward and save.
agent.train_model(total_state, total_action,
                  np.hstack(total_target), np.hstack(total_adv))
writer.add_scalar('data/reward_per_rollout',
                  sum(total_reward) / num_worker, global_update)
saver.save(sess, 'lunarlander_a2c/model')
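The transpose and reshape above are what make the per-worker slicing in the loop valid: stacking the buffers gives shape [num_step, num_worker, ...], and moving the worker axis to the front before flattening puts each worker's num_step transitions next to each other. A small self-contained check (the sizes 3, 2, 4 are made up for the example):

import numpy as np

num_step, num_worker, state_size = 3, 2, 4
# Fake rollout buffer: one entry per step, one row per worker.
buf = np.arange(num_step * num_worker * state_size).reshape(num_step, num_worker, state_size)

flat = buf.transpose([1, 0, 2]).reshape([-1, state_size])

# Rows idx*num_step:(idx+1)*num_step of `flat` are worker idx's states in time order.
for idx in range(num_worker):
    assert np.array_equal(flat[idx * num_step:(idx + 1) * num_step], buf[:, idx, :])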
        # Accumulate this transition into the per-update buffers.
        score += reward
        total_state.append(state)
        total_next_state.append(next_state)
        total_done.append(done)
        total_reward.append(reward)
        total_action.append(action)
        state = next_state

    # Every train_size episodes: build a batch, compute GAE targets, update the
    # agent, log the average episode score, save, and reset the buffers.
    if ep % train_size == 0:
        update_step += 1
        total_state = np.stack(total_state)
        total_next_state = np.stack(total_next_state)
        total_reward = np.stack(total_reward)
        total_done = np.stack(total_done)
        total_action = np.stack(total_action)
        value, next_value = agent.get_value(total_state, total_next_state)
        adv, target = utils.get_gaes(total_reward, total_done, value, next_value,
                                     agent.gamma, agent.lamda, False)
        agent.train_model(total_state, total_action, target, adv)
        print(update_step, score / train_size)
        if update_step < 300:
            writer.add_scalar('data/reward', score / train_size, update_step)
        saver.save(sess, 'pendulum_ppo/model')
        total_state, total_reward, total_done, total_next_state, total_action = [], [], [], [], []
        score = 0
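The losses themselves live inside agent.update and agent.train_model, which are not shown here. The third script saves under pendulum_ppo and the first logs a KL and entropy term, so the policy part of those updates is presumably PPO's clipped surrogate objective. A numpy sketch of just the objective value, assuming per-sample new/old log-probabilities and advantages; the function name and clip_ratio default are illustrative, not taken from the source:

import numpy as np

def ppo_clip_objective(logp_new, logp_old, adv, clip_ratio=0.2):
    # Probability ratio pi_new(a|s) / pi_old(a|s), recovered from log-probabilities.
    ratio = np.exp(np.asarray(logp_new) - np.asarray(logp_old))
    adv = np.asarray(adv)
    clipped = np.clip(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    # PPO maximizes the elementwise minimum of the unclipped and clipped terms,
    # which removes any incentive to push the ratio outside [1 - eps, 1 + eps].
    return np.mean(np.minimum(ratio * adv, clipped * adv))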