        MINI_BATCH)   # (truncated) end of the batch-sampling call
        next_state_action_values = np.max(target_dqn.predict(next_states / 255.0), axis=1)
        y_true = dqn.predict(states / 255.0)  # y_true.shape: (MINI_BATCH, num_actions), i.e., (32, 6)
        # Bellman target for the taken actions; np.invert(dones) assumes dones is a
        # boolean array and zeroes the bootstrap term for terminal transitions
        y_true[range(MINI_BATCH), actions] = rewards + GAMMA * next_state_action_values * np.invert(dones)
        dqn.train(states / 255.0, y_true)

        step += 1

    total_episode_rewards.append(cur_episode_reward)

    # Checkpoint every 100 episodes
    if episode % 100 == 0:
        dqn.save(MODEL_DIR, 'dqn-{}'.format(episode))

    # Stop once the mean reward over the last 30 episodes exceeds 19
    if np.mean(total_episode_rewards[-30:]) > 19:
        dqn.save(MODEL_DIR, 'dqn-{}'.format(episode))
        break

np.save(os.path.join(RES_DIR, 'episode_rewards.npy'), np.array(total_episode_rewards))

# Plot episode rewards
plt.figure()
plt.title('EPISODE - REWARD')
plt.plot(range(len(total_episode_rewards)), total_episode_rewards, linewidth=2)
plt.xlabel('episode')
plt.ylabel('reward')
plt.savefig(os.path.join(IMG_DIR, 'episode_reward.png'))
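The batch-sampling call that feeds this update is cut off at the top of the listing. As a minimal sketch, assuming a simple uniform replay buffer (the ReplayBuffer class below and its field layout are illustrative, not the code used above), a sample() that returns the NumPy arrays the update expects could look like this:

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Hypothetical fixed-size buffer of (state, action, reward, next_state, done) tuples."""

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        # Return dones as a boolean array so np.invert(dones) zeroes the bootstrap
        # term for terminal transitions, as in the update above.
        return states, actions, rewards, next_states, dones.astype(bool)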
        # Prepare the data batch
        for i in range(batch_size):
            states[i] = experiences_batch[i][0]
            actions.append(experiences_batch[i][1])
            next_states[i] = experiences_batch[i][2]
            rewards.append(experiences_batch[i][3])

        current_q_values = policy_net.predict(states)       # Q(s, .) for the sampled states
        target_q_values = target_net.predict(next_states)   # Q_target(s', .) for the next states

        # Create Q_targets: start from the current predictions so that only the taken
        # action carries an error, then set that entry to r + gamma * max_a' Q_target(s', a')
        q_targets = current_q_values.copy()
        for i in range(batch_size):
            q_targets[i][actions[i]] = rewards[i] + gamma * np.amax(target_q_values[i])

        # Train the policy network
        policy_net.train(states, q_targets)

        if environment_manager.done:
            max_reward = max(max_reward, max_episode_reward)
            print("Episode: {}  Episode reward: {}  Max reward: {}  Epsilon: {}".format(
                episode, max_episode_reward, max_reward, strategy.get_actual_exploration_rate()))
            break

    # Update the target network and save the policy network
    if episode % target_update == 0:
        target_net.copy_weights_from_nn(policy_net)
        policy_net.save(episode, strategy.get_actual_exploration_rate())
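copy_weights_from_nn performs the periodic hard update of the target network from the policy network, controlled by target_update. The wrapper classes are not shown here; assuming they hold Keras models (an assumption, not something the listing confirms), the sync itself reduces to copying weight arrays:

import tensorflow as tf  # assumed framework; the original network wrappers are not shown


def copy_weights(target_model: tf.keras.Model, policy_model: tf.keras.Model) -> None:
    """Hard update: overwrite the target network's weights with the policy network's."""
    target_model.set_weights(policy_model.get_weights())

A soft (Polyak) update would instead blend the two weight sets every step; the listing above uses the periodic hard update from the original DQN paper.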