target_dqn.copy_from(dqn)  # sync the target network's parameters with the online DQN

if epsilon_greedy(step):
    # Explore: sample a random action.
    action = env.action_space.sample()
else:
    # Exploit: take the greedy action from the online network.
    action = dqn.get_action(state / 255.0)
# env.render()
next_frame, reward, done, _ = env.step(action)
next_state = np.array(next_frame)
buf.push(state, action, reward, next_state, done)
state = next_state
cur_episode_reward += reward

if buf.size() > MIN_BUFFER:
    states, actions, rewards, next_states, dones = buf.sample(MINI_BATCH)
    # Bootstrap from the target network: max_a' Q_target(s', a').
    next_state_action_values = np.max(
        target_dqn.predict(next_states / 255.0), axis=1)
    # y_true.shape: (MINI_BATCH, num_actions), e.g., (32, 6)
    y_true = dqn.predict(states / 255.0)
    # Overwrite only the taken action's Q-value with the Bellman target;
    # np.invert(dones) zeroes the bootstrap term for terminal transitions.
    y_true[range(MINI_BATCH), actions] = (
        rewards + GAMMA * next_state_action_values * np.invert(dones))
    dqn.train(states / 255.0, y_true)

step += 1
total_episode_rewards.append(cur_episode_reward)

if episode % 100 == 0:
    dqn.save(MODEL_DIR, 'dqn-{}'.format(episode))
if np.mean(total_episode_rewards[-30:]) > 19:
    dqn.save(MODEL_DIR, 'dqn-{}'.format(episode))
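The loop above relies on a replay buffer `buf` exposing `push`, `size`, and `sample`, but its implementation is not shown. Below is a minimal sketch of such a buffer, assuming uniform random sampling and a fixed capacity; the class name and constructor parameter are illustrative, not the original's code.

import random
from collections import deque

import numpy as np

class ReplayBuffer:
    """FIFO experience replay buffer (sketch; names are assumptions)."""

    def __init__(self, capacity):
        # Old transitions are evicted automatically once capacity is reached.
        self.storage = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # Store one transition tuple.
        self.storage.append((state, action, reward, next_state, done))

    def size(self):
        return len(self.storage)

    def sample(self, batch_size):
        # Draw a uniform random mini-batch and stack each field into an array,
        # matching the unpacking used in the training loop above.
        batch = random.sample(self.storage, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones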
if memory.can_provide_sample(batch_size):
    experiences_batch = memory.sample(batch_size)
    states = np.zeros((batch_size, environment_manager.final_reshape))
    next_states = np.zeros((batch_size, environment_manager.final_reshape))
    actions, rewards = [], []
    # Prepare the data batch: each experience is (state, action, next_state, reward).
    for i in range(batch_size):
        states[i] = experiences_batch[i][0]
        actions.append(experiences_batch[i][1])
        next_states[i] = experiences_batch[i][2]
        rewards.append(experiences_batch[i][3])

    current_q_values = policy_net.predict(states)
    target_q_values = target_net.predict(next_states)

    # Create Q-targets: start from the policy network's current predictions so
    # the Bellman update changes only the Q-value of the action actually taken.
    q_targets = current_q_values.copy()
    for i in range(batch_size):
        # Q_max = max_a' Q_target(s', a')
        q_targets[i][actions[i]] = rewards[i] + gamma * np.amax(target_q_values[i])

    # Train the policy network toward the Q-targets.
    policy_net.train(states, q_targets)

if environment_manager.done:
    max_reward = max(max_reward, max_episode_reward)
    print("Episode: " + str(episode) +
          " Episode reward: " + str(max_episode_reward) +
          " Max Reward: " + str(max_reward) +
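The per-element loop that builds the targets can also be written with vectorized NumPy indexing. The sketch below assumes the same array shapes and adds a `dones` terminal mask that the original loop omits (terminal transitions should not bootstrap from the next state); the function name and that mask are assumptions, not part of the original code.

import numpy as np

def build_q_targets(current_q, target_q_next, actions, rewards, dones, gamma):
    """Vectorized Q-target construction (sketch; `dones` is an assumed mask)."""
    actions = np.asarray(actions)
    rewards = np.asarray(rewards, dtype=np.float32)
    dones = np.asarray(dones, dtype=bool)
    q_targets = current_q.copy()
    # Apply the Bellman update only at the taken action for each transition;
    # ~dones zeroes the bootstrap term for terminal transitions.
    q_targets[np.arange(len(actions)), actions] = (
        rewards + gamma * np.max(target_q_next, axis=1) * ~dones)
    return q_targets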