Example #1
                target_dqn.copy_from(dqn)  # copy the online network's parameters into the target network
            if epsilon_greedy(step):
                action = env.action_space.sample()
            else:
                action = dqn.get_action(state / 255.0)
            # env.render()
            next_frame, reward, done, _ = env.step(action)
            next_state = np.array(next_frame)
            buf.push(state, action, reward, next_state, done)
            state = next_state
            cur_episode_reward += reward

            if buf.size() > MIN_BUFFER:
                states, actions, rewards, next_states, dones = buf.sample(MINI_BATCH)
                # Bootstrap value: max_a' Q_target(s', a') for every sampled transition
                next_state_action_values = np.max(
                    target_dqn.predict(next_states / 255.0), axis=1)
                # Start from the online network's own predictions so that only the
                # taken action's value is pushed towards the Bellman target.
                # y_true.shape: (MINI_BATCH, num_actions), i.e., (32, 6)
                y_true = dqn.predict(states / 255.0)
                # Terminal transitions (dones == True) get no bootstrapped value
                y_true[np.arange(MINI_BATCH), actions] = (
                    rewards + GAMMA * next_state_action_values * np.invert(dones))
                dqn.train(states / 255.0, y_true)
            step += 1
        total_episode_rewards.append(cur_episode_reward)
        if episode % 100 == 0:
            dqn.save(MODEL_DIR, 'dqn-{}'.format(episode))
        if np.mean(total_episode_rewards[-30:]) > 19:
            dqn.save(MODEL_DIR, 'dqn-{}'.format(episode))
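
The snippet above relies on two helpers that are not shown: `target_dqn.copy_from(dqn)`, a periodic hard update of the target network, and `epsilon_greedy(step)`, the exploration schedule. As a hypothetical sketch only, assuming a linear decay of epsilon from 1.0 to a small floor over a fixed number of steps (the constant names below are illustrative, not from the original):

import random

# Hypothetical exploration schedule (not part of the original snippet):
# epsilon decays linearly from EPS_START to EPS_END over EPS_DECAY_STEPS;
# the function returns True when a random (exploratory) action should be taken.
EPS_START, EPS_END, EPS_DECAY_STEPS = 1.0, 0.02, 100_000

def epsilon_greedy(step):
    epsilon = max(EPS_END,
                  EPS_START - (EPS_START - EPS_END) * step / EPS_DECAY_STEPS)
    return random.random() < epsilon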
Example #2
        if memory.can_provide_sample(batch_size):
            experiences_batch = memory.sample(batch_size)
            states = np.zeros((batch_size, environment_manager.final_reshape))
            next_states = np.zeros((batch_size, environment_manager.final_reshape))
            actions, rewards = [], []

            # Prepare data batch
            for i in range(batch_size):
                states[i] = experiences_batch[i][0]
                actions.append(experiences_batch[i][1])
                next_states[i] = experiences_batch[i][2]
                rewards.append(experiences_batch[i][3])

            current_q_values = policy_net.predict(states)
            target_q_values = target_net.predict(next_states)

            # Create Q-targets: keep the policy net's estimates for the actions
            # that were not taken, and overwrite the taken action's value with
            # the Bellman target.
            for i in range(batch_size):
                # Q_max = max_a' Q_target(s', a')
                current_q_values[i][actions[i]] = rewards[i] + gamma * np.amax(
                    target_q_values[i])

            # Train Policy Network towards the updated targets
            policy_net.train(states, current_q_values)

        if environment_manager.done:
            max_reward = max(max_reward, max_episode_reward)
            print("Episode: " + str(episode) + " Episode reward: " +
                  str(max_episode_reward) + " Max Reward: " + str(max_reward) +