Example #1
import time
import random as ran

import gym
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.initializers import VarianceScaling
from tensorflow.keras.layers import Conv2D, Dense, Flatten
from tensorflow.keras.optimizers import Adam

# DQNAgent, RingBuffer and pre_processing are project-specific helpers and are
# assumed to be importable from elsewhere in this example's codebase.
# If the project uses standalone Keras, switch the imports above to `keras.*`.


def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run a RL experiment that can be either training or testing

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: numpy tensor with a trained policy
        render: enable OpenAI environment graphical rendering

    Returns:
        Dictionary with:
            cumulative experiments outcomes
            list of steps per episode
            list of cumulative rewards
            trained policy
    """

    with tf.device('/gpu:0'):
        res = [0, 0]  # result counters: res[0] = losses, res[1] = wins
        scores = []  # cumulative reward per episode
        steps = []  # steps per episode

        reward_list = RingBuffer(100)  # rolling window of the last 100 episode rewards
        env = gym.make('PongDeterministic-v4')

        input_dim = env.observation_space.shape[0]  # raw observation size (unused here)
        output_dim = env.action_space.n  # number of discrete actions

        if default_policy:
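            # testing: load a pre-trained model and keep exploration minimal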
            agent = DQNAgent(output_dim,
                             None,
                             use_ddqn=True,
                             default_policy=True,
                             model_filename=policy,
                             epsilon=0.05,
                             epsilon_lower_bound=0.05)
        else:
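            # training: build the standard DQN convolutional network
            # (three conv layers + a 512-unit dense layer, as in the Nature DQN architecture)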
            layers = [
                Conv2D(32, (8, 8),
                       strides=(4, 4),
                       activation='relu',
                       input_shape=(84, 84, 4),
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Conv2D(64, (4, 4),
                       strides=(2, 2),
                       activation='relu',
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Conv2D(64, (3, 3),
                       strides=(1, 1),
                       activation='relu',
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Flatten(),
                Dense(512, activation='relu'),
                Dense(output_dim)
            ]
            agent = DQNAgent(output_dim,
                             layers,
                             use_ddqn=True,
                             memory_size=700000,
                             gamma=0.99,
                             learn_thresh=50000,
                             epsilon_lower_bound=0.02,
                             epsilon_decay_function=lambda e: e - (0.98 / 950000),
                             update_rate=10000,
                             optimizer=Adam(0.00025))

        gathered_frame = 0
        for episode_number in tqdm(range(n_episodes), desc="Episode"):
            frame = env.reset()
            state = pre_processing(frame)
            empty_state = np.zeros(state.shape, dtype="uint8")
            cumulative_reward = 0

            # treat the episode start like the end of a rally so the frame stack is rebuilt
            has_lost_life = True

            t = 0
            while True:
                if has_lost_life:
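                    # a point was just scored (or the episode just started):
                    # fire to restart play and rebuild the 84x84x4 frame stack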
                    next_action = 1  # [1, 4, 5][ran.randint(0, 2)]

                    stack = np.stack(
                        (empty_state, empty_state, empty_state, empty_state),
                        axis=2)
                    stack = np.reshape([stack], (1, 84, 84, 4))

                    # take a random number of FIRE steps so the stack fills with real frames
                    for _ in range(ran.randint(1, 10)):
                        gathered_frame += 1
                        frame, reward, end, _ = env.step(next_action)
                        new_state = np.reshape(pre_processing(frame),
                                               (1, 84, 84, 1))
                        new_stack = np.append(new_state,
                                              stack[:, :, :, :3],
                                              axis=3)
                        stack = new_stack

                        if render:
                            env.render()

                    has_lost_life = False

                next_action = agent.act(stack)
                new_state, reward, end, _ = env.step(next_action)

                if render:
                    env.render()
                    time.sleep(0.02)

                reward = np.clip(reward, -1., 1.)

                # in Pong a non-zero reward means a point was scored and the rally ended
                if reward != 0:
                    has_lost_life = True

                cumulative_reward += reward

                new_state = np.reshape(pre_processing(new_state),
                                       (1, 84, 84, 1))
                new_stack = np.append(new_state, stack[:, :, :, :3], axis=3)
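                # store the transition; only the newest preprocessed frame is kept
                # as the next state (the full next 4-frame stack can be rebuilt from it)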
                agent.memoise(
                    (stack, next_action, reward, new_state, has_lost_life))

                stack = new_stack
                gathered_frame += 1

                if end:
                    reward_list.append(cumulative_reward)
                    if cumulative_reward > 0:
                        res[1] += 1
                        print("You Won!, steps:", t, "reward:",
                              reward_list.mean(), "frames:", gathered_frame)
                    else:
                        res[0] += 1
                        print("You Lost!, steps:", t, "reward:",
                              reward_list.mean(), "frames:", gathered_frame)
                    steps.append(t)
                    break

                agent.learn()
                t += 1

            scores.append(cumulative_reward)
            if episode_number >= 50 and episode_number % 10 == 0:
                # periodically checkpoint the trained model
                model_name = "partial_model_pong" + str(episode_number)
                agent.save_model(model_name)

        env.close()
        return {
            "results": np.array(res),
            "steps": np.array(steps),
            "scores": np.array(scores),
            "agent": agent
        }
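
A minimal usage sketch follows. The episode counts and the checkpoint filename are assumptions for illustration (the filename merely follows the "partial_model_pong<episode>" pattern produced by the saving logic above) and are not taken from the original example.

# Hypothetical training run: learn a policy from scratch over 1000 episodes.
train_out = experiment(1000)

# Hypothetical testing run: reload a checkpoint saved during training and
# render a few episodes with a mostly greedy policy.
test_out = experiment(10,
                      default_policy=True,
                      policy="partial_model_pong950",
                      render=True)
print("wins:", test_out["results"][1], "losses:", test_out["results"][0])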