Example #1
def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run a RL experiment that can be either training or testing

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: numpy tensor with a trained policy
        render: enable OpenAI environment graphical rendering
        agent_config: DQNAgent object

    Returns:
        Dictionary with:
            cumulative experiments outcomes
            list of steps per episode
            list of cumulative rewards
            trained policy
    """
    res = [0, 0]  # results accumulator: res[0] = losses, res[1] = wins
    scores = []  # Cumulative reward per episode
    steps = []  # Steps per episode

    env = gym.make('CartPole-v0')
    env = env.unwrapped
    env.seed(seed)

    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    if default_policy:
        agent = DQNAgent(output_dim,
                         None,
                         use_ddqn=True,
                         default_policy=True,
                         model_filename=policy,
                         epsilon=0,
                         epsilon_lower_bound=0,
                         learn_thresh=0,
                         tb_dir=None)
    else:
        layer1 = Dense(10, input_dim=input_dim, activation='relu')
        layer2 = Dense(output_dim)
        agent = DQNAgent(output_dim, [layer1, layer2],
                         use_ddqn=True,
                         learn_thresh=2000,
                         update_rate=100,
                         epsilon_decay_function=lambda e: e - 0.001,
                         epsilon_lower_bound=0.1,
                         optimizer=keras.optimizers.RMSprop(0.001),
                         memory_size=2000,
                         tb_dir=None)

    for _ in tqdm(range(n_episodes), desc="Episode"):
        state = env.reset()
        cumulative_reward = 0

        state = np.reshape(state, [1, 4])

        t = 0
        while True:
            if render:
                env.render()
                time.sleep(0.1)

            next_action = agent.act(state)
            new_state, reward, end, _ = env.step(next_action)

            x, x_dot, theta, theta_dot = new_state
            new_state = np.reshape(new_state, [1, 4])

            # Reward shaping
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            r3 = -abs(theta_dot)
            reward = r1 + r2 + r3

            agent.memoise((state, next_action, reward, new_state, end))

            if end or t > 199:
                if t < 195:
                    res[0] += 1
                else:
                    res[1] += 1
                    # print("ENTRATO!,", t, "steps","reward: ",cumulative_reward)

                steps.append(t)
                break
            else:
                state = new_state
                cumulative_reward += reward

            agent.learn()
            t += 1

        cumulative_reward += reward
        scores.append(cumulative_reward)
    env.close()
    return {
        "results": np.array(res),
        "steps": np.array(steps),
        "scores": np.array(scores),
        "agent": agent
    }
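
A minimal usage sketch for the CartPole experiment above, assuming the snippet's omitted imports (gym, numpy, keras, tqdm, time) and the project-local DQNAgent and seed are defined at module level; the episode counts and the "model_cartpole" filename are illustrative, not taken from the original project:

if __name__ == "__main__":
    # Train for 300 episodes and save the learned weights.
    train_res = experiment(300)
    train_res["agent"].save_model("model_cartpole")  # illustrative filename

    # Reload the saved model and evaluate it greedily (epsilon = 0).
    test_res = experiment(100, default_policy=True, policy="model_cartpole")
    losses, wins = test_res["results"]
    print("Wins:", wins, "Losses:", losses, "Mean steps:", test_res["steps"].mean())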
Example #2
def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run a RL experiment that can be either training or testing

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: numpy tensor with a trained policy
        render: enable OpenAI environment graphical rendering

    Returns:
        Dictionary with:
            cumulative experiments outcomes
            list of steps per episode
            list of cumulative rewards
            trained policy
    """

    with tf.device('/gpu:0'):
        res = [0, 0]  # results accumulator: res[0] = losses, res[1] = wins
        scores = []  # Cumulative reward per episode
        steps = []  # Steps per episode

        reward_list = RingBuffer(100)
        env = gym.make('PongDeterministic-v4')

        input_dim = env.observation_space.shape[0]
        output_dim = env.action_space.n

        if default_policy:
            agent = DQNAgent(output_dim,
                             None,
                             use_ddqn=True,
                             default_policy=True,
                             model_filename=policy,
                             epsilon=0.05,
                             epsilon_lower_bound=0.05)
        else:
            layers = [
                Conv2D(32, (8, 8),
                       strides=(4, 4),
                       activation='relu',
                       input_shape=(84, 84, 4),
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Conv2D(64, (4, 4),
                       strides=(2, 2),
                       activation='relu',
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Conv2D(64, (3, 3),
                       strides=(1, 1),
                       activation='relu',
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Flatten(),
                Dense(512, activation='relu'),
                Dense(output_dim)
            ]
            agent = DQNAgent(output_dim,
                             layers,
                             use_ddqn=True,
                             memory_size=700000,
                             gamma=0.99,
                             learn_thresh=50000,
                             epsilon_lower_bound=0.02,
                             epsilon_decay_function=lambda e: e -
                             (0.98 / 950000),
                             update_rate=10000,
                             optimizer=Adam(0.00025))

        gathered_frame = 0
        for episode_number in tqdm(range(n_episodes), desc="Episode"):
            frame = env.reset()
            state = pre_processing(frame)
            empty_state = np.zeros(state.shape, dtype="uint8")
            cumulative_reward = 0

            has_lost_life = True

            t = 0
            while True:
                if has_lost_life:
                    next_action = 1  # [1, 4, 5][ran.randint(0, 2)]

                    stack = np.stack(
                        (empty_state, empty_state, empty_state, empty_state),
                        axis=2)
                    stack = np.reshape([stack], (1, 84, 84, 4))

                    for _ in range(ran.randint(1, 10)):
                        gathered_frame += 1
                        frame, reward, end, _ = env.step(next_action)
                        new_state = np.reshape(pre_processing(frame),
                                               (1, 84, 84, 1))
                        new_stack = np.append(new_state,
                                              stack[:, :, :, :3],
                                              axis=3)
                        stack = new_stack

                        if render:
                            env.render()

                    has_lost_life = False

                next_action = agent.act(stack)
                new_state, reward, end, _ = env.step(next_action)

                if render:
                    env.render()
                    time.sleep(0.02)

                reward = np.clip(reward, -1., 1.)

                if reward != 0:
                    has_lost_life = True

                cumulative_reward += reward

                new_state = np.reshape(pre_processing(new_state),
                                       (1, 84, 84, 1))
                new_stack = np.append(new_state, stack[:, :, :, :3], axis=3)
                agent.memoise(
                    (stack, next_action, reward, new_state, has_lost_life))

                stack = new_stack
                gathered_frame += 1

                if end:
                    reward_list.append(cumulative_reward)
                    if cumulative_reward > 0:
                        res[1] += 1
                        print("You Won!, steps:", t, "reward:",
                              reward_list.mean(), "frames:", gathered_frame)
                    else:
                        res[0] += 1
                        print("You Lost!, steps:", t, "reward:",
                              reward_list.mean(), "frames:", gathered_frame)
                    steps.append(t)
                    break

                agent.learn()
                t += 1

            scores.append(cumulative_reward)
            if episode_number >= 50 and episode_number % 10 == 0:
                model_name = "partial_model_pong" + str(episode_number)
                agent.save_model(model_name)

        env.close()
        return {
            "results": np.array(res),
            "steps": np.array(steps),
            "scores": np.array(scores),
            "agent": agent
        }
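
Example #2 relies on a pre_processing helper that is not included in the snippet. Judging from the (84, 84, 4) frame stacks built in the loop, it should return a single 84x84 uint8 image; the sketch below is an assumption using OpenCV, not the project's actual implementation:

import cv2  # assumption: OpenCV available for colour conversion and resizing
import numpy as np

def pre_processing(frame):
    # Reduce the raw RGB Atari frame to a single 84x84 uint8 grayscale image,
    # matching the stacks reshaped to (1, 84, 84, 4) in the loop above.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return resized.astype(np.uint8)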
Example #3
def experiment(n_episodes,
               default_policy=False,
               policy=None,
               render=False,
               agent_config=None):
    """
    Run a RL experiment that can be either training or testing

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: numpy tensor with a trained policy
        render: enable OpenAI environment graphical rendering
        agent_config: DQNAgent object

    Returns:
        Dictionary with:
            cumulative experiments outcomes
            list of steps per episode
            list of cumulative rewards
            trained policy
    """
    res = [0, 0]  # results accumulator: res[0] = losses, res[1] = wins
    scores = []  # Cumulative reward per episode
    steps = []  # Steps per episode

    env = gym.make('MountainCar-v0')
    env.seed(seed)

    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    if agent_config is None:
        if default_policy:
            agent = DQNAgent(output_dim,
                             None,
                             use_ddqn=True,
                             default_policy=True,
                             model_filename=policy,
                             epsilon=0,
                             epsilon_lower_bound=0,
                             learn_thresh=0)
        else:
            layer1 = Dense(15, input_dim=input_dim, activation='relu')
            layer2 = Dense(output_dim)
            agent = DQNAgent(output_dim, [layer1, layer2],
                             use_ddqn=True,
                             learn_thresh=1000,
                             update_rate=300,
                             epsilon_decay_function=lambda e: e * 0.95,
                             epsilon_lower_bound=0.01,
                             optimizer=keras.optimizers.RMSprop(0.001))
    else:
        agent = agent_config

    for i_episode in tqdm(range(n_episodes), desc="Episode"):
        state = env.reset()
        cumulative_reward = 0

        # Model validation for early stopping
        if i_episode > 0 and (i_episode % 100) == 0 and not default_policy:
            agent.save_model("tmp_model")
            evaluation_result = experiment(500,
                                           default_policy=True,
                                           policy="tmp_model")
            acc = accuracy(evaluation_result["results"])
            if acc == 100:
                break
            else:
                print("Accuracy:", acc, "Episode:", i_episode)

        state = np.reshape(state, [1, 2])

        for t in range(env._max_episode_steps):
            if render:
                env.render()

            next_action = agent.act(state)
            new_state, reward, end, _ = env.step(next_action)

            # Reward shaping: distance of the car from the valley bottom (x = -0.5)
            reward = abs(new_state[0] - (-0.5))
            new_state = np.reshape(new_state, [1, 2])

            agent.memoise((state, next_action, reward, new_state, end))

            if end:
                if t == env._max_episode_steps - 1:
                    res[0] += 1
                else:
                    res[1] += 1
                    # print("ENTRATO!,", t, "steps")

                steps.append(t)
                break
            else:
                state = new_state
                cumulative_reward += reward

            agent.learn()

        cumulative_reward += reward
        scores.append(cumulative_reward)
    env.close()
    return {
        "results": np.array(res),
        "steps": np.array(steps),
        "scores": np.array(scores),
        "agent": agent
    }
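
The early-stopping check in Example #3 calls an accuracy helper that the snippet does not define. Since its result is compared against 100, it presumably returns the percentage of winning evaluation episodes; a minimal sketch under that assumption:

def accuracy(results):
    # results is the [losses, wins] pair returned by experiment();
    # return the win percentage, so 100 means every evaluation episode succeeded.
    losses, wins = results
    return wins / (losses + wins) * 100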