Example #1
def play_one_session(
    env: TimeLimit,
    max_size: int,
    action_chooser: Callable[[TimeLimit, Any], Any],
    render: bool = False,
    custom_actions: Optional[Callable[[int, TimeLimit, Any, Any, Any, bool, Any], None]] = None,
    stop_when_done: bool = True,
) -> Tuple[float, List[Dict[str, Any]]]:
    """Play a single episode of at most `max_size` steps.

    Returns the accumulated reward divided by `max_size`, together with the
    per-step history of observations and actions.
    """
    observation = env.reset()

    score = 0
    history = []

    for i in range(max_size):

        if render:
            env.render()

        action = action_chooser(env, observation)
        current_iteration_history = {"observation": observation, "action": action}
        observation, reward, done, info = env.step(action.reshape((-1,)))

        score += reward
        history.append(current_iteration_history)

        if custom_actions is not None:
            custom_actions(i, env, action, observation, reward, done, info)

        if stop_when_done and done:
            break

    return score / max_size, history
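
A minimal usage sketch (not part of the original example): it assumes the classic gym API where `env.step` returns `(obs, reward, done, info)` and a `Box` action space, so the sampled action is a NumPy array and `action.reshape((-1,))` is valid; `Pendulum-v0` and the lambda chooser are illustrative stand-ins.

# Hedged usage sketch: play one random session on Pendulum-v0 (illustrative
# choice; any Box-action environment works with the reshape call above).
import gym
from gym.wrappers import TimeLimit

env = TimeLimit(gym.make("Pendulum-v0").unwrapped, max_episode_steps=200)
mean_reward, history = play_one_session(
    env,
    max_size=200,
    action_chooser=lambda e, obs: e.action_space.sample(),  # random policy
)
print(f"mean reward: {mean_reward:.3f}, steps recorded: {len(history)}")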
Example #2
def play_with_car():
    maximum_steps_allowed = 250
    env = TimeLimit(MountainCarEnv(),
                    max_episode_steps=maximum_steps_allowed + 1)
    actions = {'left': 0, 'stop': 1, 'right': 2}

    initial_state = env.reset()
    print('Initial state: ', initial_state)

    for t in range(maximum_steps_allowed):
        # need to modify policy
        if t < 50:
            s, r, done, _ = env.step(actions['left'])
        elif t < 70:
            s, r, done, _ = env.step(actions['right'])
        elif t < 120:
            s, r, done, _ = env.step(actions['left'])
        else:
            s, r, done, _ = env.step(actions['right'])

        print('State {}, Reward {}, Step {}'.format(s, r, t))
        env.render()

        if done:
            if s[0] > 0.47:
                print('Well done!')
            else:
                print('Please, try again.')
            break
    else:
        print('Time is up. Please, try again.')

    env.close()
Example #3
    def test_change_gravity_each_step(self):
        env: ModifiedMassEnv = self.Environment()
        max_episode_steps = 500
        n_episodes = 5

        # NOTE: Interestingly, the renderer will show
        # `env.frame_skip * max_episode_steps` frames per episode, even when
        # "Ren[d]er every frame" is set to False.
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
        env: ModifiedMassEnv
        total_steps = 0

        for episode in range(n_episodes):
            initial_state = env.reset()
            done = False
            episode_steps = 0

            start_y = initial_state[1]
            moved_up = 0
            previous_state = initial_state
            state = initial_state

            body_part = self.body_names[0]
            start_mass = env.get_mass(body_part)

            while not done:
                previous_state = state
                state, reward, done, info = env.step(env.action_space.sample())
                env.render("human")
                episode_steps += 1
                total_steps += 1
                
                # Increase the mass of this body part steadily over the course of the run.
                env.set_mass(
                    body_part=body_part,
                    mass=start_mass + 5 * total_steps / max_episode_steps,
                )
                
                moved_up += (state[1] > previous_state[1])
                
                # print(f"Moving upward? {obs[1] > state[1]}")
            
            print(f"Gravity at end of episode: {env.gravity}")
            # TODO: Check that the position (in the observation) is obeying gravity?
            # if env.gravity <= 0:
            #     # Downward force, so should not have any significant preference for
            #     # moving up vs moving down.
            #     assert 0.4 <= (moved_up / max_episode_steps) <= 0.6, env.gravity
            # # if env.gravity == 0:
            # #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0
            # if env.gravity > 0:
            #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0, env.gravity
                
        assert total_steps == n_episodes * max_episode_steps
        initial_z = env.init_qpos[1]
        final_z = env.sim.data.qpos[1]
        assert initial_z == 0
        # Check that the robot is high up in the sky! :D
        assert final_z > 20
Example #4
    def test_change_gravity_each_step(self):
        env: ModifiedGravityEnv = self.Environment()
        max_episode_steps = 50
        n_episodes = 3

        # NOTE: Interestingly, the renderer will show
        # `env.frame_skip * max_episode_steps` frames per episode, even when
        # "Ren[d]er every frame" is set to False.
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
        total_steps = 0
        
        for episode in range(n_episodes):
            initial_state = env.reset()
            done = False
            episode_steps = 0

            start_y = initial_state[1]
            moved_up = 0
            previous_state = initial_state
            state = initial_state
            while not done:
                previous_state = state
                state, reward, done, info = env.step(env.action_space.sample())
                env.render("human")
                episode_steps += 1
                total_steps += 1
                
                # decrease the gravity continually over time.
                # By the end, things should be floating.
                env.set_gravity(-10 + 5 * total_steps / max_episode_steps)
                moved_up += (state[1] > previous_state[1])
                # print(f"Moving upward? {obs[1] > state[1]}")

            if episode_steps != max_episode_steps:
                print(f"Episode ended early?")

            print(f"Gravity at end of episode: {env.gravity}")
            # TODO: Check that the position (in the observation) is obeying gravity?
            # if env.gravity <= 0:
            #     # Downward force, so should not have any significant preference for
            #     # moving up vs moving down.
            #     assert 0.4 <= (moved_up / max_episode_steps) <= 0.6, env.gravity
            # # if env.gravity == 0:
            # #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0
            # if env.gravity > 0:
            #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0, env.gravity

        assert total_steps <= n_episodes * max_episode_steps
        
        initial_z = env.init_qpos[1]
        final_z = env.sim.data.qpos[1]
        if env.gravity > 0:
            assert final_z > initial_z
Example #5
def test_task_schedule_monsterkong():
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(env,
                               task_schedule={
                                   0: {
                                       "level": 0
                                   },
                                   100: {
                                       "level": 1
                                   },
                                   200: {
                                       "level": 2
                                   },
                                   300: {
                                       "level": 3
                                   },
                                   400: {
                                       "level": 4
                                   },
                               },
                               add_task_id_to_obs=True)
    obs = env.reset()

    # img, task_labels = obs
    assert obs[1] == 0
    assert env.get_level() == 0

    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == i // 100
        assert env.level == i // 100
        env.render()
        assert isinstance(done, bool)
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()

    assert obs[1] == 4
    assert env.level == 4
    # level stays the same even after reaching that objective.
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == 4
        assert env.level == 4
        env.render()
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()

    env.close()
Example #6
def main():
    env = make_cmdp(args.cmdp, episodic=True)
    env = TimeLimit(env, 10)

    agent_model_name = args.cmdp.split('/')[-1]
    agent_model = agent_models.get_agent_model(agent_model_name)

    values_df_index = 'E[G]', 'E[G | A=a]', 'E[G | do(A=a)]'
    values_df_columns = env.model.actions

    _, state = env.reset()
    for t in itt.count():
        print()
        print(f't: {t}')
        env.render()

        Qs_none = [
            infer_Q(env, action, 'none', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        Qs_condition = [
            infer_Q(env, action, 'condition', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        Qs_intervention = [
            infer_Q(env, action, 'intervention',
                    agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]

        values_df = pd.DataFrame(
            [Qs_none, Qs_condition, Qs_intervention],
            values_df_index,
            values_df_columns,
        )
        print(values_df)

        action = torch.tensor(Qs_intervention).argmax()
        state, _, done, _ = env.step(action)

        if done:
            print()
            print(f'final state: {state}')
            print(f'Episode finished after {t+1} timesteps')
            break

    env.close()
Example #7
def test_task_schedule_with_callables():
    """ Apply functions to the env at a given step.

    """
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)

    from operator import methodcaller
    env = MultiTaskEnvironment(env,
                               task_schedule={
                                   0: methodcaller("set_level", 0),
                                   100: methodcaller("set_level", 1),
                                   200: methodcaller("set_level", 2),
                                   300: methodcaller("set_level", 3),
                                   400: methodcaller("set_level", 4),
                               },
                               add_task_id_to_obs=True)
    obs = env.reset()

    # img, task_labels = obs
    assert obs[1] == 0
    assert env.get_level() == 0

    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == i // 100
        assert env.level == i // 100
        env.render()
        assert isinstance(done, bool)
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()

    assert obs[1] == 4
    assert env.level == 4
    # level stays the same even after reaching that objective.
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == 4
        assert env.level == 4
        env.render()
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()
Example #8
def main():
    env = make_mdp(args.mdp, episodic=True)
    env = TimeLimit(env, 10)

    env.reset()
    for t in itt.count():
        print('---')
        print(f't: {t}')
        print('state:')
        env.render()

        action = policy(env, log=True)
        _, reward, done, _ = env.step(action)
        print(f'reward: {reward}')

        if done:
            print('final state:')
            env.render()
            print(f'Episode finished after {t+1} timesteps')
            break

    env.close()
Example #9
    raw_rewards = np.zeros((
        len(env.rfs),
        args.batch_size,
    ))

    real_rewards = []
    invalid_action_stats = []

    dones = np.zeros((args.batch_size, ))
    values = torch.zeros((args.batch_size, )).to(device)

    invalid_action_masks = torch.zeros(
        (args.batch_size, env.action_space.nvec.sum()))
    # TRY NOT TO MODIFY: prepare the execution of the game.
    for step in range(args.batch_size):
        env.render()
        global_step += 1
        obs[step] = next_obs.copy()

        # ALGO LOGIC: put action logic here
        invalid_action_mask = torch.ones(env.action_space.nvec.sum())
        invalid_action_mask[0:env.action_space.nvec[0]] = torch.tensor(
            env.unit_location_mask)
        invalid_action_mask[-env.action_space.nvec[-1]:] = torch.tensor(
            env.target_unit_location_mask)
        invalid_action_masks[step] = invalid_action_mask
        with torch.no_grad():
            values[step] = vf.forward(obs[step:step + 1])
            action, logproba, _, probs = pg.get_action(
                obs[step:step + 1],
                invalid_action_masks=invalid_action_masks[step:step + 1])
Example #10
def replay_memory(env: TimeLimit, memory: List[List[Any]]):
    """Replay recorded episodes: `memory` holds one list of actions per episode."""
    for episode_memory in memory:
        env.reset()
        for action in episode_memory:
            env.step(action)
            env.render()
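
A hedged usage sketch for `replay_memory` (the CartPole environment and the recording loop are illustrative assumptions, not from the source): record the actions taken during a few random episodes, then replay them with rendering.

# Hedged usage sketch: record the actions of a few random episodes, then
# replay them. Without seeding, the replayed rollouts will differ from the
# recorded ones because resets are stochastic.
import gym
from gym.wrappers import TimeLimit

env = TimeLimit(gym.make("CartPole-v1").unwrapped, max_episode_steps=200)

memory = []
for _ in range(3):
    env.reset()
    episode_actions = []
    done = False
    while not done:
        action = env.action_space.sample()
        _, _, done, _ = env.step(action)
        episode_actions.append(action)
    memory.append(episode_actions)

replay_memory(env, memory)
env.close()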
Example #11
import time
from rebar.learners.qlearner import QLearner
from rebar.learners.adp import ADP
import numpy as np
import gym
import torch
from envs import Swingup, Reacher, InvertedDoublePendulum, InvertedPendulum, Walker
from copy import deepcopy
from gym.wrappers import TimeLimit
import matplotlib.pyplot as plt

env = Reacher
eval_env = TimeLimit(deepcopy(env), max_episode_steps=500)
play_env = TimeLimit(deepcopy(env), max_episode_steps=200)
env = TimeLimit(deepcopy(env), max_episode_steps=1000)
play_env.render()

# Swingup state = <x, vx, cos(theta), sin(theta), thetadot>

q = QLearner(action_space=env.action_space,
             observation_space=env.observation_space,
             Q='simple',
             opt_args={'lr': 0.01},
             memory_len=1000,
             gamma=0.999,
             initial_epsilon=1.,
             final_epsilon=0.01,
             exploration_steps=50000,
             target_lag=100)

adp = ADP(action_space=env.action_space,
Example #12
def evaluate(
    env: TimeLimit,
    total_episodes: int,
    *,
    q_table: np.ndarray = None,
    winning_reward: float = None,
    is_random: bool = False,
    render: bool = False,
    display_result: bool = False,
) -> float:
    """
    Evaluate the performance of a q-table to solve a gym environment problem
    It may also use random instead of a q-table
    in order to compare the performance of a q-table against a random solution
    :param env: gym environment to solve
    :param total_episodes: number of time to repeat the evaluation.
           The bigger the more statistically significant the output will be
    :param q_table: Q-table to used solve the problem
           if given, is_random must be False
    :param winning_reward: the reward given to the agent when it solves the problem.
           It is used to compute the number of time the agent solved the problem
    :param is_random: if True will use random instead of Q-table.
           If True, q-table must not be given
    :param render: if True will call env.render()
    :param display_result: If True, prints evaluation summary in the console at the evaluation end
    """
    # Todo : rename and re-think is_random parameter into policy parameter
    # Todo : render only last evaluation
    # Todo : yield q-table, evaluate it and continue evaluation if it is not good enough

    if (q_table is not None) and is_random:
        raise RuntimeError("is_random and q_table given")
    elif q_table is None and not is_random:
        raise RuntimeError(
            "at least one of q_table and is_random must be given")

    total_epochs, total_reward, total_won_episodes = 0, 0, 0

    for _ in range(total_episodes):
        state = env.reset()
        if render:
            env.render()
        done = False
        while not done:
            if is_random:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])
            state, reward, done, info = env.step(action)

            total_epochs += 1
            total_reward += reward

            if render:
                env.render()

        # noinspection PyUnboundLocalVariable
        if reward == winning_reward:
            total_won_episodes += 1

    score = round(total_won_episodes * 100 / total_episodes, 2)

    if display_result:
        print("-" * 30)
        print(
            f"Results after {total_episodes} episodes using {'random' if is_random else 'q_table'}:"
        )
        print(f"Average steps per episode: {total_epochs / total_episodes}")
        print(f"Average reward per episode: {total_reward / total_episodes}")
        print(f"Percentage of won episodes : {score}%")
    return score
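
A hedged usage sketch for `evaluate` (Taxi-v3, its +20 winning reward, and the all-zeros Q-table are illustrative assumptions, not part of the source): compare a random baseline against a Q-table on a discrete-state environment; substitute a trained table for a meaningful comparison.

# Hedged usage sketch: compare a random baseline against an (untrained,
# all-zeros) Q-table on Taxi-v3. With a zero table, np.argmax always picks
# action 0, so the table score stays at 0%; plug in a trained table to see
# a real difference.
import gym
import numpy as np
from gym.wrappers import TimeLimit

env = TimeLimit(gym.make("Taxi-v3").unwrapped, max_episode_steps=200)
q_table = np.zeros((env.observation_space.n, env.action_space.n))

random_score = evaluate(env, 100, is_random=True, winning_reward=20, display_result=True)
table_score = evaluate(env, 100, q_table=q_table, winning_reward=20, display_result=True)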