Example #1
from gym.envs.classic_control import MountainCarEnv
from gym.wrappers import TimeLimit


def play_with_car():
    maximum_steps_allowed = 250
    env = TimeLimit(MountainCarEnv(),
                    max_episode_steps=maximum_steps_allowed + 1)
    actions = {'left': 0, 'stop': 1, 'right': 2}

    initial_state = env.reset()
    print('Initial state: ', initial_state)

    for t in range(maximum_steps_allowed):
        # hand-crafted policy: rock left and right to build momentum, then drive right
        if t < 50:
            s, r, done, _ = env.step(actions['left'])
        elif t < 70:
            s, r, done, _ = env.step(actions['right'])
        elif t < 120:
            s, r, done, _ = env.step(actions['left'])
        else:
            s, r, done, _ = env.step(actions['right'])

        print('State {}, Reward {}, Step {}'.format(s, r, t))
        env.render()

        if done:
            if s[0] > 0.47:
                print('Well done!')
            else:
                print('Please, try again.')
            break
    else:
        print('Time is up. Please, try again.')

    env.close()
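
A side note on the check above: recent gym releases have TimeLimit record whether the episode was cut short in the step's info dict, which avoids inspecting the car's position by hand. A minimal hedged sketch, assuming a gym version whose TimeLimit sets info['TimeLimit.truncated']:

# Hedged sketch: tell a timeout apart from a genuine terminal state.
# Assumes a gym release whose TimeLimit wrapper sets info['TimeLimit.truncated'].
import gym
from gym.wrappers import TimeLimit

env = TimeLimit(gym.make("MountainCar-v0").unwrapped, max_episode_steps=200)
env.reset()
done = False
info = {}
while not done:
    _, _, done, info = env.step(env.action_space.sample())
if info.get("TimeLimit.truncated", False):
    print('Time is up. Please, try again.')
else:
    print('Well done!')
env.close()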
Example #2
def test_basics():
    env = TimeLimit(gym.make("CartPole-v0"), max_episode_steps=10)
    env = EnvDataset(env)
    env = EpisodeLimit(env, max_episodes=3)
    env.seed(123)

    for episode in range(3):
        obs = env.reset()
        done = False
        step = 0
        while not done:
            print(f"step {step}")
            obs, reward, done, info = env.step(env.action_space.sample())
            step += 1
    
    assert env.is_closed()
    with pytest.raises(gym.error.ClosedEnvironmentError):
        _ = env.reset()

    with pytest.raises(gym.error.ClosedEnvironmentError):
        _ = env.step(env.action_space.sample())

    with pytest.raises(gym.error.ClosedEnvironmentError):
        for _ in env:
            break
Example #3
def test_task_schedule_monsterkong():
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(env,
                               task_schedule={
                                   0: {
                                       "level": 0
                                   },
                                   100: {
                                       "level": 1
                                   },
                                   200: {
                                       "level": 2
                                   },
                                   300: {
                                       "level": 3
                                   },
                                   400: {
                                       "level": 4
                                   },
                               },
                               add_task_id_to_obs=True)
    obs = env.reset()

    # img, task_labels = obs
    assert obs[1] == 0
    assert env.get_level() == 0

    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == i // 100
        assert env.level == i // 100
        env.render()
        assert isinstance(done, bool)
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()

    assert obs[1] == 4
    assert env.level == 4
    # The level stays at 4 even after the end of the task schedule.
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == 4
        assert env.level == 4
        env.render()
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()

    env.close()
Example #4
def play_one_session(
    env: TimeLimit,
    max_size: int,
    action_chooser: Callable[[TimeLimit, Any], Any],
    render: bool = False,
    custom_actions: Callable[[int, TimeLimit, Any, Any, Any, bool, Any], None] = None,
    stop_when_done: bool = True,
) -> Tuple[float, List[Dict[str, Any]]]:
    observation = env.reset()

    score = 0
    history = []

    for i in range(max_size):

        if render:
            env.render()

        action = action_chooser(env, observation)
        current_iteration_history = {"observation": observation, "action": action}
        observation, reward, done, info = env.step(action.reshape((-1,)))

        score += reward
        history.append(current_iteration_history)

        if custom_actions is not None:
            custom_actions(i, env, action, observation, reward, done, info)

        if stop_when_done and done:
            break

    return score / max_size, history
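
A hedged usage sketch for play_one_session above; the real action_chooser used in this project is unknown, so a random one on a Box-action environment is assumed (the reshape inside the function expects an array-like action):

# Hedged usage sketch for play_one_session(); Pendulum-v0 is an assumption.
import gym
from gym.wrappers import TimeLimit

env = TimeLimit(gym.make("Pendulum-v0").unwrapped, max_episode_steps=200)

def random_chooser(env, observation):
    # Box spaces sample ndarrays, which action.reshape((-1,)) can handle.
    return env.action_space.sample()

mean_reward, history = play_one_session(env, 200, random_chooser)
print(mean_reward, len(history))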
Example #5
def play(env_name: str, manual_control: bool, max_steps: int):
    # Make environment
    env = TimeLimit(gym.make(env_name, render=True), max_steps)
    observation = env.reset()

    if manual_control:
        # Create user debug interface
        import pybullet as p
        params = [
            p.addUserDebugParameter(
                p.getJointInfo(env.robot_id, j)[1].decode(), -1, 1, 0)
            for j in env.joint_list
        ]

    reward_sum = 0
    try:
        while True:
            if manual_control:
                # Read user input and simulate motor
                a = [p.readUserDebugParameter(param) for param in params]
            else:
                a = env.action_space.sample()

            observation, reward, done, _ = env.step(a)
            reward_sum += reward
            print("\nobservation", observation)
            print("reward", reward)
            print("total reward", reward_sum)
            print("done", done)

            # Reset when done
            if done:
                observation = env.reset()
                reward_sum = 0
    except KeyboardInterrupt:
        # Without this, the infinite loop made env.close() below unreachable.
        pass

    env.close()
Example #6
def test_random_task_on_each_episode_and_only_one_task_in_schedule():
    """ Reproduces a BUG: when the task schedule holds only one task, the wrapper
    kept sampling a new task from the 'distribution' on every reset (observed with
    CartPole).
    """
    env = gym.make("CartPole-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {
                "length": 0.1
            },
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
    task_labels = []
    lengths = []
    for i in range(10):
        obs = env.reset()
        task_labels.append(obs[1])
        lengths.append(env.length)
        done = False
        while not done:
            obs, reward, done, info = env.step(env.action_space.sample())
            task_labels.append(obs[1])
            lengths.append(env.length)

    assert set(task_labels) == {0}
    assert set(lengths) == {0.1}
Example #7
def test_random_task_on_each_episode():
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"level": 0},
            5: {"level": 1},
            200: {"level": 2},
            300: {"level": 3},
            400: {"level": 4},
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
    task_labels = []
    for i in range(10):
        obs = env.reset()
        task_labels.append(obs["task_labels"])
    assert len(set(task_labels)) > 1

    # Episodes only last 10 steps. With new_random_task_on_reset=True, the sampled
    # tasks no longer follow the task schedule.
    obs = env.reset()
    start_task_label = obs["task_labels"]
    for i in range(10):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs["task_labels"] == start_task_label
        if i == 9:
            assert done
        else:
            assert not done

    env.close()
Example #8
def test_task_schedule_with_callables():
    """ Apply functions to the env at a given step.

    """
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)

    from operator import methodcaller
    env = MultiTaskEnvironment(env,
                               task_schedule={
                                   0: methodcaller("set_level", 0),
                                   100: methodcaller("set_level", 1),
                                   200: methodcaller("set_level", 2),
                                   300: methodcaller("set_level", 3),
                                   400: methodcaller("set_level", 4),
                               },
                               add_task_id_to_obs=True)
    obs = env.reset()

    # img, task_labels = obs
    assert obs[1] == 0
    assert env.get_level() == 0

    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == i // 100
        assert env.level == i // 100
        env.render()
        assert isinstance(done, bool)
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()

    assert obs[1] == 4
    assert env.level == 4
    # The level stays at 4 even after the end of the task schedule.
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == 4
        assert env.level == 4
        env.render()
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()
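
A small hedged illustration of the operator.methodcaller entries used in the task_schedule above: methodcaller("set_level", i) builds a callable f with f(obj) == obj.set_level(i), so each schedule entry is simply applied to the wrapped env once its step is reached.

# Hedged illustration of methodcaller; _Demo is a throwaway stand-in, not part of the project.
from operator import methodcaller

class _Demo:
    def set_level(self, level):
        print(f"level set to {level}")

methodcaller("set_level", 2)(_Demo())  # prints "level set to 2"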
Example #9
    def test_change_mass_each_step(self):
        env: ModifiedMassEnv = self.Environment()
        max_episode_steps = 500
        n_episodes = 5

        # NOTE: Interestingly, the renderer will show
        # `env.frame_skip * max_episode_steps` frames per episode, even when
        # "Ren[d]er every frame" is set to False.
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
        env: ModifiedMassEnv
        total_steps = 0

        for episode in range(n_episodes):
            initial_state = env.reset()
            done = False
            episode_steps = 0

            start_y = initial_state[1]
            moved_up = 0
            previous_state = initial_state
            state = initial_state

            body_part = self.body_names[0]
            start_mass = env.get_mass(body_part)

            while not done:
                previous_state = state
                state, reward, done, info = env.step(env.action_space.sample())
                env.render("human")
                episode_steps += 1
                total_steps += 1
                
                env.set_mass(body_part=body_part, mass=start_mass + 5 * total_steps / max_episode_steps)
                
                moved_up += (state[1] > previous_state[1])
                
                # print(f"Moving upward? {obs[1] > state[1]}")
            
            print(f"Gravity at end of episode: {env.gravity}")
            # TODO: Check that the position (in the observation) is obeying gravity?
            # if env.gravity <= 0:
            #     # Downward force, so should not have any significant preference for
            #     # moving up vs moving down.
            #     assert 0.4 <= (moved_up / max_episode_steps) <= 0.6, env.gravity
            # # if env.gravity == 0:
            # #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0
            # if env.gravity > 0:
            #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0, env.gravity
                
        assert total_steps == n_episodes * max_episode_steps
        initial_z = env.init_qpos[1]
        final_z = env.sim.data.qpos[1]
        assert initial_z == 0
        # Check that the robot is high up in the sky! :D
        assert final_z > 20
Example #10
class BaseTestRotMAB:
    """Base test class for RotMAB environment."""
    def __init__(self, winning_probs, max_steps):
        """Initialize test class."""
        self.winning_probs = winning_probs
        self.max_steps = max_steps
        self.env = TimeLimit(
            NonMarkovianRotatingMAB(winning_probs=self.winning_probs),
            max_episode_steps=self.max_steps,
        )

    def test_action_space(self):
        """Test action spaces."""
        assert self.env.action_space == Discrete(len(self.winning_probs))

    def test_observation_space(self):
        """Test observation spaces."""
        assert self.env.observation_space == Discrete(2)

    def test_interaction(self):
        """Test interaction with Rotating MAB."""
        self.env.seed()
        state = self.env.reset()
        assert state == 0

        def assert_consistency(obs, reward):
            """Assert obs = 1 iff reward = 1."""
            positive_reward = reward > 0.0
            positive_obs = obs == 1
            assert (positive_reward and positive_obs
                    or (not positive_reward and not positive_obs))

        for _i in range(self.max_steps - 1):
            action = self.env.action_space.sample()
            obs, reward, done, info = self.env.step(action)
            assert_consistency(obs, reward)
            assert not done

        # last action
        obs, reward, done, info = self.env.step(0)
        assert_consistency(obs, reward)
        assert done
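
The final assertions rely on gym's TimeLimit forcing done=True once max_episode_steps steps have elapsed. A standalone hedged sketch of that behaviour, using Pendulum because its base episode never terminates on its own:

# Hedged sketch: TimeLimit ends the episode after exactly max_episode_steps steps
# when the wrapped environment never terminates by itself (true for Pendulum).
import gym
from gym.wrappers import TimeLimit

env = TimeLimit(gym.make("Pendulum-v0").unwrapped, max_episode_steps=5)
env.reset()
steps, done = 0, False
while not done:
    _, _, done, _ = env.step(env.action_space.sample())
    steps += 1
assert steps == 5
env.close()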
Example #11
 def test_noop_reset_env(self):
     # runnable test
     noop_max = 20
     env = gym.make(TEST_ENV_ID)
     env = TimeLimit(env, 3)
     env = atari.NoopResetEnv(env, noop_max=noop_max)
     env.reset()
     for i in range(20):
         obs, rew, done, info = env.step(env.action_space.sample())
         if done:
             break
Example #12
    def test_change_gravity_each_step(self):
        env: ModifiedGravityEnv = self.Environment()
        max_episode_steps = 50
        n_episodes = 3

        # NOTE: Interestingly, the renderer will show
        # `env.frame_skip * max_episode_steps` frames per episode, even when
        # "Ren[d]er every frame" is set to False.
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
        total_steps = 0
        
        for episode in range(n_episodes):
            initial_state = env.reset()
            done = False
            episode_steps = 0

            start_y = initial_state[1]
            moved_up = 0
            previous_state = initial_state
            state = initial_state
            while not done:
                previous_state = state
                state, reward, done, info = env.step(env.action_space.sample())
                env.render("human")
                episode_steps += 1
                total_steps += 1
                
                # decrease the gravity continually over time.
                # By the end, things should be floating.
                env.set_gravity(-10 + 5 * total_steps / max_episode_steps)
                moved_up += (state[1] > previous_state[1])
                # print(f"Moving upward? {obs[1] > state[1]}")

            if episode_steps != max_episode_steps:
                print(f"Episode ended early, after {episode_steps} steps.")

            print(f"Gravity at end of episode: {env.gravity}")
            # TODO: Check that the position (in the observation) is obeying gravity?
            # if env.gravity <= 0:
            #     # Downward force, so should not have any significant preference for
            #     # moving up vs moving down.
            #     assert 0.4 <= (moved_up / max_episode_steps) <= 0.6, env.gravity
            # # if env.gravity == 0:
            # #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0
            # if env.gravity > 0:
            #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0, env.gravity

        assert total_steps <= n_episodes * max_episode_steps
        
        initial_z = env.init_qpos[1]
        final_z = env.sim.data.qpos[1]
        if env.gravity > 0:
            assert final_z > initial_z
Example #13
File: test.py  Project: ltbd78/RL
def test(pkl_path, pth_path, env, attempts, display=False, video_dir=None):
    with open(pkl_path, 'rb') as f:
        logs = pickle.load(f)

    if logs['params']['max_episode_steps'] is not None:
        env = TimeLimit(env,
                        max_episode_steps=logs['params']['max_episode_steps'])

    if video_dir:
        if not os.path.exists(video_dir):
            os.makedirs(video_dir)
        env = Monitor(env, video_dir, force=True)

    if logs['agent'] == 'dqn':
        agent = DQNAgent(env.observation_space, env.action_space,
                         **logs['params'])
        agent.epsilon = 0
    elif logs['agent'] == 'a2c':
        agent = A2CAgent(env.observation_space, env.action_space,
                         **logs['params'])
    elif logs['agent'] == 'td3':
        agent = TD3Agent(env.observation_space, env.action_space,
                         **logs['params'])
    elif logs['agent'] == 'random':
        agent = RandomAgent(env.observation_space, env.action_space,
                            **logs['params'])
    else:
        raise ValueError(f"Unknown agent type: {logs['agent']}")

    agent.load(pth_path)

    try:
        rewards = []
        for attempt in range(attempts):
            state = env.reset()
            sum_reward = 0
            t = 0
            done = False
            while not done:
                action = agent.get_action(state)
                next_state, reward, done, _ = env.step(action)
                state = next_state
                sum_reward += reward
                t += 1
                if display:
                    title = f'Attempt: {attempt+1} | Timestep: {t} | Reward: {reward} | Sum Reward: {sum_reward}'
                    render(env, title)
            rewards.append(sum_reward)
        env.close()
        return rewards
    except Exception:
        traceback.print_exc()
        breakpoint()
        env.close()
Example #14
 def test_max_and_skip_env(self):
     # runnable test
     skip = 4
     env = gym.make(TEST_ENV_ID)
     env = TimeLimit(env, 20)
     env = atari.MaxAndSkipEnv(env, skip=skip)
     env.seed(1)
     ub_utils.set_seed(1)
     env.reset()
     for i in range(20):
         obs, rew, done, info = env.step(env.action_space.sample())
         if done:
             break
     self.assertEqual(4, i)
Example #15
def test_monitor(n_episodes):
    steps = 15

    env = gym.make("CartPole-v1")
    # unwrap default TimeLimit and wrap with new one to simulate done=True
    # at step 5
    assert isinstance(env, TimeLimit)
    env = env.env  # unwrap
    env = TimeLimit(env, max_episode_steps=5)  # wrap

    tmpdir = tempfile.mkdtemp()
    try:
        env = pfrl.wrappers.Monitor(
            env, directory=tmpdir, video_callable=lambda episode_id: True
        )
        episode_idx = 0
        episode_len = 0
        t = 0
        _ = env.reset()
        while True:
            _, _, done, info = env.step(env.action_space.sample())
            episode_len += 1
            t += 1
            if episode_idx == 1 and episode_len >= 3:
                info["needs_reset"] = True  # simulate ContinuingTimeLimit
            if done or info.get("needs_reset", False) or t == steps:
                if episode_idx + 1 == n_episodes or t == steps:
                    break
                env.reset()
                episode_idx += 1
                episode_len = 0
        # `env.close()` is called when `env` is garbage-collected
        # (or explicitly deleted/closed).
        del env
        # check if videos & meta files were generated
        files = os.listdir(tmpdir)
        mp4s = [f for f in files if f.endswith(".mp4")]
        metas = [f for f in files if f.endswith(".meta.json")]
        stats = [f for f in files if f.endswith(".stats.json")]
        manifests = [f for f in files if f.endswith(".manifest.json")]
        assert len(mp4s) == n_episodes
        assert len(metas) == n_episodes
        assert len(stats) == 1
        assert len(manifests) == 1

    finally:
        shutil.rmtree(tmpdir)
Example #16
def run_episodes(neps, seed):
    reward_fn = 'task1_reward'
    termination_fn = 'pos_and_rot_close_to_goal'
    # termination_fn = 'position_close_to_goal'
    initializer = 'task4_init'
    env = make_training_env(reward_fn,
                            termination_fn,
                            initializer,
                            action_space='torque_and_position',
                            init_joint_conf=True,
                            visualization=True,
                            grasp='pinch',
                            rank=seed)
    env = env.env  # HACK to remove FlatObservationWrapper
    # tmp_dir = '/tmp/video'
    # env = Monitor(RenderWrapper(TimeLimit(env, 1000)), tmp_dir,
    #               video_callable=lambda episode_id: True, mode='evaluation',
    #               force=True)
    env = TimeLimit(env, 1000)
    viz = Viz()
    for _ in range(neps):
        obs = env.reset()

        p.configureDebugVisualizer(p.COV_ENABLE_GUI, 0)
        p.resetDebugVisualizerCamera(cameraDistance=0.6,
                                     cameraYaw=0,
                                     cameraPitch=-40,
                                     cameraTargetPosition=[0, 0, 0])
        viz.reset(obs)
        # tip_pd = TipPD([10, 1], 0.7 * env.cube_tip_positions)
        tip_pd = None
        controller = ForceControlPolicy(env, True, tip_pd)
        # obs = grasp_force_control(env, obs, controller.get_grasp_torque)
        obs = grasp_tippos_control(env, obs)

        # Then move toward the goal positions
        env.unwrapped.action_space = TriFingerPlatform.spaces.robot_torque.gym
        env.unwrapped.action_type = cube_env.ActionType.TORQUE
        done = False
        while not done:
            # transform wrenches to base frame
            torque = controller(obs)
            obs, reward, done, info = env.step(torque)
            viz.update_cube_orientation(obs)
            time.sleep(0.01)

    env.close()
Example #17
def main():
    env = make_cmdp(args.cmdp, episodic=True)
    env = TimeLimit(env, 10)

    agent_model_name = args.cmdp.split('/')[-1]
    agent_model = agent_models.get_agent_model(agent_model_name)

    values_df_index = 'E[G]', 'E[G | A=a]', 'E[G | do(A=a)]'
    values_df_columns = env.model.actions

    _, state = env.reset()
    for t in itt.count():
        print()
        print(f't: {t}')
        env.render()

        Qs_none = [
            infer_Q(env, action, 'none', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        Qs_condition = [
            infer_Q(env, action, 'condition', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        Qs_intervention = [
            infer_Q(env, action, 'intervention',
                    agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]

        values_df = pd.DataFrame(
            [Qs_none, Qs_condition, Qs_intervention],
            values_df_index,
            values_df_columns,
        )
        print(values_df)

        action = torch.tensor(Qs_intervention).argmax()
        state, _, done, _ = env.step(action)

        if done:
            print()
            print(f'final state: {state}')
            print(f'Episode finished after {t+1} timesteps')
            break

    env.close()
Example #18
class Renderer:
    def __init__(self, args):
        self.env = TimeLimit(gym.make(args.env),
                             max_episode_steps=args.max_steps)

    def get_action(self, obs, ch):
        raise NotImplementedError

    def reset(self, init_obs):
        pass

    def main_loop(self, window):
        obs = self.env.reset()
        self.reset(obs)
        done = False
        action = None
        reward = None
        steps = 0
        ret = 0
        while not done:
            self.display(action, done, ret, reward, steps, window)
            ch = window.getch()
            action = self.get_action(obs, ch)
            obs, reward, done, _ = self.env.step(action)
            ret += reward
            steps += 1

        # Clear screen
        self.display(action, done, ret, reward, steps, window)
        window.getch()

    def display(self, action, done, ret, reward, steps, window):
        show(
            self.env, window, {
                'steps': steps,
                'action': (gym_psketch.ID2ACTIONS[action]
                           if action is not None else action),
                'reward': reward,
                'return': ret,
                'done': done,
            })
Example #19
def main():
    env = make_mdp(args.mdp, episodic=True)
    env = TimeLimit(env, 10)

    env.reset()
    for t in itt.count():
        print('---')
        print(f't: {t}')
        print('state:')
        env.render()

        action = policy(env, log=True)
        _, reward, done, _ = env.step(action)
        print(f'reward: {reward}')

        if done:
            print('final state:')
            env.render()
            print(f'Episode finished after {t+1} timesteps')
            break

    env.close()
Example #20
        # ALGO LOGIC: put action logic here
        logits, std = pg.forward(obs[step:step + 1])
        values[step] = vf.forward(obs[step:step + 1])

        # ALGO LOGIC: `env.action_space` specific logic
        probs = Normal(logits, std)
        action = probs.sample()
        clipped_action = torch.clamp(
            action, torch.min(torch.Tensor(env.action_space.low)),
            torch.max(torch.Tensor(env.action_space.high)))
        actions[step] = clipped_action.tolist()[0]
        neglogprobs[step] = -probs.log_prob(action).sum()
        entropys[step] = probs.entropy().sum()

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, rewards[step], dones[step], _ = env.step(actions[step])
        next_obs = np.array(next_obs)
        if dones[step]:
            break

    # ALGO LOGIC: training.
    # calculate the discounted rewards, i.e. the returns
    returns = np.zeros_like(rewards)
    for t in reversed(range(rewards.shape[0] - 1)):
        returns[t] = rewards[t] + args.gamma * returns[t + 1] * (1 - dones[t])
    # advantages are returns - baseline, value estimates in our case
    advantages = returns - values.detach().cpu().numpy()

    vf_loss = loss_fn(torch.Tensor(returns).to(device), values) * args.vf_coef
    pg_loss = torch.Tensor(advantages).to(device) * neglogprobs
    loss = (pg_loss - entropys * args.ent_coef).mean() + vf_loss
Example #21
        obs[step] = next_obs.copy()

        # ALGO LOGIC: put action logic here
        with torch.no_grad():
            values[step] = vf.forward(obs[step:step + 1])
            action, logproba, _ = pg.get_action(obs[step:step + 1])

        actions[step] = action.data.cpu().numpy()[0]
        logprobs[step] = logproba.data.cpu().numpy()[0]

        # SUGGESTION: Find a better way to constrain policy actions to action low and higher bounds
        clipped_action = np.clip(action.tolist(), env.action_space.low,
                                 env.action_space.high)[0]

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, rewards[step], dones[step], info = env.step(clipped_action)
        real_rewards += [info['real_reward']]
        next_obs = np.array(next_obs)

        # Annealing the rate if instructed to do so.
        if args.anneal_lr:
            pg_lr_scheduler.step()
            vf_lr_scheduler.step()

        if dones[step]:
            # Computing the discounted returns:
            writer.add_scalar("charts/episode_reward", np.sum(real_rewards),
                              global_step)
            print(
                f"global_step={global_step}, episode_reward={np.sum(real_rewards)}"
            )
Example #22
    obs = np.empty((args.episode_length, ) + env.observation_space.shape)

    # ALGO LOGIC: put other storage logic here
    entropys = torch.zeros((args.episode_length, ), device=device)

    # TRY NOT TO MODIFY: prepare the execution of the game.
    for step in range(args.episode_length):
        global_step += 1
        obs[step] = next_obs.copy()

        # ALGO LOGIC: put action logic here
        action, _, _ = pg.get_action(obs[step:step + 1])
        actions[step] = action.tolist()[0]

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, rewards[step], dones[step], _ = env.step(action.tolist()[0])
        rb.put(
            (obs[step], actions[step], rewards[step], next_obs, dones[step]))
        next_obs = np.array(next_obs)
        # ALGO LOGIC: training.
        if len(rb.buffer) > 2000:
            s_obs, s_actions, s_rewards, s_next_obses, s_dones = rb.sample(
                args.batch_size)
            with torch.no_grad():
                next_state_action, next_state_log_pi, _ = pg.get_action(
                    s_next_obses)
                qf1_next_target = qf1_target.forward(s_next_obses,
                                                     next_state_action)
                qf2_next_target = qf2_target.forward(s_next_obses,
                                                     next_state_action)
                min_qf_next_target = torch.min(
Example #23
def replay_memory(env: TimeLimit, memory: List[List[Any]]):
    for episode_memory in memory:
        env.reset()
        for action in episode_memory:
            env.step(action)
            env.render()
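
A hedged usage sketch for replay_memory above: record the actions of one random episode, then replay them. Without identical seeding the replayed trajectory will differ; only the call pattern is the point. CartPole is an assumption, any renderable env works.

# Hedged usage sketch for replay_memory().
import gym
from gym.wrappers import TimeLimit

env = TimeLimit(gym.make("CartPole-v1").unwrapped, max_episode_steps=50)

episode_actions = []
env.reset()
done = False
while not done:
    action = env.action_space.sample()
    _, _, done, _ = env.step(action)
    episode_actions.append(action)

replay_memory(env, [episode_actions])
env.close()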
Example #24
def train(agent_type, env, verbose=True, save_freq=50, save_dir='./', **params):
    if verbose:
        print(params)
    
    if agent_type == 'dqn':
        agent = DQNAgent(env.observation_space, env.action_space, **params)
    elif agent_type == 'a2c':
        agent = A2CAgent(env.observation_space, env.action_space, **params)
    elif agent_type == 'td3':
        agent = TD3Agent(env.observation_space, env.action_space, **params)
    elif agent_type == 'random':
        agent = RandomAgent(env.observation_space, env.action_space, **params)
    else:
        raise ValueError(f"Unknown agent type: {agent_type}")
    
    if params['max_episode_steps'] is not None:
        env = TimeLimit(env, max_episode_steps=params['max_episode_steps'])
    log = {'agent':agent_type, 'params':params, 'episodes':[]}
    
    if save_dir[-1] != '/':
        raise NotADirectoryError(save_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    try:
        ep = 0
        t_total = 0
        while t_total < params['max_steps']:
            state = env.reset()
            sum_reward = 0
            t_ep = 0
            done = False
            
            while not done:
                if t_total > params['start_at']:
                    action = agent.get_action(state)
                else:
                    action = env.action_space.sample()
                
                next_state, reward, done, _ = env.step(action)
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                sum_reward += reward
                t_ep += 1
                
                # for agents using online training
                if agent.online and t_total > params['start_at']:
                    agent.learn()
            
            # for agents using offline training
            if not agent.online and t_total > params['start_at']:
                agent.learn()
            
            ep += 1
            t_total += t_ep
            ep_info = {'episode':ep, 't_ep':t_ep, 't_total':t_total, 'sum_reward':sum_reward, 'optim_steps':agent.optim_steps, 'memory':len(agent.memory)}
            log['episodes'].append(ep_info)
            if verbose:
                print(ep_info)    

            if ep % save_freq == 0:                
                agent.save(save_dir + params['file_name'] + '.pth')
                with open(save_dir + params['file_name'] + '.pkl', 'wb') as f:
                    pickle.dump(log, f)
                if verbose:
                    print('Episode ' + str(ep) + ': Saved model weights and log.')
        env.close()
        
    except Exception:
        traceback.print_exc()
        breakpoint()
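
A hedged sketch of how train above might be invoked. Only the keys train() itself reads ('max_episode_steps', 'max_steps', 'start_at', 'file_name') come from the code; the remaining values, and whether this project's DQNAgent accepts exactly these keyword arguments, are assumptions.

# Hedged usage sketch for train(); hyperparameter values are assumptions.
import gym

env = gym.make("CartPole-v1").unwrapped  # unwrapped so train() applies its own TimeLimit
train(
    "dqn",
    env,
    save_dir="./runs/",
    max_episode_steps=500,
    max_steps=50_000,
    start_at=1_000,
    file_name="dqn_cartpole",
)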
Example #25
    plt.show()

    return np.array(s_s).T


#s_s = play(agt, play_env)
#labels = ['x', 'v_x', 'cos(theta)', 'sin(theta)', 'thetadot']
#for label, line in zip(labels, s_s):
#	plt.plot(line, label=label)
#plt.legend()
#plt.show()

s = env.reset()
for step in range(100000):
    a = int(q.get_action(s))
    sp, r, done, _ = env.step(a)

    agt.handle_transition(s, a, r, sp, done)

    s_s.append(s.detach().numpy())

    s = sp

    if done:
        s = env.reset()
        done = False

    if (step % 1000) == 0:
        print(
            f'{step}: {adp.evaluate(eval_env, 10)} (adp) {q.evaluate(eval_env, 10)} (Q)'
        )
Example #26
            action, logproba, _, probs = pg.get_action(
                obs[step:step + 1],
                invalid_action_masks=invalid_action_masks[step:step + 1])

            # CORE LOGIC:
            # use the action generated by CategoricalMasked, but
            # don't adjust the logprobability accordingly. Instead, calculate the log
            # probability using Categorical
            action, logproba, _, probs = pg.get_action(obs[step:step + 1],
                                                       action=action)

        actions[step] = action[:, 0].data.cpu().numpy()
        logprobs[:, [step]] = logproba

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, rewards[step], dones[step], info = env.step(
            action[:, 0].data.cpu().numpy())
        raw_rewards[:, step] = info["rewards"]
        real_rewards += [info['real_reward']]
        invalid_action_stats += [info['invalid_action_stats']]
        next_obs = np.array(next_obs)

        # Annealing the rate if instructed to do so.
        if args.anneal_lr:
            pg_lr_scheduler.step()
            vf_lr_scheduler.step()

        if dones[step]:
            # Computing the discounted returns:
            writer.add_scalar("charts/episode_reward", np.sum(real_rewards),
                              global_step)
            print(
Example #27
def get_cswm_data(env_name, seed, num_episodes=1000):
    logger.set_level(logger.INFO)

    env = gym.make(env_name)

    np.random.seed(seed)
    env.action_space.seed(seed)
    env.seed(seed)

    agent = RandomAgent(env.action_space)

    episode_count = num_episodes
    reward = 0
    done = False

    crop = None
    warmstart = None
    if env_name == 'PongDeterministic-v4':
        crop = (35, 190)
        warmstart = 58
    elif env_name == 'SpaceInvadersDeterministic-v4':
        crop = (30, 200)
        warmstart = 50
    else:
        crop = (35, 190)
        warmstart = 58

    max_episode_steps = warmstart + 11
    env = TimeLimit(env, max_episode_steps=max_episode_steps)

    env = AtariARIWrapper(env)
    replay_buffer = []

    for i in range(episode_count):

        replay_buffer.append({
            'obs': [],
            'action': [],
            'next_obs': [],
            'label': []
        })

        ob = env.reset()

        # Burn-in steps
        for _ in range(warmstart):
            action = agent.act(ob, reward, done)
            ob, _, _, _ = env.step(action)
        prev_ob = crop_normalize(ob, crop)
        ob, _, _, info = env.step(0)
        ob = crop_normalize(ob, crop)

        while True:
            replay_buffer[i]['obs'].append(
                np.concatenate((ob, prev_ob), axis=0))
            prev_ob = ob
            replay_buffer[i]["label"].append(info["labels"])
            action = agent.act(ob, reward, done)
            ob, reward, done, info = env.step(action)
            ob = crop_normalize(ob, crop)

            replay_buffer[i]['action'].append(action)
            replay_buffer[i]['next_obs'].append(
                np.concatenate((ob, prev_ob), axis=0))

            if done:
                break

        if i % 10 == 0:
            print("iter " + str(i))

    return replay_buffer
Example #28
        if args.use_levy:
            action = (action.tolist()[0] + sampleFromLevy(
                args.levy_mu, args.levy_scale, env.action_space)).clip(
                    env.action_space.low, env.action_space.high)

        else:

            action = (action.tolist()[0] +
                      np.random.normal(0,
                                       max_action * args.exploration_noise,
                                       size=env.action_space.shape[0])).clip(
                                           env.action_space.low,
                                           env.action_space.high)

    # TRY NOT TO MODIFY: execute the game and log data.
    next_obs, reward, done, info = env.step(action)
    episode_reward += reward

    # ALGO LOGIC: training.
    rb.put((obs, action, reward, next_obs, done))
    if global_step > args.learning_starts:
        s_obs, s_actions, s_rewards, s_next_obses, s_dones = rb.sample(
            args.batch_size)
        with torch.no_grad():
            next_state_actions = (target_actor.forward(
                s_next_obses, device)).clamp(env.action_space.low[0],
                                             env.action_space.high[0])
            qf1_next_target = qf1_target.forward(s_next_obses,
                                                 next_state_actions, device)
            next_q_value = torch.Tensor(s_rewards).to(device) + (
                1 - torch.Tensor(s_dones).to(device)) * args.gamma * (
Example #29
def evaluate(
    env: TimeLimit,
    total_episodes: int,
    *,
    q_table: np.ndarray = None,
    winning_reward: float = None,
    is_random: bool = False,
    render: bool = False,
    display_result: bool = False,
) -> float:
    """
    Evaluate the performance of a Q-table at solving a gym environment.
    It may also act randomly instead of using a Q-table,
    in order to compare the performance of a Q-table against a random baseline.
    :param env: gym environment to solve
    :param total_episodes: number of times to repeat the evaluation.
           The bigger it is, the more statistically significant the output will be
    :param q_table: Q-table used to solve the problem;
           if given, is_random must be False
    :param winning_reward: the reward given to the agent when it solves the problem.
           It is used to count how many times the agent solved the problem
    :param is_random: if True, act randomly instead of using the Q-table.
           If True, q_table must not be given
    :param render: if True, call env.render()
    :param display_result: if True, print an evaluation summary to the console at the end
    """
    # Todo : rename and re-think is_random parameter into policy parameter
    # Todo : render only last evaluation
    # Todo : yield q-table, evaluate it and continue evaluation if it is not good enough

    if (q_table is not None) and is_random:
        raise RuntimeError("is_random and q_table given")
    elif q_table is None and not is_random:
        raise RuntimeError(
            "at least one of q_table and is_random must be given")

    total_epochs, total_reward, total_won_episodes = 0, 0, 0

    for _ in range(total_episodes):
        state = env.reset()
        if render:
            env.render()
        done = False
        while not done:
            if is_random:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])
            state, reward, done, info = env.step(action)

            total_epochs += 1
            total_reward += reward

            if render:
                env.render()

        # noinspection PyUnboundLocalVariable
        if reward == winning_reward:
            total_won_episodes += 1

    score = round(total_won_episodes * 100 / total_episodes, 2)

    if display_result:
        print("-" * 30)
        print(
            f"Results after {total_episodes} episodes using {'random' if is_random else 'q_table'}:"
        )
        print(f"Average steps per episode: {total_epochs / total_episodes}")
        print(f"Average reward per episode: {total_reward / total_episodes}")
        print(f"Percentage of won episodes : {score}%")
    return score
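
A hedged usage sketch for evaluate above: comparing a (here untrained) Q-table against the random baseline on a tabular environment. FrozenLake-v0 and its winning reward of 1.0 are assumptions, not part of the original project.

# Hedged usage sketch for evaluate(); FrozenLake-v0 and winning_reward=1.0 are assumptions.
import gym
import numpy as np
from gym.wrappers import TimeLimit

env = TimeLimit(gym.make("FrozenLake-v0").unwrapped, max_episode_steps=100)
q_table = np.zeros((env.observation_space.n, env.action_space.n))

random_score = evaluate(env, 100, is_random=True, winning_reward=1.0, display_result=True)
greedy_score = evaluate(env, 100, q_table=q_table, winning_reward=1.0, display_result=True)
print(random_score, greedy_score)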