Example #1
def play_with_car():
    maximum_steps_allowed = 250
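    # TimeLimit is set one step beyond the hand-rolled cap, so the for-loop
    # below (not the wrapper) is what normally ends the run.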
    env = TimeLimit(MountainCarEnv(),
                    max_episode_steps=maximum_steps_allowed + 1)
    actions = {'left': 0, 'stop': 1, 'right': 2}

    initial_state = env.reset()
    print('Initial state: ', initial_state)

    for t in range(maximum_steps_allowed):
        # need to modify policy
        if t < 50:
            s, r, done, _ = env.step(actions['left'])
        elif t < 70:
            s, r, done, _ = env.step(actions['right'])
        elif t < 120:
            s, r, done, _ = env.step(actions['left'])
        else:
            s, r, done, _ = env.step(actions['right'])

        print('State {}, Reward {}, Step {}'.format(s, r, t))
        env.render()

        if done:
            if s[0] > 0.47:
                print('Well done!')
            else:
                print('Please, try again.')
            break
    else:
        print('Time is up. Please, try again.')

    env.close()
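
The examples on this page all follow the same basic pattern: wrap an environment in TimeLimit, step it until done, and close it. A minimal, self-contained sketch of that pattern (assuming the standard CartPole-v1 registration and the older 4-tuple step API used throughout these examples):

import gym
from gym.wrappers import TimeLimit


def run_one_episode(max_episode_steps=50):
    # Force `done` after at most `max_episode_steps` steps.
    env = TimeLimit(gym.make("CartPole-v1"), max_episode_steps=max_episode_steps)
    obs = env.reset()
    total_reward, done, info = 0.0, False, {}
    while not done:
        obs, reward, done, info = env.step(env.action_space.sample())
        total_reward += reward
    # Recent gym releases flag limit-induced termination in `info`; treat this
    # key as an assumption if your gym version predates it.
    if info.get("TimeLimit.truncated", False):
        print("Episode cut off by TimeLimit")
    env.close()
    return total_reward
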
Example #2
def test_random_task_on_each_episode():
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"level": 0},
            5: {"level": 1},
            200: {"level": 2},
            300: {"level": 3},
            400: {"level": 4},
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
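    # With new_random_task_on_reset=True, every reset samples a task at random,
    # so 10 resets should produce more than one distinct task label.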
    task_labels = []
    for i in range(10):
        obs = env.reset()
        task_labels.append(obs["task_labels"])
    assert len(set(task_labels)) > 1

    # Episodes only last 10 steps. With new_random_task_on_reset=True the task
    # is resampled on reset rather than following the task schedule, and it
    # stays fixed for the whole episode.
    obs = env.reset()
    start_task_label = obs["task_labels"]
    for i in range(10):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs["task_labels"] == start_task_label
        if i == 9:
            assert done
        else:
            assert not done

    env.close()
Example #3
def play(env_name: str, manual_control: bool, max_steps: int):
    # Make environment
    env = TimeLimit(gym.make(env_name, render=True), max_steps)
    observation = env.reset()

    if manual_control:
        # Create user debug interface
        import pybullet as p
        params = [
            p.addUserDebugParameter(
                p.getJointInfo(env.robot_id, j)[1].decode(), -1, 1, 0)
            for j in env.joint_list
        ]

    reward_sum = 0
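    # Step forever with either the debug-slider values or random actions,
    # printing the running reward and resetting whenever the episode ends.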
    while True:
        if manual_control:
            # Read user input and simulate motor
            a = [p.readUserDebugParameter(param) for param in params]
        else:
            a = env.action_space.sample()

        observation, reward, done, _ = env.step(a)
        reward_sum += reward
        print("\nobservation", observation)
        print("reward", reward)
        print("total reward", reward_sum)
        print("done", done)

        # Reset when done
        if done:
            observation = env.reset()
            reward_sum = 0

    env.close()
Example #4
def test_task_schedule_monsterkong():
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(env,
                               task_schedule={
                                   0: {
                                       "level": 0
                                   },
                                   100: {
                                       "level": 1
                                   },
                                   200: {
                                       "level": 2
                                   },
                                   300: {
                                       "level": 3
                                   },
                                   400: {
                                       "level": 4
                                   },
                               },
                               add_task_id_to_obs=True)
    obs = env.reset()

    # img, task_labels = obs
    assert obs[1] == 0
    assert env.get_level() == 0

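    # The schedule switches levels every 100 steps, so at step i both the task
    # id in the observation (obs[1]) and env.level should equal i // 100.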
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == i // 100
        assert env.level == i // 100
        env.render()
        assert isinstance(done, bool)
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()

    assert obs[1] == 4
    assert env.level == 4
    # level stays the same even after reaching that objective.
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == 4
        assert env.level == 4
        env.render()
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()

    env.close()
Example #5
File: test.py Project: ltbd78/RL
def test(pkl_path, pth_path, env, attempts, display=False, video_dir=None):
    with open(pkl_path, 'rb') as f:
        logs = pickle.load(f)

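    # Re-apply the episode-length cap that was used during training, if any.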
    if logs['params']['max_episode_steps'] is not None:
        env = TimeLimit(env,
                        max_episode_steps=logs['params']['max_episode_steps'])

    if video_dir:
        if not os.path.exists(video_dir):
            os.makedirs(video_dir)
        env = Monitor(env, video_dir, force=True)

    if logs['agent'] == 'dqn':
        agent = DQNAgent(env.observation_space, env.action_space,
                         **logs['params'])
        agent.epsilon = 0
    elif logs['agent'] == 'a2c':
        agent = A2CAgent(env.observation_space, env.action_space,
                         **logs['params'])
    elif logs['agent'] == 'td3':
        agent = TD3Agent(env.observation_space, env.action_space,
                         **logs['params'])
    elif logs['agent'] == 'random':
        agent = RandomAgent(env.observation_space, env.action_space,
                            **logs['params'])

    agent.load(pth_path)

    try:
        rewards = []
        for attempt in range(attempts):
            state = env.reset()
            sum_reward = 0
            t = 0
            done = False
            while not done:
                action = agent.get_action(state)
                next_state, reward, done, _ = env.step(action)
                state = next_state
                sum_reward += reward
                t += 1
                if display:
                    title = f'Attempt: {attempt+1} | Timestep: {t} | Reward: {reward} | Sum Reward: {sum_reward}'
                    render(env, title)
            rewards.append(sum_reward)
        env.close()
        return rewards
    except Exception:
        traceback.print_exc()
        breakpoint()
        env.close()
Example #6
def main():
    env = make_cmdp(args.cmdp, episodic=True)
    env = TimeLimit(env, 10)

    agent_model_name = args.cmdp.split('/')[-1]
    agent_model = agent_models.get_agent_model(agent_model_name)

    values_df_index = 'E[G]', 'E[G | A=a]', 'E[G | do(A=a)]'
    values_df_columns = env.model.actions

    _, state = env.reset()
    for t in itt.count():
        print()
        print(f't: {t}')
        env.render()

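        # Estimate the return for each action in three ways, matching the rows
        # of values_df: E[G] ('none'), E[G | A=a] ('condition'), and
        # E[G | do(A=a)] ('intervention').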
        Qs_none = [
            infer_Q(env, action, 'none', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        Qs_condition = [
            infer_Q(env, action, 'condition', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        Qs_intervention = [
            infer_Q(env, action, 'intervention',
                    agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]

        values_df = pd.DataFrame(
            [Qs_none, Qs_condition, Qs_intervention],
            values_df_index,
            values_df_columns,
        )
        print(values_df)

        action = torch.tensor(Qs_intervention).argmax()
        state, _, done, _ = env.step(action)

        if done:
            print()
            print(f'final state: {state}')
            print(f'Episode finished after {t+1} timesteps')
            break

    env.close()
Example #7
def run_episodes(neps, seed):
    reward_fn = 'task1_reward'
    termination_fn = 'pos_and_rot_close_to_goal'
    # termination_fn = 'position_close_to_goal'
    initializer = 'task4_init'
    env = make_training_env(reward_fn,
                            termination_fn,
                            initializer,
                            action_space='torque_and_position',
                            init_joint_conf=True,
                            visualization=True,
                            grasp='pinch',
                            rank=seed)
    env = env.env  # HACK to remove FLatObservationWrapper
    # tmp_dir = '/tmp/video'
    # env = Monitor(RenderWrapper(TimeLimit(env, 1000)), tmp_dir,
    #               video_callable=lambda episode_id: True, mode='evaluation',
    #               force=True)
    env = TimeLimit(env, 1000)
    viz = Viz()
    for _ in range(neps):
        obs = env.reset()

        p.configureDebugVisualizer(p.COV_ENABLE_GUI, 0)
        p.resetDebugVisualizerCamera(cameraDistance=0.6,
                                     cameraYaw=0,
                                     cameraPitch=-40,
                                     cameraTargetPosition=[0, 0, 0])
        viz.reset(obs)
        # tip_pd = TipPD([10, 1], 0.7 * env.cube_tip_positions)
        tip_pd = None
        controller = ForceControlPolicy(env, True, tip_pd)
        # obs = grasp_force_control(env, obs, controller.get_grasp_torque)
        obs = grasp_tippos_control(env, obs)

        # Then move toward the goal positions
        env.unwrapped.action_space = TriFingerPlatform.spaces.robot_torque.gym
        env.unwrapped.action_type = cube_env.ActionType.TORQUE
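        # Apply the force controller's torques until the episode terminates,
        # updating the cube-orientation visualization at each step.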
        done = False
        while not done:
            # transform wrenches to base frame
            torque = controller(obs)
            obs, reward, done, info = env.step(torque)
            viz.update_cube_orientation(obs)
            time.sleep(0.01)

    env.close()
Example #8
def main():
    env = make_mdp(args.mdp, episodic=True)
    env = TimeLimit(env, 10)
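    # Episodes are capped at 10 steps, so the rollout below always terminates.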

    env.reset()
    for t in itt.count():
        print('---')
        print(f't: {t}')
        print('state:')
        env.render()

        action = policy(env, log=True)
        _, reward, done, _ = env.step(action)
        print(f'reward: {reward}')

        if done:
            print('final state:')
            env.render()
            print(f'Episode finished after {t+1} timesteps')
            break

    env.close()
Example #9
            nn.utils.clip_grad_norm_(list(actor.parameters()),
                                     args.max_grad_norm)
            actor_optimizer.step()

            # update the target network
            for param, target_param in zip(actor.parameters(),
                                           target_actor.parameters()):
                target_param.data.copy_(args.tau * param.data +
                                        (1 - args.tau) * target_param.data)
            for param, target_param in zip(qf1.parameters(),
                                           qf1_target.parameters()):
                target_param.data.copy_(args.tau * param.data +
                                        (1 - args.tau) * target_param.data)

        if global_step % 100 == 0:
            writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step)
            writer.add_scalar("losses/actor_loss", actor_loss.item(),
                              global_step)

    # TRY NOT TO MODIFY: CRUCIAL step easy to overlook
    obs = next_obs

    if done:
        # TRY NOT TO MODIFY: record rewards for plotting purposes
        print(f"global_step={global_step}, episode_reward={episode_reward}")
        writer.add_scalar("charts/episode_reward", episode_reward, global_step)
        obs, episode_reward = env.reset(), 0

env.close()
writer.close()
Example #10
def train(agent_type, env, verbose=True, save_freq=50, save_dir='./', **params):
    if verbose:
        print(params)
    
    if agent_type == 'dqn':
        agent = DQNAgent(env.observation_space, env.action_space, **params)
    elif agent_type == 'a2c':
        agent = A2CAgent(env.observation_space, env.action_space, **params)
    elif agent_type == 'td3':
        agent = TD3Agent(env.observation_space, env.action_space, **params)
    elif agent_type == 'random':
        agent = RandomAgent(env.observation_space, env.action_space, **params)
    
    if params['max_episode_steps'] is not None:
        env = TimeLimit(env, max_episode_steps=params['max_episode_steps'])
    log = {'agent':agent_type, 'params':params, 'episodes':[]}
    
    if save_dir[-1] != '/':
        raise NotADirectoryError(save_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
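    # Interact until the total step budget ('max_steps') is spent; before
    # 'start_at' steps have elapsed, actions are sampled at random to warm up
    # the replay memory.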
    try:
        ep = 0
        t_total = 0
        while t_total < params['max_steps']:
            state = env.reset()
            sum_reward = 0
            t_ep = 0
            done = False
            
            while not done:
                if t_total > params['start_at']:
                    action = agent.get_action(state)
                else:
                    action = env.action_space.sample()
                
                next_state, reward, done, _ = env.step(action)
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                sum_reward += reward
                t_ep += 1
                
                # for agents using online training
                if agent.online and t_total > params['start_at']:
                    agent.learn()
            
            # for agents using offline training
            if not agent.online and t_total > params['start_at']:
                agent.learn()
            
            ep += 1
            t_total += t_ep
            ep_info = {'episode':ep, 't_ep':t_ep, 't_total':t_total, 'sum_reward':sum_reward, 'optim_steps':agent.optim_steps, 'memory':len(agent.memory)}
            log['episodes'].append(ep_info)
            if verbose:
                print(ep_info)    

            if ep % save_freq == 0:                
                agent.save(save_dir + params['file_name'] + '.pth')
                with open(save_dir + params['file_name'] + '.pkl', 'wb') as f:
                    pickle.dump(log, f)
                if verbose:
                    print('Episode ' + str(ep) + ': Saved model weights and log.')
        env.close()
        
    except Exception:
        traceback.print_exc()
        breakpoint()
        env.close()