Example #1
def probe_action_single(env_name, ind, use_cuda=False):
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu") if use_cuda else "cpu"
    policy_params = ParamDict(c=10, )
    params = ParamDict(policy_params=policy_params, use_cuda=False)

    def rand_actor_low(x, y):
        # earlier probing variants (all-zero action, a single pulse on index `ind`,
        # constant max action) are superseded by this clipped Gaussian sample
        action = (torch.normal(mean=torch.zeros(action_dim),
                               std=torch.ones(action_dim)) * max_action).clamp(
                                   -max_action, max_action)
        return action

    def rand_actor_high(x):
        action = torch.Tensor(np.random.normal(size=state_dim)) * 10
        return action

    env = create_maze_env(env_name=env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    log_video_hrl_dev(env_name, rand_actor_low, rand_actor_high, params)
Example #2
def run_environment(env_name, episode_length, num_episodes):
  env = EnvWithGoal(
      create_maze_env.create_maze_env(env_name).gym,
      env_name)

  def action_fn(obs):
    action_space = env.action_space
    action_space_mean = (action_space.low + action_space.high) / 2.0
    action_space_magn = (action_space.high - action_space.low) / 2.0
    random_action = (action_space_mean +
                     action_space_magn *
                     np.random.uniform(low=-1.0, high=1.0,
                                       size=action_space.shape))
    return random_action

  rewards = []
  successes = []
  for ep in range(num_episodes):
    rewards.append(0.0)
    successes.append(False)
    obs = env.reset()
    for _ in range(episode_length):
      obs, reward, done, _ = env.step(action_fn(obs))
      rewards[-1] += reward
      successes[-1] = success_fn(reward)
      if done:
        break
    tf.logging.info('Episode %d reward: %.2f, Success: %d', ep + 1, rewards[-1], successes[-1])

  tf.logging.info('Average Reward over %d episodes: %.2f',
                  num_episodes, np.mean(rewards))
  tf.logging.info('Average Success over %d episodes: %.2f',
                  num_episodes, np.mean(successes))
Example #3
def run_environment(env_name, episode_length, num_episodes):
  env = EnvWithGoal(
      create_maze_env.create_maze_env(env_name).gym,
      env_name)

  def action_fn(obs):
    action_space = env.action_space
    action_space_mean = (action_space.low + action_space.high) / 2.0
    action_space_magn = (action_space.high - action_space.low) / 2.0
    random_action = (action_space_mean +
                     action_space_magn *
                     np.random.uniform(low=-1.0, high=1.0,
                                       size=action_space.shape))
    return random_action

  rewards = []
  successes = []
  for ep in range(num_episodes):
    rewards.append(0.0)
    successes.append(False)
    obs = env.reset()
    for _ in range(episode_length):
      obs, reward, done, _ = env.step(action_fn(obs))
      rewards[-1] += reward
      successes[-1] = success_fn(reward)
      if done:
        break
    logging.info('Episode %d reward: %.2f, Success: %d', ep + 1, rewards[-1], successes[-1])

  logging.info('Average Reward over %d episodes: %.2f',
               num_episodes, np.mean(rewards))
  logging.info('Average Success over %d episodes: %.2f',
               num_episodes, np.mean(successes))
Example #4
def log_video_hrl(env_name, actor_low, actor_high, params):
    actor_low = copy.deepcopy(actor_low).cpu()
    actor_high = copy.deepcopy(actor_high).cpu()
    actor_high.max_goal = actor_high.max_goal.to('cpu')
    policy_params = params.policy_params
    goal_dim = params.goal_dim
    if env_name in envnames_mujoco:
        env = gym.make(env_name)
    elif env_name in envnames_ant:
        env = create_maze_env(env_name=env_name)
    print('\n    > Collecting current trajectory...')
    done = False
    step = 1
    state = torch.Tensor(env.reset())
    goal = torch.randn(goal_dim)
    episode_reward, frame_buffer = 0, []
    while not done and step < 600:
        frame_buffer.append(env.render(mode='rgb_array'))
        action = actor_low(torch.Tensor(state), torch.Tensor(goal)).detach()
        next_state, reward, done, info = env.step(action)
        if (step + 1) % policy_params.c == 0 and step > 0:
            # every c steps the high-level policy emits a fresh goal
            goal = actor_high(state)
        else:
            # goal transition g_{t+1} = s_t + g_t - s_{t+1} (on the goal dims),
            # so the goal keeps pointing at the same absolute target as the agent moves
            goal = (torch.Tensor(state)[:goal_dim] + goal -
                    torch.Tensor(next_state)[:goal_dim]).float()
        state = next_state
        episode_reward += reward
        step += 1
    print(
        f'    > Finished collection, saved video. Episode reward: {float(episode_reward):.3f}\n'
    )
    # wandb.Video expects frames shaped (time, channels, height, width)
    frame_buffer = np.array(frame_buffer).transpose(0, 3, 1, 2)
    wandb.log({"video": wandb.Video(frame_buffer, fps=30, format="mp4")})
    env.close()
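
The else branch above is the directional goal transition used in HIRO-style hierarchies. Factored out as a standalone helper (a sketch for clarity, not a function that exists in the original code), it reads:

def goal_transition(state, goal, next_state, goal_dim):
    # g_{t+1} = s_t[:goal_dim] + g_t - s_{t+1}[:goal_dim]: the relative goal is
    # shifted each step so it keeps pointing at the same absolute target position
    return (torch.Tensor(state)[:goal_dim] + goal -
            torch.Tensor(next_state)[:goal_dim]).float()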
Example #5
    def _set_env(self):
        env = create_maze_env(self.env_name)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        object.__setattr__(self, 'env', env)
        object.__setattr__(self, 'state_dim', state_dim)
        object.__setattr__(self, 'action_dim', action_dim)
        # hard code goal dim to only include x, y, z
        object.__setattr__(self, 'goal_dim', 3)
Example #6
def create_env(cnf):
    '''Creates an environment from either OpenAI Gym or the GoogleBrain MuJoCo
    AntMaze gym environment.
    The environment is wrapped so that the agent tries to reach a global target
    position, which is appended to the obs; obs[:2] are taken as the x, y coordinates.
    '''
    # *show* is necessary because we need to load a different xml file with spheres
    from environments.create_maze_env import create_maze_env
    env = create_maze_env(**cnf.maze_env)
    return EnvWithGoal(env, **cnf.env_w_goal)
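
For reference, a minimal sketch of how this factory might be called, assuming cnf exposes maze_env and env_w_goal mappings that match the keyword unpacking above (the concrete keys and values are illustrative placeholders, not taken from the source):

from types import SimpleNamespace

# hypothetical config; the keys mirror **cnf.maze_env and **cnf.env_w_goal above,
# the values are placeholders for illustration only
cnf = SimpleNamespace(
    maze_env={'env_name': 'AntMaze'},
    env_w_goal={'env_name': 'AntMaze'},
)
env = create_env(cnf)
obs = env.reset()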
Example #7
def get_env(env_name):
    global envnames_ant
    global envnames_mujoco
    if env_name in envnames_ant:
        env = create_maze_env(env_name=env_name)
    elif env_name in envnames_mujoco:
        env = gym.make(env_name)
    else:
        raise NotImplementedError(
            "environment {} is not supported!".format(env_name))
    return env
Example #8
def log_video(env_name, actor):
    if env_name in envnames_mujoco:
        env = gym.make(env_name)
    elif env_name in envnames_ant:
        env = create_maze_env(env_name=env_name)
    print('\n    > Collecting current trajectory...')
    done = False
    step = 1
    state = env.reset()
    frame_buffer = []
    while not done:
        frame_buffer.append(env.render(mode='rgb_array'))
        action = actor(torch.Tensor(state)).detach().cpu()
        state, reward, done, info = env.step(action)
        step += 1
    print('    > Finished collection, saved video.\n')
    frame_buffer = np.array(frame_buffer).transpose(0, 3, 1, 2)
    wandb.log({"video": wandb.Video(frame_buffer, fps=30, format="mp4")})
    env.close()
Example #9
def test_log_video_hrl(use_cuda=False):
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu") if use_cuda else "cpu"
    policy_params = ParamDict(c=10, )
    params = ParamDict(policy_params=policy_params, use_cuda=False)

    def rand_actor_low(x, y):
        return torch.Tensor(np.random.normal(size=action_dim)) * max_action

    def rand_actor_high(x):
        return torch.Tensor(np.random.normal(size=state_dim)) * 10

    for env_name in envnames_ant:
        env = create_maze_env(env_name=env_name)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])
        actor_low = ActorLow(state_dim, action_dim, max_action).to(device)
        actor_high = ActorHigh(state_dim, max_action).to(device)
        log_video_hrl(env_name, rand_actor_low, rand_actor_high, params)
Example #10
def create_env(cnf):
    '''Creates an environment from either OpenAI Gym or the GoogleBrain MuJoCo
    AntMaze gym environment.
    The environment is wrapped so that the agent tries to reach a global target
    position, which is appended to the obs; obs[:2] are taken as the x, y coordinates.
    '''
    if cnf.main.vrep:
        # Load cool robotics env
        from environments.coppeliagym import CoppeliaEnv
        print(f'Force mode is {cnf.coppeliagym.params.force}')
        print(f'ee pos is: {cnf.coppeliagym.params.ee_pos}')
        env = CoppeliaEnv(cnf.coppeliagym)
        print(f'Target is: {env._pos_b1}')
        if cnf.main.render:
            env.render()
        return env
    else:
        # *show* is necessary because we need to load a different xml file with spheres
        from environments.create_maze_env import create_maze_env
        env = create_maze_env(**cnf.maze_env)
        return EnvWithGoal(env, **cnf.env_w_goal)
Example #11
def interact_env(env_name, video=False):
    env = create_maze_env(env_name=env_name)
    print('\n    > Collecting random trajectory...')
    done = False
    step = 1
    obs = env.reset()
    frame_buffer = []
    while not (done or step > 100):
        if video:
            frame_buffer.append(env.render(mode='rgb_array'))
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        step += 1
        print(f"      > Reward: {reward:.3f}")
    print('    > Finished collection', end='')
    if video:
        frame_buffer = np.array(frame_buffer).transpose(0, 3, 1, 2)
        wandb.log({"video": wandb.Video(frame_buffer, fps=30, format="mp4")})
        print(', saved video.\n')
        env.close()
    else:
        print('.\n')
    return env
Example #12
def log_video_hrl_dev(env_name, actor_low, actor_high, params):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu"
                          ) if params.use_cuda else "cpu"
    # device = "cpu"
    policy_params = params.policy_params
    if env_name in envnames_mujoco:
        env = gym.make(env_name)
    elif env_name in envnames_ant:
        env = create_maze_env(env_name=env_name)
    state_dim = env.observation_space.shape[0]
    print('\n    > Collecting current trajectory...')
    done = False
    step = 1
    state = torch.Tensor(env.reset())
    goal = torch.randn_like(state)
    episode_reward, frame_buffer = 0, []
    while not done and step < 200:
        frame_buffer.append(env.render(mode='rgb_array'))
        action = actor_low(
            torch.Tensor(state).to(device),
            torch.Tensor(goal).to(device)).detach().cpu()
        next_state, reward, done, info = env.step(action)
        if (step + 1) % policy_params.c == 0 and step > 0:
            # every c steps the high-level policy emits a fresh goal
            goal = actor_high(state)
        else:
            # goal transition: shift the goal so it tracks the same absolute target
            goal = state + goal - next_state
        state = next_state
        for i in range(state_dim):
            wandb.log({'state[{}]'.format(i): state[i]}, step=step)
        episode_reward += reward
        step += 1
    print(
        f'    > Finished collection, saved video. Episode reward: {float(episode_reward):.3f}\n'
    )
    frame_buffer = np.array(frame_buffer).transpose(0, 3, 1, 2)
    wandb.log({"video": wandb.Video(frame_buffer, fps=30, format="mp4")})
    env.close()
Example #13
        if cd1_end_of_episode or cd2_success:
            done = True
        reward = reward if not done else 0
        return done, reward

    def observation_wrapper(self, obs):
        if not self.step_count % 10:
            self.posix = obs[0]
            self.posiy = obs[1]
        obs[0] = obs[0] - self.posix
        obs[1] = obs[1] - self.posiy

    def set_target(self, goal):
        print("Set target")
        self.base_env.wrapped_env.set_target(goal)

    def set_goal(self, goal):
        print("Set target")
        self.base_env.wrapped_env.set_goal(goal)


if __name__ == '__main__':
    env_name = "AntMaze"
    from environments.create_maze_env import create_maze_env
    env = create_maze_env(env_name, render=False)
    env = EnvWithGoal(env, env_name, 500, render=False, evalmode=False)
    obs = env.reset()
    for i in range(1000):
        obs, reward, done, _ = env.step(env.action_space.sample())
        print(done)
Example #14
def probe_action(env_name, use_cuda=False):
    env = create_maze_env(env_name=env_name)
    action_dim = env.action_space.shape[0]
    for i in range(action_dim):
        probe_action_single(env_name, i, use_cuda=use_cuda)