Example #1
File: explore.py  Project: anthliu/dmc3gym
def main(argv):
    del argv
    environment_name = FLAGS.environment_name
    if environment_name is None:
        print('\n  '.join(['Available environments:'] + _ALL_NAMES))
        environment_name = prompt_environment_name(
            'Please select an environment name: ', _ALL_NAMES)

    index = _ALL_NAMES.index(environment_name)
    domain_name, task_name = (suite.ALL_TASKS + custom_suite.ALL_TASKS)[index]

    task_kwargs = {}
    if not FLAGS.timeout:
        task_kwargs['time_limit'] = float('inf')

    def loader():
        try:
            env = suite.load(domain_name=domain_name,
                             task_name=task_name,
                             task_kwargs=task_kwargs)
        except ValueError:
            task_kwargs['params'] = [0.25, 1.0, 8]
            env = custom_suite.load(domain_name=domain_name,
                                    task_name=task_name,
                                    task_kwargs=task_kwargs)

        env.task.visualize_reward = FLAGS.visualize_reward
        if FLAGS.action_noise > 0:
            env = action_noise.Wrapper(env, scale=FLAGS.action_noise)
        return env

    viewer.launch(loader)
Example #2
def main(unused_argv):
    # The viewer calls the environment_loader on episode resets. However, the task
    # cycles through one clip per episode. To avoid replaying the first clip again
    # and again, we construct the environment outside the viewer so that it
    # persists across resets.
    env = mocap_playback_env()
    viewer.launch(environment_loader=lambda: env)
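Note: as the comment in Example #2 explains, viewer.launch accepts either a zero-argument callable that builds a fresh environment on each episode reset, or an already-constructed environment that then persists across resets. A minimal sketch of both call styles (assuming only the standard dm_control suite):

from dm_control import suite, viewer

def make_env():
    # A new environment is constructed on every episode reset.
    return suite.load(domain_name="cartpole", task_name="swingup")

viewer.launch(make_env)
# Passing an instance instead keeps the same environment across resets:
# viewer.launch(suite.load(domain_name="cartpole", task_name="swingup"))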
Example #3
def main(_):
  if FLAGS.suite == 'dm_control':
    logging.info('Loading from dm_control...')
    env = suite.load(domain_name=FLAGS.domain_name, task_name=FLAGS.task_name)
  elif FLAGS.suite == 'rwrl':
    logging.info('Loading from rwrl...')
    env = rwrl.load(domain_name=FLAGS.domain_name, task_name=FLAGS.task_name)
  random_policy = RandomAgent(env.action_spec()).action
  viewer.launch(env, policy=random_policy)
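Example #3 relies on a RandomAgent helper that is not shown in the snippet. A minimal sketch of such an agent (hypothetical, written against only the dm_env action-spec interface used elsewhere on this page):

import numpy as np

class RandomAgent:
    """Samples actions uniformly from a dm_env action spec."""

    def __init__(self, action_spec):
        self._action_spec = action_spec

    def action(self, time_step):
        del time_step  # Unused.
        return np.random.uniform(low=self._action_spec.minimum,
                                 high=self._action_spec.maximum,
                                 size=self._action_spec.shape)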
Example #4
def main(argv):
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    viewer.launch(environment_loader=functools.partial(
        soccer.load,
        team_size=2,
        walker_type=soccer.WalkerType[FLAGS.walker_type],
        disable_walker_contacts=FLAGS.disable_walker_contacts,
        enable_field_box=FLAGS.enable_field_box,
        keep_aspect_ratio=True,
        terminate_on_goal=FLAGS.terminate_on_goal))
Example #5
    def test_interact(self, model_path, random=False):
        """load trained parameters"""
        if not random:
            self._actor.load_state_dict(torch.load(model_path))

        if self.benchmark == "dm_control":
            if random:

                def random_policy(time_step):
                    del time_step  # Unused.
                    return np.random.uniform(
                        low=self._env.action_spec().minimum,
                        high=self._env.action_spec().maximum,
                        size=self._env.action_spec().shape)

                viewer.launch(self._env, policy=random_policy)
            else:

                def source_policy(time_step):
                    s = None
                    for k, v in time_step.observation.items():
                        if s is None:
                            s = v.flatten()
                        else:
                            s = np.hstack([s, v.flatten()])
                    s_3d = np.reshape(s, [1, self.state_dim])
                    mu, std = self._actor(torch.Tensor(s_3d).to(self.dev))
                    action = self._actor.get_action(mu, std)

                    return action

                viewer.launch(self._env, policy=source_policy)
        elif self.benchmark == "gym":
            for ep in range(self.test_iter):
                score = 0
                done = False
                state = self._env.reset()
                state = np.reshape(state, [1, self.state_dim])
                while not done:
                    mu, std = self._actor(torch.Tensor(state).to(self.dev))
                    action = self._actor.get_action(mu, std)

                    if random:
                        next_state, reward, done, info = self._env.step(
                            np.random.randn(self.action_dim))
                    else:
                        next_state, reward, done, info = self._env.step(action)
                    self._env.render()

                    score = self.gamma * score + reward
                    next_state = np.reshape(next_state, [1, self.state_dim])
                    state = next_state
                print(f"test iter : {ep}\tscore : {score}")
Example #6
    def make_env(self, args=None, kwargs=None, dm_task_name=None):
        """Create dm_control/metaworld environment"""

        if self.metaworld_env:
            env = mtw_envs_rand[self.env_name](*args, **kwargs)

            if debug_mode:
                env._max_episode_steps = 10000

                env.reset()
                env.render()
                global action_to_take
                glfw.set_key_callback(env.unwrapped.viewer.window, on_press)

                while True:
                    env.render()

                    if not np.array_equal(action_to_take, np.zeros(6)):
                        _, _, d, _ = env.step(action_to_take)
                        if d:
                            env.seed(args.seed)
                            env.reset()
                            env.render()

                        # Commenting this out makes the mocap faster but
                        # introduces some instabilities.
                        # action_to_take = np.zeros(6)
        else:

            camera_id = 2 if self.domain_name == 'quadruped' else 0
            if dm_task_name is not None:
                task_name = dm_task_name
            else:
                task_name = self.task_name
            env = dmc2gym.make(domain_name=self.domain_name,
                               task_name=task_name,
                               seed=self.cfg.seed,
                               visualize_reward=False,
                               from_pixels=False,
                               height=self.cfg.image_size,
                               width=self.cfg.image_size,
                               frame_skip=self.cfg.action_repeat,
                               camera_id=camera_id)

            if debug_mode:
                from dm_control import viewer
                viewer.launch(env)

        env = FrameStack(env, k=self.cfg.frame_stack)
        env.seed(self.cfg.seed)

        return env
Example #7
def main(argv):
  del argv
  environment_name = FLAGS.environment_name

  all_names = list(manipulation.ALL)

  if environment_name is None:
    print('\n  '.join(['Available environments:'] + all_names))
    environment_name = prompt_environment_name(
        'Please select an environment name: ', all_names)

  loader = functools.partial(
      manipulation.load, environment_name=environment_name)
  viewer.launch(loader)
Example #8
def build_and_test(model_path, config_key):
    import dmc_wrapper
    from dm_control import viewer
    from rlpyt.utils.buffer import buffer_from_example, torchify_buffer, numpify_buffer
    import torch

    config = configs[config_key]

    reloaded = torch.load(model_path) if len(model_path) > 0 else None
    # import pdb; pdb.set_trace()
    agent = MultiFfAgent(model_kwargs=config["model"],
                         initial_model_state_dict=reloaded['agent_state_dict'],
                         **config["agent"])

    dm_env = maw.load(team_size=args.team_size,
                      time_limit=args.time_limit,
                      terrain=not args.no_hfield,
                      agent_type=args.agent_type,
                      deterministic_spawn=not args.random_spawn,
                      raise_exception_on_physics_error=False,
                      task_id=args.task_id)

    env = GymEnvWrapper(dmc2gym.DmControlWrapper('', '', env=dm_env))

    agent.initialize(env.spaces)
    agent.reset()
    # agent.eval_mode(0)

    prev_action = env.action_space.null_value()

    def get_prev_action():
        return prev_action

    def policy(time_step):
        obs = dmc_wrapper.convertObservation(time_step.observation)
        reward = time_step.reward
        reward = np.asarray(reward) if reward is not None else reward

        obs_pyt, act_pyt, rew_pyt = torchify_buffer(
            (obs, get_prev_action(), reward))
        # obs_pyt, rew_pyt = torchify_buffer((obs, reward))

        act_pyt, agent_info = agent.step(obs_pyt.float(), act_pyt, rew_pyt)
        # prev_action = act_pyt

        return act_pyt

    viewer.launch(dm_env, policy=policy)
Example #9
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    environment_name = FLAGS.environment_name
    if environment_name == 'mujoban':
        walker = walkers.JumpingBallWithHead(add_ears=True, camera_height=0.25)
        arena = MujobanLevel(boxoban_level_generator)
        task = Mujoban(walker=walker,
                       maze=arena,
                       control_timestep=CONTROL_TIMESTEP,
                       top_camera_height=64,
                       top_camera_width=48)
        env = composer.Environment(time_limit=TIME_LIMIT,
                                   task=task,
                                   strip_singleton_obs_buffer_dim=True)
    else:
        env = functools.partial(board_games.load,
                                environment_name=environment_name)

    viewer.launch(env)
Example #10
def main(argv):
  del argv
  environment_name = FLAGS.environment_name
  if environment_name is None:
    print('\n  '.join(['Available environments:'] + _ALL_NAMES))
    environment_name = prompt_environment_name(
        'Please select an environment name: ', _ALL_NAMES)

  index = _ALL_NAMES.index(environment_name.lower())
  domain_name, task_name = suite.ALL_TASKS[index]

  task_kwargs = {}
  if not FLAGS.timeout:
    task_kwargs['time_limit'] = float('inf')

  def loader():
    env = suite.load(
        domain_name=domain_name, task_name=task_name, task_kwargs=task_kwargs)
    env.task.visualize_reward = FLAGS.visualize_reward
    return env

  viewer.launch(loader)
Example #11
    def __init__(self, seed, difficulty="easy", render=False):
        self.seed = seed
        self.env_name = "reacher"
        self.env = suite.load(self.env_name,
                              difficulty,
                              visualize_reward=True,
                              task_kwargs={"random": seed})
        self.render = render

        # Debug logs
        self.committed_actions = []
        self.time_step = 0

        if render:
            viewer.launch(self.env)

        MDP.__init__(self,
                     range(self.env.action_spec().minimum.shape[0]),
                     self._transition_func,
                     self._reward_func,
                     init_state=FixedReacherState(
                         self.env.reset().observation))
Example #12
def visualize_trajectory(argv):
    env = build_env(
        reward_type=FLAGS.reward_type, 
        ghost_offset=1, 
        clip_name=FLAGS.clip_name,
        start_step=FLAGS.start_step,
    )
    actions = np.load(FLAGS.load_actions_path)
    analyze_trajectory(env, actions)

    def policy(time_step):
        global step
        if time_step.first():
            step = 0
        else:
            step += 1
        if step < len(actions):
            return actions[step]
        else:
            print('{} Out of actions - returning zeros'.format(step))
            return np.zeros_like(actions[0])

    viewer.launch(env, policy=policy)
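Example #12 keeps the replay index in a module-level global. An equivalent closure-based sketch that avoids the global (illustrative only, reusing the env and actions built above):

import numpy as np

def make_replay_policy(actions):
    counter = {"step": 0}

    def policy(time_step):
        if time_step.first():
            counter["step"] = 0
        index = counter["step"]
        counter["step"] += 1
        if index < len(actions):
            return actions[index]
        return np.zeros_like(actions[0])

    return policy

# viewer.launch(env, policy=make_replay_policy(actions))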
Example #13
def get_policy(dump, action_spec):
    state_dict = torch.load(dump, map_location="cpu")
    policy = Actor(*state_dict["args"].tolist())
    policy.load_state_dict(state_dict)
    policy.eval()

    @torch.no_grad()
    def _policy(time_step):
        state = np.concatenate(list(time_step.observation.values()))
        state_tensor = torch.tensor(state, dtype=torch.float)
        p = policy(state_tensor).numpy()
        return np.clip(p, action_spec.minimum, action_spec.maximum)

    return _policy


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--dump", type=str, default="dumps/model.pth")
    parser.add_argument("--env",
                        nargs=2,
                        type=str,
                        default=["cartpole", "swingup"])
    args = parser.parse_args()

    env = suite.load(domain_name=args.env[0], task_name=args.env[1])
    action_spec = env.action_spec()

    policy = get_policy(args.dump, action_spec)
    viewer.launch(env, policy)
Example #14
def main():
    env = suite.load(domain_name="quadruped", task_name="escape")
    viewer.launch(environment_loader=env)
Example #15
def main():
    # viewer.launch(environment_loader=ant_run)
    # viewer.launch(environment_loader=ant_run_long)
    # viewer.launch(environment_loader=ant_run_walls)
    viewer.launch(environment_loader=ant_run_gaps)
Example #16
from dm_control import suite
from dm_control import viewer
import numpy as np

# Load one task:
env = suite.load(domain_name="quadruped", task_name="fetch")

# Iterate over a task set:
for domain_name, task_name in suite.BENCHMARKING:
  env = suite.load(domain_name, task_name)

#viewer.launch(env)
# Step through an episode and print out reward, discount and observation.
action_spec = env.action_spec()
time_step = env.reset()
#while not time_step.last():
#viewer.launch(env)
all_actions=[]
while not time_step.last():
  action = np.random.uniform(action_spec.minimum,
                             action_spec.maximum,
                             size=action_spec.shape)
  time_step = env.step(action)
  print(time_step.reward, time_step.discount, time_step.observation)
  all_actions.append(action)

# Replay the recorded actions in the viewer. viewer.launch expects a callable
# policy rather than a list of actions, so wrap the recording in a replay policy.
step_index = 0

def replay_policy(time_step):
  global step_index
  if time_step.first():
    step_index = 0
  if step_index < len(all_actions):
    action = all_actions[step_index]
  else:
    action = np.zeros(action_spec.shape)
  step_index += 1
  return action

viewer.launch(env, policy=replay_policy)
#viewer.render()
Example #17

# Define a linear control policy.
def linear_control_policy(time_step):
    # State Variables
    x_dot = time_step.observation['velocity'][0]
    theta_dot = time_step.observation['velocity'][1]
    x = time_step.observation['position'][0]
    theta = np.arccos(time_step.observation['position'][1])

    # Calculate Control Input
    x_vec = np.array([[theta], [theta_dot], [x], [x_dot]])
    u = np.matmul(-np.transpose(K), x_vec)

    # Return the control input; the viewer applies it to the environment itself,
    # so there is no need to call env.step() inside the policy.
    return u


# Launch the viewer application.
viewer.launch(env, policy=linear_control_policy)

# Save K
with open(f"K_{int(time.time())}.pickle", "wb") as f:
    pickle.dump(K, f)

# Plotting
plt.plot([i for i in range(num_ep)], ep_rewards)
plt.ylabel(f"Episode Rewards")
plt.xlabel(f"Episode #")
plt.show()
Example #18

def random_policy(time_step=None):
    del time_step  # Unused.
    # print(env.action_spec().minimum,env.action_spec().maximum,env.action_spec().shape)
    lo = -0.3 * np.ones((2, 1))
    hig = 0.3 * np.ones((2, 1))
    return np.random.uniform(low=lo, high=hig, size=(2, 1))


if __name__ == '__main__':

    env = RopeEnv(use_visual_observation=False,
                  use_image_goal=False,
                  n_substeps=20)
    viewer.launch(ViewerWrapper(env), policy=random_policy)

    # env.reset()
    # print(env.physics.named.model.body_pos)
    # print(env.goal_state)
    # print(env.physics.get_state())
    # action = random_policy().squeeze()
    action = np.ones((2, )) * 100
    while True:

        # print(env.physics.data.ctrl)
        pixels, _, _, _ = env.step(action.squeeze())
        pixels = pixels['observation']
        pixels = pixels / 255.0
        pixels = pixels[:, :, ::-1]  # BGR to RGB
        print(action)
Example #19
    # Get new Discrete Step
    new_discrete_state = get_discrete_state(observations)

    if time_step.discount is None:
        done = True

    if not done:
        max_future_q = np.max(q_table[new_discrete_state])
        current_q = q_table[discrete_state + (action, )]
        new_q = (1 - Learning_Rate) * current_q + Learning_Rate * (
            time_step.reward + Discount * max_future_q)
        q_table[discrete_state + (action, )] = new_q

    discrete_state = new_discrete_state

    # Print the Results of the Action
    print("reward = {}, discount = {}, observations = {}.".format(
        time_step.reward, time_step.discount, time_step.observation))
    return action


plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['avg'], label="avg")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['min'], label="min")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['max'], label="max")
plt.legend(loc=4)
plt.show()

# Launch the viewer application.
viewer.launch(env, policy=random_action_policy)
Example #20
def view_render(env, agent):
    def random_policy(time_step):
        return agent.get_best_action(get_state(time_step.observation))

    viewer.launch(env.env, policy=random_policy)
Example #21
    def create_model(self):
        model = Sequential()
        model.add(Conv2D(256, (3, 3), input_shape=OBSERVATION_SPACE_VALUES))
        model.add(Activation("relu"))
        model.add(MaxPooling2D(2, 2))
        model.add(Dropout(0.2))

        model.add(Conv2D(256, (3, 3)))
        model.add(Activation("relu"))
        model.add(MaxPooling2D(2, 2))
        model.add(Dropout(0.2))

        model.add(Flatten())
        model.add(Dense(64))

        model.add(Dense(ACTION_SPACE_SIZE, activation="linear"))
        model.compile(loss="mse", optimizer=Adam(lr=0.001), metrics=['accuracy'])
        return model
        
    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)

    def get_qs(self, state):
        # Scale values to [0, 1] and add a batch dimension before predicting Q-values.
        return self.model.predict(np.array(state).reshape(-1, *state.shape) / 255)[0]

    def train(self, terminal_state, step):
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        current_states = np.array([transition[0] for transition in minibatch]) / 255
        current_qs_list = self.model.predict(current_states)

        new_current_states = np.array([transition[3] for transition in minibatch]) / 255
        future_qs_list = self.target_model.predict(new_current_states)

        X = []
        y = []

        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
            if not done:
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            current_qs = current_qs_list[index]
            current_qs[action] = new_q

            X.append(current_state)
            y.append(current_qs)

        self.model.fit(np.array(X) / 255, np.array(y), batch_size=MINIBATCH_SIZE,
                       verbose=0, shuffle=False,
                       callbacks=[self.tensorboard] if terminal_state else None)

        if terminal_state:
            self.target_update_counter += 1

        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

# Start DQN
agent = DQNAgent()

# Set up Environment
env = suite.load(domain_name="cartpole", task_name="balance_sparse")
initial_values = env.reset()

# Recording Performance
ep_rewards = []
aggr_ep_rewards = {'ep': [], 'avg': [], 'min': [], 'max': []}

for episode in tqdm(range(1, EPISODES+1), ascii=True, unit="episode"):
    agent.tensorboard.step = episode

    episode_reward = 0
    step = 1
    done = False
    time_step = env.reset()
    current_state = np.concatenate((time_step.observation['position'],time_step.observation['velocity']))

    while not done:
    
        # Decide if taking a random action w/ epsilon
        if np.random.random() > epsilon:
            action = np.argmax(agent.get_qs(current_state))
        else:
            action = np.random.randint(0, ACTION_SPACE_SIZE)
        
        # Perform the Action in the Environment
        time_step = env.step(action)
        reward = time_step.reward
        new_state = np.concatenate((time_step.observation['position'],time_step.observation['velocity']))
      
        if time_step.discount is None:
            done = True
        
        if not done:
            episode_reward += time_step.reward
            
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done, step)
        
        current_state = new_state
        step += 1
        
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value
        
    ep_rewards.append(episode_reward)
    
    if not episode % SHOW_EVERY:
        average_reward = sum(ep_rewards[-SHOW_EVERY:])/len(ep_rewards[-SHOW_EVERY:])
        aggr_ep_rewards['ep'].append(episode)
        aggr_ep_rewards['avg'].append(average_reward)
        aggr_ep_rewards['min'].append(min(ep_rewards[-SHOW_EVERY:]))
        aggr_ep_rewards['max'].append(max(ep_rewards[-SHOW_EVERY:]))

# Set up Environment
env = suite.load(domain_name="cartpole", task_name="balance_sparse")
initial_values = env.reset()


# Get Possible Actions for Environment 
action_spec = env.action_spec()

# Initialize Q Table
initial_observations = np.concatenate((initial_values.observation['position'],initial_values.observation['velocity']))
DISCRETE_OS_SIZE = np.array([30] * len(initial_observations))
guess_high_observation = 1.5
guess_low_observation = -1.5
discrete_os_win_size = np.array(([guess_high_observation - guess_low_observation] * 5)) / DISCRETE_OS_SIZE
action_space = np.array([50])

# Parameters
Learning_Rate = 0.1
Discount = 0.99
Episodes = 10000

SHOW_EVERY = 50

epsilon = 0.5
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = Episodes // 1.5 # // Ensures no float

epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)

q_table = np.random.uniform(low=-1,high=1,size=(np.concatenate((DISCRETE_OS_SIZE, action_space))))

# Recording Performance
ep_rewards = []
aggr_ep_rewards = {'ep': [], 'avg': [], 'min': [], 'max': []}

# Discretize State
def get_discrete_state(state):
    # Map each continuous observation into an integer bucket index.
    discrete_state = (state - guess_low_observation) / discrete_os_win_size
    return tuple(discrete_state.astype(int))
    
discrete_state = get_discrete_state(initial_observations)
#print(q_table[discrete_state])

# Go through Episodes for Training
for episode in range(Episodes):
    done = False
    episode_reward = 0.0
    if episode % SHOW_EVERY == 0:
        print(episode)
    
    # Reset Environment
    initial_values = env.reset()
    initial_observations = np.concatenate((initial_values.observation['position'],initial_values.observation['velocity']))
    discrete_state = get_discrete_state(initial_observations)
    
    while not done:
      # Take a Action within the range of Actions and correct size
      if np.random.random() > epsilon:
        action = np.argmax(q_table[discrete_state])
        action_take = (action/25)-1
      else:
        action = np.random.randint(0,50)
        action_take = (action/25)-1
                               
      # Perform the Action in the Environment
      time_step = env.step(action_take)
      observations = np.concatenate((time_step.observation['position'],time_step.observation['velocity']))
      
      # Get new Discrete Step
      new_discrete_state = get_discrete_state(observations)
      
      if time_step.discount is None:
        done = True
      
      if not done:
        max_future_q = np.max(q_table[new_discrete_state])
        current_q = q_table[discrete_state + (action, )]
        new_q = (1-Learning_Rate) * current_q + Learning_Rate * (time_step.reward + Discount * max_future_q)
        q_table[discrete_state + (action, )] = new_q
        episode_reward += time_step.reward
        
      discrete_state = new_discrete_state
      
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value
        
    ep_rewards.append(episode_reward)
    
    if not episode % SHOW_EVERY:
        average_reward = sum(ep_rewards[-SHOW_EVERY:])/len(ep_rewards[-SHOW_EVERY:])
        aggr_ep_rewards['ep'].append(episode)
        aggr_ep_rewards['avg'].append(average_reward)
        aggr_ep_rewards['min'].append(min(ep_rewards[-SHOW_EVERY:]))
        aggr_ep_rewards['max'].append(max(ep_rewards[-SHOW_EVERY:]))
        
    
# Reset Environment
initial_values = env.reset()
initial_observations = np.concatenate((initial_values.observation['position'],initial_values.observation['velocity']))
discrete_state = get_discrete_state(initial_observations)
done = False


# Define a uniform random policy.
def random_action_policy(time_step, done = False, discrete_state = get_discrete_state(initial_observations)):

  # Take a Action within the range of Actions and correct size
  action = np.argmax(q_table[discrete_state])
                           
  # Perform the Action in the Environment
  time_step = env.step(action)
  observations = np.concatenate((time_step.observation['position'],time_step.observation['velocity']))
  
  # Get new Discrete Step
  new_discrete_state = get_discrete_state(observations)
  
  if time_step.discount is None:
    done = True
  
  if not done:
    max_future_q = np.max(q_table[new_discrete_state])
    current_q = q_table[discrete_state + (action, )]
    new_q = (1-Learning_Rate) * current_q + Learning_Rate * (time_step.reward + Discount * max_future_q)
    q_table[discrete_state + (action, )] = new_q
    
  discrete_state = new_discrete_state
  
  # Print the Results of the Action
  print("reward = {}, discount = {}, observations = {}.".format(
    time_step.reward, time_step.discount, time_step.observation)) 
  return action   

plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['avg'], label="avg")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['min'], label="min")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['max'], label="max")
plt.legend(loc=4)
plt.show()

# Launch the viewer application.
viewer.launch(env, policy=random_action_policy)
Example #22
File: play.py  Project: seanahmad/tonic
def play_control_suite(agent, environment):
    '''Launches an agent in a DeepMind Control Suite-based environment.'''

    from dm_control import viewer

    class Wrapper:
        '''Wrapper used to plug a Tonic environment in a dm_control viewer.'''
        def __init__(self, environment):
            self.environment = environment
            self.unwrapped = environment.unwrapped
            self.action_spec = self.unwrapped.environment.action_spec
            self.physics = self.unwrapped.environment.physics
            self.infos = None
            self.episodes = 0

        def reset(self):
            '''Mimics a dm_control reset for the viewer.'''

            self.observations = self.environment.reset()[None]

            self.score = 0
            self.length = 0

            return self.unwrapped.last_time_step

        def step(self, actions):
            '''Mimics a dm_control step for the viewer.'''

            ob, rew, term, _ = self.environment.step(actions)
            self.score += rew
            self.length += 1
            timeout = self.length == self.environment.max_episode_steps
            done = term or timeout

            if done:
                print()
                self.episodes += 1
                print('Episodes:', self.episodes)
                print('Score:', self.score)
                print('Length:', self.length)

            self.observations = ob[None]
            self.infos = dict(observations=ob[None],
                              rewards=np.array([rew]),
                              resets=np.array([done]),
                              terminations=[term])

            return self.unwrapped.last_time_step

    # Wrap the environment for the viewer.
    environment = Wrapper(environment)

    def policy(timestep):
        '''Mimics a dm_control policy for the viewer.'''

        if environment.infos is not None:
            agent.test_update(**environment.infos)
        return agent.test_step(environment.observations)

    # Launch the viewer with the wrapped environment and policy.
    viewer.launch(environment, policy)
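The Wrapper in Example #22 illustrates the small duck-typed surface the dm_control viewer needs from an environment: an action_spec, a physics object for rendering, and reset/step methods that return dm_env TimeSteps. A bare-bones sketch of that surface, modeled on the wrapper above rather than on any particular library API:

class MinimalViewerEnv:
    """Sketch of the attributes and methods the viewer relies on."""

    def __init__(self, base_env):
        self._env = base_env
        self.physics = base_env.physics          # used by the viewer for rendering
        self.action_spec = base_env.action_spec  # called by the viewer to size actions

    def reset(self):
        return self._env.reset()       # must return a dm_env.TimeStep

    def step(self, action):
        return self._env.step(action)  # must return a dm_env.TimeStep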
Example #23
def main(unused_argv):
  viewer.launch(environment_loader=basic_cmu_2019.cmu_humanoid_run_gaps)
Example #24
def main():
    args = parse_args()

    if not (bool(args.viewer) ^ bool(args.save_path)):
        raise Exception("you need to provide --viewer xor --save-dir "
                        "arguments for this to do anything useful :)")

    if args.threads is not None:
        torch.set_num_threads(args.threads)

    # TODO: The next few calls are copy-pasted out of train.py. Consider
    # refactoring so that you don't have to copy-paste (otoh not very important
    # since this code only needs to be run once)
    if torch.cuda.is_available():
        dev = torch.device('cuda')
    else:
        dev = torch.device('cpu')
    pre_transform_image_size = args.pre_transform_image_size if 'crop' \
        in args.data_augs else args.image_size
    env = dmc2gym.make(
        domain_name=args.domain_name,
        task_name=args.task_name,
        seed=args.seed,
        visualize_reward=False,
        from_pixels=(args.encoder_type == 'pixel'),
        height=pre_transform_image_size,
        width=pre_transform_image_size,
        frame_skip=args.action_repeat)
    env.seed(args.seed)
    action_shape = env.action_space.shape
    obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
    agent = RadSacAgent(
        obs_shape=obs_shape,
        action_shape=action_shape,
        device=dev,
        hidden_dim=args.hidden_dim,
        encoder_type=args.encoder_type,
        encoder_feature_dim=args.encoder_feature_dim,
        num_layers=args.num_layers,
        num_filters=args.num_filters,
        latent_dim=args.latent_dim,
        data_augs=args.data_augs, )
    agent.load_ac(actor_path=args.actor_path)

    if args.viewer:
        dmc_env = unwrap(env)
        frames = collections.deque(maxlen=args.frame_stack or 1)

        def loaded_policy(time_step):
            # time_step just contains joint angles; we want image observation
            obs = env.env._get_obs(time_step)
            frames.append(obs)
            while len(frames) < frames.maxlen:
                # for init
                frames.append(obs)
            stacked_obs = np.concatenate(frames, axis=0) / 255.
            return agent.sample_action(stacked_obs)

        viewer.launch(dmc_env, policy=loaded_policy)
        return  # done

    # Otherwise, we need to save a bunch of imitation.data.TrajectoryWithRew
    # instances to some directory somewhere…
    all_traj = []
    for t in range(args.ntraj):
        traj = sample_traj_stacked(env, agent,
                                   frame_stack=args.frame_stack or 1)
        all_traj.append(traj)
    # for now I'm just going to save all trajectories in one file
    print(f"Saving to '{args.save_path}'")
    save_compressed_pickle(all_traj, args.save_path)

    env.close()
Example #25
from dm_control import composer
from dm_control.locomotion.examples import basic_cmu_2019
from dm_control import viewer
import numpy as np

# Build an example environment.
env = basic_cmu_2019.cmu_humanoid_run_walls()
viewer.launch(environment_loader=basic_cmu_2019.cmu_humanoid_run_walls)

action_spec = env.action_spec()

# Step through the environment for one episode with random actions.
time_step = env.reset()
while not time_step.last():
  action = np.random.uniform(action_spec.minimum, action_spec.maximum,
                             size=action_spec.shape)
  time_step = env.step(action)
  print("reward = {}, discount = {}, observations = {}.".format(
      time_step.reward, time_step.discount, time_step.observation))

#viewer.launch(environment_loader=basic_cmu_2019.cmu_humanoid_run_walls)
Example #26
from lib.dm_control import suite  # This hack is so that we can define and use our own envs
from dm_control import viewer
import numpy as np

env = suite.load(domain_name="humanoid_CMU", task_name="stand")
action_spec = env.action_spec()


# Define a uniform random policy.
def random_policy(time_step):
    del time_step  # Unused.
    return np.random.uniform(low=action_spec.minimum,
                             high=action_spec.maximum,
                             size=action_spec.shape)


# Launch the viewer application.
viewer.launch(env, policy=random_policy, width=1024, height=768)
Example #27
def main(unused_argv):
    viewer.launch(environment_loader=TASKS[FLAGS.task])
Example #28
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    viewer.launch(
        environment_loader=functools.partial(soccer.load, team_size=2))
Example #29
import numpy as np

from dm_control import viewer
from optimal_agents.morphology import Morphology
from optimal_agents.morphology import random2d
from optimal_agents.envs.dm_control_env import dm_control_test_env
from optimal_agents.morphology import arenas

global_kwargs = {"option.timestep": 0.01}
geom_kwargs = {
    "contype": 1,
    "conaffinity": 1,
    "condim": 3,
    "friction": [0.4, 0.1, 0.1],
}

joint_kwargs = {"damping": 2, "armature": 0.1, "stiffness": 20}

morphology = random2d(mutation_kwargs={})

env = dm_control_test_env(morphology, arena=arenas.GM_Terrain())

action_spec = env.action_spec()


def random_policy(time_step):
    del time_step  # Unused.
    return np.random.uniform(low=action_spec.minimum,
                             high=action_spec.maximum,
                             size=action_spec.shape)


viewer.launch(env, policy=random_policy)
Example #30
from dm_control import suite
from dm_control import viewer

# Load an environment from the Control Suite.
env = suite.load(domain_name="humanoid", task_name="stand")

# Launch the viewer application.
viewer.launch(env)