def pay_attention_always_equal():
    """Pelvis body position and velocity must always match the ground_pelvis joint coordinates."""
    env = create_environment(False, False, 1, 0, False)
    env.reset()
    for _ in range(100):
        env.step(env.action_space.sample())
        desc = env.get_state_desc()
        pelvis_xy = desc['body_pos']['pelvis'][0:2]
        ground_pelvis_xy = desc['joint_pos']['ground_pelvis'][1:3]
        assert_almost_equal(pelvis_xy, ground_pelvis_xy)
        pelvis_vel = desc['body_vel']['pelvis'][0:2]
        ground_pelvis_vel = desc['joint_vel']['ground_pelvis'][1:3]
        assert_almost_equal(pelvis_vel, ground_pelvis_vel)
def test_state_flip(exclude_centering):
    """The flipped observation must mirror the left/right blocks and leave everything else unchanged."""
    env = create_environment(False, False, 1, 0, exclude_centering)
    b = ReplayBufferFlip(2, True, env.get_observation_names(),
                         env.action_space.shape, env.observation_space.shape)
    shift = int(exclude_centering)
    env.reset()
    for _ in range(100):
        obs = env.step(env.action_space.sample())[0]
        fobs = b.swap_states(np.matrix(obs)).tolist()[0]
        assert len(obs) == 34 - shift
        assert len(obs) == len(fobs)
        # pelvis does not change
        assert_almost_equal(obs[0:2 - shift], fobs[0:2 - shift])
        # hip
        assert_almost_equal(obs[2 - shift:4 - shift],
                            fobs[4 - shift:6 - shift])
        assert_almost_equal(obs[4 - shift:6 - shift],
                            fobs[2 - shift:4 - shift])
        # knee
        assert_almost_equal(obs[6 - shift:8 - shift],
                            fobs[8 - shift:10 - shift])
        assert_almost_equal(obs[8 - shift:10 - shift],
                            fobs[6 - shift:8 - shift])
        # ankle
        assert_almost_equal(obs[10 - shift:12 - shift],
                            fobs[12 - shift:14 - shift])
        assert_almost_equal(obs[12 - shift:14 - shift],
                            fobs[10 - shift:12 - shift])
        # ground pelvis, head and torso do not change
        assert_almost_equal(obs[14 - shift:20 - shift],
                            fobs[14 - shift:20 - shift])
        # toes
        assert_almost_equal(obs[20 - shift:22 - shift],
                            fobs[22 - shift:24 - shift])
        assert_almost_equal(obs[22 - shift:24 - shift],
                            fobs[20 - shift:22 - shift])
        # talus
        assert_almost_equal(obs[24 - shift:26 - shift],
                            fobs[26 - shift:28 - shift])
        assert_almost_equal(obs[26 - shift:28 - shift],
                            fobs[24 - shift:26 - shift])
        # center of mass does not change
        assert_almost_equal(obs[28 - shift:32 - shift],
                            fobs[28 - shift:32 - shift])
        # pelvis speed does not change
        assert_almost_equal(obs[32 - shift:34 - shift],
                            fobs[32 - shift:34 - shift])
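
# The flip test above relies on the bilateral symmetry that ReplayBufferFlip exploits for
# data augmentation: exchanging the left/right hip, knee, ankle, toes and talus blocks of an
# observation yields another valid state. Below is a minimal, self-contained sketch of such a
# swap, assuming the 34-element layout asserted above; it illustrates the idea and is not the
# actual ReplayBufferFlip.swap_states implementation.
import numpy as np

def swap_left_right(obs, exclude_centering=False):
    """Return a copy of `obs` with the left/right body-part blocks exchanged."""
    shift = int(exclude_centering)
    obs = np.asarray(obs, dtype=float)
    flipped = obs.copy()
    # (left_start, right_start, length) before applying the centering shift
    pairs = [(2, 4, 2),    # hips
             (6, 8, 2),    # knees
             (10, 12, 2),  # ankles
             (20, 22, 2),  # toes
             (24, 26, 2)]  # tali
    for left, right, length in pairs:
        l, r = left - shift, right - shift
        flipped[l:l + length] = obs[r:r + length]
        flipped[r:r + length] = obs[l:l + length]
    return flipped
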
def test(seed, layer_norm, full, action_repeat, fail_reward, exclude_centering_frame,
         integrator_accuracy, render, **kwargs):
    # Configure MPI: disable logging on every rank except 0.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    # Main env
    env = create_environment(render, full, action_repeat,
                             fail_reward, exclude_centering_frame,
                             integrator_accuracy)
    env.reset()
    eval_env = None

    # Action dimensionality
    nb_actions = env.action_space.shape[-1]

    # Configure components.
    memory = ReplayBufferFlip(int(5e6),
                              False,
                              env.get_observation_names(),
                              env.action_space.shape,
                              env.observation_space.shape)
    actor = Actor(nb_actions, layer_norm=layer_norm)
    critic = Critic(layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(
        rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    # Only rank 0 records the wall-clock start time.
    if rank == 0:
        start_time = time.time()

    # remove 'func' before forwarding the remaining kwargs to testing.test
    del kwargs['func']
    testing.test(env=env, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
def obs_vector_consistency(exclude_centering):
    """The basic observation vector must agree with the state description, element by element,
    in the order relied on by the state flip."""
    env = create_environment(False, False, 1, 0, exclude_centering)
    shift = int(exclude_centering)
    for _ in range(100):
        # take some random action
        env.step(env.action_space.sample())
        # check consistency between state desc and obs vector
        # plus the order of obs (used in action flip)
        desc = env.get_state_desc()
        obs = env.get_observation_basic()

        # check pelvis coordinates
        centering_x = desc['body_pos']['pelvis'][0]
        if not exclude_centering:
            assert_almost_equal(centering_x, obs[0])
        pelvis_y = desc['body_pos']['pelvis'][1]
        assert_almost_equal(pelvis_y, obs[1 - shift])

        # check joint and speed
        joint_pos = desc['joint_pos']
        joint_vel = desc['joint_vel']
        # hips
        assert_almost_equal(joint_pos['hip_l'][0], obs[2 - shift])
        assert_almost_equal(joint_vel['hip_l'][0], obs[3 - shift])
        assert_almost_equal(joint_pos['hip_r'][0], obs[4 - shift])
        assert_almost_equal(joint_vel['hip_r'][0], obs[5 - shift])
        # knees
        assert_almost_equal(joint_pos['knee_l'][0], obs[6 - shift])
        assert_almost_equal(joint_vel['knee_l'][0], obs[7 - shift])
        assert_almost_equal(joint_pos['knee_r'][0], obs[8 - shift])
        assert_almost_equal(joint_vel['knee_r'][0], obs[9 - shift])
        # ankles
        assert_almost_equal(joint_pos['ankle_l'][0], obs[10 - shift])
        assert_almost_equal(joint_vel['ankle_l'][0], obs[11 - shift])
        assert_almost_equal(joint_pos['ankle_r'][0], obs[12 - shift])
        assert_almost_equal(joint_vel['ankle_r'][0], obs[13 - shift])
        # ground pelvis
        assert_almost_equal(joint_pos['ground_pelvis'][0], obs[14 - shift])
        assert_almost_equal(joint_vel['ground_pelvis'][0], obs[15 - shift])

        # check body part coordinates
        body_pos = desc['body_pos']
        # head
        assert_almost_equal(body_pos['head'][0], obs[16 - shift] + centering_x)
        assert_almost_equal(body_pos['head'][1], obs[17 - shift])
        # torso
        assert_almost_equal(body_pos['torso'][0],
                            obs[18 - shift] + centering_x)
        assert_almost_equal(body_pos['torso'][1], obs[19 - shift])
        # toes
        assert_almost_equal(body_pos['toes_l'][0],
                            obs[20 - shift] + centering_x)
        assert_almost_equal(body_pos['toes_l'][1], obs[21 - shift])
        assert_almost_equal(body_pos['toes_r'][0],
                            obs[22 - shift] + centering_x)
        assert_almost_equal(body_pos['toes_r'][1], obs[23 - shift])
        # talus
        assert_almost_equal(body_pos['talus_l'][0],
                            obs[24 - shift] + centering_x)
        assert_almost_equal(body_pos['talus_l'][1], obs[25 - shift])
        assert_almost_equal(body_pos['talus_r'][0],
                            obs[26 - shift] + centering_x)
        assert_almost_equal(body_pos['talus_r'][1], obs[27 - shift])

        # check center of mass
        com_pos = desc['misc']['mass_center_pos']
        com_vel = desc['misc']['mass_center_vel']
        assert_almost_equal(com_pos[0], obs[28 - shift] + centering_x)
        assert_almost_equal(com_pos[1], obs[29 - shift])
        assert_almost_equal(com_vel[0], obs[30 - shift])
        assert_almost_equal(com_vel[1], obs[31 - shift])

        # check pelvis speed
        assert_almost_equal(desc['body_vel']['pelvis'][0], obs[32 - shift])
        assert_almost_equal(desc['body_vel']['pelvis'][1], obs[33 - shift])
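
# For reference, the ordering verified by the test above could be assembled from a state
# description along these lines. This is only a sketch of the expected layout: the real
# vector comes from env.get_observation_basic(), and the helper name below is hypothetical.
def build_basic_observation(desc, exclude_centering=False):
    centering_x = desc['body_pos']['pelvis'][0]
    jp, jv, bp = desc['joint_pos'], desc['joint_vel'], desc['body_pos']
    obs = [] if exclude_centering else [centering_x]        # pelvis x (optional)
    obs.append(desc['body_pos']['pelvis'][1])                # pelvis y
    for joint in ('hip_l', 'hip_r', 'knee_l', 'knee_r',
                  'ankle_l', 'ankle_r', 'ground_pelvis'):
        obs += [jp[joint][0], jv[joint][0]]                  # joint angle and angular speed
    for body in ('head', 'torso', 'toes_l', 'toes_r', 'talus_l', 'talus_r'):
        obs += [bp[body][0] - centering_x, bp[body][1]]      # x relative to pelvis, absolute y
    obs += [desc['misc']['mass_center_pos'][0] - centering_x,
            desc['misc']['mass_center_pos'][1],
            desc['misc']['mass_center_vel'][0],
            desc['misc']['mass_center_vel'][1]]              # centre of mass
    obs += [desc['body_vel']['pelvis'][0],
            desc['body_vel']['pelvis'][1]]                   # pelvis velocity
    return obs
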
    def run(self):
        """Override Process.run()"""
        # Create environment
        env = create_environment(
            action_repeat=self.action_repeat,
            full=self.full,
            exclude_centering_frame=self.exclude_centering_frame,
            visualize=self.visualize,
            fail_reward=self.fail_reward,
            integrator_accuracy=self.integrator_accuracy)
        nb_actions = env.action_space.shape[-1]

        env.seed(os.getpid())
        set_global_seeds(os.getpid())

        num_traj = 0

        # Allocate ReplayBuffer
        memory = Memory(limit=int(1e6),
                        action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)

        # Create DDPG agent
        agent = DDPG(self.actor,
                     self.critic,
                     memory,
                     env.observation_space.shape,
                     env.action_space.shape,
                     gamma=self.gamma,
                     tau=self.tau,
                     normalize_returns=self.normalize_returns,
                     normalize_observations=self.normalize_observations,
                     batch_size=self.batch_size,
                     action_noise=None,
                     param_noise=None,
                     critic_l2_reg=self.critic_l2_reg,
                     enable_popart=self.popart,
                     clip_norm=self.clip_norm,
                     reward_scale=self.reward_scale)

        # Build the testing logic fn
        testing_fn = make_testing_fn(agent, env, self.episode_length,
                                     self.action_repeat, self.max_action,
                                     self.nb_episodes)

        # Start TF session
        with U.single_threaded_session() as sess:
            agent.initialize(sess)
            set_parameters = U.SetFromFlat(self.actor.trainable_vars)

            # Start sampling-worker loop.
            while True:
                # Pop the next message from the input queue
                message, actor_ws, global_step = self.inputQ.get()
                if message == 'test':
                    # Set weights
                    set_parameters(actor_ws)
                    # Do testing
                    rewards, step_times, distances, episode_lengths = testing_fn()
                    self.outputQ.put((rewards, step_times, distances,
                                      episode_lengths, global_step))

                    # update number of trajectories
                    num_traj += self.nb_episodes

                    # restore environment if needed
                    if num_traj >= self.max_env_traj:
                        env.restore()
                        num_traj = 0

                elif message == 'exit':
                    print('[Worker {}] Exiting...'.format(os.getpid()))
                    env.close()
                    break
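
# A sketch of how a monitor process might request one evaluation round from the worker above.
# The driver function is hypothetical; it only assumes the worker-side message formats shown
# in run(): ('test', actor_weights, global_step) to evaluate and ('exit', None, None) to stop.
def request_evaluation(worker, actor_weights, global_step):
    worker.inputQ.put(('test', actor_weights, global_step))
    # blocks until the worker publishes its results on its output queue
    rewards, step_times, distances, episode_lengths, step = worker.outputQ.get()
    return rewards, step_times, distances, episode_lengths, step
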
    def run(self):
        """Override Process.run()"""
        # Create environment
        env = create_environment(
            action_repeat=self.action_repeat,
            full=self.full,
            exclude_centering_frame=self.exclude_centering_frame,
            visualize=self.visualize,
            fail_reward=self.fail_reward,
            integrator_accuracy=self.integrator_accuracy)
        nb_actions = env.action_space.shape[-1]

        # keep track of the number of trajectories completed
        num_traj = 0

        env.seed(os.getpid())
        set_global_seeds(os.getpid())

        # Create OU Noise
        action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                    sigma=0.2,
                                                    theta=0.1)

        # Allocate ReplayBuffer
        memory = Memory(limit=int(1e6),
                        action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)

        # Create DDPG agent
        agent = DDPG(self.actor,
                     self.critic,
                     memory,
                     env.observation_space.shape,
                     env.action_space.shape,
                     gamma=self.gamma,
                     tau=self.tau,
                     normalize_returns=self.normalize_returns,
                     normalize_observations=self.normalize_observations,
                     batch_size=self.batch_size,
                     action_noise=action_noise,
                     param_noise=self.param_noise,
                     critic_l2_reg=self.critic_l2_reg,
                     enable_popart=self.popart,
                     clip_norm=self.clip_norm,
                     reward_scale=self.reward_scale)

        # Build the sampling logic fn
        sampling_fn = make_sampling_fn(agent, env, self.episode_length,
                                       self.action_repeat, self.max_action,
                                       self.nb_episodes,
                                       self.action_noise_prob)

        # Start TF session
        with U.single_threaded_session() as sess:
            agent.initialize(sess)
            set_parameters = U.SetFromFlat(self.actor.trainable_vars)
            # Start sampling-worker loop.
            while True:
                # self.event.wait()  # Wait for a new message
                # self.event.clear()  # Upon message receipt, mark as read
                message, actor_ws = self.inputQ.get()  # Pop message
                if message == 'sample':
                    # Set weights
                    set_parameters(actor_ws)
                    # Do sampling
                    transitions = sampling_fn()
                    self.outputQ.put((self.process_index, transitions))

                    # update number of trajectories
                    num_traj += self.nb_episodes

                    # restore environment if needed
                    if num_traj >= self.max_env_traj:
                        env.restore()
                        num_traj = 0

                elif message == 'exit':
                    print('[Worker {}] Exiting...'.format(os.getpid()))
                    env.close()
                    break
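
# A minimal sketch of the coordinating side of the 'sample'/'exit' queue protocol used by the
# sampling worker above. The driver functions are hypothetical; they assume each worker exposes
# the inputQ seen in run(), that the workers share a single output queue, and that actor_weights
# is the flattened actor parameter vector expected by SetFromFlat.
def drive_sampling_round(workers, output_queue, actor_weights):
    # broadcast the current actor weights and ask every worker for one sampling round
    for worker in workers:
        worker.inputQ.put(('sample', actor_weights))
    # collect one (process_index, transitions) tuple per worker, in completion order
    return [output_queue.get() for _ in workers]

def shutdown_workers(workers):
    for worker in workers:
        worker.inputQ.put(('exit', None))
    for worker in workers:
        worker.join()
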
def run(seed, parameter_noise, layer_norm, evaluation, flip_state,
        full, action_repeat, fail_reward, exclude_centering_frame,
        checkpoint_dir, log_dir, session_path, last_training_step,
        integrator_accuracy, experiment_name, **kwargs):

    # num_timesteps is normally derived from the other parameters; if it is given
    # explicitly, make sure it is consistent with them
    if kwargs['num_timesteps'] is not None:
        assert kwargs['num_timesteps'] == (kwargs['nb_epochs'] *
                                           kwargs['nb_epoch_cycles'] *
                                           kwargs['nb_rollout_steps'])

    tmp_log, tmp_chkpt = get_log_and_checkpoint_dirs(experiment_name)

    if log_dir is None:
        log_dir = tmp_log
    if checkpoint_dir is None:
        checkpoint_dir = tmp_chkpt

    # Configure MPI: disable logging on every rank except 0.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Main env
    env = create_environment(False, full, action_repeat,
                             fail_reward, exclude_centering_frame, integrator_accuracy)
    env.reset()
    eval_env = None

    # Action dimensionality and exploration noise
    nb_actions = env.action_space.shape[-1]
    if parameter_noise:
        param_noise = AdaptiveParamNoiseSpec(
            initial_stddev=0.2, desired_action_stddev=0.2)
    else:
        param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(
        mu=np.zeros(nb_actions), sigma=0.2, theta=0.1)

    # Configure components.
    memory = ReplayBufferFlip(int(5e6),
                              flip_state,
                              env.get_observation_names(),
                              env.action_space.shape,
                              env.observation_space.shape)
    actor = Actor(nb_actions, layer_norm=layer_norm)
    critic = Critic(layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(
        rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only rank 0 records the wall-clock start time for the final runtime log.
    if rank == 0:
        start_time = time.time()

    # Create the LearningSession from the run parameters ('func' is not one of them)
    del kwargs['func']
    sess_args = pack_run_params(seed, parameter_noise, layer_norm, evaluation, flip_state,
                                full, action_repeat, fail_reward, exclude_centering_frame, **kwargs)
    learning_session = LearningSession(
        session_path, checkpoint_dir, log_dir, last_training_step, **sess_args)

    # remove entries that are not forwarded to training.train
    del kwargs['num_timesteps']
    del kwargs['noise_type']
    training.train(env=env, action_noise=action_noise, param_noise=param_noise,
                   actor=actor, critic=critic, memory=memory,
                   visualize=False, full=full, action_repeat=action_repeat,
                   fail_reward=fail_reward, exclude_centering_frame=exclude_centering_frame,
                   learning_session=learning_session, integrator_accuracy=integrator_accuracy,
                   **kwargs)

    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))