def env_factory(env_name):
    gym_env = gym.make(env_name)
    gym_spec = gym.spec(env_name)
    if gym_spec.max_episode_steps in [0, None]:  # Add TimeLimit wrapper.
        gym_env = time_limit.TimeLimit(gym_env, max_episode_steps=1000)

    tf_env = tf_py_environment.TFPyEnvironment(gym_wrapper.GymWrapper(gym_env))
    return tf_env
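
A usage note on the pattern above: TimeLimit is only added when the registered spec carries no step limit of its own. Below is a minimal, self-contained sketch of the same check without the TF-Agents conversion; the environment name and the default of 1000 steps are only illustrative.

import gym
from gym.wrappers import time_limit


def make_env_with_limit(env_name, default_max_steps=1000):
    """Wrap in TimeLimit only if the registered spec has no step limit."""
    env = gym.make(env_name)
    spec = gym.spec(env_name)
    if spec.max_episode_steps in (0, None):
        env = time_limit.TimeLimit(env, max_episode_steps=default_max_steps)
    return env


# 'Pendulum-v1' already registers a 200-step limit, so no extra wrapper is added here.
env = make_env_with_limit('Pendulum-v1')
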
Example #2
    def __init__(self, env_params):

        self.name = env_params['environment']
        self.eval_interval = env_params['EvalIntervalMilSteps'] * 1000000
        self.eval_episodes = env_params['EvalEpisodes']

        if env_params['environment'] == "MountainCarContinuous-v1":
            self.instance = MCv1()
        elif env_params['environment'] == "Pendulum-v1":
            self.instance = time_limit.TimeLimit(pendulum_v1.PendulumEnv(),
                                                 max_episode_steps=200)
        else:
            self.instance = gym.make(env_params['environment'])

        # total number of steps allowed in a run
        self.TOTAL_STEPS_LIMIT = env_params['TotalMilSteps'] * 1000000
        # self.TOTAL_EPISODES_LIMIT = env_params['TotalEpisodes']

        # maximum number of steps allowed for each episode
        # if -1 takes default setting from gym
        if env_params['EpisodeSteps'] != -1:
            self.EPISODE_STEPS_LIMIT = env_params['EpisodeSteps']
            self.instance._max_episode_steps = env_params['EpisodeSteps']

        else:
            self.EPISODE_STEPS_LIMIT = self.instance._max_episode_steps

        # state info
        self.state_dim = self.get_state_dim()
        self.state_range = self.get_state_range()
        self.state_min = self.get_state_min()
        self.state_max = self.get_state_max()
        self.state_bounded = not (
            np.any(np.isinf(self.instance.observation_space.high)) or
            np.any(np.isinf(self.instance.observation_space.low)))

        # action info
        self.action_dim = self.get_action_dim()
        self.action_range = self.get_action_range()
        self.action_min = self.get_action_min()
        self.action_max = self.get_action_max()
Example #3
                        action_permutation=act_perm, soft_mirror=soft_mirror)


if __name__ == '__main__':


    append_o = []  # [0.4458616, 0.63732893, 0.98086248, 0.94058195, 0.01685923]

    if len(sys.argv) > 1:
        if sys.argv[1] == 'Minitaur':
            from pybullet_envs.minitaur.envs import minitaur_reactive_env
            from gym.wrappers import time_limit

            env = time_limit.TimeLimit(
                minitaur_reactive_env.MinitaurReactiveEnv(
                    render=True,
                    accurate_motor_model_enabled=True,
                    urdf_version='rainbow_dash_v0',
                    train_UP=len(append_o) > 0,
                    resample_MP=False),
                max_episode_steps=1000)
        else:
            env = gym.make(sys.argv[1])
    else:
        env = gym.make('DartWalker3dRestricted-v1')

    if len(append_o) > 0 and (len(sys.argv) <= 1 or sys.argv[1] != 'Minitaur'):
        from gym import spaces

        env.env.obs_dim += len(append_o)
        high = np.inf * np.ones(env.env.obs_dim)
        low = -high
        env.env.observation_space = spaces.Box(low, high)
        env.observation_space = spaces.Box(low, high)
Example #4
def episode_limit(env):
    return time_limit.TimeLimit(env, max_episode_steps=max_steps)
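
A short usage sketch for the helper above, assuming max_steps is defined in the enclosing scope as in the original; CartPole-v1 is only an illustrative environment. Since gym.make already applies its own TimeLimit, the sketch wraps the unwrapped environment so only one limit is in effect.

import gym
from gym.wrappers import time_limit

max_steps = 500  # value captured by the closure below


def episode_limit(env):
    return time_limit.TimeLimit(env, max_episode_steps=max_steps)


env = episode_limit(gym.make('CartPole-v1').unwrapped)
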
Example #5
    policy_path = args.policy_path
    osi_iteration = args.osi_iteration
    training_sample_num = args.training_sample_num
    dyn_params = args.dyn_params

    # setup the environments
    # if use minitaur environment, set up differently
    if args.env == 'Minitaur':
        from pybullet_envs.minitaur.envs import minitaur_reactive_env
        from gym.wrappers import time_limit

        env_hist = time_limit.TimeLimit(
            minitaur_reactive_env.MinitaurReactiveEnv(
                render=False,
                accurate_motor_model_enabled=True,
                urdf_version='rainbow_dash_v0',
                include_obs_history=OSI_hist,
                include_act_history=0,
                train_UP=False),
            max_episode_steps=1000)
        env_up = time_limit.TimeLimit(
            minitaur_reactive_env.MinitaurReactiveEnv(
                render=False,
                accurate_motor_model_enabled=True,
                urdf_version='rainbow_dash_v0',
                include_obs_history=1,
                include_act_history=0,
                train_UP=True),
            max_episode_steps=1000)
    else:
        env_hist = gym.make(args.env)
Example #6
def main(_):
    tf.random.set_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)

    hparam_str = make_hparam_string(seed=FLAGS.seed, env_name=FLAGS.env_name)
    summary_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.save_dir, 'tb', hparam_str))
    summary_writer.set_as_default()

    if FLAGS.d4rl:
        d4rl_env = gym.make(FLAGS.env_name)
        gym_spec = gym.spec(FLAGS.env_name)
        if gym_spec.max_episode_steps in [0, None]:  # Add TimeLimit wrapper.
            gym_env = time_limit.TimeLimit(d4rl_env, max_episode_steps=1000)
        else:
            gym_env = d4rl_env
        gym_env.seed(FLAGS.seed)
        env = tf_py_environment.TFPyEnvironment(
            gym_wrapper.GymWrapper(gym_env))

        behavior_dataset = D4rlDataset(
            d4rl_env,
            normalize_states=FLAGS.normalize_states,
            normalize_rewards=FLAGS.normalize_rewards,
            noise_scale=FLAGS.noise_scale,
            bootstrap=FLAGS.bootstrap)
    else:
        env = suite_mujoco.load(FLAGS.env_name)
        env.seed(FLAGS.seed)
        env = tf_py_environment.TFPyEnvironment(env)

        data_file_name = os.path.join(
            FLAGS.data_dir, FLAGS.env_name, '0',
            f'dualdice_{FLAGS.behavior_policy_std}.pckl')
        behavior_dataset = Dataset(data_file_name,
                                   FLAGS.num_trajectories,
                                   normalize_states=FLAGS.normalize_states,
                                   normalize_rewards=FLAGS.normalize_rewards,
                                   noise_scale=FLAGS.noise_scale,
                                   bootstrap=FLAGS.bootstrap)

    tf_dataset = behavior_dataset.with_uniform_sampling(
        FLAGS.sample_batch_size)
    tf_dataset_iter = iter(tf_dataset)

    if FLAGS.d4rl:
        with tf.io.gfile.GFile(FLAGS.d4rl_policy_filename, 'rb') as f:
            policy_weights = pickle.load(f)
        actor = utils.D4rlActor(env,
                                policy_weights,
                                is_dapg='dapg' in FLAGS.d4rl_policy_filename)
    else:
        actor = Actor(env.observation_spec().shape[0], env.action_spec())
        actor.load_weights(behavior_dataset.model_filename)

    policy_returns = utils.estimate_monte_carlo_returns(
        env, FLAGS.discount, actor, FLAGS.target_policy_std,
        FLAGS.num_mc_episodes)
    logging.info('Estimated Per-Step Average Returns=%f', policy_returns)

    if 'fqe' in FLAGS.algo or 'dr' in FLAGS.algo:
        model = QFitter(env.observation_spec().shape[0],
                        env.action_spec().shape[0], FLAGS.lr,
                        FLAGS.weight_decay, FLAGS.tau)
    elif 'mb' in FLAGS.algo:
        model = ModelBased(env.observation_spec().shape[0],
                           env.action_spec().shape[0],
                           learning_rate=FLAGS.lr,
                           weight_decay=FLAGS.weight_decay)
    elif 'dual_dice' in FLAGS.algo:
        model = DualDICE(env.observation_spec().shape[0],
                         env.action_spec().shape[0], FLAGS.weight_decay)
    if 'iw' in FLAGS.algo or 'dr' in FLAGS.algo:
        behavior = BehaviorCloning(env.observation_spec().shape[0],
                                   env.action_spec(), FLAGS.lr,
                                   FLAGS.weight_decay)

    @tf.function
    def get_target_actions(states):
        return actor(tf.cast(behavior_dataset.unnormalize_states(states),
                             env.observation_spec().dtype),
                     std=FLAGS.target_policy_std)[1]

    @tf.function
    def get_target_logprobs(states, actions):
        log_probs = actor(tf.cast(behavior_dataset.unnormalize_states(states),
                                  env.observation_spec().dtype),
                          actions=actions,
                          std=FLAGS.target_policy_std)[2]
        if tf.rank(log_probs) > 1:
            log_probs = tf.reduce_sum(log_probs, -1)
        return log_probs

    min_reward = tf.reduce_min(behavior_dataset.rewards)
    max_reward = tf.reduce_max(behavior_dataset.rewards)
    min_state = tf.reduce_min(behavior_dataset.states, 0)
    max_state = tf.reduce_max(behavior_dataset.states, 0)

    @tf.function
    def update_step():
        (states, actions, next_states, rewards, masks, weights,
         _) = next(tf_dataset_iter)
        initial_actions = get_target_actions(behavior_dataset.initial_states)
        next_actions = get_target_actions(next_states)

        if 'fqe' in FLAGS.algo or 'dr' in FLAGS.algo:
            model.update(states, actions, next_states, next_actions, rewards,
                         masks, weights, FLAGS.discount, min_reward,
                         max_reward)
        elif 'mb' in FLAGS.algo:
            model.update(states, actions, next_states, rewards, masks, weights)
        elif 'dual_dice' in FLAGS.algo:
            model.update(behavior_dataset.initial_states, initial_actions,
                         behavior_dataset.initial_weights, states, actions,
                         next_states, next_actions, masks, weights,
                         FLAGS.discount)

        if 'iw' in FLAGS.algo or 'dr' in FLAGS.algo:
            behavior.update(states, actions, weights)

    gc.collect()

    for i in tqdm.tqdm(range(FLAGS.num_updates), desc='Running Training'):
        update_step()

        if i % FLAGS.eval_interval == 0:
            if 'fqe' in FLAGS.algo:
                pred_returns = model.estimate_returns(
                    behavior_dataset.initial_states,
                    behavior_dataset.initial_weights, get_target_actions)
            elif 'mb' in FLAGS.algo:
                pred_returns = model.estimate_returns(
                    behavior_dataset.initial_states,
                    behavior_dataset.initial_weights, get_target_actions,
                    FLAGS.discount, min_reward, max_reward, min_state,
                    max_state)
            elif FLAGS.algo in ['dual_dice']:
                pred_returns, pred_ratio = model.estimate_returns(
                    iter(tf_dataset))

                tf.summary.scalar('train/pred ratio', pred_ratio, step=i)
            elif 'iw' in FLAGS.algo or 'dr' in FLAGS.algo:
                discount = FLAGS.discount
                _, behavior_log_probs = behavior(behavior_dataset.states,
                                                 behavior_dataset.actions)
                target_log_probs = get_target_logprobs(
                    behavior_dataset.states, behavior_dataset.actions)
                offset = 0.0
                rewards = behavior_dataset.rewards
                if 'dr' in FLAGS.algo:
                    # Doubly-robust is effectively the same as importance-weighting but
                    # transforming rewards at (s,a) to r(s,a) + gamma * V^pi(s') -
                    # Q^pi(s,a) and adding an offset to each trajectory equal to V^pi(s0).
                    offset = model.estimate_returns(
                        behavior_dataset.initial_states,
                        behavior_dataset.initial_weights, get_target_actions)
                    q_values = (model(behavior_dataset.states,
                                      behavior_dataset.actions) /
                                (1 - discount))
                    n_samples = 10
                    next_actions = [
                        get_target_actions(behavior_dataset.next_states)
                        for _ in range(n_samples)
                    ]
                    next_q_values = sum([
                        model(behavior_dataset.next_states, next_action) /
                        (1 - discount) for next_action in next_actions
                    ]) / n_samples
                    rewards = rewards + discount * next_q_values - q_values

                # Now we compute the self-normalized importance weights.
                # Self-normalization happens over trajectories per-step, so we
                # restructure the dataset as [num_trajectories, num_steps].
                num_trajectories = len(behavior_dataset.initial_states)
                max_trajectory_length = np.max(behavior_dataset.steps) + 1
                trajectory_weights = behavior_dataset.initial_weights
                trajectory_starts = np.where(
                    np.equal(behavior_dataset.steps, 0))[0]

                batched_rewards = np.zeros(
                    [num_trajectories, max_trajectory_length])
                batched_masks = np.zeros(
                    [num_trajectories, max_trajectory_length])
                batched_log_probs = np.zeros(
                    [num_trajectories, max_trajectory_length])

                for traj_idx, traj_start in enumerate(trajectory_starts):
                    traj_end = (trajectory_starts[traj_idx + 1]
                                if traj_idx + 1 < len(trajectory_starts)
                                else len(rewards))
                    traj_length = traj_end - traj_start
                    batched_rewards[
                        traj_idx, :traj_length] = rewards[traj_start:traj_end]
                    batched_masks[traj_idx, :traj_length] = 1.
                    batched_log_probs[traj_idx, :traj_length] = (
                        -behavior_log_probs[traj_start:traj_end] +
                        target_log_probs[traj_start:traj_end])

                batched_weights = (
                    batched_masks *
                    (discount**np.arange(max_trajectory_length))[None, :])

                clipped_log_probs = np.clip(batched_log_probs, -6., 2.)
                cum_log_probs = batched_masks * np.cumsum(clipped_log_probs,
                                                          axis=1)
                cum_log_probs_offset = np.max(cum_log_probs, axis=0)
                cum_probs = np.exp(cum_log_probs -
                                   cum_log_probs_offset[None, :])
                avg_cum_probs = (
                    np.sum(cum_probs * trajectory_weights[:, None], axis=0) /
                    (1e-10 + np.sum(
                        batched_masks * trajectory_weights[:, None], axis=0)))
                norm_cum_probs = cum_probs / (1e-10 + avg_cum_probs[None, :])

                weighted_rewards = batched_weights * batched_rewards * norm_cum_probs
                trajectory_values = np.sum(weighted_rewards, axis=1)
                avg_trajectory_value = (
                    (1 - discount) *
                    np.sum(trajectory_values * trajectory_weights) /
                    np.sum(trajectory_weights))
                pred_returns = offset + avg_trajectory_value

            pred_returns = behavior_dataset.unnormalize_rewards(pred_returns)

            tf.summary.scalar('train/pred returns', pred_returns, step=i)
            logging.info('pred returns=%f', pred_returns)

            tf.summary.scalar('train/true minus pred returns',
                              policy_returns - pred_returns,
                              step=i)
            logging.info('true minus pred returns=%f',
                         policy_returns - pred_returns)
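
The importance-weighting branch above self-normalizes the cumulative importance ratios per time step across trajectories before forming the return estimate. The following is a toy, standalone sketch of that normalization on two hand-made trajectories; every number is illustrative only.

import numpy as np

# Two toy trajectories of three steps each: per-step log importance ratios
# (target log-prob minus behavior log-prob) and rewards.
log_ratios = np.array([[0.1, -0.2, 0.05],
                       [-0.3, 0.4, 0.0]])
rewards = np.array([[1.0, 0.5, 0.2],
                    [0.8, 0.1, 0.3]])
masks = np.ones_like(rewards)   # every step is valid in this toy batch
traj_weights = np.ones(2)       # uniform per-trajectory weights
discount = 0.99

# Cumulative product of ratios in log space, clipped for numerical stability.
cum_log = masks * np.cumsum(np.clip(log_ratios, -6., 2.), axis=1)
cum_probs = np.exp(cum_log - np.max(cum_log, axis=0, keepdims=True))

# Self-normalize per step: divide by the weighted average ratio at that step.
avg = (cum_probs * traj_weights[:, None]).sum(axis=0) / (
    1e-10 + (masks * traj_weights[:, None]).sum(axis=0))
norm_probs = cum_probs / (1e-10 + avg[None, :])

# Discounted, importance-weighted per-step average return estimate.
step_weights = masks * (discount ** np.arange(rewards.shape[1]))[None, :]
traj_values = (step_weights * rewards * norm_probs).sum(axis=1)
estimate = (1 - discount) * (traj_values * traj_weights).sum() / traj_weights.sum()
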
Example #7
def train(env_id, num_timesteps, seed, batch_size, clip, schedule, mirror,
          warmstart, train_up, dyn_params):
    from policy_transfer.ppo import ppo_sgd
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)

    if env_id == 'Minitaur':
        from pybullet_envs.minitaur.envs import minitaur_reactive_env
        from gym.wrappers import time_limit
        env = time_limit.TimeLimit(
            minitaur_reactive_env.MinitaurReactiveEnv(
                render=False,
                accurate_motor_model_enabled=True,
                urdf_version='rainbow_dash_v0',
                train_UP=False,
                resample_MP=False),
            max_episode_steps=1000)
    else:
        env = gym.make(env_id)
        if train_up:
            if env.env.train_UP is not True:
                env.env.train_UP = True
                env.env.resample_MP = True
                from gym import spaces

                env.env.param_manager.activated_param = dyn_params
                env.env.param_manager.controllable_param = dyn_params
                env.env.obs_dim += len(env.env.param_manager.activated_param)

                high = np.inf * np.ones(env.env.obs_dim)
                low = -high
                env.env.observation_space = spaces.Box(low, high)
                env.observation_space = spaces.Box(low, high)

                if hasattr(env.env, 'obs_perm'):
                    obpermapp = np.arange(
                        len(env.env.obs_perm),
                        len(env.env.obs_perm) +
                        len(env.env.param_manager.activated_param))
                    env.env.obs_perm = np.concatenate(
                        [env.env.obs_perm, obpermapp])

    with open(logger.get_dir() + "/envinfo.txt", "w") as text_file:
        text_file.write(str(env.env.__dict__))

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=64,
                         num_hid_layers=3)

    def policy_mirror_fn(name, ob_space, ac_space):
        return MirrorPolicy(name=name,
                            ob_space=ob_space,
                            ac_space=ac_space,
                            hid_size=64,
                            num_hid_layers=3,
                            observation_permutation=env.env.env.obs_perm,
                            action_permutation=env.env.env.act_perm,
                            soft_mirror=(mirror == 2))

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(seed + MPI.COMM_WORLD.Get_rank())

    gym.logger.setLevel(logging.WARN)

    if mirror:
        pol_func = policy_mirror_fn
    else:
        pol_func = policy_fn

    if len(warmstart) > 0:
        warmstart_params = joblib.load(warmstart)
    else:
        warmstart_params = None
    ppo_sgd.learn(
        env,
        pol_func,
        max_timesteps=num_timesteps,
        timesteps_per_batch=int(batch_size),
        clip_param=clip,
        entcoeff=0.0,
        optim_epochs=5,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule=schedule,
        callback=callback,
        init_policy_params=warmstart_params,
    )

    env.close()
    run_cma = args.run_cma == 'True'
    max_step = args.max_step

    use_sparse_rew = args.sparse_rew == 'True'

    if args.env == 'Minitaur':
        from pybullet_envs.minitaur.envs import minitaur_reactive_env
        from gym.wrappers import time_limit

        obs_in = 1
        act_in = 0
        if testing_mode == 'HIST':
            obs_in = 10
            act_in = 10
        env = time_limit.TimeLimit(
            minitaur_reactive_env.MinitaurReactiveEnv(
                render=False,
                accurate_motor_model_enabled=True,
                urdf_version='rainbow_dash_v0',
                include_obs_history=obs_in,
                include_act_history=act_in,
                train_UP=False,
                resample_MP=False),
            max_episode_steps=1000)
    else:
        env = gym.make(args.env)
        if hasattr(env.env, 'disableViewer'):
            env.env.disableViewer = False

    def policy_fn(name, ob_space, ac_space):
        hid_size = 64
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=hid_size, num_hid_layers=3)

    def policy_mirror_fn(name, ob_space, ac_space):
        obpermapp = np.arange(len(env.env.obs_perm), len(env.env.obs_perm) + UP_dim)
        ob_perm = np.concatenate([env.env.obs_perm, obpermapp])