Example #1
def to_tf_space(space):
    if isinstance(space, TheanoBox):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, TheanoDiscrete):
        return Discrete(space.n)
    elif isinstance(space, TheanoProduct):
        return Product(list(map(to_tf_space, space.components)))
    else:
        return Box(low=space.low, high=space.high)
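A minimal sketch of how this converter might be exercised. The import paths below (rllab.spaces for the Theano-backed spaces) are assumptions about the usual rllab layout, not part of the original example:

# Sketch only: convert Theano-backed spaces to their TF counterparts (assumed rllab paths).
import numpy as np
from rllab.spaces.box import Box as TheanoBox
from rllab.spaces.discrete import Discrete as TheanoDiscrete

theano_box = TheanoBox(low=-np.ones(3), high=np.ones(3))
tf_box = to_tf_space(theano_box)            # -> sandbox.rocky.tf.spaces.box.Box
assert np.allclose(tf_box.low, theano_box.low)
assert np.allclose(tf_box.high, theano_box.high)

tf_discrete = to_tf_space(TheanoDiscrete(5))
assert tf_discrete.n == 5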
Example #2
def to_tf_space(space):
    if isinstance(space, TheanoBox):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, TheanoDiscrete):
        return Discrete(space.n)
    elif isinstance(space, TheanoProduct):
        return Product(list(map(to_tf_space, space.components)))
    else:
        print("HACK IN sandbox/rocky/envs/base.py")
        return Box(low=space.low, high=space.high)
Example #3
    def __init__(self, env, *args, max_timesteps=None):
        """
        Initialize the environment. 

        Args:
            env (Env): gym environment. Must have discrete observation and action spaces.
            max_timesteps (int): maximum number of timesteps the environment will be run for.
        """
        assert(isinstance(env, BanditEnv))
        self.wrapped_env = env
        
        self.nA = env.action_space.n  # actions are the same as in the wrapped environment
        self.state_dim = env.n_arms * 2
        
        self.counts = np.zeros(self.state_dim, dtype=np.int32) 
        
        if max_timesteps is not None:
            self.max_timesteps = max_timesteps
        else:
            max_timesteps = self.max_timesteps = env.horizon
        self.timesteps = 0
        self.Gittins = None
        self.action_space = Discrete(self.nA)
        obs_high = np.full(shape=self.counts.shape, fill_value=max_timesteps)
        self.observation_space = Box(np.zeros_like(self.counts), obs_high)
        self.dV_drhos = {}
        self._seed()
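For intuition, the observation here is a flat vector of per-arm pull statistics, every entry bounded by the horizon. A standalone numpy sketch of the bounds built above, with made-up values for n_arms and max_timesteps:

# Illustrative only: the shape and bounds of the counts-based observation space.
import numpy as np

n_arms, max_timesteps = 3, 10                     # hypothetical values
counts = np.zeros(n_arms * 2, dtype=np.int32)     # state_dim = n_arms * 2
obs_low = np.zeros_like(counts)
obs_high = np.full(shape=counts.shape, fill_value=max_timesteps)
# observation_space = Box(obs_low, obs_high), as in the constructor above
print(obs_low, obs_high)                          # [0 0 0 0 0 0] [10 10 10 10 10 10]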
Example #4
 def _create_observation_space(self):
     obs_space = super()._create_observation_space()
     return Box(
         np.hstack(
             (obs_space.low, [-self.BOUNDARY_DIST, -self.BOUNDARY_DIST])),
         np.hstack(
             (obs_space.high, [self.BOUNDARY_DIST, self.BOUNDARY_DIST])),
     )
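The pattern here is appending two extra bounded dimensions (the 2D goal position, limited by BOUNDARY_DIST) onto the parent class's bounds. A hedged numpy-only illustration with placeholder values for the parent bounds and BOUNDARY_DIST:

# Illustrative only: extending existing low/high bounds with two goal coordinates.
import numpy as np

parent_low, parent_high = -np.ones(4), np.ones(4)   # placeholder parent bounds
BOUNDARY_DIST = 0.3                                  # placeholder constant

low = np.hstack((parent_low, [-BOUNDARY_DIST, -BOUNDARY_DIST]))
high = np.hstack((parent_high, [BOUNDARY_DIST, BOUNDARY_DIST]))
assert low.shape == high.shape == (6,)
# Box(low, high) then covers 6 dimensions: 4 parent dims plus 2 goal dims.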
Example #5
def gym_to_local():
    import gym
    from sandbox.rocky.tf.spaces.box import Box
    import envs.base as base
    gym.envs.mujoco.reacher.ReacherEnv._get_obs = ReacherEnv._get_obs
    gym.envs.mujoco.reacher.ReacherEnv._step = ReacherEnv._step
    gym.envs.mujoco.reacher.ReacherEnv.observation_space = property(
        lambda self: Box(low=ReacherEnv().observation_space.low,
                         high=ReacherEnv().observation_space.high))
    gym.envs.mujoco.reacher.ReacherEnv.reset = ReacherEnv.reset
    gym.envs.mujoco.reacher.ReacherEnv.reset_model = ReacherEnv.reset_model
    gym.envs.mujoco.reacher.ReacherEnv.n_goals = ReacherEnv.n_goals
    gym.envs.mujoco.reacher.ReacherEnv.n_states = ReacherEnv.n_states
    gym.envs.mujoco.reacher.ReacherEnv.cost_np = ReacherEnv.cost_np
    gym.envs.mujoco.reacher.ReacherEnv.cost_tf = ReacherEnv.cost_tf
    gym.envs.mujoco.reacher.ReacherEnv.cost_np_vec = ReacherEnv.cost_np_vec
    base.TfEnv.observation_space = property(
        lambda self: Box(low=ReacherEnv().observation_space.low,
                         high=ReacherEnv().observation_space.high))
Example #6
    def __init__(self,
                 env_name,
                 record_video=True,
                 video_schedule=None,
                 log_dir=None,
                 record_log=True,
                 force_reset=False,
                 screen_width=84,
                 screen_height=84):
        if log_dir is None:
            if logger.get_snapshot_dir() is None:
                logger.log(
                    "Warning: skipping Gym environment monitoring since snapshot_dir not configured."
                )
            else:
                log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
        Serializable.quick_init(self, locals())

        env = gym.envs.make(env_name)
        if 'Doom' in env_name:
            from ppaquette_gym_doom.wrappers.action_space import ToDiscrete
            wrapper = ToDiscrete('minimal')
            env = wrapper(env)

        self.env = env
        self.env_id = env.spec.id

        monitor_manager.logger.setLevel(logging.WARNING)

        assert not (not record_log and record_video)

        if log_dir is None or record_log is False:
            self.monitoring = False
        else:
            if not record_video:
                video_schedule = NoVideoSchedule()
            else:
                if video_schedule is None:
                    video_schedule = CappedCubicVideoSchedule()
            self.env = gym.wrappers.Monitor(self.env,
                                            log_dir,
                                            video_callable=video_schedule,
                                            force=True)
            self.monitoring = True

        self._observation_space = convert_gym_space(env.observation_space)
        self._action_space = convert_gym_space(env.action_space)
        self._horizon = env.spec.timestep_limit
        self._log_dir = log_dir
        self._force_reset = force_reset
        self.screen_width = screen_width
        self.screen_height = screen_height
        self._observation_space = Box(low=0,
                                      high=1,
                                      shape=(screen_width, screen_height, 1))
Example #7
def to_tf_space(space):
    if isinstance(space, TheanoBox) or isinstance(space, gymBox):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, TheanoDiscrete):
        return Discrete(space.n)
    elif isinstance(space, TheanoProduct):
        return Product(list(map(to_tf_space, space.components)))
    else:
        import ipdb
        ipdb.set_trace()
        raise NotImplementedError
Example #8
    def __init__(
        self,
        horizon=200,
        l2_action_penalty_weight=1e-2,
        num_steps=None,
        include_velocity=False,
        use_small_maze=False,
        num_steps_until_reset=5,
    ):
        self.init_serialization(locals())
        if use_small_maze:
            self.TARGET_RADIUS = 0.04
            self.BOUNDARY_RADIUS = 0.02
            self.BOUNDARY_DIST = 0.12
            self.BALL_RADIUS = 0.01
            super().__init__('small_water_maze.xml')
        else:
            self.TARGET_RADIUS = 0.1
            self.BOUNDARY_RADIUS = 0.02
            self.BOUNDARY_DIST = 0.3
            self.BALL_RADIUS = 0.02
            super().__init__('water_maze.xml')
        self.BALL_START_DIST = (self.BOUNDARY_DIST - self.BOUNDARY_RADIUS -
                                2 * self.BALL_RADIUS)
        self.MAX_GOAL_DIST = self.BOUNDARY_DIST - self.BOUNDARY_RADIUS
        self.l2_action_penalty_weight = l2_action_penalty_weight
        if num_steps is not None:  # support backwards compatibility
            horizon = num_steps

        self._horizon = horizon
        self._t = 0
        self._on_platform_history = deque(maxlen=5)
        self.num_steps_until_reset = num_steps_until_reset
        self.teleport_after_a_while = self.num_steps_until_reset > 0
        if self.teleport_after_a_while:
            for _ in range(self.num_steps_until_reset):
                self._on_platform_history.append(False)
        self.include_velocity = include_velocity

        self.action_space = Box(np.array([-1, -1]), np.array([1, 1]))
        self.observation_space = self._create_observation_space()
        self.reset_model()
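The action space declared above is the unit square [-1, 1]^2; a common use of such a Box is clipping raw policy outputs to its bounds before stepping the simulator. A minimal sketch, assuming only the low/high arrays given to the constructor above:

# Illustrative only: clipping an arbitrary 2D action into the [-1, 1]^2 Box above.
import numpy as np

low, high = np.array([-1., -1.]), np.array([1., 1.])
raw_action = np.array([1.7, -0.4])
clipped = np.clip(raw_action, low, high)
assert np.allclose(clipped, [1.0, -0.4])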
Example #9
def main(exp_name=None, fusion=False, latent_dim=3):
    max_path_length = 100
    info_coeff = 0.1
    imitation_coeff = 0.01
    batch_size = 16
    meta_batch_size = 50
    max_itrs = 20
    pre_epoch = 1000
    entropy_weight = 1.0
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))

    # load ~2 iterations' worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        'data/maze_left_data_collect_discrete-15', n=4, latent_dim=latent_dim)

    # contextual policy pi(a|s,m)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    # approximate posterior q(m|tau)
    context_encoder_spec = EnvSpec(
        observation_space=Box(
            np.tile(
                np.concatenate((env.observation_space.low[:-latent_dim],
                                env.action_space.low)), max_path_length),
            np.tile(
                np.concatenate((env.observation_space.high[:-latent_dim],
                                env.action_space.high)), max_path_length)),
        action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
    )
    context_encoder = GaussianMLPPolicy(name='context_encoder',
                                        env_spec=context_encoder_spec,
                                        hidden_sizes=(128, 128))

    pretrain_model = Pretrain(experts,
                              policy,
                              context_encoder,
                              env,
                              latent_dim,
                              batch_size=400,
                              kl_weight=0.1,
                              epoch=pre_epoch)
    # pretrain_model = None
    if pretrain_model is None:
        pre_epoch = 0

    irl_model = InfoAIRL(env=env,
                         policy=policy,
                         context_encoder=context_encoder,
                         reward_arch=reward_arch,
                         reward_arch_args=reward_arch_args,
                         expert_trajs=experts,
                         state_only=True,
                         max_path_length=max_path_length,
                         fusion=fusion,
                         max_itrs=max_itrs,
                         meta_batch_size=meta_batch_size,
                         imitation_coeff=imitation_coeff,
                         info_coeff=info_coeff,
                         latent_dim=latent_dim)

    algo = MetaIRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        randomize_policy=True,
        pretrain_model=pretrain_model,
        n_itr=3000,
        meta_batch_size=meta_batch_size,
        batch_size=batch_size,
        max_path_length=max_path_length,
        discount=0.99,
        store_paths=True,
        train_irl=True,
        irl_model_wt=1.0,
        entropy_weight=entropy_weight,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    if fusion:
        dirname = 'data_fusion_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)
    else:
        dirname = 'data_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)

    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()
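The context encoder's observation space above packs an entire trajectory into one flat Box: the per-step bounds (state without the latent part, plus action) are concatenated and then tiled max_path_length times. A hedged numpy sketch with made-up dimensions, showing only how those tiled bounds are laid out:

# Illustrative only: building flat per-trajectory bounds as in context_encoder_spec.
import numpy as np

obs_dim, act_dim, latent_dim, max_path_length = 5, 2, 3, 4   # made-up sizes
obs_low, obs_high = -np.ones(obs_dim), np.ones(obs_dim)
act_low, act_high = -np.ones(act_dim), np.ones(act_dim)

step_low = np.concatenate((obs_low[:-latent_dim], act_low))   # drop latent dims
step_high = np.concatenate((obs_high[:-latent_dim], act_high))
traj_low = np.tile(step_low, max_path_length)
traj_high = np.tile(step_high, max_path_length)
assert traj_low.shape == ((obs_dim - latent_dim + act_dim) * max_path_length,)
# Box(traj_low, traj_high) is the flat observation space fed to the encoder.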
Example #10
def main(exp_name=None, latent_dim=3, params_folder=None):
    max_path_length = 100
    batch_size = 16
    meta_batch_size = 1
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    # tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeRight-v0', record_video=False,
                     record_log=False))
    barrier_range = [0.2, 0.6]
    barrier_y = 0.3

    # load ~2 iterations' worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4,
        latent_dim=latent_dim)

    irl_itr_list = [2800]

    for irl_itr in irl_itr_list:
        # params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(
            params_file, 'context_params')

        # params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr-800))
        policy_prior_params = load_prior_params(params_file, 'policy_params')
        # policy_prior_params = None

        # contextual policy pi(a|s,m)
        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        # approximate posterior q(m|tau)
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)), max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(name='context_encoder',
                                            env_spec=context_encoder_spec,
                                            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(env=env,
                             expert_trajs=experts,
                             reward_arch=reward_arch,
                             reward_arch_args=reward_arch_args,
                             context_encoder=context_encoder,
                             state_only=True,
                             max_path_length=max_path_length,
                             meta_batch_size=meta_batch_size,
                             latent_dim=latent_dim)

        savedir = 'data_fusion_discrete/visualize_reward_right-%s' % irl_itr
        if not os.path.isdir(savedir):
            os.mkdir(savedir)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            irl_model.context_encoder.set_param_values(
                init_context_encoder_params)
            policy.set_param_values(policy_prior_params)
            irl_model.set_params(prior_params)
            boundary_low = -0.1
            boundary_high = 0.6

            expert_obs, expert_acts, expert_contexts = irl_model.extract_paths(
                irl_model.expert_trajs,
                keys=('observations', 'actions', 'contexts'),
                T=max_path_length)
            expert_trajs = np.concatenate(
                (expert_obs, expert_acts),
                axis=-1)  # num_experts x T x (state_dim + act_dim)

            grid_size = 0.005
            rescale = 1. / grid_size

            for itr in range(100):
                expert_traj_batch, m_batch = irl_model.sample_batch(
                    expert_trajs,
                    expert_contexts,
                    batch_size=1,
                    warm_up=False,
                    warm_up_idx=False)
                obs_batch = []
                num_y = 0
                for pos_y in np.arange(boundary_low, boundary_high, grid_size):
                    num_y += 1
                    num_x = 0
                    for pos_x in np.arange(boundary_low, boundary_high,
                                           grid_size):
                        num_x += 1
                        obs_batch.append([pos_x, pos_y, 0.])
                obs_batch = np.array(obs_batch).reshape(
                    [1, -1, max_path_length, 3])
                expert_traj_batch = np.tile(
                    np.reshape(expert_traj_batch, [1, 1, max_path_length, -1]),
                    [1, obs_batch.shape[1], 1, 1])
                reward = tf.get_default_session().run(
                    irl_model.reward,
                    feed_dict={
                        irl_model.expert_traj_var: expert_traj_batch,
                        irl_model.obs_t: obs_batch
                    })
                score = reward[:, 0]
                ax = sns.heatmap(score.reshape([num_x, num_y]),
                                 cmap="YlGnBu_r")
                ax.scatter((m_batch[0][0][0] - boundary_low) * rescale,
                           (m_batch[0][0][1] - boundary_low) * rescale,
                           marker='*',
                           s=150,
                           c='r',
                           edgecolors='k',
                           linewidths=0.5)
                ax.scatter((0.3 - boundary_low +
                            np.random.uniform(low=-0.05, high=0.05)) * rescale,
                           (0. - boundary_low +
                            np.random.uniform(low=-0.05, high=0.05)) * rescale,
                           marker='o',
                           s=120,
                           c='white',
                           linewidths=0.5,
                           edgecolors='k')
                ax.plot([(barrier_range[0] - boundary_low) * rescale,
                         (barrier_range[1] - boundary_low) * rescale],
                        [(barrier_y - boundary_low) * rescale,
                         (barrier_y - boundary_low) * rescale],
                        color='k',
                        linewidth=10)
                ax.invert_yaxis()
                plt.axis('off')
                plt.savefig(savedir + '/%s.png' % itr)
                print('Save Itr', itr)
                plt.close()
Example #11
 def observation_space(self):
     # 2 embeddings (query and current page) plus the embeddings of articles
     # on the beam
     return Box(low=-5,
                high=5,
                shape=(2 + self.beam_size, self.embedding_dim))
Example #12
    def __init__(self, env_name, register_info=None, record_video=True, video_schedule=None, log_dir=None, record_log=True,
                 force_reset=True, screen_width=84, screen_height=84, frame_skip=1, doom_actionspace='Box',
                 conv=True, client_port=10000, transpose_output=False, stack_frames=False, stack_size=4):
        if log_dir is None:
            if logger.get_snapshot_dir() is None:
                logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.")
            else:
                log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
        Serializable.quick_init(self, locals())
        if 'Doom' in env_name:
            import ex2.envs.doom
        if 'Minecraft' in env_name:
            import axe.envs.minecraft

        if register_info:
            try:
                gym.envs.register(**register_info)
            except gym.error.Error:
                traceback.print_exc()

        env = gym.envs.make(env_name)

        if 'Doom' in env_name:
            from ex2.envs.doom.wrappers import SetResolution
            from ex2.envs.doom.wrappers.action_space import ToDiscrete, ToBox
            if doom_actionspace == 'Box':
                wrapper1 = ToBox('minimal')
            else:
                wrapper1 = ToDiscrete('minimal')
            #lock = multiprocessing.Lock()
            #env.configure(lock=lock)
            wrapper2 = SetResolution('160x120')
            env = wrapper2(wrapper1(env))
        if 'Minecraft' in env_name:
            env.init(videoResolution=[screen_width, screen_height], allowContinuousMovement=["move", "turn"],
                     continuous_discrete=False, vision=False,
                     client_pool=[('127.0.0.1', client_port)])

        self.env = env
        self.env_id = env.spec.id
        self.env_name = env_name
        self.frame_skip = frame_skip
        self.stack_frames = stack_frames
        if stack_frames:
            self.channel_size = stack_size
        else:
            self.channel_size = 3

        assert not (not record_log and record_video)

        if log_dir is None or record_log is False:
            self.monitoring = False
        else:
            if not record_video:
                video_schedule = NoVideoSchedule()
            else:
                if video_schedule is None:
                    video_schedule = CappedCubicVideoSchedule()
            self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True)
            self.monitoring = True


        self._action_space = convert_gym_space(env.action_space)
        self._horizon = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
        self._log_dir = log_dir
        self._force_reset = force_reset
        self.screen_width = screen_width
        self.screen_height = screen_height
        self.conv = conv
        self.transpose_output = transpose_output
        if conv:
            if self.transpose_output:
                self._observation_space = Box(low=0, high=1, shape=(self.channel_size, screen_width, screen_height))
                #self._observation_space = Box(low=0, high=1, shape=(3* screen_width* screen_height))
            else:
                self._observation_space = Box(low=0, high=1, shape=(screen_width, screen_height, self.channel_size))
        else:
            self._observation_space = Box(low=0, high=1, shape=(self.channel_size,))
        self.last_info = None
        self.last_obs = []
Example #13
 def observation_space(self):
     return Box(low=0, high=1, shape=(len(self.vocab),)) #return DiscreteBinaryBag(len(self.vocab))
Example #14
import samplers.lowlevel.rarl_parallel_sampler as parallel_sampler
parallel_sampler.initialize(n_parallel=1)
parallel_sampler.set_seed(0)

#env = normalize(MultilaneEnv(),1,True,True,0.001,0.001)
#env = normalize(MultilaneEnv())
env = TfEnv(JustEgoEnv(port=9427))

obs1_dim = 4
obs2_dim = 4
action1_dim = 2
action2_dim = 2

spec1 = EnvSpec(
                observation_space = Box(low=-np.ones(4), high=np.ones(4)),
                action_space = Box(low=-np.ones(2), high=np.ones(2)),
                )
spec2 = EnvSpec(
                observation_space = Box(low=-np.ones(4), high=np.ones(4)),
                action_space = Box(low=-np.ones(2), high=np.ones(2)),
                )

with tf.Session() as sess:
    policy1 = GaussianMLPPolicy(
        env_spec=spec1,
        name="RARLTFPolicy1",
        learn_std=True,
        init_std=0.1,
        output_nonlinearity=None,
        hidden_nonlinearity=tf.nn.relu,
Example #15
def main(exp_name=None, latent_dim=3, params_folder=None):
    max_path_length = 100
    batch_size = 32
    meta_batch_size = 50
    entropy_weight = 0.1
    left = 'right'
    if_filtered = True

    # tf.reset_default_graph()
    if left == 'left':
        env = TfEnv(
            CustomGymEnv('PointMazeLeft-v0',
                         record_video=False,
                         record_log=False))
    else:
        env = TfEnv(
            CustomGymEnv('PointMazeRight-v0',
                         record_video=False,
                         record_log=False))

    # load ~2 iterations' worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4,
        latent_dim=latent_dim)
    if if_filtered:
        experts_filtered = []
        good_range = [0.1, 0.4]  #[0.3, 0.5]
        for expert in experts:
            if good_range[0] <= expert['contexts'][0, 0] <= good_range[1]:
                experts_filtered.append(expert)
        assert len(experts_filtered) >= meta_batch_size
        experts_filtered = experts_filtered[:-(len(experts_filtered) %
                                               meta_batch_size)]
        experts = experts_filtered

    irl_itr_list = [2800]

    results = []
    for irl_itr in irl_itr_list:
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(
            params_file, 'context_params')

        policy_prior_params = None

        # contextual policy pi(a|s,m)
        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        # approximate posterior q(m|tau)
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)), max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(name='context_encoder',
                                            env_spec=context_encoder_spec,
                                            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(env=env,
                             expert_trajs=experts,
                             context_encoder=context_encoder,
                             state_only=True,
                             max_path_length=max_path_length,
                             meta_batch_size=meta_batch_size,
                             latent_dim=latent_dim)

        algo = MetaIRLTRPO(
            init_irl_params=prior_params,
            init_pol_params=policy_prior_params,  #policy_prior_params,
            init_context_encoder_params=init_context_encoder_params,
            env=env,
            policy=policy,
            irl_model=irl_model,
            n_itr=150,
            meta_batch_size=meta_batch_size,
            batch_size=batch_size,
            max_path_length=max_path_length,
            discount=0.99,
            store_paths=True,
            train_irl=True,  # True
            train_context_only=True,
            train_policy=True,
            irl_model_wt=1.0,
            entropy_weight=entropy_weight,
            zero_environment_reward=True,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            log_params_folder=params_folder,
            log_experiment_name=exp_name,
        )
        with rllab_logdir(
                algo=algo,
                dirname=
                'data_finetune/maze_finetune_discrete-entropy-%s-irl_itr-%s-%s-%s-generalize/%s'
                % (entropy_weight, irl_itr, left,
                   'filter' if if_filtered else '', exp_name)):
            with tf.Session():
                algo.train()
        results.append((irl_itr, np.max(algo.pol_ret)))
        tf.reset_default_graph()
    print(results)
Example #16
    def reset(self):
        self.state = np.ones(self.action_space.flat_dim) * self.mu

    def evolve_state(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * nr.randn(len(x))
        self.state = x + dx
        return self.state

    @overrides
    def get_action(self, t, observation, policy, **kwargs):
        action, _ = policy.get_action(observation)
        ou_state = self.evolve_state()
        return np.clip(action + ou_state, self.action_space.low,
                       self.action_space.high)


if __name__ == "__main__":
    ou = OUStrategy(
        env_spec=AttrDict(action_space=Box(low=-1, high=1, shape=(1, ))),
        mu=0,
        theta=0.15,
        sigma=0.3)
    states = []
    for i in range(1000):
        states.append(ou.evolve_state()[0])
    import matplotlib.pyplot as plt

    plt.plot(states)
    plt.show()
Example #17
 def observation_space(self):
     return Box(low=0, high=1, shape=(len(self.chars),))
Example #18
 def _create_observation_space(self):
     num_obs = 4 if self.include_velocity else 2
     return Box(
         np.hstack((-np.inf + np.zeros(num_obs), [0])),
         np.hstack((np.inf + np.zeros(num_obs), [1])),
     )