Example No. 1
    def __init__(self,
                 env,
                 b_agent,
                 include_utterance_in_observation=False,
                 *args,
                 **kwargs):
        """
        Args:
            env: Environment which is being wrapped by this conversational
                environment. This environment must have a discrete action
                space. The RL agent will be able to take any of the actions
                in this environment, or select a new action `utterance`,
                managed by this class.
            b_agent: `Agent` instance with which the learned agent will
                interact.
            include_utterance_in_observation: If `True`, after making an
                utterance, include the uttered token as part of the next
                observation.
        """
        super(SituatedConversationEnvironment, self).__init__(*args, **kwargs)

        assert isinstance(env.action_space, Discrete)
        self._env = env

        self.num_tokens = b_agent.num_tokens
        self.vocab = b_agent.vocab
        self.vocab_size = len(self.vocab)

        # Observations are a combination of observations from the wrapped
        # environment and a representation of any utterance received from the
        # agent.
        #
        # Optionally include a single-token utterance observation as well.
        self._received_message_space = DiscreteBinaryBag(self.vocab_size)
        if include_utterance_in_observation:
            utterance_space = Discrete(self.vocab_size)
            self._obs_space = Product(env.observation_space,
                                      self._received_message_space,
                                      utterance_space)
        else:
            self._obs_space = Product(env.observation_space,
                                      self._received_message_space)
        self.include_utterance_in_observation = include_utterance_in_observation

        # The agent can choose to take any action in the wrapped env, to add a
        # single token to its message, or to send a message to the agent.
        #
        # First `N` actions correspond to taking an action in the wrapped env.
        # Next `V` actions correspond to uttering a word from the vocabulary.
        # Final action corresponds to sending the message.
        action_space = Discrete(env.action_space.n + b_agent.vocab_size + 1)
        self._action_space = action_space

        self.b_agent = b_agent
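
A minimal sketch of how an action index from this composite space could be interpreted. `decode_action` is a hypothetical helper, not part of the original class; it simply mirrors the layout described in the comment above (first the `N` wrapped-env actions, then the `V` vocabulary tokens, then one "send" action).

def decode_action(conv_env, action):
    # Hypothetical helper: map a flat action index onto the three action types.
    n_env_actions = conv_env._env.action_space.n
    if action < n_env_actions:
        return ("wrapped_env", action)                # act in the wrapped environment
    elif action < n_env_actions + conv_env.vocab_size:
        return ("utter", action - n_env_actions)      # add this vocabulary token to the message
    else:
        return ("send", None)                         # send the accumulated message to b_agent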
Example No. 2
    def __init__(self,
                 beam_size,
                 graph,
                 is_training=True,
                 oracle=True,
                 *args,
                 **kwargs):
        """
        Args:
            beam_size: Number of candidate actions available to the agent at
                each step (the navigator's beam width); this determines the
                size of the discrete action space.
            graph: Web graph over which navigation takes place; its
                `path_length` sets the episode path length.
            is_training: Whether the environment is being used for training.
            oracle: If True, always follow the gold path regardless of
                provided agent actions.
        """
        super(WebNavEnvironment, self).__init__(*args, **kwargs)

        self._graph = graph

        self.beam_size = beam_size
        self.path_length = self._graph.path_length
        self.is_training = is_training

        navigator_cls = web_graph.OracleNavigator if oracle \
                else web_graph.Navigator
        self._navigator = navigator_cls(self._graph, self.beam_size,
                                        self.path_length)

        self._action_space = Discrete(self.beam_size)
Example No. 3
    def __init__(self, env, *args, max_timesteps=None):
        """
        Initialize the environment. 

        Args:
            env (BanditEnv): bandit environment to wrap. Must have discrete observation and action spaces.
            max_timesteps (int): maximum number of timesteps the environment will be run for. Defaults to the wrapped environment's horizon.
        """
        assert isinstance(env, BanditEnv)
        self.wrapped_env = env

        self.nA = env.action_space.n  # actions are the same as those in the wrapped environment
        self.state_dim = env.n_arms * 2

        self.counts = np.zeros(self.state_dim, dtype=np.int32)
        
        if max_timesteps is not None:
            self.max_timesteps = max_timesteps
        else:
            max_timesteps = self.max_timesteps = env.horizon
        self.timesteps = 0
        self.Gittins = None
        self.action_space = Discrete(self.nA)
        obs_high = np.full(shape=self.counts.shape, fill_value=max_timesteps)
        self.observation_space = Box(np.zeros_like(self.counts), obs_high)
        self.dV_drhos = {}
        self._seed()
Example No. 4
 def __init__(self, n_arms=4, reward_dist=None, reward_args_generator=None,
              independent_arms=True, horizon=15):
     """
     Initialize a bandit environment. 
     
     Args:
         n_arms (int): number of arms in this bandit.
         reward_dist: a discrete scipy.stats.rv_discrete distribution used to
             sample rewards from the arms. If independent_arms is True, it should
             produce a single real-valued sample; otherwise it should produce a
             vector-valued sample.
         reward_args_generator (iterable): a generator of argument dicts for
             reward_dist, one per arm. Cannot be None if reward_dist is not None,
             and must continue yielding values indefinitely.
         independent_arms (bool): if True, sample each arm's reward independently
             (conditioned on reward_args_generator); if False, draw a single joint
             sample from reward_dist.
         horizon (int): the number of timesteps to run this environment for.
     """
     if not independent_arms:
         raise NotImplementedError("Only conditionally independent arms are currently supported.")
     
     if reward_dist is None:
         reward_dist = bernoulli
         reward_args_generator = ({'p': uniform.rvs()} for _ in cycle([0]))
     elif reward_args_generator is None:
         raise ValueError("reward_args_generator cannot be None if reward_dist is specified.")
         
     self.n_arms = n_arms
     self.reward_dist = reward_dist
     self.reward_args_generator = reward_args_generator
     self.horizon = horizon
         
     self.steps_taken = 0
     
     # Transition table for the single-state MDP: each entry is
     # (probability, next_state, reward, done), so pulling any arm stays in
     # state 0 and yields reward 1.0 or 0.0 with probability 0.5 each.
     P = {0: {}}
     for i in range(n_arms):
         P[0][i] = [(0.5, 0, 1.0, False), (0.5, 0, 0.0, False)]
     isd = [1.0]  # initial state distribution: always start in state 0
     
     super(BanditEnv, self).__init__(1, n_arms, P, isd)
     
     self.action_space = Discrete(self.nA)
     self.observation_space = Discrete(self.nS)
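
A minimal usage sketch for this constructor; the arm means are placeholder values, and `bernoulli` and `cycle` are assumed to be the same imports the default branch above already relies on.

from itertools import cycle
from scipy.stats import bernoulli

# Sketch: a 3-armed Bernoulli bandit with fixed (placeholder) arm means.
arm_means = [0.2, 0.5, 0.8]
args_gen = (dict(p=p) for p in cycle(arm_means))  # one args dict per arm, repeating forever
env = BanditEnv(n_arms=3, reward_dist=bernoulli,
                reward_args_generator=args_gen, horizon=50)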
Example No. 5
def to_tf_space(space):
    if isinstance(space, TheanoBox):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, TheanoDiscrete):
        return Discrete(space.n)
    elif isinstance(space, TheanoProduct):
        return Product(list(map(to_tf_space, space.components)))
    else:
        raise NotImplementedError
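
A small usage sketch, assuming `TheanoBox`, `TheanoDiscrete`, and `TheanoProduct` are the Theano-side space classes imported by this module and that they follow rllab's `Box(low, high, shape)`, `Discrete(n)`, and `Product(components)` signatures.

# Sketch: convert a composite Theano space into its TF-backed equivalent.
theano_space = TheanoProduct([TheanoDiscrete(4),
                              TheanoBox(low=-1.0, high=1.0, shape=(3,))])
tf_space = to_tf_space(theano_space)  # -> Product(Discrete(4), Box(...))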
Example No. 6
def to_tf_space(space):
    if isinstance(space, TheanoBox):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, TheanoDiscrete):
        return Discrete(space.n)
    elif isinstance(space, TheanoProduct):
        return Product(list(map(to_tf_space, space.components)))
    else:
        # Fallback hack: assume the unrecognized space exposes `low`/`high`
        # attributes and treat it like a Box.
        print("HACK IN sandbox/rocky/envs/base.py")
        return Box(low=space.low, high=space.high)
Example No. 7
    def __init__(self, env=None, b_agent=None, *args, **kwargs):
        """
        Args:
            env: Environment which is being wrapped by this conversational
                environment. This environment must have a discrete action
                space. The RL agent will be able to take any of the actions
                in this environment, or select a new action `utterance`,
                managed by this class. Defaults to a `SlaveGridWorldEnv`
                ("walled_chain") when not provided.
            b_agent: `Agent` instance with which the learned agent will
                interact. Defaults to a `GridWorldMasterAgent` wrapping
                `env` when not provided.
        """
        super(SituatedConversationEnvironment, self).__init__(*args, **kwargs)

        if env is None:
            env = SlaveGridWorldEnv("walled_chain")
        if b_agent is None:
            b_agent = GridWorldMasterAgent(env)

        assert isinstance(env.action_space, Discrete)
        self._env = env

        self.num_tokens = b_agent.num_tokens
        self.vocab = b_agent.vocab
        self.vocab_size = len(self.vocab)

        # Observations are a combination of observations from the wrapped
        # environment and a representation of any utterance received from the
        # agent.
        self._received_message_space = DiscreteBinaryBag(self.vocab_size)
        self._obs_space = Product(env.observation_space,
                                  self._received_message_space)

        # The agent can choose to take any action in the wrapped env, to add a
        # single token to its message, or to send a message to the agent.
        #
        # First `N` actions correspond to taking an action in the wrapped env.
        # Next `V` actions correspond to uttering a word from the vocabulary.
        # Final action corresponds to sending the message.
        action_space = Discrete(env.action_space.n + b_agent.vocab_size + 1)
        self._action_space = action_space

        self._b_agent = b_agent
Example No. 8
def build_hierarchy(args, env, writer=None):
    levels = []

    latent_sampler = UniformlyRandomLatentSampler(
        name='base_latent_sampler',
        dim=args.latent_dim,
        scheduler=ConstantIntervalScheduler(k=args.env_H)
    )
    for level_idx in [1, 0]:
        # wrap env in different spec depending on level
        if level_idx == 0:
            level_env = env
        else:
            level_env = SpecWrapperEnv(
                env,
                action_space=Discrete(args.latent_dim),
                observation_space=env.observation_space
            )
            
        with tf.variable_scope('level_{}'.format(level_idx)):
            # recognition_model = build_recognition_model(args, level_env, writer)
            recognition_model = None
            if level_idx == 0:
                policy = build_policy(args, env, latent_sampler=latent_sampler)
            else:
                scheduler = ConstantIntervalScheduler(k=args.scheduler_k)
                policy = latent_sampler = CategoricalLatentSampler(
                    scheduler=scheduler,
                    name='latent_sampler',
                    policy_name='latent_sampler_policy',
                    dim=args.latent_dim,
                    env_spec=level_env.spec,
                    latent_sampler=latent_sampler,
                    max_n_envs=args.n_envs
                )
            baseline = build_baseline(args, level_env)
            if args.vectorize:
                force_batch_sampler = False
                if level_idx == 0:
                    sampler_args = dict(n_envs=args.n_envs)
                else:
                    sampler_args = None
            else:
                force_batch_sampler = True
                sampler_args = None

            sampler_cls = None if level_idx == 0 else HierarchySampler
            algo = TRPO(
                env=level_env,
                policy=policy,
                baseline=baseline,
                batch_size=args.batch_size,
                max_path_length=args.max_path_length,
                n_itr=args.n_itr,
                discount=args.discount,
                step_size=args.trpo_step_size,
                sampler_cls=sampler_cls,
                force_batch_sampler=force_batch_sampler,
                sampler_args=sampler_args,
                optimizer_args=dict(
                    max_backtracks=50,
                    debug_nan=True
                )
            )
            reward_handler = build_reward_handler(args, writer)
            level = Level(
                depth=level_idx,
                algo=algo,
                reward_handler=reward_handler,
                recognition_model=recognition_model,
                start_itr=0,
                end_itr=0 if level_idx == 0 else np.inf
            )
            levels.append(level)

    # By convention the levels should be ordered by increasing depth, but they
    # are built here in reverse order, so reverse the list before returning it.
    return list(reversed(levels))
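
A minimal calling sketch; the `args` fields below are only the attributes `build_hierarchy` itself reads (with placeholder values), the helper builders it calls may require additional fields, and `env` is assumed to be an already-constructed environment.

from argparse import Namespace

# Sketch: placeholder hyperparameters covering the attributes read above.
args = Namespace(
    latent_dim=4, env_H=200, scheduler_k=20, n_envs=10, vectorize=True,
    batch_size=4000, max_path_length=200, n_itr=500, discount=0.99,
    trpo_step_size=0.01,
)
levels = build_hierarchy(args, env)  # returned lowest depth first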
Example No. 9
 def action_space(self):
     return Discrete(len(self.vocab))
Example No. 10
 def action_space(self):
     return Discrete(len(self.chars))
Example No. 11
 def action_space(self):
     return Discrete(self.nA)