def __init__(self, env, b_agent, include_utterance_in_observation=False,
             *args, **kwargs):
    """
    Args:
        env: Environment which is being wrapped by this conversational
            environment. This environment must have a discrete action
            space. The RL agent will be able to take any of the actions
            in this environment, or select a new action `utterance`,
            managed by this class.
        b_agent: `Agent` instance with which the learned agent will
            interact.
        include_utterance_in_observation: If `True`, after making an
            utterance, include the uttered token as part of the next
            observation.
    """
    super(SituatedConversationEnvironment, self).__init__(*args, **kwargs)

    assert isinstance(env.action_space, Discrete)

    self._env = env

    self.num_tokens = b_agent.num_tokens
    self.vocab = b_agent.vocab
    self.vocab_size = len(self.vocab)

    # Observations are a combination of observations from the wrapped
    # environment and a representation of any utterance received from the
    # agent.
    #
    # Optionally include a single-token utterance observation as well.
    self._received_message_space = DiscreteBinaryBag(self.vocab_size)
    if include_utterance_in_observation:
        utterance_space = Discrete(self.vocab_size)
        self._obs_space = Product(env.observation_space,
                                  self._received_message_space,
                                  utterance_space)
    else:
        self._obs_space = Product(env.observation_space,
                                  self._received_message_space)
    self.include_utterance_in_observation = include_utterance_in_observation

    # The agent can choose to take any action in the wrapped env, to add a
    # single token to its message, or to send a message to the agent.
    #
    # First `N` actions correspond to taking an action in the wrapped env.
    # Next `V` actions correspond to uttering a word from the vocabulary.
    # Final action corresponds to sending the message.
    action_space = Discrete(env.action_space.n + b_agent.vocab_size + 1)
    self._action_space = action_space

    self.b_agent = b_agent
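# A minimal sketch (not part of the original class) of how an action index
# from the composite space above could be decoded, assuming the layout the
# comments describe: first `N` wrapped-env actions, then `V` vocabulary
# tokens, then a final "send" action. The helper name `decode_action` and
# its return convention are hypothetical.
def decode_action(action, env_n, vocab_size):
    if action < env_n:
        return ("env", action)              # act in the wrapped env
    elif action < env_n + vocab_size:
        return ("utter", action - env_n)    # add one vocab token to message
    else:
        return ("send", None)               # send the accumulated message

# e.g. with env_n=4 env actions and vocab_size=10:
# decode_action(3, 4, 10)  -> ("env", 3)
# decode_action(7, 4, 10)  -> ("utter", 3)
# decode_action(14, 4, 10) -> ("send", None)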
def __init__(self, beam_size, graph, is_training=True, oracle=True,
             *args, **kwargs):
    """
    Args:
        beam_size: Size of the candidate beam; the agent's discrete
            action selects one of `beam_size` candidates at each step.
        graph: Web graph to navigate (must provide `path_length`).
        is_training: Whether the environment is being used for training.
        oracle: If True, always follow the gold path regardless of
            provided agent actions.
    """
    super(WebNavEnvironment, self).__init__(*args, **kwargs)

    self._graph = graph
    self.beam_size = beam_size
    self.path_length = self._graph.path_length
    self.is_training = is_training

    navigator_cls = web_graph.OracleNavigator if oracle \
        else web_graph.Navigator
    self._navigator = navigator_cls(self._graph, self.beam_size,
                                    self.path_length)

    self._action_space = Discrete(self.beam_size)
def __init__(self, env, *args, max_timesteps=None):
    """
    Initialize the environment.

    Args:
        env (BanditEnv): bandit environment to wrap. Must have discrete
            observation and action spaces.
        max_timesteps (int): maximum number of timesteps the environment
            will be run for.
    """
    assert isinstance(env, BanditEnv)
    self.wrapped_env = env

    # Actions are the same as those in the wrapped environment.
    self.nA = env.action_space.n
    self.state_dim = env.n_arms * 2
    self.counts = np.zeros(self.state_dim, dtype=np.int32)
    if max_timesteps is not None:
        self.max_timesteps = max_timesteps
    else:
        max_timesteps = self.max_timesteps = env.horizon
    self.timesteps = 0
    self.Gittins = None

    self.action_space = Discrete(self.nA)
    obs_high = np.full(shape=self.counts.shape, fill_value=max_timesteps)
    self.observation_space = Box(np.zeros_like(self.counts), obs_high)

    self.dV_drhos = {}

    self._seed()
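# A minimal usage sketch of the wrapper above. The wrapper's class name is
# not visible in this snippet, so `BanditCountWrapper` is a placeholder;
# `BanditEnv` is the environment defined elsewhere in this module. The
# observation is the (2 * n_arms)-dimensional counts vector built above.
base = BanditEnv(n_arms=4, horizon=15)
wrapped = BanditCountWrapper(base)
assert wrapped.action_space.n == base.action_space.n
assert wrapped.observation_space.shape == (base.n_arms * 2,)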
def __init__(self, n_arms=4, reward_dist=None, reward_args_generator=None,
             independent_arms=True, horizon=15):
    """
    Initialize a bandit environment.

    Args:
        n_arms (int): number of arms in this bandit.
        reward_dist: a discrete scipy.stats.rv_discrete distribution used
            to sample from the arms. If independent_arms, it should
            produce a single real-valued sample; otherwise, it should
            produce a vector-valued sample.
        reward_args_generator (iterable): a generator of reward arguments
            (n_arms draws per episode), used to create args for
            reward_dist. Cannot be None if reward_dist is not None. Must
            continue on forever.
        independent_arms (bool): If True, sample each arm's reward
            independently (conditioned on reward_args_generator). If
            False, draw a single sample from reward_dist.
        horizon (int): the number of timesteps to run this environment
            for.
    """
    if not independent_arms:
        raise NotImplementedError(
            "Only conditionally independent arms are currently supported.")

    if reward_dist is None:
        reward_dist = bernoulli
        reward_args_generator = ({'p': uniform.rvs()} for _ in cycle([0]))
    elif reward_args_generator is None:
        raise ValueError(
            "reward_args_generator cannot be None if reward_dist is "
            "specified.")

    self.n_arms = n_arms
    self.reward_dist = reward_dist
    self.reward_args_generator = reward_args_generator
    self.horizon = horizon
    self.steps_taken = 0

    # Single-state MDP: every arm pull stays in state 0 and yields reward
    # 1.0 or 0.0 with probability 0.5 each in the base transition table.
    P = {0: {}}
    for i in range(n_arms):
        P[0][i] = [(0.5, 0, 1.0, False), (0.5, 0, 0.0, False)]
    isd = [1.0]
    super(BanditEnv, self).__init__(1, n_arms, P, isd)
    self.action_space = Discrete(self.nA)
    self.observation_space = Discrete(self.nS)
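# A minimal usage sketch: construct a 4-armed bandit with the defaults
# above (Bernoulli arms with uniformly sampled success probabilities).
# `BanditEnv` is assumed to inherit the classic discrete-MDP gym interface
# from the base class receiving (nS, nA, P, isd), so reset/step work as
# shown; the 4-tuple step return is an assumption of that interface.
env = BanditEnv(n_arms=4, horizon=15)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())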
def to_tf_space(space):
    if isinstance(space, TheanoBox):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, TheanoDiscrete):
        return Discrete(space.n)
    elif isinstance(space, TheanoProduct):
        return Product(list(map(to_tf_space, space.components)))
    else:
        raise NotImplementedError
def to_tf_space(space):
    if isinstance(space, TheanoBox):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, TheanoDiscrete):
        return Discrete(space.n)
    elif isinstance(space, TheanoProduct):
        return Product(list(map(to_tf_space, space.components)))
    else:
        # Fallback hack: treat any unrecognized space as a Box.
        print("HACK IN sandbox/rocky/envs/base.py")
        return Box(low=space.low, high=space.high)
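# A minimal sketch of using the converter above to map a Theano-side rllab
# space to its TF-side counterpart. The aliased import below assumes the
# Theano space classes come from rllab.spaces, matching the TheanoBox /
# TheanoDiscrete / TheanoProduct names used in the function; the exact
# import path is an assumption.
from rllab.spaces import Box as TheanoBox, Discrete as TheanoDiscrete, \
    Product as TheanoProduct

theano_space = TheanoProduct([TheanoDiscrete(5),
                              TheanoBox(low=-1.0, high=1.0, shape=(3,))])
tf_space = to_tf_space(theano_space)  # TF-side Product(Discrete(5), Box(...))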
def __init__(self, env=None, b_agent=None, *args, **kwargs):
    """
    Args:
        env: Environment which is being wrapped by this conversational
            environment. This environment must have a discrete action
            space. The RL agent will be able to take any of the actions
            in this environment, or select a new action `utterance`,
            managed by this class.
        b_agent: `Agent` instance with which the learned agent will
            interact.
    """
    super(SituatedConversationEnvironment, self).__init__(*args, **kwargs)

    if env is None:
        env = SlaveGridWorldEnv("walled_chain")
    if b_agent is None:
        b_agent = GridWorldMasterAgent(env)

    assert isinstance(env.action_space, Discrete)

    self._env = env

    self.num_tokens = b_agent.num_tokens
    self.vocab = b_agent.vocab
    self.vocab_size = len(self.vocab)

    # Observations are a combination of observations from the wrapped
    # environment and a representation of any utterance received from the
    # agent.
    self._received_message_space = DiscreteBinaryBag(self.vocab_size)
    self._obs_space = Product(env.observation_space,
                              self._received_message_space)

    # The agent can choose to take any action in the wrapped env, to add a
    # single token to its message, or to send a message to the agent.
    #
    # First `N` actions correspond to taking an action in the wrapped env.
    # Next `V` actions correspond to uttering a word from the vocabulary.
    # Final action corresponds to sending the message.
    action_space = Discrete(env.action_space.n + b_agent.vocab_size + 1)
    self._action_space = action_space

    self._b_agent = b_agent
def build_hierarchy(args, env, writer=None):
    levels = []

    latent_sampler = UniformlyRandomLatentSampler(
        name='base_latent_sampler',
        dim=args.latent_dim,
        scheduler=ConstantIntervalScheduler(k=args.env_H)
    )
    for level_idx in [1, 0]:
        # Wrap env in a different spec depending on the level: the upper
        # level acts over latent codes rather than raw env actions.
        if level_idx == 0:
            level_env = env
        else:
            level_env = SpecWrapperEnv(
                env,
                action_space=Discrete(args.latent_dim),
                observation_space=env.observation_space
            )

        with tf.variable_scope('level_{}'.format(level_idx)):
            # recognition_model = build_recognition_model(args, level_env, writer)
            recognition_model = None
            if level_idx == 0:
                policy = build_policy(args, env, latent_sampler=latent_sampler)
            else:
                scheduler = ConstantIntervalScheduler(k=args.scheduler_k)
                policy = latent_sampler = CategoricalLatentSampler(
                    scheduler=scheduler,
                    name='latent_sampler',
                    policy_name='latent_sampler_policy',
                    dim=args.latent_dim,
                    env_spec=level_env.spec,
                    latent_sampler=latent_sampler,
                    max_n_envs=args.n_envs
                )
            baseline = build_baseline(args, level_env)
            if args.vectorize:
                force_batch_sampler = False
                if level_idx == 0:
                    sampler_args = dict(n_envs=args.n_envs)
                else:
                    sampler_args = None
            else:
                force_batch_sampler = True
                sampler_args = None

            sampler_cls = None if level_idx == 0 else HierarchySampler
            algo = TRPO(
                env=level_env,
                policy=policy,
                baseline=baseline,
                batch_size=args.batch_size,
                max_path_length=args.max_path_length,
                n_itr=args.n_itr,
                discount=args.discount,
                step_size=args.trpo_step_size,
                sampler_cls=sampler_cls,
                force_batch_sampler=force_batch_sampler,
                sampler_args=sampler_args,
                optimizer_args=dict(
                    max_backtracks=50,
                    debug_nan=True
                )
            )
            reward_handler = build_reward_handler(args, writer)
            level = Level(
                depth=level_idx,
                algo=algo,
                reward_handler=reward_handler,
                recognition_model=recognition_model,
                start_itr=0,
                end_itr=0 if level_idx == 0 else np.inf
            )
            levels.append(level)

    # By convention the order of the levels should be increasing, but they
    # must be built in the reverse order, so reverse the list before
    # returning it.
    return list(reversed(levels))
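# A minimal sketch of the argparse-style fields `build_hierarchy` reads
# from `args`, gathered from the body above. The values shown here are
# illustrative assumptions, not the project's defaults.
from argparse import Namespace

args = Namespace(
    latent_dim=4, env_H=200, scheduler_k=20, n_envs=8, vectorize=True,
    batch_size=4000, max_path_length=200, n_itr=500, discount=0.99,
    trpo_step_size=0.01,
)
# levels = build_hierarchy(args, env)  # env: the environment to wrap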
def action_space(self):
    return Discrete(len(self.vocab))
def action_space(self):
    return Discrete(len(self.chars))
def action_space(self):
    return Discrete(self.nA)