def __init__(self, env, action_script, scale, to_learn, use_mask=True,
             learn_residuals=False):
  """
  Args:
    env: GKP environment
    action_script: module or class with attributes corresponding to action
      components such as 'alpha', 'phi', etc.
    scale: dictionary of scaling factors for action components
    to_learn: dictionary of bool values for action components
    use_mask: flag to control masking of action components
    learn_residuals (bool): flag to learn residual over the scripted
      protocol. If False, will learn actions from scratch. If True, will
      learn a residual to be added to the scripted protocol.
  """
  super(ActionWrapper, self).__init__(env)
  self.scale = scale
  self.period = action_script.period  # periodicity of the protocol
  self.to_learn = to_learn
  self.use_mask = use_mask
  self.mask = action_script.mask
  self.learn_residuals = learn_residuals

  # load the script of actions and convert to tensors
  self.script = action_script.script
  for a, val in self.script.items():
    self.script[a] = tf.constant(val, dtype=tf.float32)

  # expose a [-1, 1]-bounded spec for each learnable action component,
  # with the leading time dimension of the scripted tensor stripped
  self._action_spec = {
      a: specs.BoundedTensorSpec(
          shape=C.shape[1:], dtype=tf.float32, minimum=-1, maximum=1)
      for a, C in self.script.items() if self.to_learn[a]}
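# --- Usage sketch (illustrative, not part of the wrapper) ---
# A minimal example of the spec derivation above: the leading time dimension
# of each scripted tensor is dropped, and only learnable components get a
# [-1, 1]-bounded spec. The component names, shapes, and the tf_agents
# import path are assumptions for illustration.
import tensorflow as tf
from tf_agents import specs

script = {'alpha': tf.constant([[0.0, 0.0]] * 4),  # shape (4, 2): 4 steps, re/im
          'phi': tf.constant([[0.0]] * 4)}         # shape (4, 1)
to_learn = {'alpha': True, 'phi': False}

action_spec = {
    a: specs.BoundedTensorSpec(
        shape=C.shape[1:], dtype=tf.float32, minimum=-1, maximum=1)
    for a, C in script.items() if to_learn[a]}
# Only 'alpha' is exposed to the agent, with per-step shape (2,).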
def __init__(self,
             time_step_spec,
             action_spec,
             batch_size=1,
             policy_state_spec_name='policy_state_spec',
             policy_state_name='policy_state',
             initial_policy_state=None):
  batch_shape = (batch_size,)
  self._batch_shape = batch_shape
  minimum = np.asarray(1, dtype=np.int32)
  maximum = np.asarray(2, dtype=np.int32)
  self._maximum = maximum
  policy_state_spec = specs.BoundedTensorSpec(
      (), tf.int32, minimum=minimum, maximum=maximum,
      name=policy_state_spec_name)
  info_spec = action_spec
  self._policy_state = common.create_variable(
      name=policy_state_name,
      initial_value=maximum,
      shape=batch_shape,
      dtype=tf.int32)
  if initial_policy_state is None:
    self._initial_policy_state = tf.fill([batch_size],
                                         tf.constant(0, tf.int32))
  else:
    self._initial_policy_state = initial_policy_state
  super(TFPolicyMock, self).__init__(time_step_spec, action_spec,
                                     policy_state_spec, info_spec)
def get_policy(self):

  def policy_fn(observation, dtype=tf.int32):
    if tf.rank(observation) < 1:
      observation = [observation]

    if self._latent_policy:
      embed = self._embed_state(observation)
    else:
      embed = tf.one_hot(observation, self._num_states)

    distribution = tf.matmul(
        embed, tf.nn.softmax(self._embed_policy_logits, axis=-1))

    policy_info = {'distribution': distribution}
    return (tfp.distributions.Categorical(probs=distribution, dtype=dtype),
            policy_info)

  policy_info_spec = {
      'log_probability': specs.TensorSpec([], tf.float32),
      'distribution':
          specs.BoundedTensorSpec([self._num_actions], tf.float32,
                                  minimum=0.0, maximum=1.0)
  }
  return policy_fn, policy_info_spec
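# --- Standalone sketch of the non-latent branch above ---
# One-hot state embeddings matmul'ed with a row-softmax of the policy logits
# give one action distribution per observation; Categorical then samples from
# those rows. The sizes below are illustrative assumptions.
import tensorflow as tf
import tensorflow_probability as tfp

num_states, num_actions = 5, 3
logits = tf.random.normal([num_states, num_actions])
observation = tf.constant([0, 2, 4])

embed = tf.one_hot(observation, num_states)                      # (3, 5)
distribution = tf.matmul(embed, tf.nn.softmax(logits, axis=-1))  # (3, 3)
actions = tfp.distributions.Categorical(
    probs=distribution, dtype=tf.int32).sample()                 # (3,)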
def __init__(self,
             time_step_spec,
             action_spec,
             batch_size=1,
             policy_state_spec_name='policy_state_spec',
             policy_state_name='policy_state'):
  batch_shape = (batch_size,)
  self._batch_shape = batch_shape
  minimum = np.asarray(1, dtype=np.int32)
  maximum = np.asarray(2, dtype=np.int32)
  self._maximum = maximum
  policy_state_spec = specs.BoundedTensorSpec(
      (), tf.int32, minimum=minimum, maximum=maximum,
      name=policy_state_spec_name)
  info_spec = action_spec
  # TF1-style variable creation; the variant above uses the TF2-friendly
  # common.create_variable helper instead.
  self._policy_state = tf.get_variable(
      name=policy_state_name,
      shape=batch_shape,
      dtype=tf.int32,
      initializer=tf.constant_initializer(maximum))
  self._initial_policy_state = tf.constant(0, shape=batch_shape,
                                           dtype=tf.int32)
  super(TFPolicyMock, self).__init__(time_step_spec, action_spec,
                                     policy_state_spec, info_spec)
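# --- Construction sketch for either TFPolicyMock variant above ---
# Minimal specs to instantiate the mock; only the constructors are shown
# above, so the instantiation itself is left commented. The tf_agents import
# paths are assumptions.
import tensorflow as tf
from tf_agents import specs
from tf_agents.trajectories import time_step as ts

observation_spec = specs.TensorSpec([1], tf.int64, 'observation')
action_spec = specs.BoundedTensorSpec([], tf.int32, minimum=0, maximum=10)
time_step_spec = ts.time_step_spec(observation_spec)
# policy = TFPolicyMock(time_step_spec, action_spec, batch_size=2)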
def __init__(self, initial_state=0, dtype=tf.int64, scope='TFEnvironment'):
  self._dtype = dtype
  self._scope = scope
  self._initial_state = tf.cast(initial_state, dtype=self._dtype)
  observation_spec = specs.TensorSpec([1], self._dtype, 'observation')
  action_spec = specs.BoundedTensorSpec([], tf.int32, minimum=0, maximum=10)
  time_step_spec = ts.time_step_spec(observation_spec)
  super(TFEnvironmentMock, self).__init__(time_step_spec, action_spec)
  # TF2-friendly bookkeeping variables via the tf_agents common helpers
  self._state = common.create_variable('state', initial_state,
                                       dtype=self._dtype)
  self.steps = common.create_variable('steps', 0)
  self.episodes = common.create_variable('episodes', 0)
  self.resets = common.create_variable('resets', 0)
def __init__(self, initial_state=0, dtype=tf.int64, scope='TFEnvironment'):
  self._dtype = dtype
  self._scope = scope
  self._initial_state = tf.cast(initial_state, dtype=self._dtype)
  observation_spec = specs.TensorSpec([1], self._dtype, 'observation')
  action_spec = specs.BoundedTensorSpec([], tf.int32, minimum=0, maximum=10)
  time_step_spec = ts.time_step_spec(observation_spec)
  super(TFEnvironmentMock, self).__init__(time_step_spec, action_spec)
  # TF1-style bookkeeping variables created under an explicit variable scope
  with tf.compat.v1.variable_scope(self._scope):
    self._state = tf.Variable(initial_state, name='state', dtype=self._dtype)
    self.steps = tf.Variable(0, name='steps')
    self.episodes = tf.Variable(0, name='episodes')
    self.resets = tf.Variable(0, name='resets')
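# --- Spec sketch shared by both mock environments above ---
# ts.time_step_spec wraps an observation spec with scalar step_type, reward,
# and discount specs; this is what the mocks hand to the TFEnvironment base
# class. The tf_agents import paths are assumptions.
import tensorflow as tf
from tf_agents import specs
from tf_agents.trajectories import time_step as ts

observation_spec = specs.TensorSpec([1], tf.int64, 'observation')
time_step_spec = ts.time_step_spec(observation_spec)
print(time_step_spec.step_type)  # scalar int32 spec
print(time_step_spec.discount)   # scalar float32 spec bounded to [0, 1]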
def testLoad(self):
  specs.ArraySpec([1, 2, 3], np.int32)
  specs.BoundedArraySpec([1, 2, 3], np.int32, 0, 1)
  specs.TensorSpec([1, 2, 3], np.int32)
  specs.BoundedTensorSpec([1, 2, 3], np.int32, 0, 1)
def __init__(self,
             tf_env,
             context_ranges=None,
             context_shapes=None,
             state_indices=None,
             variable_indices=None,
             gamma_index=None,
             settable_context=False,
             timers=None,
             samplers=None,
             reward_weights=None,
             reward_fn=None,
             random_sampler_mode='random',
             normalizers=None,
             context_transition_fn=None,
             context_multi_transition_fn=None,
             meta_action_every_n=None):
  self._tf_env = tf_env
  self.variable_indices = variable_indices
  self.gamma_index = gamma_index
  self._settable_context = settable_context
  self.timers = timers
  self._context_transition_fn = context_transition_fn
  self._context_multi_transition_fn = context_multi_transition_fn
  self._random_sampler_mode = random_sampler_mode

  # assign specs
  self._obs_spec = self._tf_env.observation_spec()
  self._context_shapes = tuple([
      shape if shape is not None else self._obs_spec.shape
      for shape in context_shapes
  ])
  self.context_specs = tuple([
      specs.TensorSpec(dtype=self._obs_spec.dtype, shape=shape)
      for shape in self._context_shapes
  ])
  if context_ranges is not None:
    self.context_ranges = context_ranges
  else:
    self.context_ranges = [None] * len(self._context_shapes)

  self.context_as_action_specs = tuple([
      specs.BoundedTensorSpec(
          shape=shape,
          dtype=(tf.float32 if self._obs_spec.dtype in
                 [tf.float32, tf.float64] else self._obs_spec.dtype),
          minimum=context_range[0],
          maximum=context_range[-1])
      for shape, context_range in zip(self._context_shapes,
                                      self.context_ranges)
  ])

  if state_indices is not None:
    self.state_indices = state_indices
  else:
    self.state_indices = [None] * len(self._context_shapes)
  if self.variable_indices is not None and self.n != len(
      self.variable_indices):
    raise ValueError(
        'variable_indices (%s) must have the same length as contexts (%s).' %
        (self.variable_indices, self.context_specs))
  assert self.n == len(self.context_ranges)
  assert self.n == len(self.state_indices)

  # assign reward/sampler fns
  self._sampler_fns = dict()
  self._samplers = dict()
  self._reward_fns = dict()

  # assign reward fns
  self._add_custom_reward_fns()
  reward_weights = reward_weights or None
  self._reward_fn = self._make_reward_fn(reward_fn, reward_weights)

  # assign samplers
  self._add_custom_sampler_fns()
  for mode, sampler_fns in samplers.items():
    self._make_sampler_fn(sampler_fns, mode)

  # create normalizers
  if normalizers is None:
    self._normalizers = [None] * len(self.context_specs)
  else:
    self._normalizers = [
        normalizer(tf.zeros(shape=spec.shape, dtype=spec.dtype))
        if normalizer is not None else None
        for normalizer, spec in zip(normalizers, self.context_specs)
    ]
  assert self.n == len(self._normalizers)

  self.meta_action_every_n = meta_action_every_n

  # create vars
  self.context_vars = {}
  self.timer_vars = {}
  self.create_vars(self.VAR_NAME)
  self.t = tf.Variable(
      tf.zeros(shape=(), dtype=tf.int32), name='num_timer_steps')
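# --- Sketch of the context-as-action spec construction above ---
# Each (shape, range) pair becomes a BoundedTensorSpec so a meta-agent can
# emit contexts as if they were actions. The shapes, ranges, and tf_agents
# import path below are illustrative assumptions.
import tensorflow as tf
from tf_agents import specs

context_shapes = ((2,), (1,))
context_ranges = ((-10.0, 10.0), (0.0, 1.0))

context_as_action_specs = tuple(
    specs.BoundedTensorSpec(shape=shape, dtype=tf.float32,
                            minimum=rng[0], maximum=rng[-1])
    for shape, rng in zip(context_shapes, context_ranges))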
def create_tf_policy_from_table(probability_table,
                                obs_to_index_fn,
                                return_distribution=False):
  """Creates a callable policy function given a table of state to distribution.

  Args:
    probability_table: A Tensor-like object determining the action
      distribution.
    obs_to_index_fn: A function mapping environment observation to index in
      table.
    return_distribution: Whether policy_fn should return a distribution. If
      not, returns sampled actions.

  Returns:
    policy_fn: A function mapping observations to action distribution or
      sampled actions and policy info.
    policy_info_spec: A spec that determines the type of objects returned by
      policy info.
  """
  probability_table = tf.convert_to_tensor(probability_table,
                                           dtype=tf.float32)
  n_actions = tf.shape(probability_table)[-1]

  def policy_fn(observation,
                probability_table=probability_table,
                obs_to_index_fn=obs_to_index_fn,
                return_distribution=return_distribution,
                dtype=tf.int32):
    state = obs_to_index_fn(observation)
    distribution = tf.gather(probability_table, state)
    batched = tf.rank(distribution) > 1
    if not batched:
      distributions = distribution[None, :]
    else:
      distributions = distribution

    batch_size = tf.shape(distributions)[0]
    actions = tf.random.categorical(
        tf.math.log(1e-8 + distributions), 1, dtype=dtype)
    actions = tf.squeeze(actions, -1)
    probs = tf.gather_nd(
        distributions,
        tf.stack([tf.range(batch_size, dtype=dtype), actions], -1))

    if not batched:
      action = actions[0]
      log_prob = tf.math.log(1e-8 + probs[0])
    else:
      action = actions
      log_prob = tf.math.log(1e-8 + probs)

    if return_distribution:
      policy_info = {'distribution': distribution}
      return (tfp.distributions.Categorical(probs=distribution, dtype=dtype),
              policy_info)
    else:
      policy_info = {
          'log_probability': log_prob,
          'distribution': distribution
      }
      return action, policy_info

  policy_info_spec = {
      'log_probability': specs.TensorSpec([], tf.float32),
      'distribution':
          specs.BoundedTensorSpec([n_actions], tf.float32,
                                  minimum=0.0, maximum=1.0)
  }
  return policy_fn, policy_info_spec
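# --- Usage sketch for create_tf_policy_from_table ---
# A 2-state, 3-action table with an identity observation-to-index map; the
# table values are illustrative assumptions.
import tensorflow as tf

table = [[0.8, 0.1, 0.1],
         [0.2, 0.3, 0.5]]
policy_fn, info_spec = create_tf_policy_from_table(
    table, obs_to_index_fn=lambda obs: obs)

action, info = policy_fn(tf.constant(0))            # unbatched: scalar action
actions, infos = policy_fn(tf.constant([0, 1, 1]))  # batched: one action per row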