def _build_learn(self, env):
    seqlen = self._sample_size + self._burn_in_size
    # Explicitly instantiate tf.function to initialize variables
    TensorSpecs = dict(
        obs=((seqlen + 1, *env.obs_shape), env.obs_dtype, 'obs'),
        action=((seqlen + 1, env.action_dim), tf.float32, 'action'),
        reward=((seqlen, ), tf.float32, 'reward'),
        mu=((seqlen + 1, ), tf.float32, 'mu'),
        discount=((seqlen, ), tf.float32, 'discount'),
        mask=((seqlen + 1, ), tf.float32, 'mask'),
    )
    if self._is_per and getattr(self, '_use_is_ratio', self._is_per):
        TensorSpecs['IS_ratio'] = ((), tf.float32, 'IS_ratio')
    if self._store_state:
        state_type = type(self.model.state_size)
        TensorSpecs['state'] = state_type(*[((sz, ), self._dtype, name)
            for name, sz in self.model.state_size._asdict().items()])
    if self._additional_rnn_inputs:
        if 'prev_action' in self._additional_rnn_inputs:
            TensorSpecs['prev_action'] = (
                (seqlen, *env.action_shape), env.action_dtype, 'prev_action')
        if 'prev_reward' in self._additional_rnn_inputs:
            TensorSpecs['prev_reward'] = (
                (seqlen, ), self._dtype, 'prev_reward')    # this reward should be unnormalized
    self.learn = build(self._learn, TensorSpecs, batch_size=self._batch_size)
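# All of the `_build_learn` variants shown here rely on a `build` helper that
# converts the (shape, dtype, name) triples in TensorSpecs into tf.TensorSpec
# objects and pre-instantiates a concrete tf.function for the learn step.
# The real helper is defined elsewhere in the repo; the sketch below is only
# an assumption about its behavior: it handles flat specs, prepends
# `batch_size` (or None) as the leading dimension, and ignores nested
# (namedtuple) specs such as the RNN state.
import tensorflow as tf

def build(learn_fn, TensorSpecs, batch_size=None):
    specs = {
        name: tf.TensorSpec((batch_size, *shape), dtype, spec_name)
        for name, (shape, dtype, spec_name) in TensorSpecs.items()
    }
    # Tracing once with TensorSpecs fixes the input signature, so later calls
    # with matching shapes/dtypes reuse the same concrete function.
    return tf.function(learn_fn).get_concrete_function(**specs)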
def _build_learn(self, env):
    # Explicitly instantiate tf.function to avoid unintended retracing
    TensorSpecs = dict(
        obs=((self._sample_size, *env.obs_shape), env.obs_dtype, 'obs'),
        action=((self._sample_size, *env.action_shape), env.action_dtype, 'action'),
        value=((self._sample_size, ), tf.float32, 'value'),
        traj_ret=((self._sample_size, ), tf.float32, 'traj_ret'),
        advantage=((self._sample_size, ), tf.float32, 'advantage'),
        logpi=((self._sample_size, ), tf.float32, 'logpi'),
        mask=((self._sample_size, ), tf.float32, 'mask'),
    )
    if self._store_state:
        state_type = type(self.model.state_size)
        TensorSpecs['state'] = state_type(*[((sz, ), self._dtype, name)
            for name, sz in self.model.state_size._asdict().items()])
    if self._additional_rnn_inputs:
        if 'prev_action' in self._additional_rnn_inputs:
            TensorSpecs['prev_action'] = (
                (self._sample_size, *env.action_shape), env.action_dtype, 'prev_action')
        if 'prev_reward' in self._additional_rnn_inputs:
            TensorSpecs['prev_reward'] = (
                (self._sample_size, ), self._dtype, 'prev_reward')    # this reward should be unnormalized
    self.learn = build(self._learn, TensorSpecs)
def _build_learn(self, env):
    # Explicitly instantiate tf.function to avoid unintended retracing
    TensorSpecs = dict(
        obs=(env.obs_shape, env.obs_dtype, 'obs'),
        action=(env.action_shape, env.action_dtype, 'action'),
        value=((), tf.float32, 'value'),
        traj_ret=((), tf.float32, 'traj_ret'),
        advantage=((), tf.float32, 'advantage'),
        logpi=((), tf.float32, 'logpi'),
    )
    self.learn = build(self._learn, TensorSpecs)

    TensorSpecs = dict(
        obs=(env.obs_shape, env.obs_dtype, 'obs'),
        action=(env.action_shape, env.action_dtype, 'action'),
        obs_exp=(env.obs_shape, env.obs_dtype, 'obs_exp'),
        action_exp=(env.action_shape, env.action_dtype, 'action_exp'),
    )
    self.learn_discriminator = build(self._learn_discriminator, TensorSpecs)
def _build_learn(self, env):
    # Explicitly instantiate tf.function to avoid unintended retracing
    TensorSpecs = dict(
        obs=(env.obs_shape, env.obs_dtype, 'obs'),
        action=(env.action_shape, env.action_dtype, 'action'),
        advantage=((), tf.float32, 'advantage'),
        logpi=((), tf.float32, 'logpi'),
    )
    self._policy_data = ['obs', 'action', 'advantage', 'logpi']
    self.learn_policy = build(self._learn_policy, TensorSpecs,
        batch_size=self._batch_size)

    TensorSpecs = dict(
        obs=(env.obs_shape, env.obs_dtype, 'obs'),
        value=((), tf.float32, 'value'),
        traj_ret=((), tf.float32, 'traj_ret'),
    )
    self._value_data = ['obs', 'value', 'traj_ret']
    self.learn_value = build(self._learn_value, TensorSpecs,
        batch_size=self._batch_size)
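# `_policy_data` and `_value_data` record which fields of a sampled batch feed
# each learn function. A hypothetical training step illustrating their use
# (the name `_train_step` and the `data` dict of batched tensors are
# assumptions, not taken from the repo):
def _train_step(self, data):
    policy_inputs = {k: data[k] for k in self._policy_data}
    value_inputs = {k: data[k] for k in self._value_data}
    policy_terms = self.learn_policy(**policy_inputs)   # policy update only
    value_terms = self.learn_value(**value_inputs)      # value update only
    return {**policy_terms, **value_terms}              # assumed: both return stat dicts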
def _build_learn(self, env):
    # Explicitly instantiate tf.function to avoid unintended retracing
    TensorSpecs = dict(
        obs=(env.obs_shape, env.obs_dtype, 'obs'),
        action=(env.action_shape, env.action_dtype, 'action'),
        value=((), tf.float32, 'value'),
        traj_ret=((), tf.float32, 'traj_ret'),
        advantage=((), tf.float32, 'advantage'),
        logpi=((), tf.float32, 'logpi'),
    )
    self.learn = build(self._learn, TensorSpecs, batch_size=self._batch_size)

    TensorSpecs = dict(
        obs=(env.obs_shape, env.obs_dtype, 'obs'),
        logits=((env.action_dim, ), tf.float32, 'logits'),
        value=((), tf.float32, 'value'),
        traj_ret=((), tf.float32, 'traj_ret'),
    )
    self.aux_learn = build(self._aux_learn, TensorSpecs, batch_size=self._aux_batch_size)
def _build_learn(self, env):
    # Explicitly instantiate tf.function to avoid unintended retracing
    norm_obs_shape = env.obs_shape[:-1] + (1, )
    TensorSpecs = dict(
        obs=(env.obs_shape, env.obs_dtype, 'obs'),
        obs_norm=(norm_obs_shape, tf.float32, 'obs_norm'),
        action=(env.action_shape, env.action_dtype, 'action'),
        traj_ret_int=((), tf.float32, 'traj_ret_int'),
        traj_ret_ext=((), tf.float32, 'traj_ret_ext'),
        value_int=((), tf.float32, 'value_int'),
        value_ext=((), tf.float32, 'value_ext'),
        advantage=((), tf.float32, 'advantage'),
        logpi=((), tf.float32, 'logpi'),
    )
    self.learn = build(self._learn, TensorSpecs)
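# Illustration of `norm_obs_shape`: the slice keeps the spatial dimensions and
# collapses the channel axis to 1, since RND's predictor/target networks
# typically see a single normalized frame. The (84, 84, 4) observation shape
# below is an illustrative assumption, not taken from the repo.
obs_shape = (84, 84, 4)
norm_obs_shape = obs_shape[:-1] + (1, )
assert norm_obs_shape == (84, 84, 1)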
def _build_learn(self, env):
    # Explicitly instantiate tf.function to initialize variables
    TensorSpecs = dict(
        obs=(env.obs_shape, env.obs_dtype, 'obs'),
        action=((env.action_dim, ), tf.float32, 'action'),
        reward=((), tf.float32, 'reward'),
        next_obs=(env.obs_shape, env.obs_dtype, 'next_obs'),
        discount=((), tf.float32, 'discount'),
    )
    if self._is_per and getattr(self, '_use_is_ratio', self._is_per):
        TensorSpecs['IS_ratio'] = ((), tf.float32, 'IS_ratio')
    if self._n_steps > 1:
        TensorSpecs['steps'] = ((), tf.float32, 'steps')
    self.learn = build(self._learn, TensorSpecs, batch_size=self._batch_size)
def _build_learn(self, env):
    # The time dimension must be explicitly specified here;
    # otherwise, an InaccessibleTensorError arises when unrolling the RSSM
    TensorSpecs = dict(
        obs=((self._sample_size, *self._obs_shape), self._dtype, 'obs'),
        prev_action=((self._sample_size, self._action_dim), self._dtype, 'prev_action'),
        reward=((self._sample_size, ), self._dtype, 'reward'),
        discount=((self._sample_size, ), self._dtype, 'discount'),
        log_images=(None, tf.bool, 'log_images'),
    )
    if self._store_state:
        state_size = self.rssm.state_size
        TensorSpecs['state'] = RSSMState(*[((sz, ), self._dtype, name)
            for name, sz in zip(RSSMState._fields, state_size)])
    self.learn = build(self._learn, TensorSpecs, batch_size=self._batch_size)
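# RSSMState is assumed to be a namedtuple whose fields pair with
# `self.rssm.state_size`, the per-component latent sizes of the RSSM.
# A minimal stand-in, using field names common in Dreamer-style RSSMs
# (both the names and the sizes mentioned below are assumptions):
import collections

RSSMState = collections.namedtuple('RSSMState', ('mean', 'std', 'stoch', 'deter'))

# With, e.g., state_size = RSSMState(mean=30, std=30, stoch=30, deter=200),
# the zip in `_build_learn` above yields one ((size, ), dtype, name) spec per
# field, giving `build` a namedtuple of specs that mirrors the RNN state.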
def _build_learn(self, env):
    # Explicitly instantiate tf.function to avoid unintended retracing
    TensorSpecs = dict(
        obs=((self._sample_size + 1, self._n_agents, *env.obs_shape),
            env.obs_dtype, 'obs'),
        global_state=((self._sample_size + 1, *env.shared_state_shape),
            env.shared_state_dtype, 'global_state'),
        action_mask=((self._sample_size + 1, self._n_agents, env.action_dim),
            tf.bool, 'action_mask'),
        episodic_mask=((self._sample_size, ), tf.float32, 'episodic_mask'),
        action=((self._sample_size, self._n_agents, env.action_dim),
            tf.float32, 'action'),
        reward=((self._sample_size, ), tf.float32, 'reward'),
        discount=((self._sample_size, ), tf.float32, 'discount'),
    )
    self.learn = build(self._learn, TensorSpecs, batch_size=self._batch_size)