def __init__(self,
             env_specs,
             policy,
             qf,
             replay_buffer,
             policy_optimizer=tf.optimizers.Adam(),
             qf_optimizer=tf.optimizers.Adam(learning_rate=0.1),
             exploration_strategy=None,
             exploration_interval=10,
             target_update_tau=0.01,
             target_update_period=10,
             td_errors_loss_fn=None,
             gamma=0.99,
             reward_scale=1.0,
             gradient_clipping=None,
             train_sequence_length=None,
             name='Bilevel_leader',
             agent_id=-1):
    # Record the constructor arguments so Serializable can clone / pickle this agent.
    self._Serializable__initialize(locals())

    self._agent_id = agent_id
    self._env_specs = env_specs

    # In the multi-agent case (agent_id >= 0), pick out this agent's own
    # observation/action spaces; otherwise use the env spaces directly.
    if self._agent_id >= 0:
        observation_space = self._env_specs.observation_space[self._agent_id]
        action_space = self._env_specs.action_space[self._agent_id]
    else:
        observation_space = self._env_specs.observation_space
        action_space = self._env_specs.action_space

    self._exploration_strategy = exploration_strategy

    self._policy_optimizer = policy_optimizer
    self._qf_optimizer = qf_optimizer

    # The target Q-network starts as a clone of the online Q-network;
    # target_update_tau / target_update_period control how it tracks it.
    self._target_qf = Serializable.clone(
        qf, name='target_qf_agent_{}'.format(self._agent_id))
    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period

    # Default TD-error loss is the Huber loss. NOTE: instantiated here so that
    # _td_errors_loss_fn is directly callable as loss_fn(targets, predictions).
    self._td_errors_loss_fn = td_errors_loss_fn or tf.losses.Huber()

    self._gamma = gamma
    self._reward_scale = reward_scale
    self._gradient_clipping = gradient_clipping
    self._train_step = 0
    self._exploration_interval = exploration_interval
    self._exploration_status = False

    # Keys this agent expects each batch sampled from the replay buffer to contain.
    self.required_experiences = [
        'observation', 'actions', 'rewards', 'next_observations',
        'opponent_actions', 'target_actions'
    ]

    self._observation_space = observation_space
    self._action_space = action_space
    self._policy = policy
    self._qf = qf
    self._replay_buffer = replay_buffer
    self._name = name
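# Hedged sketch (not from the original source): one plausible way the
# `target_update_tau` / `target_update_period` hyperparameters stored above are
# typically consumed, i.e. a periodic Polyak (soft) update of the target
# Q-network. The method name `_example_soft_target_update` is hypothetical, and
# it assumes `self._qf` / `self._target_qf` expose Keras-style
# `trainable_variables`.
def _example_soft_target_update(self):
    if self._train_step % self._target_update_period == 0:
        for target_var, source_var in zip(self._target_qf.trainable_variables,
                                          self._qf.trainable_variables):
            target_var.assign((1.0 - self._target_update_tau) * target_var
                              + self._target_update_tau * source_var)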
def __setstate__(self, state):
    # Restore Serializable state first, then reload the pickled network weights.
    Serializable.__setstate__(self, state)
    self.set_weights(state['pickled_weights'])
def __getstate__(self):
    # Extend the Serializable state with the current network weights so the
    # agent can be fully restored after unpickling.
    state = Serializable.__getstate__(self)
    state['pickled_weights'] = self.get_weights()
    return state
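# Hedged usage sketch (not from the original source): the __getstate__ /
# __setstate__ pair above makes the agent picklable with its network weights
# embedded, assuming a constructed `agent` and working get_weights() /
# set_weights() methods:
#
#     import pickle
#     blob = pickle.dumps(agent)        # weights captured via get_weights()
#     restored = pickle.loads(blob)     # weights re-applied via set_weights()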