def update(self, _, a, r, s1):
    if not self._stop:
        # Cast everything to single precision before handing it to the agent.
        a = double_to_single_precision(a)
        r = double_to_single_precision(r)
        s1 = double_to_single_precision(np.array(s1))
        d = double_to_single_precision(1.0)
        # Package the transition as a dm_env timestep, feed it to the agent,
        # and run one learner update.
        timestep = dm_env.transition(reward=r, observation=s1, discount=d)
        self.agent.observe(a, timestep)
        self.agent.update()
        # Log values.
        values = {
            'step': self._obs_counter,
            'action': a,
            'reward': r,
        }
        self._logger.write(values)

# Multi-phase variant: the action is a vector with one entry per phase,
# so each component is logged under its own key.
def update(self, _, a, r, s1):
    if not self._stop:
        a = double_to_single_precision(np.array(a))
        r = double_to_single_precision(r)
        s1 = double_to_single_precision(np.array(s1))
        d = double_to_single_precision(1.0)
        timestep = dm_env.transition(reward=r, observation=s1, discount=d)
        self.agent.observe(a, timestep)
        self.agent.update()
        # Log values.
        values = {
            'step': self._obs_counter,
            'reward': r,
        }
        for i in range(self._params.num_phases):
            values[f'action_p{i}'] = a[i]
        self._logger.write(values)
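
# NOTE: `double_to_single_precision` is called throughout this excerpt but not
# defined in it. A minimal sketch, assuming the helper does nothing more than
# cast scalars and array-likes to float32 (the dtype every call site above
# appears to expect); the actual implementation may differ.
import numpy as np

def double_to_single_precision(x):
    """Cast a scalar or array-like to single precision (float32)."""
    return np.asarray(x, dtype=np.float32)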

def act(self, s):
    s = double_to_single_precision(np.array(s))
    # Make first observation.
    if self._obs_counter == 0:
        t_1 = dm_env.restart(s)
        self.agent.observe_first(t_1)
    # Select action.
    if self._stop:
        # Evaluation mode: act deterministically instead of exploring.
        action = self.agent.deterministic_action(s)
    else:
        action = self.agent.select_action(s)
    self._obs_counter += 1
    return int(action)
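
# A hedged sketch of how a driver loop might call the two methods above.
# `env`, its reset/step API, `num_steps`, and `wrapper` (an instance of the
# class these methods belong to) are illustrative assumptions, not part of
# the excerpt.
s = env.reset()
for _ in range(num_steps):
    a = wrapper.act(s)             # issues observe_first() on the first call
    s1, r = env.step(a)            # environment advances one control step
    wrapper.update(s, a, r, s1)    # first argument (old state) is ignored
    s = s1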