def OnEpisodeFinishedCallback(self, env: Environment, brain: Brain,
                              episode_idx: int, num_of_episodes: int,
                              episode_reward: float, steps: int):
  """Appends the current value of every tracked state-action pair to its trace."""
  values = brain.GetValues(self._states)
  for idx, v in enumerate(values):
    for a in self._actions:
      self._value_traces[a][idx].append(v[a])
def _protected_ProcessTransition(
    self,
    brain: base.Brain,
    transition: base.Transition,
    step_idx: int,
) -> None:
  brain.UpdateFromTransitions([transition])
def _protected_ProcessTransition( self, brain: base.Brain, transition: base.Transition, step_idx: int, ) -> None: """Processes a new transition; e.g. to train the QFunction.""" brain.UpdateFromTransitions([transition])
def _protected_ProcessTransition(
    self,
    brain: base.Brain,
    transition: base.Transition,
    step_idx: int,
) -> None:
  """Stores the transition, then trains on a sampled batch every n steps."""
  self._experience.AddTransition(transition)
  if step_idx % self._train_every_n_steps == 0:
    brain.UpdateFromTransitions(
        self._experience.Sample(self._experience_sample_batch_size))
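# For context, a minimal replay-buffer sketch matching the AddTransition /
# Sample interface used above might look like the class below. The capacity
# limit and uniform sampling are assumptions for illustration, not the
# library's actual Experience implementation.
import random


class _ReplayBufferSketch:
  """Hypothetical minimal experience replay buffer."""

  def __init__(self, capacity: int = 10000):
    self._capacity = capacity
    self._buffer = []

  def AddTransition(self, transition) -> None:
    # Drop the oldest transition once the buffer is full.
    if len(self._buffer) >= self._capacity:
      self._buffer.pop(0)
    self._buffer.append(transition)

  def Sample(self, batch_size: int):
    # Uniform random sample; never asks for more items than are stored.
    return random.sample(self._buffer, min(batch_size, len(self._buffer)))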
def Decide(
    self,
    env: base.Environment,
    brain: base.Brain,
    state: base.State,
    episode_idx: int,
    num_of_episodes: int,
) -> base.Action:
  """Samples an action according to the probabilities returned by the brain."""
  return env.GetActionFromChoice(
      numpy.random.choice(
          env.GetActionSpaceSize(), p=brain.GetValues(state)[0]))
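# As a small illustration of the sampling step in Decide above, the snippet
# below draws an action index with numpy.random.choice from a made-up
# probability vector. The probabilities are assumptions for illustration,
# not the output of any real brain.
import numpy

action_probabilities = numpy.array([0.2, 0.5, 0.3])
# Each index is drawn with probability equal to its entry, mirroring
# numpy.random.choice(env.GetActionSpaceSize(), p=brain.GetValues(state)[0]).
choice = numpy.random.choice(len(action_probabilities), p=action_probabilities)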
def _protected_ProcessTransition( self, brain: base.Brain, transition: base.Transition, step_idx: int, ) -> None: """Processes a new transition; e.g. to train the QFunction.""" self._memory.append(transition) if len(self._memory) == self._batch_size: brain.UpdateFromTransitions(self._memory) self._memory = []
def Decide(
    self,
    env: base.Environment,
    brain: base.Brain,
    state: base.State,
    episode_idx: int,
    num_of_episodes: int,
) -> base.Action:
  """Takes the greedy action: the argmax of the brain's values for the state."""
  values = brain.GetValues(state)
  choice = int(numpy.argmax(values))
  logging.vlog(
      20, 'making greedy decision for state %s using values: %s; choice: %d',
      state, values, choice)
  return env.GetActionFromChoice(choice)
def OnEpisodeFinishedCallback(self, env: Environment, brain: Brain,
                              episode_idx: int, num_of_episodes: int,
                              episode_reward: float, steps: int):
  """Saves the brain whenever the running average of reward (or steps) improves."""
  if self._use_rewards:
    self._values.append(episode_reward)
  else:
    self._values.append(steps)
  new_value = numpy.mean(
      self._values[-self._average_over_num_of_episodes:])
  if new_value <= self._best_value:
    return
  logging.vlog(6, 'saving model for new best value: %s', new_value)
  self._best_value = new_value
  brain.Save(self._save_filepath)
def _protected_ProcessTransition(
    self,
    brain: base.Brain,
    transition: base.Transition,
    step_idx: int,
) -> None:
  """Accumulates transitions and trains the brain with n-step returns."""
  train_transitions = []
  self._memory.append(transition)
  # Build an n-step transition from the transitions currently in memory.
  train_transitions.append(self._GetNStepTransition())
  # Cap the memory window at n transitions.
  if len(self._memory) >= self._n_step_return:
    self._memory.pop(0)
  # At the end of an episode (no successor state), flush the remaining
  # shorter-than-n-step transitions so their returns are also used.
  if transition.sp is None:
    while self._memory:
      train_transitions.append(self._GetNStepTransition())
      self._memory.pop(0)
  brain.UpdateFromTransitions(train_transitions)
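# The helper _GetNStepTransition is not shown here. One plausible reading,
# sketched below with hypothetical stand-ins for base.Transition and the
# discount factor, is that it collapses the buffered one-step transitions
# into a single transition whose reward is the discounted sum of the buffered
# rewards and whose successor state is the newest one; the discounting
# convention in the real library may differ.
import dataclasses
from typing import Any, List, Optional


@dataclasses.dataclass
class _TransitionSketch:
  """Hypothetical stand-in for base.Transition with fields s, a, r, sp."""
  s: Any
  a: Any
  r: float
  sp: Optional[Any]


def _GetNStepTransitionSketch(
    memory: List[_TransitionSketch], gamma: float) -> _TransitionSketch:
  # Discounted sum of the buffered rewards: r_0 + gamma * r_1 + ...
  reward = 0.0
  for k, t in enumerate(memory):
    reward += (gamma ** k) * t.r
  # Start from the oldest state/action, end at the newest successor state.
  return _TransitionSketch(
      s=memory[0].s, a=memory[0].a, r=reward, sp=memory[-1].sp)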