def OnEpisodeFinishedCallback(self, env: Environment, brain: Brain,
                              episode_idx: int, num_of_episodes: int,
                              episode_reward: float, steps: int):
    """Records the brain's value estimate for each monitored (state, action)."""
    values = brain.GetValues(self._states)
    for idx, v in enumerate(values):
        for a in self._actions:
            self._value_traces[a][idx].append(v[a])
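The attributes this callback reads (`_states`, `_actions`, `_value_traces`) are not shown. A minimal sketch of how they might be set up, with the structure inferred from the indexing above (purely an assumption, not part of the source):

def __init__(self, states, actions):
    # Hypothetical setup inferred from the callback above.
    self._states = states    # States whose value estimates are tracked.
    self._actions = actions  # Actions to trace for each state.
    # One growing list of values per (action, state index) pair.
    self._value_traces = {a: [[] for _ in states] for a in actions}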
Example No. 2
def _protected_ProcessTransition(
    self,
    brain: base.Brain,
    transition: base.Transition,
    step_idx: int,
) -> None:
    # Train the brain on every single transition as it arrives.
    brain.UpdateFromTransitions([transition])
Example No. 3
def _protected_ProcessTransition(
    self,
    brain: base.Brain,
    transition: base.Transition,
    step_idx: int,
) -> None:
    """Processes a new transition; e.g. to train the QFunction."""
    brain.UpdateFromTransitions([transition])
Example No. 4
def _protected_ProcessTransition(
    self,
    brain: base.Brain,
    transition: base.Transition,
    step_idx: int,
) -> None:
    """Processes a new transition; e.g. to train the QFunction."""
    # Store every transition, but only train on a random sample from the
    # replay buffer once every `_train_every_n_steps` steps.
    self._experience.AddTransition(transition)
    if step_idx % self._train_every_n_steps == 0:
        brain.UpdateFromTransitions(
            self._experience.Sample(self._experience_sample_batch_size))
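The `Experience` object used above is not shown. A minimal sketch of a replay buffer exposing the same `AddTransition`/`Sample` interface; the fixed capacity and uniform sampling are assumptions:

import collections
import random

class Experience:
    """Hypothetical fixed-capacity replay buffer (uniform sampling)."""

    def __init__(self, capacity: int = 10000):
        self._buffer = collections.deque(maxlen=capacity)

    def AddTransition(self, transition) -> None:
        self._buffer.append(transition)

    def Sample(self, batch_size: int):
        # Sample without replacement; use the whole buffer if it is smaller.
        return random.sample(list(self._buffer),
                             min(batch_size, len(self._buffer)))

Replaying randomly sampled transitions breaks the correlation between consecutive steps, which is why training happens on a sample rather than on the transitions in arrival order.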
Example No. 5
def Decide(
    self,
    env: base.Environment,
    brain: base.Brain,
    state: base.State,
    episode_idx: int,
    num_of_episodes: int,
) -> base.Action:
    """Samples an action from the probability distribution the brain outputs."""
    return env.GetActionFromChoice(
        numpy.random.choice(env.GetActionSpaceSize(),
                            p=brain.GetValues(state)[0]))
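This policy treats `brain.GetValues(state)[0]` as a probability distribution over actions (as a policy-network brain would output) and samples from it, so better actions are chosen more often without the choice being deterministic. A standalone illustration of the sampling step:

import numpy

probs = numpy.array([0.1, 0.6, 0.3])  # Toy action probabilities; must sum to 1.
choice = numpy.random.choice(len(probs), p=probs)  # Picks 1 about 60% of the time.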
Example No. 6
def _protected_ProcessTransition(
    self,
    brain: base.Brain,
    transition: base.Transition,
    step_idx: int,
) -> None:
    """Processes a new transition; e.g. to train the QFunction."""
    # Accumulate transitions and train on them as one batch every
    # `_batch_size` steps, then start collecting a fresh batch.
    self._memory.append(transition)
    if len(self._memory) == self._batch_size:
        brain.UpdateFromTransitions(self._memory)
        self._memory = []
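A quick sketch of the update cadence this produces, using a hypothetical DummyBrain: one batched update per `_batch_size` transitions, after which the buffer starts over empty:

class DummyBrain:
    def UpdateFromTransitions(self, transitions):
        print('update with %d transitions' % len(transitions))

brain, memory, batch_size = DummyBrain(), [], 3
for t in range(7):
    memory.append(t)  # Stand-in for a real Transition.
    if len(memory) == batch_size:
        brain.UpdateFromTransitions(memory)
        memory = []
# Prints "update with 3 transitions" twice; the 7th transition waits in memory.

Unlike the replay-buffer variant in Example No. 4, this trains on consecutive transitions exactly once and then discards them.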
Example No. 7
def Decide(
    self,
    env: base.Environment,
    brain: base.Brain,
    state: base.State,
    episode_idx: int,
    num_of_episodes: int,
) -> base.Action:
    """Always chooses the action with the highest estimated value (greedy)."""
    values = brain.GetValues(state)
    choice = int(numpy.argmax(values))
    logging.vlog(
        20,
        'making greedy decision for state %s using values: %s; choice: %d',
        state, values, choice)
    return env.GetActionFromChoice(choice)
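The `logging.vlog(20, ...)` call appears to follow absl-style verbose logging: the message is emitted only when the verbosity level is at least 20, so this per-step trace stays silent by default. The decision itself reduces to an argmax over the value vector:

import numpy

values = numpy.array([0.2, 1.5, -0.3])  # Toy per-action value estimates.
choice = int(numpy.argmax(values))      # Index of the highest value.
assert choice == 1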
Example No. 8
def OnEpisodeFinishedCallback(self, env: Environment, brain: Brain,
                              episode_idx: int, num_of_episodes: int,
                              episode_reward: float, steps: int):
    """Saves the model whenever the running average reaches a new best."""
    if self._use_rewards:
        self._values.append(episode_reward)
    else:
        self._values.append(steps)

    # Average over the most recent episodes to smooth out noise.
    new_value = numpy.mean(
        self._values[-self._average_over_num_of_episodes:])
    # Strict comparison: merely tying the best value does not trigger a save.
    if new_value <= self._best_value:
        return

    logging.vlog(6, 'saving model for new best value: %s', new_value)
    self._best_value = new_value
    brain.Save(self._save_filepath)
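For the comparison above to work on the first episode, `_best_value` needs to start below any achievable score. A hypothetical initialization, with attribute names inferred from the callback (not confirmed by the source):

import numpy

def __init__(self, save_filepath, use_rewards=True,
             average_over_num_of_episodes=10):
    # Hypothetical setup for the best-model saver above.
    self._save_filepath = save_filepath
    self._use_rewards = use_rewards
    self._average_over_num_of_episodes = average_over_num_of_episodes
    self._values = []              # Per-episode reward (or step count) history.
    self._best_value = -numpy.inf  # Any first running average beats this.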
Example No. 9
def _protected_ProcessTransition(
    self,
    brain: base.Brain,
    transition: base.Transition,
    step_idx: int,
) -> None:
    """Processes a new transition; e.g. to train the QFunction."""
    # Maintain a sliding window of recent transitions and train on the
    # n-step transition derived from it at every step.
    train_transitions = []
    self._memory.append(transition)
    train_transitions.append(self._GetNStepTransition())

    # Cap the window so it never holds more than `_n_step_return` items.
    if len(self._memory) >= self._n_step_return:
        self._memory.pop(0)

    # At the end of an episode (sp is None), flush the remaining window so
    # the last transitions of the episode are also trained on.
    if transition.sp is None:
        while self._memory:
            train_transitions.append(self._GetNStepTransition())
            self._memory.pop(0)
    brain.UpdateFromTransitions(train_transitions)
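`_GetNStepTransition` is not shown. Under standard n-step return bootstrapping it would collapse the buffered window into a single transition whose reward is the discounted sum of the window's rewards; a sketch under that assumption (the fields `s`, `a`, `r` and the `_gamma` attribute are assumptions; only `sp` is confirmed by the code above):

def _GetNStepTransition(self):
    # Hypothetical: assumes base.Transition has fields s (state), a (action),
    # r (reward), sp (successor state, None at episode end) and that the
    # agent stores a discount factor self._gamma.
    first, last = self._memory[0], self._memory[-1]
    n_step_reward = sum(
        (self._gamma ** i) * t.r for i, t in enumerate(self._memory))
    return base.Transition(s=first.s, a=first.a, r=n_step_reward, sp=last.sp)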