def _swap_snapshots(self) -> None:
    """
    Swaps the appropriate weight to the policy and pushes it to respective policy queues.

    For every team other than the current learning team a set of weights is chosen:
    with probability ``1 - play_against_latest_model_ratio`` a uniformly random entry
    of ``policy_snapshots``, otherwise ``current_policy_snapshot``. If no past snapshot
    has been stored yet, the current snapshot is used instead of failing. The chosen
    weights are loaded into each of that team's policies, and every policy is then put
    on its policy queue.

    ``current_opponent`` is set to the index of the chosen past snapshot, or to -1
    when the current snapshot is used. It is overwritten for each opposing team, so
    after the call it reflects the last team that was iterated.
    """
    for team_id in self._team_to_name_to_policy_queue:
        if team_id == self._learning_team:
            continue

        # Always draw, so the random stream is consumed once per opposing team
        # regardless of whether a past snapshot is available.
        wants_past_snapshot = np.random.uniform() < (
            1 - self.play_against_latest_model_ratio
        )
        # np.random.randint(0) raises ValueError, so only sample from a non-empty pool.
        if wants_past_snapshot and len(self.policy_snapshots) > 0:
            snapshot_label = np.random.randint(len(self.policy_snapshots))
            snapshot = self.policy_snapshots[snapshot_label]
            self.current_opponent = snapshot_label
        else:
            snapshot_label = "current"
            snapshot = self.current_policy_snapshot
            self.current_opponent = -1

        name_to_policy_queue = self._team_to_name_to_policy_queue[team_id]
        for brain_name in name_to_policy_queue:
            behavior_id = create_name_behavior_id(brain_name, team_id)
            policy = self.get_policy(behavior_id)
            policy.load_weights(snapshot[brain_name])
            name_to_policy_queue[brain_name].put(policy)
            logger.debug(
                "Step {}: Swapping snapshot {} to id {} with team {} learning".format(
                    self.ghost_step, snapshot_label, behavior_id, self._learning_team
                )
            )
def advance(self) -> None:
    """
    Steps the trainer, passing trajectories to wrapped trainer and calling trainer advance.

    In order, this:
      1. drains every trajectory queue; trajectories of the learning team are put on
         the wrapped trainer's internal queue and processed, while trajectories of
         any other team only add their length to ``ghost_step``;
      2. advances the wrapped trainer;
      3. asks the controller to change the training team once more than
         ``steps_to_train_team`` steps have passed since the last change;
      4. records the latest policy weights and publishes them to the queues of the
         next learning team;
      5. saves a snapshot and/or swaps opponent snapshots when their respective
         step thresholds are exceeded (or when the learning team changed).
    """
    for incoming_queue in self.trajectory_queues:
        parsed_id = self._name_to_parsed_behavior_id[incoming_queue.behavior_id]
        from_learning_team = parsed_id.team_id == self._learning_team
        # With a future multiagent trainer, this will be indexed by 'role'
        wrapped_queue = (
            self._internal_trajectory_queues[parsed_id.brain_name]
            if from_learning_team
            else None
        )
        try:
            # Take at most maxlen items: even if the queue fills faster than it is
            # emptied, the trajectories that get consumed stay on-policy.
            for _ in range(incoming_queue.maxlen):
                trajectory = incoming_queue.get_nowait()
                if from_learning_team:
                    # Hand over to the wrapped trainer's queue.
                    wrapped_queue.put(trajectory)
                    self._process_trajectory(trajectory)
                else:
                    # Non-learning policy: discard the trajectory, only count
                    # its steps as ghost steps.
                    self.ghost_step += len(trajectory.steps)
        except AgentManagerQueue.Empty:
            pass

    self.next_summary_step = self.trainer.next_summary_step
    self.trainer.advance()

    if self.get_step - self.last_team_change > self.steps_to_train_team:
        self.controller.change_training_team(self.get_step)
        self.last_team_change = self.get_step

    next_learning_team = self.controller.get_learning_team

    # The loop below covers three situations:
    #   1. This GhostTrainer manages the current learning team. If the learning team
    #      changes, the new policy is pushed to the new learning agent's policy queue
    #      when that policy is managed here; otherwise only the current snapshot is
    #      recorded.
    #   2. Another GhostTrainer manages the current learning team. If the learning
    #      team changes to one managed here, the current snapshot is pushed into the
    #      matching queue; otherwise the loop keeps skipping and _swap_snapshots
    #      keeps pushing fixed snapshots.
    #   3. No team change. The policy keeps being pushed into the learning team's
    #      queue (or nothing is pushed if that team is not managed here).
    for brain_name in self._internal_policy_queues:
        updated_policy_queue = self._internal_policy_queues[brain_name]
        try:
            latest_policy = cast(TFPolicy, updated_policy_queue.get_nowait())
            self.current_policy_snapshot[brain_name] = latest_policy.get_weights()
        except AgentManagerQueue.Empty:
            pass

        if next_learning_team not in self._team_to_name_to_policy_queue:
            continue
        name_to_policy_queue = self._team_to_name_to_policy_queue[next_learning_team]
        if brain_name not in name_to_policy_queue:
            continue

        behavior_id = create_name_behavior_id(brain_name, next_learning_team)
        team_policy = self.get_policy(behavior_id)
        team_policy.load_weights(self.current_policy_snapshot[brain_name])
        name_to_policy_queue[brain_name].put(team_policy)

    # Saving follows the trainer step counter and swapping follows the ghost step
    # counter: a snapshot should only be saved while the policy is learning.
    if self.get_step - self.last_save > self.steps_between_save:
        self._save_snapshot()
        self.last_save = self.get_step

    team_changed = self._learning_team != next_learning_team
    if team_changed or self.ghost_step - self.last_swap > self.steps_between_swap:
        self._learning_team = next_learning_team
        self._swap_snapshots()
        self.last_swap = self.ghost_step