def test_copy(self):
    s = SampleBatch({
        "a": np.array([1, 2, 3, 2, 3, 4]),
        "b": {
            "c": np.array([4, 5, 6, 5, 6, 7])
        },
        "seq_lens": [2, 3, 1],
        "state_in_0": [1.0, 3.0, 4.0],
    })

    # Deep copy: mutating the copy must not affect the original.
    s_copy = s.copy(shallow=False)
    s_copy["a"][0] = 100
    s_copy["b"]["c"][0] = 200
    s_copy["seq_lens"][0] = 3
    s_copy["seq_lens"][1] = 2
    s_copy["state_in_0"][0] = 400.0
    self.assertNotEqual(s["a"][0], s_copy["a"][0])
    self.assertNotEqual(s["b"]["c"][0], s_copy["b"]["c"][0])
    self.assertNotEqual(s["seq_lens"][0], s_copy["seq_lens"][0])
    self.assertNotEqual(s["seq_lens"][1], s_copy["seq_lens"][1])
    self.assertNotEqual(s["state_in_0"][0], s_copy["state_in_0"][0])

    # Shallow copy: the copy shares the underlying arrays, so mutations
    # are visible in the original as well.
    s_copy = s.copy(shallow=True)
    s_copy["a"][0] = 100
    s_copy["b"]["c"][0] = 200
    s_copy["seq_lens"][0] = 3
    s_copy["seq_lens"][1] = 2
    s_copy["state_in_0"][0] = 400.0
    self.assertEqual(s["a"][0], s_copy["a"][0])
    self.assertEqual(s["b"]["c"][0], s_copy["b"]["c"][0])
    self.assertEqual(s["seq_lens"][0], s_copy["seq_lens"][0])
    self.assertEqual(s["seq_lens"][1], s_copy["seq_lens"][1])
    self.assertEqual(s["state_in_0"][0], s_copy["state_in_0"][0])

def _learn_on_batch(self, samples: SampleBatch):
    (
        policies_idx_to_train,
        policies_to_train,
    ) = self._get_policies_idx_to_train_with_current_batch()
    if len(policies_idx_to_train) == 0:
        return self.learner_stats

    logging.debug(f"policies_idx_to_train {policies_idx_to_train}")
    self._init_log_learn_on_batch(policies_idx_to_train)
    for policy_n, algo in zip(policies_idx_to_train, policies_to_train):
        samples_copy = samples.copy()
        samples_copy = self._modify_batch_for_policy(policy_n, samples_copy)
        # Only train on the batch if more than half of its transitions
        # remain after filtering for this policy.
        if (len(samples_copy[samples_copy.ACTIONS]) >
                len(samples[samples.ACTIONS]) // 2):
            self._to_log[f"learn_on_batch_algo{policy_n}"] = len(
                samples_copy[samples_copy.ACTIONS])
            self.learner_stats["learner_stats"][
                f"algo{policy_n}"] = algo.learn_on_batch(samples_copy)
        else:
            self.learner_stats["learner_stats"][f"algo{policy_n}"] = {}
    return self.learner_stats

def from_batch(self, train_batch: SampleBatch,
               is_training: bool = True) -> (TensorType, List[TensorType]):
    """Convenience function that calls this model with a tensor batch.

    All this does is unpack the tensor batch to call this model with the
    right input dict, state, and seq len arguments.
    """
    input_dict = train_batch.copy()
    input_dict["is_training"] = is_training
    states = []
    i = 0
    while "state_in_{}".format(i) in input_dict:
        states.append(input_dict["state_in_{}".format(i)])
        i += 1
    ret = self.__call__(input_dict, states, input_dict.get("seq_lens"))
    return ret

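# Usage sketch (illustrative; `my_model` is a hypothetical ModelV2 subclass
# with two RNN state columns): `from_batch` unpacks the batch into
# (input_dict, [state_in_0, state_in_1], seq_lens) and calls the model:
#
#   batch = SampleBatch({
#       "obs": np.zeros((6, 4), dtype=np.float32),
#       "state_in_0": np.zeros((3, 8), dtype=np.float32),
#       "state_in_1": np.zeros((3, 8), dtype=np.float32),
#       "seq_lens": np.array([2, 3, 1]),
#   })
#   logits, state_out = my_model.from_batch(batch, is_training=False)
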
def learn_on_batch(self, samples: SampleBatch):
    learner_stats = {"learner_stats": {}}

    # Update the LR used in the optimizer.
    self.optimizer()

    for policy_n, algo in enumerate(self.algorithms):
        samples_copy = samples.copy()
        samples_copy = self._modify_batch_for_policy(policy_n, samples_copy)
        if len(samples_copy[samples_copy.ACTIONS]) > 0:
            learner_stats["learner_stats"][
                f"learner_stats_algo{policy_n}"] = algo.learn_on_batch(
                    samples_copy)
        # self.to_log[f'algo{policy_n}_cur_lr'] = algo.cur_lr
        # For debugging purposes, log the true lr (to be compared to
        # algo.cur_lr):
        # for j, opt in enumerate(algo._optimizers):
        #     self.to_log[f"algo_{policy_n}_{j}_lr"] = \
        #         [p["lr"] for p in opt.param_groups][0]
    return learner_stats

def test_pg_loss_functions(self):
    """Tests the PG loss function math."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    # Fake CartPole episode of n time steps.
    train_batch = SampleBatch({
        SampleBatch.OBS: np.array([[0.1, 0.2, 0.3, 0.4],
                                   [0.5, 0.6, 0.7, 0.8],
                                   [0.9, 1.0, 1.1, 1.2]]),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, 1.0, 1.0]),
        SampleBatch.DONES: np.array([False, False, True]),
        SampleBatch.EPS_ID: np.array([1234, 1234, 1234]),
        SampleBatch.AGENT_INDEX: np.array([0, 0, 0]),
    })

    for fw, sess in framework_iterator(config, session=True):
        dist_cls = Categorical if fw != "torch" else TorchCategorical
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        vars = policy.model.trainable_variables()
        if sess:
            vars = policy.get_session().run(vars)

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
        # [2.9701, 1.99, 1.0]
        train_batch_ = pg.post_process_advantages(policy, train_batch.copy())
        if fw == "torch":
            train_batch_ = policy._lazy_tensor_dict(train_batch_)

        # Check Advantage values.
        check(train_batch_[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

        # Actual loss results.
        if sess:
            results = policy.get_session().run(
                policy._loss,
                feed_dict=policy._get_loss_inputs_dict(
                    train_batch_, shuffle=False))
        else:
            results = (pg.pg_tf_loss
                       if fw in ["tf2", "tfe"] else pg.pg_torch_loss)(
                           policy,
                           policy.model,
                           dist_class=dist_cls,
                           train_batch=train_batch_)

        # Calculate expected results.
        if fw != "torch":
            expected_logits = fc(
                fc(train_batch_[SampleBatch.OBS],
                   vars[0], vars[1], framework=fw),
                vars[2], vars[3], framework=fw)
        else:
            expected_logits = fc(
                fc(train_batch_[SampleBatch.OBS],
                   vars[2], vars[3], framework=fw),
                vars[0], vars[1], framework=fw)
        expected_logp = dist_cls(expected_logits, policy.model).logp(
            train_batch_[SampleBatch.ACTIONS])
        adv = train_batch_[Postprocessing.ADVANTAGES]
        if sess:
            expected_logp = sess.run(expected_logp)
        elif fw == "torch":
            expected_logp = expected_logp.detach().cpu().numpy()
            adv = adv.detach().cpu().numpy()
        else:
            expected_logp = expected_logp.numpy()
        expected_loss = -np.mean(expected_logp * adv)
        check(results, expected_loss, decimals=4)

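# Sanity-check sketch (illustrative; `_discounted_returns` is a hypothetical
# helper, assuming numpy as np as elsewhere in these tests) for the advantage
# values asserted above: with gamma=0.99 and rewards [1.0, 1.0, 1.0], the
# discounted returns-to-go computed backwards are [2.9701, 1.99, 1.0].
def _discounted_returns(rewards, gamma=0.99):
    returns, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    return list(reversed(returns))

assert np.allclose(_discounted_returns([1.0, 1.0, 1.0]), [2.9701, 1.99, 1.0])
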
def compute_gae_for_sample_batch(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:
    """Adds GAE (generalized advantage estimations) to a trajectory.

    The trajectory contains only data from one episode and from one agent.
    - If `config.batch_mode=truncate_episodes` (default), sample_batch may
      contain a truncated (at-the-end) episode, in case the
      `config.rollout_fragment_length` was reached by the sampler.
    - If `config.batch_mode=complete_episodes`, sample_batch will contain
      exactly one episode (no matter how long).
    New columns can be added to sample_batch and existing ones may be altered.

    Args:
        policy (Policy): The Policy used to generate the trajectory
            (`sample_batch`).
        sample_batch (SampleBatch): The SampleBatch to postprocess.
        other_agent_batches (Optional[Dict[PolicyID, SampleBatch]]): Optional
            dict of AgentIDs mapping to other agents' trajectory data (from
            the same episode). NOTE: The other agents use the same policy.
        episode (Optional[MultiAgentEpisode]): Optional multi-agent episode
            object in which the agents operated.

    Returns:
        SampleBatch: The postprocessed, modified SampleBatch (or a new one).
    """
    # The trajectory view API will populate the info dict with a
    # np.zeros((n,)) array in the first call; in that case the dtype will be
    # float32 and we have to ignore it. For regular calls, we extract the
    # rewards from the info dict into the samplebatch_infos_rewards dict,
    # which then holds the rewards for all agents as a dict.
    samplebatch_infos_rewards = {'0': sample_batch[SampleBatch.INFOS]}
    if not sample_batch[SampleBatch.INFOS].dtype == "float32":
        samplebatch_infos = SampleBatch.concat_samples([
            SampleBatch({k: [v] for k, v in s.items()})
            for s in sample_batch[SampleBatch.INFOS]
        ])
        samplebatch_infos_rewards = SampleBatch.concat_samples([
            SampleBatch({str(k): [v] for k, v in s.items()})
            for s in samplebatch_infos["rewards"]
        ])

    if not isinstance(policy.action_space, gym.spaces.tuple.Tuple):
        raise InvalidActionSpace("Expect tuple action space")

    # One SampleBatch per agent.
    batches = []
    for key, action_space in zip(samplebatch_infos_rewards.keys(),
                                 policy.action_space):
        i = int(key)
        sample_batch_agent = sample_batch.copy()
        sample_batch_agent[SampleBatch.REWARDS] = (
            samplebatch_infos_rewards[key])
        if isinstance(action_space, gym.spaces.box.Box):
            assert len(action_space.shape) == 1
            a_w = action_space.shape[0]
        elif isinstance(action_space, gym.spaces.discrete.Discrete):
            a_w = 1
        else:
            raise InvalidActionSpace(
                "Expect gym.spaces.box or gym.spaces.discrete action space")
        sample_batch_agent[SampleBatch.ACTIONS] = sample_batch[
            SampleBatch.ACTIONS][:, a_w * i:a_w * (i + 1)]
        sample_batch_agent[SampleBatch.VF_PREDS] = sample_batch[
            SampleBatch.VF_PREDS][:, i]

        # Trajectory is actually complete -> last r=0.0.
        if sample_batch[SampleBatch.DONES][-1]:
            last_r = 0.0
        # Trajectory has been truncated -> last r=VF estimate of last obs.
        else:
            # Input dict is provided to us automatically via the Model's
            # requirements. It's a single-timestep (last one in trajectory)
            # input_dict.
            # Create an input dict according to the Model's requirements.
            input_dict = policy.model.get_input_dict(
                sample_batch, index="last")
            all_values = policy._value(
                **input_dict, seq_lens=input_dict.seq_lens)
            last_r = all_values[i].item()

        # Adds the policy logits, VF preds, and advantages to the batch,
        # using GAE ("generalized advantage estimation") or not.
        batches.append(
            compute_advantages(
                sample_batch_agent,
                last_r,
                policy.config["gamma"],
                policy.config["lambda"],
                use_gae=policy.config["use_gae"],
                use_critic=policy.config.get("use_critic", True)))

    # Now take the original sample batch and overwrite the following columns
    # with a stack of the per-agent batches.
    for k in [
            SampleBatch.REWARDS,
            SampleBatch.VF_PREDS,
            Postprocessing.ADVANTAGES,
            Postprocessing.VALUE_TARGETS,
    ]:
        sample_batch[k] = np.stack([b[k] for b in batches], axis=-1)

    return sample_batch

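# Illustrative reference sketch (not from the original source) of what
# `compute_advantages` computes per agent when use_gae=True:
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
#   A_t = sum_k (gamma * lambda)^k * delta_{t+k}   (computed backwards)
# `_gae_reference` is a hypothetical helper, assuming numpy as np.
def _gae_reference(rewards, vf_preds, last_r, gamma=0.99, lam=0.95):
    values = np.append(vf_preds, last_r)
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    # Value targets are then advantages + vf_preds.
    return advantages
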
def train(self, batch: SampleBatch) -> TensorType:
    """Trains self.q_model using FQE loss on given batch.

    Args:
        batch: A SampleBatch of episodes to train on

    Returns:
        A list of losses for each training iteration
    """
    losses = []
    minibatch_size = self.minibatch_size or batch.count
    # Copy batch for shuffling.
    batch = batch.copy(shallow=True)
    for _ in range(self.n_iters):
        minibatch_losses = []
        batch.shuffle()
        for idx in range(0, batch.count, minibatch_size):
            minibatch = batch[idx : idx + minibatch_size]
            obs = torch.tensor(minibatch[SampleBatch.OBS], device=self.device)
            actions = torch.tensor(
                minibatch[SampleBatch.ACTIONS],
                device=self.device,
                dtype=int,
            )
            rewards = torch.tensor(
                minibatch[SampleBatch.REWARDS], device=self.device
            )
            next_obs = torch.tensor(
                minibatch[SampleBatch.NEXT_OBS], device=self.device
            )
            dones = torch.tensor(
                minibatch[SampleBatch.DONES], device=self.device, dtype=float
            )

            # Compute Q-values for current obs.
            q_values, _ = self.q_model({"obs": obs}, [], None)
            q_acts = torch.gather(q_values, -1, actions.unsqueeze(-1)).squeeze(-1)

            next_action_probs = self._compute_action_probs(next_obs)

            # Compute Q-values for next obs.
            with torch.no_grad():
                next_q_values, _ = self.target_q_model({"obs": next_obs}, [], None)

            # Compute estimated state value:
            # next_v = E_{a ~ pi(s')}[Q(next_obs, a)]
            next_v = torch.sum(next_q_values * next_action_probs, axis=-1)
            targets = rewards + (1 - dones) * self.gamma * next_v
            loss = (targets - q_acts) ** 2
            loss = torch.mean(loss)
            self.optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad.clip_grad_norm_(
                self.q_model.variables(), self.clip_grad_norm
            )
            self.optimizer.step()
            minibatch_losses.append(loss.item())
        iter_loss = sum(minibatch_losses) / len(minibatch_losses)
        losses.append(iter_loss)
        if iter_loss < self.delta:
            break
    self.update_target()
    return losses

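# Worked sketch (illustrative numbers) of the FQE regression target built
# above: targets = r + (1 - done) * gamma * E_{a ~ pi(s')}[Q_target(s', a)].
# For a single toy transition with gamma = 0.95:
#   reward            = 1.0
#   done              = 0.0
#   next_q_values     = [2.0, 4.0]    # Q_target(s', a) for two actions
#   next_action_probs = [0.25, 0.75]  # pi(a | s')
#   next_v = 0.25 * 2.0 + 0.75 * 4.0 = 3.5
#   target = 1.0 + (1 - 0.0) * 0.95 * 3.5 = 4.325
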
def ppo_loss(policy: Policy,
             model: ModelV2,
             dist_class: Type[TorchDistributionWrapper],
             train_batch: SampleBatch
             ) -> Union[TensorType, List[TensorType]]:
    """TODO: Write documentation."""
    # Compute the original PPO loss.
    total_loss = ppo_surrogate_loss(policy, model, dist_class, train_batch)

    # Shallow copy the input batch.
    # Be careful to access fields through the original batch, to properly
    # keep track of accessed keys, which will be used to discard useless
    # components of the policy's view requirements.
    train_batch_copy = train_batch.copy(shallow=True)

    # Extract the mean of the predicted action from the logits.
    # No need to perform the model forward pass, since the original PPO loss
    # is already doing it; just get back the last output.
    action_logits = model._last_output
    if issubclass(dist_class, TorchDiagGaussian):
        action_mean_true, _ = torch.chunk(action_logits, 2, dim=1)
    else:
        action_dist = dist_class(action_logits, model)
        action_mean_true = action_dist.deterministic_sample()

    if policy.config["caps_temporal_reg"] > 0.0:
        # Compute the mean action corresponding to the previous observation.
        observation_prev = train_batch["_prev_obs"]
        train_batch_copy["obs"] = observation_prev
        action_logits_prev, _ = model(train_batch_copy)
        if issubclass(dist_class, TorchDiagGaussian):
            action_mean_prev, _ = torch.chunk(action_logits_prev, 2, dim=1)
        else:
            action_dist_prev = dist_class(action_logits_prev, model)
            action_mean_prev = action_dist_prev.deterministic_sample()

        # Minimize the difference between successive action means.
        policy._mean_temporal_caps_loss = torch.mean(
            (action_mean_prev - action_mean_true) ** 2)

        # Add the temporal smoothness loss to the total loss.
        total_loss += policy.config["caps_temporal_reg"] * \
            policy._mean_temporal_caps_loss

    if policy.config["caps_spatial_reg"] > 0.0 or \
            policy.config["symmetric_policy_reg"] > 0.0:
        # Generate a noisy observation based on the specified sensitivity.
        offset = 0
        observation_true = train_batch["obs"]
        observation_noisy = observation_true.clone()
        batch_dim = observation_true.shape[:-1]
        observation_space = policy.observation_space.original_space
        for scale in observation_space.sensitivity.values():
            scale = torch.from_numpy(scale.copy()).to(
                dtype=torch.float32, device=observation_true.device)
            unit_noise = torch.randn((*batch_dim, len(scale)),
                                     device=observation_true.device)
            slice_idx = slice(offset, offset + len(scale))
            observation_noisy[..., slice_idx].addcmul_(scale, unit_noise)
            offset += len(scale)

        # Compute the mean action corresponding to the noisy observation.
        train_batch_copy["obs"] = observation_noisy
        action_logits_noisy, _ = model(train_batch_copy)
        if issubclass(dist_class, TorchDiagGaussian):
            action_mean_noisy, _ = torch.chunk(action_logits_noisy, 2, dim=1)
        else:
            action_dist_noisy = dist_class(action_logits_noisy, model)
            action_mean_noisy = action_dist_noisy.deterministic_sample()

    if policy.config["caps_spatial_reg"] > 0.0:
        # Minimize the difference between the original action mean and the
        # one corresponding to the noisy observation.
        policy._mean_spatial_caps_loss = torch.mean(
            (action_mean_noisy - action_mean_true) ** 2)

        # Add the spatial smoothness loss to the total loss.
        total_loss += policy.config["caps_spatial_reg"] * \
            policy._mean_spatial_caps_loss

    if policy.config["caps_global_reg"] > 0.0:
        # Minimize the magnitude of the action mean.
        policy._mean_global_caps_loss = torch.mean(action_mean_true ** 2)

        # Add the global smoothness loss to the total loss.
        total_loss += policy.config["caps_global_reg"] * \
            policy._mean_global_caps_loss

    if policy.config["symmetric_policy_reg"] > 0.0:
        # Compute the mirrored observation.
        offset = 0
        observation_mirror = torch.empty_like(observation_true)
        observation_space = policy.observation_space.original_space
        for mirror_mat in observation_space.mirror_mat.values():
            mirror_mat = torch.from_numpy(mirror_mat.T.copy()).to(
                dtype=torch.float32, device=observation_true.device)
            slice_idx = slice(offset, offset + len(mirror_mat))
            torch.mm(observation_true[..., slice_idx],
                     mirror_mat,
                     out=observation_mirror[..., slice_idx])
            offset += len(mirror_mat)

        # Compute the mirrored mean action corresponding to the mirrored
        # observation.
        train_batch_copy["obs"] = observation_mirror
        action_logits_mirror, _ = model(train_batch_copy)
        if issubclass(dist_class, TorchDiagGaussian):
            action_mean_mirror, _ = torch.chunk(action_logits_mirror, 2, dim=1)
        else:
            action_dist_mirror = dist_class(action_logits_mirror, model)
            action_mean_mirror = action_dist_mirror.deterministic_sample()
        action_mirror_mat = policy.action_space.mirror_mat
        action_mirror_mat = torch.from_numpy(action_mirror_mat.T.copy()).to(
            dtype=torch.float32, device=observation_true.device)
        action_mean_mirror = action_mean_mirror @ action_mirror_mat

        # Minimize the asymmetry of the policy output.
        policy._mean_symmetric_policy_loss = torch.mean(
            (action_mean_mirror - action_mean_true) ** 2)

        # Add the policy symmetry loss to the total loss.
        total_loss += policy.config["symmetric_policy_reg"] * \
            policy._mean_symmetric_policy_loss

    return total_loss

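# Minimal standalone sketch (illustrative; all names below are hypothetical)
# of the three CAPS-style penalties combined above, for a deterministic
# policy mapping observations to action means:
def caps_penalties(pi, obs, obs_prev, noise_scale, w_temporal, w_spatial,
                   w_global):
    action_mean = pi(obs)
    # Temporal smoothness: successive action means should stay close.
    temporal = torch.mean((pi(obs_prev) - action_mean) ** 2)
    # Spatial smoothness: small observation perturbations should barely
    # move the action mean.
    obs_noisy = obs + noise_scale * torch.randn_like(obs)
    spatial = torch.mean((pi(obs_noisy) - action_mean) ** 2)
    # Global regularization: keep action magnitudes small.
    global_reg = torch.mean(action_mean ** 2)
    return w_temporal * temporal + w_spatial * spatial + w_global * global_reg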