def from_batch(self, train_batch: SampleBatch,
               is_training: bool = True) -> (TensorType, List[TensorType]):
    """Run this model on a tensor batch.

    Unpacks `train_batch` into the input dict, the list of state inputs
    and the sequence lengths, then forwards them to `self.__call__`.

    Args:
        train_batch: Batch holding `SampleBatch.CUR_OBS` and, optionally,
            previous actions/rewards, `state_in_<i>` columns and
            "seq_lens".
        is_training: Value stored under the "is_training" input key.

    Returns:
        Whatever `self.__call__` returns for the unpacked inputs.
    """
    model_inputs = {
        "obs": train_batch[SampleBatch.CUR_OBS],
        "is_training": is_training,
    }

    # Optional columns are forwarded only if the batch carries them.
    optional_columns = (
        ("prev_actions", SampleBatch.PREV_ACTIONS),
        ("prev_rewards", SampleBatch.PREV_REWARDS),
    )
    for input_key, batch_key in optional_columns:
        if batch_key in train_batch:
            model_inputs[input_key] = train_batch[batch_key]

    # Gather consecutive state inputs: state_in_0, state_in_1, ...
    state_inputs = []
    state_key = "state_in_{}".format(len(state_inputs))
    while state_key in train_batch:
        state_inputs.append(train_batch[state_key])
        state_key = "state_in_{}".format(len(state_inputs))

    return self.__call__(model_inputs, state_inputs,
                         train_batch.get("seq_lens"))
def flatten_data(data: AgentConnectorsOutput):
    """Flatten the `for_action` columns of a single agent's connector output.

    Infos, actions, `state_out_*` columns and `None` values are passed
    through untouched; every other column is replaced by
    `np.array(tree.flatten(value))`.

    Args:
        data: The connector output to process.

    Returns:
        A new AgentConnectorsOutput carrying the original `for_training`
        data and a SampleBatch (is_training=False) of the processed
        `for_action` columns.
    """
    assert isinstance(
        data, AgentConnectorsOutput
    ), "Single agent data must be of type AgentConnectorsOutput"

    training_data = data.for_training
    action_data = data.for_action

    flat_columns = {}
    for column, value in action_data.items():
        # Do not flatten infos, actions, and state_out_ columns; None
        # values are kept so that the column itself still exists.
        passthrough = (
            column in [SampleBatch.INFOS, SampleBatch.ACTIONS]
            or column.startswith("state_out_")
            or value is None
        )
        if passthrough:
            flat_columns[column] = value
        else:
            flat_columns[column] = np.array(tree.flatten(value))

    action_batch = SampleBatch(flat_columns, is_training=False)
    return AgentConnectorsOutput(training_data, action_batch)
def add_postprocessed_batch_for_training(
        self, batch: SampleBatch,
        view_requirements: Dict[str, ViewRequirement]) -> None:
    """Append a postprocessed single-agent SampleBatch to our buffers.

    Args:
        batch (SampleBatch): A single agent (one trajectory) SampleBatch
            whose columns are appended to the matching buffers.
        view_requirements (Dict[str, ViewRequirement]): The view
            requirements of the policy. Currently not consulted: all
            columns of `batch` are copied.
    """
    # TODO(ekl) how do we handle this for policies that don't extend
    # Torch / TF Policy template (no inference of view reqs)?
    # Until then, columns are NOT filtered by `used_for_training`.
    for column_name, column_values in batch.items():
        self.buffers[column_name].extend(column_values)

    # Add the agent's trajectory length to our count.
    self.count += batch.count
def test_n_step_from_same_obs_source_array(self):
    """Checks that n-step adjustment works when obs/next-obs share memory."""
    gamma = 0.99
    n_step = 4

    # Both obs and next_obs are views into this one np.array.
    shared_obs = np.arange(0, 8)
    obs = shared_obs[:7]
    next_obs = shared_obs[1:]

    actions = np.random.randint(-1, 3, size=(7, ))
    expected_actions = actions.copy()
    rewards = [10.0, 0.0, 100.0, 50.0, 60.0, 10.0, 100.0]
    dones = [False] * 6 + [True]

    batch = SampleBatch({
        SampleBatch.OBS: obs,
        SampleBatch.ACTIONS: actions,
        SampleBatch.REWARDS: rewards,
        SampleBatch.DONES: dones,
        SampleBatch.NEXT_OBS: next_obs,
    })
    adjust_nstep(n_step, gamma, batch)

    check(batch[SampleBatch.OBS], [0, 1, 2, 3, 4, 5, 6])
    check(batch[SampleBatch.ACTIONS], expected_actions)
    check(batch[SampleBatch.NEXT_OBS], [4, 5, 6, 7, 7, 7, 7])
    check(batch[SampleBatch.DONES],
          [False, False, False, True, True, True, True])

    # Expected reward at t: discounted sum over the (possibly truncated)
    # window of up to `n_step` rewards starting at t.
    expected_rewards = [
        discount_cumsum(np.array(rewards[start:start + n_step]), gamma)[0]
        for start in range(len(rewards))
    ]
    check(batch[SampleBatch.REWARDS], expected_rewards)
def stats(policy: Policy, train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Stats function for APPO. Collects the loss stats into a dict.

    Args:
        policy (Policy): The Policy to generate stats for.
        train_batch (SampleBatch): The SampleBatch (already) used for
            training; only its "seq_lens" entry is read here.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """
    config = policy.config

    values_batched = _make_time_major(
        policy,
        train_batch.get("seq_lens"),
        policy.model.value_function(),
        drop_last=config["vtrace"])

    # Filled key by key, in the order the stats are reported.
    stats_dict = {}
    stats_dict["cur_lr"] = tf.cast(policy.cur_lr, tf.float64)
    stats_dict["policy_loss"] = policy._mean_policy_loss
    stats_dict["entropy"] = policy._mean_entropy
    stats_dict["var_gnorm"] = tf.linalg.global_norm(
        policy.model.trainable_variables())
    stats_dict["vf_loss"] = policy._mean_vf_loss
    stats_dict["vf_explained_var"] = explained_variance(
        tf.reshape(policy._value_targets, [-1]),
        tf.reshape(values_batched, [-1]))

    # Importance-sampling ratio stats are only added with v-trace.
    if config["vtrace"]:
        is_mean, is_var = tf.nn.moments(policy._is_ratio, [0, 1])
        stats_dict["mean_IS"] = is_mean
        stats_dict["var_IS"] = is_var

    if config["use_kl_loss"]:
        stats_dict["kl"] = policy._mean_kl
        stats_dict["KL_Coeff"] = policy.kl_coeff

    return stats_dict
def compute_actions(
    self,
    obs_batch,
    state_batches=None,
    prev_action_batch=None,
    prev_reward_batch=None,
    info_batch=None,
    episodes=None,
    explore=None,
    timestep=None,
    **kwargs,
):
    """Compute actions by delegating to `compute_actions_from_input_dict`.

    Packs the individual arguments into a SampleBatch input dict and
    forwards the call.

    Args:
        obs_batch: Observations, stored under `SampleBatch.CUR_OBS`.
        state_batches: Optional iterable of state inputs; the i-th entry
            is stored under the key "state_in_<i>".
        prev_action_batch: Optional previous actions
            (`SampleBatch.PREV_ACTIONS`).
        prev_reward_batch: Optional previous rewards
            (`SampleBatch.PREV_REWARDS`).
        info_batch: Optional infos (`SampleBatch.INFOS`).
        episodes: Forwarded unchanged.
        explore: Forwarded unchanged.
        timestep: Forwarded unchanged.
        **kwargs: Forwarded unchanged.

    Returns:
        The return value of `self.compute_actions_from_input_dict`.
    """
    # Create input dict to simply pass the entire call to
    # self.compute_actions_from_input_dict().
    input_dict = SampleBatch(
        {
            SampleBatch.CUR_OBS: obs_batch,
        },
        _is_training=tf.constant(False),
    )
    if state_batches is not None:
        # Each state gets its own indexed key. Previously every state was
        # written (as an `(index, state)` tuple) to the single literal key
        # "state_in_{i}", so no "state_in_0", "state_in_1", ... existed.
        for i, s in enumerate(state_batches):
            input_dict["state_in_{}".format(i)] = s
    if prev_action_batch is not None:
        input_dict[SampleBatch.PREV_ACTIONS] = prev_action_batch
    if prev_reward_batch is not None:
        input_dict[SampleBatch.PREV_REWARDS] = prev_reward_batch
    if info_batch is not None:
        input_dict[SampleBatch.INFOS] = info_batch

    return self.compute_actions_from_input_dict(
        input_dict=input_dict,
        explore=explore,
        timestep=timestep,
        episodes=episodes,
        **kwargs,
    )
def test_pad_batch_dynamic_max(self):
    """Test pad_batch_to_sequences_of_same_size when dynamic_max = True"""
    state_view_req = ViewRequirement(
        "state_out_0",
        shift=[-1],
        used_for_training=False,
        used_for_compute_actions=True,
        batch_repeat_value=1,
    )
    view_requirements = {"state_in_0": state_view_req}

    max_seq_len = 20
    num_seqs = np.random.randint(1, 20)
    seq_lens = np.random.randint(1, max_seq_len, size=(num_seqs))
    longest_seq = np.max(seq_lens)
    total_steps = np.sum(seq_lens)

    feature_keys = ["a", "b"]
    columns = {key: np.arange(total_steps) for key in feature_keys}
    columns["seq_lens"] = seq_lens
    columns["state_in_0"] = [[0]] * num_seqs
    s1 = SampleBatch(columns, _max_seq_len=max_seq_len)

    pad_batch_to_sequences_of_same_size(
        s1,
        max_seq_len=max_seq_len,
        feature_keys=feature_keys,
        view_requirements=view_requirements,
    )

    # Padding is expected to go up to the longest sequence actually
    # present, not up to the configured `max_seq_len`.
    check(s1.max_seq_len, longest_seq)
    for key in feature_keys:
        check(s1[key].shape[0], longest_seq * num_seqs)
def step(self):
    """Sync weights, sample once, store the rows and maybe optimize.

    Samples from all remote workers (or from the local worker if there
    are none), adds every row to the per-policy replay buffer and calls
    `self._optimize()` once `num_steps_sampled` has reached
    `replay_starts`. The sampled step counter is updated last.
    """
    workers = self.workers

    with self.update_weights_timer:
        if workers.remote_workers():
            # Put the weights into the object store once and hand the
            # same reference to every remote worker.
            weights = ray.put(workers.local_worker().get_weights())
            for remote_worker in workers.remote_workers():
                remote_worker.set_weights.remote(weights)

    with self.sample_timer:
        if workers.remote_workers():
            sample_refs = [
                remote_worker.sample.remote()
                for remote_worker in workers.remote_workers()
            ]
            batch = SampleBatch.concat_samples(
                ray_get_and_free(sample_refs))
        else:
            batch = workers.local_worker().sample()

        # Handle everything as if multiagent
        if isinstance(batch, SampleBatch):
            batch = MultiAgentBatch({DEFAULT_POLICY_ID: batch}, batch.count)

        for policy_id, policy_batch in batch.policy_batches.items():
            for row in policy_batch.rows():
                self.replay_buffers[policy_id].add(
                    pack_if_needed(row["obs"]),
                    row["actions"],
                    row["rewards"],
                    pack_if_needed(row["new_obs"]),
                    row["dones"],
                    weight=None)

    # The check uses the step count from BEFORE this call's samples.
    if self.num_steps_sampled >= self.replay_starts:
        self._optimize()

    self.num_steps_sampled += batch.count
def call(self, input_dict: SampleBatch) -> \
        (TensorType, List[TensorType], Dict[str, TensorType]):
    """Forward pass: wrapped model, optional prev-a/r concat, then the RNN.

    Args:
        input_dict: Input batch. Must contain `SampleBatch.SEQ_LENS`,
            "state_in_0" and "state_in_1", plus previous actions/rewards
            if the corresponding `lstm_use_prev_*` flags are set.

    Returns:
        Tuple of the RNN output reshaped to drop the time dimension, the
        new states `[h, c]`, and a dict holding the flattened value
        output under `SampleBatch.VF_PREDS`.
    """
    assert input_dict.get(SampleBatch.SEQ_LENS) is not None

    # Push obs through underlying (wrapped) model first.
    wrapped_out, _, _ = self.wrapped_keras_model(input_dict)

    # Concat. prev-action/reward if required.
    extra_features = []
    if self.lstm_use_prev_action:
        prev_actions = input_dict[SampleBatch.PREV_ACTIONS]
        if isinstance(self.action_space, (Discrete, MultiDiscrete)):
            prev_actions = one_hot(prev_actions, self.action_space)
        prev_actions = tf.cast(prev_actions, tf.float32)
        extra_features.append(
            tf.reshape(prev_actions, [-1, self.action_dim]))
    if self.lstm_use_prev_reward:
        prev_rewards = tf.cast(
            input_dict[SampleBatch.PREV_REWARDS], tf.float32)
        extra_features.append(tf.reshape(prev_rewards, [-1, 1]))

    if extra_features:
        wrapped_out = tf.concat([wrapped_out] + extra_features, axis=1)

    seq_lens = input_dict[SampleBatch.SEQ_LENS]
    # Time-axis length = flat batch size divided by number of sequences.
    max_seq_len = tf.shape(wrapped_out)[0] // tf.shape(seq_lens)[0]
    rnn_inputs = add_time_dimension(
        wrapped_out, max_seq_len=max_seq_len, framework="tf")

    model_out, value_out, h, c = self._rnn_model([
        rnn_inputs,
        seq_lens,
        input_dict["state_in_0"],
        input_dict["state_in_1"],
    ])

    # Merge the two leading dims again, keeping all trailing dims.
    flat_shape = tf.concat([[-1], tf.shape(model_out)[2:]], axis=0)
    model_out_no_time_dim = tf.reshape(model_out, flat_shape)

    extra_outs = {SampleBatch.VF_PREDS: tf.reshape(value_out, [-1])}
    return model_out_no_time_dim, [h, c], extra_outs
def _build_actor_network(
    self,
    obs,
    obs_space,
    act_space,
    use_state_preprocessor,
    hiddens,
    activation=None,
    scope=None,
):
    """Build the actor network inside the given variable scope.

    Args:
        obs: Observation input tensor.
        obs_space: Observation space (passed to the state preprocessor).
        act_space: Action space; `act_space.shape[0]` is the output size.
        use_state_preprocessor: If True, `obs` is first fed through a
            model from `ModelCatalog.get_model` and its `last_layer` is
            used as input to the dense stack.
        hiddens: Sizes of the hidden dense layers.
        activation: Activation for the hidden dense layers.
        scope: Variable scope (re-used via `tf1.AUTO_REUSE`).

    Returns:
        Tuple of (sample drawn from a RelaxedOneHotCategorical over the
        output logits, the output logits, the preprocessor model or None,
        the global variables under the scope's name).
    """
    with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as var_scope:
        model = None
        out = obs
        if use_state_preprocessor:
            model = ModelCatalog.get_model(
                SampleBatch(
                    obs=obs,
                    _is_training=self._get_is_training_placeholder()),
                obs_space,
                act_space,
                1,
                self.config["model"],
            )
            out = model.last_layer

        for num_units in hiddens:
            out = tf1.layers.dense(
                out, units=num_units, activation=activation)

        feature = tf1.layers.dense(
            out, units=act_space.shape[0], activation=None)
        relaxed_dist = tfp.distributions.RelaxedOneHotCategorical(
            temperature=1.0, logits=feature)
        sampler = relaxed_dist.sample()

    return sampler, feature, model, tf1.global_variables(var_scope.name)
def gen_replay(timeout):
    """Endlessly yield training batches drawn from the reservoir buffers.

    On each iteration, every policy whose buffer holds at least
    `min_size_to_learn` items and whose step counter has reached
    `learn_every` contributes a sampled batch, and its step counter is
    reset to 0. If no policy qualifies, `_NextValueNotReady()` is yielded
    instead of a MultiAgentBatch.

    Args:
        timeout: Not used by this generator.
    """
    while True:
        samples = {}
        for policy_id, policy_buffer in reservoir_buffers.buffers.items():
            # Skip policies that are not yet ready to learn.
            if len(policy_buffer) < min_size_to_learn:
                continue
            if reservoir_buffers.steps[policy_id] < learn_every:
                continue

            sampled_idxes = policy_buffer.sample_idxes(train_batch_size)
            obses_t, actions = policy_buffer.sample_with_idxes(
                sampled_idxes)
            samples[policy_id] = SampleBatch({
                "obs": obses_t,
                "actions": actions,
            })
            reservoir_buffers.steps[policy_id] = 0

        if not samples:
            yield _NextValueNotReady()
        else:
            yield MultiAgentBatch(samples, train_batch_size)
def test_n_step_3(self):
    """Checks the n-step adjustment of a trajectory for n-step = 3."""
    gamma = 0.9
    trajectory = {
        SampleBatch.OBS: [1, 2, 3, 4, 5, 6, 7],
        SampleBatch.ACTIONS: [
            "ac1", "ac2", "ac1", "ac1", "ac1", "ac2", "ac1"
        ],
        SampleBatch.REWARDS: [10.0, 0.0, 100.0, 100.0, 100.0, 100.0, 100.0],
        SampleBatch.DONES: [0, 0, 0, 0, 0, 0, 1],
        SampleBatch.NEXT_OBS: [2, 3, 4, 5, 6, 7, 8],
    }
    batch = SampleBatch(trajectory)
    adjust_nstep(3, gamma, batch)

    # Expected column contents after the adjustment, in check order.
    expectations = (
        (SampleBatch.OBS, [1, 2, 3, 4, 5, 6, 7]),
        (SampleBatch.ACTIONS,
         ["ac1", "ac2", "ac1", "ac1", "ac1", "ac2", "ac1"]),
        (SampleBatch.NEXT_OBS, [4, 5, 6, 7, 8, 8, 8]),
        (SampleBatch.DONES, [0, 0, 0, 0, 1, 1, 1]),
        (SampleBatch.REWARDS,
         [91.0, 171.0, 271.0, 271.0, 271.0, 190.0, 100.0]),
    )
    for column, expected in expectations:
        check(batch[column], expected)
def postprocess_trajectory(
    self,
    sample_batch: SampleBatch,
    other_agent_batches: Optional[Dict[Any, SampleBatch]] = None,
    episode: Optional["Episode"] = None,
):
    """Postprocess a trajectory and add discounted cumulative rewards.

    Runs the parent class' postprocessing first, then determines the
    bootstrap value `last_r` (0.0 if the last DONES entry is set,
    otherwise `self._value(...)` for the last timestep) and returns the
    result of `compute_advantages` with GAE and critic disabled.

    Args:
        sample_batch: The trajectory to postprocess.
        other_agent_batches: Forwarded to the parent implementation.
        episode: Forwarded to the parent implementation.

    Returns:
        The return value of `compute_advantages`.
    """
    sample_batch = super().postprocess_trajectory(
        sample_batch, other_agent_batches, episode
    )

    # Trajectory is actually complete -> last r=0.0.
    last_r = 0.0
    if not sample_batch[SampleBatch.DONES][-1]:
        # Trajectory has been truncated -> last r=VF estimate of last obs.
        # Build a single-timestep (last one in trajectory) input dict
        # according to the Model's view requirements.
        if SampleBatch.NEXT_OBS in sample_batch:
            index = "last"
        else:
            index = -1
        single_step_inputs = sample_batch.get_single_step_input_dict(
            self.model.view_requirements, index=index
        )
        last_r = self._value(**single_step_inputs)

    # Adds the "advantages" (which in the case of MARWIL are simply the
    # discounted cumulative rewards) to the SampleBatch.
    # We just want the discounted cumulative rewards, so we won't need
    # GAE nor critic (use_critic=True: Subtract vf-estimates from returns).
    return compute_advantages(
        sample_batch,
        last_r,
        self.config["gamma"],
        use_gae=False,
        use_critic=False,
    )
def __call__(self, samples: List[SampleBatchType]):
    """Buffer the incoming samples; emit one batch after enough calls.

    Every call post-processes `samples`, appends them to the buffer and
    records their split. While `len(self.split)` does not exceed
    `self.n`, an inner adaptation step is run and an empty list is
    returned. Afterwards the buffered samples are concatenated, the
    buffers are reset, metrics are updated and `[(batch, metrics)]` is
    returned.

    Args:
        samples: The newly collected sample batches.

    Returns:
        Either `[]` or a one-element list `[(out, self.metrics)]`.
    """
    samples, split_lst = self.post_process_samples(samples)
    self.buffer.extend(samples)
    self.split.append(split_lst)
    self.post_process_metrics()

    # Not enough rounds collected yet: adapt and wait for more samples.
    if len(self.split) <= self.n:
        self.inner_adaptation_step(samples)
        return []

    out = SampleBatch.concat_samples(self.buffer)
    out["split"] = np.array(self.split)
    self.buffer = []
    self.split = []

    # Metrics Reporting
    shared_metrics = _get_shared_metrics()
    shared_metrics.counters[STEPS_SAMPLED_COUNTER] += out.count

    # Reporting Adaptation Rew Diff
    reward_before = self.metrics["episode_reward_mean"]
    reward_after = self.metrics["episode_reward_mean_adapt_" + str(self.n)]
    self.metrics["adaptation_delta"] = reward_after - reward_before
    return [(out, self.metrics)]
def test_n_step_4(self):
    """Checks the n-step adjustment of a trajectory for n-step = 4."""
    gamma = 0.99
    n_step = 4

    obs = np.arange(0, 7)
    actions = np.random.randint(-1, 3, size=(7,))
    expected_actions = actions.copy()
    rewards = [10.0, 0.0, 100.0, 50.0, 60.0, 10.0, 100.0]
    dones = [False] * 6 + [True]
    next_obs = np.arange(1, 8)

    batch = SampleBatch(
        {
            SampleBatch.OBS: obs,
            SampleBatch.ACTIONS: actions,
            SampleBatch.REWARDS: rewards,
            SampleBatch.DONES: dones,
            SampleBatch.NEXT_OBS: next_obs,
        }
    )
    adjust_nstep(n_step, gamma, batch)

    check(batch[SampleBatch.OBS], [0, 1, 2, 3, 4, 5, 6])
    check(batch[SampleBatch.ACTIONS], expected_actions)
    check(batch[SampleBatch.NEXT_OBS], [4, 5, 6, 7, 7, 7, 7])
    check(batch[SampleBatch.DONES], [False, False, False, True, True, True, True])

    # Expected reward at t: discounted sum over the (possibly truncated)
    # window of up to `n_step` rewards starting at t.
    expected_rewards = [
        discount_cumsum(np.array(rewards[start:start + n_step]), gamma)[0]
        for start in range(len(rewards))
    ]
    check(batch[SampleBatch.REWARDS], expected_rewards)
def step(self):
    """Sync weights, collect a train batch, run minibatch SGD on it.

    Sampling continues until at least `self.train_batch_size` steps have
    been collected (from all remote workers per round, or from the local
    worker if there are none).

    Returns:
        The learner stats: the fetches of the default policy if that is
        the only entry, otherwise the full fetches dict.
    """
    workers = self.workers

    with self.update_weights_timer:
        if workers.remote_workers():
            weights = ray.put(workers.local_worker().get_weights())
            for remote_worker in workers.remote_workers():
                remote_worker.set_weights.remote(weights)

    with self.sample_timer:
        collected = []
        while sum(b.count for b in collected) < self.train_batch_size:
            if workers.remote_workers():
                sample_refs = [
                    remote_worker.sample.remote()
                    for remote_worker in workers.remote_workers()
                ]
                collected.extend(ray.get(sample_refs))
            else:
                collected.append(workers.local_worker().sample())
        samples = SampleBatch.concat_samples(collected)
        self.sample_timer.push_units_processed(samples.count)

    with self.grad_timer:
        fetches = do_minibatch_sgd(
            samples, self.policies, workers.local_worker(),
            self.num_sgd_iter, self.sgd_minibatch_size,
            self.standardize_fields)
    self.grad_timer.push_units_processed(samples.count)

    # Single-agent case: unwrap the default policy's stats.
    only_default_policy = len(fetches) == 1 and DEFAULT_POLICY_ID in fetches
    if only_default_policy:
        self.learner_stats = fetches[DEFAULT_POLICY_ID]
    else:
        self.learner_stats = fetches

    self.num_steps_sampled += samples.count
    self.num_steps_trained += samples.count
    return self.learner_stats
def _get_dummy_batch_from_view_requirements(
        self, batch_size: int = 1) -> SampleBatch:
    """Creates a dummy batch based on the Policy's view requirements.

    Args:
        batch_size (int): The size of the batch to create.

    Returns:
        SampleBatch: One column per view requirement. Columns whose
            `space` is a gym Space are all-zero arrays; for any other
            `space` value, the column is that value repeated
            `batch_size` times.
    """
    columns = {}
    for view_col, view_req in self.view_requirements.items():
        space = view_req.space
        if isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple)):
            # Complex spaces: use the shape reported by the ModelCatalog.
            _, shape = ModelCatalog.get_action_shape(space)
            columns[view_col] = np.zeros(
                (batch_size, ) + shape[1:], np.float32)
        elif isinstance(space, gym.spaces.Space):
            # Zeros shaped (and typed) like a batch of samples.
            sampled = [space.sample() for _ in range(batch_size)]
            columns[view_col] = np.zeros_like(sampled)
        else:
            # `space` is not a gym Space: repeat it as-is.
            columns[view_col] = [space] * batch_size
    return SampleBatch(columns)
def _add_sample_batch_to_buffer(self, buffer, batch_size, num_batches=5, **kwargs):
    """Add `num_batches` randomly generated batches to `buffer`.

    Each added batch is the concatenation of `batch_size` two-timestep
    SampleBatches. `self.eps_id` is reset to 0 and then incremented once
    per generated two-step batch; `self.batch_id` is incremented once
    per added batch.

    Args:
        buffer: Object whose `add(batch, **kwargs)` is called.
        batch_size: Number of two-step batches per added batch.
        num_batches: Number of batches to add.
        **kwargs: Forwarded to `buffer.add`.
    """
    self.eps_id = 0

    def _make_two_step_batch():
        self.eps_id += 1
        # Random draws happen in column order: action, reward, obs,
        # next obs, done. Both timesteps share the same drawn value.
        action = np.random.choice([0, 1])
        reward = np.random.rand()
        obs = np.random.random((4,))
        next_obs = np.random.random((4,))
        done = np.random.choice([False, True])
        return SampleBatch(
            {
                SampleBatch.T: [0, 1],
                SampleBatch.ACTIONS: [action] * 2,
                SampleBatch.REWARDS: [reward] * 2,
                SampleBatch.OBS: [obs] * 2,
                SampleBatch.NEXT_OBS: [next_obs] * 2,
                SampleBatch.DONES: [done] * 2,
                SampleBatch.EPS_ID: [self.eps_id] * 2,
                SampleBatch.AGENT_INDEX: [0] * 2,
                "batch_id": [self.batch_id] * 2,
            }
        )

    for _ in range(num_batches):
        pieces = [_make_two_step_batch() for _ in range(batch_size)]
        self.batch_id += 1
        buffer.add(SampleBatch.concat_samples(pieces), **kwargs)
def ParallelRollouts(workers: WorkerSet, *, mode="bulk_sync",
                     num_async=1) -> LocalIterator[SampleBatch]:
    """Operator to collect experiences in parallel from rollout workers.

    If there are no remote workers, experiences will be collected serially
    from the local worker instance instead (`mode` is not consulted then).

    Arguments:
        workers (WorkerSet): set of rollout workers to use.
        mode (str): One of {'async', 'bulk_sync', 'raw'}.
            - In 'async' mode, batches are returned as soon as they are
              computed by rollout workers with no order guarantees.
            - In 'bulk_sync' mode, we collect one batch from each worker
              and concatenate them together into a large batch to return.
            - In 'raw' mode, the ParallelIterator object is returned
              directly and the caller is responsible for implementing
              gather and updating the timesteps counter.
        num_async (int): In async mode, the max number of async
            requests in flight per actor.

    Returns:
        A local iterator over experiences collected in parallel.

    Raises:
        ValueError: If remote workers exist and `mode` is not one of the
            supported values.

    Examples:
        >>> rollouts = ParallelRollouts(workers, mode="async")
        >>> batch = next(rollouts)
        >>> print(batch.count)
        50  # config.rollout_fragment_length

        >>> rollouts = ParallelRollouts(workers, mode="bulk_sync")
        >>> batch = next(rollouts)
        >>> print(batch.count)
        200  # config.rollout_fragment_length * config.num_workers

    Updates the STEPS_SAMPLED_COUNTER counter in the local iterator context.
    """
    # Ensure workers are initially in sync.
    workers.sync_weights()

    def report_timesteps(batch):
        shared_metrics = _get_shared_metrics()
        shared_metrics.counters[STEPS_SAMPLED_COUNTER] += batch.count
        return batch

    # Handle the serial sampling case.
    if not workers.remote_workers():

        def sampler(_):
            while True:
                yield workers.local_worker().sample()

        serial_rollouts = LocalIterator(sampler, SharedMetrics())
        return serial_rollouts.for_each(report_timesteps)

    # Create a parallel iterator over generated experiences.
    rollouts = from_actors(workers.remote_workers())

    if mode == "bulk_sync":
        synced = rollouts.batch_across_shards()
        merged = synced.for_each(
            lambda shard_batches: SampleBatch.concat_samples(shard_batches))
        return merged.for_each(report_timesteps)
    if mode == "async":
        gathered = rollouts.gather_async(num_async=num_async)
        return gathered.for_each(report_timesteps)
    if mode == "raw":
        return rollouts

    raise ValueError("mode must be one of 'bulk_sync', 'async', 'raw', "
                     "got '{}'".format(mode))
def build(self, view_requirements: ViewRequirementsDict) -> SampleBatch:
    """Builds a SampleBatch from the thus-far collected agent data.

    If the episode/trajectory has no DONE=True at the end, will copy
    the necessary n timesteps at the end of the trajectory back to the
    beginning of the buffers and wait for new samples coming in.
    SampleBatches created by this method will be ready for postprocessing
    by a Policy.

    Args:
        view_requirements (ViewRequirementsDict): The view
            requirements dict needed to build the SampleBatch from the raw
            buffers (which may have data shifts as well as mappings from
            view-col to data-col in them).

    Returns:
        SampleBatch: The built SampleBatch for this agent, ready to go into
            postprocessing.
    """
    batch_data = {}
    # Cache of np-arrays, keyed by data column (see below).
    np_data = {}
    for view_col, view_req in view_requirements.items():
        # Create the batch of data from the different buffers.
        data_col = view_req.data_col or view_col

        # Some columns don't exist yet (get created during postprocessing).
        # -> skip.
        if data_col not in self.buffers:
            continue

        # OBS are already shifted by -1 (the initial obs starts one ts
        # before all other data columns).
        obs_shift = -1 if data_col == SampleBatch.OBS else 0

        # Keep an np-array cache so we don't have to regenerate the
        # np-array for different view_cols using to the same data_col.
        if data_col not in np_data:
            np_data[data_col] = to_float_np_array(self.buffers[data_col])

        # Range of indices on time-axis, e.g. "-50:-1". Together with
        # the `batch_repeat_value`, this determines the data produced.
        # Example:
        #  batch_repeat_value=10, shift_from=-3, shift_to=-1
        #  buffer=[-3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
        #  resulting data=[[-3, -2, -1], [7, 8, 9]]
        #  Range of 3 consecutive items repeats every 10 timesteps.
        if view_req.shift_from is not None:
            # Batch repeat value > 1: Only repeat the shift_from/to range
            # every n timesteps.
            if view_req.batch_repeat_value > 1:
                count = int(
                    math.ceil((len(np_data[data_col]) - self.shift_before)
                              / view_req.batch_repeat_value))
                data = np.asarray([
                    np_data[data_col][self.shift_before +
                                      (i * view_req.batch_repeat_value) +
                                      view_req.shift_from +
                                      obs_shift:self.shift_before +
                                      (i * view_req.batch_repeat_value) +
                                      view_req.shift_to + 1 + obs_shift]
                    for i in range(count)
                ])
            # Batch repeat value = 1: Repeat the shift_from/to range at
            # each timestep.
            else:
                d = np_data[data_col]
                shift_win = view_req.shift_to - view_req.shift_from + 1
                # Byte size of one timestep's item. `np.prod` is used
                # because the `np.product` alias was removed in NumPy 2.0.
                data_size = d.itemsize * int(np.prod(d.shape[1:]))
                strides = [
                    d.itemsize * int(np.prod(d.shape[i + 1:]))
                    for i in range(1, len(d.shape))
                ]
                # Overlapping windows of `shift_win` timesteps, one per
                # agent step, built as a strided view on `d`.
                data = np.lib.stride_tricks.as_strided(
                    d[self.shift_before - shift_win:],
                    [self.agent_steps, shift_win
                     ] + [d.shape[i] for i in range(1, len(d.shape))],
                    [data_size, data_size] + strides)
        # Set of (probably non-consecutive) indices.
        # Example:
        #  shift=[-3, 0]
        #  buffer=[-3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
        #  resulting data=[[-3, 0], [-2, 1], [-1, 2], [0, 3], [1, 4], ...]
        elif isinstance(view_req.shift, np.ndarray):
            data = np_data[data_col][self.shift_before + obs_shift +
                                     view_req.shift]
        # Single shift int value. Use the trajectory as-is, and if
        # `shift` != 0: shifted by that value.
        else:
            shift = view_req.shift + obs_shift

            # Batch repeat (only provide a value every n timesteps).
            if view_req.batch_repeat_value > 1:
                count = int(
                    math.ceil((len(np_data[data_col]) - self.shift_before)
                              / view_req.batch_repeat_value))
                data = np.asarray([
                    np_data[data_col][self.shift_before + (
                        i * view_req.batch_repeat_value) + shift]
                    for i in range(count)
                ])
            # Shift is exactly 0: Use trajectory as is.
            elif shift == 0:
                data = np_data[data_col][self.shift_before:]
            # Shift is positive: We still need to 0-pad at the end.
            elif shift > 0:
                data = to_float_np_array(
                    self.buffers[data_col][self.shift_before + shift:] + [
                        np.zeros(
                            shape=view_req.space.shape,
                            dtype=view_req.space.dtype)
                        for _ in range(shift)
                    ])
            # Shift is negative: Shift into the already existing and
            # 0-padded "before" area of our buffers.
            else:
                data = np_data[data_col][self.shift_before + shift:shift]

        # Empty columns are left out of the batch altogether.
        if len(data) > 0:
            batch_data[view_col] = data

    # Due to possible batch-repeats > 1, columns in the resulting batch
    # may not all have the same batch size.
    batch = SampleBatch(batch_data, _dont_check_lens=True)

    # Add EPS_ID and UNROLL_ID to batch.
    batch.data[SampleBatch.EPS_ID] = np.repeat(self.episode_id,
                                               batch.count)
    if SampleBatch.UNROLL_ID not in batch.data:
        # TODO: (sven) Once we have the additional
        #  model.preprocess_train_batch in place (attention net PR), we
        #  should not even need UNROLL_ID anymore:
        #  Add "if SampleBatch.UNROLL_ID in view_requirements:" here.
        batch.data[SampleBatch.UNROLL_ID] = np.repeat(
            _AgentCollector._next_unroll_id, batch.count)
        _AgentCollector._next_unroll_id += 1

    # This trajectory is continuing -> Copy data at the end (in the size of
    # self.shift_before) to the beginning of buffers and erase everything
    # else.
    if not self.buffers[SampleBatch.DONES][-1]:
        # Copy data to beginning of buffer and cut lists.
        if self.shift_before > 0:
            for k, data in self.buffers.items():
                self.buffers[k] = data[-self.shift_before:]
        self.agent_steps = 0

    return batch
check_compute_single_action # Fake CartPole episode of n time steps. FAKE_BATCH = SampleBatch({ SampleBatch.OBS: np.array( [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [0.9, 1.0, 1.1, 1.2]], dtype=np.float32), SampleBatch.ACTIONS: np.array([0, 1, 1]), SampleBatch.PREV_ACTIONS: np.array([0, 1, 1]), SampleBatch.REWARDS: np.array([1.0, -1.0, .5], dtype=np.float32), SampleBatch.PREV_REWARDS: np.array([1.0, -1.0, .5], dtype=np.float32), SampleBatch.DONES: np.array([False, False, True]), SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32), SampleBatch.ACTION_DIST_INPUTS: np.array([[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32), SampleBatch.ACTION_LOGP: np.array([-0.5, -0.1, -0.2], dtype=np.float32), SampleBatch.EPS_ID: np.array([0, 0, 0]), SampleBatch.AGENT_INDEX: np.array([0, 0, 0]), }) class MyCallbacks(DefaultCallbacks):
def ddpg_actor_critic_loss(policy: Policy, model: ModelV2, _,
                           train_batch: SampleBatch) -> TensorType:
    """Builds the DDPG actor and critic losses and returns their sum.

    Side effects on `policy`: sets `target_q_func_vars`, `actor_loss`,
    `critic_loss`, `td_error` and `q_t`.

    Args:
        policy (Policy): The Policy; provides the config, the target model
            and the action space.
        model (ModelV2): The model used for the current-state evaluations.
        _: Unused positional argument.
        train_batch (SampleBatch): The batch providing obs, next obs,
            actions, rewards, dones and the PRIO_WEIGHTS column.

    Returns:
        TensorType: `policy.critic_loss + policy.actor_loss`.
    """
    twin_q = policy.config["twin_q"]
    gamma = policy.config["gamma"]
    n_step = policy.config["n_step"]
    use_huber = policy.config["use_huber"]
    huber_threshold = policy.config["huber_threshold"]
    l2_reg = policy.config["l2_reg"]

    input_dict = SampleBatch(obs=train_batch[SampleBatch.CUR_OBS],
                             _is_training=True)
    input_dict_next = SampleBatch(obs=train_batch[SampleBatch.NEXT_OBS],
                                  _is_training=True)

    # NOTE(review): `model_out_tp1` is not referenced again in this
    # function; only the target model's next-state output is used below.
    model_out_t, _ = model(input_dict, [], None)
    model_out_tp1, _ = model(input_dict_next, [], None)
    target_model_out_tp1, _ = policy.target_model(input_dict_next, [], None)

    policy.target_q_func_vars = policy.target_model.variables()

    # Policy network evaluation.
    policy_t = model.get_policy_output(model_out_t)
    policy_tp1 = policy.target_model.get_policy_output(target_model_out_tp1)

    # Action outputs.
    if policy.config["smooth_target_policy"]:
        # Add clipped normal noise to the target policy's actions, then
        # clip the result to the action space's bounds.
        target_noise_clip = policy.config["target_noise_clip"]
        clipped_normal_sample = tf.clip_by_value(
            tf.random.normal(tf.shape(policy_tp1),
                             stddev=policy.config["target_noise"]),
            -target_noise_clip,
            target_noise_clip,
        )
        policy_tp1_smoothed = tf.clip_by_value(
            policy_tp1 + clipped_normal_sample,
            policy.action_space.low * tf.ones_like(policy_tp1),
            policy.action_space.high * tf.ones_like(policy_tp1),
        )
    else:
        # No smoothing, just use deterministic actions.
        policy_tp1_smoothed = policy_tp1

    # Q-net(s) evaluation.
    # Q-values for given actions & observations in given current state.
    q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS])

    # Q-values for current policy (no noise) in given current state
    q_t_det_policy = model.get_q_values(model_out_t, policy_t)

    if twin_q:
        twin_q_t = model.get_twin_q_values(model_out_t,
                                           train_batch[SampleBatch.ACTIONS])

    # Target q-net(s) evaluation.
    q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
                                             policy_tp1_smoothed)

    if twin_q:
        twin_q_tp1 = policy.target_model.get_twin_q_values(
            target_model_out_tp1, policy_tp1_smoothed)

    # Drop the last axis of the Q-value tensors.
    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
    if twin_q:
        # NOTE(review): the squeeze axis is derived from `q_t.shape`, not
        # `twin_q_t.shape` - presumably both have the same rank; confirm.
        twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
        # Use the element-wise minimum of both target Q estimates.
        q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
    # Zero out the next-state value wherever DONES is set.
    q_tp1_best_masked = (
        1.0 - tf.cast(train_batch[SampleBatch.DONES], tf.float32)) * q_tp1_best

    # Compute RHS of bellman equation.
    q_t_selected_target = tf.stop_gradient(
        tf.cast(train_batch[SampleBatch.REWARDS], tf.float32) +
        gamma**n_step * q_tp1_best_masked)

    # Compute the error (potentially clipped).
    if twin_q:
        # With twin Q, the per-sample error is the sum of both critics'
        # errors against the same target.
        td_error = q_t_selected - q_t_selected_target
        twin_td_error = twin_q_t_selected - q_t_selected_target
        if use_huber:
            errors = huber_loss(td_error, huber_threshold) + huber_loss(
                twin_td_error, huber_threshold)
        else:
            errors = 0.5 * tf.math.square(td_error) + 0.5 * tf.math.square(
                twin_td_error)
    else:
        td_error = q_t_selected - q_t_selected_target
        if use_huber:
            errors = huber_loss(td_error, huber_threshold)
        else:
            errors = 0.5 * tf.math.square(td_error)

    # Critic: mean of the errors weighted by the PRIO_WEIGHTS column.
    # Actor: negated mean Q-value of the current (noise-free) policy.
    critic_loss = tf.reduce_mean(
        tf.cast(train_batch[PRIO_WEIGHTS], tf.float32) * errors)
    actor_loss = -tf.reduce_mean(q_t_det_policy)

    # Add l2-regularization if required.
    if l2_reg is not None:
        # Variables with "bias" in their name are not regularized.
        for var in policy.model.policy_variables():
            if "bias" not in var.name:
                actor_loss += l2_reg * tf.nn.l2_loss(var)
        for var in policy.model.q_variables():
            if "bias" not in var.name:
                critic_loss += l2_reg * tf.nn.l2_loss(var)

    # Model self-supervised losses.
    if policy.config["use_state_preprocessor"]:
        # Expand input_dict in case custom_loss' need them.
        input_dict[SampleBatch.ACTIONS] = train_batch[SampleBatch.ACTIONS]
        input_dict[SampleBatch.REWARDS] = train_batch[SampleBatch.REWARDS]
        input_dict[SampleBatch.DONES] = train_batch[SampleBatch.DONES]
        input_dict[SampleBatch.NEXT_OBS] = train_batch[SampleBatch.NEXT_OBS]
        if log_once("ddpg_custom_loss"):
            logger.warning(
                "You are using a state-preprocessor with DDPG and "
                "therefore, `custom_loss` will be called on your Model! "
                "Please be aware that DDPG now uses the ModelV2 API, which "
                "merges all previously separate sub-models (policy_model, "
                "q_model, and twin_q_model) into one ModelV2, on which "
                "`custom_loss` is called, passing it "
                "[actor_loss, critic_loss] as 1st argument. "
                "You may have to change your custom loss function to handle "
                "this.")
        [actor_loss, critic_loss] = model.custom_loss([actor_loss, critic_loss],
                                                      input_dict)

    # Store values for stats function.
    policy.actor_loss = actor_loss
    policy.critic_loss = critic_loss
    policy.td_error = td_error
    policy.q_t = q_t

    # Return one loss value (even though we treat them separately in our
    # 2 optimizers: actor and critic).
    return policy.critic_loss + policy.actor_loss
def _initialize_loss_from_dummy_batch(
        self, auto_remove_unneeded_view_reqs: bool = True,
        stats_fn=None) -> None:
    """Builds the loss (and its input placeholders) by tracing a dummy batch.

    A dummy batch is pushed through action computation (trajectory view API
    only), exploration- and policy postprocessing, and finally the loss
    function. The keys that were accessed/added along the way decide which
    columns end up in `self._loss_input_dict` and `self.view_requirements`.

    Args:
        auto_remove_unneeded_view_reqs: If True (and the trajectory view API
            is enabled), view requirements that were never accessed are
            removed, and those only accessed during postprocessing are
            tagged `used_for_training=False`.
        stats_fn: Not used inside this method's body.
    """
    # Create the optimizer/exploration optimizer here. Some initialization
    # steps (e.g. exploration postprocessing) may need this.
    self._optimizer = self.optimizer()

    # Test calls depend on variable init, so initialize model first.
    self._sess.run(tf1.global_variables_initializer())

    if self.config["_use_trajectory_view_api"]:
        logger.info("Testing `compute_actions` w/ dummy batch.")
        # Only the extra fetches are needed; actions/state-outs are dropped.
        _, _, extra_fetches = \
            self.compute_actions_from_input_dict(
                self._dummy_batch, explore=False, timestep=0)
        for key, value in extra_fetches.items():
            self._dummy_batch[key] = np.zeros_like(value)
            self._input_dict[key] = get_placeholder(value=value, name=key)
            if key not in self.view_requirements:
                logger.info("Adding extra-action-fetch `{}` to "
                            "view-reqs.".format(key))
                self.view_requirements[key] = \
                    ViewRequirement(space=gym.spaces.Box(
                        -1.0, 1.0, shape=value.shape[1:],
                        dtype=value.dtype))
        dummy_batch = self._dummy_batch
    else:

        def fake_array(tensor):
            # Zero-filled numpy stand-in; unknown dims become size 1.
            shape = tensor.shape.as_list()
            shape = [s if s is not None else 1 for s in shape]
            return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype)

        dummy_batch = {
            SampleBatch.CUR_OBS: fake_array(self._obs_input),
            SampleBatch.NEXT_OBS: fake_array(self._obs_input),
            # Builtin `bool`: the `np.bool` alias was removed in NumPy 1.24.
            SampleBatch.DONES: np.array([False], dtype=bool),
            SampleBatch.ACTIONS: fake_array(
                ModelCatalog.get_action_placeholder(self.action_space)),
            SampleBatch.REWARDS: np.array([0], dtype=np.float32),
        }
        if self._obs_include_prev_action_reward:
            dummy_batch.update({
                SampleBatch.PREV_ACTIONS: fake_array(
                    self._prev_action_input),
                SampleBatch.PREV_REWARDS: fake_array(
                    self._prev_reward_input),
            })
        state_init = self.get_initial_state()
        for i, h in enumerate(state_init):
            dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0)
            dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0)
        if state_init:
            dummy_batch["seq_lens"] = np.array([1], dtype=np.int32)
        for k, v in self.extra_compute_action_fetches().items():
            dummy_batch[k] = fake_array(v)
        dummy_batch = SampleBatch(dummy_batch)

    # Wrap in a usage-tracking dict so we learn which columns the
    # postprocessing functions read, add, or delete.
    batch_for_postproc = UsageTrackingDict(dummy_batch)
    batch_for_postproc.count = dummy_batch.count
    logger.info("Testing `postprocess_trajectory` w/ dummy batch.")
    self.exploration.postprocess_trajectory(self, batch_for_postproc,
                                            self._sess)
    postprocessed_batch = self.postprocess_trajectory(batch_for_postproc)
    # Add new columns automatically to (loss) input_dict.
    if self.config["_use_trajectory_view_api"]:
        for key in batch_for_postproc.added_keys:
            if key not in self._input_dict:
                self._input_dict[key] = get_placeholder(
                    value=batch_for_postproc[key], name=key)
            if key not in self.view_requirements:
                self.view_requirements[key] = \
                    ViewRequirement(space=gym.spaces.Box(
                        -1.0, 1.0, shape=batch_for_postproc[key].shape[1:],
                        dtype=batch_for_postproc[key].dtype))

    if not self.config["_use_trajectory_view_api"]:
        train_batch = UsageTrackingDict(
            dict({
                SampleBatch.CUR_OBS: self._obs_input,
            }, **self._loss_input_dict))
        if self._obs_include_prev_action_reward:
            train_batch.update({
                SampleBatch.PREV_ACTIONS: self._prev_action_input,
                SampleBatch.PREV_REWARDS: self._prev_reward_input,
                SampleBatch.CUR_OBS: self._obs_input,
            })

        for k, v in postprocessed_batch.items():
            if k in train_batch:
                continue
            # Builtin `object`: the `np.object` alias was removed in
            # NumPy 1.24.
            elif v.dtype == object:
                continue  # can't handle arbitrary objects in TF
            elif k == "seq_lens" or k.startswith("state_in_"):
                continue
            shape = (None, ) + v.shape[1:]
            dtype = np.float32 if v.dtype == np.float64 else v.dtype
            placeholder = tf1.placeholder(dtype, shape=shape, name=k)
            train_batch[k] = placeholder

        for i, si in enumerate(self._state_inputs):
            train_batch["state_in_{}".format(i)] = si
    else:
        train_batch = UsageTrackingDict(
            dict(self._input_dict, **self._loss_input_dict))

    if self._state_inputs:
        train_batch["seq_lens"] = self._seq_lens

    if log_once("loss_init"):
        logger.debug(
            "Initializing loss function with dummy input:\n\n{}\n".format(
                summarize(train_batch)))

    self._loss_input_dict.update({k: v for k, v in train_batch.items()})
    loss = self._do_loss_init(train_batch)

    # Everything read by the loss or by postprocessing, everything added
    # during postprocessing, plus whatever the model itself requires.
    all_accessed_keys = \
        train_batch.accessed_keys | batch_for_postproc.accessed_keys | \
        batch_for_postproc.added_keys | set(
            self.model.view_requirements.keys())

    TFPolicy._initialize_loss(
        self, loss,
        [(k, v) for k, v in train_batch.items() if k in all_accessed_keys])

    if "is_training" in self._loss_input_dict:
        del self._loss_input_dict["is_training"]

    # Call the grads stats fn.
    # TODO: (sven) rename to simply stats_fn to match eager and torch.
    if self._grad_stats_fn:
        self._stats_fetches.update(
            self._grad_stats_fn(self, train_batch, self._grads))

    # Add new columns automatically to view-reqs.
    if self.config["_use_trajectory_view_api"] and \
            auto_remove_unneeded_view_reqs:
        # Columns that must never be dropped, whether accessed or not.
        always_keep = [
            SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX,
            SampleBatch.UNROLL_ID, SampleBatch.DONES,
            SampleBatch.REWARDS, SampleBatch.INFOS]
        # Add those needed for postprocessing and training.
        all_accessed_keys = train_batch.accessed_keys | \
            batch_for_postproc.accessed_keys
        # Tag those only needed for post-processing (with some exceptions).
        for key in batch_for_postproc.accessed_keys:
            if key not in train_batch.accessed_keys and \
                    key not in self.model.view_requirements and \
                    key not in always_keep:
                if key in self.view_requirements:
                    self.view_requirements[key].used_for_training = False
                if key in self._loss_input_dict:
                    del self._loss_input_dict[key]
        # Remove those not needed at all (leave those that are needed
        # by Sampler to properly execute sample collection).
        # Also always leave DONES, REWARDS, and INFOS, no matter what.
        for key in list(self.view_requirements.keys()):
            if key not in all_accessed_keys and \
                    key not in always_keep and \
                    key not in self.model.view_requirements:
                # If user deleted this key manually in postprocessing
                # fn, warn about it and do not remove from
                # view-requirements.
                if key in batch_for_postproc.deleted_keys:
                    logger.warning(
                        "SampleBatch key '{}' was deleted manually in "
                        "postprocessing function! RLlib will "
                        "automatically remove non-used items from the "
                        "data stream. Remove the `del` from your "
                        "postprocessing function.".format(key))
                else:
                    del self.view_requirements[key]
                if key in self._loss_input_dict:
                    del self._loss_input_dict[key]
        # Add those data_cols (again) that are missing and have
        # dependencies by view_cols.
        for key in list(self.view_requirements.keys()):
            vr = self.view_requirements[key]
            if (vr.data_col is not None
                    and vr.data_col not in self.view_requirements):
                used_for_training = \
                    vr.data_col in train_batch.accessed_keys
                self.view_requirements[vr.data_col] = ViewRequirement(
                    space=vr.space, used_for_training=used_for_training)

    # Loss inputs without the RNN state-in and seq-lens placeholders.
    self._loss_input_dict_no_rnn = {
        k: v
        for k, v in self._loss_input_dict.items()
        if (v not in self._state_inputs and v != self._seq_lens)
    }

    # Initialize again after loss init.
    self._sess.run(tf1.global_variables_initializer())
import unittest

import ray
from ray.tune.registry import register_env, register_input, \
    registry_get_input, registry_contains_input
from ray.rllib.agents.pg import PGTrainer
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
from ray.rllib.offline import IOContext, JsonWriter, JsonReader, InputReader, \
    ShuffledInput
from ray.rllib.offline.json_writer import _to_json
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.test_utils import framework_iterator

# NOTE(review): `np` and `tempfile` are used below but are not imported in
# the lines visible here -- confirm they are imported further up the file.

# Module-level batch with four rows and the columns "actions", "obs" and
# "eps_id".
SAMPLES = SampleBatch({
    "actions": np.array([1, 2, 3, 4]),
    "obs": np.array([4, 5, 6, 7]),
    "eps_id": [1, 1, 2, 3],
})


def make_sample_batch(i):
    """Returns a SampleBatch whose "actions"/"obs" columns are 3 copies of `i`."""
    return SampleBatch({
        "actions": np.array([i, i, i]),
        "obs": np.array([i, i, i])
    })


class AgentIOTest(unittest.TestCase):
    """Test case that runs against a local ray instance and a temp dir."""

    def setUp(self):
        """Starts ray (1 CPU, re-init tolerated) and creates a temp dir."""
        ray.init(num_cpus=1, ignore_reinit_error=True)
        # Fresh temporary directory, kept on the instance as `test_dir`.
        self.test_dir = tempfile.mkdtemp()
def make_sample_batch(i):
    """Builds a three-row SampleBatch in which every entry equals `i`.

    Args:
        i: The value to fill the "actions" and "obs" columns with.

    Returns:
        A SampleBatch with the columns "actions" and "obs", each holding
        its own numpy array `[i, i, i]`.
    """
    columns = {}
    # Each column gets a separate array (no sharing between columns).
    for column_name in ("actions", "obs"):
        columns[column_name] = np.array([i, i, i])
    return SampleBatch(columns)
def test_episodes_unit(self):
    """Tests adding, sampling, and eviction with storage_unit="episodes"."""
    buffer = ReplayBuffer(capacity=18, storage_unit="episodes")

    def one_episode_batch(eps_id):
        # A single, terminated 4-step episode.
        return SampleBatch({
            SampleBatch.T: [0, 1, 2, 3],
            SampleBatch.ACTIONS: 4 * [np.random.choice([0, 1])],
            SampleBatch.REWARDS: 4 * [np.random.rand()],
            SampleBatch.DONES: [False, False, False, True],
            SampleBatch.SEQ_LENS: [4],
            SampleBatch.EPS_ID: 4 * [eps_id],
        })

    def two_episode_batch(first_id, second_id, second_is_done):
        # Two 2-step episodes; only the first is guaranteed to terminate.
        return SampleBatch({
            SampleBatch.T: [0, 1, 0, 1],
            SampleBatch.ACTIONS: 4 * [np.random.choice([0, 1])],
            SampleBatch.REWARDS: 4 * [np.random.rand()],
            SampleBatch.DONES: [False, True, False, second_is_done],
            SampleBatch.SEQ_LENS: [2, 2],
            SampleBatch.EPS_ID: [first_id, first_id, second_id, second_id],
        })

    def sampled_fractions(num_ids, draws=200):
        # Draw single items and return how often each episode id showed up,
        # as a fraction of all draws. Every draw must be exactly 1 sequence.
        hits = dict.fromkeys(range(num_ids), 0)
        for _ in range(draws):
            drawn = buffer.sample(1)
            eps_id = drawn[SampleBatch.EPS_ID][0]
            assert len(drawn[SampleBatch.SEQ_LENS]) == 1
            hits[eps_id] += 1
        return np.array(list(hits.values())) / draws

    # Build all initial batches first, then add them one by one.
    initial_batches = []
    for eps_id in range(3):
        initial_batches.append(one_episode_batch(eps_id))
    initial_batches.append(two_episode_batch(3, 4, True))
    for initial_batch in initial_batches:
        buffer.add(initial_batch)

    # Episodes 0-4 should be sampled equally often, regardless of which
    # batch they arrived in.
    assert np.allclose(sampled_fractions(5), 5 * [1 / 5], atol=0.1)

    # The second episode of this batch (id 6) does not terminate inside
    # the batch and should therefore not be stored.
    buffer.add(two_episode_batch(5, 6, False))

    # Ids 0-5 are sampled uniformly; id 6 never shows up.
    assert np.allclose(
        sampled_fractions(7), 6 * [1 / 6] + [0], atol=0.1)

    # One more full episode (id 7) triggers eviction of the oldest one.
    buffer.add(one_episode_batch(7))

    # 24 timesteps were offered in total, 2 of which were discarded.
    assert len(buffer) == 6
    assert buffer._num_timesteps_added == 4 * 6 - 2
    assert buffer._num_timesteps_added_wrap == 4
    assert buffer._next_idx == 1
    assert buffer._eviction_started is True

    # Id 0 got evicted, id 6 was never stored, all others are uniform.
    assert np.allclose(
        sampled_fractions(8),
        [0] + 5 * [1 / 6] + [0, 1 / 6],
        atol=0.1,
    )
def test_sequences_unit(self):
    """Tests adding, sampling, and eviction with storage_unit="sequences"."""
    buffer = ReplayBuffer(capacity=10, storage_unit="sequences")

    def sequence_batch(length, seq_lens):
        # `length` timesteps, all tagged with batch_id == length and split
        # into sequences according to `seq_lens`.
        return SampleBatch({
            SampleBatch.T: length * [np.random.random((4, ))],
            SampleBatch.ACTIONS: length * [np.random.choice([0, 1])],
            SampleBatch.REWARDS: length * [np.random.rand()],
            SampleBatch.DONES: length * [np.random.choice([False, True])],
            SampleBatch.SEQ_LENS: seq_lens,
            "batch_id": length * [length],
        })

    def sampled_fractions(batch_ids, draws=200):
        # Draw single items and return how often each batch id showed up,
        # as a fraction of all draws. Every draw must be exactly 1 sequence.
        hits = dict.fromkeys(batch_ids, 0)
        for _ in range(draws):
            drawn = buffer.sample(1)
            batch_id = drawn["batch_id"][0]
            assert len(drawn[SampleBatch.SEQ_LENS]) == 1
            hits[batch_id] += 1
        return np.array(list(hits.values())) / draws

    # Build all initial batches first, then add them one by one.
    initial_batches = []
    for length in range(1, 4):
        initial_batches.append(sequence_batch(length, [length]))
    initial_batches.append(sequence_batch(4, [2, 2]))
    for initial_batch in initial_batches:
        buffer.add(initial_batch)

    # Five sequences are stored. The last batch contributes two of them
    # (stored separately), so its id should come up twice as often.
    assert np.allclose(
        sampled_fractions(range(1, 5)),
        3 * [1 / 5] + [2 / 5],
        atol=0.1,
    )

    # Add another batch to evict.
    buffer.add(sequence_batch(5, [5]))

    # 15 timesteps were added in total and eviction has started.
    assert len(buffer) == 5
    assert buffer._num_timesteps_added == sum(range(1, 6))
    assert buffer._num_timesteps_added_wrap == 5
    assert buffer._next_idx == 1
    assert buffer._eviction_started is True

    # Batch 1 is gone; the remaining ones are sampled as before.
    assert np.allclose(
        sampled_fractions(range(2, 6)),
        [1 / 5, 1 / 5, 2 / 5, 1 / 5],
        atol=0.1,
    )
def compute_single_action(
        self,
        obs: Optional[TensorStructType] = None,
        state: Optional[List[TensorType]] = None,
        *,
        prev_action: Optional[TensorStructType] = None,
        prev_reward: Optional[TensorStructType] = None,
        info: Optional[dict] = None,
        input_dict: Optional[SampleBatch] = None,
        episode: Optional["Episode"] = None,
        explore: Optional[bool] = None,
        timestep: Optional[int] = None,
        # Kwargs placeholder for future compatibility.
        **kwargs,
) -> Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]]:
    """Computes and returns a single (B=1) action value.

    Takes an input dict (usually a SampleBatch) as its main data input.
    This allows for using this method in case a more complex input pattern
    (view requirements) is needed, for example when the Model requires the
    last n observations, the last m actions/rewards, or a combination
    of any of these.
    Alternatively, in case no complex inputs are required, takes a single
    `obs` values (and possibly single state values, prev-action/reward
    values, etc..).

    Args:
        obs: Single observation.
        state: List of RNN state inputs, if any.
        prev_action: Previous action value, if any.
        prev_reward: Previous reward, if any.
        info: Info object, if any.
        input_dict: A SampleBatch or input dict containing the
            single (unbatched) Tensors to compute actions. If given, it'll
            be used instead of `obs`, `state`, `prev_action|reward`, and
            `info`. A top-level "seq_lens" entry is passed on as-is (no
            batch dimension is added to it).
        episode: This provides access to all of the internal episode state,
            which may be useful for model-based or multi-agent algorithms.
        explore: Whether to pick an exploitation or
            exploration action
            (default: None -> use self.config["explore"]).
        timestep: The current (sampling) time step.

    Keyword Args:
        kwargs: Forward compatibility placeholder.

    Returns:
        Tuple consisting of the action, the list of RNN state outputs (if
        any), and a dictionary of extra features (if any).
    """
    # Build the input-dict used for the call to
    # `self.compute_actions_from_input_dict()`.
    if input_dict is None:
        input_dict = {SampleBatch.OBS: obs}
        if state is not None:
            for i, s in enumerate(state):
                input_dict[f"state_in_{i}"] = s
        if prev_action is not None:
            input_dict[SampleBatch.PREV_ACTIONS] = prev_action
        if prev_reward is not None:
            input_dict[SampleBatch.PREV_REWARDS] = prev_reward
        if info is not None:
            input_dict[SampleBatch.INFOS] = info

    def _add_batch_dim(path, leaf):
        # `tree.map_structure_with_path` passes `path` as a tuple of keys,
        # so the top-level "seq_lens" entry arrives as `("seq_lens",)`.
        # Comparing against the bare string alone would never match.
        if path in ("seq_lens", ("seq_lens", )):
            return leaf
        if torch and isinstance(leaf, torch.Tensor):
            return leaf.unsqueeze(0)
        return np.expand_dims(leaf, 0)

    # Batch all data in input dict.
    input_dict = tree.map_structure_with_path(_add_batch_dim, input_dict)

    episodes = None
    if episode is not None:
        episodes = [episode]

    out = self.compute_actions_from_input_dict(
        input_dict=SampleBatch(input_dict),
        episodes=episodes,
        explore=explore,
        timestep=timestep,
    )

    # Some policies don't return a tuple, but always just a single action.
    # E.g. ES and ARS.
    if not isinstance(out, tuple):
        single_action = out
        state_out = []
        extra_fetches = {}
    # Normal case: Policy should return (action, state, info) tuple.
    else:
        batched_action, state_out, extra_fetches = out
        single_action = unbatch(batched_action)
    assert len(single_action) == 1
    single_action = single_action[0]

    # Return action, internal state(s), infos (all with the B=1 dim
    # stripped again).
    return (
        single_action,
        [s[0] for s in state_out],
        {k: v[0] for k, v in extra_fetches.items()},
    )
def actor_critic_loss(
        policy: Policy,
        model: ModelV2,
        dist_class: Type[TorchDistributionWrapper],
        train_batch: SampleBatch,
) -> Union[TensorType, List[TensorType]]:
    """Builds the Soft Actor Critic loss terms for one model (tower).

    Args:
        policy: The Policy to calculate the loss for.
        model: The Model to calculate the loss for.
        dist_class: The action distribution class (not used in this
            function's body).
        train_batch: The training data.

    Returns:
        A tuple of loss tensors: the actor loss, the critic loss(es) (two
        if `twin_q` is configured), and the alpha loss.
    """
    config = policy.config
    # Look up the target model (tower) using the model tower.
    target_model = policy.target_models[model]

    # Should be True only for debugging purposes (e.g. test cases)!
    deterministic = config["_deterministic_loss"]

    obs_t = train_batch[SampleBatch.CUR_OBS]
    model_out_t, _ = model(SampleBatch(obs=obs_t, _is_training=True), [], None)

    model_out_tp1, _ = model(
        SampleBatch(obs=train_batch[SampleBatch.NEXT_OBS], _is_training=True),
        [], None)

    target_model_out_tp1, _ = target_model(
        SampleBatch(obs=train_batch[SampleBatch.NEXT_OBS], _is_training=True),
        [], None)

    alpha = torch.exp(model.log_alpha)
    use_twin_q = config["twin_q"]

    if model.discrete:
        # Discrete case: the policy outputs all action (log-)probs directly.
        action_dist_inputs_t, _ = model.get_action_model_outputs(model_out_t)
        log_pis_t = F.log_softmax(action_dist_inputs_t, dim=-1)
        policy_t = torch.exp(log_pis_t)
        action_dist_inputs_tp1, _ = model.get_action_model_outputs(
            model_out_tp1)
        log_pis_tp1 = F.log_softmax(action_dist_inputs_tp1, dim=-1)
        policy_tp1 = torch.exp(log_pis_tp1)
        # Q-values (current net) and target Q-values (target net).
        q_t, _ = model.get_q_values(model_out_t)
        q_tp1, _ = target_model.get_q_values(target_model_out_tp1)
        if use_twin_q:
            twin_q_t, _ = model.get_twin_q_values(model_out_t)
            twin_q_tp1, _ = target_model.get_twin_q_values(
                target_model_out_tp1)
            q_tp1 = torch.min(q_tp1, twin_q_tp1)
        q_tp1 -= alpha * log_pis_tp1

        # Pick the Q-values of the actions actually taken in the batch.
        taken_actions = F.one_hot(
            train_batch[SampleBatch.ACTIONS].long(),
            num_classes=q_t.size()[-1])
        q_t_selected = (q_t * taken_actions).sum(dim=-1)
        if use_twin_q:
            twin_q_t_selected = (twin_q_t * taken_actions).sum(dim=-1)
        # "Best" here means: weighted by the policy's action probs.
        q_tp1_best = (policy_tp1 * q_tp1).sum(dim=-1)
    else:
        # Continuous case: sample single actions from the distribution.
        action_dist_class = _get_dist_class(policy, policy.config,
                                            policy.action_space)
        action_dist_inputs_t, _ = model.get_action_model_outputs(model_out_t)
        action_dist_t = action_dist_class(action_dist_inputs_t, model)
        if deterministic:
            policy_t = action_dist_t.deterministic_sample()
        else:
            policy_t = action_dist_t.sample()
        log_pis_t = torch.unsqueeze(action_dist_t.logp(policy_t), -1)

        action_dist_inputs_tp1, _ = model.get_action_model_outputs(
            model_out_tp1)
        action_dist_tp1 = action_dist_class(action_dist_inputs_tp1, model)
        if deterministic:
            policy_tp1 = action_dist_tp1.deterministic_sample()
        else:
            policy_tp1 = action_dist_tp1.sample()
        log_pis_tp1 = torch.unsqueeze(action_dist_tp1.logp(policy_tp1), -1)

        # Q-values for the actions stored in the batch.
        batch_actions = train_batch[SampleBatch.ACTIONS]
        q_t, _ = model.get_q_values(model_out_t, batch_actions)
        if use_twin_q:
            twin_q_t, _ = model.get_twin_q_values(model_out_t, batch_actions)

        # Q-values for the current policy's actions in the current state.
        q_t_det_policy, _ = model.get_q_values(model_out_t, policy_t)
        if use_twin_q:
            twin_q_t_det_policy, _ = model.get_twin_q_values(
                model_out_t, policy_t)
            q_t_det_policy = torch.min(q_t_det_policy, twin_q_t_det_policy)

        # Target Q-net evaluation (min over both twins, if configured).
        q_tp1, _ = target_model.get_q_values(target_model_out_tp1, policy_tp1)
        if use_twin_q:
            twin_q_tp1, _ = target_model.get_twin_q_values(
                target_model_out_tp1, policy_tp1)
            q_tp1 = torch.min(q_tp1, twin_q_tp1)

        q_t_selected = torch.squeeze(q_t, dim=-1)
        if use_twin_q:
            twin_q_t_selected = torch.squeeze(twin_q_t, dim=-1)
        q_tp1 -= alpha * log_pis_tp1
        q_tp1_best = torch.squeeze(input=q_tp1, dim=-1)

    # Zero out the bootstrap value wherever the episode was done.
    q_tp1_best_masked = (
        1.0 - train_batch[SampleBatch.DONES].float()) * q_tp1_best

    # RHS of the Bellman equation (no gradients flow through the target).
    discount = config["gamma"]**config["n_step"]
    q_t_selected_target = (
        train_batch[SampleBatch.REWARDS] +
        discount * q_tp1_best_masked).detach()

    # Absolute TD-errors, one per Q-net; the reported one is their average.
    base_td_error = torch.abs(q_t_selected - q_t_selected_target)
    per_net_td_errors = [base_td_error]
    if use_twin_q:
        twin_td_error = torch.abs(twin_q_t_selected - q_t_selected_target)
        per_net_td_errors.append(twin_td_error)
        td_error = 0.5 * (base_td_error + twin_td_error)
    else:
        td_error = base_td_error

    # One importance-weighted huber loss per Q-net.
    critic_loss = [
        torch.mean(train_batch[PRIO_WEIGHTS] * huber_loss(net_td_error))
        for net_td_error in per_net_td_errors
    ]

    # Alpha- and actor losses.
    # Note: The loss is formulated on log(alpha), not on alpha itself.
    entropy_gap = (log_pis_t + model.target_entropy).detach()
    if model.discrete:
        # Weight each per-action term with its (detached) action prob, sum
        # over actions, then average over the batch.
        weighted_log_alpha_loss = policy_t.detach() * (
            -model.log_alpha * entropy_gap)
        alpha_loss = weighted_log_alpha_loss.sum(dim=-1).mean()
        # NOTE: `policy_t` is deliberately not detached in the actor loss
        # (compare with q_t_det_policy in the continuous case).
        actor_terms = policy_t * (alpha.detach() * log_pis_t - q_t.detach())
        actor_loss = actor_terms.sum(dim=-1).mean()
    else:
        alpha_loss = -torch.mean(model.log_alpha * entropy_gap)
        # q_t_det_policy stays attached: it depends on the policy vars via
        # the sampled action. The Q-net vars must not be updated by
        # `actor_loss`, though.
        actor_loss = torch.mean(alpha.detach() * log_pis_t - q_t_det_policy)

    # Keep stats on the model (tower) itself, so that parallel multi-GPU
    # loss computations do not overwrite each other's values.
    # `td_error` is kept per batch item.
    for stat_name, stat_value in (
            ("q_t", q_t),
            ("policy_t", policy_t),
            ("log_pis_t", log_pis_t),
            ("actor_loss", actor_loss),
            ("critic_loss", critic_loss),
            ("alpha_loss", alpha_loss),
            ("td_error", td_error),
    ):
        model.tower_stats[stat_name] = stat_value

    # Return all loss terms corresponding to our optimizers.
    return tuple([actor_loss] + critic_loss + [alpha_loss])
def postprocess_trajectory(self,
                           policy: "Policy",
                           sample_batch: SampleBatch,
                           tf_sess: Optional["tf.Session"] = None):
    """Adapts the noise stddev from the noisy vs. noise-free action distance.

    Computes the policy's action-distribution outputs on the batch's
    observations twice (once with `explore` set to the current noise state,
    once with the opposite), measures the distance between both results,
    and grows or shrinks `self.stddev_val` by a factor of 1.01 accordingly.

    Args:
        policy: The Policy whose actions are compared.
        sample_batch: The batch providing observations (and, if present,
            previous actions/rewards).
        tf_sess: Optional tf session, handed to `sub_exploration.get_info`
            and used when loading the new stddev value (framework "tf").

    Returns:
        The unchanged `sample_batch`.

    Raises:
        NotImplementedError: If the policy's dist class is neither
            categorical nor deterministic.
    """

    def action_dist_for(explore):
        # TODO(sven): Find out whether this can be scrapped by simply using
        #  the `sample_batch` to get the noisy/noise-free action dist.
        # TODO(sven): What about state-ins and seq-lens?
        _, _, fetches = policy.compute_actions(
            obs_batch=sample_batch[SampleBatch.CUR_OBS],
            prev_action_batch=sample_batch.get(SampleBatch.PREV_ACTIONS),
            prev_reward_batch=sample_batch.get(SampleBatch.PREV_REWARDS),
            explore=explore)
        # Categorical case (e.g. DQN).
        if policy.dist_class in (Categorical, TorchCategorical):
            return softmax(fetches[SampleBatch.ACTION_DIST_INPUTS])
        # Deterministic (Gaussian actions, e.g. DDPG).
        if policy.dist_class in [Deterministic, TorchDeterministic]:
            return fetches[SampleBatch.ACTION_DIST_INPUTS]
        raise NotImplementedError  # TODO(sven): Other action-dist cases.

    noisy_action_dist = None
    noise_free_action_dist = None

    # First pass: matches the weights' current noise state.
    # Also see [1] for details.
    first_pass = action_dist_for(self.weights_are_currently_noisy)
    if self.weights_are_currently_noisy:
        noisy_action_dist = first_pass
    else:
        noise_free_action_dist = first_pass

    # Second pass: the opposite setting; fills whichever slot is still open.
    second_pass = action_dist_for(not self.weights_are_currently_noisy)
    if noisy_action_dist is None:
        noisy_action_dist = second_pass
    else:
        noise_free_action_dist = second_pass

    delta = None
    distance = None
    # Categorical case (e.g. DQN).
    if policy.dist_class in (Categorical, TorchCategorical):
        # KL-divergence (DKL(clean||noisy)) according to [2].
        # TODO(sven): Allow KL-divergence to be calculated by our
        #  Distribution classes (don't support off-graph/numpy yet).
        log_ratio = np.log(
            noise_free_action_dist / (noisy_action_dist + SMALL_NUMBER))
        distance = np.nanmean(np.sum(noise_free_action_dist * log_ratio, 1))
        current_epsilon = self.sub_exploration.get_info(
            sess=tf_sess)["cur_epsilon"]
        delta = -np.log(1 - current_epsilon +
                        current_epsilon / self.action_space.n)
    elif policy.dist_class in [Deterministic, TorchDeterministic]:
        # Root of the mean squared difference between noisy and noise-free
        # outputs (see [2]).
        squared_diff = np.square(noise_free_action_dist - noisy_action_dist)
        distance = np.sqrt(np.mean(squared_diff))
        current_scale = self.sub_exploration.get_info(
            sess=tf_sess)["cur_scale"]
        sigma = getattr(self.sub_exploration, "ou_sigma", 0.2)
        delta = sigma * current_scale

    # Grow the stddev while the distance is within `delta`, else shrink it.
    if distance <= delta:
        self.stddev_val *= 1.01
    else:
        self.stddev_val /= 1.01

    # Publish the new value through `self.stddev`.
    if self.framework == "tf":
        self.stddev.load(self.stddev_val, session=tf_sess)
    else:
        self.stddev = self.stddev_val

    return sample_batch