def _relabel_given_goal(relabel_goal):
  """Re-adds the current episode to `full_buffer`, relabelled with a new goal.

  Relies on `all_data`, `full_buffer`, `last_step` and `reward_fn` from the
  enclosing scope. Every stored step whose `step_type` is not 2 is copied with
  the trailing part of its observation replaced by `relabel_goal` and its
  reward recomputed from the relabelled *next* observation. Copying stops at
  the first step whose relabelled reward is positive. A closing entry with
  `step_type` 2 is always appended afterwards.

  Args:
    relabel_goal: 1-D tensor holding the goal to substitute; its length is used
      as the size of the state part of an observation.
  """
  obs_dim = relabel_goal.shape[0]
  steps = nest_utils.unstack_nested_tensors(all_data, full_buffer.data_spec)
  num_steps = len(steps)

  def _observation_at(index):
    """Goal-relabelled observation at `index`; `num_steps` means `last_step`."""
    if index == num_steps:
      state = last_step.observation[0, :obs_dim]
    else:
      state = steps[index].observation[:obs_dim]
    return tf.concat([state, relabel_goal], axis=0)

  # Index of the observation that closes the relabelled episode. Defaults to
  # one past the stored steps, i.e. the observation carried by `last_step`.
  closing_idx = num_steps
  for position, step in enumerate(steps):
    if step.step_type.numpy() == 2:
      continue

    relabelled_obs = _observation_at(position)
    relabelled_reward = tf.constant(
        reward_fn(obs=_observation_at(position + 1)))

    # A positive reward terminates the relabelled episode at this step.
    goal_reached = relabelled_reward.numpy() > 0.0
    if goal_reached:
      relabelled_step = step._replace(
          observation=relabelled_obs,
          next_step_type=tf.constant(2),
          reward=relabelled_reward,
          discount=tf.constant(0., dtype=tf.float32))
      closing_idx = position + 1
    else:
      relabelled_step = step._replace(
          observation=relabelled_obs,
          reward=relabelled_reward,
      )

    full_buffer.add_batch(nest_utils.batch_nested_tensors(relabelled_step))
    if goal_reached:
      break

  closing_observation = _observation_at(closing_idx)
  # `step` is whichever element the loop visited last; it only serves as a
  # template whose fields are overwritten below.
  closing_step = step._replace(  # pylint: disable=undefined-loop-variable
      step_type=tf.constant(2),
      observation=closing_observation,
      next_step_type=tf.constant(0),
      reward=tf.constant(0.0),
      discount=tf.constant(1., dtype=tf.float32))
  full_buffer.add_batch(nest_utils.batch_nested_tensors(closing_step))
def testUnstackNestedTensors(self):
  """Unstacking a batched nest yields one spec-shaped nest per batch entry."""
  item_shape = [5, 8]
  num_items = 7

  specs = self.nest_spec(item_shape, include_sparse=False)
  batched = self.zeros_from_spec(specs, batch_size=num_items)
  tf.nest.assert_same_structure(batched, specs)

  unstacked = nest_utils.unstack_nested_tensors(batched, specs)

  # One output nest per batch entry, each structured like the spec.
  self.assertEqual(num_items, len(unstacked))
  for item in unstacked:
    tf.nest.assert_same_structure(specs, item)

  def _check_item_shape(tensor):
    # The batch dimension must be gone from every leaf tensor.
    return self.assertEqual(tensor.shape.as_list(), item_shape)

  tf.nest.map_structure(_check_item_shape, unstacked)
def loop_body(counter, time_step, policy_state):
  """Runs a step in environment.

  While loop will call multiple times.

  After stepping the environment and notifying the observers, the per-batch
  episode counter is advanced. Once no counter entry is below 1, every episode
  held in the temporary replay buffer is copied into the final replay buffer
  and the temporary buffer is cleared. Episodes whose last `task_agn_rew`
  observation equals 1 first get that observation replaced by a label
  schedule: zeros everywhere except the final
  `min(self._ep_history_unsafe, episode_length)` steps, which are filled
  according to `self._unsafe_label` ('constant', 'exp' or 'linear').

  Args:
    counter: Episode counters per batch index. Shape [batch_size].
    time_step: TimeStep tuple with elements shape [batch_size, ...].
    policy_state: Policy state tensor shape [batch_size, policy_state_dim].
      Pass empty tuple for non-recurrent policies.

  Returns:
    loop_vars for next iteration of tf.while_loop.

  Raises:
    ValueError: If an episode needs relabelling and `self._unsafe_label` is
      not one of 'constant', 'exp' or 'linear'.
  """
  action_step = self.policy.action(time_step, policy_state)

  # TODO: TF2 while_loop seems to either ignore
  # parallel_iterations or doesn't properly propagate control dependencies
  # from one step to the next. Without this dep, self.env.step() is called
  # in parallel.
  with tf.control_dependencies(tf.nest.flatten([time_step])):
    next_time_step = self.env.step(action_step.action)

  policy_state = action_step.state

  if self._is_bandit_env:
    # For Bandits we create episodes of length 1.
    # Since the `next_time_step` is always of type LAST we need to replace
    # the step type of the current `time_step` to FIRST.
    batch_size = tf.shape(input=time_step.discount)
    time_step = time_step._replace(
        step_type=tf.fill(batch_size, ts.StepType.FIRST))

  traj = trajectory.from_transition(time_step, action_step, next_time_step)
  observer_ops = [observer(traj) for observer in self._observers]
  transition_observer_ops = [
      observer((time_step, action_step, next_time_step))
      for observer in self._transition_observers
  ]
  # Tie the returned loop variables to the observer ops so the observers are
  # guaranteed to run before the next iteration consumes them.
  with tf.control_dependencies(
      [tf.group(observer_ops + transition_observer_ops)]):
    time_step, next_time_step, policy_state = tf.nest.map_structure(
        tf.identity, (time_step, next_time_step, policy_state))

  # While loop counter is only incremented for episode reset episodes.
  # For Bandits, this is every trajectory, for MDPs, this is at boundaries.
  if self._is_bandit_env:
    counter += tf.ones(batch_size, dtype=tf.int32)
  else:
    counter += tf.cast(traj.is_boundary(), dtype=tf.int32)

  # NOTE(review): using a tensor as a Python bool only works when this body
  # executes eagerly -- confirm against how the enclosing loop is run.
  if not tf.reduce_any(tf.less(counter, 1)):
    # all episodes have finished:
    for ep_id in range(self._num_episodes):
      episode = self._temp_rb._get_episode(ep_id)
      if episode.observation['task_agn_rew'][-1] == 1:
        rew_type = episode.observation['task_agn_rew'].dtype
        ep_len = episode.observation['task_agn_rew'].shape[0]
        # `-start` is the number of trailing steps that receive a label,
        # capped at the episode length.
        start = max(-self._ep_history_unsafe, -ep_len)
        if self._unsafe_label == 'constant':
          discount = tf.ones((-start,), dtype=rew_type)
        elif self._unsafe_label == 'exp':
          discount = 0.99**tf.reverse(
              tf.range(-start, dtype=rew_type), axis=[0])
        elif self._unsafe_label == 'linear':
          discount = (tf.range(-start, dtype=rew_type) + 1) / -start
        else:
          # Fail with a clear message instead of an unbound `discount`.
          raise ValueError(
              'Unsupported unsafe_label {!r}; expected one of '
              "'constant', 'exp' or 'linear'.".format(self._unsafe_label))
        # Left-pad with zeros so the labels line up with the episode's end.
        discount = tf.pad(discount, [[ep_len + start, 0]])
        obs = episode.observation
        obs['task_agn_rew'] = discount
        # `_replace` returns a new tuple; keep it so the relabelled
        # observation does not depend on `obs` aliasing the episode's dict.
        episode = episode._replace(observation=obs)
      episode_steps = nest_utils.unstack_nested_tensors(
          episode, self._final_rb.data_spec)
      for episode_step in episode_steps:
        self._final_rb.add_batch(episode_step)
    self._temp_rb.clear()

  return [counter, next_time_step, policy_state]
def copy_replay_buffer(small_buffer, big_buffer):
  """Appends every item stored in `small_buffer` to `big_buffer`.

  Args:
    small_buffer: Source buffer; its whole content is read via `gather_all()`.
    big_buffer: Destination buffer; its `data_spec` drives the unstacking and
      each item is written with `add_batch`.
  """
  gathered = small_buffer.gather_all()
  unbatched = nest_utils.unbatch_nested_tensors(gathered)
  items = nest_utils.unstack_nested_tensors(unbatched, big_buffer.data_spec)
  for item in items:
    # Each unstacked item gets its batch dimension back before being written.
    big_buffer.add_batch(nest_utils.batch_nested_tensors(item))
def data_multiplier(offline_data, reward_fn):
  """Augments `offline_data` in place with goal-relabelled episode prefixes.

  The buffer content is read once up front. Every stored entry whose
  `step_type` is not 0 is then used as a relabelling target: the first half of
  its observation becomes the new goal. The episode prefix from the most recent
  `step_type == 0` entry up to that target is re-added to `offline_data` with
  the goal half of each observation replaced and the reward recomputed by
  `reward_fn` on the relabelled next observation. A prefix is cut at the first
  positive relabelled reward and is always followed by a closing entry with
  `step_type` 2.

  Args:
    offline_data: Replay buffer providing `gather_all()`, `data_spec` and
      `add_batch()`. New entries are appended to this same buffer.
    reward_fn: Callable invoked as `reward_fn(obs=...)` on a relabelled
      observation; a result greater than 0 ends the relabelled episode.

  Raises:
    IndexError: If a relabelled prefix needs the observation following the
      last stored entry.
  """
  all_data = nest_utils.unbatch_nested_tensors(offline_data.gather_all())
  all_trajs = nest_utils.unstack_nested_tensors(all_data,
                                                offline_data.data_spec)

  # NOTE(review): `ep_start_idx` is only bound once an entry with
  # step_type == 0 has been seen; presumably the buffer always starts with one
  # -- confirm against the callers.
  for idx, traj in enumerate(all_trajs):
    if traj.step_type.numpy() == 0:
      ep_start_idx = idx
      continue

    obs_dim = traj.observation.shape[0] // 2
    relabel_goal = traj.observation[:obs_dim]

    # Absolute index (into `all_trajs`) of the observation closing the
    # relabelled episode: by default the entry right after the prefix. It must
    # be absolute because it indexes `all_trajs` directly below.
    last_traj_idx = idx + 1
    for abs_idx, cur_trajectory in enumerate(
        all_trajs[ep_start_idx:idx + 1], start=ep_start_idx):
      if cur_trajectory.step_type.numpy() == 2:
        continue

      new_obs = tf.concat(
          [cur_trajectory.observation[:obs_dim], relabel_goal], axis=0)
      next_obs = tf.concat(
          [all_trajs[abs_idx + 1].observation[:obs_dim], relabel_goal],
          axis=0)
      new_reward = tf.constant(reward_fn(obs=next_obs))

      # terminate episode
      if new_reward.numpy() > 0.0:
        new_traj = cur_trajectory._replace(
            observation=new_obs,
            next_step_type=tf.constant(2),
            reward=new_reward,
            discount=tf.constant(0., dtype=tf.float32))
        last_traj_idx = abs_idx + 1
        offline_data.add_batch(nest_utils.batch_nested_tensors(new_traj))
        break

      new_traj = cur_trajectory._replace(
          observation=new_obs,
          reward=new_reward,
      )
      offline_data.add_batch(nest_utils.batch_nested_tensors(new_traj))

    last_observation = tf.concat(
        [all_trajs[last_traj_idx].observation[:obs_dim], relabel_goal],
        axis=0)
    # The last visited entry is only a template; every field that matters for
    # the closing entry is overwritten.
    last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
        step_type=tf.constant(2),
        observation=last_observation,
        next_step_type=tf.constant(0),
        reward=tf.constant(0.0),
        discount=tf.constant(1., dtype=tf.float32))
    offline_data.add_batch(nest_utils.batch_nested_tensors(last_traj))
def relabel_function(cur_episode, last_step, reward_fn, full_buffer):
  """Copies an episode into `full_buffer` and optionally adds relabelled copies.

  The episode is first written as collected: every entry whose `step_type` is
  not 2, followed by a closing entry built from `last_step`. Depending on
  `FLAGS.goal_relabel_type`, goal-relabelled copies are then appended:

    * 0: always relabel with the first half of `last_step`'s observation.
    * 1: same as 0, but only when `last_step.reward[0] <= 0`.
    * 2: only when `last_step.reward[0] <= 0`; relabel with goals drawn in
      random order from a fixed list, skipping a goal equal to the second half
      of `last_step`'s observation, until `FLAGS.num_relabelled_goals` copies
      were added (the limit is checked after each copy).

  In every other case a message is printed and nothing is relabelled.

  Args:
    cur_episode: Buffer holding the episode; read via `gather_all()`.
    last_step: Final time step of the episode. Indexed with a leading
      dimension (`observation[0]`, `reward.numpy()[0]`).
    reward_fn: Callable invoked as `reward_fn(obs=...)` on a relabelled
      observation; a result greater than 0 ends the relabelled episode.
    full_buffer: Destination buffer; provides `data_spec` and `add_batch()`.
  """
  all_data = cur_episode.gather_all()
  # add all actual interaction to the replay buffer
  all_data = nest_utils.unbatch_nested_tensors(all_data)
  for cur_trajectory in nest_utils.unstack_nested_tensors(
      all_data, full_buffer.data_spec):
    # was already added by previous iteration
    if cur_trajectory.step_type.numpy() != 2:
      full_buffer.add_batch(
          nest_utils.batch_nested_tensors(cur_trajectory))

  # Closing entry: reuses the last loop element as a template and overwrites
  # its fields with the observation carried by `last_step`.
  # NOTE(review): `cur_trajectory` is unbound if the episode is empty --
  # presumably callers never pass an empty episode; confirm.
  last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
      step_type=tf.constant(2),
      observation=last_step.observation[0],
      next_step_type=tf.constant(0),
      reward=tf.constant(0.0),
      discount=tf.constant(1., dtype=tf.float32))
  full_buffer.add_batch(nest_utils.batch_nested_tensors(last_traj))

  def _relabel_given_goal(relabel_goal):
    """Adds a copy of the episode to `full_buffer` relabelled to `relabel_goal`.

    The first `obs_dim` entries of each observation are kept and
    `relabel_goal` is appended to them. Rewards are recomputed with
    `reward_fn` on the relabelled next observation; the copy stops at the
    first positive reward and is followed by a closing entry.
    """
    obs_dim = relabel_goal.shape[0]
    all_trajectories = nest_utils.unstack_nested_tensors(
        all_data, full_buffer.data_spec)
    # Index of the observation closing the relabelled episode; the list length
    # stands for "use `last_step`".
    last_traj_idx = len(all_trajectories)
    for traj_idx, cur_trajectory in enumerate(all_trajectories):
      if cur_trajectory.step_type.numpy() != 2:
        new_obs = tf.concat(
            [cur_trajectory.observation[:obs_dim], relabel_goal], axis=0)

        # The next observation of the final stored entry comes from
        # `last_step`, otherwise from the following stored entry.
        if traj_idx == len(all_trajectories) - 1:
          next_obs = tf.concat(
              [last_step.observation[0, :obs_dim], relabel_goal], axis=0)
        else:
          next_obs = tf.concat([
              all_trajectories[traj_idx + 1].observation[:obs_dim],
              relabel_goal
          ],
                               axis=0)

        new_reward = tf.constant(reward_fn(obs=next_obs))

        # terminate episode
        if new_reward.numpy() > 0.0:
          new_traj = cur_trajectory._replace(
              observation=new_obs,
              next_step_type=tf.constant(2),
              reward=new_reward,
              discount=tf.constant(0., dtype=tf.float32))
          last_traj_idx = traj_idx + 1
          full_buffer.add_batch(
              nest_utils.batch_nested_tensors(new_traj))
          break
        else:
          new_traj = cur_trajectory._replace(
              observation=new_obs,
              reward=new_reward,
          )
          full_buffer.add_batch(
              nest_utils.batch_nested_tensors(new_traj))

    if last_traj_idx == len(all_trajectories):
      last_observation = tf.concat(
          [last_step.observation[0, :obs_dim], relabel_goal], axis=0)
    else:
      last_observation = tf.concat([
          all_trajectories[last_traj_idx].observation[:obs_dim],
          relabel_goal
      ],
                                   axis=0)

    # Closing entry of the relabelled episode, built from the last visited
    # element with all relevant fields overwritten.
    last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
        step_type=tf.constant(2),
        observation=last_observation,
        next_step_type=tf.constant(0),
        reward=tf.constant(0.0),
        discount=tf.constant(1., dtype=tf.float32))
    full_buffer.add_batch(nest_utils.batch_nested_tensors(last_traj))

  # relabel with last time step achieved in the episode
  if FLAGS.goal_relabel_type == 0 or (FLAGS.goal_relabel_type == 1 and
                                      last_step.reward.numpy()[0] <= 0.):
    obs_dim = last_step.observation.shape[1] // 2
    _relabel_given_goal(last_step.observation[0, :obs_dim])

  elif FLAGS.goal_relabel_type == 2 and last_step.reward.numpy()[0] <= 0.:
    # Fixed candidate goals, each with 6 entries.
    goals = [
        [1.2, 0., 2.5, 0., -1., -1.],
        [2., 0., 2.4, 0., 0., 0.],
        [0.8, 0., 1.2, 0., 0., 0.],
        [-0.1, -0.3, 0.3, -0.3, 0., 0.],
        [-0.6, -1., -0.2, -1., 0., 0.],
        [-1.8, -1., -1.4, -1., 0., 0.],
        [-2.8, -0.8, -2.4, -1., -1., -1.],
        [-2.4, 0., -2.4, -1., -1., -1.],
        [-1.2, 0., -2.4, -1., -1., -1.],
        [0.0, 0.0, -2.5, -1, -1., -1.],
    ]
    goals = np.stack(goals).astype('float32')
    # NOTE(review): the hard-coded offset 6 presumably equals `obs_dim`
    # computed below (candidate goals have 6 entries) -- confirm.
    print('unrelabelled goal:', last_step.observation[0, 6:].numpy())
    # Visit the candidate goals in random order.
    relabel_goal_idxs = np.arange(goals.shape[0])
    np.random.shuffle(relabel_goal_idxs)
    obs_dim = last_step.observation.shape[1] // 2

    relabel_count = 0
    for goal_idx in relabel_goal_idxs:
      chosen_goal = goals[goal_idx]
      # Skip the goal the episode was already collected with.
      if (chosen_goal == last_step.observation[0, obs_dim:].numpy()).all():
        continue
      print('goal for relabelling:', chosen_goal)
      _relabel_given_goal(relabel_goal=tf.constant(chosen_goal))

      # The limit is checked after adding, so at least one relabelled copy is
      # written whenever a differing goal exists.
      relabel_count += 1
      if relabel_count >= FLAGS.num_relabelled_goals:
        break

  else:
    print('not adding relabelled trajectories')
def copy_replay_buffer(small_buffer, big_buffer):
  """Copy small buffer into the big buffer.

  Args:
    small_buffer: Source buffer; its whole content is read via `gather_all()`.
    big_buffer: Destination buffer; its `data_spec` drives the unstacking and
      each item is written with `add_batch`.
  """
  all_data = nest_utils.unbatch_nested_tensors(small_buffer.gather_all())
  for trajectory in nest_utils.unstack_nested_tensors(  # pylint: disable=redefined-outer-name
      all_data, big_buffer.data_spec):
    # Unstacking strips the outer dimension, while `add_batch` expects items
    # with a leading batch dimension, so restore it before writing.
    big_buffer.add_batch(nest_utils.batch_nested_tensors(trajectory))