def testSaveWrappedPolicyRestoreOuterCheckAssertConsumed(self, batch_size=5):
  actor_policy_save_path = os.path.join(self.get_temp_dir(), 'actor_policy',
                                        str(batch_size))
  noise_policy_save_path = os.path.join(self.get_temp_dir(), 'noise_policy',
                                        str(batch_size))

  # Construct a policy to be saved under a tf.Graph instance.
  policy_saved_graph = tf.Graph()
  with policy_saved_graph.as_default():
    actor_network = DummyActionNet(self._obs_spec, self._float_action_spec)
    wrapped_policy = actor_policy.ActorPolicy(
        time_step_spec=self._time_step_spec,
        action_spec=self._float_action_spec,
        actor_network=actor_network,
        clip=False)
    tf_policy = ou_noise_policy.OUNoisePolicy(wrapped_policy)

    # Save the exploration policy and the wrapped actor policy.
    actor_policy_saved = py_tf_policy.PyTFPolicy(wrapped_policy)
    noise_policy_saved = py_tf_policy.PyTFPolicy(tf_policy)
    for policy_saved, policy_save_path in zip(
        [actor_policy_saved, noise_policy_saved],
        [actor_policy_save_path, noise_policy_save_path]):
      policy_saved.session = tf.compat.v1.Session(graph=policy_saved_graph)
      policy_saved.initialize(batch_size)
      policy_saved.save(policy_dir=policy_save_path, graph=policy_saved_graph)

  # Construct a policy to be restored under another tf.Graph instance.
  policy_restore_graph = tf.Graph()
  with policy_restore_graph.as_default():
    actor_network = DummyActionNet(self._obs_spec, self._float_action_spec)
    wrapped_policy = actor_policy.ActorPolicy(
        time_step_spec=self._time_step_spec,
        action_spec=self._float_action_spec,
        actor_network=actor_network,
        clip=False)
    tf_policy = ou_noise_policy.OUNoisePolicy(wrapped_policy)
    policy_restored = py_tf_policy.PyTFPolicy(tf_policy)
    policy_restored.session = tf.compat.v1.Session(graph=policy_restore_graph)
    policy_restored.initialize(batch_size)

    # 1). Restore the same noise policy that was saved.
    policy_restored.restore(
        policy_dir=noise_policy_save_path, graph=policy_restore_graph)

    # 2). Restore the actor policy inside the noise policy. Although the
    # restore graph contains an additional local variable for the OU noise,
    # this also works as long as we do not check that the checkpoint was
    # fully consumed.
    policy_restored.restore(
        policy_dir=actor_policy_save_path,
        graph=policy_restore_graph,
        assert_consumed=False)

    # 3). Restoring the actor policy while checking that all variables in
    # the checkpoint were found in the graph should fail.
    with self.assertRaisesRegexp(
        AssertionError,
        'Some Python objects were not bound to checkpointed values*'):
      policy_restored.restore(
          policy_dir=actor_policy_save_path, graph=policy_restore_graph)
def testActionIsInRange(self):
  policy = ou_noise_policy.OUNoisePolicy(self._wrapped_policy)
  action_step = policy.action(self._time_step_batch)
  self.assertEqual(action_step.action.shape.as_list(), [2, 1])
  self.assertEqual(action_step.action.dtype, tf.float32)

  self.evaluate(tf.global_variables_initializer())
  self.evaluate(tf.local_variables_initializer())
  actions_ = self.evaluate(action_step.action)
  self.assertTrue(np.all(actions_ >= self._action_spec.minimum))
  self.assertTrue(np.all(actions_ <= self._action_spec.maximum))
def testActionAddsOUNoise(self):
  policy = ou_noise_policy.OUNoisePolicy(self._wrapped_policy, clip=False)
  action_step = policy.action(self._time_step_batch)
  wrapped_action_step = self._wrapped_policy.action(self._time_step_batch)

  self.evaluate(tf.global_variables_initializer())
  self.evaluate(tf.local_variables_initializer())
  actions_ = self.evaluate(action_step.action)
  wrapped_policy_actions_ = self.evaluate(wrapped_action_step.action)

  self.assertTrue(np.linalg.norm(actions_ - wrapped_policy_actions_) > 0)
def testActionList(self):
  action_spec = [self._action_spec]
  actor_network = DummyActionNet(self._obs_spec, action_spec)
  self._wrapped_policy = actor_policy.ActorPolicy(
      time_step_spec=self._time_step_spec,
      action_spec=action_spec,
      actor_network=actor_network,
      clip=False)
  policy = ou_noise_policy.OUNoisePolicy(self._wrapped_policy)

  action_step = policy.action(self._time_step_batch)
  self.assertEqual(action_step.action[0].shape.as_list(), [2, 1])
  self.assertEqual(action_step.action[0].dtype, tf.float32)

  self.evaluate(tf.global_variables_initializer())
  self.evaluate(tf.local_variables_initializer())
  actions_ = self.evaluate(action_step.action)
  self.assertTrue(np.all(actions_[0] >= self._action_spec.minimum))
  self.assertTrue(np.all(actions_[0] <= self._action_spec.maximum))
def testBuild(self):
  policy = ou_noise_policy.OUNoisePolicy(self._wrapped_policy)

  self.assertEqual(policy.time_step_spec(), self._time_step_spec)
  self.assertEqual(policy.action_spec(), self._action_spec)
  self.assertEqual(len(policy.variables()), 2)
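# The tests above exercise OUNoisePolicy as a thin wrapper that perturbs the
# wrapped actor policy's actions with Ornstein-Uhlenbeck noise. As a rough
# illustration (not tf_agents code), the damped OU update that the
# ou_stddev / ou_damping arguments parameterize can be sketched in NumPy as
# follows; the function name and loop are hypothetical.
import numpy as np

def ou_noise_sketch(num_steps, ou_stddev=1.0, ou_damping=1.0, seed=0):
  """Illustrative damped OU process: x_{t+1} = (1 - damping) * x_t + N(0, stddev)."""
  rng = np.random.default_rng(seed)
  x = 0.0
  samples = []
  for _ in range(num_steps):
    x = (1.0 - ou_damping) * x + rng.normal(0.0, ou_stddev)
    samples.append(x)
  return np.array(samples)

# With ou_damping=1.0 the noise is i.i.d. Gaussian; smaller damping values
# make consecutive noise samples correlated, which gives the collect policies
# below smoother exploration.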
def __init__(self,
             time_step_spec,
             action_spec,
             actor_network,
             critic_network,
             actor_optimizer,
             critic_optimizer,
             ou_stddev=1.0,
             ou_damping=1.0,
             target_update_tau=1.0,
             target_update_period=1,
             dqda_clipping=None,
             td_errors_loss_fn=None,
             gamma=1.0,
             reward_scale_factor=1.0,
             target_policy_noise=0.2,
             target_policy_noise_clip=0.5,
             gradient_clipping=None,
             debug_summaries=False,
             summarize_grads_and_vars=False):
  """Creates a Td3Agent Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    actor_network: A tf_agents.network.Network to be used by the agent. The
      network will be called with call(observation, step_type).
    critic_network: A tf_agents.network.Network to be used by the agent. The
      network will be called with call(observation, action, step_type).
    actor_optimizer: The default optimizer to use for the actor network.
    critic_optimizer: The default optimizer to use for the critic network.
    ou_stddev: Standard deviation for the Ornstein-Uhlenbeck (OU) noise added
      in the default collect policy.
    ou_damping: Damping factor for the OU noise added in the default collect
      policy.
    target_update_tau: Factor for soft update of the target networks.
    target_update_period: Period for soft update of the target networks.
    dqda_clipping: A scalar or float that clips the gradient dqda element-wise
      between [-dqda_clipping, dqda_clipping]. Default is None, representing
      no clipping.
    td_errors_loss_fn: A function for computing the TD errors loss. If None, a
      default value of elementwise huber_loss is used.
    gamma: A discount factor for future rewards.
    reward_scale_factor: Multiplicative scale for the reward.
    target_policy_noise: Scale factor on target action noise.
    target_policy_noise_clip: Value to clip noise.
    gradient_clipping: Norm length to clip gradients.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If True, gradient and network variable summaries
      will be written during training.
  """
  self._actor_network = actor_network
  self._target_actor_network = actor_network.copy(name='TargetActorNetwork')

  self._critic_network_1 = critic_network
  self._target_critic_network_1 = critic_network.copy(
      name='TargetCriticNetwork1')

  self._critic_network_2 = critic_network.copy(name='CriticNetwork2')
  self._target_critic_network_2 = critic_network.copy(
      name='TargetCriticNetwork2')

  self._actor_optimizer = actor_optimizer
  self._critic_optimizer = critic_optimizer

  # TODO(kewa): better variable names.
  self._ou_stddev = ou_stddev
  self._ou_damping = ou_damping
  self._target_update_tau = target_update_tau
  self._target_update_period = target_update_period
  self._dqda_clipping = dqda_clipping
  self._td_errors_loss_fn = (
      td_errors_loss_fn or common_utils.element_wise_huber_loss)
  self._gamma = gamma
  self._reward_scale_factor = reward_scale_factor
  self._target_policy_noise = target_policy_noise
  self._target_policy_noise_clip = target_policy_noise_clip
  self._gradient_clipping = gradient_clipping

  policy = actor_policy.ActorPolicy(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      actor_network=self._actor_network,
      clip=True)
  collect_policy = actor_policy.ActorPolicy(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      actor_network=self._actor_network,
      clip=False)
  collect_policy = ou_noise_policy.OUNoisePolicy(
      collect_policy,
      ou_stddev=self._ou_stddev,
      ou_damping=self._ou_damping,
      clip=True)

  super(Td3Agent, self).__init__(
      time_step_spec,
      action_spec,
      policy,
      collect_policy,
      train_sequence_length=2 if not self._actor_network.state_spec else None,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars)
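# Td3Agent's target_policy_noise / target_policy_noise_clip arguments describe
# TD3-style target policy smoothing: Gaussian noise scaled by
# target_policy_noise is clipped to [-target_policy_noise_clip,
# target_policy_noise_clip] before being added to the target actor's action.
# A minimal NumPy sketch of that clipping, purely for illustration (the
# function name is hypothetical; the agent applies this inside its critic-loss
# computation, which is not shown in this excerpt):
import numpy as np

def smoothed_target_action_sketch(target_action,
                                  target_policy_noise=0.2,
                                  target_policy_noise_clip=0.5,
                                  seed=0):
  """Adds clipped Gaussian noise to a target action, elementwise."""
  rng = np.random.default_rng(seed)
  noise = rng.normal(0.0, target_policy_noise, size=np.shape(target_action))
  noise = np.clip(noise, -target_policy_noise_clip, target_policy_noise_clip)
  return np.asarray(target_action) + noise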
def __init__(self,
             time_step_spec,
             action_spec,
             actor_network,
             critic_network,
             actor_optimizer=None,
             critic_optimizer=None,
             ou_stddev=1.0,
             ou_damping=1.0,
             target_actor_network=None,
             target_critic_network=None,
             target_update_tau=1.0,
             target_update_period=1,
             dqda_clipping=None,
             td_errors_loss_fn=None,
             gamma=1.0,
             reward_scale_factor=1.0,
             gradient_clipping=None,
             debug_summaries=False,
             summarize_grads_and_vars=False,
             train_step_counter=None,
             name=None):
  """Creates a DDPG Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    actor_network: A tf_agents.network.Network to be used by the agent. The
      network will be called with call(observation, step_type[, policy_state])
      and should return (action, new_state).
    critic_network: A tf_agents.network.Network to be used by the agent. The
      network will be called with call((observation, action), step_type[,
      policy_state]) and should return (q_value, new_state).
    actor_optimizer: The optimizer to use for the actor network.
    critic_optimizer: The optimizer to use for the critic network.
    ou_stddev: Standard deviation for the Ornstein-Uhlenbeck (OU) noise added
      in the default collect policy.
    ou_damping: Damping factor for the OU noise added in the default collect
      policy.
    target_actor_network: (Optional.) A `tf_agents.network.Network` to be used
      as the actor target network during Q learning. Every
      `target_update_period` train steps, the weights from `actor_network` are
      copied (possibly with smoothing via `target_update_tau`) to
      `target_actor_network`. If `target_actor_network` is not provided, it is
      created by making a copy of `actor_network`, which initializes a new
      network with the same structure and its own layers and weights.
      Performing a `Network.copy` does not work when the network instance
      already has trainable parameters (e.g., has already been built, or when
      the network is sharing layers with another). In these cases, it is up to
      you to build a copy having weights that are not shared with the original
      `actor_network`, so that this can be used as a target network. If you
      provide a `target_actor_network` that shares any weights with
      `actor_network`, a warning will be logged but no exception is thrown.
    target_critic_network: (Optional.) Similar to target_actor_network, but
      for the critic_network. See the documentation for target_actor_network.
    target_update_tau: Factor for soft update of the target networks.
    target_update_period: Period for soft update of the target networks.
    dqda_clipping: When computing the actor loss, clips the gradient dqda
      element-wise between [-dqda_clipping, dqda_clipping]. Does not perform
      clipping if dqda_clipping == 0.
    td_errors_loss_fn: A function for computing the TD errors loss. If None, a
      default value of elementwise huber_loss is used.
    gamma: A discount factor for future rewards.
    reward_scale_factor: Multiplicative scale for the reward.
    gradient_clipping: Norm length to clip gradients.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If True, gradient and network variable summaries
      will be written during training.
    train_step_counter: An optional counter to increment every time the train
      op is run. Defaults to the global_step.
    name: The name of this agent. All variables in this module will fall under
      that name. Defaults to the class name.
""" tf.Module.__init__(self, name=name) self._actor_network = actor_network actor_network.create_variables() if target_actor_network: target_actor_network.create_variables() self._target_actor_network = common.maybe_copy_target_network_with_checks( self._actor_network, target_actor_network, 'TargetActorNetwork') self._critic_network = critic_network critic_network.create_variables() if target_critic_network: target_critic_network.create_variables() self._target_critic_network = common.maybe_copy_target_network_with_checks( self._critic_network, target_critic_network, 'TargetCriticNetwork') self._actor_optimizer = actor_optimizer self._critic_optimizer = critic_optimizer self._ou_stddev = ou_stddev self._ou_damping = ou_damping self._target_update_tau = target_update_tau self._target_update_period = target_update_period self._dqda_clipping = dqda_clipping self._td_errors_loss_fn = (td_errors_loss_fn or common.element_wise_huber_loss) self._gamma = gamma self._reward_scale_factor = reward_scale_factor self._gradient_clipping = gradient_clipping self._update_target = self._get_target_updater(target_update_tau, target_update_period) """Nitty: change time_step_spec to that of individual agent from total spec""" individual_time_step_spec = ts.get_individual_time_step_spec( time_step_spec) policy = actor_policy.ActorPolicy( time_step_spec=individual_time_step_spec, action_spec=action_spec, actor_network=self._actor_network, clip=True) collect_policy = actor_policy.ActorPolicy( time_step_spec=individual_time_step_spec, action_spec=action_spec, actor_network=self._actor_network, clip=False) # policy = actor_policy.ActorPolicy( # time_step_spec=time_step_spec, action_spec=action_spec, # actor_network=self._actor_network, clip=True) # collect_policy = actor_policy.ActorPolicy( # time_step_spec=time_step_spec, action_spec=action_spec, # actor_network=self._actor_network, clip=False) collect_policy = ou_noise_policy.OUNoisePolicy( collect_policy, ou_stddev=self._ou_stddev, ou_damping=self._ou_damping, clip=True) super(DdpgAgent, self).__init__(time_step_spec, action_spec, policy, collect_policy, train_sequence_length=2 if not self._actor_network.state_spec else None, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=train_step_counter)
def __init__(self,
             time_step_spec,
             action_spec,
             load_model_path=None,
             save_model_path=None,
             ou_stddev=0.0,
             ou_damping=1.0,
             target_update_tau=0.05,
             target_update_period=5,
             max_episode_steps=None,
             ensemble_size=3,
             combine_ensemble_method='min',
             distance_type='distributional'):
  tf.Module.__init__(self, name='UvfAgent')
  assert max_episode_steps is not None
  self._max_episode_steps = max_episode_steps
  self._ensemble_size = ensemble_size
  self._distance_type = distance_type

  self._actor_network = GoalConditionedActorNetwork(
      time_step_spec.observation, action_spec)
  self._target_actor_network = self._actor_network.copy(
      name='TargetActorNetwork')

  critic_net_input_specs = (time_step_spec.observation, action_spec)
  critic_network = GoalConditionedCriticNetwork(
      critic_net_input_specs,
      output_dim=max_episode_steps
      if distance_type == 'distributional' else None)
  self._critic_network_list = []
  self._target_critic_network_list = []
  for ensemble_index in range(self._ensemble_size):
    self._critic_network_list.append(
        critic_network.copy(name='CriticNetwork%d' % ensemble_index))
    self._target_critic_network_list.append(
        critic_network.copy(name='TargetCriticNetwork%d' % ensemble_index))

  net_list = [
      self._actor_network, self._target_actor_network
  ] + self._critic_network_list + self._target_critic_network_list
  for net in net_list:
    net.create_variables()

  self._actor_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-4)
  self._critic_optimizer = tf.compat.v1.train.AdamOptimizer(
      learning_rate=1e-4)

  self._train_iter = tf.Variable(0)
  mix_dict = self.model_variable
  self.load_model(load_model_path, save_model_path, mix_dict)

  self._ou_stddev = ou_stddev
  self._ou_damping = ou_damping
  self._target_update_tau = target_update_tau
  self._target_update_period = target_update_period
  self._update_target = self._get_target_updater(target_update_tau,
                                                 target_update_period)

  policy = actor_policy.ActorPolicy(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      actor_network=self._actor_network,
      clip=True)
  collect_policy = actor_policy.ActorPolicy(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      actor_network=self._actor_network,
      clip=False)
  # noise: x = (1 - damping) * x + N(0, std)
  collect_policy = ou_noise_policy.OUNoisePolicy(
      collect_policy,
      ou_stddev=self._ou_stddev,
      ou_damping=self._ou_damping,
      clip=True)

  super(UvfAgent, self).__init__(
      time_step_spec,
      action_spec,
      policy,
      collect_policy,
      train_sequence_length=2)
def __init__(self,
             time_step_spec,
             action_spec,
             actor_network,
             critic_network,
             actor_optimizer=None,
             critic_optimizer=None,
             ou_stddev=1.0,
             ou_damping=1.0,
             target_update_tau=1.0,
             target_update_period=1,
             dqda_clipping=None,
             td_errors_loss_fn=None,
             gamma=1.0,
             reward_scale_factor=1.0,
             gradient_clipping=None,
             debug_summaries=False,
             summarize_grads_and_vars=False,
             train_step_counter=None,
             name=None):
  """Creates a DDPG Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    actor_network: A tf_agents.network.Network to be used by the agent. The
      network will be called with call(observation, step_type).
    critic_network: A tf_agents.network.Network to be used by the agent. The
      network will be called with call(observation, action, step_type).
    actor_optimizer: The optimizer to use for the actor network.
    critic_optimizer: The optimizer to use for the critic network.
    ou_stddev: Standard deviation for the Ornstein-Uhlenbeck (OU) noise added
      in the default collect policy.
    ou_damping: Damping factor for the OU noise added in the default collect
      policy.
    target_update_tau: Factor for soft update of the target networks.
    target_update_period: Period for soft update of the target networks.
    dqda_clipping: When computing the actor loss, clips the gradient dqda
      element-wise between [-dqda_clipping, dqda_clipping]. Does not perform
      clipping if dqda_clipping == 0.
    td_errors_loss_fn: A function for computing the TD errors loss. If None, a
      default value of elementwise huber_loss is used.
    gamma: A discount factor for future rewards.
    reward_scale_factor: Multiplicative scale for the reward.
    gradient_clipping: Norm length to clip gradients.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If True, gradient and network variable summaries
      will be written during training.
    train_step_counter: An optional counter to increment every time the train
      op is run. Defaults to the global_step.
    name: The name of this agent. All variables in this module will fall under
      that name. Defaults to the class name.
""" tf.Module.__init__(self, name=name) self._actor_network = actor_network self._target_actor_network = self._actor_network.copy( name='TargetActorNetwork') self._critic_network = critic_network self._target_critic_network = self._critic_network.copy( name='TargetCriticNetwork') self._actor_optimizer = actor_optimizer self._critic_optimizer = critic_optimizer self._ou_stddev = ou_stddev self._ou_damping = ou_damping self._target_update_tau = target_update_tau self._target_update_period = target_update_period self._dqda_clipping = dqda_clipping self._td_errors_loss_fn = (td_errors_loss_fn or common.element_wise_huber_loss) self._gamma = gamma self._reward_scale_factor = reward_scale_factor self._gradient_clipping = gradient_clipping self._update_target = self._get_target_updater(target_update_tau, target_update_period) policy = actor_policy.ActorPolicy(time_step_spec=time_step_spec, action_spec=action_spec, actor_network=self._actor_network, clip=True) collect_policy = actor_policy.ActorPolicy( time_step_spec=time_step_spec, action_spec=action_spec, actor_network=self._actor_network, clip=False) collect_policy = ou_noise_policy.OUNoisePolicy( collect_policy, ou_stddev=self._ou_stddev, ou_damping=self._ou_damping, clip=True) super(DdpgAgent, self).__init__(time_step_spec, action_spec, policy, collect_policy, train_sequence_length=2 if not self._actor_network.state_spec else None, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=train_step_counter)
def __init__(self,
             time_step_spec,
             action_spec,
             ou_stddev=1.0,
             ou_damping=1.0,
             target_update_tau=0.05,
             target_update_period=5,
             max_episode_steps=None,
             ensemble_size=3,
             combine_ensemble_method='min',
             use_distributional_rl=True):
  """Creates a Uvf Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    ou_stddev: Standard deviation for the Ornstein-Uhlenbeck (OU) noise added
      in the default collect policy.
    ou_damping: Damping factor for the OU noise added in the default collect
      policy.
    target_update_tau: Factor for soft update of the target networks.
    target_update_period: Period for soft update of the target networks.
    max_episode_steps: Int indicating the number of steps in an episode. Used
      for determining the number of bins for distributional RL.
    ensemble_size: (int) Number of models in the ensemble of critics.
    combine_ensemble_method: (str) At test time, how to combine the distances
      predicted by each member of the ensemble. Options are 'mean', 'min', and
      'td3'. The 'td3' option is pessimistic w.r.t. the pdf and then computes
      the corresponding distance. The 'min' option takes the minimum q values,
      corresponding to taking the maximum predicted distance. Note that we
      never aggregate predictions during training.
    use_distributional_rl: (bool) Whether to use distributional RL.
  """
  tf.Module.__init__(self, name='UvfAgent')
  assert max_episode_steps is not None
  self._max_episode_steps = max_episode_steps
  self._ensemble_size = ensemble_size
  self._use_distributional_rl = use_distributional_rl

  # Create the actor.
  self._actor_network = GoalConditionedActorNetwork(
      time_step_spec.observation, action_spec)
  self._target_actor_network = self._actor_network.copy(
      name='TargetActorNetwork')

  # Create a prototypical critic, which we will copy to create the ensemble.
  critic_net_input_specs = (time_step_spec.observation, action_spec)
  critic_network = GoalConditionedCriticNetwork(
      critic_net_input_specs,
      output_dim=max_episode_steps if use_distributional_rl else None,
  )
  self._critic_network_list = []
  self._target_critic_network_list = []
  for ensemble_index in range(self._ensemble_size):
    self._critic_network_list.append(
        critic_network.copy(name='CriticNetwork%d' % ensemble_index))
    self._target_critic_network_list.append(
        critic_network.copy(name='TargetCriticNetwork%d' % ensemble_index))

  self._actor_optimizer = tf.train.AdamOptimizer(learning_rate=3e-4)
  self._critic_optimizer = tf.train.AdamOptimizer(learning_rate=3e-4)

  self._ou_stddev = ou_stddev
  self._ou_damping = ou_damping
  self._target_update_tau = target_update_tau
  self._target_update_period = target_update_period
  self._update_target = self._get_target_updater(target_update_tau,
                                                 target_update_period)

  policy = actor_policy.ActorPolicy(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      actor_network=self._actor_network,
      clip=True)
  collect_policy = actor_policy.ActorPolicy(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      actor_network=self._actor_network,
      clip=False)
  collect_policy = ou_noise_policy.OUNoisePolicy(
      collect_policy,
      ou_stddev=self._ou_stddev,
      ou_damping=self._ou_damping,
      clip=True)

  super(UvfAgent, self).__init__(
      time_step_spec,
      action_spec,
      policy,
      collect_policy,
      train_sequence_length=2)
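# The combine_ensemble_method docstring above describes how the UvfAgent's
# critic ensemble is aggregated at test time. A minimal NumPy sketch of the
# 'min' and 'mean' options, assuming each critic produces one scalar Q-value
# per state-action pair; the function name is hypothetical and 'td3' is
# omitted because its pdf-level aggregation is not shown in this excerpt.
import numpy as np

def combine_ensemble_sketch(q_values, method='min'):
  """q_values: array of shape [ensemble_size, batch_size]."""
  q_values = np.asarray(q_values)
  if method == 'min':
    # Minimum Q-value across the ensemble, i.e. the maximum predicted distance.
    return q_values.min(axis=0)
  elif method == 'mean':
    return q_values.mean(axis=0)
  raise ValueError('Unsupported combine_ensemble_method: %s' % method)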
def main(_):
  # Setting up.
  start_time = time.time()
  tf.compat.v1.enable_resource_variables()
  tf.compat.v1.disable_eager_execution()
  logging.set_verbosity(logging.INFO)
  global observation_omit_size, goal_coord, sample_count, iter_count, episode_size_buffer, episode_return_buffer

  root_dir = os.path.abspath(os.path.expanduser(FLAGS.logdir))
  if not tf.io.gfile.exists(root_dir):
    tf.io.gfile.makedirs(root_dir)
  log_dir = os.path.join(root_dir, FLAGS.environment)
  if not tf.io.gfile.exists(log_dir):
    tf.io.gfile.makedirs(log_dir)
  save_dir = os.path.join(log_dir, "models")
  if not tf.io.gfile.exists(save_dir):
    tf.io.gfile.makedirs(save_dir)

  print("directory for recording experiment data:", log_dir)

  # In case training is paused and resumed, restore the counters and buffers.
  try:
    sample_count = np.load(os.path.join(log_dir, "sample_count.npy")).tolist()
    iter_count = np.load(os.path.join(log_dir, "iter_count.npy")).tolist()
    episode_size_buffer = np.load(
        os.path.join(log_dir, "episode_size_buffer.npy")).tolist()
    episode_return_buffer = np.load(
        os.path.join(log_dir, "episode_return_buffer.npy")).tolist()
  except:
    sample_count = 0
    iter_count = 0
    episode_size_buffer = []
    episode_return_buffer = []

  train_summary_writer = tf.compat.v2.summary.create_file_writer(
      os.path.join(log_dir, "train", "in_graph_data"), flush_millis=10 * 1000)
  train_summary_writer.set_as_default()

  global_step = tf.compat.v1.train.get_or_create_global_step()
  with tf.compat.v2.summary.record_if(True):
    # Environment-related setup.
    env = do.get_environment(env_name=FLAGS.environment)
    py_env = wrap_env(
        skill_wrapper.SkillWrapper(
            env,
            num_latent_skills=FLAGS.num_skills,
            skill_type=FLAGS.skill_type,
            preset_skill=None,
            min_steps_before_resample=FLAGS.min_steps_before_resample,
            resample_prob=FLAGS.resample_prob,
        ),
        max_episode_steps=FLAGS.max_env_steps,
    )

    # All specifications required for the networks and agents.
    py_action_spec = py_env.action_spec()
    tf_action_spec = tensor_spec.from_spec(
        py_action_spec)  # policy, critic action spec
    env_obs_spec = py_env.observation_spec()
    py_env_time_step_spec = ts.time_step_spec(
        env_obs_spec)  # replay buffer time_step spec
    if observation_omit_size > 0:
      agent_obs_spec = array_spec.BoundedArraySpec(
          (env_obs_spec.shape[0] - observation_omit_size,),
          env_obs_spec.dtype,
          minimum=env_obs_spec.minimum,
          maximum=env_obs_spec.maximum,
          name=env_obs_spec.name,
      )  # policy, critic observation spec
    else:
      agent_obs_spec = env_obs_spec
    py_agent_time_step_spec = ts.time_step_spec(
        agent_obs_spec)  # policy, critic time_step spec
    tf_agent_time_step_spec = tensor_spec.from_spec(py_agent_time_step_spec)

    if not FLAGS.reduced_observation:
      skill_dynamics_observation_size = (
          py_env_time_step_spec.observation.shape[0] - FLAGS.num_skills)
    else:
      skill_dynamics_observation_size = FLAGS.reduced_observation

    # TODO(architsh): Shift coordinate hiding to actor_net and critic_net
    # (good for further image-based processing as well).
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        tf_agent_time_step_spec.observation,
        tf_action_spec,
        fc_layer_params=(FLAGS.hidden_layer_size,) * 2,
        continuous_projection_net=do._normal_projection_net,
    )
    critic_net = critic_network.CriticNetwork(
        (tf_agent_time_step_spec.observation, tf_action_spec),
        observation_fc_layer_params=None,
        action_fc_layer_params=None,
        joint_fc_layer_params=(FLAGS.hidden_layer_size,) * 2,
    )

    if (FLAGS.skill_dynamics_relabel_type is not None and
        "importance_sampling" in FLAGS.skill_dynamics_relabel_type and
        FLAGS.is_clip_eps > 1.0):
      reweigh_batches_flag = True
    else:
      reweigh_batches_flag = False
    agent = dads_agent.DADSAgent(
        # DADS parameters
        save_dir,
        skill_dynamics_observation_size,
        observation_modify_fn=do.process_observation,
        restrict_input_size=observation_omit_size,
        latent_size=FLAGS.num_skills,
        latent_prior=FLAGS.skill_type,
        prior_samples=FLAGS.random_skills,
        fc_layer_params=(FLAGS.hidden_layer_size,) * 2,
        normalize_observations=FLAGS.normalize_data,
        network_type=FLAGS.graph_type,
        num_mixture_components=FLAGS.num_components,
        fix_variance=FLAGS.fix_variance,
        reweigh_batches=reweigh_batches_flag,
        skill_dynamics_learning_rate=FLAGS.skill_dynamics_lr,
        # SAC parameters
        time_step_spec=tf_agent_time_step_spec,
        action_spec=tf_action_spec,
        actor_network=actor_net,
        critic_network=critic_net,
        target_update_tau=0.005,
        target_update_period=1,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=FLAGS.agent_lr),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=FLAGS.agent_lr),
        alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=FLAGS.agent_lr),
        td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
        gamma=FLAGS.agent_gamma,
        reward_scale_factor=1.0 / (FLAGS.agent_entropy + 1e-12),
        gradient_clipping=None,
        debug_summaries=FLAGS.debug,
        train_step_counter=global_step,
    )

    # Evaluation policy.
    eval_policy = py_tf_policy.PyTFPolicy(agent.policy)

    # Collection policy.
    if FLAGS.collect_policy == "default":
      collect_policy = py_tf_policy.PyTFPolicy(agent.collect_policy)
    elif FLAGS.collect_policy == "ou_noise":
      collect_policy = py_tf_policy.PyTFPolicy(
          ou_noise_policy.OUNoisePolicy(
              agent.collect_policy, ou_stddev=0.2, ou_damping=0.15))

    # The relabelling policy deals with batches of data, unlike collect and eval.
    relabel_policy = py_tf_policy.PyTFPolicy(agent.collect_policy)

    # Constructing a replay buffer requires a python spec.
    policy_step_spec = policy_step.PolicyStep(
        action=py_action_spec, state=(), info=())

    if (FLAGS.skill_dynamics_relabel_type is not None and
        "importance_sampling" in FLAGS.skill_dynamics_relabel_type and
        FLAGS.is_clip_eps > 1.0):
      policy_step_spec = policy_step_spec._replace(
          info=policy_step.set_log_probability(
              policy_step_spec.info,
              array_spec.ArraySpec(
                  shape=(), dtype=np.float32, name="action_log_prob"),
          ))

    trajectory_spec = from_transition(py_env_time_step_spec, policy_step_spec,
                                      py_env_time_step_spec)
    capacity = FLAGS.replay_buffer_capacity
    # Buffer for all the collected data.
    rbuffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
        capacity=capacity, data_spec=trajectory_spec)

    if FLAGS.train_skill_dynamics_on_policy:
      # Buffer for on-policy data (if something special is required).
      on_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
          capacity=FLAGS.initial_collect_steps + FLAGS.collect_steps + 10,
          data_spec=trajectory_spec,
      )

    # Insert experience manually with relabelled rewards and skills.
    agent.build_agent_graph()
    agent.build_skill_dynamics_graph()
    agent.create_savers()

    # Saving this way requires the saver to live outside the agent object.
    train_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(save_dir, "agent"),
        agent=agent,
        global_step=global_step,
    )
    policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(save_dir, "policy"),
        policy=agent.policy,
        global_step=global_step,
    )
    rb_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(save_dir, "replay_buffer"),
        max_to_keep=1,
        replay_buffer=rbuffer,
    )

    setup_time = time.time() - start_time
    print("Setup time:", setup_time)

    with tf.compat.v1.Session().as_default() as sess:
      eval_policy.session = sess
      eval_policy.initialize(None)
      eval_policy.restore(os.path.join(FLAGS.logdir, "models", "policy"))
      plotdir = os.path.join(FLAGS.logdir, "plots")
      if not os.path.exists(plotdir):
        os.mkdir(plotdir)
      do.FLAGS = FLAGS
      do.eval_loop(eval_dir=plotdir, eval_policy=eval_policy, plot_name="plot")