def testActorLoss(self, num_bc_steps, expected_loss):
  agent = cql_sac_agent.CqlSacAgent(
      self._time_step_spec,
      self._action_spec,
      critic_network=DummyCriticNet(),
      actor_network=DummyActorNet(self._obs_spec, self._action_spec),
      actor_optimizer=None,
      critic_optimizer=None,
      alpha_optimizer=None,
      cql_alpha=1.0,
      num_cql_samples=1,
      include_critic_entropy_term=False,
      use_lagrange_cql_alpha=False,
      num_bc_steps=num_bc_steps,
      actor_policy_ctor=DummyActorPolicy)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([[5], [6]], dtype=tf.float32)

  loss = agent.actor_loss(time_steps, actions)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_ = self.evaluate(loss)
  self.assertAllClose(loss_, expected_loss)
def testAgentTransitionTrain(self):
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      self._obs_spec,
      self._action_spec,
      fc_layer_params=(10,),
      continuous_projection_net=tanh_normal_projection_network
      .TanhNormalProjectionNetwork)

  agent = cql_sac_agent.CqlSacAgent(
      self._time_step_spec,
      self._action_spec,
      critic_network=DummyCriticNet(),
      actor_network=actor_net,
      actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      alpha_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      cql_alpha=5.0,
      num_cql_samples=1,
      include_critic_entropy_term=False,
      use_lagrange_cql_alpha=False)

  time_step_spec = self._time_step_spec._replace(
      reward=tensor_spec.BoundedTensorSpec(
          [], tf.float32, minimum=0.0, maximum=1.0, name='reward'))

  transition_spec = trajectory.Transition(
      time_step=time_step_spec,
      action_step=policy_step.PolicyStep(
          action=self._action_spec, state=(), info=()),
      next_time_step=time_step_spec)

  sample_trajectory_experience = tensor_spec.sample_spec_nest(
      transition_spec, outer_dims=(3,))
  agent.train(sample_trajectory_experience)
def testCqlLoss(self, cql_alpha, num_cql_samples, expected_loss):
  agent = cql_sac_agent.CqlSacAgent(
      self._time_step_spec,
      self._action_spec,
      critic_network=DummyCriticNet(),
      actor_network=None,
      actor_optimizer=None,
      critic_optimizer=None,
      alpha_optimizer=None,
      cql_alpha=cql_alpha,
      num_cql_samples=num_cql_samples,
      include_critic_entropy_term=False,
      use_lagrange_cql_alpha=False,
      random_seed=self._random_seed,
      actor_policy_ctor=DummyActorPolicy)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([[5], [6]], dtype=tf.float32)

  loss = agent._cql_loss(
      time_steps, actions, training=False) * agent._get_cql_alpha()

  self.initialize_v1_variables()
  loss_ = self.evaluate(loss)
  self.assertAllClose(loss_, expected_loss)
def testCriticLoss(self, include_critic_entropy_term, reward_noise_variance,
                   use_tf_variable, td_targets):
  if use_tf_variable:
    reward_noise_variance = tf.Variable(reward_noise_variance)
  agent = cql_sac_agent.CqlSacAgent(
      self._time_step_spec,
      self._action_spec,
      critic_network=DummyCriticNet(),
      actor_network=None,
      actor_optimizer=None,
      critic_optimizer=None,
      alpha_optimizer=None,
      cql_alpha=1.0,
      num_cql_samples=1,
      include_critic_entropy_term=include_critic_entropy_term,
      use_lagrange_cql_alpha=False,
      reward_noise_variance=reward_noise_variance,
      actor_policy_ctor=DummyActorPolicy)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([[5], [6]], dtype=tf.float32)

  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
  next_time_steps = ts.transition(next_observations, rewards, discounts)

  pred_td_targets = [7., 10.]

  self.evaluate(tf.compat.v1.global_variables_initializer())

  # Expected critic loss has a factor of 2, for the two TD3-style critics.
  expected_loss = self.evaluate(2 * tf.compat.v1.losses.mean_squared_error(
      tf.constant(td_targets), tf.constant(pred_td_targets)))

  loss = agent._critic_loss_with_optional_entropy_term(
      time_steps,
      actions,
      next_time_steps,
      td_errors_loss_fn=tf.math.squared_difference)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_ = self.evaluate(loss)
  self.assertAllClose(loss_, expected_loss)
def testTrainWithLagrange(self, use_lagrange_cql_alpha,
                          use_variable_for_cql_alpha, log_cql_alpha_clipping,
                          expected_cql_alpha_step_one,
                          expected_cql_alpha_step_two,
                          expected_cql_loss_step_one,
                          expected_cql_loss_step_two):
  if use_variable_for_cql_alpha:
    cql_alpha = tf.Variable(5.0)
    cql_alpha_var = cql_alpha  # Getting around type checking.
  else:
    cql_alpha = 5.0
  cql_alpha_learning_rate = 0.5
  cql_tau = 10
  num_cql_samples = 5

  actor_net = actor_distribution_network.ActorDistributionNetwork(
      self._obs_spec, self._action_spec, fc_layer_params=None)
  critic_net = critic_network.CriticNetwork(
      (self._obs_spec, self._action_spec),
      observation_fc_layer_params=(16,),
      action_fc_layer_params=(16,),
      joint_fc_layer_params=(16,),
      kernel_initializer='glorot_uniform',
      last_kernel_initializer='glorot_uniform')

  counter = common.create_variable('test_train_counter')
  optimizer_fn = tf.compat.v1.train.AdamOptimizer

  agent = cql_sac_agent.CqlSacAgent(
      self._time_step_spec,
      self._action_spec,
      critic_network=critic_net,
      actor_network=actor_net,
      actor_optimizer=optimizer_fn(1e-3),
      critic_optimizer=optimizer_fn(1e-3),
      alpha_optimizer=optimizer_fn(1e-3),
      cql_alpha=cql_alpha,
      num_cql_samples=num_cql_samples,
      include_critic_entropy_term=False,
      use_lagrange_cql_alpha=use_lagrange_cql_alpha,
      cql_alpha_learning_rate=cql_alpha_learning_rate,
      cql_tau=cql_tau,
      random_seed=self._random_seed,
      log_cql_alpha_clipping=log_cql_alpha_clipping,
      train_step_counter=counter)

  batch_size = 5
  observations = tf.constant(
      [[[1, 2], [3, 4]]] * batch_size, dtype=tf.float32)
  actions = tf.constant([[[0], [1]]] * batch_size, dtype=tf.float32)
  time_steps = ts.TimeStep(
      step_type=tf.constant([[1] * 2] * batch_size, dtype=tf.int32),
      reward=tf.constant([[1] * 2] * batch_size, dtype=tf.float32),
      discount=tf.constant([[1] * 2] * batch_size, dtype=tf.float32),
      observation=observations)

  experience = trajectory.Trajectory(time_steps.step_type, observations,
                                     actions, (), time_steps.step_type,
                                     time_steps.reward, time_steps.discount)

  # Force variable creation.
  agent.policy.variables()

  if not tf.executing_eagerly():
    # Get experience first to make sure optimizer variables are created and
    # can be initialized.
    experience = agent.train(experience)
    with self.cached_session() as sess:
      common.initialize_uninitialized_variables(sess)
      self.assertEqual(self.evaluate(counter), 0)
      self.evaluate(experience)
      self.assertEqual(self.evaluate(counter), 1)
  else:
    # Training step one.
    self.assertEqual(self.evaluate(counter), 0)
    loss = self.evaluate(agent.train(experience))
    self.assertEqual(self.evaluate(counter), 1)
    self.assertAllClose(loss.extra.cql_loss, expected_cql_loss_step_one)
    self.assertAllClose(loss.extra.cql_alpha, expected_cql_alpha_step_one)
    if use_lagrange_cql_alpha:
      self.assertGreater(loss.extra.cql_alpha_loss, 0)
    else:
      self.assertEqual(loss.extra.cql_alpha_loss, 0)

    # Training step two.
    if use_variable_for_cql_alpha:
      cql_alpha_var.assign_add(1)
    loss = self.evaluate(agent.train(experience))
    self.assertEqual(self.evaluate(counter), 2)
    self.assertAllClose(loss.extra.cql_loss, expected_cql_loss_step_two)
    # GPU (V100) needs slightly increased tolerances to pass.
    if tf.test.is_gpu_available():
      self.assertAllClose(
          loss.extra.cql_alpha,
          expected_cql_alpha_step_two,
          atol=4.5e-5,
          rtol=1.5e-5)
    else:
      self.assertAllClose(loss.extra.cql_alpha, expected_cql_alpha_step_two)
def testTrainWithRnn(self, cql_alpha, num_cql_samples,
                     include_critic_entropy_term, use_lagrange_cql_alpha,
                     expected_loss):
  actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
      self._obs_spec,
      self._action_spec,
      input_fc_layer_params=None,
      output_fc_layer_params=None,
      conv_layer_params=None,
      lstm_size=(40,),
  )

  critic_net = critic_rnn_network.CriticRnnNetwork(
      (self._obs_spec, self._action_spec),
      observation_fc_layer_params=(16,),
      action_fc_layer_params=(16,),
      joint_fc_layer_params=(16,),
      lstm_size=(16,),
      output_fc_layer_params=None,
  )

  counter = common.create_variable('test_train_counter')
  optimizer_fn = tf.compat.v1.train.AdamOptimizer

  agent = cql_sac_agent.CqlSacAgent(
      self._time_step_spec,
      self._action_spec,
      critic_network=critic_net,
      actor_network=actor_net,
      actor_optimizer=optimizer_fn(1e-3),
      critic_optimizer=optimizer_fn(1e-3),
      alpha_optimizer=optimizer_fn(1e-3),
      cql_alpha=cql_alpha,
      num_cql_samples=num_cql_samples,
      include_critic_entropy_term=include_critic_entropy_term,
      use_lagrange_cql_alpha=use_lagrange_cql_alpha,
      random_seed=self._random_seed,
      train_step_counter=counter,
  )

  batch_size = 5
  observations = tf.constant(
      [[[1, 2], [3, 4], [5, 6]]] * batch_size, dtype=tf.float32)
  actions = tf.constant([[[0], [1], [1]]] * batch_size, dtype=tf.float32)
  time_steps = ts.TimeStep(
      step_type=tf.constant([[1] * 3] * batch_size, dtype=tf.int32),
      reward=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
      discount=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
      observation=observations)

  experience = trajectory.Trajectory(time_steps.step_type, observations,
                                     actions, (), time_steps.step_type,
                                     time_steps.reward, time_steps.discount)

  # Force variable creation.
  agent.policy.variables()

  if not tf.executing_eagerly():
    # Get experience first to make sure optimizer variables are created and
    # can be initialized.
    experience = agent.train(experience)
    with self.cached_session() as sess:
      common.initialize_uninitialized_variables(sess)
      self.assertEqual(self.evaluate(counter), 0)
      self.evaluate(experience)
      self.assertEqual(self.evaluate(counter), 1)
  else:
    self.assertEqual(self.evaluate(counter), 0)
    loss = self.evaluate(agent.train(experience))
    self.assertAllClose(loss.loss, expected_loss)
    self.assertEqual(self.evaluate(counter), 1)
def train_eval(
    root_dir,
    dataset_path,
    env_name,
    # Training params
    tpu=False,
    use_gpu=False,
    num_gradient_updates=1000000,
    actor_fc_layers=(256, 256),
    critic_joint_fc_layers=(256, 256, 256),
    # Agent params
    batch_size=256,
    bc_steps=0,
    actor_learning_rate=3e-5,
    critic_learning_rate=3e-4,
    alpha_learning_rate=3e-4,
    reward_scale_factor=1.0,
    cql_alpha_learning_rate=3e-4,
    cql_alpha=5.0,
    cql_tau=10.0,
    num_cql_samples=10,
    reward_noise_variance=0.0,
    include_critic_entropy_term=False,
    use_lagrange_cql_alpha=True,
    log_cql_alpha_clipping=None,
    softmax_temperature=1.0,
    # Data params
    reward_shift=0.0,
    action_clipping=None,
    use_trajectories=False,
    data_shuffle_buffer_size_per_record=1,
    data_shuffle_buffer_size=100,
    data_num_shards=1,
    data_block_length=10,
    data_parallel_reads=None,
    data_parallel_calls=10,
    data_prefetch=10,
    data_cycle_length=10,
    # Others
    policy_save_interval=10000,
    eval_interval=10000,
    summary_interval=1000,
    learner_iterations_per_call=1,
    eval_episodes=10,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    seed=None):
  """Trains and evaluates CQL-SAC."""
  logging.info('Training CQL-SAC on: %s', env_name)
  tf.random.set_seed(seed)
  np.random.seed(seed)

  # Load environment.
  env = load_d4rl(env_name)
  tf_env = tf_py_environment.TFPyEnvironment(env)
  strategy = strategy_utils.get_strategy(tpu, use_gpu)

  if not dataset_path.endswith('.tfrecord'):
    dataset_path = os.path.join(dataset_path, env_name,
                                '%s*.tfrecord' % env_name)
  logging.info('Loading dataset from %s', dataset_path)
  dataset_paths = tf.io.gfile.glob(dataset_path)

  # Create dataset.
  with strategy.scope():
    dataset = create_tf_record_dataset(
        dataset_paths,
        batch_size,
        shuffle_buffer_size_per_record=data_shuffle_buffer_size_per_record,
        shuffle_buffer_size=data_shuffle_buffer_size,
        num_shards=data_num_shards,
        cycle_length=data_cycle_length,
        block_length=data_block_length,
        num_parallel_reads=data_parallel_reads,
        num_parallel_calls=data_parallel_calls,
        num_prefetch=data_prefetch,
        strategy=strategy,
        reward_shift=reward_shift,
        action_clipping=action_clipping,
        use_trajectories=use_trajectories)

  # Create agent.
  time_step_spec = tf_env.time_step_spec()
  observation_spec = time_step_spec.observation
  action_spec = tf_env.action_spec()

  with strategy.scope():
    train_step = train_utils.create_train_step()
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec,
        action_spec,
        fc_layer_params=actor_fc_layers,
        continuous_projection_net=tanh_normal_projection_network
        .TanhNormalProjectionNetwork)
    critic_net = critic_network.CriticNetwork(
        (observation_spec, action_spec),
        joint_fc_layer_params=critic_joint_fc_layers,
        kernel_initializer='glorot_uniform',
        last_kernel_initializer='glorot_uniform')

    agent = cql_sac_agent.CqlSacAgent(
        time_step_spec,
        action_spec,
        actor_network=actor_net,
        critic_network=critic_net,
        actor_optimizer=tf.keras.optimizers.Adam(
            learning_rate=actor_learning_rate),
        critic_optimizer=tf.keras.optimizers.Adam(
            learning_rate=critic_learning_rate),
        alpha_optimizer=tf.keras.optimizers.Adam(
            learning_rate=alpha_learning_rate),
        cql_alpha=cql_alpha,
        num_cql_samples=num_cql_samples,
        include_critic_entropy_term=include_critic_entropy_term,
        use_lagrange_cql_alpha=use_lagrange_cql_alpha,
        cql_alpha_learning_rate=cql_alpha_learning_rate,
        target_update_tau=5e-3,
        target_update_period=1,
        random_seed=seed,
        cql_tau=cql_tau,
        reward_noise_variance=reward_noise_variance,
        num_bc_steps=bc_steps,
        td_errors_loss_fn=tf.math.squared_difference,
        gamma=0.99,
        reward_scale_factor=reward_scale_factor,
        gradient_clipping=None,
        log_cql_alpha_clipping=log_cql_alpha_clipping,
        softmax_temperature=softmax_temperature,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step)
    agent.initialize()

  # Create learner.
  saved_model_dir = os.path.join(root_dir, learner.POLICY_SAVED_MODEL_DIR)
  collect_env_step_metric = py_metrics.EnvironmentSteps()
  learning_triggers = [
      triggers.PolicySavedModelTrigger(
          saved_model_dir,
          agent,
          train_step,
          interval=policy_save_interval,
          metadata_metrics={
              triggers.ENV_STEP_METADATA_KEY: collect_env_step_metric
          }),
      triggers.StepPerSecondLogTrigger(train_step, interval=100)
  ]
  cql_learner = learner.Learner(
      root_dir,
      train_step,
      agent,
      experience_dataset_fn=lambda: dataset,
      triggers=learning_triggers,
      summary_interval=summary_interval,
      strategy=strategy)

  # Create actor for evaluation.
  tf_greedy_policy = greedy_policy.GreedyPolicy(agent.policy)
  eval_greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(
      tf_greedy_policy, use_tf_function=True)
  eval_actor = actor.Actor(
      env,
      eval_greedy_policy,
      train_step,
      metrics=actor.eval_metrics(eval_episodes),
      summary_dir=os.path.join(root_dir, 'eval'),
      episodes_per_run=eval_episodes)

  # Run.
  dummy_trajectory = trajectory.mid((), (), (), 0., 1.)
  num_learner_iterations = int(num_gradient_updates /
                               learner_iterations_per_call)
  for _ in range(num_learner_iterations):
    # Mimic collecting environment steps since we loaded a static dataset.
    for _ in range(learner_iterations_per_call):
      collect_env_step_metric(dummy_trajectory)

    cql_learner.run(iterations=learner_iterations_per_call)

    if eval_interval and train_step.numpy() % eval_interval == 0:
      eval_actor.run_and_log()
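

# Illustrative usage sketch (not part of the original module): one way
# train_eval might be invoked directly for a quick local run. The root_dir,
# dataset_path, and env_name values below are placeholder assumptions; in the
# TF-Agents examples this function is normally driven by command-line flags.
if __name__ == '__main__':
  train_eval(
      root_dir='/tmp/cql_sac/hopper',  # Hypothetical output directory.
      dataset_path='/tmp/d4rl_dataset',  # Hypothetical TFRecord location.
      env_name='hopper-medium-v0',  # Example D4RL task name.
      num_gradient_updates=1000,
      eval_interval=500,
      eval_episodes=5,
      seed=0)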