def testBuild(self):
  batch_size = 3
  num_state_dims = 5
  num_actions = 2
  states = tf.random.uniform([batch_size, num_state_dims])
  network = q_network.QNetwork(
      input_tensor_spec=tensor_spec.TensorSpec([num_state_dims], tf.float32),
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1))
  q_values, _ = network(states)
  self.assertAllEqual(q_values.shape.as_list(), [batch_size, num_actions])
  self.assertEqual(len(network.trainable_weights), 6)
def testNetworkVariablesAreReused(self):
  batch_size = 3
  num_state_dims = 5
  states = tf.ones([batch_size, num_state_dims])
  next_states = tf.ones([batch_size, num_state_dims])
  network = q_network.QNetwork(
      input_tensor_spec=tensor_spec.TensorSpec([num_state_dims], tf.float32),
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1))
  q_values, _ = network(states)
  next_q_values, _ = network(next_states)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(q_values, next_q_values)
def testAddConvLayers(self):
  batch_size = 3
  num_state_dims = 5
  num_actions = 2
  states = tf.random.uniform([batch_size, 5, 5, num_state_dims])
  network = q_network.QNetwork(
      input_tensor_spec=tensor_spec.TensorSpec([5, 5, num_state_dims],
                                               tf.float32),
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
      conv_layer_params=((16, 3, 2),))
  q_values, _ = network(states)
  self.assertAllEqual(q_values.shape.as_list(), [batch_size, num_actions])
  self.assertEqual(len(network.trainable_variables), 8)
def build(self):
  # build environment
  self.train_py_env = suite_gym.load(self.env_name)
  self.eval_py_env = suite_gym.load(self.env_name)
  # we can change cartpole parameters here
  self.train_env = tf_py_environment.TFPyEnvironment(self.train_py_env)
  self.eval_env = tf_py_environment.TFPyEnvironment(self.eval_py_env)

  # build agent
  q_net = q_network.QNetwork(
      self.train_env.observation_spec(),
      self.train_env.action_spec(),
      fc_layer_params=self.fc_layer_params)
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate)
  train_step_counter = tf.Variable(0)
  self.agent = dqn_agent.DqnAgent(
      self.train_env.time_step_spec(),
      self.train_env.action_spec(),
      q_network=q_net,
      optimizer=optimizer,
      td_errors_loss_fn=common.element_wise_squared_loss,
      train_step_counter=train_step_counter)
  self.agent.initialize()

  # build policy
  self.random_policy = random_tf_policy.RandomTFPolicy(
      self.train_env.time_step_spec(), self.train_env.action_spec())

  # build replay buffer
  self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
      data_spec=self.agent.collect_data_spec,
      batch_size=self.train_env.batch_size,
      max_length=self.replay_buffer_max_length)

  # collect initial data
  self.collect_data(self.train_env, self.random_policy, self.replay_buffer,
                    self.initial_collect_steps)

  # build dataset
  self.dataset = self.replay_buffer.as_dataset(
      num_parallel_calls=3,
      sample_batch_size=self.batch_size,
      num_steps=2).prefetch(3)
  self.iterator = iter(self.dataset)
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    env = wheel_py_environment.WheelPyEnvironment(DELTA, MU_BASE, STD_BASE,
                                                  MU_HIGH, STD_HIGH, BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_reward,
        delta=DELTA,
        mu_inside=MU_BASE[0],
        mu_high=MU_HIGH)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_action,
        delta=DELTA)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
      network = q_network.QNetwork(
          input_tensor_spec=environment.time_step_spec().observation,
          action_spec=environment.action_spec(),
          fc_layer_params=LAYERS)
      agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON)

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
def create_agent(self):
  q_net = q_network.QNetwork(
      self.env_t.observation_spec(),
      self.env_t.action_spec(),
      fc_layer_params=self.settings['fc_layer'])
  optimizer = v1.train.AdamOptimizer(learning_rate=self.settings['lr'])
  train_step_counter = v1.Variable(0)
  self.agent = dqn_agent.DqnAgent(
      self.env_t.time_step_spec(),
      self.env_t.action_spec(),
      q_network=q_net,
      optimizer=optimizer,
      td_errors_loss_fn=common.element_wise_squared_loss,
      train_step_counter=train_step_counter)
  self.agent.initialize()
def testUpdateWithCompositeSavedModelAndCheckpoint(self):
  # Create a saved_model for a q_policy.
  network = q_network.QNetwork(
      input_tensor_spec=self._time_step_spec.observation,
      action_spec=self._action_spec)
  policy = q_policy.QPolicy(
      time_step_spec=self._time_step_spec,
      action_spec=self._action_spec,
      q_network=network)
  saver = policy_saver.PolicySaver(policy, batch_size=None)
  full_model_path = os.path.join(self.get_temp_dir(), 'save_model')

  def assert_val_equal_var(val, var):
    self.assertTrue(np.array_equal(np.full_like(var, val), var))

  self.evaluate(tf.compat.v1.global_variables_initializer())

  # Set all variables in the saved model to 1.
  variables = policy.variables()
  self.evaluate(
      tf.nest.map_structure(lambda v: v.assign(v * 0 + 1), variables))
  for v in self.evaluate(variables):
    assert_val_equal_var(1, v)
  saver.save(full_model_path)

  # Assign 2 to all variables in the policy, making the checkpoint different
  # from the initial saved_model.
  self.evaluate(
      tf.nest.map_structure(lambda v: v.assign(v * 0 + 2), variables))
  for v in self.evaluate(variables):
    assert_val_equal_var(2, v)
  checkpoint_path = os.path.join(self.get_temp_dir(), 'checkpoint')
  saver.save_checkpoint(checkpoint_path)

  # Reload the full model and check that all variables are 1.
  reloaded_policy = tf.compat.v2.saved_model.load(full_model_path)
  for v in self.evaluate(reloaded_policy.model_variables):
    assert_val_equal_var(1, v)

  # Compose a new full saved model from the original saved model files
  # and variables from the checkpoint.
  composite_path = os.path.join(self.get_temp_dir(), 'composite_model')
  self.copy_tree(full_model_path, composite_path, skip_variables=True)
  self.copy_tree(checkpoint_path, os.path.join(composite_path))

  # Reload the composite model and check that all variables are 2.
  reloaded_policy = tf.compat.v2.saved_model.load(composite_path)
  for v in self.evaluate(reloaded_policy.model_variables):
    assert_val_equal_var(2, v)
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    action_reward_fns = (
        environment_utilities.sliding_linear_reward_fn_generator(
            CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)

    q_net = q_network.QNetwork(
        environment.observation_spec(),
        environment.action_spec(),
        fc_layer_params=(50, 50))

    agent = dqn_agent.DqnAgent(
        environment.time_step_spec(),
        environment.action_spec(),
        q_network=q_net,
        epsilon_greedy=0.1,
        target_update_tau=0.05,
        target_update_period=5,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-2),
        td_errors_loss_fn=common.element_wise_squared_loss)

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
def create_agent(self):
  q_net = q_network.QNetwork(
      self.env.observation_spec(),
      self.env.action_spec(),
      fc_layer_params=self.fc_layer_params)
  self.tf_agent = dqn_agent.DqnAgent(
      self.env.time_step_spec(),
      self.env.action_spec(),
      q_network=q_net,
      optimizer=self.optimizer,
      td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
      train_step_counter=self.train_step_counter,
      gamma=self.gamma)
  self.init_steps = 0
  self.episode_steps = 0
def get_agent(train_env):
  fc_layer_params = (100,)
  q_net = q_network.QNetwork(
      train_env.observation_spec(),
      train_env.action_spec(),
      fc_layer_params=fc_layer_params)
  train_step_counter = tf.Variable(0)
  # Note: `optimizer` is expected to be defined in the enclosing scope.
  return dqn_agent.DqnAgent(
      train_env.time_step_spec(),
      train_env.action_spec(),
      q_network=q_net,
      optimizer=optimizer,
      td_errors_loss_fn=common.element_wise_squared_loss,
      train_step_counter=train_step_counter)
def __init__(self,
             input_tensor_spec,
             action_spec,
             mask_q_value=-100000,
             fc_layer_params=fc_layer_params,
             activation_fn=tf.keras.activations.relu,
             name='MaskedQNetwork'):
  super(MaskedQNetwork, self).__init__(input_tensor_spec, action_spec, name=name)
  self._q_net = q_network.QNetwork(
      input_tensor_spec['state'],
      action_spec,
      fc_layer_params=fc_layer_params,
      activation_fn=activation_fn)
  # self._q_net = q_network.QNetwork(input_tensor_spec, action_spec,
  #                                  fc_layer_params=fc_layer_params,
  #                                  activation_fn=activation_fn)
  self._mask_q_value = mask_q_value
def train(num_iterations):
  train_env = tf_py_environment.TFPyEnvironment(Cliff())
  test_env = tf_py_environment.TFPyEnvironment(Cliff())
  counter = tf.Variable(0)

  # Build network
  network = q_network.QNetwork(
      train_env.observation_spec(),
      train_env.action_spec(),
      fc_layer_params=(100,))

  agent = dqn_agent.DqnAgent(
      train_env.time_step_spec(),
      train_env.action_spec(),
      q_network=network,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
      td_errors_loss_fn=common.element_wise_squared_loss,
      train_step_counter=counter)
  agent.initialize()
  agent.train = common.function(agent.train)
  agent.train_step_counter.assign(0)

  buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
      data_spec=agent.collect_data_spec,
      batch_size=train_env.batch_size,
      max_length=100)
  dataset = buffer.as_dataset(sample_batch_size=32, num_steps=2)
  iterator = iter(dataset)

  first_reward = compute_average_reward(train_env, agent.policy, num_episodes=10)
  print(f'Before training: {first_reward}')
  rewards = [first_reward]

  for _ in range(num_iterations):
    for _ in range(2):
      collect_steps(train_env, agent.collect_policy, buffer)
    experience, info = next(iterator)
    loss = agent.train(experience).loss
    step_number = agent.train_step_counter.numpy()
    if step_number % 10 == 0:
      print(f'step={step_number}: loss={loss}')
    if step_number % 20 == 0:
      average_reward = compute_average_reward(test_env, agent.policy, 1)
      print(f'step={step_number}: reward={average_reward}')
def testNumericFeatureColumnInput(self):
  key = 'feature_key'
  batch_size = 3
  state_dims = 5
  column = tf.feature_column.numeric_column(key, [state_dims])
  state = {key: tf.ones([batch_size, state_dims], tf.int32)}
  state_spec = {key: tensor_spec.TensorSpec([state_dims], tf.int32)}

  online_network = q_network.QNetwork(
      input_tensor_spec=state_spec,
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
      preprocessing_combiner=tf.keras.layers.DenseFeatures([column]))
  target_network = online_network.copy(name='TargetNetwork')

  q_online, _ = online_network(state)
  q_target, _ = target_network(state)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(q_online, q_target, rtol=1.0, atol=1.0)
def __init__(self, env):
  self.env = env
  self.input_shape = (96, 96)

  # Pretrained MobileNetV2 used as a frozen feature extractor.
  self.extractor = keras.applications.MobileNetV2(
      input_shape=(self.input_shape + (3,)),
      include_top=False,
      weights='imagenet')
  self.extractor.trainable = False

  self.net = q_network.QNetwork(
      self.env.observation_spec(),
      self.env.action_spec(),
      preprocessing_layers=self.extractor,
      fc_layer_params=(64,))

  self.optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
def testCheckpointSave(self):
  network = q_network.QNetwork(
      input_tensor_spec=self._time_step_spec.observation,
      action_spec=self._action_spec)
  policy = q_policy.QPolicy(
      time_step_spec=self._time_step_spec,
      action_spec=self._action_spec,
      q_network=network)
  saver = policy_saver.PolicySaver(policy, batch_size=None)

  path = os.path.join(self.get_temp_dir(), 'save_model')
  self.evaluate(tf.compat.v1.global_variables_initializer())
  saver.save(path)

  checkpoint_path = os.path.join(self.get_temp_dir(), 'checkpoint')
  saver.save_checkpoint(checkpoint_path)
  self.assertTrue(tf.compat.v2.io.gfile.exists(checkpoint_path))
def testMasking(self):
  batch_size = 3
  num_state_dims = 5
  num_actions = 6
  states = tf.random.uniform([batch_size, num_state_dims])
  input_tensor_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
  action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, num_actions - 1)

  mask = tf.constant([[1, 0, 1, 0, 0, 1] for _ in range(batch_size)])
  network = q_network.QNetwork(
      input_tensor_spec,
      action_spec,
      mask_split_fn=lambda observation: (observation, mask))
  self.assertIsNotNone(network.mask_split_fn)

  # Run a pass through the network to catch any shape errors.
  network(states)
def testTrainStepNotSaved(self):
  network = q_network.QNetwork(
      input_tensor_spec=self._time_step_spec.observation,
      action_spec=self._action_spec)
  policy = q_policy.QPolicy(
      time_step_spec=self._time_step_spec,
      action_spec=self._action_spec,
      q_network=network)
  saver = policy_saver.PolicySaver(policy, batch_size=None)
  path = os.path.join(self.get_temp_dir(), 'save_model')
  saver.save(path)

  reloaded = tf.compat.v2.saved_model.load(path)
  self.assertIn('get_train_step', reloaded.signatures)
  train_step_value = self.evaluate(reloaded.get_train_step())
  self.assertEqual(-1, train_step_value)
def __init__(self, *args, **kwargs):
  self.env = tf_py_environment.TFPyEnvironment(CardGameEnv())
  self.q_net = q_network.QNetwork(
      self.env.observation_spec(), self.env.action_spec())
  self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
  self.train_step_counter = tf.Variable(0)

  self.agent = dqn_agent.DqnAgent(
      self.env.time_step_spec(),
      self.env.action_spec(),
      q_network=self.q_net,
      optimizer=self.optimizer,
      td_errors_loss_fn=common.element_wise_squared_loss,
      train_step_counter=self.train_step_counter)

  self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
      data_spec=self.agent.collect_data_spec,
      batch_size=self.env.batch_size,
      max_length=100000)

  self.num_iterations = 10000
def initialize_agent(self):
  """Instantiate the TF agent with hparams."""
  # Q network
  self.q_net = q_network.QNetwork(
      self.train_env.observation_spec(),
      self.train_env.action_spec(),
      fc_layer_params=self.qnet_fc_hidden_size)

  # DQN agent
  self.agent = dqn_agent.DqnAgent(
      self.train_env.time_step_spec(),
      self.train_env.action_spec(),
      q_network=self.q_net,
      optimizer=self.optimizer,
      epsilon_greedy=0.1,  # [TODO] - add the hyper param
      td_errors_loss_fn=common.element_wise_squared_loss,
      train_step_counter=self.train_step_counter)
  self.agent.initialize()
  self.policy = self.agent.policy
def generic_dqn_agent(env: TFPyEnvironment) -> (dqn_agent.DqnAgent,
                                                q_network.QNetwork):
  """Returns a generic DQN agent.

  Args:
    env (TFPyEnvironment): The environment the agent will live in.

  Returns:
    dqn_agent.DqnAgent: The agent to train.
    q_network.QNetwork: The network used in the agent.
  """
  inp = env.observation_spec().shape[0]
  q_net = q_network.QNetwork(
      env.observation_spec(),
      env.action_spec(),
      fc_layer_params=(20, 20, 20, 20, 20),
      activation_fn=tf.keras.activations.relu)
  optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

  agent = dqn_agent.DqnAgent(
      env.time_step_spec(),
      env.action_spec(),
      q_network=q_net,
      optimizer=optimizer,
      td_errors_loss_fn=common.element_wise_squared_loss,
      train_step_counter=tf.Variable(0),
      epsilon_greedy=0.1)

  # def observation_and_action_constraint_splitter(observation):
  #   action_mask = [1, 1]
  #   if observation[0][-1] > 5:
  #     action_mask[0] = 1
  #   return observation, tf.convert_to_tensor(action_mask, dtype=np.int32)
  #
  # agent.policy._observation_and_action_constraint_splitter = (
  #     observation_and_action_constraint_splitter)

  # tf_agents.policies.greedy_policy.GreedyPolicy
  agent.initialize()
  return agent, q_net
def testAgentWithDifferentSubagentsUpdate(self):
  num_actions = 3
  context_dim = 2
  batch_size = 7
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)

  agent1 = lin_ucb_agent.LinearUCBAgent(
      time_step_spec,
      action_spec,
      emit_policy_info=(policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,))
  reward_net = q_network.QNetwork(
      input_tensor_spec=observation_spec,
      action_spec=action_spec,
      fc_layer_params=(4, 3, 2))
  agent2 = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
      time_step_spec,
      action_spec,
      reward_network=reward_net,
      emit_policy_info=(policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,),
      optimizer=tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1),
      epsilon=0.1)
  agents = [agent1, agent2]
  mixture_agent = static_mixture_agent.StaticMixtureAgent([1, 1], agents)

  initial_step, final_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = _get_action_step(action, 2, num_actions)
  experience = _get_experience(initial_step, action_step, final_step)

  for agent in agents:
    self.evaluate(agent.initialize())
  self.evaluate(tf.compat.v1.initialize_all_variables())
  self.evaluate(mixture_agent.initialize())
  loss_info = mixture_agent.train(experience)
  self.evaluate(loss_info)
def testIndicatorFeatureColumnInput(self):
  key = 'feature_key'
  vocab_list = [2, 3, 4]
  column = tf.feature_column.categorical_column_with_vocabulary_list(
      key, vocab_list)
  column = tf.feature_column.indicator_column(column)

  feature_tensor = tf.convert_to_tensor([3, 2, 2, 4, 3])
  state = {key: tf.expand_dims(feature_tensor, -1)}
  state_spec = {key: tensor_spec.TensorSpec([1], tf.int32)}

  online_network = q_network.QNetwork(
      input_tensor_spec=state_spec,
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
      preprocessing_combiner=tf.keras.layers.DenseFeatures([column]))
  target_network = online_network.copy(name='TargetNetwork')

  q_online, _ = online_network(state)
  q_target, _ = target_network(state)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(tf.compat.v1.initializers.tables_initializer())
  self.assertAllClose(q_online, q_target, rtol=1.0, atol=1.0)
def testTrainStepSaved(self):
  # We need to use one default session so that self.evaluate and the
  # SavedModel loader share the same session.
  with tf.compat.v1.Session().as_default():
    network = q_network.QNetwork(
        input_tensor_spec=self._time_step_spec.observation,
        action_spec=self._action_spec)
    policy = q_policy.QPolicy(
        time_step_spec=self._time_step_spec,
        action_spec=self._action_spec,
        q_network=network)
    self.evaluate(tf.compat.v1.initializers.variables(policy.variables()))

    train_step = common.create_variable('train_step', initial_value=7)
    self.evaluate(tf.compat.v1.initializers.variables([train_step]))

    saver = policy_saver.PolicySaver(
        policy, batch_size=None, train_step=train_step)
    if tf.executing_eagerly():
      step = saver.get_train_step()
    else:
      step = self.evaluate(saver.get_train_step())
    self.assertEqual(7, step)
    path = os.path.join(self.get_temp_dir(), 'save_model')
    saver.save(path)

    reloaded = tf.compat.v2.saved_model.load(path)
    self.assertIn('get_train_step', reloaded.signatures)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    train_step_value = self.evaluate(reloaded.get_train_step())
    self.assertEqual(7, train_step_value)

    train_step = train_step.assign_add(3)
    self.evaluate(train_step)
    saver.save(path)

    reloaded = tf.compat.v2.saved_model.load(path)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    train_step_value = self.evaluate(reloaded.get_train_step())
    self.assertEqual(10, train_step_value)
def testTrainWithNN(self, is_convert, is_distribution_network):
  # Hard code a trajectory shaped (time=6, batch=1, ...).
  traj, time_step_spec, action_spec = create_arbitrary_trajectory()

  if is_convert:
    # Convert to single step trajectory of shapes (batch=6, 1, ...).
    traj = tf.nest.map_structure(common.transpose_batch_time, traj)

  if is_distribution_network:
    cloning_net = sequential.Sequential([
        expand_dims_layer.ExpandDims(-1),
        tf.keras.layers.Dense(action_spec.maximum - action_spec.minimum + 1),
        tf.keras.layers.Lambda(
            lambda t: tfp.distributions.Categorical(logits=t)),
    ])
  else:
    cloning_net = q_network.QNetwork(time_step_spec.observation, action_spec)

  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      time_step_spec,
      action_spec,
      cloning_network=cloning_net,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001),
      num_outer_dims=2)
  # Disable clipping to make sure we can see the difference in behavior.
  agent.policy._clip = False
  # TODO(b/123883319)
  if tf.executing_eagerly():
    train_and_loss = lambda: agent.train(traj)
  else:
    train_and_loss = agent.train(traj)
  self.evaluate(tf.compat.v1.global_variables_initializer())

  initial_loss = self.evaluate(train_and_loss).loss
  for _ in range(TRAIN_ITERATIONS - 1):
    loss = self.evaluate(train_and_loss).loss

  # We don't necessarily converge to the same actions as in the trajectory
  # after 10 steps of an untuned optimizer, but the loss should go down.
  self.assertGreater(initial_loss, loss)
def __init__(self, environment):
  self.preprocessing_layers = {
      'history': tf.keras.models.Sequential([
          tf.keras.layers.Embedding(4, 16, input_length=1 * 24 * 60 // 15),
          tf.keras.layers.LSTM(32),
          tf.keras.layers.Flatten(),
          tf.keras.layers.Dense(50, activation='relu')
      ]),
      'boiler_state': tf.keras.layers.Dense(1, activation='relu'),
      'usage_state': tf.keras.layers.Dense(1, activation='relu'),
      'water_temperature': tf.keras.layers.Dense(1, activation='relu'),
  }
  self.preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1)

  self.q_net = q_network.QNetwork(
      environment.observation_spec(),
      environment.action_spec(),
      fc_layer_params=fc_layer_params,
      preprocessing_layers=self.preprocessing_layers,
      preprocessing_combiner=self.preprocessing_combiner)
def testTrainStepNotSaved(self):
  if not common.has_eager_been_enabled():
    self.skipTest('Only supported in TF2.x. Step is required in TF1.x')

  network = q_network.QNetwork(
      input_tensor_spec=self._time_step_spec.observation,
      action_spec=self._action_spec)
  policy = q_policy.QPolicy(
      time_step_spec=self._time_step_spec,
      action_spec=self._action_spec,
      q_network=network)
  saver = policy_saver.PolicySaver(policy, batch_size=None)
  path = os.path.join(self.get_temp_dir(), 'save_model')
  saver.save(path)

  reloaded = tf.compat.v2.saved_model.load(path)
  self.assertIn('get_train_step', reloaded.signatures)
  train_step_value = self.evaluate(reloaded.get_train_step())
  self.assertEqual(-1, train_step_value)
def testUniqueSignatures(self):
  network = q_network.QNetwork(
      input_tensor_spec=self._time_step_spec.observation,
      action_spec=self._action_spec)
  policy = q_policy.QPolicy(
      time_step_spec=self._time_step_spec,
      action_spec=self._action_spec,
      q_network=network)
  saver = policy_saver.PolicySaver(policy, batch_size=None)

  action_signature_names = [
      s.name for s in saver._signatures['action'].input_signature
  ]
  self.assertAllEqual(
      ['0/step_type', '0/reward', '0/discount', '0/observation'],
      action_signature_names)
  initial_state_signature_names = [
      s.name for s in saver._signatures['get_initial_state'].input_signature
  ]
  self.assertAllEqual(['batch_size'], initial_state_signature_names)
def testTrainStepNotSaved(self):
  if not tf.executing_eagerly():
    self.skipTest('b/129079730: PolicySaver does not work in TF1.x yet')

  network = q_network.QNetwork(
      input_tensor_spec=self._time_step_spec.observation,
      action_spec=self._action_spec)
  policy = q_policy.QPolicy(
      time_step_spec=self._time_step_spec,
      action_spec=self._action_spec,
      q_network=network)
  saver = policy_saver.PolicySaver(policy, batch_size=None)
  path = os.path.join(self.get_temp_dir(), 'save_model')
  saver.save(path)

  reloaded = tf.compat.v2.saved_model.load(path)
  self.assertIn('get_train_step', reloaded.signatures)
  train_step_value = self.evaluate(reloaded.train_step())
  self.assertEqual(-1, train_step_value)
def __init__(self,
             train_environment,
             eval_environment,
             replay_buffer_capacity=1000,
             fc_layer_params=(100,),
             learning_rate=1e-3):
  # Use TF environment wrappers to translate the Python environments for TF.
  self.train_env = tf_py_environment.TFPyEnvironment(train_environment)
  self.eval_env = tf_py_environment.TFPyEnvironment(eval_environment)

  # Define the Q-network.
  q_net = q_network.QNetwork(
      self.train_env.observation_spec(),
      self.train_env.action_spec(),
      fc_layer_params=fc_layer_params)
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
  train_step_counter = tf.compat.v2.Variable(0)

  # Define the agent.
  self.agent = dqn_agent.DqnAgent(
      self.train_env.time_step_spec(),
      self.train_env.action_spec(),
      q_network=q_net,
      optimizer=optimizer,
      td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
      train_step_counter=train_step_counter)
  self.agent.initialize()

  self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
      data_spec=self.agent.collect_data_spec,
      batch_size=self.train_env.batch_size,
      max_length=replay_buffer_capacity)

  self.eval_policy = self.agent.policy
  self.collect_policy = self.agent.collect_policy
  self.random_policy = random_tf_policy.RandomTFPolicy(
      self.train_env.time_step_spec(), self.train_env.action_spec())
def testAddPreprocessingLayers(self):
  batch_size = 3
  num_actions = 2
  states = (tf.random.uniform([batch_size, 1]),
            tf.random.uniform([batch_size]))
  preprocessing_layers = (
      tf.keras.layers.Dense(4),
      tf.keras.Sequential([
          tf.keras.layers.Reshape((1,)),
          tf.keras.layers.Dense(4)
      ]))
  network = q_network.QNetwork(
      input_tensor_spec=(tensor_spec.TensorSpec([1], tf.float32),
                         tensor_spec.TensorSpec([], tf.float32)),
      preprocessing_layers=preprocessing_layers,
      preprocessing_combiner=tf.keras.layers.Add(),
      action_spec=tensor_spec.BoundedTensorSpec(
          [1], tf.int32, 0, num_actions - 1))
  q_values, _ = network(states)
  self.assertAllEqual(q_values.shape.as_list(), [batch_size, num_actions])
  # At least 2 variables each for the preprocessing layers.
  self.assertGreater(len(network.trainable_variables), 4)