Example #1
 def testBuild(self):
     batch_size = 3
     num_state_dims = 5
     num_actions = 2
     states = tf.random.uniform([batch_size, num_state_dims])
     network = q_network.QNetwork(
         input_tensor_spec=tensor_spec.TensorSpec([num_state_dims],
                                                  tf.float32),
         action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1))
     q_values, _ = network(states)
     self.assertAllEqual(q_values.shape.as_list(),
                         [batch_size, num_actions])
     self.assertEqual(len(network.trainable_weights), 6)
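The assertions above pin down the QNetwork contract: calling the network returns a (q_values, network_state) tuple, with q_values shaped [batch_size, num_actions]. Below is a minimal sketch, reusing the specs from the test, of turning those Q-values into greedy actions; the snippet is illustrative and not part of the original example.

import tensorflow as tf
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec

# Same specs as the test: 5-dimensional observations, 2 discrete actions.
observation_spec = tensor_spec.TensorSpec([5], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)

network = q_network.QNetwork(input_tensor_spec=observation_spec,
                             action_spec=action_spec)

states = tf.random.uniform([3, 5])
q_values, _ = network(states)                  # shape [3, 2]
greedy_actions = tf.argmax(q_values, axis=-1)  # shape [3]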
Example #2
 def testNetworkVariablesAreReused(self):
     batch_size = 3
     num_state_dims = 5
     states = tf.ones([batch_size, num_state_dims])
     next_states = tf.ones([batch_size, num_state_dims])
     network = q_network.QNetwork(
         input_tensor_spec=tensor_spec.TensorSpec([num_state_dims],
                                                  tf.float32),
         action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1))
     q_values, _ = network(states)
     next_q_values, _ = network(next_states)
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertAllClose(q_values, next_q_values)
Example #3
 def testAddConvLayers(self):
   batch_size = 3
   num_state_dims = 5
   num_actions = 2
   states = tf.random.uniform([batch_size, 5, 5, num_state_dims])
   network = q_network.QNetwork(
       input_tensor_spec=tensor_spec.TensorSpec([5, 5, num_state_dims],
                                                tf.float32),
       action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
       conv_layer_params=((16, 3, 2),))
   q_values, _ = network(states)
   self.assertAllEqual(q_values.shape.as_list(), [batch_size, num_actions])
   self.assertEqual(len(network.trainable_variables), 8)
Example #4
    def build(self):

        # build environment
        self.train_py_env = suite_gym.load(self.env_name)
        self.eval_py_env = suite_gym.load(self.env_name)

        # we can change CartPole parameters here

        self.train_env = tf_py_environment.TFPyEnvironment(self.train_py_env)
        self.eval_env = tf_py_environment.TFPyEnvironment(self.eval_py_env)

        # build agent
        q_net = q_network.QNetwork(
            self.train_env.observation_spec(),
            self.train_env.action_spec(),
            fc_layer_params=self.fc_layer_params)

        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate)
        train_step_counter = tf.Variable(0)

        self.agent = dqn_agent.DqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=train_step_counter)

        self.agent.initialize()


        # build policy
        self.random_policy = random_tf_policy.RandomTFPolicy(self.train_env.time_step_spec(),self.train_env.action_spec())

        # build replay buffer

        self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=self.replay_buffer_max_length)

        # build collect
        self.collect_data(self.train_env, self.random_policy, self.replay_buffer, self.initial_collect_steps)

        # build dataset
        self.dataset = self.replay_buffer.as_dataset(
            num_parallel_calls=3, 
            sample_batch_size=self.batch_size, 
            num_steps=2).prefetch(3)

        self.iterator = iter(self.dataset)
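build() stops once the dataset iterator exists. A hedged sketch of the training method that would typically consume it follows; self.num_iterations and self.collect_steps_per_iteration are assumed attributes that do not appear in the original snippet, and collect_data is the same helper already called in build().

    def train(self):
        # Wrap train in a tf.function and reset the step counter.
        self.agent.train = common.function(self.agent.train)
        self.agent.train_step_counter.assign(0)

        for _ in range(self.num_iterations):
            # Collect transitions with the collect policy into the buffer
            # (reuses the collect_data helper called in build()).
            self.collect_data(self.train_env, self.agent.collect_policy,
                              self.replay_buffer,
                              self.collect_steps_per_iteration)

            # Sample a mini-batch of experience and take one gradient step.
            experience, _ = next(self.iterator)
            train_loss = self.agent.train(experience).loss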
Example #5
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    env = wheel_py_environment.WheelPyEnvironment(DELTA, MU_BASE, STD_BASE,
                                                  MU_HIGH, STD_HIGH, BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_reward,
        delta=DELTA,
        mu_inside=MU_BASE[0],
        mu_high=MU_HIGH)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_action,
        delta=DELTA)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
      network = q_network.QNetwork(
          input_tensor_spec=environment.time_step_spec().observation,
          action_spec=environment.action_spec(),
          fc_layer_params=LAYERS)
      agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON)

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
Example #6
File: dqnnet.py  Project: ssghost/MyDqnnet
 def create_agent(self):
     q_net = q_network.QNetwork(self.env_t.observation_spec(),
                                self.env_t.action_spec(),
                                fc_layer_params=self.settings['fc_layer'])
     optimizer = v1.train.AdamOptimizer(learning_rate=self.settings['lr'])
     train_step_counter = v1.Variable(0)
     self.agent = dqn_agent.DqnAgent(
         self.env_t.time_step_spec(),
         self.env_t.action_spec(),
         q_network=q_net,
         optimizer=optimizer,
         td_errors_loss_fn=common.element_wise_squared_loss,
         train_step_counter=train_step_counter)
     self.agent.initialize()
Example #7
    def testUpdateWithCompositeSavedModelAndCheckpoint(self):
        # Create and save a saved_model for a q_policy.
        network = q_network.QNetwork(
            input_tensor_spec=self._time_step_spec.observation,
            action_spec=self._action_spec)

        policy = q_policy.QPolicy(time_step_spec=self._time_step_spec,
                                  action_spec=self._action_spec,
                                  q_network=network)

        saver = policy_saver.PolicySaver(policy, batch_size=None)
        full_model_path = os.path.join(self.get_temp_dir(), 'save_model')

        def assert_val_equal_var(val, var):
            self.assertTrue(np.array_equal(np.full_like(var, val), var))

        self.evaluate(tf.compat.v1.global_variables_initializer())
        # Set all variables in the saved model to 1
        variables = policy.variables()
        self.evaluate(
            tf.nest.map_structure(lambda v: v.assign(v * 0 + 1), variables))
        for v in self.evaluate(variables):
            assert_val_equal_var(1, v)
        saver.save(full_model_path)

        # Assign 2 to all variables in the policy, making the checkpoint
        # different from the initial saved_model.
        self.evaluate(
            tf.nest.map_structure(lambda v: v.assign(v * 0 + 2), variables))
        for v in self.evaluate(variables):
            assert_val_equal_var(2, v)
        checkpoint_path = os.path.join(self.get_temp_dir(), 'checkpoint')
        saver.save_checkpoint(checkpoint_path)

        # Reload the full model and check all variables are 1
        reloaded_policy = tf.compat.v2.saved_model.load(full_model_path)
        for v in self.evaluate(reloaded_policy.model_variables):
            assert_val_equal_var(1, v)

        # Compose a new full saved model from the original saved model files
        # and variables from the checkpoint.
        composite_path = os.path.join(self.get_temp_dir(), 'composite_model')
        self.copy_tree(full_model_path, composite_path, skip_variables=True)
        self.copy_tree(checkpoint_path, os.path.join(composite_path))

        # Reload the composite model and check all variables are 2
        reloaded_policy = tf.compat.v2.saved_model.load(composite_path)
        for v in self.evaluate(reloaded_policy.model_variables):
            assert_val_equal_var(2, v)
Example #8
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994
        action_reward_fns = (
            environment_utilities.sliding_linear_reward_fn_generator(
                CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

        env = sspe.StationaryStochasticPyEnvironment(functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
                                                     action_reward_fns,
                                                     batch_size=BATCH_SIZE)
        environment = tf_py_environment.TFPyEnvironment(env)

        optimal_reward_fn = functools.partial(
            environment_utilities.tf_compute_optimal_reward,
            per_action_reward_fns=action_reward_fns)

        optimal_action_fn = functools.partial(
            environment_utilities.tf_compute_optimal_action,
            per_action_reward_fns=action_reward_fns)

        q_net = q_network.QNetwork(environment.observation_spec(),
                                   environment.action_spec(),
                                   fc_layer_params=(50, 50))

        agent = dqn_agent.DqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            q_network=q_net,
            epsilon_greedy=0.1,
            target_update_tau=0.05,
            target_update_period=5,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-2),
            td_errors_loss_fn=common.element_wise_squared_loss)

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
Example #9
    def create_agent(self):
        q_net = q_network.QNetwork(self.env.observation_spec(),
                                   self.env.action_spec(),
                                   fc_layer_params=self.fc_layer_params)
        self.tf_agent = dqn_agent.DqnAgent(
            self.env.time_step_spec(),
            self.env.action_spec(),
            q_network=q_net,
            optimizer=self.optimizer,
            td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
            train_step_counter=self.train_step_counter,
            gamma=self.gamma)

        self.init_steps = 0
        self.episode_steps = 0
Example #10
def get_agent(train_env):
    fc_layer_params = (100, )

    q_net = q_network.QNetwork(train_env.observation_spec(),
                               train_env.action_spec(),
                               fc_layer_params=fc_layer_params)
    train_step_counter = tf.Variable(0)

    return dqn_agent.DqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=train_step_counter)
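get_agent relies on an optimizer that is not defined inside the function, so it has to come from the enclosing module. A minimal usage sketch under that assumption follows; the 'CartPole-v0' environment name is illustrative and not taken from the original snippet.

import tensorflow as tf
from tf_agents.environments import suite_gym, tf_py_environment

# Assumed module-level optimizer referenced by get_agent above.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)

train_env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))

agent = get_agent(train_env)
agent.initialize()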
Example #11
  def __init__(self,
               input_tensor_spec,
               action_spec,
               mask_q_value=-100000,
               fc_layer_params=fc_layer_params,
               activation_fn=tf.keras.activations.relu,
               name='MaskedQNetwork'):

      super(MaskedQNetwork, self).__init__(input_tensor_spec, action_spec, name=name)

      self._q_net = q_network.QNetwork(input_tensor_spec['state'], action_spec, fc_layer_params=fc_layer_params,
                                      activation_fn=activation_fn)
      # self._q_net = q_network.QNetwork(input_tensor_spec, action_spec, fc_layer_params=fc_layer_params,
      #                                 activation_fn=activation_fn)
      self._mask_q_value = mask_q_value
Example #12
def train(num_iterations):
    train_env = tf_py_environment.TFPyEnvironment(Cliff())
    test_env = tf_py_environment.TFPyEnvironment(Cliff())
    counter = tf.Variable(0)

    # Build network
    network = q_network.QNetwork(train_env.observation_spec(),
                                 train_env.action_spec(),
                                 fc_layer_params=(100, ))
    agent = dqn_agent.DqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        q_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=counter)

    agent.initialize()
    agent.train = common.function(agent.train)
    agent.train_step_counter.assign(0)

    buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=100)
    dataset = buffer.as_dataset(sample_batch_size=32, num_steps=2)
    iterator = iter(dataset)
    first_reward = compute_average_reward(train_env,
                                          agent.policy,
                                          num_episodes=10)
    print(f'Before training: {first_reward}')
    rewards = [first_reward]

    for _ in range(num_iterations):
        for _ in range(2):
            collect_steps(train_env, agent.collect_policy, buffer)

        experience, info = next(iterator)
        loss = agent.train(experience).loss
        step_number = agent.train_step_counter.numpy()

        if step_number % 10 == 0:
            print(f'step={step_number}: loss={loss}')

        if step_number % 20 == 0:
            average_reward = compute_average_reward(test_env, agent.policy, 1)
            print(f'step={step_number}: Reward:={average_reward}')
Example #13
    def testNumericFeatureColumnInput(self):
        key = 'feature_key'
        batch_size = 3
        state_dims = 5
        column = tf.feature_column.numeric_column(key, [state_dims])
        state = {key: tf.ones([batch_size, state_dims], tf.int32)}
        state_spec = {key: tensor_spec.TensorSpec([state_dims], tf.int32)}

        online_network = q_network.QNetwork(
            input_tensor_spec=state_spec,
            action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
            preprocessing_combiner=tf.keras.layers.DenseFeatures([column]))
        target_network = online_network.copy(name='TargetNetwork')
        q_online, _ = online_network(state)
        q_target, _ = target_network(state)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllClose(q_online, q_target, rtol=1.0, atol=1.0)
Example #14
 def __init__(self, env):
     self.env = env
     self.input_shape = (96, 96)
     self.extractor = keras.applications.MobileNetV2(
         input_shape=(self.input_shape+(3,)),
         include_top=False,
         weights='imagenet'
     )
     self.extractor.trainable = False
     self.net = q_network.QNetwork(
         self.env.observation_spec(),
         self.env.action_spec(),
         preprocessing_layers=self.extractor,
         fc_layer_params=(64,)
     )
     self.optimizer = tf.compat.v1.train.AdamOptimizer(
         learning_rate=1e-3)
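Passing a single Keras model as preprocessing_layers implies a single-tensor observation whose shape matches the extractor's input. A hedged sketch of the kind of observation spec this assumes; the shape and name are illustrative, since the original environment's spec is not shown.

import numpy as np
from tf_agents.specs import array_spec

# Hypothetical observation spec compatible with the MobileNetV2 extractor:
# 96x96 RGB frames scaled to [0, 1] float32.
observation_spec = array_spec.BoundedArraySpec(
    shape=(96, 96, 3), dtype=np.float32, minimum=0.0, maximum=1.0,
    name='observation')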
Example #15
    def testCheckpointSave(self):
        network = q_network.QNetwork(
            input_tensor_spec=self._time_step_spec.observation,
            action_spec=self._action_spec)

        policy = q_policy.QPolicy(time_step_spec=self._time_step_spec,
                                  action_spec=self._action_spec,
                                  q_network=network)

        saver = policy_saver.PolicySaver(policy, batch_size=None)
        path = os.path.join(self.get_temp_dir(), 'save_model')

        self.evaluate(tf.compat.v1.global_variables_initializer())
        saver.save(path)
        checkpoint_path = os.path.join(self.get_temp_dir(), 'checkpoint')
        saver.save_checkpoint(checkpoint_path)

        self.assertTrue(tf.compat.v2.io.gfile.exists(checkpoint_path))
Example #16
    def testMasking(self):
        batch_size = 3
        num_state_dims = 5
        num_actions = 6
        states = tf.random.uniform([batch_size, num_state_dims])
        input_tensor_spec = tensor_spec.TensorSpec([num_state_dims],
                                                   tf.float32)
        action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0,
                                                    num_actions - 1)
        mask = tf.constant([[1, 0, 1, 0, 0, 1] for _ in range(batch_size)])
        network = q_network.QNetwork(input_tensor_spec,
                                     action_spec,
                                     mask_split_fn=lambda observation:
                                     (observation, mask))
        self.assertIsNotNone(network.mask_split_fn)

        # Run a pass through the network to catch any shape errors.
        network(states)
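mask_split_fn attaches the mask at the network level. The same masking is more commonly wired in at the agent or policy level through observation_and_action_constraint_splitter; a hedged sketch follows, assuming a dict observation that carries its own mask (names and shapes are illustrative, not from the original test).

import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = {
    'state': tensor_spec.TensorSpec([5], tf.float32),
    'mask': tensor_spec.BoundedTensorSpec([6], tf.int32, 0, 1),
}
action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 5)

def splitter(observation):
    # Give the network only the state; give the policy the action mask.
    return observation['state'], observation['mask']

q_net = q_network.QNetwork(observation_spec['state'], action_spec)
agent = dqn_agent.DqnAgent(
    ts.time_step_spec(observation_spec),
    action_spec,
    q_network=q_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    observation_and_action_constraint_splitter=splitter)
agent.initialize()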
Example #17
    def testTrainStepNotSaved(self):
        network = q_network.QNetwork(
            input_tensor_spec=self._time_step_spec.observation,
            action_spec=self._action_spec)

        policy = q_policy.QPolicy(time_step_spec=self._time_step_spec,
                                  action_spec=self._action_spec,
                                  q_network=network)

        saver = policy_saver.PolicySaver(policy, batch_size=None)
        path = os.path.join(self.get_temp_dir(), 'save_model')

        saver.save(path)
        reloaded = tf.compat.v2.saved_model.load(path)

        self.assertIn('get_train_step', reloaded.signatures)
        train_step_value = self.evaluate(reloaded.get_train_step())
        self.assertEqual(-1, train_step_value)
Example #18
    def __init__(self, *args, **kwargs):
        self.env = tf_py_environment.TFPyEnvironment(CardGameEnv())
        self.q_net = q_network.QNetwork(self.env.observation_spec(),
                                        self.env.action_spec())
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        self.train_step_counter = tf.Variable(0)
        self.agent = dqn_agent.DqnAgent(
            self.env.time_step_spec(),
            self.env.action_spec(),
            q_network=self.q_net,
            optimizer=self.optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=self.train_step_counter)

        self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.env.batch_size,
            max_length=100000)
        self.num_iterations = 10000
Example #19
 def initialize_agent(self):
     """ Instance TF agent with hparams
     """
     # Q network
     self.q_net = q_network.QNetwork(
         self.train_env.observation_spec(),
         self.train_env.action_spec(),
         fc_layer_params=self.qnet_fc_hidden_size)
     # DQN agent
     self.agent = dqn_agent.DqnAgent(
         self.train_env.time_step_spec(),
         self.train_env.action_spec(),
         q_network=self.q_net,
         optimizer=self.optimizer,
         epsilon_greedy=0.1,  # [TODO] - add the hyper param
         td_errors_loss_fn=common.element_wise_squared_loss,
         train_step_counter=self.train_step_counter)
     self.agent.initialize()
     self.policy = self.agent.policy
Example #20
def generic_dqn_agent(env: TFPyEnvironment) -> (dqn_agent.DqnAgent, q_network.QNetwork):
    """ Function that returns a generic dqn agent
    args:
        env (TFPyEnvironment) : The environment the agent will live in

    Returns:
        dqn_agent.DqnAgent: The agent to train
        q_network.QNetwork: The network used in the agent
    """

    inp = env.observation_spec().shape[0]
    q_net = q_network.QNetwork(
      env.observation_spec(),
      env.action_spec(),
      fc_layer_params=(20,20,20,20,20),
      activation_fn=tf.keras.activations.relu)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

    agent = dqn_agent.DqnAgent(
      env.time_step_spec(),
      env.action_spec(),
      q_network=q_net,
      optimizer=optimizer,
      td_errors_loss_fn=common.element_wise_squared_loss,
      train_step_counter=tf.Variable(0),
      epsilon_greedy=0.1
    )

    """def observation_and_action_constraint_splitter(observation):
        action_mask = [1,1]
        if observation[0][-1] > 5:
            action_mask[0] = 1
        return observation, tf.convert_to_tensor(action_mask, dtype=np.int32)

    agent.policy._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter
    )"""
    #tf_agents.policies.greedy_policy.GreedyPolicy

    agent.initialize()

    return agent, q_net
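A short usage sketch of the returned agent: collect experience into a replay buffer with a driver. The environment name, buffer size, and step count are assumptions, not part of the original snippet.

from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.replay_buffers import tf_uniform_replay_buffer

env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
agent, q_net = generic_dqn_agent(env)

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=env.batch_size,
    max_length=10000)

# Drive the environment with the collect policy, writing each transition
# into the replay buffer.
driver = dynamic_step_driver.DynamicStepDriver(
    env,
    agent.collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=100)
driver.run()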
Example #21
 def testAgentWithDifferentSubagentsUpdate(self):
     num_actions = 3
     context_dim = 2
     batch_size = 7
     observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
     time_step_spec = time_step.time_step_spec(observation_spec)
     action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                 shape=(),
                                                 minimum=0,
                                                 maximum=num_actions - 1)
     agent1 = lin_ucb_agent.LinearUCBAgent(
         time_step_spec,
         action_spec,
         emit_policy_info=(
             policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN, ))
     reward_net = q_network.QNetwork(input_tensor_spec=observation_spec,
                                     action_spec=action_spec,
                                     fc_layer_params=(4, 3, 2))
     agent2 = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
         time_step_spec,
         action_spec,
         reward_network=reward_net,
         emit_policy_info=(
             policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN, ),
         optimizer=tf.compat.v1.train.GradientDescentOptimizer(
             learning_rate=0.1),
         epsilon=0.1)
     agents = [agent1, agent2]
     mixture_agent = static_mixture_agent.StaticMixtureAgent([1, 1], agents)
     initial_step, final_step = _get_initial_and_final_steps(
         batch_size, context_dim)
     action = np.random.randint(num_actions,
                                size=batch_size,
                                dtype=np.int32)
     action_step = _get_action_step(action, 2, num_actions)
     experience = _get_experience(initial_step, action_step, final_step)
     for agent in agents:
         self.evaluate(agent.initialize())
     self.evaluate(tf.compat.v1.initialize_all_variables())
     self.evaluate(mixture_agent.initialize())
     loss_info = mixture_agent.train(experience)
     self.evaluate(loss_info)
Example #22
    def testIndicatorFeatureColumnInput(self):
        key = 'feature_key'
        vocab_list = [2, 3, 4]
        column = tf.feature_column.categorical_column_with_vocabulary_list(
            key, vocab_list)
        column = tf.feature_column.indicator_column(column)
        feature_tensor = tf.convert_to_tensor([3, 2, 2, 4, 3])
        state = {key: tf.expand_dims(feature_tensor, -1)}
        state_spec = {key: tensor_spec.TensorSpec([1], tf.int32)}

        online_network = q_network.QNetwork(
            input_tensor_spec=state_spec,
            action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
            preprocessing_combiner=tf.keras.layers.DenseFeatures([column]))
        target_network = online_network.copy(name='TargetNetwork')
        q_online, _ = online_network(state)
        q_target, _ = target_network(state)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(tf.compat.v1.initializers.tables_initializer())
        self.assertAllClose(q_online, q_target, rtol=1.0, atol=1.0)
Example #23
    def testTrainStepSaved(self):
        # We need to use one default session so that self.evaluate and the
        # SavedModel loader share the same session.
        with tf.compat.v1.Session().as_default():
            network = q_network.QNetwork(
                input_tensor_spec=self._time_step_spec.observation,
                action_spec=self._action_spec)

            policy = q_policy.QPolicy(time_step_spec=self._time_step_spec,
                                      action_spec=self._action_spec,
                                      q_network=network)
            self.evaluate(
                tf.compat.v1.initializers.variables(policy.variables()))

            train_step = common.create_variable('train_step', initial_value=7)
            self.evaluate(tf.compat.v1.initializers.variables([train_step]))

            saver = policy_saver.PolicySaver(policy,
                                             batch_size=None,
                                             train_step=train_step)
            if tf.executing_eagerly():
                step = saver.get_train_step()
            else:
                step = self.evaluate(saver.get_train_step())
            self.assertEqual(7, step)
            path = os.path.join(self.get_temp_dir(), 'save_model')
            saver.save(path)

            reloaded = tf.compat.v2.saved_model.load(path)
            self.assertIn('get_train_step', reloaded.signatures)
            self.evaluate(tf.compat.v1.global_variables_initializer())
            train_step_value = self.evaluate(reloaded.get_train_step())
            self.assertEqual(7, train_step_value)
            train_step = train_step.assign_add(3)
            self.evaluate(train_step)
            saver.save(path)

            reloaded = tf.compat.v2.saved_model.load(path)
            self.evaluate(tf.compat.v1.global_variables_initializer())
            train_step_value = self.evaluate(reloaded.get_train_step())
            self.assertEqual(10, train_step_value)
Example #24
    def testTrainWithNN(self, is_convert, is_distribution_network):
        # Hard code a trajectory shaped (time=6, batch=1, ...).
        traj, time_step_spec, action_spec = create_arbitrary_trajectory()

        if is_convert:
            # Convert to single step trajectory of shapes (batch=6, 1, ...).
            traj = tf.nest.map_structure(common.transpose_batch_time, traj)

        if is_distribution_network:
            cloning_net = sequential.Sequential([
                expand_dims_layer.ExpandDims(-1),
                tf.keras.layers.Dense(action_spec.maximum -
                                      action_spec.minimum + 1),
                tf.keras.layers.Lambda(
                    lambda t: tfp.distributions.Categorical(logits=t)),
            ])
        else:
            cloning_net = q_network.QNetwork(time_step_spec.observation,
                                             action_spec)
        agent = behavioral_cloning_agent.BehavioralCloningAgent(
            time_step_spec,
            action_spec,
            cloning_network=cloning_net,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001),
            num_outer_dims=2)
        # Disable clipping to make sure we can see the difference in behavior
        agent.policy._clip = False
        # TODO(b/123883319)
        if tf.executing_eagerly():
            train_and_loss = lambda: agent.train(traj)
        else:
            train_and_loss = agent.train(traj)
        self.evaluate(tf.compat.v1.global_variables_initializer())

        initial_loss = self.evaluate(train_and_loss).loss
        for _ in range(TRAIN_ITERATIONS - 1):
            loss = self.evaluate(train_and_loss).loss

        # We don't necessarily converge to the same actions as in trajectory after
        # 10 steps of an untuned optimizer, but the loss should go down.
        self.assertGreater(initial_loss, loss)
Example #25
    def __init__(self, environment):
        self.preprocessing_layers = {
            'history': tf.keras.models.Sequential([
                tf.keras.layers.Embedding(4, 16, input_length=1 * 24 * 60 // 15),
                tf.keras.layers.LSTM(32),
                tf.keras.layers.Flatten(),
                tf.keras.layers.Dense(50, activation="relu")
            ]),
            'boiler_state': tf.keras.layers.Dense(1, activation="relu"),
            "usage_state": tf.keras.layers.Dense(1, activation="relu"),
            "water_temperature": tf.keras.layers.Dense(1, activation="relu"),
        }

        self.preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1)
        self.q_net = q_network.QNetwork(
            environment.observation_spec(),
            environment.action_spec(),
            fc_layer_params=fc_layer_params,
            preprocessing_layers=self.preprocessing_layers,
            preprocessing_combiner=self.preprocessing_combiner
        )
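fc_layer_params in the constructor above is read from the enclosing scope, and the dict of preprocessing layers implies a matching dict observation spec. A hedged sketch of both is given below; the layer sizes, shapes, and bounds are illustrative guesses, not values from the original code.

import tensorflow as tf
from tf_agents.specs import tensor_spec

# Assumed module-level constant used by the QNetwork above.
fc_layer_params = (100, 50)

# Keys must match the preprocessing_layers dict; shapes are assumptions.
observation_spec = {
    'history': tensor_spec.BoundedTensorSpec([1 * 24 * 60 // 15], tf.int32, 0, 3),
    'boiler_state': tensor_spec.TensorSpec([1], tf.float32),
    'usage_state': tensor_spec.TensorSpec([1], tf.float32),
    'water_temperature': tensor_spec.TensorSpec([1], tf.float32),
}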
Example #26
    def testTrainStepNotSaved(self):
        if not common.has_eager_been_enabled():
            self.skipTest('Only supported in TF2.x. Step is required in TF1.x')

        network = q_network.QNetwork(
            input_tensor_spec=self._time_step_spec.observation,
            action_spec=self._action_spec)

        policy = q_policy.QPolicy(time_step_spec=self._time_step_spec,
                                  action_spec=self._action_spec,
                                  q_network=network)

        saver = policy_saver.PolicySaver(policy, batch_size=None)
        path = os.path.join(self.get_temp_dir(), 'save_model')

        saver.save(path)
        reloaded = tf.compat.v2.saved_model.load(path)

        self.assertIn('get_train_step', reloaded.signatures)
        train_step_value = self.evaluate(reloaded.get_train_step())
        self.assertEqual(-1, train_step_value)
Example #27
    def testUniqueSignatures(self):
        network = q_network.QNetwork(
            input_tensor_spec=self._time_step_spec.observation,
            action_spec=self._action_spec)

        policy = q_policy.QPolicy(time_step_spec=self._time_step_spec,
                                  action_spec=self._action_spec,
                                  q_network=network)

        saver = policy_saver.PolicySaver(policy, batch_size=None)
        action_signature_names = [
            s.name for s in saver._signatures['action'].input_signature
        ]
        self.assertAllEqual(
            ['0/step_type', '0/reward', '0/discount', '0/observation'],
            action_signature_names)
        initial_state_signature_names = [
            s.name
            for s in saver._signatures['get_initial_state'].input_signature
        ]
        self.assertAllEqual(['batch_size'], initial_state_signature_names)
Example #28
    def testTrainStepNotSaved(self):
        if not tf.executing_eagerly():
            self.skipTest(
                'b/129079730: PolicySaver does not work in TF1.x yet')
        network = q_network.QNetwork(
            input_tensor_spec=self._time_step_spec.observation,
            action_spec=self._action_spec)

        policy = q_policy.QPolicy(time_step_spec=self._time_step_spec,
                                  action_spec=self._action_spec,
                                  q_network=network)

        saver = policy_saver.PolicySaver(policy, batch_size=None)
        path = os.path.join(self.get_temp_dir(), 'save_model')

        saver.save(path)
        reloaded = tf.compat.v2.saved_model.load(path)

        self.assertIn('get_train_step', reloaded.signatures)
        train_step_value = self.evaluate(reloaded.get_train_step())
        self.assertEqual(-1, train_step_value)
Example #29
    def __init__(self,
                 train_environment,
                 eval_environment,
                 replay_buffer_capacity=1000,
                 fc_layer_params=(100, ),
                 learning_rate=1e-3):
        # Use TF Environment Wrappers to translate them for TF
        self.train_env = tf_py_environment.TFPyEnvironment(train_environment)
        self.eval_env = tf_py_environment.TFPyEnvironment(eval_environment)

        # Define Q-Network
        q_net = q_network.QNetwork(self.train_env.observation_spec(),
                                   self.train_env.action_spec(),
                                   fc_layer_params=fc_layer_params)

        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

        train_step_counter = tf.compat.v2.Variable(0)
        # Define Agent
        self.agent = dqn_agent.DqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
            train_step_counter=train_step_counter)

        self.agent.initialize()

        self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=replay_buffer_capacity)

        self.eval_policy = self.agent.policy
        self.collect_policy = self.agent.collect_policy

        self.random_policy = random_tf_policy.RandomTFPolicy(
            self.train_env.time_step_spec(), self.train_env.action_spec())
Example #30
 def testAddPreprocessingLayers(self):
     batch_size = 3
     num_actions = 2
     states = (tf.random.uniform([batch_size,
                                  1]), tf.random.uniform([batch_size]))
     preprocessing_layers = (tf.keras.layers.Dense(4),
                             tf.keras.Sequential([
                                 tf.keras.layers.Reshape((1, )),
                                 tf.keras.layers.Dense(4)
                             ]))
     network = q_network.QNetwork(
         input_tensor_spec=(tensor_spec.TensorSpec([1], tf.float32),
                            tensor_spec.TensorSpec([], tf.float32)),
         preprocessing_layers=preprocessing_layers,
         preprocessing_combiner=tf.keras.layers.Add(),
         action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0,
                                                   num_actions - 1))
     q_values, _ = network(states)
     self.assertAllEqual(q_values.shape.as_list(),
                         [batch_size, num_actions])
     # At least 2 variables each for the preprocessing layers.
     self.assertGreater(len(network.trainable_variables), 4)