Example #1
def test_episodic_observer_assert_sequence_length_positive(self):
  with self.assertRaises(ValueError):
    _ = reverb_utils.ReverbAddEpisodeObserver(
        self._client,
        table_name='test_table',
        max_sequence_length=-1,
        priority=3)
Example #2
def test_episodic_observer_update_priority(self):
  observer = reverb_utils.ReverbAddEpisodeObserver(
      self._client,
      table_name='test_table',
      max_sequence_length=1,
      priority=3)
  self.assertEqual(observer._priority, 3)
  observer.update_priority(4)
  self.assertEqual(observer._priority, 4)
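
Note: both tests above reference self._client without showing where it comes from. A minimal sketch of a plausible fixture, assuming an in-process Reverb server; the class name, table construction, and teardown are assumptions, not the original test code:

import reverb
import tensorflow as tf


class ReverbAddEpisodeObserverTest(tf.test.TestCase):

  def setUp(self):
    super().setUp()
    # Start an in-process Reverb server exposing the table the tests expect.
    self._server = reverb.Server(
        tables=[reverb.Table.queue(name='test_table', max_size=10)])
    self._client = reverb.Client(f'localhost:{self._server.port}')

  def tearDown(self):
    self._server.stop()
    super().tearDown()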
Example #3
def train_agent(iterations, modeldir, logdir, policydir):
    """Train and convert the model using TF Agents."""

    # TODO: add code to instantiate the training and evaluation environments

    # TODO: add code to create a reinforcement learning agent that is going to be trained

    tf_agent.initialize()

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    tf_policy_saver = policy_saver.PolicySaver(collect_policy)

    # Use reverb as replay buffer
    replay_buffer_signature = tensor_spec.from_spec(tf_agent.collect_data_spec)
    table = reverb.Table(
        REPLAY_BUFFER_TABLE_NAME,
        max_size=REPLAY_BUFFER_CAPACITY,
        sampler=reverb.selectors.Uniform(),
        remover=reverb.selectors.Fifo(),
        rate_limiter=reverb.rate_limiters.MinSize(1),
        signature=replay_buffer_signature,
    )  # specify signature here for validation at insertion time

    reverb_server = reverb.Server([table])

    replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
        tf_agent.collect_data_spec,
        sequence_length=None,
        table_name=REPLAY_BUFFER_TABLE_NAME,
        local_server=reverb_server,
    )

    replay_buffer_observer = reverb_utils.ReverbAddEpisodeObserver(
        replay_buffer.py_client, REPLAY_BUFFER_TABLE_NAME,
        REPLAY_BUFFER_CAPACITY)

    # Optimize by wrapping some of the code in a graph using TF function.
    tf_agent.train = common.function(tf_agent.train)

    # Evaluate the agent's policy once before training.
    avg_return = compute_avg_return_and_steps(eval_env, tf_agent.policy,
                                              NUM_EVAL_EPISODES)

    summary_writer = tf.summary.create_file_writer(logdir)

    for i in range(iterations):
        # TODO: add code to collect game episodes and train the agent

        logger = tf.get_logger()
        if i % EVAL_INTERVAL == 0:
            avg_return, avg_episode_length = compute_avg_return_and_steps(
                eval_env, eval_policy, NUM_EVAL_EPISODES)
            with summary_writer.as_default():
                tf.summary.scalar("Average return", avg_return, step=i)
                tf.summary.scalar("Average episode length",
                                  avg_episode_length,
                                  step=i)
                summary_writer.flush()
            logger.info(
                "iteration = {0}: Average Return = {1}, Average Episode Length = {2}"
                .format(i, avg_return, avg_episode_length))

    summary_writer.close()

    tf_policy_saver.save(policydir)
Example #4
table = reverb.Table(table_name,
                     max_size=replay_buffer_capacity,
                     sampler=reverb.selectors.Uniform(),
                     remover=reverb.selectors.Fifo(),
                     rate_limiter=reverb.rate_limiters.MinSize(1),
                     signature=replay_buffer_signature)

reverb_server = reverb.Server([table])

replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
    tf_agent.collect_data_spec,
    table_name=table_name,
    sequence_length=None,
    local_server=reverb_server)

rb_observer = reverb_utils.ReverbAddEpisodeObserver(replay_buffer.py_client,
                                                    table_name,
                                                    replay_buffer_capacity)


def collect_episode(environment, policy, num_episodes):

    driver = py_driver.PyDriver(environment,
                                py_tf_eager_policy.PyTFEagerPolicy(
                                    policy, use_tf_function=True),
                                [rb_observer],
                                max_episodes=num_episodes)
    initial_time_step = environment.reset()
    driver.run(initial_time_step)


# (Optional) Optimize by wrapping some of the code in a graph using TF function.
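
Note: a hedged usage sketch for collect_episode above; train_py_env, tf_agent, and COLLECT_EPISODES_PER_ITERATION are assumed to come from the surrounding program (as in Example #6) and are not defined in this snippet:

# Episodes are written to the Reverb table through the module-level rb_observer.
collect_episode(train_py_env, tf_agent.collect_policy,
                COLLECT_EPISODES_PER_ITERATION)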
Example #5
def _create_and_yield(client):
  yield reverb_utils.ReverbAddEpisodeObserver(client, *args, **kwargs)
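
Note: Example #5 looks like the body of a context-manager helper. A minimal sketch of how such a generator is typically wrapped and used; the decorator, the explicit arguments, and the close() call are assumptions, not taken from the original source:

import contextlib

from tf_agents.replay_buffers import reverb_utils


@contextlib.contextmanager
def _create_and_yield(client, *args, **kwargs):
  # Hand the observer to the caller, then release its Reverb writer on exit.
  observer = reverb_utils.ReverbAddEpisodeObserver(client, *args, **kwargs)
  try:
    yield observer
  finally:
    observer.close()


# Usage:
# with _create_and_yield(client, table_name='test_table',
#                        max_sequence_length=100, priority=1) as observer:
#   ...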
Example #6
def train_agent(iterations, modeldir, logdir, policydir):
    """Train and convert the model using TF Agents."""

    train_py_env = planestrike_py_environment.PlaneStrikePyEnvironment(
        board_size=BOARD_SIZE, discount=DISCOUNT, max_steps=BOARD_SIZE**2)
    eval_py_env = planestrike_py_environment.PlaneStrikePyEnvironment(
        board_size=BOARD_SIZE, discount=DISCOUNT, max_steps=BOARD_SIZE**2)

    train_env = tf_py_environment.TFPyEnvironment(train_py_env)
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    # Alternatively you could use ActorDistributionNetwork as actor_net
    actor_net = tfa.networks.Sequential(
        [
            tfa.keras_layers.InnerReshape([BOARD_SIZE, BOARD_SIZE],
                                          [BOARD_SIZE**2]),
            tf.keras.layers.Dense(FC_LAYER_PARAMS, activation='relu'),
            tf.keras.layers.Dense(BOARD_SIZE**2),
            tf.keras.layers.Lambda(
                lambda t: tfp.distributions.Categorical(logits=t)),
        ],
        input_spec=train_py_env.observation_spec())

    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

    train_step_counter = tf.Variable(0)

    tf_agent = reinforce_agent.ReinforceAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        actor_network=actor_net,
        optimizer=optimizer,
        normalize_returns=True,
        train_step_counter=train_step_counter)

    tf_agent.initialize()

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    tf_policy_saver = policy_saver.PolicySaver(collect_policy)

    # Use reverb as replay buffer
    replay_buffer_signature = tensor_spec.from_spec(tf_agent.collect_data_spec)
    table = reverb.Table(
        REPLAY_BUFFER_TABLE_NAME,
        max_size=REPLAY_BUFFER_CAPACITY,
        sampler=reverb.selectors.Uniform(),
        remover=reverb.selectors.Fifo(),
        rate_limiter=reverb.rate_limiters.MinSize(1),
        signature=replay_buffer_signature
    )  # specify signature here for validation at insertion time

    reverb_server = reverb.Server([table])

    replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
        tf_agent.collect_data_spec,
        sequence_length=None,
        table_name=REPLAY_BUFFER_TABLE_NAME,
        local_server=reverb_server)

    replay_buffer_observer = reverb_utils.ReverbAddEpisodeObserver(
        replay_buffer.py_client, REPLAY_BUFFER_TABLE_NAME,
        REPLAY_BUFFER_CAPACITY)

    # Optimize by wrapping some of the code in a graph using TF function.
    tf_agent.train = common.function(tf_agent.train)

    # Evaluate the agent's policy once before training.
    avg_return = compute_avg_return_and_steps(eval_env, tf_agent.policy,
                                              NUM_EVAL_EPISODES)

    summary_writer = tf.summary.create_file_writer(logdir)

    for i in range(iterations):
        # Collect a few episodes using collect_policy and save to the replay buffer.
        collect_episode(train_py_env, collect_policy,
                        COLLECT_EPISODES_PER_ITERATION, replay_buffer_observer)

        # Use data from the buffer and update the agent's network.
        iterator = iter(replay_buffer.as_dataset(sample_batch_size=1))
        trajectories, _ = next(iterator)
        tf_agent.train(experience=trajectories)
        replay_buffer.clear()

        logger = tf.get_logger()
        if i % EVAL_INTERVAL == 0:
            avg_return, avg_episode_length = compute_avg_return_and_steps(
                eval_env, eval_policy, NUM_EVAL_EPISODES)
            with summary_writer.as_default():
                tf.summary.scalar('Average return', avg_return, step=i)
                tf.summary.scalar('Average episode length',
                                  avg_episode_length,
                                  step=i)
                summary_writer.flush()
            logger.info(
                'iteration = {0}: Average Return = {1}, Average Episode Length = {2}'
                .format(i, avg_return, avg_episode_length))

    summary_writer.close()

    tf_policy_saver.save(policydir)
    # Convert to tflite model
    converter = tf.lite.TFLiteConverter.from_saved_model(
        policydir, signature_keys=['action'])
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,  # enable TensorFlow Lite ops.
        tf.lite.OpsSet.SELECT_TF_OPS  # enable TensorFlow ops.
    ]
    tflite_policy = converter.convert()
    with open(os.path.join(modeldir, 'planestrike_tf_agents.tflite'),
              'wb') as f:
        f.write(tflite_policy)
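
Note: both train_agent variants call compute_avg_return_and_steps, and Example #6 calls a four-argument collect_episode; neither helper appears in this section. A minimal sketch of plausible implementations, following the PyDriver pattern from Example #4 (the original helpers may differ):

from tf_agents.drivers import py_driver
from tf_agents.policies import py_tf_eager_policy


def collect_episode(environment, policy, num_episodes, observer):
    # Drive the Python environment with an eager-wrapped policy and push each
    # finished episode into Reverb through the observer.
    driver = py_driver.PyDriver(
        environment,
        py_tf_eager_policy.PyTFEagerPolicy(policy, use_tf_function=True),
        [observer],
        max_episodes=num_episodes)
    initial_time_step = environment.reset()
    driver.run(initial_time_step)


def compute_avg_return_and_steps(environment, policy, num_episodes):
    # Average undiscounted return and episode length of `policy` over
    # `num_episodes` rollouts in `environment`.
    total_return = 0.0
    total_steps = 0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        episode_steps = 0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
            episode_steps += 1
        total_return += episode_return
        total_steps += episode_steps
    return total_return / num_episodes, total_steps / num_episodes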