def train_dyke_agent(train_env: TFPyEnvironment, eval_env: TFPyEnvironment,
                     agent: DqnAgent, train_steps: int, steps_per_episode: int,
                     eval_episodes: int) -> Dict[str, Any]:
    """
	Trains the DQN agent on the dyke maintenance task.

	:param train_env: The training environment.
	:param eval_env: The environment for testing agent performance.
	:param agent: The agent.
	:param train_steps: The number of training steps to use.
	:param steps_per_episode: The number of time steps that can be taken in a single dyke environment episode.
	:param eval_episodes: The number of episodes to use per evaluation.
	:return: A mapping to various metrics pertaining to the training's results.
	"""
    losses: np.ndarray = np.zeros(shape=(train_steps, steps_per_episode))
    evaluations: np.ndarray = np.zeros(shape=(train_steps, eval_episodes))
    train_metrics: Tuple = (AverageReturnMetric, )
    train_metric_results: np.ndarray = np.zeros(shape=(len(train_metrics),
                                                       train_steps,
                                                       steps_per_episode))
    for step in range(train_steps):
        # we uniformly sample experiences (single time steps) from one episode per train step
        print('STEP %d/%d' % (step + 1, train_steps))
        train_env.reset()
        rep_buf = _dyke_replay_buffer(train_env, agent, steps_per_episode)
        train_metric_inst: Tuple = tuple(
            [metric() for metric in train_metrics])  # instantiate the metrics
        obs: Tuple = (rep_buf.add_batch, ) + train_metric_inst
        _ = DynamicStepDriver(
            env=train_env,
            policy=agent.collect_policy,
            observers=obs,
            num_steps=steps_per_episode
        ).run(
        )  # experience a single episode using the agent's current configuration
        dataset: tf.data.Dataset = rep_buf.as_dataset(
            sample_batch_size=_REP_BUF_BATCH_SIZE,
            num_steps=_REP_BUF_NUM_STEPS)
        iterator = iter(dataset)
        for tr in range(steps_per_episode):
            trajectories, _ = next(iterator)
            losses[step, tr] = agent.train(experience=trajectories).loss
            for met in range(len(train_metrics)):
                train_metric_results[
                    met, step, tr] = train_metric_inst[met].result().numpy()
        evaluations[step, :] = _evaluate_dyke_agent(eval_env, agent,
                                                    eval_episodes)
    return {
        'loss': losses,
        'eval': evaluations,
        'train-metrics': train_metric_results
    }
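The helpers _dyke_replay_buffer, _evaluate_dyke_agent, _REP_BUF_BATCH_SIZE and _REP_BUF_NUM_STEPS are defined elsewhere in the source module. A minimal sketch of the replay-buffer helper, assuming it simply wraps a TFUniformReplayBuffer sized to a single episode (an assumption, not the original implementation):

from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer

def _dyke_replay_buffer(train_env: TFPyEnvironment, agent: DqnAgent,
                        steps_per_episode: int) -> TFUniformReplayBuffer:
    # Holds at most one episode of experience collected with the agent's collect policy.
    return TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=steps_per_episode)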
Example #2
def create_pong_agent(train_environment: TFEnvironment,
                      dense_layer_sizes: Sequence[int],
                      learning_rate: float) -> Tuple[DqnAgent, QNetwork]:

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    global_step = tf.compat.v1.train.get_or_create_global_step()

    q_net = QNetwork(input_tensor_spec=train_environment.observation_spec(),
                     action_spec=train_environment.action_spec(),
                     fc_layer_params=dense_layer_sizes)

    agent = DqnAgent(time_step_spec=train_environment.time_step_spec(),
                     action_spec=train_environment.action_spec(),
                     q_network=q_net,
                     optimizer=optimizer,
                     td_errors_loss_fn=element_wise_squared_loss,
                     train_step_counter=global_step)

    agent.initialize()
    agent.train = common.function(agent.train)
    agent.train_step_counter.assign(0)

    return agent, q_net
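A brief usage sketch for this factory; the environment name and layer sizes below are illustrative assumptions, not part of the original example:

from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment

train_environment = TFPyEnvironment(suite_gym.load("CartPole-v0"))  # assumed environment
agent, q_net = create_pong_agent(train_environment,
                                 dense_layer_sizes=(100, 50),  # assumed layer sizes
                                 learning_rate=1e-3)           # assumed learning rate
print(agent.policy)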
Example #3
# ε-greedy exploration schedule: linearly decay ε during training
# (the opening of this call and the initial ε value are assumed)
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,  # initial ε (assumed)
    decay_steps=epsilon_decay_steps,
    end_learning_rate=epsilon_final)  # final ε

agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=target_update_period,
    td_errors_loss_fn=keras.losses.Huber(reduction="none"),
    gamma=discount_factor,  # discount factor
    train_step_counter=train_step,
    epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()
# Speed up as tensorflow function
agent.train = function(agent.train)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    # Determines the data spec type
    data_spec=agent.collect_data_spec,
    # The number of trajectories added at each step
    batch_size=tf_env.batch_size,
    # Maximum number of trajectories the buffer can hold (large buffers need a lot of RAM)
    max_length=n_iterations)

# Create the observer that adds trajectories to the replay buffer
replay_buffer_observer = replay_buffer.add_batch
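This observer is typically passed to a collect driver together with any training metrics; a minimal sketch, where the number of steps per call is an assumption:

from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

# Each call to run() steps the collect policy and sends the resulting
# trajectories to the replay buffer through the observer.
collect_driver = DynamicStepDriver(
    tf_env,
    agent.collect_policy,
    observers=[replay_buffer_observer],
    num_steps=4)  # steps per run() call (assumed value)
time_step, policy_state = collect_driver.run()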
Example #4
# execute the random policy in the environment for a few steps
# and record the data (observations, actions, rewards etc) in the replay buffer
print("Collecting initial random steps")
random_policy = RandomTFPolicy(train_env.time_step_spec(),
                               train_env.action_spec())
for _ in range(initial_collect_steps):
    collect_step(train_env, random_policy)

dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                   sample_batch_size=batch_size,
                                   num_steps=2).prefetch(3)
iterator = iter(dataset)

# train the agent
print("Training the agent")
tf_agent.train = common.function(tf_agent.train)

# Reset the train step
tf_agent.train_step_counter.assign(0)


def compute_avg_reward(environment, policy, num_episodes=10):
    total_reward = 0.0
    for episode in range(num_episodes):
        print(f"Computing reward: game {episode + 1}/{num_episodes}")

        time_step = environment.reset()
        episode_reward = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_reward += time_step.reward

        total_reward += episode_reward

    avg_reward = total_reward / num_episodes
    return avg_reward
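The collect_step helper used for the initial random collection above is not shown in this fragment; a minimal sketch, assuming it writes directly into the replay_buffer defined elsewhere in the script:

from tf_agents.trajectories import trajectory

def collect_step(environment, policy):
    # Take a single step with the given policy and store the transition
    # in the (module-level) replay buffer.
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    replay_buffer.add_batch(
        trajectory.from_transition(time_step, action_step, next_time_step))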
Example #5
class DQNAgent:
    def __init__(self) -> None:
        """
        A class for training a TF-agent
        based on https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial
        """

        self.train_env = None  # Training environment
        self.agent = None  # The algorithm used to solve an RL problem is represented by a TF-Agent
        self.replay_buffer = None  # The replay buffer keeps track of data collected from the environment
        self.dataset = None  # The agent needs access to the replay buffer via an iterable tf.data.Dataset
        self.iterator = None  # The iterator of self.dataset

    def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float, gamma: float, imb_ratio: float,
                replay_buffer_max_length: int, layers: dict) -> None:
        """
        Create the Q-network, agent and policy

        Args:
            X_train: A np.ndarray for training samples.
            y_train: A np.ndarray for the class labels of the training samples.
            lr: learn rate for the optimizer (default Adam)
            epsilon: Used for the default epsilon greedy policy for choosing a random action.
            gamma: The discount factor for learning Q-values
            imb_ratio: ratio of imbalance. Used to specifiy reward in the environment
            replay_buffer_max_length: Maximum lenght of replay memory.
            layers: A dict containing the layers of the Q-Network (eg, conv, dense, rnn, dropout).
        """

        dense_layers = layers.get("dense")
        conv_layers = layers.get("conv")
        dropout_layers = layers.get("dropout")

        self.train_env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio))  # create a custom environment

        q_net = QNetwork(self.train_env.observation_spec(), self.train_env.action_spec(), conv_layer_params=conv_layers,
                         fc_layer_params=dense_layers, dropout_layer_params=dropout_layers)

        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)

        train_step_counter = tf.Variable(0)

        self.agent = DqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=train_step_counter,
            gamma=gamma,
            epsilon_greedy=epsilon,
        )

        self.agent.initialize()

        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=replay_buffer_max_length)

    def fit(self, X_train: np.ndarray, y_train: np.ndarray, epochs: int, batch_size: int, eval_step: int, log_step: int,
            collect_steps_per_episode: int) -> None:
        """
        Starts the training of the Agent.

        Args:
            X_train: A np.ndarray for training samples.
            y_train: A np.ndarray for the class labels of the training samples.
            epochs: Number of epochs to train Agent
            batch_size: The Batch Size
            eval_step: Evaluate Model each 'eval_step'
            log_step: Monitor results of model each 'log_step'
            collect_steps_per_episode: Collect a few steps using collect_policy and save to the replay buffer.
        """

        self.dataset = self.replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=2).prefetch(3)

        self.iterator = iter(self.dataset)

        def collect_step(environment, policy, buffer):
            time_step = environment.current_time_step()
            action_step = policy.action(time_step)
            next_time_step = environment.step(action_step.action)
            traj = trajectory.from_transition(time_step, action_step, next_time_step)

            # Add trajectory to the replay buffer
            buffer.add_batch(traj)

        def collect_data(env, policy, buffer, steps):
            for _ in range(steps):
                collect_step(env, policy, buffer)

        # (Optional) Optimize by wrapping some of the code in a graph using TF function.
        self.agent.train = common.function(self.agent.train)

        # Reset the train step
        self.agent.train_step_counter.assign(0)

        for _ in range(epochs):
            #print("epoch: ", _)
            # Collect a few steps using collect_policy and save to the replay buffer.
            collect_data(self.train_env, self.agent.collect_policy, self.replay_buffer, collect_steps_per_episode)

            # Sample a batch of data from the buffer and update the agent's network.
            experience, _ = next(self.iterator)
            train_loss = self.agent.train(experience).loss

            step = self.agent.train_step_counter.numpy()

            if step % log_step == 0:
                print('step = {0}: loss = {1}'.format(step, train_loss))

            if step % eval_step == 0:
                metrics = self.compute_metrics(X_train, y_train)
                print(metrics)

    def compute_metrics(self, X: np.ndarray, y_true: list) -> dict:
        """Compute Metrics for Evaluation"""
        # TODO: apply softmax layer for q logits?

        q, _ = self.agent._target_q_network(X, training=False)

        # y_scores = np.max(q.numpy(), axis=1)  # predicted scores (Q-Values)
        y_pred = np.argmax(q.numpy(), axis=1)  # predicted class label

        metrics = custom_metrics(y_true, y_pred)

        return metrics

    def evaluate(self, X: np.ndarray, y: list, X_train=None, y_train=None) -> dict:
        """
         Evaluation of trained Q-network
        """
        metrics = self.compute_metrics(X, y)

        print("evaluation: ", metrics)
        return metrics
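A usage sketch for this class; the data, hyperparameters, and layer dictionary below are illustrative assumptions (ClassifyEnv and custom_metrics come from the surrounding project):

import numpy as np

X_train = np.random.rand(1000, 10).astype(np.float32)  # dummy data (assumption)
y_train = np.random.randint(0, 2, size=1000)

model = DQNAgent()
model.compile(X_train, y_train, lr=1e-3, epsilon=0.1, gamma=0.99, imb_ratio=0.2,
              replay_buffer_max_length=10000,
              layers={"dense": (64, 32), "conv": None, "dropout": None})
model.fit(X_train, y_train, epochs=1000, batch_size=64,
          eval_step=500, log_step=100, collect_steps_per_episode=100)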
Example #6
def main(_):
    # Environment
    env_name = "Breakout-v4"
    train_num_parallel_environments = 5
    max_steps_per_episode = 1000
    # Replay buffer
    replay_buffer_capacity = 50000
    init_replay_buffer = 500
    # Driver
    collect_steps_per_iteration = 1 * train_num_parallel_environments
    # Training
    train_batch_size = 32
    train_iterations = 100000
    train_summary_interval = 200
    train_checkpoint_interval = 200
    # Evaluation
    eval_num_parallel_environments = 5
    eval_summary_interval = 500
    eval_num_episodes = 20
    # File paths
    path = pathlib.Path(__file__)
    parent_dir = path.parent.resolve()
    folder_name = path.stem + time.strftime("_%Y%m%d_%H%M%S")
    train_checkpoint_dir = str(parent_dir / folder_name / "train_checkpoint")
    train_summary_dir = str(parent_dir / folder_name / "train_summary")
    eval_summary_dir = str(parent_dir / folder_name / "eval_summary")

    # Parallel training environment
    tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=
                [lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * train_num_parallel_environments))
    tf_env.seed([42] * tf_env.batch_size)
    tf_env.reset()

    # Parallel evaluation environment
    eval_tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=
                [lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * eval_num_parallel_environments))
    eval_tf_env.seed([42] * eval_tf_env.batch_size)
    eval_tf_env.reset()

    # Creating the Deep Q-Network
    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.)

    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     preprocessing_layers=preprocessing_layer,
                     conv_layer_params=conv_layer_params,
                     fc_layer_params=fc_layer_params)

    # Creating the DQN Agent
    optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                         rho=0.95,
                                         momentum=0.0,
                                         epsilon=0.00001,
                                         centered=True)

    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=2500000,
        end_learning_rate=0.01)  # final ε

    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=200,
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=global_step,
        epsilon_greedy=lambda: epsilon_fn(global_step))
    agent.initialize()

    # Creating the Replay Buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # Observer: Replay Buffer Observer
    replay_buffer_observer = replay_buffer.add_batch

    # Observer: Training Metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size),
    ]

    # Creating the Collect Driver
    collect_driver = DynamicStepDriver(tf_env,
                                       agent.collect_policy,
                                       observers=[replay_buffer_observer] +
                                       train_metrics,
                                       num_steps=collect_steps_per_iteration)

    # Initialize replay buffer
    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer_observer,
                   ShowProgress()],
        num_steps=init_replay_buffer)
    final_time_step, final_policy_state = init_driver.run()

    # Creating the Dataset
    dataset = replay_buffer.as_dataset(sample_batch_size=train_batch_size,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    # Optimize by wrapping some of the code in a graph using TF function.
    collect_driver.run = function(collect_driver.run)
    agent.train = function(agent.train)

    print("\n\n++++++++++++++++++++++++++++++++++\n")

    # Create checkpoint
    train_checkpointer = Checkpointer(
        ckpt_dir=train_checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        # replay_buffer=replay_buffer,
        global_step=global_step,
        # metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics')
    )

    # Restore checkpoint
    # train_checkpointer.initialize_or_restore()

    # Summary writers and metrics
    train_summary_writer = tf.summary.create_file_writer(train_summary_dir)
    eval_summary_writer = tf.summary.create_file_writer(eval_summary_dir)
    eval_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=eval_tf_env.batch_size,
                                       buffer_size=eval_num_episodes),
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=eval_tf_env.batch_size, buffer_size=eval_num_episodes)
    ]

    # Create evaluate callback function
    eval_callback = evaluate(eval_metrics=eval_metrics,
                             eval_tf_env=eval_tf_env,
                             eval_policy=agent.policy,
                             eval_num_episodes=eval_num_episodes,
                             train_step=global_step,
                             eval_summary_writer=eval_summary_writer)

    # Train agent
    train_agent(tf_env=tf_env,
                train_iterations=train_iterations,
                global_step=global_step,
                agent=agent,
                dataset=dataset,
                collect_driver=collect_driver,
                train_metrics=train_metrics,
                train_checkpointer=train_checkpointer,
                train_checkpoint_interval=train_checkpoint_interval,
                train_summary_writer=train_summary_writer,
                train_summary_interval=train_summary_interval,
                eval_summary_interval=eval_summary_interval,
                eval_callback=eval_callback)

    print("\n\n++++++++++ END OF TF_AGENTS RL TRAINING ++++++++++\n\n")
Example #7
    # Sample transitions from the replay buffer (the buffer variable name is assumed)
    dataset = replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=BATCH_SIZE,
        num_steps=2).prefetch(3)
    iterator = iter(dataset)

    agent.train_step_counter.assign(0)
    avg_return = compute_avg_return(eval_env, agent.policy)
    returns = [avg_return]

    # Pre-populate replay buffer
    for _ in range(PRETRAIN_LEN):
        driver.run()

    # Train
    # Optimize
    agent.train = common.function(agent.train)

    start_time = time.time()
    for _ in range(NUM_ITERATIONS):
        driver.run()

        experience, unused_info = next(iterator)
        train_loss = agent.train(experience).loss
        step = agent.train_step_counter.numpy()

        if step % 200 == 0:
            print(f'Step {step}: loss = {train_loss}')
        if step % 1000 == 0:
            avg_return = compute_avg_return(eval_env, agent.policy)
            current_time = time.time()
            elapsed_time = current_time - start_time
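The compute_avg_return helper called in this fragment is not shown; a minimal sketch along the lines of the standard TF-Agents tutorial helper (the default episode count is an assumption):

def compute_avg_return(environment, policy, num_episodes=10):
    # Roll out the policy for a few full episodes and average the returns.
    # Assumes a single (batch-1) TF environment.
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]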