def dyke_dqn_agent(env: TFPyEnvironment,
                   layers: Optional[List[Layer]] = None) -> DqnAgent:
    """
    Build a deep Q-network (DQN) agent for the dyke maintenance environment.

    The Q-network is a ``Sequential`` model made of ``layers`` followed by a
    linear ``Dense`` layer that outputs one Q-value per action.

    :param env: The dyke environment whose time-step and action specs define
        the agent.
    :param layers: Optional. Layers placed in front of the Q-values layer.
        When ``None``, the result of
        ``fully_connected_dyke_dqn_agent_network(sizes=(100, 50))`` is used.
    :return: The constructed agent; ``initialize()`` is not called here.
    """
    if layers is None:
        layers = fully_connected_dyke_dqn_agent_network(sizes=(100, 50))

    # Q-values head: the number of actions is derived from the inclusive
    # [minimum, maximum] bounds of the action spec.
    action_bounds: BoundedArraySpec = from_spec(env.action_spec())
    action_count: int = int(action_bounds.maximum - action_bounds.minimum + 1)
    kernel_init = RandomUniform(minval=-3e-3, maxval=3e-3)
    bias_init = Constant(-2e-1)
    q_head: Layer = Dense(units=action_count,
                          activation=None,
                          kernel_initializer=kernel_init,
                          bias_initializer=bias_init)
    q_network = Sequential(layers=layers + [q_head])

    # Assemble the agent around the network.
    adam = Adam(learning_rate=1e-3)
    step_counter = Variable(initial_value=0)
    return DqnAgent(
        time_step_spec=env.time_step_spec(),
        action_spec=env.action_spec(),
        q_network=q_network,
        optimizer=adam,
        epsilon_greedy=0.1,
        td_errors_loss_fn=element_wise_squared_loss,
        train_step_counter=step_counter,
    )
示例#2
0
def create_dqn_agent(env, q_net):
    """Build a ``DqnAgent`` for ``env`` around the Q-network ``q_net``.

    Exploration is epsilon-greedy. Epsilon is read from a ``PolynomialDecay``
    schedule evaluated at the train step counter, going from 1.0 to 0.01 over
    ``250000 // config.UPDATE_PERIOD`` steps.

    Args:
        env: Environment providing ``time_step_spec()`` and ``action_spec()``.
        q_net: The Q-network handed to the agent.

    Returns:
        The constructed agent; ``initialize()`` is not called here.
    """
    step_counter = tf.Variable(0)
    # Collect steps per training step (4 according to the original comment;
    # the actual value lives in config).
    collect_steps_per_update = config.UPDATE_PERIOD

    # The TF1 RMSProp optimizer is used here rather than
    # keras.optimizers.RMSprop -- see TF-agents issue #113.
    rmsprop = tf.compat.v1.train.RMSPropOptimizer(
        learning_rate=2.5e-4,
        decay=0.95,
        momentum=0.0,
        epsilon=0.00001,
        centered=True,
    )

    # The learning-rate schedule class is reused to decay epsilon.
    epsilon_schedule = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=250000 // collect_steps_per_update,  # <=> 1,000,000 ALE frames
        end_learning_rate=0.01,  # final ε
    )

    def current_epsilon():
        # Evaluated lazily so epsilon follows the live step counter.
        return epsilon_schedule(step_counter)

    return DqnAgent(
        env.time_step_spec(),
        env.action_spec(),
        q_network=q_net,
        optimizer=rmsprop,
        target_update_period=2000,  # <=> 32,000 ALE frames
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=step_counter,
        epsilon_greedy=current_epsilon,
    )
示例#3
0
    def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float, gamma: float, imb_ratio: float,
                replay_buffer_max_length: int, layers: dict) -> None:
        """
        Build the training environment, Q-network, DQN agent and replay buffer.

        The results are stored on ``self.train_env``, ``self.agent`` and
        ``self.replay_buffer``; the agent is initialized before returning.

        Args:
            X_train: Training samples.
            y_train: Class labels of the training samples.
            lr: Learning rate of the Adam optimizer.
            epsilon: Epsilon of the agent's epsilon-greedy exploration.
            gamma: Discount factor used when learning Q-values.
            imb_ratio: Imbalance ratio, forwarded to ``ClassifyEnv``.
            replay_buffer_max_length: Maximum length of the replay buffer.
            layers: Dict of Q-network layer parameters, read from the keys
                "dense", "conv" and "dropout" (missing keys give ``None``).
        """
        dense_params, conv_params, dropout_params = (
            layers.get(key) for key in ("dense", "conv", "dropout"))

        # Wrap the custom classification environment for TF-Agents.
        env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio))
        self.train_env = env

        network = QNetwork(
            env.observation_spec(),
            env.action_spec(),
            conv_layer_params=conv_params,
            fc_layer_params=dense_params,
            dropout_layer_params=dropout_params,
        )
        adam = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)
        step_counter = tf.Variable(0)

        self.agent = DqnAgent(
            env.time_step_spec(),
            env.action_spec(),
            q_network=network,
            optimizer=adam,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=step_counter,
            gamma=gamma,
            epsilon_greedy=epsilon,
        )
        self.agent.initialize()

        # The buffer stores what the collect policy produces.
        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=env.batch_size,
            max_length=replay_buffer_max_length,
        )
def train_dyke_agent(train_env: TFPyEnvironment, eval_env: TFPyEnvironment,
                     agent: DqnAgent, train_steps: int, steps_per_episode: int,
                     eval_episodes: int) -> Dict[str, Any]:
    """
    Train the DQN agent on the dyke maintenance task.

    Every train step resets the training environment, fills a fresh replay
    buffer by driving the agent's collect policy for ``steps_per_episode``
    steps, performs ``steps_per_episode`` training updates on batches sampled
    from that buffer, and finally evaluates the agent.

    :param train_env: The training environment.
    :param eval_env: The environment for testing agent performance.
    :param agent: The agent.
    :param train_steps: The number of training steps to use.
    :param steps_per_episode: The number of time steps that can be taken in a
        single dyke environment episode.
    :param eval_episodes: The number of episodes to use per evaluation.
    :return: A mapping with the keys ``'loss'`` (train_steps x
        steps_per_episode), ``'eval'`` (train_steps x eval_episodes) and
        ``'train-metrics'`` (metrics x train_steps x steps_per_episode).
    """
    metric_classes: Tuple = (AverageReturnMetric, )
    loss_log: np.ndarray = np.zeros(shape=(train_steps, steps_per_episode))
    eval_log: np.ndarray = np.zeros(shape=(train_steps, eval_episodes))
    metric_log: np.ndarray = np.zeros(
        shape=(len(metric_classes), train_steps, steps_per_episode))

    for step_idx in range(train_steps):
        # Experiences (single time steps) are sampled from one freshly
        # collected episode per train step.
        print('STEP %d/%d' % (step_idx + 1, train_steps))
        train_env.reset()
        replay_buffer = _dyke_replay_buffer(train_env, agent,
                                            steps_per_episode)
        # New metric instances each step, so results do not carry over.
        metrics: Tuple = tuple(metric_cls() for metric_cls in metric_classes)
        observers: Tuple = (replay_buffer.add_batch, ) + metrics

        # Experience a single episode with the agent's current configuration.
        driver = DynamicStepDriver(env=train_env,
                                   policy=agent.collect_policy,
                                   observers=observers,
                                   num_steps=steps_per_episode)
        driver.run()

        batches = iter(
            replay_buffer.as_dataset(sample_batch_size=_REP_BUF_BATCH_SIZE,
                                     num_steps=_REP_BUF_NUM_STEPS))
        for update_idx in range(steps_per_episode):
            experience, _ = next(batches)
            loss_log[step_idx, update_idx] = agent.train(
                experience=experience).loss
            for metric_idx, metric in enumerate(metrics):
                metric_log[metric_idx, step_idx,
                           update_idx] = metric.result().numpy()

        eval_log[step_idx, :] = _evaluate_dyke_agent(eval_env, agent,
                                                     eval_episodes)

    return {
        'loss': loss_log,
        'eval': eval_log,
        'train-metrics': metric_log
    }
示例#5
0
def create_pong_agent(train_environment: TFEnvironment,
                      dense_layer_sizes: Sequence[int],
                      learning_rate: float) -> DqnAgent:
    """Create and initialize a DQN agent for the given training environment.

    A ``QNetwork`` with fully connected layers of ``dense_layer_sizes`` is
    built from the environment's specs and wrapped in a ``DqnAgent`` trained
    with Adam.

    Args:
        train_environment: Environment providing the observation, action and
            time-step specs.
        dense_layer_sizes: Passed to ``QNetwork`` as ``fc_layer_params``.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The initialized agent only. The Q-network is not returned; the
        previous ``(DqnAgent, QNetwork)`` annotation did not match the code.
    """
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # The agent counts its training steps on the global step.
    global_step = tf.compat.v1.train.get_or_create_global_step()

    q_net = QNetwork(input_tensor_spec=train_environment.observation_spec(),
                     action_spec=train_environment.action_spec(),
                     fc_layer_params=dense_layer_sizes)

    agent = DqnAgent(time_step_spec=train_environment.time_step_spec(),
                     action_spec=train_environment.action_spec(),
                     q_network=q_net,
                     optimizer=optimizer,
                     td_errors_loss_fn=element_wise_squared_loss,
                     train_step_counter=global_step)

    agent.initialize()
    # Wrap train() in a TF function, then reset the step counter to zero.
    agent.train = common.function(agent.train)
    agent.train_step_counter.assign(0)

    return agent
示例#6
0
  def build_dqn_agent(self):
    """Create a QNetwork and the DqnAgent that wraps it.

    All specs are taken from a temporary environment obtained through
    ``self.build_temp_env()``.

    Returns:
      A ``(q_net, agent)`` tuple. ``initialize()`` is not called on the agent
      here.
    """
    spec_env = self.build_temp_env()

    network = q_network.QNetwork(
        spec_env.observation_spec(),
        spec_env.action_spec(),
        fc_layer_params=self.fc_layer_params,
        dropout_layer_params=self.dropout_layer_params,
    )

    adam = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
    dqn = DqnAgent(
        spec_env.time_step_spec(),
        spec_env.action_spec(),
        n_step_update=self.n_step_update,
        q_network=network,
        optimizer=adam,
        epsilon_greedy=self.epsilon_greedy,
        td_errors_loss_fn=common.element_wise_squared_loss,
        # The counter is created as int64.
        train_step_counter=tf.Variable(0, dtype=tf.int64),
    )

    return network, dqn
示例#7
0
 def train_model(self):
     """Build and return the DQN agent for ``self.tf_env`` and ``self.q_net``.

     Despite its name, this method only constructs the agent: it neither
     calls ``initialize()`` nor runs any training.

     Exploration is epsilon-greedy, with epsilon read from a
     ``PolynomialDecay`` schedule evaluated at the train step counter
     (1.0 down to 0.01).

     Returns:
         The constructed ``DqnAgent``.
     """
     train_step = tf.Variable(0)  # count num of training steps
     update_period = 4  # train the model every 4 steps
     # `learning_rate` replaces the deprecated `lr` keyword, which newer
     # Keras releases reject.
     optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                          rho=0.95,
                                          momentum=0.0,
                                          epsilon=0.00001,
                                          centered=True)
     # The learning-rate schedule class is reused to decay epsilon.
     epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
         initial_learning_rate=1.0,  # initial ε
         decay_steps=250000 // update_period,  # <=> 1,000,000 ALE frames
         end_learning_rate=0.01)  # final ε
     agent = DqnAgent(
         self.tf_env.time_step_spec(),
         self.tf_env.action_spec(),
         q_network=self.q_net,
         optimizer=optimizer,
         target_update_period=2000,  # <=> 32,000 ALE frames
         td_errors_loss_fn=keras.losses.Huber(reduction="none"),
         gamma=0.99,  # discount factor
         train_step_counter=train_step,
         # Callable, so epsilon follows the live step counter.
         epsilon_greedy=lambda: epsilon_fn(train_step))
     return agent
示例#8
0
class DQNAgent:
    """
    Helper for training a TF-Agents DQN agent.

    Based on https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial
    """

    def __init__(self) -> None:
        """Create an empty wrapper; ``compile`` and ``fit`` fill the attributes."""
        # Set by compile():
        self.train_env = None  # training environment
        self.agent = None  # the TF-Agent that solves the RL problem
        self.replay_buffer = None  # stores data collected from the environment
        # Set by fit():
        self.dataset = None  # tf.data.Dataset view of the replay buffer
        self.iterator = None  # iterator over self.dataset

    def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float, gamma: float, imb_ratio: float,
                replay_buffer_max_length: int, layers: dict) -> None:
        """
        Build the training environment, Q-network, DQN agent and replay buffer.

        Args:
            X_train: Training samples.
            y_train: Class labels of the training samples.
            lr: Learning rate of the Adam optimizer.
            epsilon: Epsilon of the agent's epsilon-greedy exploration.
            gamma: Discount factor used when learning Q-values.
            imb_ratio: Imbalance ratio, forwarded to ``ClassifyEnv``.
            replay_buffer_max_length: Maximum length of the replay buffer.
            layers: Dict of Q-network layer parameters, read from the keys
                "dense", "conv" and "dropout" (missing keys give ``None``).
        """
        fc_params = layers.get("dense")
        conv_params = layers.get("conv")
        dropout_params = layers.get("dropout")

        # Wrap the custom classification environment for TF-Agents.
        env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio))
        self.train_env = env

        network = QNetwork(
            env.observation_spec(),
            env.action_spec(),
            conv_layer_params=conv_params,
            fc_layer_params=fc_params,
            dropout_layer_params=dropout_params,
        )
        adam = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)
        step_counter = tf.Variable(0)

        self.agent = DqnAgent(
            env.time_step_spec(),
            env.action_spec(),
            q_network=network,
            optimizer=adam,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=step_counter,
            gamma=gamma,
            epsilon_greedy=epsilon,
        )
        self.agent.initialize()

        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=env.batch_size,
            max_length=replay_buffer_max_length,
        )

    def fit(self, X_train: np.ndarray, y_train: np.ndarray, epochs: int, batch_size: int, eval_step: int, log_step: int,
            collect_steps_per_episode: int) -> None:
        """
        Train the agent.

        Each of the ``epochs`` iterations collects
        ``collect_steps_per_episode`` transitions with the collect policy,
        then performs one training update on a batch sampled from the replay
        buffer.

        Args:
            X_train: Training samples, used for the periodic metrics.
            y_train: Class labels of the training samples.
            epochs: Number of collect-then-train iterations.
            batch_size: Number of samples per training batch.
            eval_step: Metrics are computed and printed whenever the train
                step is a multiple of this value.
            log_step: The loss is printed whenever the train step is a
                multiple of this value.
            collect_steps_per_episode: Environment steps collected before
                every training update.
        """
        sampled = self.replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=2)
        self.dataset = sampled.prefetch(3)
        self.iterator = iter(self.dataset)

        def collect_transitions(env, policy, buffer, count):
            """Step ``env`` with ``policy`` ``count`` times, storing each transition."""
            for _ in range(count):
                before = env.current_time_step()
                chosen = policy.action(before)
                after = env.step(chosen.action)
                buffer.add_batch(
                    trajectory.from_transition(before, chosen, after))

        # (Optional) Run the training step as a TF function.
        self.agent.train = common.function(self.agent.train)

        # Start counting train steps from zero.
        self.agent.train_step_counter.assign(0)

        for _epoch in range(epochs):
            collect_transitions(self.train_env, self.agent.collect_policy,
                                self.replay_buffer, collect_steps_per_episode)

            # One update on a batch sampled from the buffer.
            experience, _info = next(self.iterator)
            train_loss = self.agent.train(experience).loss

            step = self.agent.train_step_counter.numpy()

            if step % log_step == 0:
                print('step = {0}: loss = {1}'.format(step, train_loss))

            if step % eval_step == 0:
                print(self.compute_metrics(X_train, y_train))

    def compute_metrics(self, X: np.ndarray, y_true: list) -> dict:
        """Score the target Q-network's argmax predictions on ``X`` against ``y_true``."""
        # TODO: apply softmax layer for q logits?
        q_values, _ = self.agent._target_q_network(X, training=False)

        # The predicted class label is the action with the largest Q-value.
        predicted_labels = np.argmax(q_values.numpy(), axis=1)

        return custom_metrics(y_true, predicted_labels)

    def evaluate(self, X: np.ndarray, y: list, X_train=None, y_train=None) -> dict:
        """
        Compute, print and return the metrics of the trained Q-network.

        ``X_train`` and ``y_train`` are accepted but not used.
        """
        result = self.compute_metrics(X, y)
        print("evaluation: ", result)
        return result
示例#9
0
# Step counter: passed to the agent as train_step_counter and used as the
# input of the epsilon schedule below.
train_step = tf.Variable(0)
# Create optimizer (the optimizer_* hyperparameters are defined outside this fragment)
optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=optimizer_learning_rate,
                                                decay=optimizer_decay, momentum=optimizer_momentum,
                                                epsilon=optimizer_epsilon, centered=True)
# Computes epsilon for epsilon greedy policy given the training step
# (the learning-rate schedule class is reused to produce epsilon).
# NOTE(review): the initial epsilon is 0.01 here; whether the schedule decays
# at all depends on epsilon_final, which is defined elsewhere -- confirm.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=0.01, # initial ε
    decay_steps=epsilon_decay_steps, 
    end_learning_rate=epsilon_final) # final ε

# Epsilon is passed as a callable so it is re-evaluated from train_step.
agent = DqnAgent(tf_env.time_step_spec(),
                 tf_env.action_spec(),
                 q_network=q_net,
                 optimizer=optimizer,
                 target_update_period=target_update_period, 
                 td_errors_loss_fn=keras.losses.Huber(reduction="none"),
                 gamma=discount_factor, # discount factor
                 train_step_counter=train_step,
                 epsilon_greedy=lambda: epsilon_fn(train_step))
# Replace the agent's policy with one loaded from a SavedModel on disk.
# NOTE(review): `policy` looks like a read-only property on TF-Agents agents,
# so this assignment may raise AttributeError -- confirm against the
# installed TF-Agents version.
agent.policy = tf.compat.v2.saved_model.load('../DATA/policy_{}'.format(II))
agent.initialize()

# Speed up as tensorflow function
agent.train = function(agent.train)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
示例#10
0
update_period = 4
# `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
# releases reject.
optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                     rho=0.95,
                                     momentum=0.0,
                                     epsilon=0.00001,
                                     centered=True)

print("Before Epsilon function")
# The learning-rate schedule class is reused to decay epsilon from 1.0 to 0.01
# over 250000 steps of train_step.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0, decay_steps=250000, end_learning_rate=0.01)
print("Before Agent")
# train_step, tf_env, q_net and policy are defined outside this fragment.
agent = DqnAgent(tf_env.time_step_spec(),
                 tf_env.action_spec(),
                 q_network=q_net,
                 optimizer=optimizer,
                 target_update_period=2000,
                 td_errors_loss_fn=keras.losses.Huber(reduction="none"),
                 gamma=0.99,
                 train_step_counter=train_step,
                 # Callable, so epsilon follows the live step counter.
                 epsilon_greedy=lambda: epsilon_fn(train_step))

agent.initialize()
# Identity check against None (PEP 8) instead of `!=`, which would dispatch
# to the policy object's own comparison operator.
# NOTE(review): `policy` looks like a read-only property on TF-Agents agents,
# so this assignment may raise AttributeError -- confirm.
if policy is not None:
    agent.policy = policy

print("After  Agent.initialize()")

# The buffer stores what the agent's collect policy produces.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=100000)
示例#11
0
文件: dqn.py 项目: zynga/rl-bakery
 def _init_qagent(self, optimizer, q_net, global_step):
     """Create a ``DqnAgent`` from the keyword arguments built by ``_get_agent_args``."""
     agent_kwargs = self._get_agent_args(optimizer, q_net, global_step)
     dqn_agent = DqnAgent(**agent_kwargs)
     return dqn_agent
示例#12
0
# Builds a DQN agent and a Double-DQN agent with identically configured
# (but separate) Q-networks on the same training environment.
# train_env is defined outside this fragment.
hidden_layers = (100, )  # passed to both networks as fc_layer_params

dqn_network = QNetwork(train_env.observation_spec(),
                       train_env.action_spec(),
                       fc_layer_params=hidden_layers)

ddqn_network = QNetwork(train_env.observation_spec(),
                        train_env.action_spec(),
                        fc_layer_params=hidden_layers)

# NOTE(review): the same counter variable is handed to both agents below, so
# presumably training either agent advances the step count seen by the
# other -- confirm that this is intended.
counter = tf.Variable(0)

dqn_agent = DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=dqn_network,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=counter)

# Same configuration as above, with its own network and optimizer instance.
ddqn_agent = DdqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=ddqn_network,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=counter)

dqn_agent.initialize()
ddqn_agent.initialize()
示例#13
0
                   gym_env_wrappers=[ShrinkWrapper, DiscreteActionWrapper]))

# create DQN (deep Q-Learning network)
# train_env, conv_layer_params, fc_layer_params, learning_rate and
# replay_buffer_capacity are defined outside this fragment.
q_net = QNetwork(train_env.observation_spec(),
                 train_env.action_spec(),
                 conv_layer_params=conv_layer_params,
                 fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# counter of training steps, handed to the agent below
train_step_counter = tf.compat.v2.Variable(0)

# create deep reinforcement learning agent
tf_agent = DqnAgent(train_env.time_step_spec(),
                    train_env.action_spec(),
                    q_network=q_net,
                    optimizer=optimizer,
                    td_errors_loss_fn=element_wise_squared_loss,
                    train_step_counter=train_step_counter)
tf_agent.initialize()

# create evaluation and data collection policies
# (both are taken from the agent: `policy` for evaluation, `collect_policy`
# for gathering experience)
eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy

# create replay buffer
# (its data spec is the one of the agent's collected data)
print("Creating replay buffer")
replay_buffer = TFUniformReplayBuffer(data_spec=tf_agent.collect_data_spec,
                                      batch_size=train_env.batch_size,
                                      max_length=replay_buffer_capacity)

示例#14
0
    # Fragment of a function body; tf_env, q_net and train_step are defined
    # outside the visible code.
    update_period = 4  # train model every 4 steps
    # NOTE(review): `lr` is a deprecated alias of `learning_rate` in Keras
    # optimizers and may be rejected by newer releases -- confirm.
    optimizer = keras.optimizers.RMSprop(lr=2.5e-4,
                                         rho=0.95,
                                         momentum=0.0,
                                         epsilon=0.00001,
                                         centered=True)
    # The learning-rate schedule class is reused to decay epsilon.
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial epsilon
        decay_steps=250000,
        end_learning_rate=0.01)  # final epsilon
    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=2000,  # every 32,000 frames
        td_errors_loss_fn=keras.losses.Huber(
            reduction="none"),  # must return error per instance
        gamma=0.99,  # discount factor
        train_step_counter=train_step,
        # callable, so epsilon is re-evaluated from the live step counter
        epsilon_greedy=lambda: epsilon_fn(train_step))

    agent.initialize()

    # Create the replay buffer and the observer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000000)

    # The buffer's add_batch method itself serves as the observer.
    replay_buffer_observer = replay_buffer.add_batch
示例#15
0
def _make_parallel_atari_env(env_name, num_environments,
                             max_steps_per_episode, seed=42):
    """Build, seed and reset a TF environment of parallel Atari games.

    Every parallel copy is loaded with a ``TimeLimit`` of
    ``max_steps_per_episode`` plus the ``AtariPreprocessing`` and
    ``FrameStack4`` gym wrappers, and receives the same ``seed``.
    """
    parallel_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=
                [lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * num_environments))
    parallel_env.seed([seed] * parallel_env.batch_size)
    parallel_env.reset()
    return parallel_env


def main(_):
    """Train a DQN agent on Breakout-v4 with TF-Agents.

    Sets up parallel training and evaluation environments, the Q-network, the
    agent, the replay buffer (pre-filled by a random policy), the collect
    driver, the checkpointer and the summary writers, then hands everything
    to ``train_agent``.

    The single positional argument is ignored.
    """
    # Environment
    env_name = "Breakout-v4"
    train_num_parallel_environments = 5
    max_steps_per_episode = 1000
    # Replay buffer
    replay_buffer_capacity = 50000
    init_replay_buffer = 500
    # Driver
    collect_steps_per_iteration = 1 * train_num_parallel_environments
    # Training
    train_batch_size = 32
    train_iterations = 100000
    train_summary_interval = 200
    train_checkpoint_interval = 200
    # Evaluation
    eval_num_parallel_environments = 5
    eval_summary_interval = 500
    eval_num_episodes = 20
    # File paths: outputs go into a timestamped folder next to this script.
    path = pathlib.Path(__file__)
    parent_dir = path.parent.resolve()
    folder_name = path.stem + time.strftime("_%Y%m%d_%H%M%S")
    train_checkpoint_dir = str(parent_dir / folder_name / "train_checkpoint")
    train_summary_dir = str(parent_dir / folder_name / "train_summary")
    eval_summary_dir = str(parent_dir / folder_name / "eval_summary")

    # Parallel training environment
    tf_env = _make_parallel_atari_env(env_name,
                                      train_num_parallel_environments,
                                      max_steps_per_episode)

    # Parallel evaluation environment
    eval_tf_env = _make_parallel_atari_env(env_name,
                                           eval_num_parallel_environments,
                                           max_steps_per_episode)

    # Creating the Deep Q-Network
    # Observations are cast to float32 and divided by 255 before the
    # convolutional layers.
    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.)

    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     preprocessing_layers=preprocessing_layer,
                     conv_layer_params=conv_layer_params,
                     fc_layer_params=fc_layer_params)

    # Creating the DQN Agent
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases reject.
    optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                         rho=0.95,
                                         momentum=0.0,
                                         epsilon=0.00001,
                                         centered=True)

    # The learning-rate schedule class is reused to decay epsilon.
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=2500000,
        end_learning_rate=0.01)  # final ε

    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=200,
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=global_step,
        # Callable, so epsilon follows the live global step.
        epsilon_greedy=lambda: epsilon_fn(global_step))
    agent.initialize()

    # Creating the Replay Buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # Observer: Replay Buffer Observer
    replay_buffer_observer = replay_buffer.add_batch

    # Observer: Training Metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size),
    ]

    # Creating the Collect Driver
    collect_driver = DynamicStepDriver(tf_env,
                                       agent.collect_policy,
                                       observers=[replay_buffer_observer] +
                                       train_metrics,
                                       num_steps=collect_steps_per_iteration)

    # Initialize replay buffer with steps taken by a random policy.
    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer_observer,
                   ShowProgress()],
        num_steps=init_replay_buffer)
    # The final time step and policy state returned by run() are not needed.
    init_driver.run()

    # Creating the Dataset
    dataset = replay_buffer.as_dataset(sample_batch_size=train_batch_size,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    # Optimize by wrapping some of the code in a graph using TF function.
    collect_driver.run = function(collect_driver.run)
    agent.train = function(agent.train)

    print("\n\n++++++++++++++++++++++++++++++++++\n")

    # Create checkpoint (only the agent and the global step are tracked; the
    # replay buffer and the training metrics are not).
    train_checkpointer = Checkpointer(
        ckpt_dir=train_checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        global_step=global_step,
    )

    # Restoring from an existing checkpoint is disabled; to enable it, call
    # train_checkpointer.initialize_or_restore() here.

    # Summary writers and metrics
    train_summary_writer = tf.summary.create_file_writer(train_summary_dir)
    eval_summary_writer = tf.summary.create_file_writer(eval_summary_dir)
    eval_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=eval_tf_env.batch_size,
                                       buffer_size=eval_num_episodes),
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=eval_tf_env.batch_size, buffer_size=eval_num_episodes)
    ]

    # Create evaluate callback function
    eval_callback = evaluate(eval_metrics=eval_metrics,
                             eval_tf_env=eval_tf_env,
                             eval_policy=agent.policy,
                             eval_num_episodes=eval_num_episodes,
                             train_step=global_step,
                             eval_summary_writer=eval_summary_writer)

    # Train agent
    train_agent(tf_env=tf_env,
                train_iterations=train_iterations,
                global_step=global_step,
                agent=agent,
                dataset=dataset,
                collect_driver=collect_driver,
                train_metrics=train_metrics,
                train_checkpointer=train_checkpointer,
                train_checkpoint_interval=train_checkpoint_interval,
                train_summary_writer=train_summary_writer,
                train_summary_interval=train_summary_interval,
                eval_summary_interval=eval_summary_interval,
                eval_callback=eval_callback)

    print("\n\n++++++++++ END OF TF_AGENTS RL TRAINING ++++++++++\n\n")
示例#16
0
                                                decay=0.95,
                                                momentum=0.0,
                                                epsilon=0.0001,
                                                centered=True)

# The learning-rate schedule class is reused to decay epsilon for the
# epsilon-greedy policy; update_period, tf_env, q_net, optimizer and
# train_step are defined outside this fragment.
# NOTE(review): decay_steps uses 25000 here, which does not obviously match
# the "1,000,000 ALE frames" remark -- confirm the intended value.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,  # initial ε
    decay_steps=25000 // update_period,  # <=> 1,000,000 ALE frames
    end_learning_rate=0.01)  # final ε

agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=2000,  # <=> 32,000 ALE frames
    td_errors_loss_fn=keras.losses.Huber(reduction="none"),
    gamma=0.95,  # discount factor
    train_step_counter=train_step,
    # callable, so epsilon is re-evaluated from the live step counter
    epsilon_greedy=lambda: epsilon_fn(train_step),
    reward_scale_factor=1.5)

agent.initialize()

# Create Replay Buffer and Observer
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=1000000)

# The buffer's add_batch method itself serves as the observer.
replay_buffer_observer = replay_buffer.add_batch
示例#17
0
def breakout_v4(seed=42):
    """Set up a DQN agent for Atari Breakout and run the initial random collection.

    The function walks through the whole setup in order: it loads and wraps
    the Gym/Atari environments, plots one preprocessed observation, builds
    the Q-network and the ``DqnAgent``, creates the replay buffer, metrics
    and drivers, and finally runs the initial driver with a random policy.

    Side effects visible in the body: prints the available TF-Agents
    environment wrappers, writes ``./images/preprocessed_breakout_plot.png``,
    opens a matplotlib window, sets the root logger level to ``INFO`` and
    logs the (initial) training metrics.

    :param seed: Seed applied to both the plain ``Breakout-v4`` environment
        and the preprocessed ``BreakoutNoFrameskip-v4`` environment.
    :return: None. The visible body ends after the initial collection run.
    """
    env = suite_gym.load("Breakout-v4")
    env.seed(seed)
    env.reset()

    # Demonstration only: wrap the plain environment so each action repeats.
    repeating_env = ActionRepeat(env, times=4)

    # List every PyEnvironmentBaseWrapper subclass with the first line of its
    # docstring. Classes without a docstring print an empty description
    # instead of raising AttributeError on None.
    wrappers = tf_agents.environments.wrappers
    for name in dir(wrappers):
        obj = getattr(wrappers, name)
        if hasattr(obj, "__base__") and issubclass(
                obj, wrappers.PyEnvironmentBaseWrapper):
            summary = (obj.__doc__ or "").split("\n")[0]
            print("{:27s} {}".format(name, summary))

    # Demonstration only: the same wrapping done through suite_gym.load().
    limited_repeating_env = suite_gym.load(
        "Breakout-v4",
        gym_env_wrappers=[partial(TimeLimit, max_episode_steps=10000)],
        env_wrappers=[partial(ActionRepeat, times=4)],
    )

    max_episode_steps = 27000  # <=> 108k ALE frames since 1 step = 4 frames
    environment_name = "BreakoutNoFrameskip-v4"

    # This is the environment actually used for training below.
    env = suite_atari.load(
        environment_name,
        max_episode_steps=max_episode_steps,
        gym_env_wrappers=[AtariPreprocessing, FrameStack4],
    )

    # Honour the caller's seed (previously hard-coded to 42, which silently
    # ignored the ``seed`` argument for the training environment).
    env.seed(seed)
    env.reset()
    time_step = env.step(np.array(1))  # FIRE
    for _ in range(4):
        time_step = env.step(np.array(3))  # LEFT

    def plot_observation(obs):
        """Render a 4-channel stacked observation as a single RGB image."""
        # Since there are only 3 color channels, you cannot display 4 frames
        # with one primary color per frame. So this code computes the delta between
        # the current frame and the mean of the other frames, and it adds this delta
        # to the red and blue channels to get a pink color for the current frame.
        obs = obs.astype(np.float32)
        img_ = obs[..., :3]
        current_frame_delta = np.maximum(
            obs[..., 3] - obs[..., :3].mean(axis=-1), 0.0)
        img_[..., 0] += current_frame_delta
        img_[..., 2] += current_frame_delta
        img_ = np.clip(img_ / 150, 0, 1)
        plt.imshow(img_)
        plt.axis("off")

    plt.figure(figsize=(6, 6))
    plot_observation(time_step.observation)
    plt.tight_layout()
    plt.savefig("./images/preprocessed_breakout_plot.png",
                format="png",
                dpi=300)
    plt.show()

    tf_env = TFPyEnvironment(env)

    # Cast observations to float32 and divide by 255 before the conv layers.
    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.0)
    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        preprocessing_layers=preprocessing_layer,
        conv_layer_params=conv_layer_params,
        fc_layer_params=fc_layer_params,
    )

    # see TF-agents issue #113
    # optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0,
    #                                     epsilon=0.00001, centered=True)

    train_step = tf.Variable(0)
    update_period = 4  # run a training step every 4 collect steps
    optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=2.5e-4,
                                                    decay=0.95,
                                                    momentum=0.0,
                                                    epsilon=0.00001,
                                                    centered=True)
    # PolynomialDecay is a learning-rate schedule reused here to decay ε.
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=250000 // update_period,  # <=> 1,000,000 ALE frames
        end_learning_rate=0.01,
    )  # final ε
    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=2000,  # <=> 32,000 ALE frames
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=train_step,
        # Callable, so ε is re-evaluated from the current train_step.
        epsilon_greedy=lambda: epsilon_fn(train_step),
    )
    agent.initialize()

    from tf_agents.replay_buffers import tf_uniform_replay_buffer

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000000)

    replay_buffer_observer = replay_buffer.add_batch

    class ShowProgress:
        """Observer that counts non-boundary trajectories and prints progress."""

        def __init__(self, total):
            self.counter = 0
            self.total = total

        def __call__(self, trajectory):
            if not trajectory.is_boundary():
                self.counter += 1
            if self.counter % 100 == 0:
                print("\r{}/{}".format(self.counter, self.total), end="")

    from tf_agents.metrics import tf_metrics

    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    from tf_agents.eval.metric_utils import log_metrics
    import logging

    logging.getLogger().setLevel(logging.INFO)
    log_metrics(train_metrics)

    from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

    collect_driver = DynamicStepDriver(
        tf_env,
        agent.collect_policy,
        observers=[replay_buffer_observer] + train_metrics,
        num_steps=update_period,
    )  # collect 4 steps for each training iteration

    from tf_agents.policies.random_tf_policy import RandomTFPolicy

    # Run a random policy first so the replay buffer has data in it.
    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer.add_batch,
                   ShowProgress(20000)],
        num_steps=20000,
    )  # <=> 80,000 ALE frames
    final_time_step, final_policy_state = init_driver.run()
示例#18
0
    train_env = TFPyEnvironment(parallel_env)
    # train_env = TFPyEnvironment(suite_gym.load(env_name))
    eval_env = TFPyEnvironment(suite_gym.load(env_name))

    fc_layer_params = (100,)
    q_net = QNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        fc_layer_params=fc_layer_params
    )
    train_step_counter = tf.Variable(0)

    agent = DqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        q_network=q_net,
        optimizer=Adam(learning_rate=LEARNING_RATE),
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=train_step_counter
    )
    agent.initialize()

    random_policy = RandomTFPolicy(
        train_env.time_step_spec(),
        train_env.action_spec()
    )

    def compute_avg_return(environment, policy, num_episodes=10):
        total_return = 0
        for _ in range(num_episodes):
            time_step = environment.reset()
            episode_return = 0
示例#19
0
    learning_rate=optimizer_learning_rate,
    decay=optimizer_decay,
    momentum=optimizer_momentum,
    epsilon=optimizer_epsilon,
    centered=True)
# Computes epsilon for the epsilon-greedy policy given the training step.
# PolynomialDecay is a learning-rate schedule, which is why its arguments are
# named *_learning_rate even though the value is used as ε here.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,  # initial ε
    decay_steps=epsilon_decay_steps,
    end_learning_rate=epsilon_final)  # final ε

# DQN agent built on the pre-constructed q_net / optimizer / train_step.
agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=target_update_period,
    # reduction="none" keeps one loss value per element instead of a mean.
    td_errors_loss_fn=keras.losses.Huber(reduction="none"),
    gamma=discount_factor,  # discount factor
    train_step_counter=train_step,
    # Callable, so ε is re-evaluated from the current train_step each time.
    epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()
# Speed up as a TensorFlow function.
# NOTE(review): `function` is presumably tf_agents' common.function or
# tf.function imported under that name — confirm against the file's imports.
agent.train = function(agent.train)
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    # Determines the data spec type
    data_spec=agent.collect_data_spec,
示例#20
0
# RMSprop optimizer for the Q-network. Uses the `learning_rate` keyword: the
# `lr` alias is deprecated in tf.keras and is not accepted by newer Keras
# releases.
optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                     rho=0.95,
                                     momentum=0.0,
                                     epsilon=1e-5)

# Schedule for the exploration rate ε of the ε-greedy collect policy.
# PolynomialDecay is a learning-rate schedule, which is why its arguments are
# named *_learning_rate even though the value is used as ε here.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,  # initial ε
    decay_steps=int(250e3) // update_period,
    end_learning_rate=0.01,  # final ε
)

# DQN agent built on the pre-constructed q_net / train_step.
agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=TARGET_UPDATE_PERIOD,
    # reduction="none" keeps one loss value per element instead of a mean.
    td_errors_loss_fn=keras.losses.Huber(reduction="none"),
    gamma=0.99,  # discount factor
    train_step_counter=train_step,
    # Callable, so ε is re-evaluated from the current train_step each time.
    epsilon_greedy=lambda: epsilon_fn(train_step),
)

agent.initialize()

# %% Create replay buffer
from tf_agents.replay_buffers import tf_uniform_replay_buffer

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=int(REPLAY_BUFFER_MAXLEN),