Example #1
File: abstract.py  Project: zynga/rl-bakery
def _build_q_network(self):
    # By default, this builds a Q-network using the supplied fc_layer_params.
    # This method can be overridden to provide a more complicated Q-network.
    assert self._config.agent.fc_layer_params
    q_net = QNetwork(self._data_spec.observation_spec,
                     self._data_spec.action_spec,
                     fc_layer_params=self._config.agent.fc_layer_params)
    return q_net
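The comment above notes that this method can be overridden to build a richer network. A minimal sketch of such an override, assuming the same self._config.agent and self._data_spec attributes plus hypothetical conv_layer_params / dropout_layer_params config fields:

def _build_q_network(self):
    # Hypothetical override: adds convolutional and dropout layers on top of
    # the configured fully connected layers. conv_layer_params and
    # dropout_layer_params are assumed config fields, not part of the original.
    return QNetwork(
        self._data_spec.observation_spec,
        self._data_spec.action_spec,
        conv_layer_params=self._config.agent.conv_layer_params,
        fc_layer_params=self._config.agent.fc_layer_params,
        dropout_layer_params=self._config.agent.dropout_layer_params)
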
Example #2
def model(self):
    q_net = QNetwork(
        self.tf_env.observation_spec(),
        self.tf_env.action_spec(),
        preprocessing_layers=self.preprocessing_layer,
        conv_layer_params=self.conv_layer_params,
        fc_layer_params=self.fc_layer_params)
    return q_net
Example #3
    def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float, gamma: float, imb_ratio: float,
                replay_buffer_max_length: int, layers: dict) -> None:
        """
        Create the Q-network, agent and policy.

        Args:
            X_train: A np.ndarray of training samples.
            y_train: A np.ndarray of class labels for the training samples.
            lr: Learning rate for the optimizer (default Adam).
            epsilon: Used by the default epsilon-greedy policy for choosing a random action.
            gamma: The discount factor for learning Q-values.
            imb_ratio: Imbalance ratio, used to specify the reward in the environment.
            replay_buffer_max_length: Maximum length of the replay memory.
            layers: A dict containing the layers of the Q-network (e.g. conv, dense, rnn, dropout).
        """

        dense_layers = layers.get("dense")
        conv_layers = layers.get("conv")
        dropout_layers = layers.get("dropout")

        self.train_env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio))  # create a custom environment

        q_net = QNetwork(self.train_env.observation_spec(), self.train_env.action_spec(), conv_layer_params=conv_layers,
                         fc_layer_params=dense_layers, dropout_layer_params=dropout_layers)

        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)

        train_step_counter = tf.Variable(0)

        self.agent = DqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=train_step_counter,
            gamma=gamma,
            epsilon_greedy=epsilon,
        )

        self.agent.initialize()

        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=replay_buffer_max_length)
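A hedged usage sketch for compile(); the wrapping object `model` and the concrete layer tuples below are assumptions, only the argument names come from the signature and docstring above:

import numpy as np

# `model` is an assumed instance of the class that defines compile().
X_train = np.random.rand(1000, 20).astype(np.float32)  # placeholder data
y_train = np.random.randint(0, 2, size=1000)           # placeholder labels

model.compile(X_train, y_train,
              lr=1e-3,
              epsilon=0.1,
              gamma=0.99,
              imb_ratio=0.2,
              replay_buffer_max_length=100000,
              layers={"dense": (128, 64), "conv": None, "dropout": None})
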
Example #4
def create_qnetwork(env):
    '''
    Build a QNetwork with a preprocessing layer that normalizes the
    observations. Images are stored as bytes from 0 to 255 to use
    less RAM, but we want to pass floats from 0.0 to 1.0 to the
    neural network.
    '''
    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.)

    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]

    fc_layer_params = [512]

    q_net = QNetwork(env.observation_spec(),
                     env.action_spec(),
                     preprocessing_layers=preprocessing_layer,
                     conv_layer_params=conv_layer_params,
                     fc_layer_params=fc_layer_params)

    return q_net
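A possible way to call create_qnetwork, assuming an Atari-style environment with uint8 image observations (the environment name and wrappers are assumptions, mirroring the other Atari examples in this collection):

from tf_agents.environments import suite_atari
from tf_agents.environments.atari_preprocessing import AtariPreprocessing
from tf_agents.environments.atari_wrappers import FrameStack4
from tf_agents.environments.tf_py_environment import TFPyEnvironment

# Hypothetical environment choice, for illustration only.
env = TFPyEnvironment(suite_atari.load(
    "BreakoutNoFrameskip-v4",
    gym_env_wrappers=[AtariPreprocessing, FrameStack4]))
q_net = create_qnetwork(env)
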
Example #5
def create_pong_agent(train_environment: TFEnvironment,
                      dense_layer_sizes: Sequence[int],
                      learning_rate: float) -> DqnAgent:

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    global_step = tf.compat.v1.train.get_or_create_global_step()

    q_net = QNetwork(input_tensor_spec=train_environment.observation_spec(),
                     action_spec=train_environment.action_spec(),
                     fc_layer_params=dense_layer_sizes)

    agent = DqnAgent(time_step_spec=train_environment.time_step_spec(),
                     action_spec=train_environment.action_spec(),
                     q_network=q_net,
                     optimizer=optimizer,
                     td_errors_loss_fn=element_wise_squared_loss,
                     train_step_counter=global_step)

    agent.initialize()
    agent.train = common.function(agent.train)
    agent.train_step_counter.assign(0)

    return agent
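A hedged usage sketch, assuming `train_environment` is an already-built TFEnvironment whose flat observations the fully connected QNetwork can consume directly; the layer sizes and learning rate are illustrative values only:

# `train_environment` is assumed to exist in the calling code.
agent = create_pong_agent(train_environment,
                          dense_layer_sizes=(128, 64),
                          learning_rate=1e-3)
time_step = train_environment.reset()
action_step = agent.collect_policy.action(time_step)
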
Example #6
def create_sac_algorithm(env,
                         actor_fc_layers=(100, 100),
                         critic_fc_layers=(100, 100),
                         use_rnns=False,
                         alpha_learning_rate=5e-3,
                         actor_learning_rate=5e-3,
                         critic_learning_rate=5e-3,
                         debug_summaries=False):
    """Create a simple SacAlgorithm.

    Args:
        env (TFEnvironment): A TFEnvironment.
        actor_fc_layers (list[int]): list of fc layer parameters for the actor network
        critic_fc_layers (list[int]): list of fc layer parameters for the critic network
        use_rnns (bool): True if RNNs should be used
        alpha_learning_rate (float): learning rate for alpha
        actor_learning_rate (float): learning rate for the actor network
        critic_learning_rate (float): learning rate for the critic network
        debug_summaries (bool): True if debug summaries should be created
    """

    observation_spec = env.observation_spec()
    action_spec = env.action_spec()

    is_continuous = tensor_spec.is_continuous(tf.nest.flatten(action_spec)[0])
    if use_rnns:
        actor_net = ActorDistributionRnnNetwork(
            observation_spec,
            action_spec,
            input_fc_layer_params=actor_fc_layers,
            output_fc_layer_params=())
        if is_continuous:
            critic_net = CriticRnnNetwork(
                (observation_spec, action_spec),
                observation_fc_layer_params=(),
                action_fc_layer_params=(),
                output_fc_layer_params=(),
                joint_fc_layer_params=critic_fc_layers)
        else:
            critic_net = QRnnNetwork(observation_spec,
                                     action_spec,
                                     output_fc_layer_params=(),
                                     input_fc_layer_params=critic_fc_layers)
    else:
        actor_net = ActorDistributionNetwork(observation_spec,
                                             action_spec,
                                             fc_layer_params=actor_fc_layers)
        if is_continuous:
            critic_net = CriticNetwork((observation_spec, action_spec),
                                       joint_fc_layer_params=critic_fc_layers)
        else:
            critic_net = QNetwork(observation_spec,
                                  action_spec,
                                  fc_layer_params=critic_fc_layers)

    actor_optimizer = tf.optimizers.Adam(learning_rate=actor_learning_rate)
    critic_optimizer = tf.optimizers.Adam(learning_rate=critic_learning_rate)
    alpha_optimizer = tf.optimizers.Adam(learning_rate=alpha_learning_rate)
    return SacAlgorithm(action_spec=action_spec,
                        actor_network=actor_net,
                        critic_network=critic_net,
                        actor_optimizer=actor_optimizer,
                        critic_optimizer=critic_optimizer,
                        alpha_optimizer=alpha_optimizer,
                        debug_summaries=debug_summaries)
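A hedged usage sketch; the Pendulum environment below is only an assumed continuous-control task (so the CriticNetwork branch above is taken):

from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment

env = TFPyEnvironment(suite_gym.load("Pendulum-v0"))  # assumed environment
algorithm = create_sac_algorithm(env,
                                 actor_fc_layers=(100, 100),
                                 critic_fc_layers=(100, 100),
                                 use_rnns=False)
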
Example #7
def main():
    def compute_avg_return(environment, policy, num_episodes=10):
        total_return = 0.0
        for _ in range(num_episodes):
            time_step = environment.reset()
            episode_return = 0.0

            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = environment.step(action_step.action)
                episode_return += time_step.reward
            total_return += episode_return

        avg_return = total_return / num_episodes
        return avg_return.numpy()[0]

    class ShowProgress:
        def __init__(self, total):
            self.counter = 0
            self.total = total

        def __call__(self, trajectory):
            if not trajectory.is_boundary():
                self.counter += 1
            if self.counter % 100 == 0:
                print("\r{}/{}".format(self.counter, self.total), end="")

    def train_agent(n_iterations, save_each=10000, print_each=500):
        time_step = None
        policy_state = agent.collect_policy.get_initial_state(
            tf_env.batch_size)
        iterator = iter(dataset)

        for iteration in range(n_iterations):
            step = agent.train_step_counter.numpy()
            current_metrics = []

            time_step, policy_state = collect_driver.run(
                time_step, policy_state)
            trajectories, buffer_info = next(iterator)

            train_loss = agent.train(trajectories)
            all_train_loss.append(train_loss.loss.numpy())

            for i in range(len(train_metrics)):
                current_metrics.append(train_metrics[i].result().numpy())

            all_metrics.append(current_metrics)

            if iteration % print_each == 0:
                print("\nIteration: {}, loss:{:.2f}".format(
                    iteration, train_loss.loss.numpy()))

                for i in range(len(train_metrics)):
                    print('{}: {}'.format(train_metrics[i].name,
                                          train_metrics[i].result().numpy()))

            if step % EVAL_INTERVAL == 0:
                avg_return = compute_avg_return(eval_tf_env, agent.policy,
                                                NUM_EVAL_EPISODES)
                print(f'Step = {step}, Average Return = {avg_return}')
                returns.append((step, avg_return))

            if step % save_each == 0:
                print("Saving model")
                train_checkpointer.save(train_step)
                policy_save_handler.save("policy")
                with open("checkpoint/train_loss.pickle", "wb") as f:
                    pickle.dump(all_train_loss, f)
                with open("checkpoint/all_metrics.pickle", "wb") as f:
                    pickle.dump(all_metrics, f)
                with open("checkpoint/returns.pickle", "wb") as f:
                    pickle.dump(returns, f)

    eval_tf_env = tf_py_environment.TFPyEnvironment(BombermanEnvironment())

    #tf_env = tf_py_environment.TFPyEnvironment(
    #   parallel_py_environment.ParallelPyEnvironment(
    #       [BombermanEnvironment] * N_PARALLEL_ENVIRONMENTS
    #   ))

    tf_env = tf_py_environment.TFPyEnvironment(BombermanEnvironment())

    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     conv_layer_params=[(32, 3, 1), (32, 3, 1)],
                     fc_layer_params=[128, 64, 32])

    train_step = tf.Variable(0)
    update_period = 4
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)  # TODO: fine-tune

    epsilon_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=0.7,
        decay_steps=25000 // update_period,
        end_learning_rate=0.01)

    agent = dqn_agent.DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=0.99,
        train_step_counter=train_step,
        epsilon_greedy=lambda: epsilon_fn(train_step))

    agent.initialize()

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=10000)
    replay_buffer_observer = replay_buffer.add_batch

    train_metrics = [
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size)
    ]

    collect_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        agent.collect_policy,
        observers=[replay_buffer_observer] + train_metrics,
        num_steps=update_period)

    initial_collect_policy = random_tf_policy.RandomTFPolicy(
        tf_env.time_step_spec(), tf_env.action_spec())

    initial_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[
            replay_buffer.add_batch,
            ShowProgress(INITIAL_COLLECT_STEPS)
        ],
        num_steps=INITIAL_COLLECT_STEPS)
    final_time_step, final_policy_state = initial_driver.run()

    dataset = replay_buffer.as_dataset(sample_batch_size=64,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    agent.train = common.function(agent.train)

    all_train_loss = []
    all_metrics = []
    returns = []

    checkpoint_dir = "checkpoint/"
    train_checkpointer = common.Checkpointer(ckpt_dir=checkpoint_dir,
                                             max_to_keep=1,
                                             agent=agent,
                                             policy=agent.policy,
                                             replay_buffer=replay_buffer,
                                             global_step=train_step)
    # train_checkpointer.initialize_or_restore()
    # train_step = tf.compat.v1.train.get_global_step()
    policy_save_handler = policy_saver.PolicySaver(agent.policy)

    # training here
    train_agent(2000)

    # save at end in every case

    policy_save_handler.save("policy")
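    # Hedged sketch (not part of the original script): the policy saved by
    # PolicySaver above can later be restored as a SavedModel and used for
    # greedy inference on the evaluation environment.
    saved_policy = tf.saved_model.load("policy")
    time_step = eval_tf_env.reset()
    while not time_step.is_last():
        action_step = saved_policy.action(time_step)
        time_step = eval_tf_env.step(action_step.action)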
Example #8
               DEATH_REWARD=death_reward, FOOD_SPAWN_MODE=food_spawn_mode, KILL_STEP_REWARD=kill_step_reward,
               FOOD_REWARD_MODE=food_reward_mode, BOARD_SIZE=8, MAX_HEALTH=100)
tf_env = tf_py_environment.TFPyEnvironment(env)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

preprocessing_layer = keras.layers.Lambda(
                          lambda obs: tf.cast(obs, np.float32) / 100)

# Layer params are specified by local variables obtained from the DataFrame
q_net = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=preprocessing_layer,
    conv_layer_params=conv_layer_params,
    fc_layer_params=fc_layer_params,
    batch_squash=False)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

# Create variable that counts the number of training steps
train_step = tf.Variable(0)
# Create optimizer 
optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=optimizer_learning_rate,
                                                decay=optimizer_decay, momentum=optimizer_momentum,
                                                epsilon=optimizer_epsilon, centered=True)
# Computes epsilon for epsilon greedy policy given the training step
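The snippet is cut off after this comment; a hedged sketch of the epsilon schedule it describes, mirroring the PolynomialDecay pattern used in the other examples in this collection (the decay_steps value is an assumption):

epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,   # initial epsilon
    decay_steps=250000,          # assumed decay horizon
    end_learning_rate=0.01)      # final epsilon

# Typically passed to the agent as a callable of the training step, e.g.
# epsilon_greedy=lambda: epsilon_fn(train_step).
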
Example #9
#         print("game over", i)
#         break
#     tf_env.render(mode = "human")
#     time.sleep(0.2)

preprocessing_layers = keras.layers.Lambda(
    lambda obs: tf.cast(obs, np.float32) / 255.)

print("after preprocessing layer")
conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]

fc_layer_params = [512]

q_net = QNetwork(tf_env.observation_spec(),
                 tf_env.action_spec(),
                 preprocessing_layers=preprocessing_layers,
                 conv_layer_params=conv_layer_params,
                 fc_layer_params=fc_layer_params)

train_step = tf.Variable(0)
update_period = 4
optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                     rho=0.95,
                                     momentum=0.0,
                                     epsilon=0.00001,
                                     centered=True)

print("Before Epsilon function")
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0, decay_steps=250000, end_learning_rate=0.01)
print("Before Agent")
Example #10
print('Observation Spec:')
print(train_env.time_step_spec().observation)

print('Reward Spec:')
print(train_env.time_step_spec().reward)

print('Action Spec:')
print(train_env.action_spec())

train_env = tf_py_environment.TFPyEnvironment(train_env)
evaluation_env = tf_py_environment.TFPyEnvironment(evaluation_env)

hidden_layers = (100, )

dqn_network = QNetwork(train_env.observation_spec(),
                       train_env.action_spec(),
                       fc_layer_params=hidden_layers)

ddqn_network = QNetwork(train_env.observation_spec(),
                        train_env.action_spec(),
                        fc_layer_params=hidden_layers)

counter = tf.Variable(0)

dqn_agent = DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=dqn_network,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=counter)
Example #11
def main(_):
    # Environment
    env_name = "Breakout-v4"
    train_num_parallel_environments = 5
    max_steps_per_episode = 1000
    # Replay buffer
    replay_buffer_capacity = 50000
    init_replay_buffer = 500
    # Driver
    collect_steps_per_iteration = 1 * train_num_parallel_environments
    # Training
    train_batch_size = 32
    train_iterations = 100000
    train_summary_interval = 200
    train_checkpoint_interval = 200
    # Evaluation
    eval_num_parallel_environments = 5
    eval_summary_interval = 500
    eval_num_episodes = 20
    # File paths
    path = pathlib.Path(__file__)
    parent_dir = path.parent.resolve()
    folder_name = path.stem + time.strftime("_%Y%m%d_%H%M%S")
    train_checkpoint_dir = str(parent_dir / folder_name / "train_checkpoint")
    train_summary_dir = str(parent_dir / folder_name / "train_summary")
    eval_summary_dir = str(parent_dir / folder_name / "eval_summary")

    # Parallel training environment
    tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=[
                    lambda env: TimeLimit(env, duration=max_steps_per_episode)
                ],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * train_num_parallel_environments))
    tf_env.seed([42] * tf_env.batch_size)
    tf_env.reset()

    # Parallel evaluation environment
    eval_tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=[
                    lambda env: TimeLimit(env, duration=max_steps_per_episode)
                ],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * eval_num_parallel_environments))
    eval_tf_env.seed([42] * eval_tf_env.batch_size)
    eval_tf_env.reset()

    # Creating the Deep Q-Network
    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.)

    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     preprocessing_layers=preprocessing_layer,
                     conv_layer_params=conv_layer_params,
                     fc_layer_params=fc_layer_params)

    # Creating the DQN Agent
    optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                         rho=0.95,
                                         momentum=0.0,
                                         epsilon=0.00001,
                                         centered=True)

    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=2500000,
        end_learning_rate=0.01)  # final ε

    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=200,
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=global_step,
        epsilon_greedy=lambda: epsilon_fn(global_step))
    agent.initialize()

    # Creating the Replay Buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # Observer: Replay Buffer Observer
    replay_buffer_observer = replay_buffer.add_batch

    # Observer: Training Metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size),
    ]

    # Creating the Collect Driver
    collect_driver = DynamicStepDriver(tf_env,
                                       agent.collect_policy,
                                       observers=[replay_buffer_observer] +
                                       train_metrics,
                                       num_steps=collect_steps_per_iteration)

    # Initialize replay buffer
    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer_observer,
                   ShowProgress()],
        num_steps=init_replay_buffer)
    final_time_step, final_policy_state = init_driver.run()

    # Creating the Dataset
    dataset = replay_buffer.as_dataset(sample_batch_size=train_batch_size,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    # Optimize by wrapping some of the code in a graph using TF function.
    collect_driver.run = function(collect_driver.run)
    agent.train = function(agent.train)

    print("\n\n++++++++++++++++++++++++++++++++++\n")

    # Create checkpoint
    train_checkpointer = Checkpointer(
        ckpt_dir=train_checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        # replay_buffer=replay_buffer,
        global_step=global_step,
        # metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics')
    )

    # Restore checkpoint
    # train_checkpointer.initialize_or_restore()

    # Summary writers and metrics
    train_summary_writer = tf.summary.create_file_writer(train_summary_dir)
    eval_summary_writer = tf.summary.create_file_writer(eval_summary_dir)
    eval_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=eval_tf_env.batch_size,
                                       buffer_size=eval_num_episodes),
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=eval_tf_env.batch_size, buffer_size=eval_num_episodes)
    ]

    # Create evaluate callback function
    eval_callback = evaluate(eval_metrics=eval_metrics,
                             eval_tf_env=eval_tf_env,
                             eval_policy=agent.policy,
                             eval_num_episodes=eval_num_episodes,
                             train_step=global_step,
                             eval_summary_writer=eval_summary_writer)

    # Train agent
    train_agent(tf_env=tf_env,
                train_iterations=train_iterations,
                global_step=global_step,
                agent=agent,
                dataset=dataset,
                collect_driver=collect_driver,
                train_metrics=train_metrics,
                train_checkpointer=train_checkpointer,
                train_checkpoint_interval=train_checkpoint_interval,
                train_summary_writer=train_summary_writer,
                train_summary_interval=train_summary_interval,
                eval_summary_interval=eval_summary_interval,
                eval_callback=eval_callback)

    print("\n\n++++++++++ END OF TF_AGENTS RL TRAINING ++++++++++\n\n")
Example #12
env.seed(42)
env.reset()

time_step = env.step(np.array(1))  # FIRE
for _ in range(4):
    time_step = env.step(np.array(3))  # LEFT

# Create Deep Q-Network with TF-Agents
preprocessing_layer = keras.layers.Lambda(
    lambda obs: tf.cast(obs, np.float32) / 255.)
conv_layer_params = [(64, (8, 8), 4), (64, (3, 3), 2), (64, (3, 3), 1)]
fc_layer_params = [1024]

q_net = QNetwork(tf_env.observation_spec(),
                 tf_env.action_spec(),
                 preprocessing_layers=preprocessing_layer,
                 conv_layer_params=conv_layer_params,
                 fc_layer_params=fc_layer_params,
                 activation_fn=tf.keras.activations.relu)

# See TF-agents issue #113
#optimizer = keras.optimizers.RMSprop(lr = 2.5e-4, rho = 0.95, momentum = 0.0, epsilon = 0.00001, centered = True)
train_step = tf.Variable(0)
update_period = 4  # run a training step every 4 collect steps

optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=1e-3,
                                                decay=0.95,
                                                momentum=0.0,
                                                epsilon=0.0001,
                                                centered=True)

epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
Example #13
def breakout_v4(seed=42):
    env = suite_gym.load("Breakout-v4")
    env.seed(seed)
    env.reset()

    repeating_env = ActionRepeat(env, times=4)
    for name in dir(tf_agents.environments.wrappers):
        obj = getattr(tf_agents.environments.wrappers, name)
        if hasattr(obj, "__base__") and issubclass(
                obj, tf_agents.environments.wrappers.PyEnvironmentBaseWrapper):
            print("{:27s} {}".format(name, obj.__doc__.split("\n")[0]))

    limited_repeating_env = suite_gym.load(
        "Breakout-v4",
        gym_env_wrappers=[partial(TimeLimit, max_episode_steps=10000)],
        env_wrappers=[partial(ActionRepeat, times=4)],
    )

    max_episode_steps = 27000  # <=> 108k ALE frames since 1 step = 4 frames
    environment_name = "BreakoutNoFrameskip-v4"

    env = suite_atari.load(
        environment_name,
        max_episode_steps=max_episode_steps,
        gym_env_wrappers=[AtariPreprocessing, FrameStack4],
    )

    env.seed(42)
    env.reset()
    time_step = env.step(np.array(1))  # FIRE
    for _ in range(4):
        time_step = env.step(np.array(3))  # LEFT

    def plot_observation(obs):
        # Since there are only 3 color channels, you cannot display 4 frames
        # with one primary color per frame. So this code computes the delta between
        # the current frame and the mean of the other frames, and it adds this delta
        # to the red and blue channels to get a pink color for the current frame.
        obs = obs.astype(np.float32)
        img_ = obs[..., :3]
        current_frame_delta = np.maximum(
            obs[..., 3] - obs[..., :3].mean(axis=-1), 0.0)
        img_[..., 0] += current_frame_delta
        img_[..., 2] += current_frame_delta
        img_ = np.clip(img_ / 150, 0, 1)
        plt.imshow(img_)
        plt.axis("off")

    plt.figure(figsize=(6, 6))
    plot_observation(time_step.observation)
    plt.tight_layout()
    plt.savefig("./images/preprocessed_breakout_plot.png",
                format="png",
                dpi=300)
    plt.show()

    tf_env = TFPyEnvironment(env)

    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.0)
    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        preprocessing_layers=preprocessing_layer,
        conv_layer_params=conv_layer_params,
        fc_layer_params=fc_layer_params,
    )

    # see TF-agents issue #113
    # optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0,
    #                                     epsilon=0.00001, centered=True)

    train_step = tf.Variable(0)
    update_period = 4  # run a training step every 4 collect steps
    optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=2.5e-4,
                                                    decay=0.95,
                                                    momentum=0.0,
                                                    epsilon=0.00001,
                                                    centered=True)
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=250000 // update_period,  # <=> 1,000,000 ALE frames
        end_learning_rate=0.01,
    )  # final ε
    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=2000,  # <=> 32,000 ALE frames
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=train_step,
        epsilon_greedy=lambda: epsilon_fn(train_step),
    )
    agent.initialize()

    from tf_agents.replay_buffers import tf_uniform_replay_buffer

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000000)

    replay_buffer_observer = replay_buffer.add_batch

    class ShowProgress:
        def __init__(self, total):
            self.counter = 0
            self.total = total

        def __call__(self, trajectory):
            if not trajectory.is_boundary():
                self.counter += 1
            if self.counter % 100 == 0:
                print("\r{}/{}".format(self.counter, self.total), end="")

    from tf_agents.metrics import tf_metrics

    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    from tf_agents.eval.metric_utils import log_metrics
    import logging

    logging.getLogger().setLevel(logging.INFO)
    log_metrics(train_metrics)

    from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

    collect_driver = DynamicStepDriver(
        tf_env,
        agent.collect_policy,
        observers=[replay_buffer_observer] + train_metrics,
        num_steps=update_period,
    )  # collect 4 steps for each training iteration

    from tf_agents.policies.random_tf_policy import RandomTFPolicy

    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer.add_batch,
                   ShowProgress(20000)],
        num_steps=20000,
    )  # <=> 80,000 ALE frames
    final_time_step, final_policy_state = init_driver.run()
Example #14
                pickle.dump(all_metrics, f)
            with open("checkpoint/returns.pickle", "wb") as f:
                pickle.dump(returns, f)


if __name__ == '__main__':
    # tf_env = tf_py_environment.TFPyEnvironment(
    #   parallel_py_environment.ParallelPyEnvironment(
    #       [BombermanEnvironment] * N_PARALLEL_ENVIRONMENTS
    #   ))

    tf_env = tf_py_environment.TFPyEnvironment(BombermanEnvironment())
    eval_tf_env = tf_py_environment.TFPyEnvironment(BombermanEnvironment())

    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     conv_layer_params=[(32, 3, 1), (32, 3, 1)],
                     fc_layer_params=[128, 64, 32])

    train_step = tf.Variable(0)
    update_period = 4
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)  # TODO: fine-tune

    epsilon_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,
        decay_steps=250000 // update_period,
        end_learning_rate=0.01)

    agent = dqn_agent.DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
Example #15
               MAX_COUNTER=int(max_counter))
tf_env = tf_py_environment.TFPyEnvironment(env)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

#preprocessing_layer = keras.layers.Lambda(
#                         lambda obs: tf.cast(obs, np.float32) / 100)

# Layer params are specified by local variables obtained from the DataFrame
q_net = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=[keras.layers.Flatten(),
                          keras.layers.Flatten()],
    preprocessing_combiner=tf.keras.layers.Concatenate(axis=-1),
    conv_layer_params=conv_layer_params,
    fc_layer_params=fc_layer_params,
    batch_squash=False)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

# Create variable that counts the number of training steps
train_step = tf.Variable(0)
# Create optimizer
optimizer = tf.compat.v1.train.RMSPropOptimizer(
    learning_rate=optimizer_learning_rate,
    decay=optimizer_decay,
Example #16
log_interval = 1000
eval_interval = 5000

# create training and evaluation environments
train_env = TFPyEnvironment(
    suite_gym.load(env_name,
                   max_episode_steps=max_episode_steps_train,
                   gym_env_wrappers=[ShrinkWrapper, DiscreteActionWrapper]))
eval_env = TFPyEnvironment(
    suite_gym.load(env_name,
                   max_episode_steps=max_episode_steps_eval,
                   gym_env_wrappers=[ShrinkWrapper, DiscreteActionWrapper]))

# create the DQN (Deep Q-Network)
q_net = QNetwork(train_env.observation_spec(),
                 train_env.action_spec(),
                 conv_layer_params=conv_layer_params,
                 fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.compat.v2.Variable(0)

# create deep reinforcement learning agent
tf_agent = DqnAgent(train_env.time_step_spec(),
                    train_env.action_spec(),
                    q_network=q_net,
                    optimizer=optimizer,
                    td_errors_loss_fn=element_wise_squared_loss,
                    train_step_counter=train_step_counter)
tf_agent.initialize()
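eval_env is built above but unused in the visible part of the snippet; a hedged evaluation sketch in the style of compute_avg_return from Example #7:

def compute_avg_return(environment, policy, num_episodes=10):
    # Average the undiscounted return of `policy` over a few episodes.
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    return (total_return / num_episodes).numpy()[0]

print('Average return:', compute_avg_return(eval_env, tf_agent.policy, num_episodes=5))
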
Example #17
BUFFER_LENGTH = 131072  # Maximum number of steps in the buffer
STEPS_PER_ITER = 4096  # Steps collected per iteration (driver)
N_ITERATIONS = 1000  # Number of training iterations per session
EVAL_MAX_STEPS = 1000  # Maximum number of env steps during evaluation
COLLECT_RANDOM = True  # Use random policy to collect data

if __name__ == '__main__':
    # Create global step counter
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Create a dummy environment with no policy, just to extract the specs
    dummy_env = TFPyEnvironment(NineMensMorris(None, discount=DISCOUNT))

    # Create Q Network
    q_net = QNetwork(input_tensor_spec=dummy_env.observation_spec(),
                     action_spec=dummy_env.action_spec(),
                     fc_layer_params=(100, 600, 600, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 600, 600),
                     dropout_layer_params=(None, 0.1, 0.1, 0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.1, None))

    # Create agent
    agent = DdqnAgent(time_step_spec=dummy_env.time_step_spec(),
                      action_spec=dummy_env.action_spec(),
                      q_network=q_net,
                      optimizer=Adam(learning_rate=1e-4),
                      td_errors_loss_fn=common.element_wise_squared_loss,
                      epsilon_greedy=0.1,
                      train_step_counter=global_step)
    # Initialize agent
    agent.initialize()
    # Wrap the training function in a TF graph
    agent.train = common.function(agent.train)
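    # Hedged sketch (not in the original snippet): a collect-and-train loop
    # using the constants defined above, modeled on the replay-buffer/driver
    # pattern from the other examples. `train_env` stands in for the real
    # (non-dummy) NineMensMorris environment, whose constructor arguments are
    # not shown here; the class names assume the usual tf_agents imports
    # (TFUniformReplayBuffer, RandomTFPolicy, DynamicStepDriver).
    replay_buffer = TFUniformReplayBuffer(data_spec=agent.collect_data_spec,
                                          batch_size=train_env.batch_size,
                                          max_length=BUFFER_LENGTH)
    collect_policy = (RandomTFPolicy(train_env.time_step_spec(),
                                     train_env.action_spec())
                      if COLLECT_RANDOM else agent.collect_policy)
    driver = DynamicStepDriver(train_env, collect_policy,
                               observers=[replay_buffer.add_batch],
                               num_steps=STEPS_PER_ITER)
    dataset = replay_buffer.as_dataset(sample_batch_size=64, num_steps=2,
                                       num_parallel_calls=3).prefetch(3)
    iterator = iter(dataset)
    for _ in range(N_ITERATIONS):
        driver.run()
        trajectories, _ = next(iterator)
        loss_info = agent.train(trajectories)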