Example #1
  def testMasking(self):
    batch_size = 1000
    num_state_dims = 5
    num_actions = 8
    observations = tf.random.uniform([batch_size, num_state_dims])
    time_step = ts.restart(observations, batch_size=batch_size)
    input_tensor_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
    action_spec = tensor_spec.BoundedTensorSpec(
        [1], tf.int32, 0, num_actions - 1)

    mask = [0, 1, 0, 1, 0, 0, 1, 0]
    np_mask = np.array(mask)
    tf_mask = tf.constant([mask for _ in range(batch_size)])
    q_network = categorical_q_network.CategoricalQNetwork(
        input_tensor_spec=input_tensor_spec,
        action_spec=action_spec,
        num_atoms=3,
        mask_split_fn=lambda observation: (observation, tf_mask),
        fc_layer_params=[4])
    policy = categorical_q_policy.CategoricalQPolicy(self._min_q_value,
                                                     self._max_q_value,
                                                     q_network,
                                                     action_spec)

    # Force creation of variables before global_variables_initializer.
    policy.variables()
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Sample from the policy 1000 times and ensure that invalid actions are
    # never chosen.
    action_step = policy.action(time_step)
    action = self.evaluate(action_step.action)
    self.assertEqual(action.shape, (batch_size,))
    self.assertAllEqual(np_mask[action], np.ones([batch_size]))
Example #2
  def build_categorical_dqn_agent(self):
    """Build categorical DQN agent with CategoricalQNetwork."""
    temp_env = self.build_temp_env()

    if self.dropout_layer_params is not None:
      raise AttributeError('CategoricalQNetwork does not accept dropout layers.')

    q_net = categorical_q_network.CategoricalQNetwork(
        temp_env.observation_spec(),
        temp_env.action_spec(),
        fc_layer_params=self.fc_layer_params)

    optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
    agent = CategoricalDqnAgent(
        temp_env.time_step_spec(),
        temp_env.action_spec(),
        n_step_update=self.n_step_update,
        categorical_q_network=q_net,
        optimizer=optimizer,
        min_q_value=0.0,
        max_q_value=3.0,
        epsilon_greedy=self.epsilon_greedy,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=tf.Variable(0, dtype=tf.int64))

    return q_net, agent
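A typical follow-up, not shown in this excerpt, is to initialize the returned agent before wiring it into a driver. A minimal sketch, assuming it runs inside another method of the same (unnamed) class:

# Hypothetical usage sketch: the builder above returns the network/agent pair,
# and the agent still needs its variables created before training.
q_net, agent = self.build_categorical_dqn_agent()
agent.initialize()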
Example #3
    def testCorrectOutputShape(self):
        batch_size = 3
        num_state_dims = 5
        action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
        num_actions = action_spec.maximum - action_spec.minimum + 1
        self.assertEqual(num_actions, 2)

        observations_spec = tensor_spec.TensorSpec([num_state_dims],
                                                   tf.float32)
        observations = tf.random.uniform([batch_size, num_state_dims])
        time_steps = ts.restart(observations, batch_size)

        q_network = categorical_q_network.CategoricalQNetwork(
            input_tensor_spec=observations_spec,
            action_spec=action_spec,
            fc_layer_params=[3])

        logits, _ = q_network(time_steps.observation)
        self.assertAllEqual(logits.shape.as_list(),
                            [batch_size, num_actions, q_network._num_atoms])

        self.evaluate(tf.compat.v1.global_variables_initializer())
        eval_logits = self.evaluate(logits)
        self.assertAllEqual(eval_logits.shape,
                            [batch_size, num_actions, q_network._num_atoms])
Example #4
  def __init__(self, input_tensor_spec, action_spec, **kwargs):
    super(AtariCategoricalQNetwork, self).__init__(input_tensor_spec,
                                                   state_spec=())
    input_tensor_spec = tf.TensorSpec(dtype=tf.float32,
                                      shape=input_tensor_spec.shape)
    self._categorical_q_network = categorical_q_network.CategoricalQNetwork(
        input_tensor_spec, action_spec, **kwargs)
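The excerpt above shows only the wrapper's constructor; the delegating forward pass is not included. A minimal sketch of how it might look, assuming the raw observation only needs a float32 cast and that the agent reads num_atoms from the wrapper (both are assumptions, not part of the excerpt):

  @property
  def num_atoms(self):
    # Expose the wrapped network's atom count (assumed to be what the agent reads).
    return self._categorical_q_network.num_atoms

  def call(self, observation, step_type=None, network_state=(), training=False):
    # Hypothetical sketch: cast the raw frames to float32 and delegate to the
    # wrapped CategoricalQNetwork.
    observation = tf.cast(observation, tf.float32)
    return self._categorical_q_network(
        observation, step_type=step_type, network_state=network_state,
        training=training)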
Example #5
    def testGinConfig(self):
        batch_size = 3
        num_state_dims = 5
        action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
        num_actions = action_spec.maximum - action_spec.minimum + 1
        self.assertEqual(num_actions, 2)

        observations_spec = tensor_spec.TensorSpec([3, 3, num_state_dims],
                                                   tf.float32)
        observations = tf.random.uniform([batch_size, 3, 3, num_state_dims])
        next_observations = tf.random.uniform(
            [batch_size, 3, 3, num_state_dims])
        time_steps = ts.restart(observations, batch_size)
        next_time_steps = ts.restart(next_observations, batch_size)

        gin.parse_config("""
        CategoricalQNetwork.conv_layer_params = [(16, 2, 1), (15, 2, 1)]
        CategoricalQNetwork.fc_layer_params = [4, 3, 5]
    """)

        q_network = categorical_q_network.CategoricalQNetwork(
            input_tensor_spec=observations_spec, action_spec=action_spec)

        logits, _ = q_network(time_steps.observation)
        next_logits, _ = q_network(next_time_steps.observation)
        self.assertAllEqual(logits.shape.as_list(),
                            [batch_size, num_actions, q_network._num_atoms])
        self.assertAllEqual(next_logits.shape.as_list(),
                            [batch_size, num_actions, q_network._num_atoms])

        # This time there are six layers: two conv layers, three fc layers, and one
        # final logits layer, for 12 trainable_variables in total.
        self.assertLen(q_network.trainable_variables, 12)
Example #6
    def testBuild(self):
        batch_size = 3
        num_state_dims = 5
        action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
        num_actions = action_spec.maximum - action_spec.minimum + 1
        self.assertEqual(num_actions, 2)

        observations_spec = tensor_spec.TensorSpec([num_state_dims],
                                                   tf.float32)
        observations = tf.random.uniform([batch_size, num_state_dims])
        time_steps = ts.restart(observations, batch_size)

        q_network = categorical_q_network.CategoricalQNetwork(
            input_tensor_spec=observations_spec,
            action_spec=action_spec,
            fc_layer_params=[3])

        logits, _ = q_network(time_steps.observation)
        self.assertAllEqual(logits.shape.as_list(),
                            [batch_size, num_actions, q_network._num_atoms])

        # There are two trainable layers here: the specified fc_layer and the final
        # logits layer. Each layer has two trainable_variables (kernel and bias),
        # for a total of 4.
        self.assertLen(q_network.trainable_variables, 4)
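The count asserted above follows from each Dense layer owning a kernel and a bias; a small illustrative loop over the same q_network makes this visible (the printed names and shapes depend on the build):

        # Illustrative only: the fc layer and the final logits layer each
        # contribute a kernel and a bias, giving the 4 variables counted above.
        for variable in q_network.trainable_variables:
            print(variable.name, variable.shape)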
Example #7
  def testMasking(self):
    batch_size = 1000
    num_state_dims = 5
    num_actions = 8
    observations = tf.random.uniform([batch_size, num_state_dims])
    time_step = ts.restart(observations, batch_size=batch_size)
    input_tensor_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
    action_spec = tensor_spec.BoundedTensorSpec(
        [1], tf.int32, 0, num_actions - 1)

    # We create a fixed mask here for testing purposes. Normally the mask would
    # be part of the observation (see the splitter sketch after this example).
    mask = [0, 1, 0, 1, 0, 0, 1, 0]
    np_mask = np.array(mask)
    tf_mask = tf.constant([mask for _ in range(batch_size)])
    q_network = categorical_q_network.CategoricalQNetwork(
        input_tensor_spec=input_tensor_spec,
        action_spec=action_spec,
        num_atoms=3,
        fc_layer_params=[4])
    policy = categorical_q_policy.CategoricalQPolicy(
        self._time_step_spec, action_spec, q_network,
        self._min_q_value, self._max_q_value,
        observation_and_action_constraint_splitter=(
            lambda observation: (observation, tf_mask)))

    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Sample from the policy 1000 times, and ensure that actions considered
    # invalid according to the mask are never chosen.
    action_step = policy.action(time_step)
    action = self.evaluate(action_step.action)
    self.assertEqual(action.shape, (batch_size,))
    self.assertAllEqual(np_mask[action], np.ones([batch_size]))
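As the comment in this example notes, the mask would normally travel with the observation rather than being captured in a closure. A minimal sketch of such a splitter, assuming a hypothetical dict observation with 'observations' and 'valid_actions' entries:

def split_observation_and_mask(observation):
  # Hypothetical dict layout: the environment packs the network input and the
  # per-action mask into one observation dict; the splitter separates them so
  # the policy can rule out invalid actions.
  return observation['observations'], observation['valid_actions']

This function would then be passed as observation_and_action_constraint_splitter in place of the lambda used above.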
Example #8
  def setUp(self):
    super(CategoricalDqnAgentTest, self).setUp()
    tf.compat.v1.enable_resource_variables()
    self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
    self._time_step_spec = ts.time_step_spec(self._obs_spec)
    self._action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 1)
    self._categorical_net = categorical_q_network.CategoricalQNetwork(
        self._obs_spec, self._action_spec, fc_layer_params=[4])
    self._dummy_categorical_net = DummyCategoricalNet(self._obs_spec)
    self._optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.01)
Example #9
  def testMultipleActionsRaiseError(self):
    with self.assertRaisesRegexp(
        TypeError, '.*action_spec must be a BoundedTensorSpec.*'):
      # Replace the action_spec for this test.
      action_spec = [tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)] * 2
      q_network = categorical_q_network.CategoricalQNetwork(
          input_tensor_spec=self._obs_spec,
          action_spec=action_spec,
          num_atoms=3,
          fc_layer_params=[4])
      categorical_q_policy.CategoricalQPolicy(
          self._time_step_spec, action_spec, q_network,
          self._min_q_value, self._max_q_value)
Example #10
    def testMasking(self):
        batch_size = 3
        num_state_dims = 5
        num_actions = 6
        states = tf.random.uniform([batch_size, num_state_dims])
        input_tensor_spec = tensor_spec.TensorSpec([num_state_dims],
                                                   tf.float32)
        action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0,
                                                    num_actions - 1)
        mask = tf.constant([[1, 0, 1, 0, 0, 1] for _ in range(batch_size)])
        network = categorical_q_network.CategoricalQNetwork(
            input_tensor_spec,
            action_spec,
            mask_split_fn=lambda observation: (observation, mask))
        self.assertIsNotNone(network.mask_split_fn)

        # Run a pass through the network to catch any shape errors.
        network(states)
Example #11
  def testChangeHiddenLayers(self):
    batch_size = 3
    num_state_dims = 5
    action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
    num_actions = action_spec.maximum - action_spec.minimum + 1
    self.assertEqual(num_actions, 2)

    observations_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
    observations = tf.random.uniform([batch_size, num_state_dims])
    time_steps = ts.restart(observations, batch_size)

    q_network = categorical_q_network.CategoricalQNetwork(
        input_tensor_spec=observations_spec,
        action_spec=action_spec,
        fc_layer_params=[3, 3])

    logits, _ = q_network(time_steps.observation)
    self.assertAllEqual(logits.shape.as_list(),
                        [batch_size, num_actions, q_network._num_atoms])

    # This time there is an extra fc layer, for a total of 6
    # trainable_variables.
    self.assertLen(q_network.trainable_variables, 6)
Example #12
    def testAddConvLayers(self):
        batch_size = 3
        num_state_dims = 5
        action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
        num_actions = action_spec.maximum - action_spec.minimum + 1
        self.assertEqual(num_actions, 2)

        observations_spec = tensor_spec.TensorSpec([3, 3, num_state_dims],
                                                   tf.float32)
        observations = tf.random.uniform([batch_size, 3, 3, num_state_dims])
        time_steps = ts.restart(observations, batch_size)

        q_network = categorical_q_network.CategoricalQNetwork(
            input_tensor_spec=observations_spec,
            action_spec=action_spec,
            conv_layer_params=[(16, 2, 1), (15, 2, 1)])

        logits, _ = q_network(time_steps.observation)
        self.assertAllEqual(logits.shape.as_list(),
                            [batch_size, num_actions, q_network._num_atoms])

        # This time there are two conv layers and one final logits layer, for a
        # total of 6 trainable_variables.
        self.assertLen(q_network.trainable_variables, 6)
Example #13
if frame_stack is not None:
    board_preprocessing = Sequential([
        keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32) / 2.),
        tf.keras.layers.Permute((4, 2, 3, 1)),
        tf.keras.layers.Lambda(lambda x: x[:, 0, :, :, :])
    ])
else:
    board_preprocessing = Sequential([
        keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32) / 2.),
    ])

# Layer params are specified by local variables obtained from the DataFrame
categorical_q_net = categorical_q_network.CategoricalQNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=board_preprocessing,
    fc_layer_params=fc_layer_params,
    conv_layer_params=conv_layer_params,
    num_atoms=int(num_atoms))
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

# Create variable that counts the number of training steps
train_step = tf.Variable(0)
# Create optimizer
optimizer = tf.compat.v1.train.RMSPropOptimizer(
    learning_rate=optimizer_learning_rate,
    decay=optimizer_decay,
    momentum=optimizer_momentum,
    epsilon=optimizer_epsilon,
Example #14
def create_agent(
        agent_class,
        environment,
        fc_layer_params,
        learning_rate,
        decaying_epsilon,
        n_step_update,
        target_update_tau,
        target_update_period,
        gamma,
        reward_scale_factor,
        gradient_clipping,
        debug_summaries,
        summarize_grads_and_vars,
        train_step_counter,
        num_atoms=None,  # Only for categorical_dqn
        min_q_value=None,  # Only for categorical_dqn
        max_q_value=None,  # Only for categorical_dqn
):
    """Creates the Hanabi agent.

    Args:
      agent_class: str, type of agent to construct.
      environment: The environment.
      fc_layer_params: Iterable of fully connected layer sizes for the Q network.
      learning_rate: The learning rate for the optimizer.
      decaying_epsilon: Epsilon for the epsilon-greedy policy.
      n_step_update: Number of steps for the n-step TD update.
      target_update_tau: Agent parameter.
      target_update_period: Agent parameter.
      gamma: Agent parameter.
      reward_scale_factor: Agent parameter.
      gradient_clipping: Agent parameter.
      debug_summaries: Agent parameter.
      summarize_grads_and_vars: Agent parameter.
      train_step_counter: The train step tf.Variable to be passed to the agent.
      num_atoms: Number of atoms; only used for the categorical_dqn agent.
      min_q_value: Minimum Q value; only used for the categorical_dqn agent.
      max_q_value: Maximum Q value; only used for the categorical_dqn agent.

    Returns:
      An agent for playing Hanabi.

    Raises:
      ValueError: if an unknown agent type is requested.
    """
    if agent_class == 'DQN':
        return dqn_agent.DqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            q_network=q_network.QNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=
            observation_and_action_constraint_splitter,
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    elif agent_class == 'DDQN':
        return dqn_agent.DdqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            q_network=q_network.QNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=
            observation_and_action_constraint_splitter,
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    elif agent_class == 'categorical_dqn':
        return categorical_dqn_agent.CategoricalDqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            categorical_q_network=categorical_q_network.CategoricalQNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                num_atoms=num_atoms,
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=
            observation_and_action_constraint_splitter,
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            min_q_value=min_q_value,
            max_q_value=max_q_value,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    else:
        raise ValueError(
            'Expected valid agent_type, got {}'.format(agent_class))
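A hedged usage sketch for the categorical branch; every value below is illustrative, and environment plus observation_and_action_constraint_splitter are assumed to come from the surrounding module:

# Hypothetical invocation of create_agent with illustrative hyperparameters.
agent = create_agent(
    agent_class='categorical_dqn',
    environment=environment,
    fc_layer_params=(512, 512),
    learning_rate=1e-3,
    decaying_epsilon=0.02,
    n_step_update=1,
    target_update_tau=1.0,
    target_update_period=500,
    gamma=0.99,
    reward_scale_factor=1.0,
    gradient_clipping=None,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=tf.Variable(0, dtype=tf.int64),
    num_atoms=51,
    min_q_value=-25.0,
    max_q_value=25.0)
agent.initialize()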
Example #15
                            activation='relu'),
        keras.layers.Flatten()
    ])
else:
    board_preprocessing = Sequential([
        keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32)),
        keras.layers.Flatten()
    ])

health_preprocessing = keras.layers.Flatten()

# Layer params are specified by local variables obtained from the DataFrame
categorical_q_net = categorical_q_network.CategoricalQNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=(board_preprocessing, health_preprocessing),
    preprocessing_combiner=tf.keras.layers.Concatenate(axis=-1),
    fc_layer_params=fc_layer_params,
    num_atoms=int(num_atoms))
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

# Create variable that counts the number of training steps
train_step = tf.Variable(0)
# Create optimizer
optimizer = tf.compat.v1.train.RMSPropOptimizer(
    learning_rate=optimizer_learning_rate,
    decay=optimizer_decay,
    momentum=optimizer_momentum,
    epsilon=optimizer_epsilon,
Example #16
def main(argv):
    tf.compat.v1.enable_v2_behavior()
    logging.config.dictConfig({
        'version': 1,
        # Other configs ...
        'disable_existing_loggers': True
    })
    argv = argv[0]

    evaluate = argv.eval

    # Mostly copied from https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial
    # Hyperparameters
    num_iterations = argv.num_iterations

    collect_steps_per_iteration = argv.collect_steps_per_iteration
    replay_buffer_max_length = 100000

    batch_size = argv.batch_size
    learning_rate = 2.5e-5
    log_interval = argv.log_interval

    num_atoms = argv.num_atoms
    min_q_value = argv.min_q_value
    max_q_value = argv.max_q_value
    n_step_update = argv.n_step_update
    gamma = 0.99

    num_eval_episodes = 10
    eval_interval = argv.eval_interval

    save_interval = argv.save_interval
    n_parallels = argv.n_parallels
    train_in_browser = argv.train_in_browser
    # Environment
    train_py_env = Env2048(evaluate) if evaluate else ParallelPyEnvironment(
        [lambda: Env2048(train_in_browser)] * n_parallels,
        start_serially=False)
    eval_py_env = Env2048(evaluate)
    train_env = tf_py_environment.TFPyEnvironment(train_py_env)
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    # Agent
    fc_layer_params = (64, 64, 32)
    conv_layer_params = ((512, (2, 1), (1, 1)), (512, (1, 2), (1, 1)))
    preprocessing_layers = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(512, (1, 1), (1, 1), padding='same'),
        tf.keras.layers.Conv2D(512, (2, 1), (1, 1), padding='same'),
        tf.keras.layers.Conv2D(512, (1, 2), (1, 1), padding='same'),
        tf.keras.layers.Flatten()
    ])
    preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # q_net = q_network.QNetwork(
    #     train_env.observation_spec(),
    #     train_env.action_spec(),
    #     fc_layer_params=fc_layer_params)
    # agent = dqn_agent.DqnAgent(
    #     train_env.time_step_spec(),
    #     train_env.action_spec(),
    #     q_network=q_net,
    #     optimizer=optimizer,
    #     td_errors_loss_fn=common.element_wise_squared_loss,
    #     train_step_counter=global_step)

    categorical_q_net = categorical_q_network.CategoricalQNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        num_atoms=num_atoms,
        fc_layer_params=fc_layer_params,
        # conv_layer_params=conv_layer_params
        preprocessing_layers=preprocessing_layers,
        preprocessing_combiner=preprocessing_combiner)
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        categorical_q_network=categorical_q_net,
        optimizer=optimizer,
        min_q_value=min_q_value,
        max_q_value=max_q_value,
        n_step_update=n_step_update,
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=gamma,
        train_step_counter=global_step)
    agent.initialize()

    # Replay buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=replay_buffer_max_length)

    # Data Collection
    collect_driver = dynamic_step_driver.DynamicStepDriver(
        train_env,
        agent.collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=collect_steps_per_iteration)

    collect_driver.run()

    dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=batch_size,
                                       num_steps=2).prefetch(3)
    iterator = iter(dataset)

    # Checkpointer
    checkpoint_dir = os.path.join(os.getcwd(), 'checkpoint')
    train_checkpointer = common.Checkpointer(ckpt_dir=checkpoint_dir,
                                             max_to_keep=1,
                                             agent=agent,
                                             policy=agent.policy,
                                             replay_buffer=replay_buffer,
                                             global_step=global_step)

    train_checkpointer.initialize_or_restore()
    global_step = tf.compat.v1.train.get_global_step()

    # Training
    if evaluate:
        avg_return, best_eval_score = compute_avg_return(
            eval_env, agent.policy, num_eval_episodes)
        print(f"Average return: {avg_return}, best score = {best_eval_score}")
        train_env.station.shutdown()
        eval_env.station.shutdown()
    else:
        agent.train = common.function(agent.train)
        # agent.train_step_counter.assign(0)
        avg_return = compute_avg_return(eval_env, agent.policy,
                                        num_eval_episodes)
        returns = [avg_return]
        t = trange(global_step.numpy(), num_iterations, leave=True)
        best_scores = np.array(
            list(map(lambda env: env.best_score, train_env.envs)))
        for _ in t:
            # Collect a few steps using collect_policy and save to the replay buffer.
            collect_driver.run()

            # Sample a batch of data from the buffer and update the agent's network.
            experience, unused_info = next(iterator)
            train_loss = agent.train(experience).loss

            scores = list(map(lambda env: env.score, train_env.envs))
            t.set_description(desc=f"Scores = {scores}")

            step = tf.compat.v1.train.get_global_step().numpy()

            if step % log_interval == 0:
                t.write(f"step = {step}: loss = {train_loss}")

            if step % save_interval == 0:
                train_checkpointer.save(step)

            if step % eval_interval == 0:
                avg_return, best_eval_score = compute_avg_return(
                    eval_env, agent.policy, num_eval_episodes)
                new_best_scores = np.array(
                    list(map(lambda env: env.best_score, train_env.envs)))
                diff = np.subtract(new_best_scores, best_scores)
                best_scores = new_best_scores
                if np.count_nonzero(diff) > 0:
                    t.write(f"step = {step}: Best scores = {best_scores}")
                t.write(
                    f'step = {step}: Average Return = {avg_return}, best score reached in training = '
                    f'{max(list(map(lambda env: env.best_score, train_env.envs)))}'
                    f', best score in eval = {best_eval_score}')
                returns.append(avg_return)
        steps = range(0, num_iterations + 1, eval_interval)
        plt.plot(steps, returns)
        plt.ylabel('Average Return')
        plt.xlabel('Step')

    train_env.close()
    eval_env.close()
    train_py_env.close()
Example #17
# setup the env

train_py_env = FourInARow()
eval_py_env = FourInARow()

# convert the env to tf_env
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

# Agent

# setup the categorical network
categorical_q_net = categorical_q_network.CategoricalQNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    num_atoms=num_atoms,
    fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.compat.v2.Variable(0)

agent = categorical_dqn_agent.CategoricalDqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    categorical_q_network=categorical_q_net,
    optimizer=optimizer,
    min_q_value=min_q_value,
    max_q_value=max_q_value,
    n_step_update=n_step_update,
Example #18
def load_agents_and_create_videos(
        root_dir,
        env_name='CartPole-v0',
        num_iterations=NUM_ITERATIONS,
        max_ep_steps=1000,
        train_sequence_length=1,
        # Params for QNetwork
        fc_layer_params=((100, )),
        # Params for QRnnNetwork
        input_fc_layer_params=(50, ),
        lstm_size=(20, ),
        output_fc_layer_params=(20, ),
        # Params for collect
        initial_collect_steps=10000,
        collect_steps_per_iteration=1,
        epsilon_greedy=0.1,
        replay_buffer_capacity=100000,
        # Params for target update
        target_update_tau=0.05,
        target_update_period=5,
        # Params for train
        train_steps_per_iteration=1,
        batch_size=64,
        learning_rate=1e-3,
        num_atoms=51,
        min_q_value=-20,
        max_q_value=20,
        n_step_update=1,
        gamma=0.99,
        reward_scale_factor=1.0,
        gradient_clipping=None,
        use_tf_functions=True,
        # Params for eval
        num_eval_episodes=10,
        num_random_episodes=1,
        eval_interval=1000,
        # Params for checkpoints
        train_checkpoint_interval=10000,
        policy_checkpoint_interval=5000,
        rb_checkpoint_interval=20000,
        # Params for summaries and logging
        log_interval=1000,
        summary_interval=1000,
        summaries_flush_secs=10,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        eval_metrics_callback=None,
        random_metrics_callback=None):

    # Define the directories to read from
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')
    random_dir = os.path.join(root_dir, 'random')

    # Match the writers and metrics used in training
    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)

    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)

    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    random_summary_writer = tf.compat.v2.summary.create_file_writer(
        random_dir, flush_millis=summaries_flush_secs * 1000)

    random_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Match the environments used in training
    tf_env = tf_py_environment.TFPyEnvironment(
        suite_gym.load(env_name, max_episode_steps=max_ep_steps))
    eval_py_env = suite_gym.load(env_name, max_episode_steps=max_ep_steps)
    eval_tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    # Match the agents used in training
    categorical_q_net = categorical_q_network.CategoricalQNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        num_atoms=num_atoms,
        fc_layer_params=fc_layer_params)

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

    tf_agent = categorical_dqn_agent.CategoricalDqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        categorical_q_network=categorical_q_net,
        optimizer=optimizer,
        min_q_value=min_q_value,
        max_q_value=max_q_value,
        n_step_update=n_step_update,
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=gamma,
        train_step_counter=global_step)

    tf_agent.initialize()

    train_metrics = [
        # tf_metrics.NumberOfEpisodes(),
        # tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch] + train_metrics,
        num_steps=collect_steps_per_iteration)

    train_checkpointer = common.Checkpointer(ckpt_dir=train_dir,
                                             agent=tf_agent,
                                             global_step=global_step,
                                             metrics=metric_utils.MetricsGroup(
                                                 train_metrics,
                                                 'train_metrics'))

    policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
        train_dir, 'policy'),
                                              policy=eval_policy,
                                              global_step=global_step)

    rb_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
        train_dir, 'replay_buffer'),
                                          max_to_keep=1,
                                          replay_buffer=replay_buffer)

    train_checkpointer.initialize_or_restore()
    rb_checkpointer.initialize_or_restore()

    if use_tf_functions:
        # To speed up collection, wrap the driver and train step in common.function.
        collect_driver.run = common.function(collect_driver.run)
        tf_agent.train = common.function(tf_agent.train)

    random_policy = random_tf_policy.RandomTFPolicy(
        eval_tf_env.time_step_spec(), eval_tf_env.action_spec())

    # Make movies of the trained agent and a random agent
    date_string = datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S')

    # Finally, use the saved policy to generate the video
    trained_filename = "trainedC51_" + date_string
    create_policy_eval_video(eval_tf_env, eval_py_env, tf_agent.policy,
                             trained_filename)

    # And, create one with a random agent for comparison
    random_filename = 'random_' + date_string
    create_policy_eval_video(eval_tf_env, eval_py_env, random_policy,
                             random_filename)
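create_policy_eval_video is not defined in this excerpt; a minimal sketch of such a helper, along the lines of the TF-Agents DQN tutorial (the exact signature here is an assumption):

def create_policy_eval_video(eval_tf_env, eval_py_env, policy, filename,
                             num_episodes=3, fps=30):
    # Hypothetical helper: roll out the policy in the eval environment and
    # write the rendered frames to an .mp4 file with imageio.
    import imageio
    with imageio.get_writer(filename + '.mp4', fps=fps) as video:
        for _ in range(num_episodes):
            time_step = eval_tf_env.reset()
            video.append_data(eval_py_env.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = eval_tf_env.step(action_step.action)
                video.append_data(eval_py_env.render())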