Example #1
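Both examples in this section appear to come from DeepMind Acme's TF IMPALA agent and are methods of a larger class, so they do not run on their own. A plausible set of imports for the names they use, assuming dm-acme's module layout for its TensorFlow agents, would be:

from typing import Optional

from acme import datasets
from acme import specs
from acme.adders import reverb as adders
from acme.agents.tf.impala import acting
from acme.agents.tf.impala import learning
from acme.tf import savers as tf2_savers
from acme.tf import utils as tf2_utils
from acme.utils import counting
from acme.utils import loggers
import reverb
import sonnet as snt
import tensorflow as tf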
  def learner(self, queue: reverb.Client, counter: counting.Counter):
    """The Learning part of the agent."""
    # Create the networks.
    network = self._network_factory(self._environment_spec.actions)
    tf2_utils.create_variables(network, [self._environment_spec.observations])

    # The dataset object to learn from.
    dataset = datasets.make_reverb_dataset(
        server_address=queue.server_address,
        batch_size=self._batch_size,
        prefetch_size=self._prefetch_size)

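    # Logger and counter scoped to the learner process.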
    logger = loggers.make_default_logger('learner', steps_key='learner_steps')
    counter = counting.Counter(counter, 'learner')

    # Create the learner.
    learner = learning.IMPALALearner(
        environment_spec=self._environment_spec,
        network=network,
        dataset=dataset,
        discount=self._discount,
        learning_rate=self._learning_rate,
        entropy_cost=self._entropy_cost,
        baseline_cost=self._baseline_cost,
        max_abs_reward=self._max_abs_reward,
        max_gradient_norm=self._max_gradient_norm,
        counter=counter,
        logger=logger,
    )

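    # Wrap the learner so its state is checkpointed periodically while it runs.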
    return tf2_savers.CheckpointingRunner(learner,
                                          time_delta_minutes=5,
                                          subdirectory='impala_learner')
Example #2
    def __init__(
        self,
        environment_spec: specs.EnvironmentSpec,
        network: snt.RNNCore,
        sequence_length: int,
        sequence_period: int,
        counter: Optional[counting.Counter] = None,
        logger: Optional[loggers.Logger] = None,
        discount: float = 0.99,
        max_queue_size: int = 100000,
        batch_size: int = 16,
        learning_rate: float = 1e-3,
        entropy_cost: float = 0.01,
        baseline_cost: float = 0.5,
        max_abs_reward: Optional[float] = None,
        max_gradient_norm: Optional[float] = None,
    ):

        num_actions = environment_spec.actions.num_values
        self._logger = logger or loggers.TerminalLogger('agent')

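        # Extra outputs stored with each step: the recurrent core state and the policy logits.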
        extra_spec = {
            'core_state': network.initial_state(1),
            'logits': tf.ones(shape=(1, num_actions), dtype=tf.float32)
        }
        # Remove batch dimensions.
        extra_spec = tf2_utils.squeeze_batch_dim(extra_spec)

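        # A FIFO Reverb table (queue) for the actor's sequences, served from a local Reverb server.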
        queue = reverb.Table.queue(name=adders.DEFAULT_PRIORITY_TABLE,
                                   max_size=max_queue_size,
                                   signature=adders.SequenceAdder.signature(
                                       environment_spec,
                                       extras_spec=extra_spec,
                                       sequence_length=sequence_length))
        self._server = reverb.Server([queue], port=None)
        self._can_sample = lambda: queue.can_sample(batch_size)
        address = f'localhost:{self._server.port}'

        # Component to add things into replay.
        adder = adders.SequenceAdder(
            client=reverb.Client(address),
            period=sequence_period,
            sequence_length=sequence_length,
        )

        # The dataset object to learn from.
        dataset = datasets.make_reverb_dataset(server_address=address,
                                               batch_size=batch_size)

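        # Create the network variables from the environment's observation spec.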
        tf2_utils.create_variables(network, [environment_spec.observations])

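        # Internal actor and learner: the actor writes sequences into the queue and the learner trains on them.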
        self._actor = acting.IMPALAActor(network, adder)
        self._learner = learning.IMPALALearner(
            environment_spec=environment_spec,
            network=network,
            dataset=dataset,
            counter=counter,
            logger=logger,
            discount=discount,
            learning_rate=learning_rate,
            entropy_cost=entropy_cost,
            baseline_cost=baseline_cost,
            max_gradient_norm=max_gradient_norm,
            max_abs_reward=max_abs_reward,
        )