Example #1
 def _validate_config(config: PartialTrainerConfigDict,
                      trainer_obj_or_none: Optional["Trainer"] = None):
     # Call super (Trainer) validation method first.
     Trainer._validate_config(config, trainer_obj_or_none)
     # Then call user defined one, if any.
     if validate_config is not None:
         validate_config(config)
Example #2
 def __init__(self,
              config: TrainerConfigDict = None,
              env: Union[str, EnvType, None] = None,
              logger_creator: Callable[[], Logger] = None,
              remote_checkpoint_dir: Optional[str] = None,
              sync_function_tpl: Optional[str] = None):
     Trainer.__init__(self, config, env, logger_creator,
                      remote_checkpoint_dir, sync_function_tpl)
Example #3
 def setup(self, config: PartialTrainerConfigDict):
     if allow_unknown_subkeys is not None:
         self._allow_unknown_subkeys += allow_unknown_subkeys
     self._allow_unknown_configs = allow_unknown_configs
     if override_all_subkeys_if_type_changes is not None:
         self._override_all_subkeys_if_type_changes += \
             override_all_subkeys_if_type_changes
     Trainer.setup(self, config)
Example #4
File: impala.py Project: zhouhh2017/ray
 def default_resource_request(cls, config):
     cf = dict(cls._default_config, **config)
     Trainer._validate_config(cf)
     return Resources(
         cpu=cf["num_cpus_for_driver"],
         gpu=cf["num_gpus"],
         extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] +
         cf["num_aggregation_workers"],
         extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
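A quick numeric walk-through of the request above, using made-up config values (illustration only, not taken from the source):

    # Hypothetical config values, chosen only to make the arithmetic concrete.
    cf = {
        "num_cpus_for_driver": 1,
        "num_gpus": 1,
        "num_cpus_per_worker": 1,
        "num_gpus_per_worker": 0,
        "num_workers": 4,
        "num_aggregation_workers": 2,
    }
    extra_cpu = cf["num_cpus_per_worker"] * cf["num_workers"] + \
        cf["num_aggregation_workers"]                          # 1 * 4 + 2 = 6
    extra_gpu = cf["num_gpus_per_worker"] * cf["num_workers"]  # 0 * 4 = 0
    print(cf["num_cpus_for_driver"], cf["num_gpus"], extra_cpu, extra_gpu)  # 1 1 6 0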
Example #5
    def test_validate_config_idempotent(self):
        """
        Asserts that validate_config run multiple
        times on COMMON_CONFIG will be idempotent
        """
        # Given:
        standard_config = copy.deepcopy(COMMON_CONFIG)

        # When (we validate config 2 times), ...
        Trainer._validate_config(standard_config)
        config_v1 = copy.deepcopy(standard_config)
        Trainer._validate_config(standard_config)
        config_v2 = copy.deepcopy(standard_config)

        # ... then ...
        self.assertEqual(config_v1, config_v2)
Example #6
 def __getstate__(self):
     state = Trainer.__getstate__(self)
     state["trainer_state"] = self.state.copy()
     if self.train_exec_impl:
         state["train_exec_impl"] = (
             self.train_exec_impl.shared_metrics.get().save())
     return state
Example #7
File: dqn.py Project: songhappy/ray
 def __getstate__(self):
     state = Trainer.__getstate__(self)
     state.update({
         "num_target_updates": self.num_target_updates,
         "last_target_update_ts": self.last_target_update_ts,
     })
     return state
Example #8
    def __init__(self, obs_space, action_space, config):
        model = ModelCatalog.get_model_v2(obs_space, action_space,
                                          action_space.n, config["model"],
                                          "torch")
        _, env_creator = Trainer._get_env_id_and_creator(config["env"], config)
        if config["ranked_rewards"]["enable"]:
            # If r2 (ranked rewards) is enabled, the env is wrapped to include
            # a rewards buffer used to normalize rewards.
            env_cls = get_r2_env_wrapper(env_creator, config["ranked_rewards"])

            # the wrapped env is used only in the mcts, not in the
            # rollout workers
            def _env_creator():
                return env_cls(config["env_config"])

        else:

            def _env_creator():
                return env_creator(config["env_config"])

        def mcts_creator():
            return MCTS(model, config["mcts_config"])

        super().__init__(
            obs_space,
            action_space,
            config,
            model,
            alpha_zero_loss,
            TorchCategorical,
            mcts_creator,
            _env_creator,
        )
Example #9
 def on_train_result(self, trainer: Trainer, result: dict, **kwargs):
     iteration = result["training_iteration"]
     logger.info(f"Iteration {iteration}")
     if iteration % 10 == 0:
         logger.info(f"Model checkpoint at iteration {iteration}")
         torch.save(trainer.get_weights()["default_policy"],
                    self.model_path)
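If this `on_train_result` hook lives on a `DefaultCallbacks` subclass (which its signature suggests), it would typically be wired into the trainer via the `callbacks` config key. A minimal sketch assuming the old `Trainer` API; the class name and `model_path` are hypothetical, not part of the source:

    from ray.rllib.agents.callbacks import DefaultCallbacks

    class CheckpointCallbacks(DefaultCallbacks):  # hypothetical name
        def __init__(self):
            super().__init__()
            self.model_path = "model.pt"  # hypothetical checkpoint path

        # ... the on_train_result method from the example above goes here ...

    config = {"callbacks": CheckpointCallbacks}  # passed into the Trainer config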
Example #10
File: bandit.py Project: wuisawesome/ray
 def get_default_config(cls) -> TrainerConfigDict:
     return Trainer.merge_trainer_configs(
         DEFAULT_CONFIG,
         {
             # Use UpperConfidenceBound exploration.
             "exploration_config": {"type": "UpperConfidenceBound"}
         },
     )
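`Trainer.merge_trainer_configs` deep-merges the override dict on top of `DEFAULT_CONFIG`, so nested keys such as `exploration_config` are updated rather than replaced wholesale. A rough sketch of that deep-merge idea using plain dicts (illustration only, not RLlib's implementation; RLlib additionally overrides all subkeys of `exploration_config` when its "type" changes, as hinted at in Example #3):

    def deep_merge(base: dict, overrides: dict) -> dict:
        """Recursively merge `overrides` into a copy of `base` (toy version)."""
        merged = dict(base)
        for key, value in overrides.items():
            if isinstance(value, dict) and isinstance(merged.get(key), dict):
                merged[key] = deep_merge(merged[key], value)
            else:
                merged[key] = value
        return merged

    base = {"lr": 1e-3, "exploration_config": {"type": "EpsilonGreedy"}}
    print(deep_merge(base, {"exploration_config": {"type": "UpperConfidenceBound"}}))
    # -> {'lr': 0.001, 'exploration_config': {'type': 'UpperConfidenceBound'}}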
Example #11
    def test_validate_config_idempotent(self):
        """
        Asserts that validate_config run multiple
        times on COMMON_CONFIG will be idempotent
        """
        # Given
        standard_config = copy.deepcopy(COMMON_CONFIG)
        standard_config["_use_trajectory_view_api"] = False

        # When (we validate config 2 times)
        Trainer._validate_config(standard_config)
        config_v1 = copy.deepcopy(standard_config)
        Trainer._validate_config(standard_config)
        config_v2 = copy.deepcopy(standard_config)

        # Then
        self.assertEqual(config_v1, config_v2)
Example #12
File: inference.py Project: ulrikah/rave
def run_offline_inference(agent: Trainer, env: CrossAdaptiveEnv):
    # NOTE: something is wrong here. For some reason, all the action values are too close to the bound
    done = False
    obs = env.reset()
    while not done:
        action = agent.compute_action(obs)
        # TODO: standardize action
        # it might be difficult to standardize the action in live mode, but for offline inference it essentially works
        obs, _, done, _ = env.step(action)
Example #13
File: bandit.py Project: wuisawesome/ray
 def get_default_config(cls) -> TrainerConfigDict:
     config = Trainer.merge_trainer_configs(
         DEFAULT_CONFIG,
         {
             # Use ThompsonSampling exploration.
             "exploration_config": {"type": "ThompsonSampling"}
         },
     )
     return config
Example #14
 def execution_plan(workers, config, **kwargs):
     # `execution_plan` is provided, use it inside
     # `self.execution_plan()`.
     if execution_plan is not None:
         return execution_plan(workers, config, **kwargs)
     # If `execution_plan` is not provided (None), the Trainer will use
     # its already existing default `execution_plan()` static method
     # instead.
     else:
         return Trainer.execution_plan(workers, config, **kwargs)
Example #15
def execution_plan(trainer: Trainer, workers: WorkerSet,
                   config: TrainerConfigDict, **kwargs) -> LocalIterator[dict]:
    """Execution plan of the Simple Q algorithm. Defines the distributed dataflow.

    Args:
        trainer (Trainer): The Trainer object creating the execution plan.
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"])
    # Assign to Trainer, so we can store the LocalReplayBuffer's
    # data when we save checkpoints.
    trainer.local_replay_buffer = local_replay_buffer

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # (1) Generate rollouts and store them in our local replay buffer.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    # (2) Read and train on experiences from the replay buffer.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(train_step_op) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2).
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
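The `Concurrently(..., mode="round_robin", output_indexes=[1])` step alternates strictly between the store op and the replay/train op, but only yields items coming out of the second one. A conceptual, generator-based sketch of that control flow (not the actual LocalIterator API):

    import itertools

    def round_robin(ops, output_indexes):
        """Alternate over child iterators; only emit items from selected ones."""
        for i, op in itertools.cycle(enumerate(ops)):
            item = next(op)
            if i in output_indexes:
                yield item

    store_op = iter(lambda: "stored", None)  # stands in for the rollout+insert step
    replay_op = (f"train_result_{k}" for k in itertools.count())
    train_op = round_robin([store_op, replay_op], output_indexes=[1])
    print([next(train_op) for _ in range(3)])
    # -> ['train_result_0', 'train_result_1', 'train_result_2']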
Example #16
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        Trainer._validate_config(cf)

        eval_config = cf["evaluation_config"]

        # Return PlacementGroupFactory containing all needed resources
        # (already properly defined as device bundles).
        return PlacementGroupFactory(
            bundles=[{
                # Driver + Aggregation Workers:
                # Force to be on same node to maximize data bandwidth
                # between aggregation workers and the learner (driver).
                # Aggregation workers tree-aggregate experiences collected
                # from RolloutWorkers (n rollout workers map to m
                # aggregation workers, where m < n) and always use 1 CPU
                # each.
                "CPU":
                cf["num_cpus_for_driver"] + cf["num_aggregation_workers"],
                "GPU":
                cf["num_gpus"]
            }] + [
                {
                    # RolloutWorkers.
                    "CPU": cf["num_cpus_per_worker"],
                    "GPU": cf["num_gpus_per_worker"],
                } for _ in range(cf["num_workers"])
            ] + ([
                {
                    # Evaluation (remote) workers.
                    # Note: The local eval worker is located on the driver CPU.
                    "CPU":
                    eval_config.get("num_cpus_per_worker",
                                    cf["num_cpus_per_worker"]),
                    "GPU":
                    eval_config.get("num_gpus_per_worker",
                                    cf["num_gpus_per_worker"]),
                } for _ in range(cf["evaluation_num_workers"])
            ] if cf["evaluation_interval"] else []),
            strategy=config.get("placement_strategy", "PACK"))
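To make the bundle math concrete: with made-up values of 1 driver CPU, 2 aggregation workers, 1 GPU, 4 rollout workers (1 CPU, 0 GPU each) and evaluation disabled, the factory would request bundles like this (illustration only):

    cf = {
        "num_cpus_for_driver": 1,
        "num_aggregation_workers": 2,
        "num_gpus": 1,
        "num_workers": 4,
        "num_cpus_per_worker": 1,
        "num_gpus_per_worker": 0,
    }
    bundles = [{
        "CPU": cf["num_cpus_for_driver"] + cf["num_aggregation_workers"],  # 3
        "GPU": cf["num_gpus"],                                             # 1
    }] + [
        {"CPU": cf["num_cpus_per_worker"], "GPU": cf["num_gpus_per_worker"]}
        for _ in range(cf["num_workers"])
    ]
    print(len(bundles))  # 5: one driver/aggregation bundle + four worker bundles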
Example #17
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        Trainer._validate_config(cf)

        eval_config = cf["evaluation_config"]

        # Return PlacementGroupFactory containing all needed resources
        # (already properly defined as device bundles).
        return PlacementGroupFactory(
            bundles=[{
                # Local worker + replay buffer actors.
                # Force replay buffers to be on same node to maximize
                # data bandwidth between buffers and the learner (driver).
                # Replay buffer actors each contain one shard of the total
                # replay buffer and use 1 CPU each.
                "CPU":
                cf["num_cpus_for_driver"] +
                cf["optimizer"]["num_replay_buffer_shards"],
                "GPU":
                cf["num_gpus"]
            }] + [
                {
                    # RolloutWorkers.
                    "CPU": cf["num_cpus_per_worker"],
                    "GPU": cf["num_gpus_per_worker"],
                } for _ in range(cf["num_workers"])
            ] + ([
                {
                    # Evaluation workers.
                    # Note: The local eval worker is located on the driver CPU.
                    "CPU":
                    eval_config.get("num_cpus_per_worker",
                                    cf["num_cpus_per_worker"]),
                    "GPU":
                    eval_config.get("num_gpus_per_worker",
                                    cf["num_gpus_per_worker"]),
                } for _ in range(cf["evaluation_num_workers"])
            ] if cf["evaluation_interval"] else []),
            strategy=config.get("placement_strategy", "PACK"))
Example #18
    def add_policy(self, policy_id: PolicyID, policy_spec: PolicySpec):
        # Merge the policies config overrides with the main config.
        # Also, adjust `num_gpus` (to indicate an individual policy's
        # num_gpus, not the total number of GPUs).
        cfg = Trainer.merge_trainer_configs(
            self.config,
            dict(policy_spec.config, **{"num_gpus": self.num_gpus_per_policy}),
        )

        # Need to create the replay actor first. Then add the first policy.
        if self.replay_actor is None:
            return self._add_replay_buffer_and_policy(policy_id, policy_spec,
                                                      cfg)

        # Replay actor already exists -> Just add a new policy here.

        assert len(self.policy_actors) < self.max_num_policies

        actual_policy_class = get_tf_eager_cls_if_necessary(
            policy_spec.policy_class, cfg)

        colocated = create_colocated_actors(
            actor_specs=[(
                ray.remote(
                    num_cpus=1,
                    num_gpus=self.num_gpus_per_policy
                    if not cfg["_fake_gpus"] else 0,
                )(actual_policy_class),
                # Policy c'tor args.
                (policy_spec.observation_space, policy_spec.action_space, cfg),
                # Policy c'tor kwargs={}.
                {},
                # Count=1,
                1,
            )],
            # Force co-locate on the already existing replay actor's node.
            node=ray.get(self.replay_actor.get_host.remote()),
        )

        self.policy_actors[policy_id] = colocated[0][0]

        return self.policy_actors[policy_id]
Example #19
File: inference.py Project: ulrikah/rave
def run_live_inference(
    agent: Trainer,
    env: CrossAdaptiveEnv,
):
    mediator = Mediator()

    episode_index = 0
    while episode_index < 1500:
        source_features, target_features = mediator.get_features()
        if source_features is None or target_features is None:
            continue
        else:
            # trim off timestamp
            source_features = source_features[1:]
            target_features = target_features[1:]

        standardized_source = np.array([
            env.standardizer.get_standardized_value(
                env.analyser.analysis_features[i], feature_value)
            for i, feature_value in enumerate(source_features)
        ])
        standardized_target = np.array([
            env.standardizer.get_standardized_value(
                env.analyser.analysis_features[i], feature_value)
            for i, feature_value in enumerate(target_features)
        ])
        obs = np.concatenate((standardized_source, standardized_target))
        print(np.round(obs, decimals=2))
        action = agent.compute_action(obs)
        # action = env.action_space.sample()
        mapping = env.action_to_mapping(action)
        # print(mapping)
        mediator.send_effect_mapping(mapping)
        episode_index += 1
    mediator.terminate()
    print("\n\n\tDONE\n\n")
Example #20
File: dqn.py Project: wuisawesome/ray
DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    SIMPLEQ_DEFAULT_CONFIG,
    {
        # === Model ===
        # Number of atoms for representing the distribution of return. When
        # this is greater than 1, distributional Q-learning is used.
        # The discrete supports are bounded by v_min and v_max.
        "num_atoms": 1,
        "v_min": -10.0,
        "v_max": 10.0,
        # Whether to use noisy network
        "noisy": False,
        # control the initial value of noisy nets
        "sigma0": 0.5,
        # Whether to use dueling dqn
        "dueling": True,
        # Dense-layer setup for each the advantage branch and the value branch
        # in a dueling architecture.
        "hiddens": [256],
        # Whether to use double dqn
        "double_q": True,
        # N-step Q learning
        "n_step": 1,

        # === Prioritized replay buffer ===
        # If True prioritized replay buffer will be used.
        "prioritized_replay": True,
        # Alpha parameter for prioritized replay buffer.
        "prioritized_replay_alpha": 0.6,
        # Beta parameter for sampling from prioritized replay buffer.
        "prioritized_replay_beta": 0.4,
        # Final value of beta (by default, we use constant beta=0.4).
        "final_prioritized_replay_beta": 0.4,
        # Time steps over which the beta parameter is annealed.
        "prioritized_replay_beta_annealing_timesteps": 20000,
        # Epsilon to add to the TD errors when updating priorities.
        "prioritized_replay_eps": 1e-6,

        # Callback to run before learning on a multi-agent batch of
        # experiences.
        "before_learn_on_batch": None,

        # The intensity with which to update the model (vs collecting samples
        # from the env). If None, uses the "natural" value of:
        # `train_batch_size` / (`rollout_fragment_length` x `num_workers` x
        # `num_envs_per_worker`).
        # If provided, will make sure that the ratio between ts inserted into
        # and sampled from the buffer matches the given value.
        # Example:
        #   training_intensity=1000.0
        #   train_batch_size=250 rollout_fragment_length=1
        #   num_workers=1 (or 0) num_envs_per_worker=1
        #   -> natural value = 250 / 1 = 250.0
        #   -> will make sure that replay+train op will be executed 4x as
        #      often as rollout+insert op (4 * 250 = 1000).
        # See: rllib/agents/dqn/dqn.py::calculate_rr_weights for further
        # details.
        "training_intensity": None,

        # === Parallelism ===
        # Whether to compute priorities on workers.
        "worker_side_prioritization": False,
    },
    _allow_unknown_configs=True,
)
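Following the `training_intensity` comment above, the "natural" value and the resulting replay-vs-rollout ratio can be computed directly. A small numeric sketch reproducing the arithmetic from the comment (the actual weighting logic lives in `calculate_rr_weights`):

    train_batch_size = 250
    rollout_fragment_length = 1
    num_workers = 1            # 0 behaves like 1 for this calculation
    num_envs_per_worker = 1
    training_intensity = 1000.0

    natural_value = train_batch_size / (
        rollout_fragment_length * max(num_workers, 1) * num_envs_per_worker)
    print(natural_value)                       # 250.0
    print(training_intensity / natural_value)  # 4.0 -> replay+train runs 4x as often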
Example #21
File: r2d2.py Project: krfricke/ray
R2D2_DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    DQN_DEFAULT_CONFIG,  # See keys in impala.py, which are also supported.
    {
        # Learning rate for adam optimizer.
        "lr": 1e-4,
        # Discount factor.
        "gamma": 0.997,
        # Train batch size (in number of single timesteps).
        "train_batch_size": 64 * 20,
        # Adam epsilon hyper parameter
        "adam_epsilon": 1e-3,
        # Run in parallel by default.
        "num_workers": 2,
        # Batch mode must be complete_episodes.
        "batch_mode": "complete_episodes",

        # === Replay buffer ===
        "replay_buffer_config": {
            # For now we don't use the new ReplayBuffer API here
            "_enable_replay_buffer_api": False,
            "type": "MultiAgentReplayBuffer",
            "prioritized_replay": False,
            "prioritized_replay_alpha": 0.6,
            # Beta parameter for sampling from prioritized replay buffer.
            "prioritized_replay_beta": 0.4,
            # Epsilon to add to the TD errors when updating priorities.
            "prioritized_replay_eps": 1e-6,
            # Size of the replay buffer (in sequences, not timesteps).
            "capacity": 100000,
            # Set automatically: The number
            # of contiguous environment steps to
            # replay at once. Will be calculated via
            # model->max_seq_len + burn_in.
            # Do not set this to any valid value!
            "replay_sequence_length": -1,
        },
        # If True, assume a zero-initialized state input (no matter where in
        # the episode the sequence is located).
        # If False, store the initial states along with each SampleBatch, use
        # it (as initial state when running through the network for training),
        # and update that initial state during training (from the internal
        # state outputs of the immediately preceding sequence).
        "zero_init_states": True,
        # If > 0, use the `burn_in` first steps of each replay-sampled sequence
        # (starting either from all 0.0-values if `zero_init_state=True` or
        # from the already stored values) to calculate an even more accurate
        # initial states for the actual sequence (starting after this burn-in
        # window). In the burn-in case, the actual length of the sequence
        # used for loss calculation is `n - burn_in` time steps
        # (n=LSTM’s/attention net’s max_seq_len).
        "burn_in": 0,

        # Whether to use the h-function from the paper [1] to scale target
        # values in the R2D2-loss function:
        # h(x) = sign(x)(√(|x| + 1) − 1) + εx
        "use_h_function": True,
        # The epsilon parameter from the R2D2 loss function (only used
        # if `use_h_function`=True).
        "h_function_epsilon": 1e-3,

        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": 2500,

        # Experimental flag.
        # If True, the execution plan API will not be used. Instead,
        # a Trainer's `training_iteration` method will be called as-is each
        # training iteration.
        "_disable_execution_plan_api": False,
    },
    _allow_unknown_configs=True,
)
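The `h(x)` rescaling function referenced by `use_h_function` is simple enough to write out. A minimal sketch using numpy, with `epsilon` playing the role of `h_function_epsilon`:

    import numpy as np

    def h(x, epsilon=1e-3):
        """Value rescaling: h(x) = sign(x) * (sqrt(|x| + 1) - 1) + epsilon * x."""
        return np.sign(x) * (np.sqrt(np.abs(x) + 1.0) - 1.0) + epsilon * x

    print(h(np.array([0.0, 1.0, -3.0, 100.0])))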
Example #22
DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    PPO_DEFAULT_CONFIG,
    {
        # During the sampling phase, each rollout worker will collect a batch
        # `rollout_fragment_length * num_envs_per_worker` steps in size.
        "rollout_fragment_length": 100,
        # Vectorize the env (should be enabled by default since each worker
        # has a GPU).
        "num_envs_per_worker": 5,
        # During the SGD phase, workers iterate over minibatches of this size.
        # The effective minibatch size will be:
        # `sgd_minibatch_size * num_workers`.
        "sgd_minibatch_size": 50,
        # Number of SGD epochs per optimization round.
        "num_sgd_iter": 10,
        # Download weights between each training step. This adds a bit of
        # overhead but allows the user to access the weights from the trainer.
        "keep_local_weights_in_sync": True,

        # *** WARNING: configs below are DDPPO overrides over PPO; you
        #     shouldn't need to adjust them. ***
        # DDPPO requires PyTorch distributed.
        "framework": "torch",
        # The communication backend for PyTorch distributed.
        "torch_distributed_backend": "gloo",
        # Learning is no longer done on the driver process, so
        # giving GPUs to the driver does not make sense!
        "num_gpus": 0,
        # Each rollout worker gets a GPU.
        "num_gpus_per_worker": 1,
        # Require evenly sized batches. Otherwise,
        # collective allreduce could fail.
        "truncate_episodes": True,
        # This is auto set based on sample batch size.
        "train_batch_size": -1,
        # The KL divergence penalty should be fixed to 0 in DDPPO because, in
        # order for it to be used as a penalty, we would have to
        # un-decentralize DDPPO.
        "kl_coeff": 0.0,
        "kl_target": 0.0
    },
    _allow_unknown_configs=True,
)
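With the defaults above, each rollout worker collects `rollout_fragment_length * num_envs_per_worker` steps per sampling phase, and the effective SGD minibatch spans all workers. A quick numeric illustration (the worker count is a made-up value, not part of the config above):

    rollout_fragment_length = 100
    num_envs_per_worker = 5
    sgd_minibatch_size = 50
    num_workers = 8            # hypothetical

    per_worker_batch = rollout_fragment_length * num_envs_per_worker  # 500
    effective_minibatch = sgd_minibatch_size * num_workers            # 400
    print(per_worker_batch, effective_minibatch)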
Example #23
 def __setstate__(self, state):
     Trainer.__setstate__(self, state)
     self.train_exec_impl.shared_metrics.get().restore(
         state["train_exec_impl"])
Example #24
 def __init__(self, config=None, env=None, logger_creator=None):
     Trainer.__init__(self, config, env, logger_creator)
Example #25
DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    appo.DEFAULT_CONFIG,  # See keys in appo.py, which are also supported.
    {
        # TODO: Unify the buffer API, then clean up our existing
        #  implementations of different buffers.
        # This is num batches held at any time for each policy.
        "replay_buffer_capacity": 20,
        # e.g. ratio=0.2 -> 20% of samples in each train batch are
        # old (replayed) ones.
        "replay_buffer_replay_ratio": 0.5,

        # Timeout to use for `ray.wait()` when waiting for samplers to have placed
        # new data into the buffers. If no samples are ready within the timeout,
        # the buffers used for mixin-sampling will return only older samples.
        "sample_wait_timeout": 0.0,
        # Timeout to use for `ray.wait()` when waiting for the policy learner actors
        # to have performed an update and returned learning stats. If no learner
        # actors have produced any learning results in the meantime, their
        # learner-stats in the results will be empty for that iteration.
        "learn_wait_timeout": 0.0,

        # League-building parameters.
        # The LeagueBuilder class to be used for league building logic.
        "league_builder_config": {
            "type": AlphaStarLeagueBuilder,
            # The number of random policies to add to the league. This must be an
            # even number (including 0) as these will be evenly distributed
            # amongst league- and main- exploiters.
            "num_random_policies": 2,
            # The number of initially learning league-exploiters to create.
            "num_learning_league_exploiters": 4,
            # The number of initially learning main-exploiters to create.
            "num_learning_main_exploiters": 4,
            # Minimum win-rate (between 0.0 = 0% and 1.0 = 100%) of any policy to
            # be considered for snapshotting (cloning). The cloned copy may then
            # be frozen (no further learning) or keep learning (independent of
            # its ancestor policy).
            # Set this to lower values to speed up league growth.
            "win_rate_threshold_for_new_snapshot": 0.9,
            # If we took a new snapshot of any given policy, what's the probability
            # that this snapshot will continue to be trainable (rather than become
            # frozen/non-trainable)? By default, only keep those policies trainable
            # that have been trainable from the very beginning.
            "keep_new_snapshot_training_prob": 0.0,
            # Probabilities of different match-types:
            # LE: Learning league_exploiter vs any.
            # ME: Learning main exploiter vs any main.
            # M: Main self-play (p=1.0 - LE - ME).
            "prob_league_exploiter_match": 0.33,
            "prob_main_exploiter_match": 0.33,
            # Only for ME matches: Prob to play against learning
            # main (vs a snapshot main).
            "prob_main_exploiter_playing_against_learning_main": 0.5,
        },

        # The maximum number of trainable policies for this Trainer.
        # Each trainable policy will exist as an independent remote actor,
        # co-located with a replay buffer. This is in addition to its
        # existence inside the RolloutWorkers for training and evaluation.
        # Set to None for automatically inferring this value from the number of
        # trainable policies found in the `multiagent` config.
        "max_num_policies_to_train": None,

        # By default, don't drop last timestep.
        # TODO: We should do the same for IMPALA and APPO at some point.
        "vtrace_drop_last_ts": False,

        # Reporting interval.
        "min_time_s_per_reporting": 2,

        # Use the `training_iteration` method instead of an execution plan.
        "_disable_execution_plan_api": True,
    },
    _allow_unknown_configs=True,
)
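The match-type probabilities above are complementary: main self-play receives whatever probability mass is left after league-exploiter and main-exploiter matches. A one-line check with the default values:

    prob_league_exploiter_match = 0.33
    prob_main_exploiter_match = 0.33
    prob_main_self_play = 1.0 - prob_league_exploiter_match - prob_main_exploiter_match
    print(round(prob_main_self_play, 2))  # 0.34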
Example #26
    def training_step(self) -> ResultDict:
        # W/o microbatching: Identical to Trainer's default implementation.
        # The only difference from a default Trainer is the value function
        # loss term and its value computations alongside each action.
        if self.config["microbatch_size"] is None:
            return Trainer.training_step(self)

        # In microbatch mode, we want to compute gradients on experience
        # microbatches, average a number of these microbatches, and then
        # apply the averaged gradient in one SGD step. This conserves GPU
        # memory, allowing for extremely large experience batches to be
        # used.
        if self._by_agent_steps:
            train_batch = synchronous_parallel_sample(
                worker_set=self.workers,
                max_agent_steps=self.config["microbatch_size"])
        else:
            train_batch = synchronous_parallel_sample(
                worker_set=self.workers,
                max_env_steps=self.config["microbatch_size"])
        self._counters[NUM_ENV_STEPS_SAMPLED] += train_batch.env_steps()
        self._counters[NUM_AGENT_STEPS_SAMPLED] += train_batch.agent_steps()

        with self._timers[COMPUTE_GRADS_TIMER]:
            grad, info = self.workers.local_worker().compute_gradients(
                train_batch, single_agent=True)
            # New microbatch accumulation phase.
            if self._microbatches_grads is None:
                self._microbatches_grads = grad
            # Existing gradients: Accumulate new gradients on top of existing ones.
            else:
                for i, g in enumerate(grad):
                    self._microbatches_grads[i] += g
            self._microbatches_counts += train_batch.count
            self._num_microbatches += 1

        # If `train_batch_size` reached: Accumulate gradients and apply.
        num_microbatches = math.ceil(self.config["train_batch_size"] /
                                     self.config["microbatch_size"])
        if self._num_microbatches >= num_microbatches:
            # Update counters.
            self._counters[STEPS_TRAINED_COUNTER] += self._microbatches_counts
            self._counters[
                STEPS_TRAINED_THIS_ITER_COUNTER] = self._microbatches_counts

            # Apply gradients.
            apply_timer = self._timers[APPLY_GRADS_TIMER]
            with apply_timer:
                self.workers.local_worker().apply_gradients(
                    self._microbatches_grads)
                apply_timer.push_units_processed(self._microbatches_counts)

            # Reset microbatch information.
            self._microbatches_grads = None
            self._microbatches_counts = self._num_microbatches = 0

            # Also update global vars of the local worker.
            # Create current global vars.
            global_vars = {
                "timestep": self._counters[NUM_AGENT_STEPS_SAMPLED],
            }
            with self._timers[WORKER_UPDATE_TIMER]:
                self.workers.sync_weights(
                    policies=self.workers.local_worker().get_policies_to_train(
                    ),
                    global_vars=global_vars,
                )

        train_results = {DEFAULT_POLICY_ID: info}

        return train_results
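The number of microbatches accumulated before each gradient application follows directly from the two batch-size settings. A small numeric sketch with made-up values:

    import math

    train_batch_size = 4000
    microbatch_size = 500      # hypothetical values, for illustration only

    num_microbatches = math.ceil(train_batch_size / microbatch_size)
    print(num_microbatches)    # 8 -> gradients applied once every 8 microbatches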
Example #27
 def __setstate__(self, state):
     Trainer.__setstate__(self, state)
     self.state = state["trainer_state"].copy()
     if self.train_pipeline:
         self.train_pipeline.metrics.restore(state["train_pipeline"])
Example #28
 def __getstate__(self):
     state = Trainer.__getstate__(self)
     state["trainer_state"] = self.state.copy()
     if self.train_pipeline:
         state["train_pipeline"] = self.train_pipeline.metrics.save()
     return state
Example #29
 def __setstate__(self, state):
     Trainer.__setstate__(self, state)
     self.state = state["trainer_state"].copy()
Example #30
 def validate_config(self, config: PartialTrainerConfigDict):
     # Call super (Trainer) validation method first.
     Trainer.validate_config(self, config)
     # Then call user defined one, if any.
     if validate_config is not None:
         validate_config(config)