Example #1
 def test_train_external_multi_agent_cartpole_many_policies(self):
     n = 20
     # A throwaway single-agent env, used only to read off the obs/action spaces.
     single_env = gym.make("CartPole-v0")
     act_space = single_env.action_space
     obs_space = single_env.observation_space
     # Build 20 independent PG policies that all share those spaces.
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     ev = RolloutWorker(
         env_creator=lambda _: MultiAgentCartPole({"num_agents": n}),
         policy=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         rollout_fragment_length=100)
     optimizer = SyncSamplesOptimizer(WorkerSet._from_existing(ev))
     for i in range(100):
         optimizer.step()
         result = collect_metrics(ev)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
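Note that the test above is not self-contained as shown; roughly the following imports would be needed. This is a sketch assuming the pre-1.0 RLlib module layout these snippets appear to target; the exact paths moved between Ray releases, so treat them as assumptions.

# Hypothetical import block for the test above; paths are assumptions based on
# the pre-1.0 RLlib layout and may differ in other Ray versions.
import random

import gym

from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
from ray.rllib.evaluation import RolloutWorker
from ray.rllib.evaluation.metrics import collect_metrics
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
from ray.rllib.optimizers import SyncSamplesOptimizer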
Example #2
 def testTrainMultiCartpoleManyPolicies(self):
     n = 20
     env = gym.make("CartPole-v0")
     act_space = env.action_space
     obs_space = env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     ev = PolicyEvaluator(
         env_creator=lambda _: MultiCartpole(n),
         policy_graph=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         batch_steps=100)
     optimizer = SyncSamplesOptimizer(ev, [], {})
     for i in range(100):
         optimizer.step()
         result = collect_metrics(ev)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #3
File: pg.py Project: zdpau/ray-1
class PGAgent(Agent):
    """Simple policy gradient agent.

    This is an example agent to show how to implement algorithms in RLlib.
    In most cases, you will probably want to use the PPO agent instead.
    """

    _agent_name = "PG"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PGPolicyGraph

    @override(Agent)
    def _init(self):
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"])
        optimizer_config = dict(
            self.config["optimizer"],
            **{"train_batch_size": self.config["train_batch_size"]})
        self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                              self.remote_evaluators,
                                              optimizer_config)

    @override(Agent)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        self.optimizer.step()
        result = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result
Example #4
class PGAgent(Agent):
    """Simple policy gradient agent.

    This is an example agent to show how to implement algorithms in RLlib.
    In most cases, you will probably want to use the PPO agent instead.
    """

    _agent_name = "PG"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PGPolicyGraph

    @classmethod
    def default_resource_request(cls, config):
        cf = merge_dicts(cls._default_config, config)
        return Resources(cpu=1, gpu=0, extra_cpu=cf["num_workers"])

    def _init(self):
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"],
            {})
        self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                              self.remote_evaluators,
                                              self.config["optimizer"])

    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        self.optimizer.step()
        result = self.optimizer.collect_metrics()
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result
Example #5
        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)
            if get_initial_state:
                self.state = get_initial_state(self)
            else:
                self.state = {}
            if get_policy_class is None:
                policy = default_policy
            else:
                policy = get_policy_class(config)
            if before_init:
                before_init(self)
            if make_workers:
                self.workers = make_workers(self, env_creator, policy, config)
            else:
                self.workers = self._make_workers(env_creator, policy, config,
                                                  self.config["num_workers"])
            self.train_pipeline = None
            self.optimizer = None

            if training_pipeline:
                self.train_pipeline = training_pipeline(self.workers, config)
            elif make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)
            if after_init:
                after_init(self)
Example #6
class PGAgent(Agent):
    """Simple policy gradient agent.

    This is an example agent to show how to implement algorithms in RLlib.
    In most cases, you will probably want to use the PPO agent instead.
    """

    _agent_name = "PG"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PGPolicyGraph

    @override(Agent)
    def _init(self):
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"])
        optimizer_config = dict(
            self.config["optimizer"],
            **{"train_batch_size": self.config["train_batch_size"]})
        self.optimizer = SyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators, optimizer_config)

    @override(Agent)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        self.optimizer.step()
        result = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result
Example #7
        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)
            if get_initial_state:
                self.state = get_initial_state(self)
            else:
                self.state = {}
            if get_policy_class is None:
                policy = default_policy
            else:
                policy = get_policy_class(config)
            if before_init:
                before_init(self)
            if make_workers:
                self.workers = make_workers(self, env_creator, policy, config)
            else:
                self.workers = self._make_workers(env_creator, policy, config,
                                                  self.config["num_workers"])
            self.train_pipeline = None
            self.optimizer = None

            if training_pipeline and (self.config["use_pipeline_impl"] or
                                      "RLLIB_USE_PIPELINE_IMPL" in os.environ):
                logger.warning("Using experimental pipeline based impl.")
                self.train_pipeline = training_pipeline(self.workers, config)
            elif make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)
            if after_init:
                after_init(self)
Example #8
        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)
            if get_initial_state:
                self.state = get_initial_state(self)
            else:
                self.state = {}
            if get_policy_class is None:
                policy = default_policy
            else:
                policy = get_policy_class(config)
            if before_init:
                before_init(self)
            if make_workers:
                self.workers = make_workers(self, env_creator, policy, config)
            else:
                self.workers = self._make_workers(env_creator, policy, config,
                                                  self.config["num_workers"])
            if make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)

            if after_init:
                after_init(self)
Example #9
 def test_train_multi_cartpole_many_policies(self):
     n = 20
     env = gym.make("CartPole-v0")
     act_space = env.action_space
     obs_space = env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     worker = RolloutWorker(
         env_creator=lambda _: MultiCartpole(n),
         policy=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         batch_steps=100)
     workers = WorkerSet._from_existing(worker, [])
     optimizer = SyncSamplesOptimizer(workers)
     for i in range(100):
         optimizer.step()
         result = collect_metrics(worker)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #10
 def _init(self):
     self.local_evaluator = self.make_local_evaluator(
         self.env_creator, self._policy_graph)
     self.remote_evaluators = self.make_remote_evaluators(
         self.env_creator, self._policy_graph, self.config["num_workers"])
     self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                           self.remote_evaluators,
                                           self.config["optimizer"])
Example #11
 def _init(self, config, env_creator):
     self._validate_config(config)
     self.workers = self._make_workers(env_creator,
                                       self._policy,
                                       config,
                                       num_workers=config["num_workers"])
     self.optimizer = SyncSamplesOptimizer(
         self.workers, train_batch_size=config["train_batch_size"])
Example #12
File: pg.py Project: zdpau/ray-1
 def _init(self):
     self.local_evaluator = self.make_local_evaluator(
         self.env_creator, self._policy_graph)
     self.remote_evaluators = self.make_remote_evaluators(
         self.env_creator, self._policy_graph, self.config["num_workers"])
     optimizer_config = dict(
         self.config["optimizer"],
         **{"train_batch_size": self.config["train_batch_size"]})
     self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                           self.remote_evaluators,
                                           optimizer_config)
Example #13
 def _init(self, config, env_creator):
     if config["use_pytorch"]:
         from ray.rllib.agents.pg.torch_pg_policy_graph import \
             PGTorchPolicyGraph
         policy_cls = PGTorchPolicyGraph
     else:
         policy_cls = self._policy_graph
     self.local_evaluator = self.make_local_evaluator(
         env_creator, policy_cls)
     self.remote_evaluators = self.make_remote_evaluators(
         env_creator, policy_cls, config["num_workers"])
     optimizer_config = dict(
         config["optimizer"],
         **{"train_batch_size": config["train_batch_size"]})
     self.optimizer = SyncSamplesOptimizer(
         self.local_evaluator, self.remote_evaluators, **optimizer_config)
Example #14
class TRPOTrainer(Trainer):
    """Single agent trainer for TRPO."""

    _name = "TRPO"
    _default_config = DEFAULT_CONFIG
    _policy = TRPOTorchPolicy

    # pylint:disable=attribute-defined-outside-init

    @override(Trainer)
    def _init(self, config, env_creator):
        self._validate_config(config)
        self.workers = self._make_workers(env_creator,
                                          self._policy,
                                          config,
                                          num_workers=config["num_workers"])
        self.optimizer = SyncSamplesOptimizer(
            self.workers, train_batch_size=config["train_batch_size"])

    @override(Trainer)
    def _train(self):
        while not self._iteration_done():
            _ = self.optimizer.step()

        res = self.collect_metrics()
        timesteps = self.optimizer.num_steps_sampled - self.global_vars[
            "timestep"]
        res.update(timesteps_this_iter=timesteps, info=res.get("info", {}))
        return res
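The snippet defines the trainer but not how it is driven. Below is a minimal usage sketch, assuming the standard Trainer constructor (a config dict plus an env id) and that TRPOTrainer with its DEFAULT_CONFIG comes from the example's own project; the env name and config keys are placeholders.

# Hedged usage sketch; "CartPole-v0" and {"num_workers": 2} are placeholders.
import ray

ray.init()
trainer = TRPOTrainer(config={"num_workers": 2}, env="CartPole-v0")
for _ in range(10):
    result = trainer.train()
    print(result["episode_reward_mean"])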
Example #15
 def _init(self):
     self.optimizer = SyncSamplesOptimizer.make(
         evaluator_cls=CommonPolicyEvaluator,
         evaluator_args={
             "env_creator": self.env_creator,
             "policy_graph": (self.config["multiagent"]["policy_graphs"]
                              or PGPolicyGraph),
             "policy_mapping_fn": self.config["multiagent"]["policy_mapping_fn"],
             "batch_steps": self.config["batch_size"],
             "batch_mode": "truncate_episodes",
             "model_config": self.config["model"],
             "env_config": self.config["env_config"],
             "policy_config": self.config,
             "num_envs": self.config["num_envs"],
         },
         num_workers=self.config["num_workers"],
         optimizer_config=self.config["optimizer"])
Example #16
File: ppo.py Project: xlnwel/ray
 def _init(self):
     waste_ratio = (self.config["sample_batch_size"] *
                    self.config["num_workers"] /
                    self.config["train_batch_size"])
     if waste_ratio > 1:
         msg = ("sample_batch_size * num_workers >> train_batch_size. "
                "This means that many steps will be discarded. Consider "
                "reducing sample_batch_size, or increase train_batch_size.")
         if waste_ratio > 1.5:
             raise ValueError(msg)
         else:
             print("Warning: " + msg)
     self.local_evaluator = self.make_local_evaluator(
         self.env_creator, self._policy_graph)
     self.remote_evaluators = self.make_remote_evaluators(
         self.env_creator, self._policy_graph, self.config["num_workers"], {
             "num_cpus": self.config["num_cpus_per_worker"],
             "num_gpus": self.config["num_gpus_per_worker"]
         })
     if self.config["simple_optimizer"]:
         self.optimizer = SyncSamplesOptimizer(
             self.local_evaluator, self.remote_evaluators, {
                 "num_sgd_iter": self.config["num_sgd_iter"],
                 "train_batch_size": self.config["train_batch_size"]
             })
     else:
         self.optimizer = LocalMultiGPUOptimizer(
             self.local_evaluator, self.remote_evaluators, {
                 "sgd_batch_size": self.config["sgd_minibatch_size"],
                 "num_sgd_iter": self.config["num_sgd_iter"],
                 "num_gpus": self.config["num_gpus"],
                 "train_batch_size": self.config["train_batch_size"],
                 "standardize_fields": ["advantages"],
             })
Example #17
 def _init(self):
     self._validate_config()
     self.local_evaluator = self.make_local_evaluator(
         self.env_creator, self._policy_graph)
     self.remote_evaluators = self.make_remote_evaluators(
         self.env_creator, self._policy_graph, self.config["num_workers"])
     if self.config["simple_optimizer"]:
         self.optimizer = SyncSamplesOptimizer(
             self.local_evaluator, self.remote_evaluators, {
                 "num_sgd_iter": self.config["num_sgd_iter"],
                 "train_batch_size": self.config["train_batch_size"],
             })
     else:
         self.optimizer = LocalMultiGPUOptimizer(
             self.local_evaluator, self.remote_evaluators, {
                 "sgd_batch_size": self.config["sgd_minibatch_size"],
                 "num_sgd_iter": self.config["num_sgd_iter"],
                 "num_gpus": self.config["num_gpus"],
                 "sample_batch_size": self.config["sample_batch_size"],
                 "num_envs_per_worker": self.config["num_envs_per_worker"],
                 "train_batch_size": self.config["train_batch_size"],
                 "standardize_fields": ["advantages"],
                 "straggler_mitigation":
                 (self.config["straggler_mitigation"]),
             })
Example #18
        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)

            if get_initial_state:
                self.state = get_initial_state(self)
            else:
                self.state = {}

            # Override default policy if `get_policy_class` is provided.
            if get_policy_class is not None:
                self._policy = get_policy_class(config)

            if before_init:
                before_init(self)
            use_exec_api = (execution_plan
                            and (self.config["use_exec_api"]
                                 or "RLLIB_EXEC_API" in os.environ))

            # Creating all workers (excluding evaluation workers).
            if make_workers and not use_exec_api:
                self.workers = make_workers(self, env_creator, self._policy,
                                            config)
            else:
                self.workers = self._make_workers(env_creator, self._policy,
                                                  config,
                                                  self.config["num_workers"])
            self.train_exec_impl = None
            self.optimizer = None
            self.execution_plan = execution_plan

            if use_exec_api:
                logger.warning(
                    "The experimental distributed execution API is enabled "
                    "for this algorithm. Disable this by setting "
                    "'use_exec_api': False.")
                self.train_exec_impl = execution_plan(self.workers, config)
            elif make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)
            if after_init:
                after_init(self)
Example #19
File: a2c.py Project: zqxyz73/ray
def choose_policy_optimizer(workers, config):
    if config["microbatch_size"]:
        return MicrobatchOptimizer(workers,
                                   train_batch_size=config["train_batch_size"],
                                   microbatch_size=config["microbatch_size"])
    else:
        return SyncSamplesOptimizer(
            workers, train_batch_size=config["train_batch_size"])
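Factory functions like this are normally handed to RLlib's trainer template, whose generated _init calls them through the make_policy_optimizer hook visible in the trainer_cls examples elsewhere on this page. The wiring sketch below is not the literal A2C definition; build_trainer, its keyword names, and the A3CTFPolicy import path are assumptions based on the pre-1.0 layout.

from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
from ray.rllib.agents.trainer import COMMON_CONFIG
from ray.rllib.agents.trainer_template import build_trainer

# Hypothetical trainer built around the optimizer factory above. A real
# definition would supply a default_config with every key the factory reads;
# train_batch_size is already in COMMON_CONFIG, microbatch_size is not.
MyA2CTrainer = build_trainer(
    name="MyA2C",
    default_config=dict(COMMON_CONFIG, microbatch_size=None),
    default_policy=A3CTFPolicy,
    make_policy_optimizer=choose_policy_optimizer)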
Example #20
 def _init(self, config, env_creator):
     if validate_config:
         validate_config(config)
     if get_policy_class is None:
         policy = default_policy
     else:
         policy = get_policy_class(config)
     self.workers = self._make_workers(env_creator, policy, config,
                                       self.config["num_workers"])
     if make_policy_optimizer:
         self.optimizer = make_policy_optimizer(self.workers, config)
     else:
         optimizer_config = dict(
             config["optimizer"],
             **{"train_batch_size": config["train_batch_size"]})
         self.optimizer = SyncSamplesOptimizer(self.workers,
                                               **optimizer_config)
Example #21
 def _init(self):
     self.local_evaluator = self.make_local_evaluator(
         self.env_creator, self._policy_graph)
     self.remote_evaluators = self.make_remote_evaluators(
         self.env_creator, self._policy_graph, self.config["num_workers"])
     optimizer_config = dict(
         self.config["optimizer"],
         **{"train_batch_size": self.config["train_batch_size"]})
     self.optimizer = SyncSamplesOptimizer(
         self.local_evaluator, self.remote_evaluators, optimizer_config)
Example #22
        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)

            if get_initial_state:
                self.state = get_initial_state(self)
            else:
                self.state = {}
            if get_policy_class is None:
                self._policy = default_policy
            else:
                self._policy = get_policy_class(config)
            if before_init:
                before_init(self)
            use_exec_api = (execution_plan
                            and (self.config["use_exec_api"]
                                 or "RLLIB_EXEC_API" in os.environ))

            # Creating all workers (excluding evaluation workers).
            if make_workers and not use_exec_api:
                self.workers = make_workers(self, env_creator, self._policy,
                                            config)
            else:
                self.workers = self._make_workers(env_creator, self._policy,
                                                  config,
                                                  self.config["num_workers"])
            self.train_exec_impl = None
            self.optimizer = None
            self.execution_plan = execution_plan

            if use_exec_api:
                self.train_exec_impl = execution_plan(self.workers, config)
            elif make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)
            if after_init:
                after_init(self)
Example #23
    def _init(self, config, env_creator):
        # Random seed
        seed = config['seed']
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.env_config = config['env_config']
        self.num_sgd_iter = config['num_sgd_iter']
        self.num_workers = config['num_workers']
        self.sgd_minibatch_size = config['sgd_minibatch_size']
        self.train_batch_size = config['train_batch_size']

        # Set up workers
        policy_cls = policy_options[config['policy']]
        self.workers = self._make_workers(env_creator, policy_cls, config,
                                          self.num_workers)
        self.optimizer = SyncSamplesOptimizer(
            self.workers,
            num_sgd_iter=self.num_sgd_iter,
            train_batch_size=self.train_batch_size,
            sgd_minibatch_size=self.sgd_minibatch_size)
Example #24
def choose_policy_optimizer(workers, config):
    if config["simple_optimizer"]:
        return SyncSamplesOptimizer(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            train_batch_size=config["train_batch_size"])
    else:
        return SyncBatchesReplayOptimizer(
            workers,
            num_gradient_descents=config["num_sgd_iter"],
            learning_starts=config["learning_starts"],
            train_batch_size=config["train_batch_size"],
            buffer_size=config["buffer_size"])
Example #25
 def _init(self, config, env_creator):
     if validate_config:
         validate_config(config)
     if get_policy_class is None:
         policy_graph = default_policy
     else:
         policy_graph = get_policy_class(config)
     self.local_evaluator = self.make_local_evaluator(
         env_creator, policy_graph)
     self.remote_evaluators = self.make_remote_evaluators(
         env_creator, policy_graph, config["num_workers"])
     if make_policy_optimizer:
         self.optimizer = make_policy_optimizer(self.local_evaluator,
                                                self.remote_evaluators,
                                                config)
     else:
         optimizer_config = dict(
             config["optimizer"],
             **{"train_batch_size": config["train_batch_size"]})
         self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                               self.remote_evaluators,
                                               **optimizer_config)
Example #26
class PGTrainer(Trainer):
    """Simple policy gradient agent.

    This is an example agent to show how to implement algorithms in RLlib.
    In most cases, you will probably want to use the PPO agent instead.
    """

    _name = "PG"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PGPolicyGraph

    @override(Trainer)
    def _init(self, config, env_creator):
        if config["use_pytorch"]:
            from ray.rllib.agents.pg.torch_pg_policy_graph import \
                PGTorchPolicyGraph
            policy_cls = PGTorchPolicyGraph
        else:
            policy_cls = self._policy_graph
        self.local_evaluator = self.make_local_evaluator(
            env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            env_creator, policy_cls, config["num_workers"])
        optimizer_config = dict(
            config["optimizer"],
            **{"train_batch_size": config["train_batch_size"]})
        self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                              self.remote_evaluators,
                                              optimizer_config)

    @override(Trainer)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        self.optimizer.step()
        result = self.collect_metrics()
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result
Example #27
class PPOTrainer(Trainer):

    _name = "PPO"
    _default_config = DEFAULT_CONFIG

    def _init(self, config, env_creator):
        # Random seed
        seed = config['seed']
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.env_config = config['env_config']
        self.num_sgd_iter = config['num_sgd_iter']
        self.num_workers = config['num_workers']
        self.sgd_minibatch_size = config['sgd_minibatch_size']
        self.train_batch_size = config['train_batch_size']

        # Set up workers
        policy_cls = policy_options[config['policy']]
        self.workers = self._make_workers(env_creator, policy_cls, config,
                                          self.num_workers)
        self.optimizer = SyncSamplesOptimizer(
            self.workers,
            num_sgd_iter=self.num_sgd_iter,
            train_batch_size=self.train_batch_size,
            sgd_minibatch_size=self.sgd_minibatch_size)

    def _train(self):
        self.optimizer.step()

        res = dict(timesteps_this_iter=self.optimizer.num_steps_sampled,
                   info=self.optimizer.stats())
        return res

    def evaluate(self):
        return self.workers.local_worker().sample()
Example #28
def choose_policy_optimizer(workers, config):
    if config["simple_optimizer"]:
        return SyncSamplesOptimizer(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            train_batch_size=config["train_batch_size"])

    return LocalMultiGPUOptimizer(
        workers,
        sgd_batch_size=config["sgd_minibatch_size"],
        num_sgd_iter=config["num_sgd_iter"],
        num_gpus=config["num_gpus"],
        sample_batch_size=config["sample_batch_size"],
        num_envs_per_worker=config["num_envs_per_worker"],
        train_batch_size=config["train_batch_size"],
        standardize_fields=["advantages"],
        shuffle_sequences=config["shuffle_sequences"])
Example #29
def choose_policy_optimizer(workers, config):
    if config["distributed_data_parallel_optimizer"]:
        if not config["use_pytorch"]:
            raise ValueError(
                "Distributed data parallel is only supported for PyTorch")
        if config["num_gpus"]:
            raise ValueError(
                "When using distributed data parallel, you should set "
                "num_gpus=0 since all optimization "
                "is happening on workers. Enable GPUs for workers by setting "
                "num_gpus_per_worker=1.")
        if config["batch_mode"] != "truncate_episodes":
            raise ValueError(
                "Distributed data parallel requires truncate_episodes "
                "batch mode.")
        if config["sample_batch_size"] != config["train_batch_size"]:
            raise ValueError(
                "Distributed data parallel requires sample_batch_size to be "
                "equal to train_batch_size. Each worker will sample and learn "
                "on train_batch_size samples per iteration.")

        return TorchDistributedDataParallelOptimizer(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            train_batch_size=config["train_batch_size"],
            sgd_minibatch_size=config["sgd_minibatch_size"],
            standardize_fields=["advantages"])

    if config["simple_optimizer"]:
        return SyncSamplesOptimizer(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            train_batch_size=config["train_batch_size"],
            sgd_minibatch_size=config["sgd_minibatch_size"],
            standardize_fields=["advantages"])

    return LocalMultiGPUOptimizer(
        workers,
        sgd_batch_size=config["sgd_minibatch_size"],
        num_sgd_iter=config["num_sgd_iter"],
        num_gpus=config["num_gpus"],
        sample_batch_size=config["sample_batch_size"],
        num_envs_per_worker=config["num_envs_per_worker"],
        train_batch_size=config["train_batch_size"],
        standardize_fields=["advantages"],
        shuffle_sequences=config["shuffle_sequences"])
Example #30
    class trainer_cls(Trainer):
        _name = name
        _default_config = default_config or COMMON_CONFIG
        _policy = default_policy

        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)
            if get_policy_class is None:
                policy = default_policy
            else:
                policy = get_policy_class(config)
            self.workers = self._make_workers(env_creator, policy, config,
                                              self.config["num_workers"])
            if make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)

        @override(Trainer)
        def _train(self):
            if before_train_step:
                before_train_step(self)
            prev_steps = self.optimizer.num_steps_sampled

            start = time.time()
            while True:
                fetches = self.optimizer.step()
                if after_optimizer_step:
                    after_optimizer_step(self, fetches)
                if time.time() - start > self.config["min_iter_time_s"]:
                    break

            res = self.collect_metrics()
            res.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                       prev_steps,
                       info=res.get("info", {}))
            if after_train_result:
                after_train_result(self, res)
            return res
Example #31
def make_optimizer(local_evaluator, remote_evaluators, config):
    if config["simple_optimizer"]:
        return SyncSamplesOptimizer(
            local_evaluator,
            remote_evaluators,
            num_sgd_iter=config["num_sgd_iter"],
            train_batch_size=config["train_batch_size"])

    return LocalMultiGPUOptimizer(
        local_evaluator,
        remote_evaluators,
        sgd_batch_size=config["sgd_minibatch_size"],
        num_sgd_iter=config["num_sgd_iter"],
        num_gpus=config["num_gpus"],
        sample_batch_size=config["sample_batch_size"],
        num_envs_per_worker=config["num_envs_per_worker"],
        train_batch_size=config["train_batch_size"],
        standardize_fields=["advantages"],
        straggler_mitigation=config["straggler_mitigation"])
Example #32
    class trainer_cls(Trainer):
        _name = name
        _default_config = default_config or Trainer.COMMON_CONFIG
        _policy_graph = default_policy

        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)
            if get_policy_class is None:
                policy_graph = default_policy
            else:
                policy_graph = get_policy_class(config)
            self.local_evaluator = self.make_local_evaluator(
                env_creator, policy_graph)
            self.remote_evaluators = self.make_remote_evaluators(
                env_creator, policy_graph, config["num_workers"])
            if make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.local_evaluator,
                                                       self.remote_evaluators,
                                                       config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                                      self.remote_evaluators,
                                                      **optimizer_config)

        @override(Trainer)
        def _train(self):
            if before_train_step:
                before_train_step(self)
            prev_steps = self.optimizer.num_steps_sampled
            fetches = self.optimizer.step()
            if after_optimizer_step:
                after_optimizer_step(self, fetches)
            res = self.collect_metrics()
            res.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                       prev_steps,
                       info=res.get("info", {}))
            if after_train_result:
                after_train_result(self, res)
            return res