class PPOAgent(Agent):
    """Multi-GPU optimized implementation of PPO in TensorFlow."""

    _agent_name = "PPO"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraph

    @override(Agent)
    def _init(self):
        """Validate the config, then build evaluators and the optimizer."""
        self._validate_config()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"])
        if self.config["simple_optimizer"]:
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "train_batch_size": self.config["train_batch_size"],
                })
        else:
            self.optimizer = LocalMultiGPUOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "sgd_batch_size": self.config["sgd_minibatch_size"],
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "num_gpus": self.config["num_gpus"],
                    "train_batch_size": self.config["train_batch_size"],
                    "standardize_fields": ["advantages"],
                })

    @override(Agent)
    def _train(self):
        """Run one optimization round and feed KL stats back to policies.

        Returns the optimizer's metrics dict, augmented with the number of
        timesteps sampled this iteration and the raw training fetches.
        """
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        if "kl" in fetches:
            # single-agent
            self.local_evaluator.for_policy(
                lambda pi: pi.update_kl(fetches["kl"]))
        else:
            # multi-agent
            self.local_evaluator.foreach_trainable_policy(
                lambda pi, pi_id: pi.update_kl(fetches[pi_id]["kl"]))
        res = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        res.update(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
            info=dict(fetches, **res.get("info", {})))
        return res

    def _validate_config(self):
        """Raise ValueError (or warn) on inconsistent PPO configurations."""
        # Fraction of sampled steps that can actually be consumed per train
        # batch; > 1 means some sampled steps are discarded.
        waste_ratio = (self.config["sample_batch_size"] *
                       self.config["num_workers"] /
                       self.config["train_batch_size"])
        if waste_ratio > 1:
            msg = ("sample_batch_size * num_workers >> train_batch_size. "
                   "This means that many steps will be discarded. Consider "
                   "reducing sample_batch_size, or increase train_batch_size.")
            if waste_ratio > 1.5:
                raise ValueError(msg)
            else:
                # Fixed: logger.warn is a deprecated alias of logger.warning.
                logger.warning(msg)
        if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]:
            raise ValueError(
                "Minibatch size {} must be <= train batch size {}.".format(
                    self.config["sgd_minibatch_size"],
                    self.config["train_batch_size"]))
        if (self.config["batch_mode"] == "truncate_episodes"
                and not self.config["use_gae"]):
            raise ValueError(
                "Episode truncation is not supported without a value function")
        if (self.config["multiagent"]["policy_graphs"]
                and not self.config["simple_optimizer"]):
            logger.info(
                "In multi-agent mode, policies will be optimized sequentially "
                "by the multi-GPU optimizer. Consider setting "
                "simple_optimizer=True if this doesn't work for you.")
        if self.config["observation_filter"] != "NoFilter":
            # TODO(ekl): consider setting the default to be NoFilter
            # Fixed: logger.warn is a deprecated alias of logger.warning.
            logger.warning(
                "By default, observations will be normalized with {}".format(
                    self.config["observation_filter"]))
class PPOAgent(Agent):
    """Multi-GPU optimized implementation of PPO in TensorFlow."""

    _agent_name = "PPO"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraph

    @classmethod
    def default_resource_request(cls, config):
        """Return the Ray resources this agent needs, from the merged config."""
        cf = merge_dicts(cls._default_config, config)
        return Resources(
            cpu=1,
            gpu=cf["num_gpus"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    def _init(self):
        """Build local/remote evaluators and the sample optimizer."""
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"], {
                "num_cpus": self.config["num_cpus_per_worker"],
                "num_gpus": self.config["num_gpus_per_worker"]
            })
        if self.config["simple_optimizer"]:
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "timesteps_per_batch": self.config["timesteps_per_batch"]
                })
        else:
            self.optimizer = LocalMultiGPUOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "sgd_batch_size": self.config["sgd_batchsize"],
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "num_gpus": self.config["num_gpus"],
                    "timesteps_per_batch": self.config["timesteps_per_batch"],
                    "standardize_fields": ["advantages"],
                })

    def _train(self):
        """Run one optimization round and feed KL stats back to policies."""
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        if "kl" in fetches:
            # single-agent
            self.local_evaluator.for_policy(
                lambda pi: pi.update_kl(fetches["kl"]))
        else:
            # multi-agent
            self.local_evaluator.foreach_trainable_policy(
                lambda pi, pi_id: pi.update_kl(fetches[pi_id]["kl"]))
        res = self.optimizer.collect_metrics()
        res.update(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
            info=dict(fetches, **res.get("info", {})))
        return res

    def _stop(self):
        # workaround for https://github.com/ray-project/ray/issues/1516
        for ev in self.remote_evaluators:
            ev.__ray_terminate__.remote()

    def _save(self, checkpoint_dir):
        """Persist evaluator state next to the checkpoint; return its path."""
        checkpoint_path = os.path.join(checkpoint_dir,
                                       "checkpoint-{}".format(self.iteration))
        agent_state = ray.get(
            [a.save.remote() for a in self.remote_evaluators])
        extra_data = [self.local_evaluator.save(), agent_state]
        # Fixed: the original passed an anonymous open() to pickle.dump,
        # leaking the file handle; a context manager guarantees it is closed.
        with open(checkpoint_path + ".extra_data", "wb") as f:
            pickle.dump(extra_data, f)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        """Restore local and remote evaluator state from a checkpoint."""
        # Fixed: same file-handle leak as in _save; close the file promptly.
        with open(checkpoint_path + ".extra_data", "rb") as f:
            extra_data = pickle.load(f)
        self.local_evaluator.restore(extra_data[0])
        ray.get([
            a.restore.remote(o)
            for (a, o) in zip(self.remote_evaluators, extra_data[1])
        ])
class PPOAgent(Agent):
    """Multi-GPU optimized implementation of PPO in TensorFlow."""

    _agent_name = "PPO"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraph

    @override(Agent)
    def _init(self):
        # Fail fast on bad configs before any evaluators are created.
        self._validate_config()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"])
        if self.config["simple_optimizer"]:
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "train_batch_size": self.config["train_batch_size"],
                })
        else:
            self.optimizer = LocalMultiGPUOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "sgd_batch_size": self.config["sgd_minibatch_size"],
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "num_gpus": self.config["num_gpus"],
                    "sample_batch_size": self.config["sample_batch_size"],
                    "num_envs_per_worker": self.config["num_envs_per_worker"],
                    "train_batch_size": self.config["train_batch_size"],
                    "standardize_fields": ["advantages"],
                    "straggler_mitigation": (
                        self.config["straggler_mitigation"]),
                })

    @override(Agent)
    def _train(self):
        # One-time heads-up for users upgrading across the 0.7.0 default
        # change; only shown if the user did not set the option themselves.
        if "observation_filter" not in self.raw_user_config:
            # TODO(ekl) remove this message after a few releases
            logger.info(
                "Important! Since 0.7.0, observation normalization is no "
                "longer enabled by default. To enable running-mean "
                "normalization, set 'observation_filter': 'MeanStdFilter'. "
                "You can ignore this message if your environment doesn't "
                "require observation normalization.")
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        if "kl" in fetches:
            # single-agent
            self.local_evaluator.for_policy(
                lambda pi: pi.update_kl(fetches["kl"]))
        else:

            def update(pi, pi_id):
                # A policy may have collected no samples this round; skip
                # its KL update instead of raising KeyError.
                if pi_id in fetches:
                    pi.update_kl(fetches[pi_id]["kl"])
                else:
                    logger.debug(
                        "No data for {}, not updating kl".format(pi_id))

            # multi-agent
            self.local_evaluator.foreach_trainable_policy(update)
        res = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        res.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                   prev_steps,
                   info=dict(fetches, **res.get("info", {})))
        # Warn about bad clipping configs
        if self.config["vf_clip_param"] <= 0:
            rew_scale = float("inf")
        elif res["policy_reward_mean"]:
            rew_scale = 0  # punt on handling multiagent case
        else:
            rew_scale = round(
                abs(res["episode_reward_mean"]) /
                self.config["vf_clip_param"], 0)
        if rew_scale > 100:
            logger.warning(
                "The magnitude of your environment rewards are more than "
                "{}x the scale of `vf_clip_param`. ".format(rew_scale) +
                "This means that it will take more than "
                "{} iterations for your value ".format(rew_scale) +
                "function to converge. If this is not intended, consider "
                "increasing `vf_clip_param`.")
        return res

    def _validate_config(self):
        if self.config["entropy_coeff"] < 0:
            # NOTE(review): this raises a DeprecationWarning as an exception
            # rather than warnings.warn-ing it — presumably intentional to
            # hard-fail on the removed negative-coeff behavior; confirm.
            raise DeprecationWarning("entropy_coeff must be >= 0")
        if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]:
            raise ValueError(
                "Minibatch size {} must be <= train batch size {}.".format(
                    self.config["sgd_minibatch_size"],
                    self.config["train_batch_size"]))
        if (self.config["batch_mode"] == "truncate_episodes"
                and not self.config["use_gae"]):
            raise ValueError(
                "Episode truncation is not supported without a value "
                "function. Consider setting batch_mode=complete_episodes.")
        if (self.config["multiagent"]["policy_graphs"]
                and not self.config["simple_optimizer"]):
            logger.info(
                "In multi-agent mode, policies will be optimized sequentially "
                "by the multi-GPU optimizer. Consider setting "
                "simple_optimizer=True if this doesn't work for you.")
        if not self.config["vf_share_layers"]:
            logger.warning(
                "FYI: By default, the value function will not share layers "
                "with the policy model ('vf_share_layers': False).")
class PPOAgent(Agent):
    """Multi-GPU optimized implementation of PPO in TensorFlow."""

    _agent_name = "PPO"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraph

    @classmethod
    def default_resource_request(cls, config):
        """Return the Ray resources this agent needs, from the merged config."""
        cf = merge_dicts(cls._default_config, config)
        return Resources(
            cpu=1,
            gpu=cf["num_gpus"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    def _init(self):
        """Validate the config, then build evaluators and the optimizer."""
        self._validate_config()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"], {
                "num_cpus": self.config["num_cpus_per_worker"],
                "num_gpus": self.config["num_gpus_per_worker"]
            })
        if self.config["simple_optimizer"]:
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "train_batch_size": self.config["train_batch_size"],
                })
        else:
            self.optimizer = LocalMultiGPUOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "sgd_batch_size": self.config["sgd_minibatch_size"],
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "num_gpus": self.config["num_gpus"],
                    "train_batch_size": self.config["train_batch_size"],
                    "standardize_fields": ["advantages"],
                })

    def _validate_config(self):
        """Raise ValueError (or warn) on inconsistent PPO configurations."""
        # Fraction of sampled steps that can actually be consumed per train
        # batch; > 1 means some sampled steps are discarded.
        waste_ratio = (
            self.config["sample_batch_size"] * self.config["num_workers"] /
            self.config["train_batch_size"])
        if waste_ratio > 1:
            msg = ("sample_batch_size * num_workers >> train_batch_size. "
                   "This means that many steps will be discarded. Consider "
                   "reducing sample_batch_size, or increase train_batch_size.")
            if waste_ratio > 1.5:
                raise ValueError(msg)
            else:
                # Fixed: route the warning through the module logger (as the
                # other agents in this file do) instead of a bare print, so
                # it respects the user's logging configuration.
                logger.warning(msg)
        if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]:
            raise ValueError(
                "Minibatch size {} must be <= train batch size {}.".format(
                    self.config["sgd_minibatch_size"],
                    self.config["train_batch_size"]))
        if (self.config["batch_mode"] == "truncate_episodes"
                and not self.config["use_gae"]):
            raise ValueError(
                "Episode truncation is not supported without a value function")

    def _train(self):
        """Run one optimization round and feed KL stats back to policies."""
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        if "kl" in fetches:
            # single-agent
            self.local_evaluator.for_policy(
                lambda pi: pi.update_kl(fetches["kl"]))
        else:
            # multi-agent
            self.local_evaluator.foreach_trainable_policy(
                lambda pi, pi_id: pi.update_kl(fetches[pi_id]["kl"]))
        res = self.optimizer.collect_metrics()
        res.update(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
            info=dict(fetches, **res.get("info", {})))
        return res
class PPOAgentICM(Agent):
    """Multi-GPU optimized implementation of PPO in TensorFlow.

    ICM variant: identical training loop to PPOAgent, but built on
    PPOPolicyGraphICM and registered under the "PPO_ICM" agent name.
    """

    _agent_name = "PPO_ICM"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraphICM

    @classmethod
    def default_resource_request(cls, config):
        """Return the Ray resources this agent needs, from the merged config."""
        cf = merge_dicts(cls._default_config, config)
        return Resources(
            cpu=1,
            gpu=cf["num_gpus"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    def _init(self):
        """Validate the config, then build evaluators and the optimizer."""
        self._validate_config()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"], {
                "num_cpus": self.config["num_cpus_per_worker"],
                "num_gpus": self.config["num_gpus_per_worker"]
            })
        if self.config["simple_optimizer"]:
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "train_batch_size": self.config["train_batch_size"]
                })
        else:
            self.optimizer = LocalMultiGPUOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "sgd_batch_size": self.config["sgd_minibatch_size"],
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "num_gpus": self.config["num_gpus"],
                    "train_batch_size": self.config["train_batch_size"],
                    "standardize_fields": ["advantages"],
                })

    def _validate_config(self):
        """Raise ValueError (or warn) on inconsistent PPO configurations."""
        # Fraction of sampled steps that can actually be consumed per train
        # batch; > 1 means some sampled steps are discarded.
        waste_ratio = (self.config["sample_batch_size"] *
                       self.config["num_workers"] /
                       self.config["train_batch_size"])
        if waste_ratio > 1:
            msg = ("sample_batch_size * num_workers >> train_batch_size. "
                   "This means that many steps will be discarded. Consider "
                   "reducing sample_batch_size, or increase train_batch_size.")
            if waste_ratio > 1.5:
                raise ValueError(msg)
            else:
                # Fixed: route the warning through the module logger (as the
                # other agents in this file do) instead of a bare print.
                logger.warning(msg)
        if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]:
            raise ValueError(
                "Minibatch size {} must be <= train batch size {}.".format(
                    self.config["sgd_minibatch_size"],
                    self.config["train_batch_size"]))
        if (self.config["batch_mode"] == "truncate_episodes"
                and not self.config["use_gae"]):
            raise ValueError(
                "Episode truncation is not supported without a value function")

    def _train(self):
        """Run one optimization round and feed KL stats back to policies."""
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        if "kl" in fetches:
            # single-agent
            self.local_evaluator.for_policy(
                lambda pi: pi.update_kl(fetches["kl"]))
        else:
            # multi-agent
            self.local_evaluator.foreach_trainable_policy(
                lambda pi, pi_id: pi.update_kl(fetches[pi_id]["kl"]))
        res = self.optimizer.collect_metrics()
        res.update(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
            info=dict(fetches, **res.get("info", {})))
        return res

    def _stop(self):
        # workaround for https://github.com/ray-project/ray/issues/1516
        for ev in self.remote_evaluators:
            ev.__ray_terminate__.remote()

    def _save(self, checkpoint_dir):
        """Persist evaluator state next to the checkpoint; return its path."""
        checkpoint_path = os.path.join(checkpoint_dir,
                                       "checkpoint-{}".format(self.iteration))
        agent_state = ray.get(
            [a.save.remote() for a in self.remote_evaluators])
        extra_data = [self.local_evaluator.save(), agent_state]
        # Fixed: the original passed an anonymous open() to pickle.dump,
        # leaking the file handle; a context manager guarantees it is closed.
        with open(checkpoint_path + ".extra_data", "wb") as f:
            pickle.dump(extra_data, f)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        """Restore local and remote evaluator state from a checkpoint."""
        # Fixed: same file-handle leak as in _save; close the file promptly.
        with open(checkpoint_path + ".extra_data", "rb") as f:
            extra_data = pickle.load(f)
        self.local_evaluator.restore(extra_data[0])
        ray.get([
            a.restore.remote(o)
            for (a, o) in zip(self.remote_evaluators, extra_data[1])
        ])
class PPOAgent(Agent):
    """Multi-GPU optimized implementation of PPO in TensorFlow."""

    _agent_name = "PPO"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraph

    @override(Agent)
    def _init(self):
        # Fail fast on bad configs before any evaluators are created.
        self._validate_config()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"])
        if self.config["simple_optimizer"]:
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "train_batch_size": self.config["train_batch_size"],
                })
        else:
            self.optimizer = LocalMultiGPUOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "sgd_batch_size": self.config["sgd_minibatch_size"],
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "num_gpus": self.config["num_gpus"],
                    "sample_batch_size": self.config["sample_batch_size"],
                    "num_envs_per_worker": self.config["num_envs_per_worker"],
                    "train_batch_size": self.config["train_batch_size"],
                    "standardize_fields": ["advantages"],
                    "straggler_mitigation": (
                        self.config["straggler_mitigation"]),
                })

    @override(Agent)
    def _train(self):
        # One-time heads-up for users upgrading across the 0.7.0 default
        # change; only shown if the user did not set the option themselves.
        if "observation_filter" not in self.raw_user_config:
            # TODO(ekl) remove this message after a few releases
            logger.info(
                "Important! Since 0.7.0, observation normalization is no "
                "longer enabled by default. To enable running-mean "
                "normalization, set 'observation_filter': 'MeanStdFilter'. "
                "You can ignore this message if your environment doesn't "
                "require observation normalization.")
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        if "kl" in fetches:
            # single-agent
            self.local_evaluator.for_policy(
                lambda pi: pi.update_kl(fetches["kl"]))
        else:

            def update(pi, pi_id):
                # A policy may have collected no samples this round; skip
                # its KL update instead of raising KeyError.
                if pi_id in fetches:
                    pi.update_kl(fetches[pi_id]["kl"])
                else:
                    logger.debug(
                        "No data for {}, not updating kl".format(pi_id))

            # multi-agent
            self.local_evaluator.foreach_trainable_policy(update)
        res = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        res.update(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
            info=dict(fetches, **res.get("info", {})))
        # Warn about bad clipping configs
        if self.config["vf_clip_param"] <= 0:
            rew_scale = float("inf")
        elif res["policy_reward_mean"]:
            rew_scale = 0  # punt on handling multiagent case
        else:
            rew_scale = round(
                abs(res["episode_reward_mean"]) /
                self.config["vf_clip_param"], 0)
        if rew_scale > 100:
            logger.warning(
                "The magnitude of your environment rewards are more than "
                "{}x the scale of `vf_clip_param`. ".format(rew_scale) +
                "This means that it will take more than "
                "{} iterations for your value ".format(rew_scale) +
                "function to converge. If this is not intended, consider "
                "increasing `vf_clip_param`.")
        return res

    def _validate_config(self):
        if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]:
            raise ValueError(
                "Minibatch size {} must be <= train batch size {}.".format(
                    self.config["sgd_minibatch_size"],
                    self.config["train_batch_size"]))
        if (self.config["batch_mode"] == "truncate_episodes"
                and not self.config["use_gae"]):
            raise ValueError(
                "Episode truncation is not supported without a value "
                "function. Consider setting batch_mode=complete_episodes.")
        if (self.config["multiagent"]["policy_graphs"]
                and not self.config["simple_optimizer"]):
            logger.info(
                "In multi-agent mode, policies will be optimized sequentially "
                "by the multi-GPU optimizer. Consider setting "
                "simple_optimizer=True if this doesn't work for you.")
        if not self.config["vf_share_layers"]:
            logger.warning(
                "FYI: By default, the value function will not share layers "
                "with the policy model ('vf_share_layers': False).")