def __init__(self, env, config, logdir):
    self.env = env
    num_actions = env.action_space.n
    optimizer = tf.train.AdamOptimizer(learning_rate=config["lr"])

    # Action inputs
    self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
    self.eps = tf.placeholder(tf.float32, (), name="eps")
    self.cur_observations = tf.placeholder(
        tf.float32, shape=(None,) + env.observation_space.shape)

    # Action Q network
    if config["multi_gpu_optimize"]:
        q_scope_name = TOWER_SCOPE_NAME + "/q_func"
    else:
        q_scope_name = "q_func"
    with tf.variable_scope(q_scope_name) as scope:
        q_values = _build_q_network(
            self.cur_observations, num_actions, config)
        q_func_vars = _scope_vars(scope.name)

    # Action outputs
    self.output_actions = _build_action_network(
        q_values, self.cur_observations, num_actions, self.stochastic,
        self.eps)

    # Replay inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=(None,) + env.observation_space.shape)
    self.act_t = tf.placeholder(tf.int32, [None], name="action")
    self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
    self.obs_tp1 = tf.placeholder(
        tf.float32, shape=(None,) + env.observation_space.shape)
    self.done_mask = tf.placeholder(tf.float32, [None], name="done")
    self.importance_weights = tf.placeholder(
        tf.float32, [None], name="weight")

    def build_loss(
            obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
        return ModelAndLoss(
            num_actions, config,
            obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights)

    if config["multi_gpu_optimize"]:
        self.multi_gpu_optimizer = LocalSyncParallelOptimizer(
            optimizer,
            config["devices"],
            [self.obs_t, self.act_t, self.rew_t, self.obs_tp1,
             self.done_mask, self.importance_weights],
            int(config["sgd_batch_size"] / len(config["devices"])),
            build_loss,
            logdir,
            grad_norm_clipping=config["grad_norm_clipping"])
        loss_obj = self.multi_gpu_optimizer.get_common_loss()
    else:
        loss_obj = build_loss(
            self.obs_t, self.act_t, self.rew_t, self.obs_tp1,
            self.done_mask, self.importance_weights)

    weighted_error = loss_obj.loss
    target_q_func_vars = loss_obj.target_q_func_vars
    self.q_t = loss_obj.q_t
    self.q_tp1 = loss_obj.q_tp1
    self.td_error = loss_obj.td_error

    # compute optimization op (potentially with gradient clipping)
    if config["grad_norm_clipping"] is not None:
        self.grads_and_vars = _minimize_and_clip(
            optimizer, weighted_error, var_list=q_func_vars,
            clip_val=config["grad_norm_clipping"])
    else:
        self.grads_and_vars = optimizer.compute_gradients(
            weighted_error, var_list=q_func_vars)
    self.grads_and_vars = [
        (g, v) for (g, v) in self.grads_and_vars if g is not None]
    self.grads = [g for (g, v) in self.grads_and_vars]
    self.train_expr = optimizer.apply_gradients(self.grads_and_vars)

    # update_target_fn will be called periodically to copy Q network to
    # target Q network
    update_target_expr = []
    for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_expr.append(var_target.assign(var))
    self.update_target_expr = tf.group(*update_target_expr)
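
# Illustration (not from the source): the target-network sync above works by
# sorting both variable lists by name so that corresponding layers line up.
# A minimal self-contained sketch of the same pairing idea, with plain dicts
# standing in for tf.Variables; the parameter names and values are invented.
q_func_vars = {"q_func/fc1/w": 1.0, "q_func/out/w": 2.0}
target_q_func_vars = {"target_q_func/fc1/w": 0.0, "target_q_func/out/w": 0.0}
for src, dst in zip(sorted(q_func_vars), sorted(target_q_func_vars)):
    # Analogous to update_target_expr.append(var_target.assign(var)) above.
    target_q_func_vars[dst] = q_func_vars[src]
assert target_q_func_vars == {
    "target_q_func/fc1/w": 1.0, "target_q_func/out/w": 2.0}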
class Runner(object):
    """
    Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the
    shared network weights. When run as a remote agent, only this graph
    is used.
    """

    def __init__(self, env_creator, config, logdir, is_remote):
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = create_and_wrap(env_creator, config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)
        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        # TODO(rliaw): pull this into model_catalog
        if isinstance(action_space, gym.spaces.Box):
            self.actions = tf.placeholder(
                tf.float32, shape=(None, action_space.shape[0]))
        elif isinstance(action_space, gym.spaces.Discrete):
            self.actions = tf.placeholder(tf.int64, shape=(None,))
        else:
            raise NotImplementedError(
                "action space " + str(type(action_space)) +
                " currently not supported")
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        assert config["sgd_batchsize"] % len(devices) == 0, \
            "Batch size must be evenly divisible by devices"
        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = config["sgd_batchsize"]
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.value_targets, self.advantages,
             self.actions, self.prev_logits, self.prev_vf_preds],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_policy_loss for policy in policies]), 0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_vf_loss for policy in policies]), 0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_entropy for policy in policies]), 0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        obs_filter = get_filter(
            config["observation_filter"],
            self.env.observation_space.shape)
        self.sampler = SyncSampler(
            self.env, self.common_policy, obs_filter,
            self.config["horizon"], self.config["horizon"])
        self.reward_filter = MeanStdFilter((), clip=5.0)
        self.sess.run(tf.global_variables_initializer())

    def load_data(self, trajectories, full_trace):
        use_gae = self.config["use_gae"]
        dummy = np.zeros_like(trajectories["advantages"])
        return self.par_opt.load_data(
            self.sess,
            [trajectories["observations"],
             trajectories["value_targets"] if use_gae else dummy,
             trajectories["advantages"],
             trajectories["actions"].squeeze(),
             trajectories["logprobs"],
             trajectories["vf_preds"] if use_gae else dummy],
            full_trace=full_trace)

    def run_sgd_minibatch(
            self, batch_index, kl_coeff, full_trace, file_writer):
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=[
                self.mean_loss, self.mean_policy_loss, self.mean_vf_loss,
                self.mean_kl, self.mean_entropy],
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=file_writer if full_trace else None)

    def save(self):
        obs_filter = self.sampler.get_obs_filter()
        return pickle.dumps([obs_filter, self.reward_filter])

    def restore(self, objs):
        objs = pickle.loads(objs)
        obs_filter = objs[0]
        rew_filter = objs[1]
        self.update_filters(obs_filter, rew_filter)

    def get_weights(self):
        return self.variables.get_weights()

    def load_weights(self, weights):
        self.variables.set_weights(weights)

    def update_filters(self, obs_filter=None, rew_filter=None):
        if rew_filter:
            # No special handling required since outside of threaded code
            self.reward_filter = rew_filter.copy()
        if obs_filter:
            self.sampler.update_obs_filter(obs_filter)

    def get_obs_filter(self):
        return self.sampler.get_obs_filter()

    def compute_steps(self, config, obs_filter, rew_filter):
        """Compute multiple rollouts and concatenate the results.

        Args:
            config: Configuration parameters.
            obs_filter: Function that is applied to each of the
                observations.
            rew_filter: Function that is applied to each of the rewards.

        Returns:
            trajectories: Concatenated trajectory data.
            total_rewards: Total rewards of the trajectories.
            trajectory_lengths: Lengths of the trajectories.
            updated_obs_filter: Observation filter after the rollouts.
            reward_filter: Reward filter after the rollouts.
        """
        num_steps_so_far = 0
        trajectories = []
        self.update_filters(obs_filter, rew_filter)

        while num_steps_so_far < config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            trajectory = process_rollout(
                rollout, self.reward_filter, config["gamma"],
                config["lambda"], use_gae=config["use_gae"])
            num_steps_so_far += trajectory["rewards"].shape[0]
            trajectories.append(trajectory)
        metrics = self.sampler.get_metrics()
        total_rewards, trajectory_lengths = zip(*[
            (c.episode_reward, c.episode_length) for c in metrics])
        updated_obs_filter = self.sampler.get_obs_filter(flush=True)
        return (
            concatenate(trajectories),
            total_rewards,
            trajectory_lengths,
            updated_obs_filter,
            self.reward_filter)
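
# Illustration (not from the source): compute_steps above accumulates
# rollouts until config["min_steps_per_task"] transitions have been
# gathered, then concatenates them field-wise. A self-contained numpy
# sketch of that pattern; the rollout length and field names are made up.
import numpy as np

def gather_steps(sample_rollout, min_steps):
    trajectories, num_steps_so_far = [], 0
    while num_steps_so_far < min_steps:
        traj = sample_rollout()
        num_steps_so_far += traj["rewards"].shape[0]
        trajectories.append(traj)
    # Field-wise concatenation, analogous to concatenate(trajectories).
    return {key: np.concatenate([t[key] for t in trajectories])
            for key in trajectories[0]}

def fake_rollout():
    return {"rewards": np.ones(37), "observations": np.zeros((37, 4))}

batch = gather_steps(fake_rollout, min_steps=100)
assert batch["rewards"].shape[0] == 111  # three 37-step rollouts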
class Runner(object):
    """
    Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the
    shared network weights. When run as a remote agent, only this graph
    is used.
    """

    def __init__(self, name, batchsize, config, logdir, is_remote):
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = BatchedEnv(name, batchsize, config)
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.preprocessor = self.env.preprocessor
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)
        # The shape of the preprocessed observations.
        self.preprocessor_shape = self.preprocessor.transform_shape(
            self.env.observation_space.shape)
        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.preprocessor_shape)
        # Targets of the value function.
        self.returns = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        if isinstance(action_space, gym.spaces.Box):
            self.actions = tf.placeholder(
                tf.float32, shape=(None, action_space.shape[0]))
        elif isinstance(action_space, gym.spaces.Discrete):
            self.actions = tf.placeholder(tf.int64, shape=(None,))
        else:
            raise NotImplementedError(
                "action space " + str(type(action_space)) +
                " currently not supported")
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        assert config["sgd_batchsize"] % len(devices) == 0, \
            "Batch size must be evenly divisible by devices"
        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = config["sgd_batchsize"]
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, rets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, rets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.returns, self.advantages,
             self.actions, self.prev_logits, self.prev_vf_preds],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_policy_loss for policy in policies]), 0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_vf_loss for policy in policies]), 0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_entropy for policy in policies]), 0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.observation_filter = MeanStdFilter(
            self.preprocessor_shape, clip=None)
        self.reward_filter = MeanStdFilter((), clip=5.0)
        self.sess.run(tf.global_variables_initializer())

    def load_data(self, trajectories, full_trace):
        if self.config["use_gae"]:
            return self.par_opt.load_data(
                self.sess,
                [trajectories["observations"],
                 trajectories["td_lambda_returns"],
                 trajectories["advantages"],
                 trajectories["actions"].squeeze(),
                 trajectories["logprobs"],
                 trajectories["vf_preds"]],
                full_trace=full_trace)
        else:
            dummy = np.zeros((trajectories["observations"].shape[0],))
            return self.par_opt.load_data(
                self.sess,
                [trajectories["observations"],
                 dummy,
                 trajectories["returns"],
                 trajectories["actions"].squeeze(),
                 trajectories["logprobs"],
                 dummy],
                full_trace=full_trace)

    def run_sgd_minibatch(
            self, batch_index, kl_coeff, full_trace, file_writer):
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=[
                self.mean_loss, self.mean_policy_loss, self.mean_vf_loss,
                self.mean_kl, self.mean_entropy],
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=file_writer if full_trace else None)

    def save(self):
        return pickle.dumps([self.observation_filter, self.reward_filter])

    def restore(self, objs):
        objs = pickle.loads(objs)
        self.observation_filter = objs[0]
        self.reward_filter = objs[1]

    def get_weights(self):
        return self.variables.get_weights()

    def load_weights(self, weights):
        self.variables.set_weights(weights)

    def compute_trajectory(self, gamma, lam, horizon):
        """Compute a single rollout on the agent and return."""
        trajectory = rollouts(
            self.common_policy, self.env, horizon,
            self.observation_filter, self.reward_filter)
        if self.config["use_gae"]:
            add_advantage_values(trajectory, gamma, lam, self.reward_filter)
        else:
            add_return_values(trajectory, gamma, self.reward_filter)
        return trajectory

    def compute_steps(self, gamma, lam, horizon, min_steps_per_task=-1):
        """Compute multiple rollouts and concatenate the results.

        Args:
            gamma: MDP discount factor.
            lam: GAE(lambda) parameter.
            horizon: Number of steps after which a rollout gets cut.
            min_steps_per_task: Lower bound on the number of states to be
                collected.

        Returns:
            trajectories: Concatenated trajectory data.
            total_rewards: Total rewards of the trajectories.
            trajectory_lengths: Lengths of the trajectories.
        """
        num_steps_so_far = 0
        trajectories = []
        total_rewards = []
        trajectory_lengths = []

        while True:
            trajectory = self.compute_trajectory(gamma, lam, horizon)
            total_rewards.append(
                trajectory["raw_rewards"].sum(axis=0).mean())
            trajectory_lengths.append(
                np.logical_not(trajectory["dones"]).sum(axis=0).mean())
            trajectory = flatten(trajectory)
            not_done = np.logical_not(trajectory["dones"])
            # Filtering out states that are done. We do this because
            # trajectories are batched and cut only if all the trajectories
            # in the batch terminated, so we can potentially get rid of
            # some of the states here.
            trajectory = {
                key: val[not_done] for key, val in trajectory.items()}
            num_steps_so_far += trajectory["raw_rewards"].shape[0]
            trajectories.append(trajectory)
            if num_steps_so_far >= min_steps_per_task:
                break
        return concatenate(trajectories), total_rewards, trajectory_lengths
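
# Illustration (not from the source): because BatchedEnv steps a batch of
# environments in lockstep, a rollout only stops once every environment in
# the batch is done, so the flattened data can contain transitions from
# already-terminated environments. A tiny numpy example of the not_done
# masking used above; the arrays are invented.
import numpy as np

trajectory = {
    "dones": np.array([False, False, True, False, True]),
    "raw_rewards": np.array([1.0, 0.5, 0.0, 2.0, 0.0]),
}
not_done = np.logical_not(trajectory["dones"])
trajectory = {key: val[not_done] for key, val in trajectory.items()}
assert trajectory["raw_rewards"].tolist() == [1.0, 0.5, 2.0]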
class Agent(object):
    """
    Agent class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the
    shared network weights. When run as a remote agent, only this graph
    is used.
    """

    def __init__(
            self, name, batchsize, preprocessor, config, logdir, is_remote):
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = BatchedEnv(name, batchsize, preprocessor=preprocessor)
        if preprocessor.shape is None:
            preprocessor.shape = self.env.observation_space.shape
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.preprocessor = preprocessor
        self.sess = tf.Session(config=config_proto)
        if config["use_tf_debugger"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + preprocessor.shape)
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        if isinstance(action_space, gym.spaces.Box):
            self.actions = tf.placeholder(
                tf.float32, shape=(None, action_space.shape[0]))
        elif isinstance(action_space, gym.spaces.Discrete):
            self.actions = tf.placeholder(tf.int64, shape=(None,))
        else:
            raise NotImplementedError(
                "action space " + str(type(action_space)) +
                " currently not supported")
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))

        assert config["sgd_batchsize"] % len(devices) == 0, \
            "Batch size must be evenly divisible by devices"
        if is_remote:
            self.batch_size = 1
            self.per_device_batch_size = 1
        else:
            self.batch_size = config["sgd_batchsize"]
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, advs, acts, plog):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, advs, acts, plog, self.logit_dim, self.kl_coeff,
                self.distribution_class, self.config, self.sess)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.advantages, self.actions,
             self.prev_logits],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[policy.loss for policy in policies]), 0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(
                    values=[policy.mean_entropy for policy in policies]), 0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.observation_filter = MeanStdFilter(preprocessor.shape, clip=None)
        self.reward_filter = MeanStdFilter((), clip=5.0)
        self.sess.run(tf.global_variables_initializer())

    def load_data(self, trajectories, full_trace):
        return self.par_opt.load_data(
            self.sess,
            [trajectories["observations"],
             trajectories["advantages"],
             trajectories["actions"].squeeze(),
             trajectories["logprobs"]],
            full_trace=full_trace)

    def run_sgd_minibatch(
            self, batch_index, kl_coeff, full_trace, file_writer):
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=[self.mean_loss, self.mean_kl, self.mean_entropy],
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=file_writer if full_trace else None)

    def get_weights(self):
        return self.variables.get_weights()

    def load_weights(self, weights):
        self.variables.set_weights(weights)

    def compute_trajectory(self, gamma, lam, horizon):
        trajectory = rollouts(
            self.common_policy, self.env, horizon,
            self.observation_filter, self.reward_filter)
        add_advantage_values(trajectory, gamma, lam, self.reward_filter)
        return trajectory
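
# Sketch (assumption, not from this file): the kl_coeff placeholder fed via
# extra_feed_dict above is typically adjusted between SGD epochs with the
# standard adaptive KL-penalty rule from the PPO paper. The 2x/0.5x
# thresholds and 1.5x/0.5x multipliers below are conventional defaults, not
# values read from this codebase.
def update_kl_coeff(kl_coeff, kl, kl_target):
    if kl > 2.0 * kl_target:
        return kl_coeff * 1.5
    elif kl < 0.5 * kl_target:
        return kl_coeff * 0.5
    return kl_coeff

# Example: measured KL overshot the target, so the penalty strengthens.
assert abs(update_kl_coeff(0.2, kl=0.05, kl_target=0.01) - 0.3) < 1e-9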
class LocalMultiGPUOptimizer(Optimizer):
    """A synchronous optimizer that uses multiple local GPUs.

    Samples are pulled synchronously from multiple remote evaluators,
    concatenated, and then split across the memory of multiple local GPUs.
    A number of SGD passes are then taken over the in-memory data. For more
    details, see `ray.rllib.parallel.LocalSyncParallelOptimizer`.

    This optimizer is Tensorflow-specific and requires evaluators to
    implement the TFMultiGPUSupport API.
    """

    def _init(self):
        assert isinstance(self.local_evaluator, TFMultiGPUSupport)
        self.batch_size = self.config.get("sgd_batch_size", 128)
        gpu_ids = ray.get_gpu_ids()
        if not gpu_ids:
            self.devices = ["/cpu:0"]
        else:
            self.devices = [
                "/gpu:{}".format(i) for i in range(len(gpu_ids))]
        assert self.batch_size > len(self.devices), "batch size too small"
        self.per_device_batch_size = self.batch_size // len(self.devices)
        self.sample_timer = TimerStat()
        self.load_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.update_weights_timer = TimerStat()
        print("LocalMultiGPUOptimizer devices", self.devices)
        print("LocalMultiGPUOptimizer batch size", self.batch_size)

        # List of (feature name, feature placeholder) tuples
        self.loss_inputs = self.local_evaluator.tf_loss_inputs()

        # per-GPU graph copies created below must share vars with the policy
        tf.get_variable_scope().reuse_variables()

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config.get("sgd_stepsize", 5e-5)),
            self.devices,
            [ph for _, ph in self.loss_inputs],
            self.per_device_batch_size,
            lambda *ph: self.local_evaluator.build_tf_loss(ph),
            self.config.get("logdir", os.getcwd()))

        self.sess = self.local_evaluator.sess
        self.sess.run(tf.global_variables_initializer())

    def step(self):
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for e in self.remote_evaluators:
                    e.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                samples = SampleBatch.concat_samples(
                    ray.get(
                        [e.sample.remote() for e in self.remote_evaluators]))
            else:
                samples = self.local_evaluator.sample()
            assert isinstance(samples, SampleBatch)

        with self.load_timer:
            tuples_per_device = self.par_opt.load_data(
                self.local_evaluator.sess,
                samples.columns([key for key, _ in self.loss_inputs]))

        with self.grad_timer:
            for i in range(self.config.get("num_sgd_iter", 10)):
                batch_index = 0
                num_batches = (
                    int(tuples_per_device) //
                    int(self.per_device_batch_size))
                permutation = np.random.permutation(num_batches)
                while batch_index < num_batches:
                    # TODO(ekl) support ppo's debugging features, e.g.
                    # printing the current loss and tracing
                    self.par_opt.optimize(
                        self.sess,
                        permutation[batch_index] *
                        self.per_device_batch_size)
                    batch_index += 1

    def stats(self):
        return {
            "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
            "load_time_ms": round(1000 * self.load_timer.mean, 3),
            "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
            "update_time_ms": round(
                1000 * self.update_weights_timer.mean, 3),
        }
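
# Illustration (not from the source): the inner loop of step() above visits
# every per-device minibatch exactly once per SGD iteration, in random
# order, by scaling a permuted batch index into a data offset. A
# self-contained numpy sketch of that traversal; the sizes are arbitrary.
import numpy as np

per_device_batch_size = 32
tuples_per_device = 128  # as would be returned by par_opt.load_data()
num_sgd_iter = 2

num_batches = tuples_per_device // per_device_batch_size
for _ in range(num_sgd_iter):
    for batch_index in np.random.permutation(num_batches):
        offset = batch_index * per_device_batch_size
        # par_opt.optimize(sess, offset) would run one synchronous SGD step
        # over the loaded tuples in [offset, offset + per_device_batch_size).
        assert 0 <= offset <= tuples_per_device - per_device_batch_size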