def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    self.config = config
    self.sess = tf.get_default_session()

    # Setup the policy
    self.observations = tf.placeholder(
        tf.float32, [None] + list(observation_space.shape))
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.model = ModelCatalog.get_model(self.observations, logit_dim,
                                        self.config["model"])
    action_dist = dist_class(self.model.outputs)
    self.vf = tf.reshape(
        linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
        [-1])
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    # Setup the policy loss
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        actions = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(
                action_space))
    advantages = tf.placeholder(tf.float32, [None], name="advantages")
    v_target = tf.placeholder(tf.float32, [None], name="v_target")
    self.loss = A3CLoss(action_dist, actions, advantages, v_target,
                        self.vf, self.config["vf_loss_coeff"],
                        self.config["entropy_coeff"])

    # Initialize TFPolicyGraph
    loss_in = [
        ("obs", self.observations),
        ("actions", actions),
        ("advantages", advantages),
        ("value_targets", v_target),
    ]
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=action_dist.sample(),
        loss=self.loss.total_loss,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
def __init__(self, obs_space, action_space, config):
    config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
    self.config = config

    # Setup placeholders
    obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
    dist_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    prev_actions = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")

    # Create the model network and action outputs
    self.model = ModelCatalog.get_model({
        "obs": obs,
        "prev_actions": prev_actions,
        "prev_rewards": prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }, obs_space, self.logit_dim, self.config["model"])
    action_dist = dist_class(self.model.outputs)  # logit for each action

    # Setup policy loss
    actions = ModelCatalog.get_action_placeholder(action_space)
    advantages = tf.placeholder(tf.float32, [None], name="adv")
    loss = PGLoss(action_dist, actions, advantages).loss

    # Mapping from sample batch keys to placeholders. These keys will be
    # read from postprocessed sample batches and fed into the specified
    # placeholders during loss computation.
    loss_in = [
        ("obs", obs),
        ("actions", actions),
        ("prev_actions", prev_actions),
        ("prev_rewards", prev_rewards),
        ("advantages", advantages),  # added during postprocessing
    ]

    # Initialize TFPolicyGraph
    sess = tf.get_default_session()
    TFPolicyGraph.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_dist.sample(),
        action_prob=action_dist.sampled_action_prob(),
        loss=loss,
        loss_inputs=loss_in,
        model=self.model,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])
    sess.run(tf.global_variables_initializer())
def __init__(self, obs_space, action_space, config):
    config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
    self.config = config

    # Setup placeholders
    obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
    dist_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    prev_actions = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")

    # Create the model network and action outputs
    self.model = ModelCatalog.get_model({
        "obs": obs,
        "prev_actions": prev_actions,
        "prev_rewards": prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }, obs_space, action_space, self.logit_dim, self.config["model"])
    action_dist = dist_class(self.model.outputs)  # logit for each action

    # Setup policy loss
    actions = ModelCatalog.get_action_placeholder(action_space)
    advantages = tf.placeholder(tf.float32, [None], name="adv")
    loss = PGLoss(action_dist, actions, advantages).loss

    # Mapping from sample batch keys to placeholders. These keys will be
    # read from postprocessed sample batches and fed into the specified
    # placeholders during loss computation.
    loss_in = [
        ("obs", obs),
        ("actions", actions),
        ("prev_actions", prev_actions),
        ("prev_rewards", prev_rewards),
        ("advantages", advantages),  # added during postprocessing
    ]

    # Initialize TFPolicyGraph
    sess = tf.get_default_session()
    TFPolicyGraph.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_dist.sample(),
        action_prob=action_dist.sampled_action_prob(),
        loss=loss,
        loss_inputs=loss_in,
        model=self.model,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])
    sess.run(tf.global_variables_initializer())
def __init__(self, obs_space, action_space, config):
    config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
    self.config = config

    # Setup policy
    obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
    dist_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    prev_actions = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model({
        "obs": obs,
        "prev_actions": prev_actions,
        "prev_rewards": prev_rewards
    }, obs_space, self.logit_dim, self.config["model"])
    action_dist = dist_class(self.model.outputs)  # logit for each action

    # Setup policy loss
    actions = ModelCatalog.get_action_placeholder(action_space)
    advantages = tf.placeholder(tf.float32, [None], name="adv")
    loss = PGLoss(action_dist, actions, advantages).loss

    # Initialize TFPolicyGraph
    sess = tf.get_default_session()

    # Mapping from sample batch keys to placeholders
    loss_in = [
        ("obs", obs),
        ("actions", actions),
        ("prev_actions", prev_actions),
        ("prev_rewards", prev_rewards),
        ("advantages", advantages),
    ]
    TFPolicyGraph.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_dist.sample(),
        loss=loss,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])
    sess.run(tf.global_variables_initializer())
def __init__(self, obs_space, action_space, config):
    config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
    self.config = config

    # Setup policy
    obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
    dist_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.model = ModelCatalog.get_model(
        obs, self.logit_dim, options=self.config["model"])
    action_dist = dist_class(self.model.outputs)  # logit for each action

    # Setup policy loss
    actions = ModelCatalog.get_action_placeholder(action_space)
    advantages = tf.placeholder(tf.float32, [None], name="adv")
    loss = PGLoss(action_dist, actions, advantages).loss

    # Initialize TFPolicyGraph
    sess = tf.get_default_session()
    loss_in = [
        ("obs", obs),
        ("actions", actions),
        ("advantages", advantages),
    ]

    # LSTM support
    for i, ph in enumerate(self.model.state_in):
        loss_in.append(("state_in_{}".format(i), ph))
    is_training = tf.placeholder_with_default(True, ())
    TFPolicyGraph.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_dist.sample(),
        loss=loss,
        loss_inputs=loss_in,
        is_training=is_training,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])
    sess.run(tf.global_variables_initializer())
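# Hedged usage sketch (not part of the snippets above): every policy graph in
# this file calls tf.get_default_session(), so it is assumed to be constructed
# with a graph and session installed as defaults, roughly as below. The helper
# name and the CartPole environment are illustrative only.
import gym
import tensorflow as tf


def build_policy_graph(policy_cls, env_name="CartPole-v0", config=None):
    env = gym.make(env_name)
    with tf.Graph().as_default():
        sess = tf.Session()
        with sess.as_default():
            # policy_cls is one of the policy-graph classes defined here,
            # e.g. a PG, A3C, PPO, or DQN policy graph.
            return policy_cls(env.observation_space, env.action_space,
                              config or {})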
def _init_helper(self, observation_space, action_space, config, existing_inputs=None): config = dict(DEFAULT_CONFIG, **config) assert config["batch_mode"] == "truncate_episodes", \ "Must use `truncate_episodes` batch mode with V-trace." self.config = config self.sess = tf.get_default_session() self.grads = None imitation = config["imitation"] assert not imitation if imitation: T = config["sample_batch_size"] B = config["train_batch_size"] // T batch_shape = (T, B) else: batch_shape = (None, ) if isinstance(action_space, gym.spaces.Discrete): is_multidiscrete = False actions_shape = batch_shape output_hidden_shape = [action_space.n] elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete): is_multidiscrete = True actions_shape = batch_shape + (len(action_space.nvec), ) output_hidden_shape = action_space.nvec.astype(np.int32) else: raise UnsupportedSpaceException( "Action space {} is not supported for IMPALA.".format( action_space)) assert is_multidiscrete if imitation: make_action_ph = lambda: ssbm_actions.make_ph( ssbm_actions.flat_repeated_config, batch_shape) actions = make_action_ph() prev_actions = make_action_ph() else: # actions are stacked "multidiscrete" actions = tf.placeholder(tf.int64, actions_shape, name="actions") prev_actions = tf.placeholder(tf.int64, actions_shape, name="prev_actions") # Create input placeholders dones = tf.placeholder(tf.bool, batch_shape, name="dones") rewards = tf.placeholder(tf.float32, batch_shape, name="rewards") if imitation: observations = ssbm_spaces.slippi_conv_list[0].make_ph(batch_shape) else: observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) behavior_logp = tf.placeholder(tf.float32, batch_shape) existing_state_in = None existing_seq_lens = None # Setup the policy autoregressive = config.get("autoregressive") if autoregressive: logit_dim = 128 # not really logits else: dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_rewards = tf.placeholder(tf.float32, batch_shape, name="prev_reward") self.model = HumanActionModel( { "obs": observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"], imitation=imitation, state_in=existing_state_in, seq_lens=existing_seq_lens) # HumanActionModel doesn't flatten outputs flat_outputs = snt.MergeDims(0, 2)(self.model.outputs) if autoregressive: action_dist = ssbm_actions.AutoRegressive( nest.map_structure(lambda conv: conv.build_dist(), ssbm_actions.flat_repeated_config), residual=config.get("residual")) actions_logp, actions_entropy = action_dist.logp( flat_outputs, tf.unstack(actions, axis=-1)) action_sampler, self.sampled_logp = action_dist.sample( flat_outputs) action_sampler = tf.stack( [tf.cast(t, tf.int64) for t in nest.flatten(action_sampler)], axis=-1) sampled_prob = tf.exp(self.sampled_logp) else: dist_inputs = tf.split(flat_outputs, output_hidden_shape, axis=-1) action_dist = dist_class(dist_inputs) int64_actions = [tf.cast(x, tf.int64) for x in actions] actions_logp = action_dist.logp(int64_actions) actions_entropy = action_dist.entropy() action_sampler = action_dist.sample() sampled_prob = action_dist.sampled_action_prob() self.sampled_logp = tf.log(sampled_prob) self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) def make_time_major(tensor, drop_last=False): """Swaps batch and trajectory axis. 
Args: tensor: A tensor or list of tensors to reshape. drop_last: A bool indicating whether to drop the last trajectory item. Returns: res: A tensor with swapped axes or a list of tensors with swapped axes. """ if isinstance(tensor, list): return [make_time_major(t, drop_last) for t in tensor] if self.model.state_init: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut # boundaries. TODO(ekl) this is kind of a hack T = self.config["sample_batch_size"] B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) # swap B and T axes res = tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) if drop_last: return res[:-1] return res # actual loss computation values_tm = make_time_major(self.model.value_function()) baseline_values = values_tm[:-1] actions_logp_tm = make_time_major(actions_logp, True) behavior_logp_tm = make_time_major(behavior_logp, True) log_rhos_tm = actions_logp_tm - behavior_logp_tm discounts = tf.fill(tf.shape(baseline_values), config["gamma"]) if not config.get("soft_horizon"): discounts *= tf.to_float(~make_time_major(dones, True)) vtrace_returns = vtrace.from_importance_weights( log_rhos=log_rhos_tm, discounts=discounts, rewards=make_time_major(rewards, True), values=baseline_values, bootstrap_value=values_tm[-1]) vf_loss = tf.reduce_mean( tf.squared_difference(vtrace_returns.vs, baseline_values)) pi_loss = -tf.reduce_mean( actions_logp_tm * vtrace_returns.pg_advantages) entropy_mean = tf.reduce_mean(actions_entropy) total_loss = pi_loss total_loss += self.config["vf_loss_coeff"] * vf_loss total_loss -= self.config["entropy_coeff"] * entropy_mean self.total_loss = total_loss kl_mean = -tf.reduce_mean(log_rhos_tm) # Initialize TFPolicyGraph loss_in = [ (SampleBatch.ACTIONS, actions), (SampleBatch.DONES, dones), ("behavior_logp", behavior_logp), (SampleBatch.REWARDS, rewards), (SampleBatch.CUR_OBS, observations), (SampleBatch.PREV_ACTIONS, prev_actions), (SampleBatch.PREV_REWARDS, prev_rewards), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=observations, action_sampler=action_sampler, action_prob=sampled_prob, loss=self.total_loss, model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"], batch_divisibility_req=self.config["sample_batch_size"]) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { LEARNER_STATS_KEY: { "cur_lr": tf.cast(self.cur_lr, tf.float64), "pi_loss": pi_loss, "entropy": entropy_mean, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": vf_loss, "vf_explained_var": explained_variance(tf.reshape(vtrace_returns.vs, [-1]), tf.reshape(baseline_values, [-1])), "kl_mean": kl_mean, }, }
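# Hedged illustration of the batch-major -> time-major reshape performed by
# make_time_major above: a flat [B*T, ...] batch is reshaped to [B, T, ...]
# and transposed to [T, B, ...], the layout that
# vtrace.from_importance_weights expects. The concrete shapes are
# illustrative only.
import numpy as np

flat = np.arange(2 * 3 * 4).reshape(6, 4)              # [B*T, D] with B=2, T=3
time_major = flat.reshape(2, 3, 4).transpose(1, 0, 2)  # [T, B, D]
assert time_major.shape == (3, 2, 4)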
def __init__(self, observation_space, action_space, config, existing_inputs=None): """ Arguments: observation_space: Environment observation space specification. action_space: Environment action space specification. config (dict): Configuration values for PPO graph. existing_inputs (list): Optional list of tuples that specify the placeholders upon which the graph should be built upon. """ config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config) self.sess = tf.get_default_session() self.action_space = action_space self.config = config self.kl_coeff_val = self.config["kl_coeff"] self.kl_target = self.config["kl_target"] dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space) if existing_inputs: obs_ph, value_targets_ph, adv_ph, act_ph, \ logits_ph, vf_preds_ph = existing_inputs[:6] existing_state_in = existing_inputs[6:-1] existing_seq_lens = existing_inputs[-1] else: obs_ph = tf.placeholder( tf.float32, name="obs", shape=(None, ) + observation_space.shape) adv_ph = tf.placeholder( tf.float32, name="advantages", shape=(None, )) act_ph = ModelCatalog.get_action_placeholder(action_space) logits_ph = tf.placeholder( tf.float32, name="logits", shape=(None, logit_dim)) vf_preds_ph = tf.placeholder( tf.float32, name="vf_preds", shape=(None, )) value_targets_ph = tf.placeholder( tf.float32, name="value_targets", shape=(None, )) existing_state_in = None existing_seq_lens = None self.loss_in = [ ("obs", obs_ph), ("value_targets", value_targets_ph), ("advantages", adv_ph), ("actions", act_ph), ("logits", logits_ph), ("vf_preds", vf_preds_ph), ] self.model = ModelCatalog.get_model( obs_ph, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) # KL Coefficient self.kl_coeff = tf.get_variable( initializer=tf.constant_initializer(self.kl_coeff_val), name="kl_coeff", shape=(), trainable=False, dtype=tf.float32) self.logits = self.model.outputs curr_action_dist = dist_cls(self.logits) self.sampler = curr_action_dist.sample() self.value_function = self.model.value_function self.loss_obj = PPOLoss( action_space, value_targets_ph, adv_ph, act_ph, logits_ph, vf_preds_ph, curr_action_dist, self.value_function, self.kl_coeff, entropy_coeff=self.config["entropy_coeff"], clip_param=self.config["clip_param"], vf_loss_coeff=self.config["kl_target"], use_gae=self.config["use_gae"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=obs_ph, action_sampler=self.sampler, loss=self.loss_obj.loss, loss_inputs=self.loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, seq_lens=self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config, existing_inputs=None): """ Arguments: observation_space: Environment observation space specification. action_space: Environment action space specification. config (dict): Configuration values for PPO graph. existing_inputs (list): Optional list of tuples that specify the placeholders upon which the graph should be built upon. """ config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config) self.sess = tf.get_default_session() self.action_space = action_space self.config = config self.kl_coeff_val = self.config["kl_coeff"] self.kl_target = self.config["kl_target"] dist_cls, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) if existing_inputs: obs_ph, value_targets_ph, adv_ph, act_ph, \ logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \ existing_inputs[:8] existing_state_in = existing_inputs[8:-1] existing_seq_lens = existing_inputs[-1] else: obs_ph = tf.placeholder( tf.float32, name="obs", shape=(None, ) + observation_space.shape) adv_ph = tf.placeholder( tf.float32, name="advantages", shape=(None, )) act_ph = ModelCatalog.get_action_placeholder(action_space) logits_ph = tf.placeholder( tf.float32, name="logits", shape=(None, logit_dim)) vf_preds_ph = tf.placeholder( tf.float32, name="vf_preds", shape=(None, )) value_targets_ph = tf.placeholder( tf.float32, name="value_targets", shape=(None, )) prev_actions_ph = ModelCatalog.get_action_placeholder(action_space) prev_rewards_ph = tf.placeholder( tf.float32, [None], name="prev_reward") existing_state_in = None existing_seq_lens = None self.observations = obs_ph self.prev_actions = prev_actions_ph self.prev_rewards = prev_rewards_ph self.loss_in = [ (SampleBatch.CUR_OBS, obs_ph), (Postprocessing.VALUE_TARGETS, value_targets_ph), (Postprocessing.ADVANTAGES, adv_ph), (SampleBatch.ACTIONS, act_ph), (BEHAVIOUR_LOGITS, logits_ph), (SampleBatch.VF_PREDS, vf_preds_ph), (SampleBatch.PREV_ACTIONS, prev_actions_ph), (SampleBatch.PREV_REWARDS, prev_rewards_ph), ] self.model = ModelCatalog.get_model( { "obs": obs_ph, "prev_actions": prev_actions_ph, "prev_rewards": prev_rewards_ph, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) # KL Coefficient self.kl_coeff = tf.get_variable( initializer=tf.constant_initializer(self.kl_coeff_val), name="kl_coeff", shape=(), trainable=False, dtype=tf.float32) self.logits = self.model.outputs curr_action_dist = dist_cls(self.logits) self.sampler = curr_action_dist.sample() if self.config["use_gae"]: if self.config["vf_share_layers"]: self.value_function = self.model.value_function() else: vf_config = self.config["model"].copy() # Do not split the last layer of the value function into # mean parameters and standard deviation parameters and # do not make the standard deviations free variables. vf_config["free_log_std"] = False if vf_config["use_lstm"]: vf_config["use_lstm"] = False logger.warning( "It is not recommended to use a LSTM model with " "vf_share_layers=False (consider setting it to True). 
" "If you want to not share layers, you can implement " "a custom LSTM model that overrides the " "value_function() method.") with tf.variable_scope("value_function"): self.value_function = ModelCatalog.get_model({ "obs": obs_ph, "prev_actions": prev_actions_ph, "prev_rewards": prev_rewards_ph, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, 1, vf_config).outputs self.value_function = tf.reshape(self.value_function, [-1]) else: self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1]) if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(adv_ph, dtype=tf.bool) self.loss_obj = PPOLoss( action_space, value_targets_ph, adv_ph, act_ph, logits_ph, vf_preds_ph, curr_action_dist, self.value_function, self.kl_coeff, mask, entropy_coeff=self.config["entropy_coeff"], clip_param=self.config["clip_param"], vf_clip_param=self.config["vf_clip_param"], vf_loss_coeff=self.config["vf_loss_coeff"], use_gae=self.config["use_gae"]) LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=obs_ph, action_sampler=self.sampler, action_prob=curr_action_dist.sampled_action_prob(), loss=self.loss_obj.loss, model=self.model, loss_inputs=self.loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions_ph, prev_reward_input=prev_rewards_ph, seq_lens=self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) self.sess.run(tf.global_variables_initializer()) self.explained_variance = explained_variance(value_targets_ph, self.value_function) self.stats_fetches = { "cur_kl_coeff": self.kl_coeff, "cur_lr": tf.cast(self.cur_lr, tf.float64), "total_loss": self.loss_obj.loss, "policy_loss": self.loss_obj.mean_policy_loss, "vf_loss": self.loss_obj.mean_vf_loss, "vf_explained_var": self.explained_variance, "kl": self.loss_obj.mean_kl, "entropy": self.loss_obj.mean_entropy }
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
    self.config = config
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    # Action inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    with tf.variable_scope(P_SCOPE) as scope:
        self.model = self._build_policy_network(
            self.obs_t, observation_space, logit_dim)
        logits = self.model.outputs
        self.p_func_vars = _scope_vars(scope.name)

    # Action outputs
    action_dist = dist_cls(logits)
    self.output_actions = action_dist.sample()

    # Training inputs
    self.act_t = tf.placeholder(tf.int32, [None], name="action")
    self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")

    # v network evaluation
    with tf.variable_scope(V_SCOPE) as scope:
        state_values = self._build_value_network(self.obs_t,
                                                 observation_space)
        self.v_func_vars = _scope_vars(scope.name)
    self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
    self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t,
                                          logits, self.act_t, action_space)

    # which kind of objective to optimize
    objective = (self.p_loss.loss +
                 self.config["vf_coeff"] * self.v_loss.loss)
    self.explained_variance = tf.reduce_mean(
        explained_variance(self.cum_rew_t, state_values))

    # initialize TFPolicyGraph
    self.sess = tf.get_default_session()
    self.loss_inputs = [
        ("obs", self.obs_t),
        ("actions", self.act_t),
        ("advantages", self.cum_rew_t),
    ]
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.obs_t,
        action_sampler=self.output_actions,
        action_prob=action_dist.sampled_action_prob(),
        loss=self.model.loss() + objective,
        loss_inputs=self.loss_inputs,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out)
    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        "total_loss": objective,
        "vf_explained_var": self.explained_variance,
        "policy_loss": self.p_loss.loss,
        "vf_loss": self.v_loss.loss
    }
def _init_helper(self, observation_space, action_space, config, existing_inputs=None): print(get_available_gpus()) config = dict(impala.impala.DEFAULT_CONFIG, **config) assert config["batch_mode"] == "truncate_episodes", \ "Must use `truncate_episodes` batch mode with V-trace." self.config = config self.sess = tf.get_default_session() self.grads = None imitation = config["imitation"] if imitation: T = config["sample_batch_size"] B = config["train_batch_size"] // T batch_shape = (T, B) else: batch_shape = (None, ) if isinstance(action_space, gym.spaces.Discrete): is_multidiscrete = False actions_shape = batch_shape output_hidden_shape = [action_space.n] elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete): is_multidiscrete = True actions_shape = batch_shape + (len(action_space.nvec), ) output_hidden_shape = action_space.nvec.astype(np.int32) else: raise UnsupportedSpaceException( "Action space {} is not supported for IMPALA.".format( action_space)) if imitation: make_action_ph = lambda: ssbm_actions.make_ph( ssbm_actions.flat_repeated_config, batch_shape) actions = make_action_ph() prev_actions = make_action_ph() else: actions = tf.placeholder(tf.int64, actions_shape, name="actions") prev_actions = tf.placeholder(tf.int64, actions_shape, name="prev_actions") # Create input placeholders dones = tf.placeholder(tf.bool, batch_shape, name="dones") rewards = tf.placeholder(tf.float32, batch_shape, name="rewards") if imitation: observations = ssbm_spaces.slippi_conv_list[0].make_ph(batch_shape) else: observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) existing_state_in = None existing_seq_lens = None # Setup the policy autoregressive = config.get("autoregressive") if autoregressive: logit_dim = 128 # not really logits else: dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_rewards = tf.placeholder(tf.float32, batch_shape, name="prev_reward") self.model = HumanActionModel( { "obs": observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"], imitation=imitation, state_in=existing_state_in, seq_lens=existing_seq_lens) if autoregressive: action_dist = ssbm_actions.AutoRegressive( nest.map_structure(lambda conv: conv.build_dist(), ssbm_actions.flat_repeated_config), residual=config.get("residual")) actions_logp = snt.BatchApply(action_dist.logp)(self.model.outputs, actions) action_sampler, sampled_logp = snt.BatchApply(action_dist.sample)( self.model.outputs) sampled_prob = tf.exp(sampled_logp) else: dist_inputs = tf.split(self.model.outputs, output_hidden_shape, axis=-1) action_dist = dist_class(snt.MergeDims(0, 2)(dist_inputs)) int64_actions = [tf.cast(x, tf.int64) for x in actions] actions_logp = action_dist.logp(snt.MergeDims(0, 2)(int64_actions)) action_sampler = action_dist.sample() sampled_prob = action_dist.sampled_action_prob(), self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) # actual loss computation imitation_loss = -tf.reduce_mean(actions_logp) tm_values = self.model.values baseline_values = tm_values[:-1] if config.get("soft_horizon"): discounts = config["gamma"] else: discounts = tf.to_float(~dones[:-1]) * config["gamma"] td_lambda = trfl.td_lambda(state_values=baseline_values, rewards=rewards[:-1], pcontinues=discounts, bootstrap_value=tm_values[-1], lambda_=config.get("lambda", 1.)) # td_lambda.loss has shape [B] 
after a reduce_sum vf_loss = tf.reduce_mean(td_lambda.loss) / T self.total_loss = imitation_loss + self.config[ "vf_loss_coeff"] * vf_loss # Initialize TFPolicyGraph loss_in = [ (SampleBatch.ACTIONS, actions), (SampleBatch.DONES, dones), # (BEHAVIOUR_LOGITS, behaviour_logits), (SampleBatch.REWARDS, rewards), (SampleBatch.CUR_OBS, observations), (SampleBatch.PREV_ACTIONS, prev_actions), (SampleBatch.PREV_REWARDS, prev_rewards), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=observations, action_sampler=action_sampler, action_prob=sampled_prob, loss=self.total_loss, model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"], batch_divisibility_req=self.config["sample_batch_size"]) self._loss_input_dict = dict(self._loss_inputs, state_in=self._state_inputs) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { LEARNER_STATS_KEY: { "cur_lr": tf.cast(self.cur_lr, tf.float64), "imitation_loss": imitation_loss, #"entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": vf_loss, "vf_explained_var": explained_variance( tf.reshape(td_lambda.extra.discounted_returns, [-1]), tf.reshape(baseline_values, [-1])), }, "state_out": self.model.state_out, }
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config) if not isinstance(action_space, Box): raise UnsupportedSpaceException( "Action space {} is not supported for DDPG.".format( action_space)) self.config = config self.cur_epsilon = 1.0 self.dim_actions = action_space.shape[0] self.low_action = action_space.low self.high_action = action_space.high # create global step for counting the number of update operations self.global_step = tf.train.get_or_create_global_step() # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") self.eps = tf.placeholder(tf.float32, (), name="eps") self.cur_observations = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape, name="cur_obs") # Actor: P (policy) network with tf.variable_scope(P_SCOPE) as scope: p_values, self.p_model = self._build_p_network( self.cur_observations, observation_space, action_space) self.p_func_vars = _scope_vars(scope.name) # Noise vars for P network except for layer normalization vars if self.config["parameter_noise"]: self._build_parameter_noise([ var for var in self.p_func_vars if "LayerNorm" not in var.name ]) # Action outputs with tf.variable_scope(A_SCOPE): self.output_actions = self._build_action_network( p_values, self.stochastic, self.eps) if self.config["smooth_target_policy"]: self.reset_noise_op = tf.no_op() else: with tf.variable_scope(A_SCOPE, reuse=True): exploration_sample = tf.get_variable(name="ornstein_uhlenbeck") self.reset_noise_op = tf.assign(exploration_sample, self.dim_actions * [.0]) # Replay inputs self.obs_t = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape, name="observation") self.act_t = tf.placeholder( tf.float32, shape=(None, ) + action_space.shape, name="action") self.rew_t = tf.placeholder(tf.float32, [None], name="reward") self.obs_tp1 = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) self.done_mask = tf.placeholder(tf.float32, [None], name="done") self.importance_weights = tf.placeholder( tf.float32, [None], name="weight") # p network evaluation with tf.variable_scope(P_SCOPE, reuse=True) as scope: prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) self.p_t, _ = self._build_p_network(self.obs_t, observation_space, action_space) p_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) # target p network evaluation with tf.variable_scope(P_TARGET_SCOPE) as scope: p_tp1, _ = self._build_p_network(self.obs_tp1, observation_space, action_space) target_p_func_vars = _scope_vars(scope.name) # Action outputs with tf.variable_scope(A_SCOPE, reuse=True): output_actions = self._build_action_network( self.p_t, stochastic=tf.constant(value=False, dtype=tf.bool), eps=.0) output_actions_estimated = self._build_action_network( p_tp1, stochastic=tf.constant( value=self.config["smooth_target_policy"], dtype=tf.bool), eps=.0, is_target=True) # q network evaluation prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) with tf.variable_scope(Q_SCOPE) as scope: q_t, self.q_model = self._build_q_network( self.obs_t, observation_space, action_space, self.act_t) self.q_func_vars = _scope_vars(scope.name) self.stats = { "mean_q": tf.reduce_mean(q_t), "max_q": tf.reduce_max(q_t), "min_q": tf.reduce_min(q_t), } with tf.variable_scope(Q_SCOPE, reuse=True): q_tp0, _ = self._build_q_network(self.obs_t, observation_space, action_space, output_actions) if self.config["twin_q"]: with 
tf.variable_scope(TWIN_Q_SCOPE) as scope: twin_q_t, self.twin_q_model = self._build_q_network( self.obs_t, observation_space, action_space, self.act_t) self.twin_q_func_vars = _scope_vars(scope.name) q_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) # target q network evalution with tf.variable_scope(Q_TARGET_SCOPE) as scope: q_tp1, _ = self._build_q_network(self.obs_tp1, observation_space, action_space, output_actions_estimated) target_q_func_vars = _scope_vars(scope.name) if self.config["twin_q"]: with tf.variable_scope(TWIN_Q_TARGET_SCOPE) as scope: twin_q_tp1, _ = self._build_q_network( self.obs_tp1, observation_space, action_space, output_actions_estimated) twin_target_q_func_vars = _scope_vars(scope.name) if self.config["twin_q"]: self.loss = self._build_actor_critic_loss( q_t, q_tp1, q_tp0, twin_q_t=twin_q_t, twin_q_tp1=twin_q_tp1) else: self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0) if config["l2_reg"] is not None: for var in self.p_func_vars: if "bias" not in var.name: self.loss.actor_loss += ( config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) for var in self.q_func_vars: if "bias" not in var.name: self.loss.critic_loss += ( config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) if self.config["twin_q"]: for var in self.twin_q_func_vars: if "bias" not in var.name: self.loss.critic_loss += ( config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) # update_target_fn will be called periodically to copy Q network to # target Q network self.tau_value = config.get("tau") self.tau = tf.placeholder(tf.float32, (), name="tau") update_target_expr = [] for var, var_target in zip( sorted(self.q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) if self.config["twin_q"]: for var, var_target in zip( sorted(self.twin_q_func_vars, key=lambda v: v.name), sorted(twin_target_q_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) for var, var_target in zip( sorted(self.p_func_vars, key=lambda v: v.name), sorted(target_p_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) self.update_target_expr = tf.group(*update_target_expr) self.sess = tf.get_default_session() self.loss_inputs = [ ("obs", self.obs_t), ("actions", self.act_t), ("rewards", self.rew_t), ("new_obs", self.obs_tp1), ("dones", self.done_mask), ("weights", self.importance_weights), ] input_dict = dict(self.loss_inputs) # Model self-supervised losses self.loss.actor_loss = self.p_model.custom_loss( self.loss.actor_loss, input_dict) self.loss.critic_loss = self.q_model.custom_loss( self.loss.critic_loss, input_dict) if self.config["twin_q"]: self.loss.critic_loss = self.twin_q_model.custom_loss( self.loss.critic_loss, input_dict) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=self.cur_observations, action_sampler=self.output_actions, loss=self.loss.actor_loss + self.loss.critic_loss, loss_inputs=self.loss_inputs, update_ops=q_batchnorm_update_ops + p_batchnorm_update_ops) self.sess.run(tf.global_variables_initializer()) # Note that this encompasses both the policy and Q-value networks and # their corresponding target networks self.variables = ray.experimental.tf_utils.TensorFlowVariables( tf.group(q_tp0, q_tp1), self.sess) # Hard initial update self.update_target(tau=1.0)
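# Hedged restatement of the soft target update assembled above: each target
# variable is moved toward its online counterpart by a factor tau, i.e.
# theta_target <- tau * theta + (1 - tau) * theta_target. A tau of 1.0 (used
# for the hard initial update) copies the online weights outright. The helper
# name is illustrative, not taken from the code above.
import tensorflow as tf

def soft_update_op(online_vars, target_vars, tau):
    return tf.group(*[
        target.assign(tau * online + (1.0 - tau) * target)
        for online, target in zip(online_vars, target_vars)
    ])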
def __init__(self, observation_space, action_space, config, existing_inputs=None): """ Arguments: observation_space: Environment observation space specification. action_space: Environment action space specification. config (dict): Configuration values for PPO graph. existing_inputs (list): Optional list of tuples that specify the placeholders upon which the graph should be built upon. """ self.sess = tf.get_default_session() self.action_space = action_space self.config = config self.kl_coeff_val = self.config["kl_coeff"] self.kl_target = self.config["kl_target"] dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space) if existing_inputs: obs_ph, value_targets_ph, adv_ph, act_ph, \ logits_ph, vf_preds_ph = existing_inputs[:6] existing_state_in = existing_inputs[6:-1] existing_seq_lens = existing_inputs[-1] else: obs_ph = tf.placeholder( tf.float32, name="obs", shape=(None,)+observation_space.shape) adv_ph = tf.placeholder( tf.float32, name="advantages", shape=(None,)) act_ph = ModelCatalog.get_action_placeholder(action_space) logits_ph = tf.placeholder( tf.float32, name="logits", shape=(None, logit_dim)) vf_preds_ph = tf.placeholder( tf.float32, name="vf_preds", shape=(None,)) value_targets_ph = tf.placeholder( tf.float32, name="value_targets", shape=(None,)) existing_state_in = None existing_seq_lens = None self.loss_in = [ ("obs", obs_ph), ("value_targets", value_targets_ph), ("advantages", adv_ph), ("actions", act_ph), ("logits", logits_ph), ("vf_preds", vf_preds_ph), ] self.model = ModelCatalog.get_model( obs_ph, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) # KL Coefficient self.kl_coeff = tf.get_variable( initializer=tf.constant_initializer(self.kl_coeff_val), name="kl_coeff", shape=(), trainable=False, dtype=tf.float32) self.logits = self.model.outputs curr_action_dist = dist_cls(self.logits) self.sampler = curr_action_dist.sample() if self.config["use_gae"]: vf_config = self.config["model"].copy() # Do not split the last layer of the value function into # mean parameters and standard deviation parameters and # do not make the standard deviations free variables. vf_config["free_log_std"] = False vf_config["use_lstm"] = False with tf.variable_scope("value_function"): self.value_function = ModelCatalog.get_model( obs_ph, 1, vf_config).outputs self.value_function = tf.reshape(self.value_function, [-1]) else: self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1]) self.loss_obj = PPOLoss( action_space, value_targets_ph, adv_ph, act_ph, logits_ph, vf_preds_ph, curr_action_dist, self.value_function, self.kl_coeff, entropy_coeff=self.config["entropy_coeff"], clip_param=self.config["clip_param"], vf_loss_coeff=self.config["kl_target"], use_gae=self.config["use_gae"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=obs_ph, action_sampler=self.sampler, loss=self.loss_obj.loss, loss_inputs=self.loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, seq_lens=self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config, existing_inputs=None): """ Arguments: observation_space: Environment observation space specification. action_space: Environment action space specification. config (dict): Configuration values for PPO graph. existing_inputs (list): Optional list of tuples that specify the placeholders upon which the graph should be built upon. """ self.sess = tf.get_default_session() self.action_space = action_space self.config = config self.kl_coeff_val = self.config["kl_coeff"] self.kl_target = self.config["kl_target"] dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space) if existing_inputs: self.loss_in = existing_inputs obs_ph, value_targets_ph, adv_ph, act_ph, \ logprobs_ph, vf_preds_ph = [ph for _, ph in existing_inputs] else: obs_ph = tf.placeholder(tf.float32, name="obs", shape=(None, ) + observation_space.shape) # Targets of the value function. value_targets_ph = tf.placeholder(tf.float32, name="value_targets", shape=(None, )) # Advantage values in the policy gradient estimator. adv_ph = tf.placeholder(tf.float32, name="advantages", shape=(None, )) act_ph = ModelCatalog.get_action_placeholder(action_space) # Log probabilities from the policy before the policy update. logprobs_ph = tf.placeholder(tf.float32, name="logprobs", shape=(None, logit_dim)) # Value function predictions before the policy update. vf_preds_ph = tf.placeholder(tf.float32, name="vf_preds", shape=(None, )) self.loss_in = [("obs", obs_ph), ("value_targets", value_targets_ph), ("advantages", adv_ph), ("actions", act_ph), ("logprobs", logprobs_ph), ("vf_preds", vf_preds_ph)] # TODO(ekl) feed RNN states in here # KL Coefficient self.kl_coeff = tf.get_variable(initializer=tf.constant_initializer( self.kl_coeff_val), name="kl_coeff", shape=(), trainable=False, dtype=tf.float32) self.logits = ModelCatalog.get_model(obs_ph, logit_dim, self.config["model"]).outputs curr_action_dist = dist_cls(self.logits) self.sampler = curr_action_dist.sample() if self.config["use_gae"]: vf_config = self.config["model"].copy() # Do not split the last layer of the value function into # mean parameters and standard deviation parameters and # do not make the standard deviations free variables. vf_config["free_log_std"] = False with tf.variable_scope("value_function"): self.value_function = ModelCatalog.get_model( obs_ph, 1, vf_config).outputs self.value_function = tf.reshape(self.value_function, [-1]) else: self.value_function = tf.constant("NA") self.loss_obj = PPOLoss(action_space, value_targets_ph, adv_ph, act_ph, logprobs_ph, vf_preds_ph, curr_action_dist, self.value_function, self.kl_coeff, entropy_coeff=self.config["entropy_coeff"], clip_param=self.config["clip_param"], vf_loss_coeff=self.config["kl_target"], use_gae=self.config["use_gae"]) self.is_training = tf.placeholder_with_default(True, ()) TFPolicyGraph.__init__(self, observation_space, action_space, self.sess, obs_input=obs_ph, action_sampler=self.sampler, loss=self.loss_obj.loss, loss_inputs=self.loss_in, is_training=self.is_training)
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) if not isinstance(action_space, Discrete): raise UnsupportedSpaceException( "Action space {} is not supported for DQN.".format( action_space)) self.config = config self.cur_epsilon = 1.0 self.num_actions = action_space.n # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") self.eps = tf.placeholder(tf.float32, (), name="eps") self.cur_observations = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) # Action Q network with tf.variable_scope(Q_SCOPE) as scope: q_values, q_logits, q_dist, _ = self._build_q_network( self.cur_observations, observation_space, action_space) self.q_values = q_values self.q_func_vars = _scope_vars(scope.name) # Noise vars for Q network except for layer normalization vars if self.config["parameter_noise"]: self._build_parameter_noise([ var for var in self.q_func_vars if "LayerNorm" not in var.name ]) self.action_probs = tf.nn.softmax(self.q_values) # Action outputs self.output_actions, self.action_prob = self._build_q_value_policy( q_values) # Replay inputs self.obs_t = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) self.act_t = tf.placeholder(tf.int32, [None], name="action") self.rew_t = tf.placeholder(tf.float32, [None], name="reward") self.obs_tp1 = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) self.done_mask = tf.placeholder(tf.float32, [None], name="done") self.importance_weights = tf.placeholder( tf.float32, [None], name="weight") # q network evaluation with tf.variable_scope(Q_SCOPE, reuse=True): prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) q_t, q_logits_t, q_dist_t, model = self._build_q_network( self.obs_t, observation_space, action_space) q_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) # target q network evalution with tf.variable_scope(Q_TARGET_SCOPE) as scope: q_tp1, q_logits_tp1, q_dist_tp1, _ = self._build_q_network( self.obs_tp1, observation_space, action_space) self.target_q_func_vars = _scope_vars(scope.name) # q scores for actions which we know were selected in the given state. 
one_hot_selection = tf.one_hot(self.act_t, self.num_actions) q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1) q_logits_t_selected = tf.reduce_sum( q_logits_t * tf.expand_dims(one_hot_selection, -1), 1) # compute estimate of best possible value starting from state at t + 1 if config["double_q"]: with tf.variable_scope(Q_SCOPE, reuse=True): q_tp1_using_online_net, q_logits_tp1_using_online_net, \ q_dist_tp1_using_online_net, _ = self._build_q_network( self.obs_tp1, observation_space, action_space) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best_one_hot_selection = tf.one_hot( q_tp1_best_using_online_net, self.num_actions) q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1) q_dist_tp1_best = tf.reduce_sum( q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1), 1) else: q_tp1_best_one_hot_selection = tf.one_hot( tf.argmax(q_tp1, 1), self.num_actions) q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1) q_dist_tp1_best = tf.reduce_sum( q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1), 1) self.loss = self._build_q_loss(q_t_selected, q_logits_t_selected, q_tp1_best, q_dist_tp1_best) # update_target_fn will be called periodically to copy Q network to # target Q network update_target_expr = [] for var, var_target in zip( sorted(self.q_func_vars, key=lambda v: v.name), sorted(self.target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) self.update_target_expr = tf.group(*update_target_expr) # initialize TFPolicyGraph self.sess = tf.get_default_session() self.loss_inputs = [ ("obs", self.obs_t), ("actions", self.act_t), ("rewards", self.rew_t), ("new_obs", self.obs_tp1), ("dones", self.done_mask), ("weights", self.importance_weights), ] TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=self.cur_observations, action_sampler=self.output_actions, action_prob=self.action_prob, loss=self.loss.loss, model=model, loss_inputs=self.loss_inputs, update_ops=q_batchnorm_update_ops) self.sess.run(tf.global_variables_initializer())
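# Hedged sketch of the double-Q selection used above when config["double_q"]
# is set: the online network chooses the argmax action at s_{t+1}, while the
# target network supplies that action's value, which reduces the
# overestimation bias of plain max_a Q_target(s_{t+1}, a). The helper name is
# illustrative only.
import tensorflow as tf

def double_q_values(q_tp1_online, q_tp1_target, num_actions):
    best_actions = tf.argmax(q_tp1_online, axis=1)
    selection = tf.one_hot(best_actions, num_actions)
    return tf.reduce_sum(q_tp1_target * selection, axis=1)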
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) self.config = config dist_cls, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Action inputs self.obs_t = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) prev_actions_ph = ModelCatalog.get_action_placeholder(action_space) prev_rewards_ph = tf.placeholder( tf.float32, [None], name="prev_reward") with tf.variable_scope(P_SCOPE) as scope: self.model = ModelCatalog.get_model({ "obs": self.obs_t, "prev_actions": prev_actions_ph, "prev_rewards": prev_rewards_ph, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"]) logits = self.model.outputs self.p_func_vars = _scope_vars(scope.name) # Action outputs action_dist = dist_cls(logits) self.output_actions = action_dist.sample() # Training inputs self.act_t = tf.placeholder(tf.int32, [None], name="action") self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward") # v network evaluation with tf.variable_scope(V_SCOPE) as scope: state_values = self.model.value_function() self.v_func_vars = _scope_vars(scope.name) self.v_loss = self._build_value_loss(state_values, self.cum_rew_t) self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t, logits, self.act_t, action_space) # which kind of objective to optimize objective = ( self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss) self.explained_variance = tf.reduce_mean( explained_variance(self.cum_rew_t, state_values)) # initialize TFPolicyGraph self.sess = tf.get_default_session() self.loss_inputs = [ ("obs", self.obs_t), ("actions", self.act_t), ("advantages", self.cum_rew_t), ] TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=self.obs_t, action_sampler=self.output_actions, action_prob=action_dist.sampled_action_prob(), loss=objective, model=self.model, loss_inputs=self.loss_inputs, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions_ph, prev_reward_input=prev_rewards_ph) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { "total_loss": objective, "vf_explained_var": self.explained_variance, "policy_loss": self.p_loss.loss, "vf_loss": self.v_loss.loss }
def __init__(self,
             obs_space,
             action_space,
             config,
             loss_fn,
             stats_fn=None,
             grad_stats_fn=None,
             before_loss_init=None,
             make_action_sampler=None,
             existing_inputs=None,
             get_batch_divisibility_req=None):
    """Initialize a dynamic TF policy graph.

    Arguments:
        obs_space (gym.Space): Observation space of the policy.
        action_space (gym.Space): Action space of the policy.
        config (dict): Policy-specific configuration data.
        loss_fn (func): function that returns a loss tensor given the
            policy graph and a dict of experience tensor placeholders
        stats_fn (func): optional function that returns a dict of TF
            fetches given the policy graph and batch input tensors
        grad_stats_fn (func): optional function that returns a dict of TF
            fetches given the policy graph and loss gradient tensors
        before_loss_init (func): optional function to run prior to loss
            init that takes the same arguments as __init__
        make_action_sampler (func): optional function that returns a tuple
            of action and action prob tensors. The function takes
            (policy, input_dict, obs_space, action_space, config) as its
            arguments
        existing_inputs (OrderedDict): when copying a policy graph, this
            specifies an existing dict of placeholders to use instead of
            defining new ones
        get_batch_divisibility_req (func): optional function that returns
            the divisibility requirement for sample batches
    """
    self.config = config
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn

    # Setup standard placeholders
    if existing_inputs is not None:
        obs = existing_inputs[SampleBatch.CUR_OBS]
        prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
        prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
    else:
        obs = tf.placeholder(
            tf.float32,
            shape=[None] + list(obs_space.shape),
            name="observation")
        prev_actions = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards = tf.placeholder(
            tf.float32, [None], name="prev_reward")

    input_dict = {
        "obs": obs,
        "prev_actions": prev_actions,
        "prev_rewards": prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }

    # Create the model network and action outputs
    if make_action_sampler:
        assert not existing_inputs, \
            "Cloning not supported with custom action sampler"
        self.model = None
        self.dist_class = None
        self.action_dist = None
        action_sampler, action_prob = make_action_sampler(
            self, input_dict, obs_space, action_space, config)
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        if existing_inputs:
            existing_state_in = [
                v for k, v in existing_inputs.items()
                if k.startswith("state_in_")
            ]
            if existing_state_in:
                existing_seq_lens = existing_inputs["seq_lens"]
            else:
                existing_seq_lens = None
        else:
            existing_state_in = []
            existing_seq_lens = None
        self.model = ModelCatalog.get_model(
            input_dict,
            obs_space,
            action_space,
            logit_dim,
            self.config["model"],
            state_in=existing_state_in,
            seq_lens=existing_seq_lens)
        self.action_dist = self.dist_class(self.model.outputs)
        action_sampler = self.action_dist.sample()
        action_prob = self.action_dist.sampled_action_prob()

    # Phase 1 init
    sess = tf.get_default_session()
    if get_batch_divisibility_req:
        batch_divisibility_req = get_batch_divisibility_req(self)
    else:
        batch_divisibility_req = 1
    TFPolicyGraph.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_sampler,
        action_prob=action_prob,
        loss=None,  # dynamically initialized on run
        loss_inputs=[],
        model=self.model,
        state_inputs=self.model and self.model.state_in,
        state_outputs=self.model and self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model and self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"],
        batch_divisibility_req=batch_divisibility_req)

    # Phase 2 init
    before_loss_init(self, obs_space, action_space, config)
    if not existing_inputs:
        self._initialize_loss()
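# Hedged illustration of the loss_fn contract described in the docstring
# above: it receives the policy graph and a dict of experience tensor
# placeholders and returns a scalar loss tensor. The batch keys used here
# ("actions", "advantages") are assumptions about what postprocessing
# provides, not part of the documented API.
import tensorflow as tf

def example_pg_loss(policy_graph, batch_tensors):
    logp = policy_graph.action_dist.logp(batch_tensors["actions"])
    return -tf.reduce_mean(logp * batch_tensors["advantages"])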
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None,
             unsupType='action',
             designHead='universe'):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    self.unsup = unsupType is not None
    predictor = None
    numaction = action_space.n
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config

    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    obs_ph = tf.placeholder(
        tf.float32, name="obs", shape=(None, ) + observation_space.shape)
    act_ph = ModelCatalog.get_action_placeholder(action_space)
    self.observations = obs_ph
    self.actions = act_ph

    # print("Observations space:", observation_space)
    print("Observations shape:", tf.shape(self.observations)[0])  # tf.shape(x)[0]
    print("Observations shape:", self.observations.shape[1:])
    print("obs_ph shape:", tf.shape(obs_ph)[:1])

    with tf.variable_scope("predictor"):
        if 'state' in unsupType:
            self.local_ap_network = predictor = StatePredictor(
                tf.shape(self.observations)[0], numaction, designHead,
                unsupType)
        else:
            self.local_ap_network = predictor = StateActionPredictor(
                tf.shape(self.observations)[0], numaction, designHead)

    self.logits = self.local_ap_network.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()

    self.loss_obj = ICMLoss(action_space, unsupType, predictor)
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])

    self.loss_in = [
        ("obs", obs_ph),
        ("actions", act_ph),
        ("phi1", self.local_ap_network.s1),
        ("phi2", self.local_ap_network.s2),
        ("asample", self.local_ap_network.asample),
    ]
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.predloss,
        loss_inputs=self.loss_in,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.predloss
        # "policy_loss": self.loss_obj.mean_policy_loss,
        # "vf_loss": self.loss_obj.mean_vf_loss,
        # "vf_explained_var": self.explained_variance,
        # "kl": self.loss_obj.mean_kl,
        # "entropy": self.loss_obj.mean_entropy
    }
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) if not isinstance(action_space, Discrete): raise UnsupportedSpaceException( "Action space {} is not supported for DQN.".format( action_space)) self.config = config self.cur_epsilon = 1.0 num_actions = action_space.n def _build_q_network(obs): return QNetwork(ModelCatalog.get_model(obs, 1, config["model"]), num_actions, config["dueling"], config["hiddens"]).value # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") self.eps = tf.placeholder(tf.float32, (), name="eps") self.cur_observations = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape) # Action Q network with tf.variable_scope(Q_SCOPE) as scope: q_values = _build_q_network(self.cur_observations) self.q_func_vars = _scope_vars(scope.name) # Action outputs self.output_actions = QValuePolicy(q_values, self.cur_observations, num_actions, self.stochastic, self.eps).action # Replay inputs self.obs_t = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape) self.act_t = tf.placeholder(tf.int32, [None], name="action") self.rew_t = tf.placeholder(tf.float32, [None], name="reward") self.obs_tp1 = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape) self.done_mask = tf.placeholder(tf.float32, [None], name="done") self.importance_weights = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation with tf.variable_scope(Q_SCOPE, reuse=True): q_t = _build_q_network(self.obs_t) # target q network evalution with tf.variable_scope(Q_TARGET_SCOPE) as scope: q_tp1 = _build_q_network(self.obs_tp1) self.target_q_func_vars = _scope_vars(scope.name) # q scores for actions which we know were selected in the given state. q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if config["double_q"]: with tf.variable_scope(Q_SCOPE, reuse=True): q_tp1_using_online_net = _build_q_network(self.obs_tp1) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights, self.rew_t, self.done_mask, config["gamma"], config["n_step"]) # update_target_fn will be called periodically to copy Q network to # target Q network update_target_expr = [] for var, var_target in zip( sorted(self.q_func_vars, key=lambda v: v.name), sorted(self.target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) self.update_target_expr = tf.group(*update_target_expr) # initialize TFPolicyGraph self.sess = tf.get_default_session() self.loss_inputs = [ ("obs", self.obs_t), ("actions", self.act_t), ("rewards", self.rew_t), ("new_obs", self.obs_tp1), ("dones", self.done_mask), ("weights", self.importance_weights), ] self.is_training = tf.placeholder_with_default(True, ()) TFPolicyGraph.__init__(self, observation_space, action_space, self.sess, obs_input=self.cur_observations, action_sampler=self.output_actions, loss=self.loss.loss, loss_inputs=self.loss_inputs, is_training=self.is_training) self.sess.run(tf.global_variables_initializer())
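# Hedged note on the target that a QLoss like the one above is normally built
# around (its internals are not shown here): an n-step TD target of the form
#   target = r_t + gamma**n * (1 - done) * max_a Q_target(s_{t+n}, a),
# compared against q_t_selected and weighted by the importance weights when
# prioritized replay is used. The helper below restates that target only as
# an assumption about QLoss, not as its actual implementation; it operates
# elementwise on TF tensors or numpy arrays.
def n_step_td_target(rewards, dones, q_tp1_best, gamma, n_step):
    return rewards + gamma**n_step * (1.0 - dones) * q_tp1_best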
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config) self.config = config self.sess = tf.get_default_session() # Setup the policy self.observations = tf.placeholder( tf.float32, [None] + list(observation_space.shape)) dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_actions = ModelCatalog.get_action_placeholder(action_space) prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model({ "obs": self.observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, logit_dim, self.config["model"]) action_dist = dist_class(self.model.outputs) self.vf = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) # Setup the policy loss if isinstance(action_space, gym.spaces.Box): ac_size = action_space.shape[0] actions = tf.placeholder(tf.float32, [None, ac_size], name="ac") elif isinstance(action_space, gym.spaces.Discrete): actions = tf.placeholder(tf.int64, [None], name="ac") else: raise UnsupportedSpaceException( "Action space {} is not supported for A3C.".format( action_space)) advantages = tf.placeholder(tf.float32, [None], name="advantages") self.v_target = tf.placeholder(tf.float32, [None], name="v_target") self.loss = A3CLoss(action_dist, actions, advantages, self.v_target, self.vf, self.config["vf_loss_coeff"], self.config["entropy_coeff"]) # Initialize TFPolicyGraph loss_in = [ ("obs", self.observations), ("actions", actions), ("prev_actions", prev_actions), ("prev_rewards", prev_rewards), ("advantages", advantages), ("value_targets", self.v_target), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=self.observations, action_sampler=action_dist.sample(), loss=self.model.loss() + self.loss.total_loss, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"]) self.stats_fetches = { "stats": { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "policy_entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance(self.v_target, self.vf), }, } self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config) if not isinstance(action_space, Box): raise UnsupportedSpaceException( "Action space {} is not supported for DDPG.".format( action_space)) self.config = config self.cur_noise_scale = 1.0 self.cur_pure_exploration_phase = False self.dim_actions = action_space.shape[0] self.low_action = action_space.low self.high_action = action_space.high # create global step for counting the number of update operations self.global_step = tf.train.get_or_create_global_step() # use separate optimizers for actor & critic self._actor_optimizer = tf.train.AdamOptimizer( learning_rate=self.config["actor_lr"]) self._critic_optimizer = tf.train.AdamOptimizer( learning_rate=self.config["critic_lr"]) # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale") self.pure_exploration_phase = tf.placeholder( tf.bool, (), name="pure_exploration_phase") self.cur_observations = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape, name="cur_obs") with tf.variable_scope(POLICY_SCOPE) as scope: policy_out, self.policy_model = self._build_policy_network( self.cur_observations, observation_space, action_space) self.policy_vars = _scope_vars(scope.name) # Noise vars for P network except for layer normalization vars if self.config["parameter_noise"]: self._build_parameter_noise([ var for var in self.policy_vars if "LayerNorm" not in var.name ]) # Action outputs with tf.variable_scope(ACTION_SCOPE): self.output_actions = self._add_exploration_noise( policy_out, self.stochastic, self.noise_scale, self.pure_exploration_phase, action_space) if self.config["smooth_target_policy"]: self.reset_noise_op = tf.no_op() else: with tf.variable_scope(ACTION_SCOPE, reuse=True): exploration_sample = tf.get_variable(name="ornstein_uhlenbeck") self.reset_noise_op = tf.assign(exploration_sample, self.dim_actions * [.0]) # Replay inputs self.obs_t = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape, name="observation") self.act_t = tf.placeholder(tf.float32, shape=(None, ) + action_space.shape, name="action") self.rew_t = tf.placeholder(tf.float32, [None], name="reward") self.obs_tp1 = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape) self.done_mask = tf.placeholder(tf.float32, [None], name="done") self.importance_weights = tf.placeholder(tf.float32, [None], name="weight") # policy network evaluation with tf.variable_scope(POLICY_SCOPE, reuse=True) as scope: prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) self.policy_t, _ = self._build_policy_network( self.obs_t, observation_space, action_space) policy_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) # target policy network evaluation with tf.variable_scope(POLICY_TARGET_SCOPE) as scope: policy_tp1, _ = self._build_policy_network(self.obs_tp1, observation_space, action_space) target_policy_vars = _scope_vars(scope.name) # Action outputs with tf.variable_scope(ACTION_SCOPE, reuse=True): if config["smooth_target_policy"]: target_noise_clip = self.config["target_noise_clip"] clipped_normal_sample = tf.clip_by_value( tf.random_normal(tf.shape(policy_tp1), stddev=self.config["target_noise"]), -target_noise_clip, target_noise_clip) policy_tp1_smoothed = tf.clip_by_value( policy_tp1 + clipped_normal_sample, action_space.low * tf.ones_like(policy_tp1), action_space.high * 
tf.ones_like(policy_tp1)) else: # no smoothing, just use deterministic actions policy_tp1_smoothed = policy_tp1 # q network evaluation prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) with tf.variable_scope(Q_SCOPE) as scope: # Q-values for given actions & observations in given current q_t, self.q_model = self._build_q_network(self.obs_t, observation_space, action_space, self.act_t) self.q_func_vars = _scope_vars(scope.name) self.stats = { "mean_q": tf.reduce_mean(q_t), "max_q": tf.reduce_max(q_t), "min_q": tf.reduce_min(q_t), } with tf.variable_scope(Q_SCOPE, reuse=True): # Q-values for current policy (no noise) in given current state q_t_det_policy, _ = self._build_q_network(self.obs_t, observation_space, action_space, self.policy_t) if self.config["twin_q"]: with tf.variable_scope(TWIN_Q_SCOPE) as scope: twin_q_t, self.twin_q_model = self._build_q_network( self.obs_t, observation_space, action_space, self.act_t) self.twin_q_func_vars = _scope_vars(scope.name) q_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) # target q network evaluation with tf.variable_scope(Q_TARGET_SCOPE) as scope: q_tp1, _ = self._build_q_network(self.obs_tp1, observation_space, action_space, policy_tp1_smoothed) target_q_func_vars = _scope_vars(scope.name) if self.config["twin_q"]: with tf.variable_scope(TWIN_Q_TARGET_SCOPE) as scope: twin_q_tp1, _ = self._build_q_network(self.obs_tp1, observation_space, action_space, policy_tp1_smoothed) twin_target_q_func_vars = _scope_vars(scope.name) if self.config["twin_q"]: self.critic_loss, self.actor_loss, self.td_error \ = self._build_actor_critic_loss( q_t, q_tp1, q_t_det_policy, twin_q_t=twin_q_t, twin_q_tp1=twin_q_tp1) else: self.critic_loss, self.actor_loss, self.td_error \ = self._build_actor_critic_loss( q_t, q_tp1, q_t_det_policy) if config["l2_reg"] is not None: for var in self.policy_vars: if "bias" not in var.name: self.actor_loss += (config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) for var in self.q_func_vars: if "bias" not in var.name: self.critic_loss += (config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) if self.config["twin_q"]: for var in self.twin_q_func_vars: if "bias" not in var.name: self.critic_loss += (config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) # update_target_fn will be called periodically to copy Q network to # target Q network self.tau_value = config.get("tau") self.tau = tf.placeholder(tf.float32, (), name="tau") update_target_expr = [] for var, var_target in zip( sorted(self.q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) if self.config["twin_q"]: for var, var_target in zip( sorted(self.twin_q_func_vars, key=lambda v: v.name), sorted(twin_target_q_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) for var, var_target in zip( sorted(self.policy_vars, key=lambda v: v.name), sorted(target_policy_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) self.update_target_expr = tf.group(*update_target_expr) self.sess = tf.get_default_session() self.loss_inputs = [ (SampleBatch.CUR_OBS, self.obs_t), (SampleBatch.ACTIONS, self.act_t), (SampleBatch.REWARDS, self.rew_t), (SampleBatch.NEXT_OBS, self.obs_tp1), (SampleBatch.DONES, self.done_mask), (PRIO_WEIGHTS, self.importance_weights), ] input_dict = dict(self.loss_inputs) if 
self.config["use_state_preprocessor"]: # Model self-supervised losses self.actor_loss = self.policy_model.custom_loss( self.actor_loss, input_dict) self.critic_loss = self.q_model.custom_loss( self.critic_loss, input_dict) if self.config["twin_q"]: self.critic_loss = self.twin_q_model.custom_loss( self.critic_loss, input_dict) TFPolicyGraph.__init__(self, observation_space, action_space, self.sess, obs_input=self.cur_observations, action_sampler=self.output_actions, loss=self.actor_loss + self.critic_loss, loss_inputs=self.loss_inputs, update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops) self.sess.run(tf.global_variables_initializer()) # Note that this encompasses both the policy and Q-value networks and # their corresponding target networks self.variables = ray.experimental.tf_utils.TensorFlowVariables( tf.group(q_t_det_policy, q_tp1), self.sess) # Hard initial update self.update_target(tau=1.0)
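# Numpy sketch of the TD3-style target-policy smoothing enabled by
# config["smooth_target_policy"] above: clipped Gaussian noise is added to the
# target policy's action and the result is clipped back into the action bounds.
import numpy as np

def smooth_target_action(policy_tp1, low, high, target_noise=0.2,
                         target_noise_clip=0.5):
    noise = np.clip(
        np.random.normal(0.0, target_noise, size=policy_tp1.shape),
        -target_noise_clip, target_noise_clip)
    return np.clip(policy_tp1 + noise, low, high)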
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config) if not isinstance(action_space, Box): raise UnsupportedSpaceException( "Action space {} is not supported for DDPG.".format( action_space)) self.config = config self.cur_epsilon = 1.0 self.dim_actions = action_space.shape[0] self.low_action = action_space.low self.high_action = action_space.high # create global step for counting the number of update operations self.global_step = tf.train.get_or_create_global_step() # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") self.eps = tf.placeholder(tf.float32, (), name="eps") self.cur_observations = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape, name="cur_obs") # Actor: P (policy) network with tf.variable_scope(P_SCOPE) as scope: p_values, self.p_model = self._build_p_network( self.cur_observations, observation_space) self.p_func_vars = _scope_vars(scope.name) # Noise vars for P network except for layer normalization vars if self.config["parameter_noise"]: self._build_parameter_noise([ var for var in self.p_func_vars if "LayerNorm" not in var.name ]) # Action outputs with tf.variable_scope(A_SCOPE): self.output_actions = self._build_action_network( p_values, self.stochastic, self.eps) if self.config["smooth_target_policy"]: self.reset_noise_op = tf.no_op() else: with tf.variable_scope(A_SCOPE, reuse=True): exploration_sample = tf.get_variable(name="ornstein_uhlenbeck") self.reset_noise_op = tf.assign(exploration_sample, self.dim_actions * [.0]) # Replay inputs self.obs_t = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape, name="observation") self.act_t = tf.placeholder(tf.float32, shape=(None, ) + action_space.shape, name="action") self.rew_t = tf.placeholder(tf.float32, [None], name="reward") self.obs_tp1 = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape) self.done_mask = tf.placeholder(tf.float32, [None], name="done") self.importance_weights = tf.placeholder(tf.float32, [None], name="weight") # p network evaluation with tf.variable_scope(P_SCOPE, reuse=True) as scope: prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) self.p_t, _ = self._build_p_network(self.obs_t, observation_space) p_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) # target p network evaluation with tf.variable_scope(P_TARGET_SCOPE) as scope: p_tp1, _ = self._build_p_network(self.obs_tp1, observation_space) target_p_func_vars = _scope_vars(scope.name) # Action outputs with tf.variable_scope(A_SCOPE, reuse=True): output_actions = self._build_action_network(self.p_t, stochastic=tf.constant( value=False, dtype=tf.bool), eps=.0) output_actions_estimated = self._build_action_network( p_tp1, stochastic=tf.constant( value=self.config["smooth_target_policy"], dtype=tf.bool), eps=.0, is_target=True) # q network evaluation prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) with tf.variable_scope(Q_SCOPE) as scope: q_t, self.q_model = self._build_q_network(self.obs_t, observation_space, self.act_t) self.q_func_vars = _scope_vars(scope.name) self.stats = { "mean_q": tf.reduce_mean(q_t), "max_q": tf.reduce_max(q_t), "min_q": tf.reduce_min(q_t), } with tf.variable_scope(Q_SCOPE, reuse=True): q_tp0, _ = self._build_q_network(self.obs_t, observation_space, output_actions) if self.config["twin_q"]: with tf.variable_scope(TWIN_Q_SCOPE) as scope: twin_q_t, self.twin_q_model = self._build_q_network( 
self.obs_t, observation_space, self.act_t) self.twin_q_func_vars = _scope_vars(scope.name) q_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) # target q network evalution with tf.variable_scope(Q_TARGET_SCOPE) as scope: q_tp1, _ = self._build_q_network(self.obs_tp1, observation_space, output_actions_estimated) target_q_func_vars = _scope_vars(scope.name) if self.config["twin_q"]: with tf.variable_scope(TWIN_Q_TARGET_SCOPE) as scope: twin_q_tp1, _ = self._build_q_network( self.obs_tp1, observation_space, output_actions_estimated) twin_target_q_func_vars = _scope_vars(scope.name) if self.config["twin_q"]: self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0, twin_q_t=twin_q_t, twin_q_tp1=twin_q_tp1) else: self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0) if config["l2_reg"] is not None: for var in self.p_func_vars: if "bias" not in var.name: self.loss.actor_loss += (config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) for var in self.q_func_vars: if "bias" not in var.name: self.loss.critic_loss += (config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) if self.config["twin_q"]: for var in self.twin_q_func_vars: if "bias" not in var.name: self.loss.critic_loss += (config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) # Model self-supervised losses self.loss.actor_loss = self.p_model.custom_loss(self.loss.actor_loss) self.loss.critic_loss = self.q_model.custom_loss(self.loss.critic_loss) if self.config["twin_q"]: self.loss.critic_loss = self.twin_q_model.custom_loss( self.loss.critic_loss) # update_target_fn will be called periodically to copy Q network to # target Q network self.tau_value = config.get("tau") self.tau = tf.placeholder(tf.float32, (), name="tau") update_target_expr = [] for var, var_target in zip( sorted(self.q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) if self.config["twin_q"]: for var, var_target in zip( sorted(self.twin_q_func_vars, key=lambda v: v.name), sorted(twin_target_q_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) for var, var_target in zip( sorted(self.p_func_vars, key=lambda v: v.name), sorted(target_p_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) self.update_target_expr = tf.group(*update_target_expr) self.sess = tf.get_default_session() self.loss_inputs = [ ("obs", self.obs_t), ("actions", self.act_t), ("rewards", self.rew_t), ("new_obs", self.obs_tp1), ("dones", self.done_mask), ("weights", self.importance_weights), ] TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=self.cur_observations, action_sampler=self.output_actions, loss=self.loss.actor_loss + self.loss.critic_loss, loss_inputs=self.loss_inputs, update_ops=q_batchnorm_update_ops + p_batchnorm_update_ops) self.sess.run(tf.global_variables_initializer()) # Note that this encompasses both the policy and Q-value networks and # their corresponding target networks self.variables = ray.experimental.tf_utils.TensorFlowVariables( tf.group(q_tp0, q_tp1), self.sess) # Hard initial update self.update_target(tau=1.0)
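# Simplified numpy stand-in for the role of the twin Q-networks built above
# (config["twin_q"], TD3-style clipped double-Q): the TD target is based on the
# minimum of the two target critics to curb overestimation. The exact target
# construction lives inside _build_actor_critic_loss.
import numpy as np

def twin_q_td_target(rewards, dones, q_tp1, twin_q_tp1, gamma=0.99):
    q_tp1_min = np.minimum(q_tp1, twin_q_tp1)
    return rewards + gamma * (1.0 - dones) * q_tp1_min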
def __init__(self, observation_space, action_space, config, existing_inputs=None): config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) assert config["batch_mode"] == "truncate_episodes", \ "Must use `truncate_episodes` batch mode with V-trace." self.config = config self.sess = tf.get_default_session() self.grads = None if isinstance(action_space, gym.spaces.Discrete): is_multidiscrete = False output_hidden_shape = [action_space.n] elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete): is_multidiscrete = True output_hidden_shape = action_space.nvec.astype(np.int32) elif self.config["vtrace"]: raise UnsupportedSpaceException( "Action space {} is not supported for APPO + VTrace.".format(action_space)) else: is_multidiscrete = False output_hidden_shape = 1 # Policy network model dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Create input placeholders if existing_inputs: if self.config["vtrace"]: actions, dones, behaviour_logits, rewards, observations, \ prev_actions, prev_rewards = existing_inputs[:7] existing_state_in = existing_inputs[7:-1] existing_seq_lens = existing_inputs[-1] else: actions, dones, behaviour_logits, rewards, observations, \ prev_actions, prev_rewards, adv_ph, value_targets = \ existing_inputs[:9] existing_state_in = existing_inputs[9:-1] existing_seq_lens = existing_inputs[-1] else: actions = ModelCatalog.get_action_placeholder(action_space) dones = tf.placeholder(tf.bool, [None], name="dones") rewards = tf.placeholder(tf.float32, [None], name="rewards") behaviour_logits = tf.placeholder( tf.float32, [None, logit_dim], name="behaviour_logits") observations = tf.placeholder( tf.float32, [None] + list(observation_space.shape)) existing_state_in = None existing_seq_lens = None if not self.config["vtrace"]: adv_ph = tf.placeholder( tf.float32, name="advantages", shape=(None, )) value_targets = tf.placeholder( tf.float32, name="value_targets", shape=(None, )) self.observations = observations # Unpack behaviour logits unpacked_behaviour_logits = tf.split( behaviour_logits, output_hidden_shape, axis=1) # Setup the policy dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_actions = ModelCatalog.get_action_placeholder(action_space) prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( { "obs": observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) unpacked_outputs = tf.split( self.model.outputs, output_hidden_shape, axis=1) dist_inputs = unpacked_outputs if is_multidiscrete else \ self.model.outputs prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \ behaviour_logits action_dist = dist_class(dist_inputs) prev_action_dist = dist_class(prev_dist_inputs) values = self.model.value_function() self.value_function = values self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) def make_time_major(tensor, drop_last=False): """Swaps batch and trajectory axis. Args: tensor: A tensor or list of tensors to reshape. drop_last: A bool indicating whether to drop the last trajectory item. Returns: res: A tensor with swapped axes or a list of tensors with swapped axes.
""" if isinstance(tensor, list): return [make_time_major(t, drop_last) for t in tensor] if self.model.state_init: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut # boundaries. TODO(ekl) this is kind of a hack T = self.config["sample_batch_size"] B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) # swap B and T axes res = tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) if drop_last: return res[:-1] return res if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(rewards) # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. if self.config["vtrace"]: logger.info("Using V-Trace surrogate loss (vtrace=True)") # Prepare actions for loss loss_actions = actions if is_multidiscrete else tf.expand_dims( actions, axis=1) self.loss = VTraceSurrogateLoss( actions=make_time_major(loss_actions, drop_last=True), prev_actions_logp=make_time_major( prev_action_dist.logp(actions), drop_last=True), actions_logp=make_time_major( action_dist.logp(actions), drop_last=True), action_kl=prev_action_dist.kl(action_dist), actions_entropy=make_time_major( action_dist.entropy(), drop_last=True), dones=make_time_major(dones, drop_last=True), behaviour_logits=make_time_major( unpacked_behaviour_logits, drop_last=True), target_logits=make_time_major( unpacked_outputs, drop_last=True), discount=config["gamma"], rewards=make_time_major(rewards, drop_last=True), values=make_time_major(values, drop_last=True), bootstrap_value=make_time_major(values)[-1], valid_mask=make_time_major(mask, drop_last=True), vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=self.config[ "vtrace_clip_pg_rho_threshold"], clip_param=self.config["clip_param"]) else: logger.info("Using PPO surrogate loss (vtrace=False)") self.loss = PPOSurrogateLoss( prev_actions_logp=make_time_major( prev_action_dist.logp(actions)), actions_logp=make_time_major(action_dist.logp(actions)), action_kl=prev_action_dist.kl(action_dist), actions_entropy=make_time_major(action_dist.entropy()), values=make_time_major(values), valid_mask=make_time_major(mask), advantages=make_time_major(adv_ph), value_targets=make_time_major(value_targets), vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_param=self.config["clip_param"]) # KL divergence between worker and learner logits for debugging model_dist = MultiCategorical(unpacked_outputs) behaviour_dist = MultiCategorical(unpacked_behaviour_logits) kls = model_dist.kl(behaviour_dist) if len(kls) > 1: self.KL_stats = {} for i, kl in enumerate(kls): self.KL_stats.update({ "mean_KL_{}".format(i): tf.reduce_mean(kl), "max_KL_{}".format(i): tf.reduce_max(kl), "median_KL_{}".format(i): tf.contrib.distributions. 
percentile(kl, 50.0), }) else: self.KL_stats = { "mean_KL": tf.reduce_mean(kls[0]), "max_KL": tf.reduce_max(kls[0]), "median_KL": tf.contrib.distributions.percentile(kls[0], 50.0), } # Initialize TFPolicyGraph loss_in = [ ("actions", actions), ("dones", dones), ("behaviour_logits", behaviour_logits), ("rewards", rewards), ("obs", observations), ("prev_actions", prev_actions), ("prev_rewards", prev_rewards), ] if not self.config["vtrace"]: loss_in.append(("advantages", adv_ph)) loss_in.append(("value_targets", value_targets)) LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), loss=self.loss.total_loss, model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"], batch_divisibility_req=self.config["sample_batch_size"]) self.sess.run(tf.global_variables_initializer()) values_batched = make_time_major( values, drop_last=self.config["vtrace"]) self.stats_fetches = { "stats": dict({ "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(self.loss.value_targets, [-1]), tf.reshape(values_batched, [-1])), }, **self.KL_stats), }
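# Numpy sketch of the [B * T] -> [T, B] reordering that make_time_major
# performs above, assuming the flat batch is laid out as B rollouts of a fixed
# length T (the snippet derives B and T from seq_lens or sample_batch_size).
import numpy as np

def to_time_major(flat, rollout_len):
    B = flat.shape[0] // rollout_len
    batched = flat.reshape((B, rollout_len) + flat.shape[1:])
    return batched.swapaxes(0, 1)  # -> [T, B, ...]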
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config) if not isinstance(action_space, Box): raise UnsupportedSpaceException( "Action space {} is not supported for DDPG.".format( action_space)) self.config = config self.cur_epsilon = 1.0 self.dim_actions = action_space.shape[0] self.low_action = action_space.low self.high_action = action_space.high self.actor_optimizer = tf.train.AdamOptimizer( learning_rate=config["actor_lr"]) self.critic_optimizer = tf.train.AdamOptimizer( learning_rate=config["critic_lr"]) # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") self.eps = tf.placeholder(tf.float32, (), name="eps") self.cur_observations = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) # Actor: P (policy) network with tf.variable_scope(P_SCOPE) as scope: p_values = self._build_p_network(self.cur_observations, observation_space) self.p_func_vars = _scope_vars(scope.name) # Action outputs with tf.variable_scope(A_SCOPE): self.output_actions = self._build_action_network( p_values, self.stochastic, self.eps) with tf.variable_scope(A_SCOPE, reuse=True): exploration_sample = tf.get_variable(name="ornstein_uhlenbeck") self.reset_noise_op = tf.assign(exploration_sample, self.dim_actions * [.0]) # Replay inputs self.obs_t = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape, name="observation") self.act_t = tf.placeholder( tf.float32, shape=(None, ) + action_space.shape, name="action") self.rew_t = tf.placeholder(tf.float32, [None], name="reward") self.obs_tp1 = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) self.done_mask = tf.placeholder(tf.float32, [None], name="done") self.importance_weights = tf.placeholder( tf.float32, [None], name="weight") # p network evaluation with tf.variable_scope(P_SCOPE, reuse=True) as scope: self.p_t = self._build_p_network(self.obs_t, observation_space) # target p network evaluation with tf.variable_scope(P_TARGET_SCOPE) as scope: p_tp1 = self._build_p_network(self.obs_tp1, observation_space) target_p_func_vars = _scope_vars(scope.name) # Action outputs with tf.variable_scope(A_SCOPE, reuse=True): deterministic_flag = tf.constant(value=False, dtype=tf.bool) zero_eps = tf.constant(value=.0, dtype=tf.float32) output_actions = self._build_action_network( self.p_t, deterministic_flag, zero_eps) output_actions_estimated = self._build_action_network( p_tp1, deterministic_flag, zero_eps) # q network evaluation with tf.variable_scope(Q_SCOPE) as scope: q_t = self._build_q_network(self.obs_t, observation_space, self.act_t) self.q_func_vars = _scope_vars(scope.name) with tf.variable_scope(Q_SCOPE, reuse=True): q_tp0 = self._build_q_network(self.obs_t, observation_space, output_actions) # target q network evalution with tf.variable_scope(Q_TARGET_SCOPE) as scope: q_tp1 = self._build_q_network(self.obs_tp1, observation_space, output_actions_estimated) target_q_func_vars = _scope_vars(scope.name) self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0) if config["l2_reg"] is not None: for var in self.p_func_vars: if "bias" not in var.name: self.loss.actor_loss += ( config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) for var in self.q_func_vars: if "bias" not in var.name: self.loss.critic_loss += ( config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) # update_target_fn will be called periodically to copy Q network to # target Q network self.tau_value = config.get("tau") self.tau = tf.placeholder(tf.float32, (), name="tau") 
update_target_expr = [] for var, var_target in zip( sorted(self.q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) for var, var_target in zip( sorted(self.p_func_vars, key=lambda v: v.name), sorted(target_p_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) self.update_target_expr = tf.group(*update_target_expr) self.sess = tf.get_default_session() self.loss_inputs = [ ("obs", self.obs_t), ("actions", self.act_t), ("rewards", self.rew_t), ("new_obs", self.obs_tp1), ("dones", self.done_mask), ("weights", self.importance_weights), ] TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=self.cur_observations, action_sampler=self.output_actions, loss=self.loss.total_loss, loss_inputs=self.loss_inputs) self.sess.run(tf.global_variables_initializer()) # Note that this encompasses both the policy and Q-value networks and # their corresponding target networks self.variables = ray.experimental.TensorFlowVariables( tf.group(q_tp0, q_tp1), self.sess) # Hard initial update self.update_target(tau=1.0)
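# Plain-Python stand-in for the soft ("Polyak") target update expressed by the
# assign ops above: each target variable moves a fraction tau toward its online
# counterpart, and tau=1.0 reproduces the hard initial copy.
def soft_update(online_vars, target_vars, tau):
    for name, w in online_vars.items():
        target_vars[name] = tau * w + (1.0 - tau) * target_vars[name]
    return target_vars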
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config) self.config = config self.sess = tf.get_default_session() # Setup the policy self.observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) self.prev_actions = ModelCatalog.get_action_placeholder(action_space) self.prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( { "obs": self.observations, "prev_actions": self.prev_actions, "prev_rewards": self.prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"]) action_dist = dist_class(self.model.outputs) self.vf = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) # Setup the policy loss if isinstance(action_space, gym.spaces.Box): ac_size = action_space.shape[0] actions = tf.placeholder(tf.float32, [None, ac_size], name="ac") elif isinstance(action_space, gym.spaces.Discrete): actions = tf.placeholder(tf.int64, [None], name="ac") else: raise UnsupportedSpaceException( "Action space {} is not supported for A3C.".format( action_space)) advantages = tf.placeholder(tf.float32, [None], name="advantages") self.v_target = tf.placeholder(tf.float32, [None], name="v_target") self.loss = A3CLoss(action_dist, actions, advantages, self.v_target, self.vf, self.config["vf_loss_coeff"], self.config["entropy_coeff"]) # Initialize TFPolicyGraph loss_in = [ ("obs", self.observations), ("actions", actions), ("prev_actions", self.prev_actions), ("prev_rewards", self.prev_rewards), ("advantages", advantages), ("value_targets", self.v_target), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__(self, observation_space, action_space, self.sess, obs_input=self.observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), loss=self.loss.total_loss, model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=self.prev_actions, prev_reward_input=self.prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"]) self.stats_fetches = { LEARNER_STATS_KEY: { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "policy_entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance(self.v_target, self.vf), }, } self.sess.run(tf.global_variables_initializer())
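# Rough sketch of how the three A3CLoss terms reported in stats_fetches are
# conventionally combined; the exact weighting and signs live inside A3CLoss
# and may differ, so treat this only as the textbook form.
def a3c_total_loss(pi_loss, vf_loss, entropy, vf_loss_coeff, entropy_coeff):
    # entropy enters negatively so that higher entropy lowers the loss
    return pi_loss + vf_loss_coeff * vf_loss - entropy_coeff * entropy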
def __init__(self, observation_space, action_space, config, existing_inputs=None): """ Arguments: observation_space: Environment observation space specification. action_space: Environment action space specification. config (dict): Configuration values for PPO graph. existing_inputs (list): Optional list of tuples that specify the placeholders upon which the graph should be built upon. """ config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config) self.sess = tf.get_default_session() self.action_space = action_space self.config = config self.kl_coeff_val = self.config["kl_coeff"] self.kl_target = self.config["kl_target"] dist_cls, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) if existing_inputs: obs_ph, value_targets_ph, adv_ph, act_ph, \ logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \ existing_inputs[:8] existing_state_in = existing_inputs[8:-1] existing_seq_lens = existing_inputs[-1] else: obs_ph = tf.placeholder( tf.float32, name="obs", shape=(None, ) + observation_space.shape) adv_ph = tf.placeholder( tf.float32, name="advantages", shape=(None, )) act_ph = ModelCatalog.get_action_placeholder(action_space) logits_ph = tf.placeholder( tf.float32, name="logits", shape=(None, logit_dim)) vf_preds_ph = tf.placeholder( tf.float32, name="vf_preds", shape=(None, )) value_targets_ph = tf.placeholder( tf.float32, name="value_targets", shape=(None, )) prev_actions_ph = ModelCatalog.get_action_placeholder(action_space) prev_rewards_ph = tf.placeholder( tf.float32, [None], name="prev_reward") existing_state_in = None existing_seq_lens = None self.observations = obs_ph self.prev_actions = prev_actions_ph self.prev_rewards = prev_rewards_ph self.loss_in = [ ("obs", obs_ph), ("value_targets", value_targets_ph), ("advantages", adv_ph), ("actions", act_ph), ("logits", logits_ph), ("vf_preds", vf_preds_ph), ("prev_actions", prev_actions_ph), ("prev_rewards", prev_rewards_ph), ] self.model = ModelCatalog.get_model( { "obs": obs_ph, "prev_actions": prev_actions_ph, "prev_rewards": prev_rewards_ph, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) # KL Coefficient self.kl_coeff = tf.get_variable( initializer=tf.constant_initializer(self.kl_coeff_val), name="kl_coeff", shape=(), trainable=False, dtype=tf.float32) self.logits = self.model.outputs curr_action_dist = dist_cls(self.logits) self.sampler = curr_action_dist.sample() if self.config["use_gae"]: if self.config["vf_share_layers"]: self.value_function = self.model.value_function() else: vf_config = self.config["model"].copy() # Do not split the last layer of the value function into # mean parameters and standard deviation parameters and # do not make the standard deviations free variables. vf_config["free_log_std"] = False if vf_config["use_lstm"]: vf_config["use_lstm"] = False logger.warning( "It is not recommended to use a LSTM model with " "vf_share_layers=False (consider setting it to True). 
" "If you want to not share layers, you can implement " "a custom LSTM model that overrides the " "value_function() method.") with tf.variable_scope("value_function"): self.value_function = ModelCatalog.get_model({ "obs": obs_ph, "prev_actions": prev_actions_ph, "prev_rewards": prev_rewards_ph, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, 1, vf_config).outputs self.value_function = tf.reshape(self.value_function, [-1]) else: self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1]) if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(adv_ph, dtype=tf.bool) self.loss_obj = PPOLoss( action_space, value_targets_ph, adv_ph, act_ph, logits_ph, vf_preds_ph, curr_action_dist, self.value_function, self.kl_coeff, mask, entropy_coeff=self.config["entropy_coeff"], clip_param=self.config["clip_param"], vf_clip_param=self.config["vf_clip_param"], vf_loss_coeff=self.config["vf_loss_coeff"], use_gae=self.config["use_gae"]) LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=obs_ph, action_sampler=self.sampler, action_prob=curr_action_dist.sampled_action_prob(), loss=self.loss_obj.loss, model=self.model, loss_inputs=self.loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions_ph, prev_reward_input=prev_rewards_ph, seq_lens=self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) self.sess.run(tf.global_variables_initializer()) self.explained_variance = explained_variance(value_targets_ph, self.value_function) self.stats_fetches = { "cur_kl_coeff": self.kl_coeff, "cur_lr": tf.cast(self.cur_lr, tf.float64), "total_loss": self.loss_obj.loss, "policy_loss": self.loss_obj.mean_policy_loss, "vf_loss": self.loss_obj.mean_vf_loss, "vf_explained_var": self.explained_variance, "kl": self.loss_obj.mean_kl, "entropy": self.loss_obj.mean_entropy }
def __init__(self, observation_space, action_space, config, existing_inputs=None): config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) assert config["batch_mode"] == "truncate_episodes", \ "Must use `truncate_episodes` batch mode with V-trace." self.config = config self.sess = tf.get_default_session() # Create input placeholders if existing_inputs: actions, dones, behaviour_logits, rewards, observations, \ prev_actions, prev_rewards = existing_inputs[:7] existing_state_in = existing_inputs[7:-1] existing_seq_lens = existing_inputs[-1] else: if isinstance(action_space, gym.spaces.Discrete): ac_size = action_space.n actions = tf.placeholder(tf.int64, [None], name="ac") else: raise UnsupportedSpaceException( "Action space {} is not supported for IMPALA.".format( action_space)) dones = tf.placeholder(tf.bool, [None], name="dones") rewards = tf.placeholder(tf.float32, [None], name="rewards") behaviour_logits = tf.placeholder(tf.float32, [None, ac_size], name="behaviour_logits") observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) existing_state_in = None existing_seq_lens = None # Setup the policy dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_actions = ModelCatalog.get_action_placeholder(action_space) prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( { "obs": observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) action_dist = dist_class(self.model.outputs) values = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) def to_batches(tensor): if self.model.state_init: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut # boundaries. TODO(ekl) this is kind of a hack T = self.config["sample_batch_size"] B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) # swap B and T axes return tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(rewards, dtype=tf.bool) # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. 
self.loss = VTraceLoss( actions=to_batches(actions)[:-1], actions_logp=to_batches(action_dist.logp(actions))[:-1], actions_entropy=to_batches(action_dist.entropy())[:-1], dones=to_batches(dones)[:-1], behaviour_logits=to_batches(behaviour_logits)[:-1], target_logits=to_batches(self.model.outputs)[:-1], discount=config["gamma"], rewards=to_batches(rewards)[:-1], values=to_batches(values)[:-1], bootstrap_value=to_batches(values)[-1], valid_mask=to_batches(mask)[:-1], vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"]) # KL divergence between worker and learner logits for debugging model_dist = Categorical(self.model.outputs) behaviour_dist = Categorical(behaviour_logits) self.KLs = model_dist.kl(behaviour_dist) self.mean_KL = tf.reduce_mean(self.KLs) self.max_KL = tf.reduce_max(self.KLs) self.median_KL = tf.contrib.distributions.percentile(self.KLs, 50.0) # Initialize TFPolicyGraph loss_in = [ ("actions", actions), ("dones", dones), ("behaviour_logits", behaviour_logits), ("rewards", rewards), ("obs", observations), ("prev_actions", prev_actions), ("prev_rewards", prev_rewards), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), loss=self.model.loss() + self.loss.total_loss, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"], batch_divisibility_req=self.config["sample_batch_size"]) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { "stats": { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(self.loss.vtrace_returns.vs, [-1]), tf.reshape(to_batches(values)[:-1], [-1])), "mean_KL": self.mean_KL, "max_KL": self.max_KL, "median_KL": self.median_KL, }, }
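# Simplified numpy sketch of the V-trace value targets that VTraceLoss computes
# internally (Espeholt et al., 2018), ignoring episode boundaries for brevity.
# rhos and cs are the truncated importance weights min(rho_bar, pi/mu) and
# min(c_bar, pi/mu) for a single rollout of length T.
import numpy as np

def vtrace_targets(rewards, values, bootstrap_value, rhos, cs, gamma=0.99):
    values_tp1 = np.append(values[1:], bootstrap_value)
    deltas = rhos * (rewards + gamma * values_tp1 - values)
    vs_minus_v = np.zeros_like(values)
    acc = 0.0
    for t in reversed(range(len(rewards))):
        acc = deltas[t] + gamma * cs[t] * acc
        vs_minus_v[t] = acc
    return values + vs_minus_v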
def __init__(self, observation_space, action_space, config, existing_inputs=None): config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) assert config["batch_mode"] == "truncate_episodes", \ "Must use `truncate_episodes` batch mode with V-trace." self.config = config self.sess = tf.get_default_session() self.grads = None if isinstance(action_space, gym.spaces.Discrete): is_multidiscrete = False output_hidden_shape = [action_space.n] elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete): is_multidiscrete = True output_hidden_shape = action_space.nvec.astype(np.int32) elif self.config["vtrace"]: raise UnsupportedSpaceException( "Action space {} is not supported for APPO + VTrace.".format(action_space)) else: is_multidiscrete = False output_hidden_shape = 1 # Policy network model dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Create input placeholders if existing_inputs: if self.config["vtrace"]: actions, dones, behaviour_logits, rewards, observations, \ prev_actions, prev_rewards = existing_inputs[:7] existing_state_in = existing_inputs[7:-1] existing_seq_lens = existing_inputs[-1] else: actions, dones, behaviour_logits, rewards, observations, \ prev_actions, prev_rewards, adv_ph, value_targets = \ existing_inputs[:9] existing_state_in = existing_inputs[9:-1] existing_seq_lens = existing_inputs[-1] else: actions = ModelCatalog.get_action_placeholder(action_space) dones = tf.placeholder(tf.bool, [None], name="dones") rewards = tf.placeholder(tf.float32, [None], name="rewards") behaviour_logits = tf.placeholder(tf.float32, [None, logit_dim], name="behaviour_logits") observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) existing_state_in = None existing_seq_lens = None if not self.config["vtrace"]: adv_ph = tf.placeholder(tf.float32, name="advantages", shape=(None, )) value_targets = tf.placeholder(tf.float32, name="value_targets", shape=(None, )) self.observations = observations # Unpack behaviour logits unpacked_behaviour_logits = tf.split(behaviour_logits, output_hidden_shape, axis=1) # Setup the policy dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_actions = ModelCatalog.get_action_placeholder(action_space) prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( { "obs": observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) unpacked_outputs = tf.split(self.model.outputs, output_hidden_shape, axis=1) dist_inputs = unpacked_outputs if is_multidiscrete else \ self.model.outputs prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \ behaviour_logits action_dist = dist_class(dist_inputs) prev_action_dist = dist_class(prev_dist_inputs) values = self.model.value_function() self.value_function = values self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) def make_time_major(tensor, drop_last=False): """Swaps batch and trajectory axis. Args: tensor: A tensor or list of tensors to reshape. drop_last: A bool indicating whether to drop the last trajectory item. Returns: res: A tensor with swapped axes or a list of tensors with swapped axes.
""" if isinstance(tensor, list): return [make_time_major(t, drop_last) for t in tensor] if self.model.state_init: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut # boundaries. TODO(ekl) this is kind of a hack T = self.config["sample_batch_size"] B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) # swap B and T axes res = tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) if drop_last: return res[:-1] return res if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(rewards) # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. if self.config["vtrace"]: logger.info("Using V-Trace surrogate loss (vtrace=True)") # Prepare actions for loss loss_actions = actions if is_multidiscrete else tf.expand_dims( actions, axis=1) self.loss = VTraceSurrogateLoss( actions=make_time_major(loss_actions, drop_last=True), prev_actions_logp=make_time_major( prev_action_dist.logp(actions), drop_last=True), actions_logp=make_time_major(action_dist.logp(actions), drop_last=True), action_kl=prev_action_dist.kl(action_dist), actions_entropy=make_time_major(action_dist.entropy(), drop_last=True), dones=make_time_major(dones, drop_last=True), behaviour_logits=make_time_major(unpacked_behaviour_logits, drop_last=True), target_logits=make_time_major(unpacked_outputs, drop_last=True), discount=config["gamma"], rewards=make_time_major(rewards, drop_last=True), values=make_time_major(values, drop_last=True), bootstrap_value=make_time_major(values)[-1], valid_mask=make_time_major(mask, drop_last=True), vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=self. 
config["vtrace_clip_pg_rho_threshold"], clip_param=self.config["clip_param"]) else: logger.info("Using PPO surrogate loss (vtrace=False)") self.loss = PPOSurrogateLoss( prev_actions_logp=make_time_major( prev_action_dist.logp(actions)), actions_logp=make_time_major(action_dist.logp(actions)), action_kl=prev_action_dist.kl(action_dist), actions_entropy=make_time_major(action_dist.entropy()), values=make_time_major(values), valid_mask=make_time_major(mask), advantages=make_time_major(adv_ph), value_targets=make_time_major(value_targets), vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_param=self.config["clip_param"]) # KL divergence between worker and learner logits for debugging model_dist = MultiCategorical(unpacked_outputs) behaviour_dist = MultiCategorical(unpacked_behaviour_logits) kls = model_dist.kl(behaviour_dist) if len(kls) > 1: self.KL_stats = {} for i, kl in enumerate(kls): self.KL_stats.update({ "mean_KL_{}".format(i): tf.reduce_mean(kl), "max_KL_{}".format(i): tf.reduce_max(kl), "median_KL_{}".format(i): tf.contrib.distributions.percentile(kl, 50.0), }) else: self.KL_stats = { "mean_KL": tf.reduce_mean(kls[0]), "max_KL": tf.reduce_max(kls[0]), "median_KL": tf.contrib.distributions.percentile(kls[0], 50.0), } # Initialize TFPolicyGraph loss_in = [ ("actions", actions), ("dones", dones), ("behaviour_logits", behaviour_logits), ("rewards", rewards), ("obs", observations), ("prev_actions", prev_actions), ("prev_rewards", prev_rewards), ] if not self.config["vtrace"]: loss_in.append(("advantages", adv_ph)) loss_in.append(("value_targets", value_targets)) LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), loss=self.loss.total_loss, model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"], batch_divisibility_req=self.config["sample_batch_size"]) self.sess.run(tf.global_variables_initializer()) values_batched = make_time_major(values, drop_last=self.config["vtrace"]) self.stats_fetches = { "stats": dict( { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(self.loss.value_targets, [-1]), tf.reshape(values_batched, [-1])), }, **self.KL_stats), }
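# Numpy sketch of the clipped-surrogate idea behind the PPOSurrogateLoss used
# in the vtrace=False branch above; the importance ratio is clipped to
# [1 - clip_param, 1 + clip_param] and the surrogate is maximized (its negative
# enters the loss). KL, value, and entropy terms are omitted here.
import numpy as np

def ppo_clipped_surrogate(logp_new, logp_old, advantages, clip_param=0.3):
    ratio = np.exp(logp_new - logp_old)
    clipped = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param)
    return np.mean(np.minimum(ratio * advantages, clipped * advantages))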
def __init__(self, observation_space, action_space, config, unsupType='action', envWrap=False, designHead='universe', noReward=False): """ An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments. Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism. But overall, we'll define the model, specify its inputs, and describe how the policy gradients step should be computed. """ self.unsup = unsupType is not None self.cur_batch = None predictor = None numaction = action_space.n config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config) self.config = config self.sess = tf.get_default_session() # Setup the policy # ===================================================================== self.observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # NOTE: value function and trainable variables are defined in self.model # Define the policy network self.model = pi = ModelCatalog.get_model(self.observations, logit_dim, self.config["model"]) action_dist = dist_class(self.model.outputs) # Define S/S+A predictor network if self.unsup: with tf.variable_scope("predictor"): if 'state' in unsupType: self.local_ap_network = predictor = StatePredictor( observation_space.shape, numaction, designHead, unsupType) else: self.local_ap_network = predictor = StateActionPredictor( observation_space.shape, numaction, designHead) # Setup the policy loss # ===================================================================== if isinstance(action_space, gym.spaces.Box): ac_size = action_space.shape[0] actions = tf.placeholder(tf.float32, [None, ac_size], name="ac") elif isinstance(action_space, gym.spaces.Discrete): actions = tf.placeholder(tf.int64, [None], name="ac") else: raise UnsupportedSpaceException( "Action space {} is not supported for A3C.".format( action_space)) advantages = tf.placeholder(tf.float32, [None], name="advantages") self.v_target = tf.placeholder(tf.float32, [None], name="v_target") # compute policy loss and predictor loss self.loss = A3CLoss(action_dist, actions, advantages, self.v_target, self.model.vf, unsupType, predictor, self.config["vf_loss_coeff"], self.config["entropy_coeff"]) # Initialize TFPolicyGraph loss_in = [ ("obs", self.observations), ("actions", actions), ("advantages", advantages), ("value_targets", self.v_target), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__(self, observation_space, action_space, self.sess, obs_input=self.observations, action_sampler=action_dist.sample(), loss=self.loss.total_loss, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"]) self.stats_fetches = { "stats": { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "policy_entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.model.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance(self.v_target, self.model.vf), }, } self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) self.config = config dist_cls, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Action inputs self.obs_t = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape) prev_actions_ph = ModelCatalog.get_action_placeholder(action_space) prev_rewards_ph = tf.placeholder(tf.float32, [None], name="prev_reward") with tf.variable_scope(POLICY_SCOPE) as scope: self.model = ModelCatalog.get_model( { "obs": self.obs_t, "prev_actions": prev_actions_ph, "prev_rewards": prev_rewards_ph, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"]) logits = self.model.outputs self.p_func_vars = _scope_vars(scope.name) # Action outputs action_dist = dist_cls(logits) self.output_actions = action_dist.sample() # Training inputs self.act_t = tf.placeholder(tf.int32, [None], name="action") self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward") # v network evaluation with tf.variable_scope(VALUE_SCOPE) as scope: state_values = self.model.value_function() self.v_func_vars = _scope_vars(scope.name) self.v_loss = self._build_value_loss(state_values, self.cum_rew_t) self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t, logits, self.act_t, action_space) # which kind of objective to optimize objective = (self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss) self.explained_variance = tf.reduce_mean( explained_variance(self.cum_rew_t, state_values)) # initialize TFPolicyGraph self.sess = tf.get_default_session() self.loss_inputs = [ (SampleBatch.CUR_OBS, self.obs_t), (SampleBatch.ACTIONS, self.act_t), (Postprocessing.ADVANTAGES, self.cum_rew_t), ] TFPolicyGraph.__init__(self, observation_space, action_space, self.sess, obs_input=self.obs_t, action_sampler=self.output_actions, action_prob=action_dist.sampled_action_prob(), loss=objective, model=self.model, loss_inputs=self.loss_inputs, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions_ph, prev_reward_input=prev_rewards_ph) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { "total_loss": objective, "vf_explained_var": self.explained_variance, "policy_loss": self.p_loss.loss, "vf_loss": self.v_loss.loss }
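# Numpy sketch of the explained_variance diagnostic reported above:
# 1 - Var(y - y_hat) / Var(y); 1.0 means the value predictions fully explain
# the return targets, 0.0 means they do no better than predicting the mean.
import numpy as np

def explained_variance_np(y, y_pred):
    var_y = np.var(y)
    if var_y == 0:
        return float("nan")
    return 1.0 - np.var(y - y_pred) / var_y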
def __init__(self, observation_space, action_space, config,
             existing_inputs=None, unsupType='action', designHead='universe'):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    self.unsup = unsupType is not None
    # self.cur_batch = None
    # self.cur_sample_batch = {}
    predictor = None
    numaction = action_space.n

    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph, phi1, phi2, asample = existing_inputs[:9]
        # TODO: updates to account for s1, s2 and asample
        existing_state_in = existing_inputs[9:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32, name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        phi1 = tf.placeholder(
            tf.float32, shape=(None, ) + observation_space.shape, name="phi1")
        phi2 = tf.placeholder(
            tf.float32, shape=(None, ) + observation_space.shape, name="phi2")
        asample = tf.placeholder(
            tf.float32, shape=(None, numaction), name="asample")
        existing_state_in = None
        existing_seq_lens = None
    self.observations = obs_ph

    if self.unsup:
        with tf.variable_scope("predictor"):
            if 'state' in unsupType:
                self.local_ap_network = predictor = StatePredictor(
                    phi1, phi2, asample, observation_space, numaction,
                    designHead, unsupType)
            else:
                self.local_ap_network = predictor = StateActionPredictor(
                    phi1, phi2, asample, observation_space, numaction,
                    designHead)

    self.model = pi = ModelCatalog.get_model(
        obs_ph, logit_dim, self.config["model"],
        state_in=existing_state_in, seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        if self.config["vf_share_layers"]:
            self.value_function = tf.reshape(
                linear(self.model.last_layer, 1, "value",
                       normc_initializer(1.0)), [-1])
        else:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            vf_config["use_lstm"] = False
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    obs_ph, 1, vf_config).outputs
                self.value_function = tf.reshape(self.value_function, [-1])
    else:
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        unsupType,
        predictor,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
        ("s1", phi1),
        ("s2", phi2),
        ("asample", asample),
    ]
    self.extra_inputs = ["s1", "s2", "asample"]
    # TODO: testing to see if this lets me pass inputs to ICM
    # self.variables = ray.experimental.TensorFlowVariables(
    #     self.loss_in, self.sess)

    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)
    self.stats_fetches = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy,
    }
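# PPOLoss is built around the standard clipped surrogate objective, here
# extended with a curiosity term from the predictor. As a rough reference,
# this NumPy sketch shows only the clipping logic; the exact weighting of the
# curiosity loss in this graph is not visible in the snippet above, so it is
# deliberately left out.
import numpy as np


def ppo_clipped_surrogate_sketch(new_logp, old_logp, advantages,
                                 clip_param=0.3):
    ratio = np.exp(new_logp - old_logp)               # pi_new(a|s) / pi_old(a|s)
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return -np.mean(np.minimum(unclipped, clipped))   # maximize => minimize negative


print(ppo_clipped_surrogate_sketch(
    new_logp=np.array([-0.9, -1.1, -0.4]),
    old_logp=np.array([-1.0, -1.0, -1.0]),
    advantages=np.array([1.0, -0.5, 2.0])))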
def __init__(self, observation_space, action_space, config,
             existing_inputs=None):
    """
    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built.
    """
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph = existing_inputs[:6]
        existing_state_in = existing_inputs[6:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32, name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        existing_state_in = None
        existing_seq_lens = None
    self.observations = obs_ph

    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
    ]
    self.model = ModelCatalog.get_model(
        obs_ph, logit_dim, self.config["model"],
        state_in=existing_state_in, seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        if self.config["vf_share_layers"]:
            self.value_function = tf.reshape(
                linear(self.model.last_layer, 1, "value",
                       normc_initializer(1.0)), [-1])
        else:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            vf_config["use_lstm"] = False
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    obs_ph, 1, vf_config).outputs
                self.value_function = tf.reshape(self.value_function, [-1])
    else:
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    # Mask out padded timesteps when the model is recurrent
    # (see the sketch after this function).
    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens)
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(adv_ph)

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        mask,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])

    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)
    self.stats_fetches = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy,
    }
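# A short NumPy sketch of the padding mask built from seq_lens above: RNN
# batches are padded to the max sequence length, and the mask excludes the
# padded timesteps before the loss terms are averaged. This mirrors the
# tf.sequence_mask + reshape construction, but is only an illustration.
import numpy as np


def sequence_mask_sketch(seq_lens):
    max_len = int(np.max(seq_lens))
    # mask[i, t] is True while t < seq_lens[i], then flattened like the graph does
    return (np.arange(max_len)[None, :] < np.asarray(seq_lens)[:, None]).reshape(-1)


def masked_mean(per_timestep_loss, mask):
    return np.sum(per_timestep_loss * mask) / np.maximum(np.sum(mask), 1)


mask = sequence_mask_sketch([3, 1])               # two sequences padded to length 3
loss = np.array([1.0, 1.0, 1.0, 5.0, 9.0, 9.0])   # padded entries carry garbage
print(mask)                                       # [ True  True  True  True False False]
print(masked_mean(loss, mask))                    # averages only the 4 valid steps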
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()

    # Setup the policy
    self.observations = tf.placeholder(
        tf.float32, [None] + list(observation_space.shape))
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.model = ModelCatalog.get_model(
        self.observations, logit_dim, self.config["model"])
    action_dist = dist_class(self.model.outputs)
    values = tf.reshape(
        linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
        [-1])
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    # Setup the policy loss
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        ac_size = action_space.n
        actions = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for IMPALA.".format(
                action_space))
    dones = tf.placeholder(tf.bool, [None], name="dones")
    rewards = tf.placeholder(tf.float32, [None], name="rewards")
    behaviour_logits = tf.placeholder(
        tf.float32, [None, ac_size], name="behaviour_logits")

    def to_batches(tensor):
        if self.config["model"]["use_lstm"]:
            B = tf.shape(self.model.seq_lens)[0]
            T = tf.shape(tensor)[0] // B
        else:
            # Important: chop the tensor into batches at known episode cut
            # boundaries. TODO(ekl) this is kind of a hack
            T = (self.config["sample_batch_size"] //
                 self.config["num_envs_per_worker"])
            B = tf.shape(tensor)[0] // T
        rs = tf.reshape(tensor,
                        tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))
        # swap B and T axes
        return tf.transpose(
            rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    self.loss = VTraceLoss(
        actions=to_batches(actions)[:-1],
        actions_logp=to_batches(action_dist.logp(actions))[:-1],
        actions_entropy=to_batches(action_dist.entropy())[:-1],
        dones=to_batches(dones)[:-1],
        behaviour_logits=to_batches(behaviour_logits)[:-1],
        target_logits=to_batches(self.model.outputs)[:-1],
        discount=config["gamma"],
        rewards=to_batches(rewards)[:-1],
        values=to_batches(values)[:-1],
        bootstrap_value=to_batches(values)[-1],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        entropy_coeff=self.config["entropy_coeff"],
        clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
        clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"])

    # Initialize TFPolicyGraph
    loss_in = [
        ("actions", actions),
        ("dones", dones),
        ("behaviour_logits", behaviour_logits),
        ("rewards", rewards),
        ("obs", self.observations),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self, observation_space, action_space, self.sess,
        obs_input=self.observations,
        action_sampler=action_dist.sample(),
        loss=self.loss.total_loss,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        "stats": {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(
                tf.reshape(self.loss.vtrace_returns.vs, [-1]),
                tf.reshape(to_batches(values)[:-1], [-1])),
        },
    }
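# A NumPy sketch of what to_batches does to a flattened rollout: a [B * T]
# tensor is reshaped to [B, T, ...] and then transposed to [T, B, ...], so
# dropping the last row ([:-1]) leaves the [T - 1, B] inputs V-trace expects,
# while the last row ([-1]) supplies the bootstrap values. Illustration only,
# with B and T passed explicitly instead of inferred from the config.
import numpy as np


def to_batches_sketch(flat, B, T):
    rs = flat.reshape((B, T) + flat.shape[1:])
    return np.swapaxes(rs, 0, 1)   # swap B and T axes -> [T, B, ...]


values_flat = np.arange(8.0)       # B=2 rollouts of T=4 steps, concatenated
values_tb = to_batches_sketch(values_flat, B=2, T=4)
print(values_tb[:-1])              # [T - 1, B] values fed to the V-trace loss
print(values_tb[-1])               # bootstrap value for each rollout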
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    self.config = config
    self.cur_epsilon = 1.0
    self.num_actions = action_space.n

    # Action inputs
    self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
    self.eps = tf.placeholder(tf.float32, (), name="eps")
    self.cur_observations = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)

    # Action Q network
    with tf.variable_scope(Q_SCOPE) as scope:
        q_values, q_logits, q_dist, _ = self._build_q_network(
            self.cur_observations, observation_space, action_space)
        self.q_values = q_values
        self.q_func_vars = _scope_vars(scope.name)

    # Noise vars for Q network except for layer normalization vars
    if self.config["parameter_noise"]:
        self._build_parameter_noise([
            var for var in self.q_func_vars if "LayerNorm" not in var.name
        ])
        self.action_probs = tf.nn.softmax(self.q_values)

    # Action outputs
    self.output_actions, self.action_prob = self._build_q_value_policy(
        q_values)

    # Replay inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    self.act_t = tf.placeholder(tf.int32, [None], name="action")
    self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
    self.obs_tp1 = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    self.done_mask = tf.placeholder(tf.float32, [None], name="done")
    self.importance_weights = tf.placeholder(
        tf.float32, [None], name="weight")

    # Q network evaluation
    with tf.variable_scope(Q_SCOPE, reuse=True):
        prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
        q_t, q_logits_t, q_dist_t, model = self._build_q_network(
            self.obs_t, observation_space, action_space)
        q_batchnorm_update_ops = list(
            set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)

    # Target Q network evaluation
    with tf.variable_scope(Q_TARGET_SCOPE) as scope:
        q_tp1, q_logits_tp1, q_dist_tp1, _ = self._build_q_network(
            self.obs_tp1, observation_space, action_space)
        self.target_q_func_vars = _scope_vars(scope.name)

    # Q scores for actions which we know were selected in the given state.
    one_hot_selection = tf.one_hot(self.act_t, self.num_actions)
    q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1)
    q_logits_t_selected = tf.reduce_sum(
        q_logits_t * tf.expand_dims(one_hot_selection, -1), 1)

    # Compute estimate of best possible value starting from state at t + 1
    # (see the double-Q sketch after this function).
    if config["double_q"]:
        with tf.variable_scope(Q_SCOPE, reuse=True):
            q_tp1_using_online_net, q_logits_tp1_using_online_net, \
                q_dist_tp1_using_online_net, _ = self._build_q_network(
                    self.obs_tp1, observation_space, action_space)
        q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
        q_tp1_best_one_hot_selection = tf.one_hot(
            q_tp1_best_using_online_net, self.num_actions)
        q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
        q_dist_tp1_best = tf.reduce_sum(
            q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1), 1)
    else:
        q_tp1_best_one_hot_selection = tf.one_hot(
            tf.argmax(q_tp1, 1), self.num_actions)
        q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
        q_dist_tp1_best = tf.reduce_sum(
            q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1), 1)

    self.loss = self._build_q_loss(q_t_selected, q_logits_t_selected,
                                   q_tp1_best, q_dist_tp1_best)

    # update_target_fn will be called periodically to copy Q network to
    # target Q network
    update_target_expr = []
    for var, var_target in zip(
            sorted(self.q_func_vars, key=lambda v: v.name),
            sorted(self.target_q_func_vars, key=lambda v: v.name)):
        update_target_expr.append(var_target.assign(var))
    self.update_target_expr = tf.group(*update_target_expr)

    # Initialize TFPolicyGraph
    self.sess = tf.get_default_session()
    self.loss_inputs = [
        ("obs", self.obs_t),
        ("actions", self.act_t),
        ("rewards", self.rew_t),
        ("new_obs", self.obs_tp1),
        ("dones", self.done_mask),
        ("weights", self.importance_weights),
    ]
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.cur_observations,
        action_sampler=self.output_actions,
        action_prob=self.action_prob,
        loss=self.loss.loss,
        model=model,
        loss_inputs=self.loss_inputs,
        update_ops=q_batchnorm_update_ops)
    self.sess.run(tf.global_variables_initializer())
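# A NumPy sketch of the double-Q selection above: the online network picks
# the argmax action at t + 1, while the target network supplies its value.
# The Bellman target itself is assembled in _build_q_loss, which is not shown
# here, so the reward / gamma / done handling below is the standard
# construction rather than a copy of that method.
import numpy as np


def double_q_best_sketch(q_tp1_online, q_tp1_target):
    best_actions = np.argmax(q_tp1_online, axis=1)            # chosen by online net
    return q_tp1_target[np.arange(len(best_actions)), best_actions]


def td_targets_sketch(rewards, done_mask, q_tp1_best, gamma=0.99):
    return rewards + gamma * (1.0 - done_mask) * q_tp1_best   # zero out terminal states


q_online = np.array([[1.0, 2.0], [0.5, 0.1]])
q_target = np.array([[0.8, 1.5], [0.4, 0.2]])
print(td_targets_sketch(rewards=np.array([1.0, 0.0]),
                        done_mask=np.array([0.0, 1.0]),
                        q_tp1_best=double_q_best_sketch(q_online, q_target)))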
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    self.config = config
    self.sess = tf.get_default_session()

    # Setup the policy
    self.observations = tf.placeholder(
        tf.float32, [None] + list(observation_space.shape))
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.model = ModelCatalog.get_model(
        self.observations, logit_dim, self.config["model"])
    action_dist = dist_class(self.model.outputs)
    self.vf = tf.reshape(
        linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
        [-1])
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)
    is_training = tf.placeholder_with_default(True, ())

    # Setup the policy loss
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        actions = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(action_space))
    advantages = tf.placeholder(tf.float32, [None], name="advantages")
    v_target = tf.placeholder(tf.float32, [None], name="v_target")
    self.loss = A3CLoss(action_dist, actions, advantages, v_target, self.vf,
                        self.config["vf_loss_coeff"],
                        self.config["entropy_coeff"])

    # Initialize TFPolicyGraph
    loss_in = [
        ("obs", self.observations),
        ("actions", actions),
        ("advantages", advantages),
        ("value_targets", v_target),
    ]
    for i, ph in enumerate(self.model.state_in):
        loss_in.append(("state_in_{}".format(i), ph))
    self.state_in = self.model.state_in
    self.state_out = self.model.state_out
    TFPolicyGraph.__init__(
        self, observation_space, action_space, self.sess,
        obs_input=self.observations,
        action_sampler=action_dist.sample(),
        loss=self.loss.total_loss,
        loss_inputs=loss_in,
        is_training=is_training,
        state_inputs=self.state_in,
        state_outputs=self.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    if self.config.get("summarize"):
        bs = tf.to_float(tf.shape(self.observations)[0])
        tf.summary.scalar("model/policy_loss", self.loss.pi_loss / bs)
        tf.summary.scalar("model/value_loss", self.loss.vf_loss / bs)
        tf.summary.scalar("model/entropy", self.loss.entropy / bs)
        tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
        tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
        self.summary_op = tf.summary.merge_all()

    self.sess.run(tf.global_variables_initializer())
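# A small pure-Python sketch of the naming convention used above when RNN
# state placeholders are appended to loss_in: each state tensor is keyed as
# "state_in_{i}" so that postprocessed sample batches can be matched to the
# right placeholder. The string values below are stand-ins for tf tensors.
state_in = ["h_placeholder", "c_placeholder"]    # stand-ins for LSTM state tensors
loss_in_sketch = [("obs", "obs_placeholder"), ("actions", "actions_placeholder")]
for i, ph in enumerate(state_in):
    loss_in_sketch.append(("state_in_{}".format(i), ph))
print(loss_in_sketch)
# [('obs', ...), ('actions', ...), ('state_in_0', 'h_placeholder'),
#  ('state_in_1', 'c_placeholder')]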