def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
    self.config = config
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    # Action inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards_ph = tf.placeholder(tf.float32, [None], name="prev_reward")

    with tf.variable_scope(POLICY_SCOPE) as scope:
        self.model = ModelCatalog.get_model(
            {
                "obs": self.obs_t,
                "prev_actions": prev_actions_ph,
                "prev_rewards": prev_rewards_ph,
                "is_training": self._get_is_training_placeholder(),
            }, observation_space, action_space, logit_dim,
            self.config["model"])
        logits = self.model.outputs
        self.p_func_vars = scope_vars(scope.name)

    # Action outputs
    action_dist = dist_cls(logits)
    self.output_actions = action_dist.sample()

    # Training inputs
    self.act_t = tf.placeholder(tf.int32, [None], name="action")
    self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")

    # v network evaluation
    with tf.variable_scope(VALUE_SCOPE) as scope:
        state_values = self.model.value_function()
        self.v_func_vars = scope_vars(scope.name)
    self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
    self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t,
                                          logits, self.act_t, action_space)

    # which kind of objective to optimize
    objective = (
        self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss)
    self.explained_variance = tf.reduce_mean(
        explained_variance(self.cum_rew_t, state_values))

    # initialize TFPolicy
    self.sess = tf.get_default_session()
    self.loss_inputs = [
        (SampleBatch.CUR_OBS, self.obs_t),
        (SampleBatch.ACTIONS, self.act_t),
        (Postprocessing.ADVANTAGES, self.cum_rew_t),
    ]
    TFPolicy.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.obs_t,
        action_sampler=self.output_actions,
        action_prob=action_dist.sampled_action_prob(),
        loss=objective,
        model=self.model,
        loss_inputs=self.loss_inputs,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph)
    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        "total_loss": objective,
        "vf_explained_var": self.explained_variance,
        "policy_loss": self.p_loss.loss,
        "vf_loss": self.v_loss.loss,
    }

def __init__(self, obs_space, act_space, config):
    # _____ Initial Configuration
    config = dict(ray.rllib.agents.maddpg.DEFAULT_CONFIG, **config)
    self.config = config
    self.global_step = tf1.train.get_or_create_global_step()

    # FIXME: Get done from info is required since agentwise done is not
    #  supported now.
    self.get_done_from_info = np.vectorize(
        lambda info: info.get("done", False))

    agent_id = config["agent_id"]
    if agent_id is None:
        raise ValueError("Must set `agent_id` in the policy config.")
    if type(agent_id) is not int:
        raise ValueError("Agent ids must be integers for MADDPG.")

    # _____ Environment Setting
    def _make_continuous_space(space):
        if isinstance(space, Box):
            return space
        elif isinstance(space, Discrete):
            return Box(low=np.zeros((space.n, )), high=np.ones((space.n, )))
        else:
            raise UnsupportedSpaceException(
                "Space {} is not supported.".format(space))

    obs_space_n = [
        _make_continuous_space(space)
        for _, (_, space, _, _) in config["multiagent"]["policies"].items()
    ]
    act_space_n = [
        _make_continuous_space(space)
        for _, (_, _, space, _) in config["multiagent"]["policies"].items()
    ]

    # _____ Placeholders
    # Placeholders for policy evaluation and updates
    def _make_ph_n(space_n, name=""):
        return [
            tf1.placeholder(
                tf.float32,
                shape=(None, ) + space.shape,
                name=name + "_%d" % i) for i, space in enumerate(space_n)
        ]

    obs_ph_n = _make_ph_n(obs_space_n, SampleBatch.OBS)
    act_ph_n = _make_ph_n(act_space_n, SampleBatch.ACTIONS)
    new_obs_ph_n = _make_ph_n(obs_space_n, SampleBatch.NEXT_OBS)
    new_act_ph_n = _make_ph_n(act_space_n, "new_actions")
    rew_ph = tf1.placeholder(
        tf.float32, shape=None, name="rewards_{}".format(agent_id))
    done_ph = tf1.placeholder(
        tf.float32, shape=None, name="dones_{}".format(agent_id))

    if config["use_local_critic"]:
        obs_space_n, act_space_n = \
            [obs_space_n[agent_id]], [act_space_n[agent_id]]
        obs_ph_n, act_ph_n = [obs_ph_n[agent_id]], [act_ph_n[agent_id]]
        new_obs_ph_n, new_act_ph_n = \
            [new_obs_ph_n[agent_id]], [new_act_ph_n[agent_id]]
        agent_id = 0

    # _____ Value Network
    # Build critic network for t.
    critic, _, critic_model_n, critic_vars = self._build_critic_network(
        obs_ph_n,
        act_ph_n,
        obs_space_n,
        act_space_n,
        config["use_state_preprocessor"],
        config["critic_hiddens"],
        getattr(tf.nn, config["critic_hidden_activation"]),
        scope="critic",
    )

    # Build critic network for t + 1.
    target_critic, _, _, target_critic_vars = self._build_critic_network(
        new_obs_ph_n,
        new_act_ph_n,
        obs_space_n,
        act_space_n,
        config["use_state_preprocessor"],
        config["critic_hiddens"],
        getattr(tf.nn, config["critic_hidden_activation"]),
        scope="target_critic",
    )

    # Build critic loss.
    td_error = tf.subtract(
        tf.stop_gradient(
            rew_ph + (1.0 - done_ph) *
            (config["gamma"]**config["n_step"]) * target_critic[:, 0]),
        critic[:, 0],
    )
    critic_loss = tf.reduce_mean(td_error**2)

    # _____ Policy Network
    # Build actor network for t.
    act_sampler, actor_feature, actor_model, actor_vars = \
        self._build_actor_network(
            obs_ph_n[agent_id],
            obs_space_n[agent_id],
            act_space_n[agent_id],
            config["use_state_preprocessor"],
            config["actor_hiddens"],
            getattr(tf.nn, config["actor_hidden_activation"]),
            scope="actor",
        )

    # Build actor network for t + 1.
    self.new_obs_ph = new_obs_ph_n[agent_id]
    self.target_act_sampler, _, _, target_actor_vars = \
        self._build_actor_network(
            self.new_obs_ph,
            obs_space_n[agent_id],
            act_space_n[agent_id],
            config["use_state_preprocessor"],
            config["actor_hiddens"],
            getattr(tf.nn, config["actor_hidden_activation"]),
            scope="target_actor",
        )

    # Build actor loss.
    act_n = act_ph_n.copy()
    act_n[agent_id] = act_sampler
    critic, _, _, _ = self._build_critic_network(
        obs_ph_n,
        act_n,
        obs_space_n,
        act_space_n,
        config["use_state_preprocessor"],
        config["critic_hiddens"],
        getattr(tf.nn, config["critic_hidden_activation"]),
        scope="critic",
    )
    actor_loss = -tf.reduce_mean(critic)
    if config["actor_feature_reg"] is not None:
        actor_loss += config["actor_feature_reg"] * tf.reduce_mean(
            actor_feature**2)

    # _____ Losses
    self.losses = {"critic": critic_loss, "actor": actor_loss}

    # _____ Optimizers
    self.optimizers = {
        "critic": tf1.train.AdamOptimizer(config["critic_lr"]),
        "actor": tf1.train.AdamOptimizer(config["actor_lr"]),
    }

    # _____ Build variable update ops.
    self.tau = tf1.placeholder_with_default(
        config["tau"], shape=(), name="tau")

    def _make_target_update_op(vs, target_vs, tau):
        return [
            target_v.assign(tau * v + (1.0 - tau) * target_v)
            for v, target_v in zip(vs, target_vs)
        ]

    self.update_target_vars = _make_target_update_op(
        critic_vars + actor_vars, target_critic_vars + target_actor_vars,
        self.tau)

    def _make_set_weight_op(variables):
        vs = list()
        for v in variables.values():
            vs += v
        phs = [
            tf1.placeholder(
                tf.float32,
                shape=v.get_shape(),
                name=v.name.split(":")[0] + "_ph") for v in vs
        ]
        return tf.group(*[v.assign(ph) for v, ph in zip(vs, phs)]), phs

    self.vars = {
        "critic": critic_vars,
        "actor": actor_vars,
        "target_critic": target_critic_vars,
        "target_actor": target_actor_vars,
    }
    self.update_vars, self.vars_ph = _make_set_weight_op(self.vars)

    # _____ TensorFlow Initialization
    sess = tf1.get_default_session()
    assert sess

    def _make_loss_inputs(placeholders):
        return [(ph.name.split("/")[-1].split(":")[0], ph)
                for ph in placeholders]

    loss_inputs = _make_loss_inputs(obs_ph_n + act_ph_n + new_obs_ph_n +
                                    new_act_ph_n + [rew_ph, done_ph])

    TFPolicy.__init__(
        self,
        obs_space,
        act_space,
        config=config,
        sess=sess,
        obs_input=obs_ph_n[agent_id],
        sampled_action=act_sampler,
        loss=actor_loss + critic_loss,
        loss_inputs=loss_inputs,
        dist_inputs=actor_feature,
    )

    del self.view_requirements["prev_actions"]
    del self.view_requirements["prev_rewards"]

    self.get_session().run(tf1.global_variables_initializer())

    # Hard initial update
    self.update_target(1.0)

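# The critic built above is centralized: it conditions on every agent's
# observations and actions. Below is a minimal sketch of just the joint
# input construction (an illustration, not RLlib's `_build_critic_network`,
# which also applies state preprocessing and the hidden layers configured
# via `critic_hiddens`). It assumes flat (rank-1) observation/action spaces.
import tensorflow as tf


def _sketch_joint_critic_input(obs_ph_n, act_ph_n):
    # Concatenate every agent's observation and action placeholders along
    # the feature axis, producing one joint critic input row per transition.
    return tf.concat(obs_ph_n + act_ph_n, axis=1)
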
def __init__(self,
             obs_space,
             action_space,
             config,
             loss_fn,
             stats_fn=None,
             grad_stats_fn=None,
             before_loss_init=None,
             make_model=None,
             action_sampler_fn=None,
             existing_inputs=None,
             existing_model=None,
             get_batch_divisibility_req=None,
             obs_include_prev_action_reward=True):
    """Initialize a dynamic TF policy.

    Arguments:
        obs_space (gym.Space): Observation space of the policy.
        action_space (gym.Space): Action space of the policy.
        config (dict): Policy-specific configuration data.
        loss_fn (func): Function that returns a loss tensor given the
            policy graph and a dict of experience tensor placeholders.
        stats_fn (func): Optional function that returns a dict of TF
            fetches given the policy and batch input tensors.
        grad_stats_fn (func): Optional function that returns a dict of TF
            fetches given the policy and loss gradient tensors.
        before_loss_init (func): Optional function to run prior to loss
            init that takes the same arguments as __init__.
        make_model (func): Optional function that returns a ModelV2 object
            given (policy, obs_space, action_space, config). All policy
            variables should be created in this function. If not
            specified, a default model will be created.
        action_sampler_fn (func): Optional function that returns a tuple
            of action and action prob tensors given
            (policy, model, input_dict, obs_space, action_space, config).
            If not specified, a default action distribution will be used.
        existing_inputs (OrderedDict): When copying a policy, this
            specifies an existing dict of placeholders to use instead of
            defining new ones.
        existing_model (ModelV2): When copying a policy, this specifies an
            existing model to clone and share weights with.
        get_batch_divisibility_req (func): Optional function that returns
            the divisibility requirement for sample batches.
        obs_include_prev_action_reward (bool): Whether to include the
            previous action and reward in the model input.

    Attributes:
        config: config of the policy
        model: model instance, if any
        model_out: output tensors of the model
        action_dist: action distribution of the model, if any
        state_in: state input tensors, if any
        state_out: state output tensors, if any
        seq_lens: tensor of sequence lengths
    """
    self.config = config
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn
    self._obs_include_prev_action_reward = obs_include_prev_action_reward

    # Setup standard placeholders
    prev_actions = None
    prev_rewards = None
    if existing_inputs is not None:
        obs = existing_inputs[SampleBatch.CUR_OBS]
        if self._obs_include_prev_action_reward:
            prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
            prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
    else:
        obs = tf.placeholder(
            tf.float32,
            shape=[None] + list(obs_space.shape),
            name="observation")
        if self._obs_include_prev_action_reward:
            prev_actions = ModelCatalog.get_action_placeholder(
                action_space)
            prev_rewards = tf.placeholder(
                tf.float32, [None], name="prev_reward")

    self.input_dict = {
        SampleBatch.CUR_OBS: obs,
        SampleBatch.PREV_ACTIONS: prev_actions,
        SampleBatch.PREV_REWARDS: prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }
    self.seq_lens = tf.placeholder(
        dtype=tf.int32, shape=[None], name="seq_lens")

    # Setup model
    if action_sampler_fn:
        if not make_model:
            raise ValueError(
                "make_model is required if action_sampler_fn is given")
        self.dist_class = None
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if existing_model:
        self.model = existing_model
    elif make_model:
        self.model = make_model(self, obs_space, action_space, config)
    else:
        self.model = ModelCatalog.get_model_v2(
            obs_space,
            action_space,
            logit_dim,
            self.config["model"],
            framework="tf")

    if existing_inputs:
        self.state_in = [
            v for k, v in existing_inputs.items()
            if k.startswith("state_in_")
        ]
        if self.state_in:
            self.seq_lens = existing_inputs["seq_lens"]
    else:
        self.state_in = [
            tf.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
            for s in self.model.get_initial_state()
        ]

    self.model_out, self.state_out = self.model(
        self.input_dict, self.state_in, self.seq_lens)

    # Setup action sampler
    if action_sampler_fn:
        self.action_dist = None
        action_sampler, action_prob = action_sampler_fn(
            self, self.model, self.input_dict, obs_space, action_space,
            config)
    else:
        self.action_dist = self.dist_class(self.model_out)
        action_sampler = self.action_dist.sample()
        action_prob = self.action_dist.sampled_action_prob()

    # Phase 1 init
    sess = tf.get_default_session() or tf.Session()
    if get_batch_divisibility_req:
        batch_divisibility_req = get_batch_divisibility_req(self)
    else:
        batch_divisibility_req = 1

    TFPolicy.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_sampler,
        action_prob=action_prob,
        loss=None,  # dynamically initialized on run
        loss_inputs=[],
        model=self.model,
        state_inputs=self.state_in,
        state_outputs=self.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.seq_lens,
        max_seq_len=config["model"]["max_seq_len"],
        batch_divisibility_req=batch_divisibility_req)

    # Phase 2 init
    before_loss_init(self, obs_space, action_space, config)

    if not existing_inputs:
        self._initialize_loss()

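# A minimal sketch of the `loss_fn` / `stats_fn` callables this constructor
# expects, using a plain policy-gradient objective for illustration. The
# batch keys (`SampleBatch`, `Postprocessing`) and `tf` are assumed to be the
# same module-level names used above; the function names are illustrative,
# not RLlib's shipped PG implementation.
def example_pg_loss(policy, batch_tensors):
    # -E[log pi(a|s) * advantage], built from the action distribution the
    # policy constructed out of its model outputs.
    actions = batch_tensors[SampleBatch.ACTIONS]
    advantages = batch_tensors[Postprocessing.ADVANTAGES]
    return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages)


def example_stats(policy, batch_tensors):
    # Extra TF fetches reported alongside the loss.
    return {
        "advantage_mean": tf.reduce_mean(
            batch_tensors[Postprocessing.ADVANTAGES]),
    }
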
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config)
    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(
                action_space))
    if len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    self.config = config
    self.cur_noise_scale = 1.0
    self.cur_pure_exploration_phase = False
    self.dim_actions = action_space.shape[0]
    self.low_action = action_space.low
    self.high_action = action_space.high

    # create global step for counting the number of update operations
    self.global_step = tf.train.get_or_create_global_step()

    # use separate optimizers for actor & critic
    self._actor_optimizer = tf.train.AdamOptimizer(
        learning_rate=self.config["actor_lr"])
    self._critic_optimizer = tf.train.AdamOptimizer(
        learning_rate=self.config["critic_lr"])

    # Action inputs
    self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
    self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale")
    self.pure_exploration_phase = tf.placeholder(
        tf.bool, (), name="pure_exploration_phase")
    self.cur_observations = tf.placeholder(
        tf.float32,
        shape=(None, ) + observation_space.shape,
        name="cur_obs")

    with tf.variable_scope(POLICY_SCOPE) as scope:
        policy_out, self.policy_model = self._build_policy_network(
            self.cur_observations, observation_space, action_space)
        self.policy_vars = scope_vars(scope.name)

    # Noise vars for P network except for layer normalization vars
    if self.config["parameter_noise"]:
        self._build_parameter_noise([
            var for var in self.policy_vars if "LayerNorm" not in var.name
        ])

    # Action outputs
    with tf.variable_scope(ACTION_SCOPE):
        self.output_actions = self._add_exploration_noise(
            policy_out, self.stochastic, self.noise_scale,
            self.pure_exploration_phase, action_space)

    if self.config["smooth_target_policy"]:
        self.reset_noise_op = tf.no_op()
    else:
        with tf.variable_scope(ACTION_SCOPE, reuse=True):
            exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
            self.reset_noise_op = tf.assign(exploration_sample,
                                            self.dim_actions * [.0])

    # Replay inputs
    self.obs_t = tf.placeholder(
        tf.float32,
        shape=(None, ) + observation_space.shape,
        name="observation")
    self.act_t = tf.placeholder(
        tf.float32, shape=(None, ) + action_space.shape, name="action")
    self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
    self.obs_tp1 = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    self.done_mask = tf.placeholder(tf.float32, [None], name="done")
    self.importance_weights = tf.placeholder(
        tf.float32, [None], name="weight")

    # policy network evaluation
    with tf.variable_scope(POLICY_SCOPE, reuse=True) as scope:
        prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
        self.policy_t, _ = self._build_policy_network(
            self.obs_t, observation_space, action_space)
        policy_batchnorm_update_ops = list(
            set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) -
            prev_update_ops)

    # target policy network evaluation
    with tf.variable_scope(POLICY_TARGET_SCOPE) as scope:
        policy_tp1, _ = self._build_policy_network(
            self.obs_tp1, observation_space, action_space)
        target_policy_vars = scope_vars(scope.name)

    # Action outputs
    with tf.variable_scope(ACTION_SCOPE, reuse=True):
        if config["smooth_target_policy"]:
            target_noise_clip = self.config["target_noise_clip"]
            clipped_normal_sample = tf.clip_by_value(
                tf.random_normal(
                    tf.shape(policy_tp1),
                    stddev=self.config["target_noise"]),
                -target_noise_clip, target_noise_clip)
            policy_tp1_smoothed = tf.clip_by_value(
                policy_tp1 + clipped_normal_sample,
                action_space.low * tf.ones_like(policy_tp1),
                action_space.high * tf.ones_like(policy_tp1))
        else:
            # no smoothing, just use deterministic actions
            policy_tp1_smoothed = policy_tp1

    # q network evaluation
    prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
    with tf.variable_scope(Q_SCOPE) as scope:
        # Q-values for given actions & observations in the given current
        # state
        q_t, self.q_model = self._build_q_network(
            self.obs_t, observation_space, action_space, self.act_t)
        self.q_func_vars = scope_vars(scope.name)
    self.stats = {
        "mean_q": tf.reduce_mean(q_t),
        "max_q": tf.reduce_max(q_t),
        "min_q": tf.reduce_min(q_t),
    }
    with tf.variable_scope(Q_SCOPE, reuse=True):
        # Q-values for current policy (no noise) in given current state
        q_t_det_policy, _ = self._build_q_network(
            self.obs_t, observation_space, action_space, self.policy_t)
    if self.config["twin_q"]:
        with tf.variable_scope(TWIN_Q_SCOPE) as scope:
            twin_q_t, self.twin_q_model = self._build_q_network(
                self.obs_t, observation_space, action_space, self.act_t)
            self.twin_q_func_vars = scope_vars(scope.name)
    q_batchnorm_update_ops = list(
        set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)

    # target q network evaluation
    with tf.variable_scope(Q_TARGET_SCOPE) as scope:
        q_tp1, _ = self._build_q_network(
            self.obs_tp1, observation_space, action_space,
            policy_tp1_smoothed)
        target_q_func_vars = scope_vars(scope.name)
    if self.config["twin_q"]:
        with tf.variable_scope(TWIN_Q_TARGET_SCOPE) as scope:
            twin_q_tp1, _ = self._build_q_network(
                self.obs_tp1, observation_space, action_space,
                policy_tp1_smoothed)
            twin_target_q_func_vars = scope_vars(scope.name)

    if self.config["twin_q"]:
        self.critic_loss, self.actor_loss, self.td_error = \
            self._build_actor_critic_loss(
                q_t,
                q_tp1,
                q_t_det_policy,
                twin_q_t=twin_q_t,
                twin_q_tp1=twin_q_tp1)
    else:
        self.critic_loss, self.actor_loss, self.td_error = \
            self._build_actor_critic_loss(q_t, q_tp1, q_t_det_policy)

    if config["l2_reg"] is not None:
        for var in self.policy_vars:
            if "bias" not in var.name:
                self.actor_loss += (
                    config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
        for var in self.q_func_vars:
            if "bias" not in var.name:
                self.critic_loss += (
                    config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
        if self.config["twin_q"]:
            for var in self.twin_q_func_vars:
                if "bias" not in var.name:
                    self.critic_loss += (
                        config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))

    # update_target_fn will be called periodically to copy Q network to
    # target Q network
    self.tau_value = config.get("tau")
    self.tau = tf.placeholder(tf.float32, (), name="tau")
    update_target_expr = []
    for var, var_target in zip(
            sorted(self.q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_expr.append(
            var_target.assign(self.tau * var +
                              (1.0 - self.tau) * var_target))
    if self.config["twin_q"]:
        for var, var_target in zip(
                sorted(self.twin_q_func_vars, key=lambda v: v.name),
                sorted(twin_target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
    for var, var_target in zip(
            sorted(self.policy_vars, key=lambda v: v.name),
            sorted(target_policy_vars, key=lambda v: v.name)):
        update_target_expr.append(
            var_target.assign(self.tau * var +
                              (1.0 - self.tau) * var_target))
    self.update_target_expr = tf.group(*update_target_expr)

    self.sess = tf.get_default_session()
    self.loss_inputs = [
        (SampleBatch.CUR_OBS, self.obs_t),
        (SampleBatch.ACTIONS, self.act_t),
        (SampleBatch.REWARDS, self.rew_t),
        (SampleBatch.NEXT_OBS, self.obs_tp1),
        (SampleBatch.DONES, self.done_mask),
        (PRIO_WEIGHTS, self.importance_weights),
    ]
    input_dict = dict(self.loss_inputs)

    if self.config["use_state_preprocessor"]:
        # Model self-supervised losses
        self.actor_loss = self.policy_model.custom_loss(
            self.actor_loss, input_dict)
        self.critic_loss = self.q_model.custom_loss(
            self.critic_loss, input_dict)
        if self.config["twin_q"]:
            self.critic_loss = self.twin_q_model.custom_loss(
                self.critic_loss, input_dict)

    TFPolicy.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.cur_observations,
        action_sampler=self.output_actions,
        loss=self.actor_loss + self.critic_loss,
        loss_inputs=self.loss_inputs,
        update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops)
    self.sess.run(tf.global_variables_initializer())

    # Note that this encompasses both the policy and Q-value networks and
    # their corresponding target networks
    self.variables = ray.experimental.tf_utils.TensorFlowVariables(
        tf.group(q_t_det_policy, q_tp1), self.sess)

    # Hard initial update
    self.update_target(tau=1.0)

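# The `update_target_expr` op above implements Polyak averaging,
#     target <- tau * online + (1 - tau) * target,
# and the final `self.update_target(tau=1.0)` call performs a hard copy.
# A small NumPy illustration of the same rule (toy numbers, not RLlib code):
import numpy as np


def polyak_update(online, target, tau):
    # Soft target-network update; tau=1.0 reduces to a full copy.
    return tau * np.asarray(online) + (1.0 - tau) * np.asarray(target)


# polyak_update([1.0, 2.0], [0.0, 0.0], tau=0.002) -> [0.002, 0.004]
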
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(
                action_space))

    self.config = config
    self.cur_epsilon = 1.0
    self.num_actions = action_space.n

    # Action inputs
    self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
    self.eps = tf.placeholder(tf.float32, (), name="eps")
    self.cur_observations = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)

    # Action Q network
    with tf.variable_scope(Q_SCOPE) as scope:
        q_values, q_logits, q_dist, _ = self._build_q_network(
            self.cur_observations, observation_space, action_space)
        self.q_values = q_values
        self.q_func_vars = _scope_vars(scope.name)

    # Noise vars for Q network except for layer normalization vars
    if self.config["parameter_noise"]:
        self._build_parameter_noise([
            var for var in self.q_func_vars if "LayerNorm" not in var.name
        ])
        self.action_probs = tf.nn.softmax(self.q_values)

    # Action outputs
    self.output_actions, self.action_prob = self._build_q_value_policy(
        q_values)

    # Replay inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    self.act_t = tf.placeholder(tf.int32, [None], name="action")
    self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
    self.obs_tp1 = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    self.done_mask = tf.placeholder(tf.float32, [None], name="done")
    self.importance_weights = tf.placeholder(
        tf.float32, [None], name="weight")

    # q network evaluation
    with tf.variable_scope(Q_SCOPE, reuse=True):
        prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
        q_t, q_logits_t, q_dist_t, model = self._build_q_network(
            self.obs_t, observation_space, action_space)
        q_batchnorm_update_ops = list(
            set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) -
            prev_update_ops)

    # target q network evaluation
    with tf.variable_scope(Q_TARGET_SCOPE) as scope:
        q_tp1, q_logits_tp1, q_dist_tp1, _ = self._build_q_network(
            self.obs_tp1, observation_space, action_space)
        self.target_q_func_vars = _scope_vars(scope.name)

    # q scores for actions which we know were selected in the given state.
    one_hot_selection = tf.one_hot(self.act_t, self.num_actions)
    q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1)
    q_logits_t_selected = tf.reduce_sum(
        q_logits_t * tf.expand_dims(one_hot_selection, -1), 1)

    # compute estimate of best possible value starting from state at t + 1
    if config["double_q"]:
        with tf.variable_scope(Q_SCOPE, reuse=True):
            q_tp1_using_online_net, q_logits_tp1_using_online_net, \
                q_dist_tp1_using_online_net, _ = self._build_q_network(
                    self.obs_tp1, observation_space, action_space)
        q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
        q_tp1_best_one_hot_selection = tf.one_hot(
            q_tp1_best_using_online_net, self.num_actions)
        q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
        q_dist_tp1_best = tf.reduce_sum(
            q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1),
            1)
    else:
        q_tp1_best_one_hot_selection = tf.one_hot(
            tf.argmax(q_tp1, 1), self.num_actions)
        q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
        q_dist_tp1_best = tf.reduce_sum(
            q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1),
            1)

    self.loss = self._build_q_loss(q_t_selected, q_logits_t_selected,
                                   q_tp1_best, q_dist_tp1_best)

    # update_target_fn will be called periodically to copy Q network to
    # target Q network
    update_target_expr = []
    assert len(self.q_func_vars) == len(self.target_q_func_vars), \
        (self.q_func_vars, self.target_q_func_vars)
    for var, var_target in zip(self.q_func_vars, self.target_q_func_vars):
        update_target_expr.append(var_target.assign(var))
    self.update_target_expr = tf.group(*update_target_expr)

    # initialize TFPolicy
    self.sess = tf.get_default_session()
    self.loss_inputs = [
        (SampleBatch.CUR_OBS, self.obs_t),
        (SampleBatch.ACTIONS, self.act_t),
        (SampleBatch.REWARDS, self.rew_t),
        (SampleBatch.NEXT_OBS, self.obs_tp1),
        (SampleBatch.DONES, self.done_mask),
        (PRIO_WEIGHTS, self.importance_weights),
    ]

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicy.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.cur_observations,
        action_sampler=self.output_actions,
        action_prob=self.action_prob,
        loss=self.loss.loss,
        model=model,
        loss_inputs=self.loss_inputs,
        update_ops=q_batchnorm_update_ops)
    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = dict({
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
    }, **self.loss.stats)

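# The double-Q branch above picks the argmax action with the online network
# but evaluates it with the target network; the vanilla branch simply takes
# the target network's own maximum. A toy NumPy illustration of the two
# bootstrap targets for a single transition (illustrative values only):
import numpy as np

q_tp1_online = np.array([1.0, 3.0, 2.0])  # online net Q(s', a) estimates
q_tp1_target = np.array([0.5, 1.5, 4.0])  # target net Q(s', a) estimates

vanilla_target = q_tp1_target.max()                    # 4.0
double_q_target = q_tp1_target[q_tp1_online.argmax()]  # 1.5 (action 1)
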
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()
    self.grads = None

    if isinstance(action_space, gym.spaces.Discrete):
        is_multidiscrete = False
        output_hidden_shape = [action_space.n]
    elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
        is_multidiscrete = True
        output_hidden_shape = action_space.nvec.astype(np.int32)
    else:
        is_multidiscrete = False
        output_hidden_shape = 1

    # Create input placeholders
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    if existing_inputs:
        actions, dones, behaviour_logits, rewards, observations, \
            prev_actions, prev_rewards = existing_inputs[:7]
        existing_state_in = existing_inputs[7:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        actions = ModelCatalog.get_action_placeholder(action_space)
        dones = tf.placeholder(tf.bool, [None], name="dones")
        rewards = tf.placeholder(tf.float32, [None], name="rewards")
        behaviour_logits = tf.placeholder(
            tf.float32, [None, logit_dim], name="behaviour_logits")
        observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        existing_state_in = None
        existing_seq_lens = None

    # Unpack behaviour logits
    unpacked_behaviour_logits = tf.split(
        behaviour_logits, output_hidden_shape, axis=1)

    # Setup the policy
    prev_actions = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model(
        {
            "obs": observations,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)
    unpacked_outputs = tf.split(
        self.model.outputs, output_hidden_shape, axis=1)

    dist_inputs = unpacked_outputs if is_multidiscrete else \
        self.model.outputs
    action_dist = dist_class(dist_inputs)

    values = self.model.value_function()
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    def make_time_major(tensor, drop_last=False):
        """Swaps batch and trajectory axis.

        Args:
            tensor: A tensor or list of tensors to reshape.
            drop_last: A bool indicating whether to drop the last
                trajectory item.

        Returns:
            res: A tensor with swapped axes or a list of tensors with
                swapped axes.
        """
        if isinstance(tensor, list):
            return [make_time_major(t, drop_last) for t in tensor]

        if self.model.state_init:
            B = tf.shape(self.model.seq_lens)[0]
            T = tf.shape(tensor)[0] // B
        else:
            # Important: chop the tensor into batches at known episode cut
            # boundaries. TODO(ekl) this is kind of a hack
            T = self.config["sample_batch_size"]
            B = tf.shape(tensor)[0] // T
        rs = tf.reshape(tensor,
                        tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))

        # swap B and T axes
        res = tf.transpose(
            rs,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

        if drop_last:
            return res[:-1]
        return res

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens) - 1
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(rewards, dtype=tf.bool)

    # Prepare actions for loss
    loss_actions = actions if is_multidiscrete else tf.expand_dims(
        actions, axis=1)

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    self.loss = VTraceLoss(
        actions=make_time_major(loss_actions, drop_last=True),
        actions_logp=make_time_major(
            action_dist.logp(actions), drop_last=True),
        actions_entropy=make_time_major(
            action_dist.entropy(), drop_last=True),
        dones=make_time_major(dones, drop_last=True),
        behaviour_logits=make_time_major(
            unpacked_behaviour_logits, drop_last=True),
        target_logits=make_time_major(unpacked_outputs, drop_last=True),
        discount=config["gamma"],
        rewards=make_time_major(rewards, drop_last=True),
        values=make_time_major(values, drop_last=True),
        bootstrap_value=make_time_major(values)[-1],
        dist_class=dist_class,
        valid_mask=make_time_major(mask, drop_last=True),
        vf_loss_coeff=self.config["vf_loss_coeff"],
        entropy_coeff=self.config["entropy_coeff"],
        clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
        clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"])

    # KL divergence between worker and learner logits for debugging
    model_dist = MultiCategorical(unpacked_outputs)
    behaviour_dist = MultiCategorical(unpacked_behaviour_logits)

    kls = model_dist.kl(behaviour_dist)
    if len(kls) > 1:
        self.KL_stats = {}
        for i, kl in enumerate(kls):
            self.KL_stats.update({
                "mean_KL_{}".format(i): tf.reduce_mean(kl),
                "max_KL_{}".format(i): tf.reduce_max(kl),
            })
    else:
        self.KL_stats = {
            "mean_KL": tf.reduce_mean(kls[0]),
            "max_KL": tf.reduce_max(kls[0]),
        }

    # Initialize TFPolicy
    loss_in = [
        (SampleBatch.ACTIONS, actions),
        (SampleBatch.DONES, dones),
        (BEHAVIOUR_LOGITS, behaviour_logits),
        (SampleBatch.REWARDS, rewards),
        (SampleBatch.CUR_OBS, observations),
        (SampleBatch.PREV_ACTIONS, prev_actions),
        (SampleBatch.PREV_REWARDS, prev_rewards),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicy.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=observations,
        action_sampler=action_dist.sample(),
        action_prob=action_dist.sampled_action_prob(),
        loss=self.loss.total_loss,
        model=self.model,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"],
        batch_divisibility_req=self.config["sample_batch_size"])

    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        LEARNER_STATS_KEY: dict(
            {
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "policy_loss": self.loss.pi_loss,
                "entropy": self.loss.entropy,
                "grad_gnorm": tf.global_norm(self._grads),
                "var_gnorm": tf.global_norm(self.var_list),
                "vf_loss": self.loss.vf_loss,
                "vf_explained_var": explained_variance(
                    tf.reshape(self.loss.vtrace_returns.vs, [-1]),
                    tf.reshape(
                        make_time_major(values, drop_last=True), [-1])),
            },
            **self.KL_stats),
    }

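# `make_time_major` above turns flat [B * T, ...] rollout batches into
# time-major [T, B, ...] tensors, optionally dropping the final step so the
# remaining entries line up with their t+1 values for the V-trace targets.
# A NumPy sketch of the same transform, assuming fixed-length rollouts:
import numpy as np


def to_time_major(flat, B, T, drop_last=False):
    # [B * T, ...] -> [B, T, ...] -> [T, B, ...]
    res = flat.reshape((B, T) + flat.shape[1:]).swapaxes(0, 1)
    return res[:-1] if drop_last else res


# Two rollouts of length three: rows 0-2 are rollout 0, rows 3-5 rollout 1.
flat = np.arange(6).reshape(6, 1)
assert to_time_major(flat, B=2, T=3).shape == (3, 2, 1)
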
def __init__(self,
             obs_space,
             action_space,
             config,
             loss_fn,
             stats_fn=None,
             update_ops_fn=None,
             grad_stats_fn=None,
             before_loss_init=None,
             make_action_sampler=None,
             existing_inputs=None,
             get_batch_divisibility_req=None,
             obs_include_prev_action_reward=True):
    """Initialize a dynamic TF policy.

    Arguments:
        obs_space (gym.Space): Observation space of the policy.
        action_space (gym.Space): Action space of the policy.
        config (dict): Policy-specific configuration data.
        loss_fn (func): Function that returns a loss tensor given the
            policy graph and a dict of experience tensor placeholders.
        stats_fn (func): Optional function that returns a dict of TF
            fetches given the policy and batch input tensors.
        grad_stats_fn (func): Optional function that returns a dict of TF
            fetches given the policy and loss gradient tensors.
        update_ops_fn (func): Optional function that returns a list
            overriding the update ops to run when applying gradients.
        before_loss_init (func): Optional function to run prior to loss
            init that takes the same arguments as __init__.
        make_action_sampler (func): Optional function that returns a
            tuple of action and action prob tensors. The function takes
            (policy, input_dict, obs_space, action_space, config) as its
            arguments.
        existing_inputs (OrderedDict): When copying a policy, this
            specifies an existing dict of placeholders to use instead of
            defining new ones.
        get_batch_divisibility_req (func): Optional function that returns
            the divisibility requirement for sample batches.
        obs_include_prev_action_reward (bool): Whether to include the
            previous action and reward in the model input.
    """
    self.config = config
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn
    self._update_ops_fn = update_ops_fn
    self._obs_include_prev_action_reward = obs_include_prev_action_reward

    # Setup standard placeholders
    prev_actions = None
    prev_rewards = None
    if existing_inputs is not None:
        obs = existing_inputs[SampleBatch.CUR_OBS]
        if self._obs_include_prev_action_reward:
            prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
            prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
    else:
        obs = tf.placeholder(
            tf.float32,
            shape=[None] + list(obs_space.shape),
            name="observation")
        if self._obs_include_prev_action_reward:
            prev_actions = ModelCatalog.get_action_placeholder(
                action_space)
            prev_rewards = tf.placeholder(
                tf.float32, [None], name="prev_reward")

    self.input_dict = {
        SampleBatch.CUR_OBS: obs,
        SampleBatch.PREV_ACTIONS: prev_actions,
        SampleBatch.PREV_REWARDS: prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }

    # Create the model network and action outputs
    if make_action_sampler:
        assert not existing_inputs, \
            "Cloning not supported with custom action sampler"
        self.model = None
        self.dist_class = None
        self.action_dist = None
        action_sampler, action_prob = make_action_sampler(
            self, self.input_dict, obs_space, action_space, config)
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        if existing_inputs:
            existing_state_in = [
                v for k, v in existing_inputs.items()
                if k.startswith("state_in_")
            ]
            if existing_state_in:
                existing_seq_lens = existing_inputs["seq_lens"]
            else:
                existing_seq_lens = None
        else:
            existing_state_in = []
            existing_seq_lens = None
        self.model = ModelCatalog.get_model(
            self.input_dict,
            obs_space,
            action_space,
            logit_dim,
            self.config["model"],
            state_in=existing_state_in,
            seq_lens=existing_seq_lens)
        self.action_dist = self.dist_class(self.model.outputs)
        action_sampler = self.action_dist.sample()
        action_prob = self.action_dist.sampled_action_prob()

    # Phase 1 init
    sess = tf.get_default_session() or tf.Session()
    if get_batch_divisibility_req:
        batch_divisibility_req = get_batch_divisibility_req(self)
    else:
        batch_divisibility_req = 1
    TFPolicy.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_sampler,
        action_prob=action_prob,
        loss=None,  # dynamically initialized on run
        loss_inputs=[],
        model=self.model,
        state_inputs=self.model and self.model.state_in,
        state_outputs=self.model and self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model and self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"],
        batch_divisibility_req=batch_divisibility_req)

    # Phase 2 init
    self._needs_eager_conversion = set()
    self._eager_tensors = {}
    before_loss_init(self, obs_space, action_space, config)

    if not existing_inputs:
        self._initialize_loss()

def __init__(self, obs_space, act_space, config):
    # _____ Initial Configuration
    config = dict(DEFAULT_CONFIG, **config)
    self.config = config
    self.global_step = tf1.train.get_or_create_global_step()

    # FIXME: Get done from info is required since agentwise done is not
    #  supported now.
    # self.get_done_from_info = np.vectorize(
    #     lambda info: info.get("done", False))

    agent_id = config["agent_id"]
    if agent_id is None:
        raise ValueError("Must set `agent_id` in the policy config.")

    obs_space_n, act_space_n = [], []
    self.agent_ids = []
    for pid, (_, obs_space, act_space, _) in config["multiagent"][
            "policies"].items():
        # assert isinstance(obs_space, gym.spaces.Box), obs_space
        obs_space_n.append(_make_continuous_space(obs_space))
        act_space_n.append(_make_continuous_space(act_space))
        self.agent_ids.append(pid)
    self.agent_idx = self.agent_ids.index(agent_id)

    # _____ Placeholders
    # Placeholders for policy evaluation and updates
    obs_ph_n = _make_ph_n(obs_space_n, "obs")
    act_ph_n = _make_ph_n(act_space_n, "actions")
    new_obs_ph_n = _make_ph_n(obs_space_n, "new_obs")
    new_act_ph_n = _make_ph_n(act_space_n, "new_actions")
    rew_ph = tf1.placeholder(
        tf.float32, shape=None, name="rewards_{}".format(self.agent_idx)
    )
    done_ph = tf1.placeholder(
        tf.float32, shape=None, name="dones_{}".format(self.agent_idx)
    )

    if config["use_local_critic"]:
        # no global ...
        obs_space_n, act_space_n = (
            [obs_space_n[self.agent_idx]],
            [act_space_n[self.agent_idx]],
        )
        obs_ph_n, act_ph_n = (
            [obs_ph_n[self.agent_idx]],
            [act_ph_n[self.agent_idx]],
        )
        new_obs_ph_n, new_act_ph_n = (
            [new_obs_ph_n[self.agent_idx]],
            [new_act_ph_n[self.agent_idx]],
        )
        self.agent_idx = 0

    # _____ Value Network
    # Build critic network for t.
    critic, _, critic_model_n, critic_vars = self._build_critic_network(
        obs_ph_n,
        act_ph_n,
        obs_space_n,
        act_space_n,
        config["use_state_preprocessor"],
        config["critic_hiddens"],
        getattr(tf.nn, config["critic_hidden_activation"]),
        scope="critic",
    )

    # Build critic network for t + 1.
    target_critic, _, _, target_critic_vars = self._build_critic_network(
        new_obs_ph_n,
        new_act_ph_n,
        obs_space_n,
        act_space_n,
        config["use_state_preprocessor"],
        config["critic_hiddens"],
        getattr(tf.nn, config["critic_hidden_activation"]),
        scope="target_critic",
    )

    # Build critic loss.
    td_error = tf.subtract(
        tf.stop_gradient(
            rew_ph
            + (1.0 - done_ph)
            * (config["gamma"] ** config["n_step"])
            * target_critic[:, 0]
        ),
        critic[:, 0],
    )
    critic_loss = tf.reduce_mean(td_error ** 2)

    # _____ Policy Network
    # Build actor network for t.
    act_sampler, actor_feature, actor_model, actor_vars = \
        self._build_actor_network(
            obs_ph_n[self.agent_idx],
            obs_space_n[self.agent_idx],
            act_space_n[self.agent_idx],
            config["use_state_preprocessor"],
            config["actor_hiddens"],
            getattr(tf.nn, config["actor_hidden_activation"]),
            scope="actor",
        )

    # Build actor network for t + 1.
    self.new_obs_ph = new_obs_ph_n[self.agent_idx]
    self.target_act_sampler, _, _, target_actor_vars = \
        self._build_actor_network(
            self.new_obs_ph,
            obs_space_n[self.agent_idx],
            act_space_n[self.agent_idx],
            config["use_state_preprocessor"],
            config["actor_hiddens"],
            getattr(tf.nn, config["actor_hidden_activation"]),
            scope="target_actor",
        )

    # Build actor loss.
    act_n = act_ph_n.copy()
    act_n[self.agent_idx] = act_sampler
    critic, _, _, _ = self._build_critic_network(
        obs_ph_n,
        act_n,
        obs_space_n,
        act_space_n,
        config["use_state_preprocessor"],
        config["critic_hiddens"],
        getattr(tf.nn, config["critic_hidden_activation"]),
        scope="critic",
    )
    actor_loss = -tf.reduce_mean(critic)
    if config["actor_feature_reg"] is not None:
        actor_loss += config["actor_feature_reg"] * tf.reduce_mean(
            actor_feature ** 2
        )

    # _____ Losses
    self.losses = {"critic": critic_loss, "actor": actor_loss}

    # _____ Optimizers
    self.optimizers = {
        "critic": tf1.train.AdamOptimizer(config["critic_lr"]),
        "actor": tf1.train.AdamOptimizer(config["actor_lr"]),
    }

    # _____ Build variable update ops.
    self.tau = tf1.placeholder_with_default(
        config["tau"], shape=(), name="tau")
    self.update_target_vars = _make_target_update_op(
        critic_vars + actor_vars, target_critic_vars + target_actor_vars,
        self.tau
    )

    self.vars = {
        "critic": critic_vars,
        "actor": actor_vars,
        "target_critic": target_critic_vars,
        "target_actor": target_actor_vars,
    }
    self.update_vars, self.vars_ph = _make_set_weight_op(self.vars)

    # _____ TensorFlow Initialization
    self.sess = tf1.get_default_session()

    loss_inputs = _make_loss_inputs(
        obs_ph_n + act_ph_n + new_obs_ph_n + new_act_ph_n
        + [rew_ph, done_ph]
    )

    TFPolicy.__init__(
        self,
        obs_space,
        act_space,
        config=config,
        sess=self.sess,
        obs_input=obs_ph_n[self.agent_idx],
        sampled_action=act_sampler,
        loss=actor_loss + critic_loss,
        loss_inputs=loss_inputs,
        dist_inputs=actor_feature,
    )

    self.sess.run(tf1.global_variables_initializer())

    # Hard initial update
    self.update_target(1.0)

def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    self.config = config
    self.sess = tf.get_default_session()

    # Setup the policy
    self.observations = tf.placeholder(
        tf.float32, [None] + list(observation_space.shape))
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.prev_actions = ModelCatalog.get_action_placeholder(action_space)
    self.prev_rewards = tf.placeholder(
        tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model(
        {
            "obs": self.observations,
            "prev_actions": self.prev_actions,
            "prev_rewards": self.prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        }, observation_space, action_space, logit_dim,
        self.config["model"])
    action_dist = dist_class(self.model.outputs)
    self.vf = self.model.value_function()
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    # Setup the policy loss
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        actions = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(
                action_space))
    advantages = tf.placeholder(tf.float32, [None], name="advantages")
    self.v_target = tf.placeholder(tf.float32, [None], name="v_target")
    self.loss = A3CLoss(action_dist, actions, advantages, self.v_target,
                        self.vf, self.config["vf_loss_coeff"],
                        self.config["entropy_coeff"])

    # Initialize TFPolicy
    loss_in = [
        (SampleBatch.CUR_OBS, self.observations),
        (SampleBatch.ACTIONS, actions),
        (SampleBatch.PREV_ACTIONS, self.prev_actions),
        (SampleBatch.PREV_REWARDS, self.prev_rewards),
        (Postprocessing.ADVANTAGES, advantages),
        (Postprocessing.VALUE_TARGETS, self.v_target),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicy.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=action_dist.sample(),
        action_prob=action_dist.sampled_action_prob(),
        loss=self.loss.total_loss,
        model=self.model,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=self.prev_actions,
        prev_reward_input=self.prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    self.stats_fetches = {
        LEARNER_STATS_KEY: {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "policy_entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(self.v_target, self.vf),
        },
    }

    self.sess.run(tf.global_variables_initializer())

def __init__(
        self,
        obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        config: TrainerConfigDict,
        loss_fn: Callable[[Policy, ModelV2, type, SampleBatch],
                          TensorType],
        *,
        stats_fn: Optional[Callable[[Policy, SampleBatch], Dict[
            str, TensorType]]] = None,
        grad_stats_fn: Optional[Callable[
            [Policy, SampleBatch, ModelGradients], Dict[
                str, TensorType]]] = None,
        before_loss_init: Optional[Callable[
            [Policy, gym.spaces.Space, gym.spaces.Space,
             TrainerConfigDict], None]] = None,
        make_model: Optional[Callable[
            [Policy, gym.spaces.Space, gym.spaces.Space,
             TrainerConfigDict], ModelV2]] = None,
        action_sampler_fn: Optional[Callable[
            [TensorType, List[TensorType]], Tuple[
                TensorType, TensorType]]] = None,
        action_distribution_fn: Optional[Callable[
            [Policy, ModelV2, TensorType, TensorType, TensorType], Tuple[
                TensorType, type, List[TensorType]]]] = None,
        existing_inputs: Optional[Dict[str, "tf1.placeholder"]] = None,
        existing_model: Optional[ModelV2] = None,
        get_batch_divisibility_req: Optional[int] = None,
        obs_include_prev_action_reward: bool = True):
    """Initialize a dynamic TF policy.

    Arguments:
        obs_space (gym.spaces.Space): Observation space of the policy.
        action_space (gym.spaces.Space): Action space of the policy.
        config (TrainerConfigDict): Policy-specific configuration data.
        loss_fn (Callable[[Policy, ModelV2, type, SampleBatch],
            TensorType]): Function that returns a loss tensor for the
            policy graph.
        stats_fn (Optional[Callable[[Policy, SampleBatch],
            Dict[str, TensorType]]]): Optional function that returns a
            dict of TF fetches given the policy and batch input tensors.
        grad_stats_fn (Optional[Callable[[Policy, SampleBatch,
            ModelGradients], Dict[str, TensorType]]]): Optional function
            that returns a dict of TF fetches given the policy, sample
            batch, and loss gradient tensors.
        before_loss_init (Optional[Callable[
            [Policy, gym.spaces.Space, gym.spaces.Space,
            TrainerConfigDict], None]]): Optional function to run prior
            to loss init that takes the same arguments as __init__.
        make_model (Optional[Callable[[Policy, gym.spaces.Space,
            gym.spaces.Space, TrainerConfigDict], ModelV2]]): Optional
            function that returns a ModelV2 object given policy,
            obs_space, action_space, and policy config. All policy
            variables should be created in this function. If not
            specified, a default model will be created.
        action_sampler_fn (Optional[Callable[[Policy, ModelV2, Dict[
            str, TensorType], TensorType, TensorType], Tuple[TensorType,
            TensorType]]]): A callable returning a sampled action and its
            log-likelihood given Policy, ModelV2, input_dict, explore,
            timestep, and is_training.
        action_distribution_fn (Optional[Callable[[Policy, ModelV2,
            Dict[str, TensorType], TensorType, TensorType],
            Tuple[TensorType, type, List[TensorType]]]]): A callable
            returning distribution inputs (parameters), a dist-class to
            generate an action distribution object from, and
            internal-state outputs (or an empty list if not applicable).
            Note: No Exploration hooks have to be called from within
            `action_distribution_fn`. It should only perform a simple
            forward pass through some model. If None, pass inputs through
            `self.model()` to get distribution inputs. The callable takes
            as inputs: Policy, ModelV2, input_dict, explore, timestep,
            is_training.
        existing_inputs (Optional[Dict[str, tf1.placeholder]]): When
            copying a policy, this specifies an existing dict of
            placeholders to use instead of defining new ones.
        existing_model (Optional[ModelV2]): When copying a policy, this
            specifies an existing model to clone and share weights with.
        get_batch_divisibility_req (Optional[Callable[[Policy], int]]):
            Optional callable that returns the divisibility requirement
            for sample batches given the Policy.
        obs_include_prev_action_reward (bool): Whether to include the
            previous action and reward in the model input
            (default: True).
    """
    self.observation_space = obs_space
    self.action_space = action_space
    self.config = config
    self.framework = "tf"
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn
    self._obs_include_prev_action_reward = obs_include_prev_action_reward

    # Setup standard placeholders
    prev_actions = None
    prev_rewards = None
    if existing_inputs is not None:
        obs = existing_inputs[SampleBatch.CUR_OBS]
        if self._obs_include_prev_action_reward:
            prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
            prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
        action_input = existing_inputs[SampleBatch.ACTIONS]
        explore = existing_inputs["is_exploring"]
        timestep = existing_inputs["timestep"]
    else:
        obs = tf1.placeholder(
            tf.float32,
            shape=[None] + list(obs_space.shape),
            name="observation")
        action_input = ModelCatalog.get_action_placeholder(action_space)
        if self._obs_include_prev_action_reward:
            prev_actions = ModelCatalog.get_action_placeholder(
                action_space, "prev_action")
            prev_rewards = tf1.placeholder(
                tf.float32, [None], name="prev_reward")
        explore = tf1.placeholder_with_default(
            True, (), name="is_exploring")
        timestep = tf1.placeholder(tf.int32, (), name="timestep")

    self._input_dict = {
        SampleBatch.CUR_OBS: obs,
        SampleBatch.PREV_ACTIONS: prev_actions,
        SampleBatch.PREV_REWARDS: prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }
    # Placeholder for RNN time-chunk valid lengths.
    self._seq_lens = tf1.placeholder(
        dtype=tf.int32, shape=[None], name="seq_lens")

    dist_class = dist_inputs = None
    if action_sampler_fn or action_distribution_fn:
        if not make_model:
            raise ValueError(
                "`make_model` is required if `action_sampler_fn` OR "
                "`action_distribution_fn` is given")
    else:
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    # Setup self.model.
    if existing_model:
        self.model = existing_model
    elif make_model:
        self.model = make_model(self, obs_space, action_space, config)
    else:
        self.model = ModelCatalog.get_model_v2(
            obs_space=obs_space,
            action_space=action_space,
            num_outputs=logit_dim,
            model_config=self.config["model"],
            framework="tf",
            **self.config["model"].get("custom_model_config", {}))

    # Create the Exploration object to use for this Policy.
    self.exploration = self._create_exploration()

    if existing_inputs:
        self._state_in = [
            v for k, v in existing_inputs.items()
            if k.startswith("state_in_")
        ]
        if self._state_in:
            self._seq_lens = existing_inputs["seq_lens"]
    else:
        self._state_in = [
            tf1.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
            for s in self.model.get_initial_state()
        ]

    # Fully customized action generation (e.g., custom policy).
    if action_sampler_fn:
        sampled_action, sampled_action_logp = action_sampler_fn(
            self,
            self.model,
            obs_batch=self._input_dict[SampleBatch.CUR_OBS],
            state_batches=self._state_in,
            seq_lens=self._seq_lens,
            prev_action_batch=self._input_dict[SampleBatch.PREV_ACTIONS],
            prev_reward_batch=self._input_dict[SampleBatch.PREV_REWARDS],
            explore=explore,
            is_training=self._input_dict["is_training"])
    else:
        # Distribution generation is customized, e.g., DQN, DDPG.
        if action_distribution_fn:
            dist_inputs, dist_class, self._state_out = \
                action_distribution_fn(
                    self,
                    self.model,
                    obs_batch=self._input_dict[SampleBatch.CUR_OBS],
                    state_batches=self._state_in,
                    seq_lens=self._seq_lens,
                    prev_action_batch=self._input_dict[
                        SampleBatch.PREV_ACTIONS],
                    prev_reward_batch=self._input_dict[
                        SampleBatch.PREV_REWARDS],
                    explore=explore,
                    is_training=self._input_dict["is_training"])
        # Default distribution generation behavior:
        # Pass through model. E.g., PG, PPO.
        else:
            dist_inputs, self._state_out = self.model(
                self._input_dict, self._state_in, self._seq_lens)

        action_dist = dist_class(dist_inputs, self.model)

        # Using exploration to get final action (e.g. via sampling).
        sampled_action, sampled_action_logp = \
            self.exploration.get_exploration_action(
                action_distribution=action_dist,
                timestep=timestep,
                explore=explore)

    # Phase 1 init.
    sess = tf1.get_default_session() or tf1.Session()
    if get_batch_divisibility_req:
        batch_divisibility_req = get_batch_divisibility_req(self)
    else:
        batch_divisibility_req = 1

    TFPolicy.__init__(
        self,
        observation_space=obs_space,
        action_space=action_space,
        config=config,
        sess=sess,
        obs_input=obs,
        action_input=action_input,  # for logp calculations
        sampled_action=sampled_action,
        sampled_action_logp=sampled_action_logp,
        dist_inputs=dist_inputs,
        dist_class=dist_class,
        loss=None,  # dynamically initialized on run
        loss_inputs=[],
        model=self.model,
        state_inputs=self._state_in,
        state_outputs=self._state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self._seq_lens,
        max_seq_len=config["model"]["max_seq_len"],
        batch_divisibility_req=batch_divisibility_req,
        explore=explore,
        timestep=timestep)

    # Phase 2 init.
    if before_loss_init is not None:
        before_loss_init(self, obs_space, action_space, config)

    if not existing_inputs:
        self._initialize_loss_dynamically()