Example #1
    def __init__(self, obs_space, action_space, config):
        config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
        self.config = config

        # Setup placeholders
        obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
        dist_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        prev_actions = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")

        # Create the model network and action outputs
        self.model = ModelCatalog.get_model({
            "obs": obs,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        }, obs_space, action_space, self.logit_dim, self.config["model"])
        action_dist = dist_class(self.model.outputs)  # logit for each action

        # Setup policy loss
        actions = ModelCatalog.get_action_placeholder(action_space)
        advantages = tf.placeholder(tf.float32, [None], name="adv")
        loss = PGLoss(action_dist, actions, advantages).loss

        # Mapping from sample batch keys to placeholders. These keys will be
        # read from postprocessed sample batches and fed into the specified
        # placeholders during loss computation.
        loss_in = [
            ("obs", obs),
            ("actions", actions),
            ("prev_actions", prev_actions),
            ("prev_rewards", prev_rewards),
            ("advantages", advantages),  # added during postprocessing
        ]

        # Initialize TFPolicyGraph
        sess = tf.get_default_session()
        TFPolicyGraph.__init__(
            self,
            obs_space,
            action_space,
            sess,
            obs_input=obs,
            action_sampler=action_dist.sample(),
            action_prob=action_dist.sampled_action_prob(),
            loss=loss,
            loss_inputs=loss_in,
            model=self.model,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            prev_action_input=prev_actions,
            prev_reward_input=prev_rewards,
            seq_lens=self.model.seq_lens,
            max_seq_len=config["model"]["max_seq_len"])
        sess.run(tf.global_variables_initializer())
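The `loss_in` list above is only a mapping from sample-batch column names to placeholders. A minimal sketch of how such a mapping can be turned into a feed dict at loss-computation time (the `build_feed_dict` helper and the dummy batch below are illustrative, not part of TFPolicyGraph):

# Illustrative only: feeding a postprocessed batch into the placeholders named
# in `loss_in`. Column names match the tuples above; the values are dummy data.
import numpy as np

def build_feed_dict(loss_in, batch):
    return {placeholder: batch[key] for key, placeholder in loss_in}

dummy_batch = {
    "obs": np.zeros((32, 4), np.float32),
    "actions": np.zeros((32,), np.int64),
    "prev_actions": np.zeros((32,), np.int64),
    "prev_rewards": np.zeros((32,), np.float32),
    "advantages": np.zeros((32,), np.float32),  # added during postprocessing
}
# feed = build_feed_dict(loss_in, dummy_batch)
# sess.run(train_op, feed_dict=feed)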
Example #2
File: policy.py  Project: adgirish/ray
    def _setup_graph(self, ob_space, ac_space):
        self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
        dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
        self._model = ModelCatalog.get_model(
            self.registry, self.x, self.logit_dim, self.config["model"])
        self.logits = self._model.outputs
        self.curr_dist = dist_class(self.logits)
        self.sample = self.curr_dist.sample()
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)
Example #3
    def __init__(self, obs_space, action_space, config):
        config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
        self.config = config
        _, self.logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        self.model = ModelCatalog.get_torch_model(obs_space, self.logit_dim,
                                                  self.config["model"])
        loss = A3CLoss(self.model, self.config["vf_loss_coeff"],
                       self.config["entropy_coeff"])
        TorchPolicyGraph.__init__(
            self,
            obs_space,
            action_space,
            self.model,
            loss,
            loss_inputs=["obs", "actions", "advantages", "value_targets"])
Example #4
    def _setup_graph(self, ob_space, ac_space):
        self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
        dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
        self._model = ModelCatalog.get_model(
            self.registry, self.x, self.logit_dim, self.config["model"])
        self.logits = self._model.outputs
        self.curr_dist = dist_class(self.logits)
        self.vf = tf.reshape(linear(self._model.last_layer, 1, "value",
                                    normc_initializer(1.0)), [-1])

        self.sample = self.curr_dist.sample()
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)
        self.global_step = tf.get_variable(
            "global_step", [], tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)
Example #5
    def __init__(self,
                 observation_space,
                 action_space,
                 config,
                 existing_inputs=None):
        """
        Arguments:
            observation_space: Environment observation space specification.
            action_space: Environment action space specification.
            config (dict): Configuration values for PPO graph.
            existing_inputs (list): Optional list of tuples specifying the
                placeholders upon which the graph should be built.
        """
        config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
        self.sess = tf.get_default_session()
        self.action_space = action_space
        self.config = config
        self.kl_coeff_val = self.config["kl_coeff"]
        self.kl_target = self.config["kl_target"]
        dist_cls, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

        if existing_inputs:
            obs_ph, value_targets_ph, adv_ph, act_ph, \
                logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \
                existing_inputs[:8]
            existing_state_in = existing_inputs[8:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            obs_ph = tf.placeholder(
                tf.float32,
                name="obs",
                shape=(None, ) + observation_space.shape)
            adv_ph = tf.placeholder(
                tf.float32, name="advantages", shape=(None, ))
            act_ph = ModelCatalog.get_action_placeholder(action_space)
            logits_ph = tf.placeholder(
                tf.float32, name="logits", shape=(None, logit_dim))
            vf_preds_ph = tf.placeholder(
                tf.float32, name="vf_preds", shape=(None, ))
            value_targets_ph = tf.placeholder(
                tf.float32, name="value_targets", shape=(None, ))
            prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
            prev_rewards_ph = tf.placeholder(
                tf.float32, [None], name="prev_reward")
            existing_state_in = None
            existing_seq_lens = None
        self.observations = obs_ph
        self.prev_actions = prev_actions_ph
        self.prev_rewards = prev_rewards_ph

        self.loss_in = [
            ("obs", obs_ph),
            ("value_targets", value_targets_ph),
            ("advantages", adv_ph),
            ("actions", act_ph),
            ("logits", logits_ph),
            ("vf_preds", vf_preds_ph),
            ("prev_actions", prev_actions_ph),
            ("prev_rewards", prev_rewards_ph),
        ]
        self.model = ModelCatalog.get_model(
            {
                "obs": obs_ph,
                "prev_actions": prev_actions_ph,
                "prev_rewards": prev_rewards_ph,
                "is_training": self._get_is_training_placeholder(),
            },
            observation_space,
            action_space,
            logit_dim,
            self.config["model"],
            state_in=existing_state_in,
            seq_lens=existing_seq_lens)

        # KL Coefficient
        self.kl_coeff = tf.get_variable(
            initializer=tf.constant_initializer(self.kl_coeff_val),
            name="kl_coeff",
            shape=(),
            trainable=False,
            dtype=tf.float32)

        self.logits = self.model.outputs
        curr_action_dist = dist_cls(self.logits)
        self.sampler = curr_action_dist.sample()
        if self.config["use_gae"]:
            if self.config["vf_share_layers"]:
                self.value_function = self.model.value_function()
            else:
                vf_config = self.config["model"].copy()
                # Do not split the last layer of the value function into
                # mean parameters and standard deviation parameters and
                # do not make the standard deviations free variables.
                vf_config["free_log_std"] = False
                if vf_config["use_lstm"]:
                    vf_config["use_lstm"] = False
                    logger.warning(
                        "It is not recommended to use a LSTM model with "
                        "vf_share_layers=False (consider setting it to True). "
                        "If you want to not share layers, you can implement "
                        "a custom LSTM model that overrides the "
                        "value_function() method.")
                with tf.variable_scope("value_function"):
                    self.value_function = ModelCatalog.get_model({
                        "obs": obs_ph,
                        "prev_actions": prev_actions_ph,
                        "prev_rewards": prev_rewards_ph,
                        "is_training": self._get_is_training_placeholder(),
                    }, observation_space, action_space, 1, vf_config).outputs
                    self.value_function = tf.reshape(self.value_function, [-1])
        else:
            self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

        if self.model.state_in:
            max_seq_len = tf.reduce_max(self.model.seq_lens)
            mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
            mask = tf.reshape(mask, [-1])
        else:
            mask = tf.ones_like(adv_ph, dtype=tf.bool)

        self.loss_obj = PPOLoss(
            action_space,
            value_targets_ph,
            adv_ph,
            act_ph,
            logits_ph,
            vf_preds_ph,
            curr_action_dist,
            self.value_function,
            self.kl_coeff,
            mask,
            entropy_coeff=self.config["entropy_coeff"],
            clip_param=self.config["clip_param"],
            vf_clip_param=self.config["vf_clip_param"],
            vf_loss_coeff=self.config["vf_loss_coeff"],
            use_gae=self.config["use_gae"])

        LearningRateSchedule.__init__(self, self.config["lr"],
                                      self.config["lr_schedule"])
        TFPolicyGraph.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=obs_ph,
            action_sampler=self.sampler,
            action_prob=curr_action_dist.sampled_action_prob(),
            loss=self.loss_obj.loss,
            model=self.model,
            loss_inputs=self.loss_in,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            prev_action_input=prev_actions_ph,
            prev_reward_input=prev_rewards_ph,
            seq_lens=self.model.seq_lens,
            max_seq_len=config["model"]["max_seq_len"])

        self.sess.run(tf.global_variables_initializer())
        self.explained_variance = explained_variance(value_targets_ph,
                                                     self.value_function)
        self.stats_fetches = {
            "cur_kl_coeff": self.kl_coeff,
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "total_loss": self.loss_obj.loss,
            "policy_loss": self.loss_obj.mean_policy_loss,
            "vf_loss": self.loss_obj.mean_vf_loss,
            "vf_explained_var": self.explained_variance,
            "kl": self.loss_obj.mean_kl,
            "entropy": self.loss_obj.mean_entropy
        }
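The graph above only creates `kl_coeff` (a non-trainable variable) and stores `kl_target`; the adaptive update itself is not part of this excerpt. A hedged sketch of the standard adaptive-KL rule such a pair is typically driven by (method name and thresholds are illustrative):

    # Sketch only: the adaptive-KL rule usually paired with kl_coeff/kl_target
    # (not shown in the excerpt above; thresholds follow the common PPO recipe).
    def update_kl(self, sampled_kl):
        if sampled_kl > 2.0 * self.kl_target:
            self.kl_coeff_val *= 1.5  # policy moved too far -> raise penalty
        elif sampled_kl < 0.5 * self.kl_target:
            self.kl_coeff_val *= 0.5  # policy barely moved -> relax penalty
        self.kl_coeff.load(self.kl_coeff_val, session=self.sess)  # sync TF var
        return self.kl_coeff_val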
Example #6
        def __init__(self, obs_space, action_space, config):
            # Set up the config from possible default-config fn and given
            # config arg.
            if get_default_config:
                config = dict(get_default_config(), **config)
            self.config = config

            # Set the DL framework for this Policy.
            self.framework = self.config["framework"] = framework

            # Validate observation- and action-spaces.
            if validate_spaces:
                validate_spaces(self, obs_space, action_space, self.config)

            # Do some pre-initialization steps.
            if before_init:
                before_init(self, obs_space, action_space, self.config)

            # Model is customized (use default action dist class).
            if make_model:
                assert make_model_and_action_dist is None, \
                    "Either `make_model` or `make_model_and_action_dist`" \
                    " must be None!"
                self.model = make_model(self, obs_space, action_space, config)
                dist_class, _ = ModelCatalog.get_action_dist(
                    action_space, self.config["model"], framework=framework)
            # Model and action dist class are customized.
            elif make_model_and_action_dist:
                self.model, dist_class = make_model_and_action_dist(
                    self, obs_space, action_space, config)
            # Use default model and default action dist.
            else:
                dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"], framework=framework)
                self.model = ModelCatalog.get_model_v2(
                    obs_space=obs_space,
                    action_space=action_space,
                    num_outputs=logit_dim,
                    model_config=self.config["model"],
                    framework=framework)

            # Make sure we passed in a correct Model factory.
            model_cls = TorchModelV2 if framework == "torch" else JAXModelV2
            assert isinstance(self.model, model_cls), \
                "ERROR: Generated Model must be a {} object!".format(
                    model_cls.__name__)

            # Call the framework-specific Policy constructor.
            self.parent_cls = parent_cls
            self.parent_cls.__init__(
                self,
                observation_space=obs_space,
                action_space=action_space,
                config=config,
                model=self.model,
                loss=None if self.config["in_evaluation"] else loss_fn,
                action_distribution_class=dist_class,
                action_sampler_fn=action_sampler_fn,
                action_distribution_fn=action_distribution_fn,
                max_seq_len=config["model"]["max_seq_len"],
                get_batch_divisibility_req=get_batch_divisibility_req,
            )

            # Merge Model's view requirements into Policy's.
            self.view_requirements.update(self.model.view_requirements)

            _before_loss_init = before_loss_init or after_init
            if _before_loss_init:
                _before_loss_init(self, self.observation_space,
                                  self.action_space, config)

            # Perform test runs through postprocessing- and loss functions.
            self._initialize_loss_from_dummy_batch(
                auto_remove_unneeded_view_reqs=True,
                stats_fn=None if self.config["in_evaluation"] else stats_fn,
            )

            if _after_loss_init:
                _after_loss_init(self, obs_space, action_space, config)

            # Got to reset global_timestep again after this fake run-through.
            self.global_timestep = 0
Example #7
    def _setup_graph(self, ob_space, ac_space):
        _, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
        self._model = ModelCatalog.get_torch_model(
            self.registry, ob_space, self.logit_dim, self.config["model"])
        self.optimizer = torch.optim.Adam(
            self._model.parameters(), lr=self.config["lr"])
Example #8
    def __init__(self,
                 observation_space,
                 action_space,
                 config,
                 existing_inputs=None):
        config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
        assert config["batch_mode"] == "truncate_episodes", \
            "Must use `truncate_episodes` batch mode with V-trace."
        self.config = config
        self.sess = tf.get_default_session()

        # Create input placeholders
        if existing_inputs:
            actions, dones, behaviour_logits, rewards, observations, \
                prev_actions, prev_rewards = existing_inputs[:7]
            existing_state_in = existing_inputs[7:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            if isinstance(action_space, gym.spaces.Discrete):
                ac_size = action_space.n
                actions = tf.placeholder(tf.int64, [None], name="ac")
            else:
                raise UnsupportedSpaceException(
                    "Action space {} is not supported for IMPALA.".format(
                        action_space))
            dones = tf.placeholder(tf.bool, [None], name="dones")
            rewards = tf.placeholder(tf.float32, [None], name="rewards")
            behaviour_logits = tf.placeholder(tf.float32, [None, ac_size],
                                              name="behaviour_logits")
            observations = tf.placeholder(tf.float32, [None] +
                                          list(observation_space.shape))
            existing_state_in = None
            existing_seq_lens = None

        # Setup the policy
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        prev_actions = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
        self.model = ModelCatalog.get_model(
            {
                "obs": observations,
                "prev_actions": prev_actions,
                "prev_rewards": prev_rewards,
                "is_training": self._get_is_training_placeholder(),
            },
            observation_space,
            logit_dim,
            self.config["model"],
            state_in=existing_state_in,
            seq_lens=existing_seq_lens)
        action_dist = dist_class(self.model.outputs)
        values = self.model.value_function()
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)

        def to_batches(tensor):
            if self.config["model"]["use_lstm"]:
                B = tf.shape(self.model.seq_lens)[0]
                T = tf.shape(tensor)[0] // B
            else:
                # Important: chop the tensor into batches at known episode cut
                # boundaries. TODO(ekl) this is kind of a hack
                T = self.config["sample_batch_size"]
                B = tf.shape(tensor)[0] // T
            rs = tf.reshape(tensor,
                            tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))
            # swap B and T axes
            return tf.transpose(
                rs,
                [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

        if self.model.state_in:
            max_seq_len = tf.reduce_max(self.model.seq_lens) - 1
            mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
            mask = tf.reshape(mask, [-1])
        else:
            mask = tf.ones_like(rewards)

        # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
        self.loss = VTraceLoss(
            actions=to_batches(actions)[:-1],
            actions_logp=to_batches(action_dist.logp(actions))[:-1],
            actions_entropy=to_batches(action_dist.entropy())[:-1],
            dones=to_batches(dones)[:-1],
            behaviour_logits=to_batches(behaviour_logits)[:-1],
            target_logits=to_batches(self.model.outputs)[:-1],
            discount=config["gamma"],
            rewards=to_batches(rewards)[:-1],
            values=to_batches(values)[:-1],
            bootstrap_value=to_batches(values)[-1],
            valid_mask=to_batches(mask)[:-1],
            vf_loss_coeff=self.config["vf_loss_coeff"],
            entropy_coeff=self.config["entropy_coeff"],
            clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
            clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"])

        # KL divergence between worker and learner logits for debugging
        model_dist = Categorical(self.model.outputs)
        behaviour_dist = Categorical(behaviour_logits)
        self.KLs = model_dist.kl(behaviour_dist)
        self.mean_KL = tf.reduce_mean(self.KLs)
        self.max_KL = tf.reduce_max(self.KLs)
        self.median_KL = tf.contrib.distributions.percentile(self.KLs, 50.0)

        # Initialize TFPolicyGraph
        loss_in = [
            ("actions", actions),
            ("dones", dones),
            ("behaviour_logits", behaviour_logits),
            ("rewards", rewards),
            ("obs", observations),
            ("prev_actions", prev_actions),
            ("prev_rewards", prev_rewards),
        ]
        LearningRateSchedule.__init__(self, self.config["lr"],
                                      self.config["lr_schedule"])
        TFPolicyGraph.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=observations,
            action_sampler=action_dist.sample(),
            loss=self.model.loss() + self.loss.total_loss,
            loss_inputs=loss_in,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            prev_action_input=prev_actions,
            prev_reward_input=prev_rewards,
            seq_lens=self.model.seq_lens,
            max_seq_len=self.config["model"]["max_seq_len"],
            batch_divisibility_req=self.config["sample_batch_size"])

        self.sess.run(tf.global_variables_initializer())

        self.stats_fetches = {
            "stats": {
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "policy_loss": self.loss.pi_loss,
                "entropy": self.loss.entropy,
                "grad_gnorm": tf.global_norm(self._grads),
                "var_gnorm": tf.global_norm(self.var_list),
                "vf_loss": self.loss.vf_loss,
                "vf_explained_var": explained_variance(
                    tf.reshape(self.loss.vtrace_returns.vs, [-1]),
                    tf.reshape(to_batches(values)[:-1], [-1])),
                "mean_KL": self.mean_KL,
                "max_KL": self.max_KL,
                "median_KL": self.median_KL,
            },
        }
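A small NumPy sketch of what the `to_batches` helper above does (made-up shapes; the real code performs the same reshape and transpose with TF ops):

import numpy as np

T, B = 5, 4                      # rollout length and number of sequences
flat = np.arange(T * B)          # a flattened sample-batch column, shape [B * T]
rs = flat.reshape(B, T)          # chop into sequences at episode boundaries
time_major = rs.transpose(1, 0)  # swap B and T axes -> shape [T, B]
# V-trace then consumes time_major[:-1] as the per-step inputs and, for the
# value column, uses time_major[-1] as the bootstrap value of the final state.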
Example #9
    def __init__(self,
                 obs_space,
                 action_space,
                 config,
                 loss_fn,
                 stats_fn=None,
                 grad_stats_fn=None,
                 before_loss_init=None,
                 make_model=None,
                 action_sampler_fn=None,
                 existing_inputs=None,
                 existing_model=None,
                 get_batch_divisibility_req=None,
                 obs_include_prev_action_reward=True):
        """Initialize a dynamic TF policy.

        Arguments:
            obs_space (gym.Space): Observation space of the policy.
            action_space (gym.Space): Action space of the policy.
            config (dict): Policy-specific configuration data.
            loss_fn (func): function that returns a loss tensor given the
                policy graph and a dict of experience tensor placeholders
            stats_fn (func): optional function that returns a dict of
                TF fetches given the policy and batch input tensors
            grad_stats_fn (func): optional function that returns a dict of
                TF fetches given the policy and loss gradient tensors
            before_loss_init (func): optional function to run prior to loss
                init that takes the same arguments as __init__
            make_model (func): optional function that returns a ModelV2 object
                given (policy, obs_space, action_space, config).
                All policy variables should be created in this function. If not
                specified, a default model will be created.
            action_sampler_fn (func): optional function that returns a
                tuple of action and action logp tensors given
                (policy, model, input_dict, obs_space, action_space, config).
                If not specified, a default action distribution will be used.
            existing_inputs (OrderedDict): when copying a policy, this
                specifies an existing dict of placeholders to use instead of
                defining new ones
            existing_model (ModelV2): when copying a policy, this specifies
                an existing model to clone and share weights with
            get_batch_divisibility_req (func): optional function that returns
                the divisibility requirement for sample batches
            obs_include_prev_action_reward (bool): whether to include the
                previous action and reward in the model input

        Attributes:
            config: config of the policy
            model: model instance, if any
        """
        self.config = config
        self._loss_fn = loss_fn
        self._stats_fn = stats_fn
        self._grad_stats_fn = grad_stats_fn
        self._obs_include_prev_action_reward = obs_include_prev_action_reward

        # Setup standard placeholders
        prev_actions = None
        prev_rewards = None
        if existing_inputs is not None:
            obs = existing_inputs[SampleBatch.CUR_OBS]
            if self._obs_include_prev_action_reward:
                prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
                prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
        else:
            obs = tf.placeholder(tf.float32,
                                 shape=[None] + list(obs_space.shape),
                                 name="observation")
            if self._obs_include_prev_action_reward:
                prev_actions = ModelCatalog.get_action_placeholder(
                    action_space)
                prev_rewards = tf.placeholder(tf.float32, [None],
                                              name="prev_reward")

        self._input_dict = {
            SampleBatch.CUR_OBS: obs,
            SampleBatch.PREV_ACTIONS: prev_actions,
            SampleBatch.PREV_REWARDS: prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        }

        self._seq_lens = tf.placeholder(dtype=tf.int32,
                                        shape=[None],
                                        name="seq_lens")

        # Setup model
        if action_sampler_fn:
            if not make_model:
                raise ValueError(
                    "make_model is required if action_sampler_fn is given")
            self._dist_class = None
        else:
            self._dist_class, logit_dim = ModelCatalog.get_action_dist(
                action_space, self.config["model"])

        if existing_model:
            self.model = existing_model
        elif make_model:
            self.model = make_model(self, obs_space, action_space, config)
        else:
            self.model = ModelCatalog.get_model_v2(obs_space,
                                                   action_space,
                                                   logit_dim,
                                                   self.config["model"],
                                                   framework="tf")

        # if existing_inputs:
        #     for name, mask_input in existing_inputs.items():
        #         if not name.endswith("mask"):
        #             continue
        #         else:
        #             self._input_dict[name] = mask_input
        # else:
        # PENGZHENGHAO
        # for name, ph in self.model.mask_placeholder_dict.items():
        #     self._input_dict[name] = ph
        # print("Current key names of input dict: ", self._input_dict.keys())

        if existing_inputs:
            self._state_in = [
                v for k, v in existing_inputs.items()
                if k.startswith("state_in_")
            ]
            if self._state_in:
                self._seq_lens = existing_inputs["seq_lens"]
        else:
            self._state_in = [
                tf.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
                for s in self.model.get_initial_state()
            ]

        model_out, self._state_out = self.model(self._input_dict,
                                                self._state_in, self._seq_lens)

        # Setup action sampler
        if action_sampler_fn:
            action_sampler, action_logp = action_sampler_fn(
                self, self.model, self._input_dict, obs_space, action_space,
                config)
        else:
            action_dist = self._dist_class(model_out, self.model)
            action_sampler = action_dist.sample()
            action_logp = action_dist.sampled_action_logp()

        # Phase 1 init
        sess = tf.get_default_session() or tf.Session()
        if get_batch_divisibility_req:
            batch_divisibility_req = get_batch_divisibility_req(self)
        else:
            batch_divisibility_req = 1

        TFPolicy.__init__(
            self,
            obs_space,
            action_space,
            sess,
            obs_input=obs,
            action_sampler=action_sampler,
            action_logp=action_logp,
            loss=None,  # dynamically initialized on run
            loss_inputs=[],
            model=self.model,
            state_inputs=self._state_in,
            state_outputs=self._state_out,
            prev_action_input=prev_actions,
            prev_reward_input=prev_rewards,
            seq_lens=self._seq_lens,
            max_seq_len=config["model"]["max_seq_len"],
            batch_divisibility_req=batch_divisibility_req)

        # Phase 2 init
        before_loss_init(self, obs_space, action_space, config)
        # print("Before dynamic_tf_policy's initialize_loss")
        if not existing_inputs:
            self._initialize_loss()
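For reference, a rough sketch of the kind of `loss_fn` the docstring above describes (column names follow the loss_inputs used in earlier examples; `policy.action_dist` is an assumed attribute used purely for illustration):

# Sketch only: a PG-style loss_fn taking (policy, batch_tensors) and returning
# a scalar loss tensor. `policy.action_dist` is assumed, not defined here.
def example_pg_loss_fn(policy, batch_tensors):
    logp = policy.action_dist.logp(batch_tensors["actions"])
    return -tf.reduce_mean(logp * batch_tensors["advantages"])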
Example #10
    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
        assert config["batch_mode"] == "truncate_episodes", \
            "Must use `truncate_episodes` batch mode with V-trace."
        self.config = config
        self.sess = tf.get_default_session()

        # Setup the policy
        self.observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        self.model = ModelCatalog.get_model(self.observations, logit_dim,
                                            self.config["model"])
        action_dist = dist_class(self.model.outputs)
        values = tf.reshape(
            linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
            [-1])
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)

        # Setup the policy loss
        if isinstance(action_space, gym.spaces.Discrete):
            ac_size = action_space.n
            actions = tf.placeholder(tf.int64, [None], name="ac")
        else:
            raise UnsupportedSpaceException(
                "Action space {} is not supported for IMPALA.".format(
                    action_space))
        dones = tf.placeholder(tf.bool, [None], name="dones")
        rewards = tf.placeholder(tf.float32, [None], name="rewards")
        behaviour_logits = tf.placeholder(
            tf.float32, [None, ac_size], name="behaviour_logits")

        def to_batches(tensor):
            if self.config["model"]["use_lstm"]:
                B = tf.shape(self.model.seq_lens)[0]
                T = tf.shape(tensor)[0] // B
            else:
                # Important: chop the tensor into batches at known episode cut
                # boundaries. TODO(ekl) this is kind of a hack
                T = self.config["sample_batch_size"]
                B = tf.shape(tensor)[0] // T
            rs = tf.reshape(tensor,
                            tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))
            # swap B and T axes
            return tf.transpose(
                rs,
                [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

        # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
        self.loss = VTraceLoss(
            actions=to_batches(actions)[:-1],
            actions_logp=to_batches(action_dist.logp(actions))[:-1],
            actions_entropy=to_batches(action_dist.entropy())[:-1],
            dones=to_batches(dones)[:-1],
            behaviour_logits=to_batches(behaviour_logits)[:-1],
            target_logits=to_batches(self.model.outputs)[:-1],
            discount=config["gamma"],
            rewards=to_batches(rewards)[:-1],
            values=to_batches(values)[:-1],
            bootstrap_value=to_batches(values)[-1],
            vf_loss_coeff=self.config["vf_loss_coeff"],
            entropy_coeff=self.config["entropy_coeff"],
            clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
            clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"])

        # Initialize TFPolicyGraph
        loss_in = [
            ("actions", actions),
            ("dones", dones),
            ("behaviour_logits", behaviour_logits),
            ("rewards", rewards),
            ("obs", self.observations),
        ]
        LearningRateSchedule.__init__(self, self.config["lr"],
                                      self.config["lr_schedule"])
        TFPolicyGraph.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=self.observations,
            action_sampler=action_dist.sample(),
            loss=self.loss.total_loss,
            loss_inputs=loss_in,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            seq_lens=self.model.seq_lens,
            max_seq_len=self.config["model"]["max_seq_len"])

        self.sess.run(tf.global_variables_initializer())

        self.stats_fetches = {
            "stats": {
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "policy_loss": self.loss.pi_loss,
                "entropy": self.loss.entropy,
                "grad_gnorm": tf.global_norm(self._grads),
                "var_gnorm": tf.global_norm(self.var_list),
                "vf_loss": self.loss.vf_loss,
                "vf_explained_var": explained_variance(
                    tf.reshape(self.loss.vtrace_returns.vs, [-1]),
                    tf.reshape(to_batches(values)[:-1], [-1])),
            },
        }
Example #11
    def __init__(self,
                 action_space,
                 value_targets,
                 advantages,
                 actions,
                 logprobs,
                 vf_preds,
                 curr_action_dist,
                 value_fn,
                 cur_kl_coeff,
                 entropy_coeff=0,
                 clip_param=0.1,
                 vf_loss_coeff=1.0,
                 use_gae=True):
        """Constructs the loss for Proximal Policy Objective.

        Arguments:
            action_space: Environment action space specification.
            value_targets (Placeholder): Placeholder for target values; used
                for GAE.
            actions (Placeholder): Placeholder for actions taken
                from previous model evaluation.
            advantages (Placeholder): Placeholder for calculated advantages
                from previous model evaluation.
            logprobs (Placeholder): Placeholder for logits output from
                previous model evaluation.
            vf_preds (Placeholder): Placeholder for value function output
                from previous model evaluation.
            curr_action_dist (ActionDistribution): ActionDistribution
                of the current model.
            value_fn (Tensor): Current value function output Tensor.
            cur_kl_coeff (Variable): Variable holding the current PPO KL
                coefficient.
            entropy_coeff (float): Coefficient of the entropy regularizer.
            clip_param (float): Clip parameter
            vf_loss_coeff (float): Coefficient of the value function loss
            use_gae (bool): If true, use the Generalized Advantage Estimator.
        """
        dist_cls, _ = ModelCatalog.get_action_dist(action_space)
        prev_dist = dist_cls(logprobs)
        # Make loss functions.
        logp_ratio = tf.exp(
            curr_action_dist.logp(actions) - prev_dist.logp(actions))
        action_kl = prev_dist.kl(curr_action_dist)
        self.mean_kl = tf.reduce_mean(action_kl)

        curr_entropy = curr_action_dist.entropy()
        self.mean_entropy = tf.reduce_mean(curr_entropy)

        surrogate_loss = tf.minimum(
            advantages * logp_ratio,
            advantages *
            tf.clip_by_value(logp_ratio, 1 - clip_param, 1 + clip_param))
        self.mean_policy_loss = tf.reduce_mean(-surrogate_loss)

        if use_gae:
            vf_loss1 = tf.square(value_fn - value_targets)
            vf_clipped = vf_preds + tf.clip_by_value(value_fn - vf_preds,
                                                     -clip_param, clip_param)
            vf_loss2 = tf.square(vf_clipped - value_targets)
            vf_loss = tf.minimum(vf_loss1, vf_loss2)
            self.mean_vf_loss = tf.reduce_mean(vf_loss)
            loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl +
                                  vf_loss_coeff * vf_loss -
                                  entropy_coeff * curr_entropy)
        else:
            self.mean_vf_loss = tf.constant(0.0)
            loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl -
                                  entropy_coeff * curr_entropy)
        self.loss = loss
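A toy NumPy illustration of the clipped surrogate term computed above (numbers are made up):

import numpy as np

clip_param = 0.1
advantages = np.array([1.0, 1.0, -1.0])
logp_ratio = np.array([1.3, 0.8, 1.3])  # exp(new_logp - old_logp)
clipped = np.clip(logp_ratio, 1 - clip_param, 1 + clip_param)
surrogate = np.minimum(advantages * logp_ratio, advantages * clipped)
# -> [1.1, 0.8, -1.3]: gains from positive advantages are capped at
#    1 + clip_param, while the negative-advantage term keeps the more
#    pessimistic (unclipped) value, so the policy update stays conservative.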
Example #12
        def __init__(self, observation_space, action_space, config):
            assert tf.executing_eagerly()
            self.framework = config.get("framework", "tfe")
            Policy.__init__(self, observation_space, action_space, config)
            self._is_training = False
            self._loss_initialized = False
            self._sess = None

            self._loss = loss_fn
            self.batch_divisibility_req = get_batch_divisibility_req(self) if \
                callable(get_batch_divisibility_req) else \
                (get_batch_divisibility_req or 1)
            self._max_seq_len = config["model"]["max_seq_len"]

            if get_default_config:
                config = dict(get_default_config(), **config)

            if validate_spaces:
                validate_spaces(self, observation_space, action_space, config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config
            self.dist_class = None
            if action_sampler_fn or action_distribution_fn:
                if not make_model:
                    raise ValueError(
                        "`make_model` is required if `action_sampler_fn` OR "
                        "`action_distribution_fn` is given")
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework=self.framework,
                )
            self.exploration = self._create_exploration()
            self._state_in = [
                tf.convert_to_tensor([s])
                for s in self.model.get_initial_state()
            ]
            input_dict = {
                SampleBatch.CUR_OBS:
                tf.convert_to_tensor(np.array([observation_space.sample()])),
                SampleBatch.PREV_ACTIONS:
                tf.convert_to_tensor(
                    [flatten_to_single_ndarray(action_space.sample())]),
                SampleBatch.PREV_REWARDS:
                tf.convert_to_tensor([0.]),
            }

            if action_distribution_fn:
                dist_inputs, self.dist_class, _ = action_distribution_fn(
                    self, self.model, input_dict[SampleBatch.CUR_OBS])
            else:
                self.model(input_dict, self._state_in,
                           tf.convert_to_tensor([1]))

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            self._initialize_loss_with_dummy_batch()
            self._loss_initialized = True

            if optimizer_fn:
                self._optimizer = optimizer_fn(self, config)
            else:
                self._optimizer = tf.keras.optimizers.Adam(config["lr"])

            if after_init:
                after_init(self, observation_space, action_space, config)
Example #13
    def __init__(self,
                 action_space,
                 value_targets,
                 advantages_ext,
                 advantages_int,
                 actions,
                 logits,
                 vf_preds,
                 curr_action_dist,
                 value_fn,
                 cur_kl_coeff,
                 rnd_target,
                 rnd_predictor,
                 entropy_coeff=0,
                 clip_param=0.1,
                 vf_clip_param=0.1,
                 vf_loss_coeff=1.0,
                 use_gae=True,
                 rnd_pred_update_prop=0.25):
        """Constructs the loss for Proximal Policy Objective with Random Networks Distillation
        Arguments:
            action_space: Environment action space specification.
            value_targets (Placeholder): Placeholder for target values; used
                for GAE.
            actions (Placeholder): Placeholder for actions taken
                from previous model evaluation.
            advantages_ext (Placeholder): Placeholder for calculated extrinsic advantages
                from previous model evaluation.
            advantages_int (Placeholder): Placeholder for calculated intrinsic advantages
                from previous model evaluation.
            logits (Placeholder): Placeholder for logits output from
                previous model evaluation.
            vf_preds (Placeholder): Placeholder for value function output
                from previous model evaluation.
            curr_action_dist (ActionDistribution): ActionDistribution
                of the current model.
            value_fn (Tensor): Current value function output Tensor.
            cur_kl_coeff (Variable): Variable holding the current PPO KL
                coefficient.
            rnd_target (Tensor): Current RND target network output Tensor
            rnd_predictor (Tensor): Current RND predictor network output Tensor
            entropy_coeff (float): Coefficient of the entropy regularizer.
            clip_param (float): Clip parameter
            vf_clip_param (float): Clip parameter for the value function
            vf_loss_coeff (float): Coefficient of the value function loss
            use_gae (bool): If true, use the Generalized Advantage Estimator.
            rnd_pred_update_prop (float): Proportion of experience used for RND predictor update.
        """
        dist_cls, _ = ModelCatalog.get_action_dist(action_space)
        prev_dist = dist_cls(logits)
        # Make loss functions.
        logp_ratio = tf.exp(
            curr_action_dist.logp(actions) - prev_dist.logp(actions))
        action_kl = prev_dist.kl(curr_action_dist)
        self.mean_kl = tf.reduce_mean(action_kl)

        curr_entropy = curr_action_dist.entropy()
        self.mean_entropy = tf.reduce_mean(curr_entropy)

        surrogate_loss = tf.minimum(
            advantages_ext * logp_ratio,
            advantages_ext *
            tf.clip_by_value(logp_ratio, 1 - clip_param, 1 + clip_param))
        self.mean_policy_loss = tf.reduce_mean(-surrogate_loss)

        if use_gae:
            vf_loss1 = tf.square(value_fn - value_targets)
            vf_clipped = vf_preds + tf.clip_by_value(
                value_fn - vf_preds, -vf_clip_param, vf_clip_param)
            vf_loss2 = tf.square(vf_clipped - value_targets)
            vf_loss = tf.maximum(vf_loss1, vf_loss2)
            self.mean_vf_loss = tf.reduce_mean(vf_loss)
            loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl +
                                  vf_loss_coeff * vf_loss -
                                  entropy_coeff * curr_entropy)
        else:
            self.mean_vf_loss = tf.constant(0.0)
            loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl -
                                  entropy_coeff * curr_entropy)
        # TODO: add value loss for intrinsic rewards

        # Add RND loss terms to vf_loss
        # feat_var = tf.reduce_mean(tf.nn.moments(rnd_target, axes=[0])[1])  # TODO: use where?
        # max_feat = tf.reduce_max(tf.abs(rnd_target))  # TODO: use where?
        targets = tf.stop_gradient(rnd_target)
        self.int_rew = tf.reduce_mean(tf.square(targets - rnd_predictor),
                                      axis=-1,
                                      keep_dims=True)
        self.aux_loss = tf.reduce_mean(tf.square(targets - rnd_predictor), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < rnd_pred_update_prop, tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
        loss = loss + self.aux_loss

        self.loss = loss
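A NumPy sketch of the RND terms above: the per-sample squared error between the frozen target network and the predictor doubles as the intrinsic reward, and only a random `rnd_pred_update_prop` fraction of samples contributes to the predictor loss (shapes and values are made up):

import numpy as np

rng = np.random.default_rng(0)
rnd_target = rng.normal(size=(8, 16))     # frozen, randomly initialized net
rnd_predictor = rng.normal(size=(8, 16))  # trained predictor net output
per_sample_err = np.mean((rnd_target - rnd_predictor) ** 2, axis=-1)
# per_sample_err is the intrinsic reward: novel states are predicted poorly.
keep = (rng.random(per_sample_err.shape) < 0.25).astype(np.float64)
aux_loss = np.sum(keep * per_sample_err) / max(np.sum(keep), 1.0)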
Example #14
        def __init__(self, observation_space, action_space, config):
            # If this class runs as a @ray.remote actor, eager mode may not
            # have been activated yet.
            if not tf1.executing_eagerly():
                tf1.enable_eager_execution()
            self.framework = config.get("framework", "tfe")
            EagerTFPolicy.__init__(self, observation_space, action_space,
                                   config)

            # Global timestep should be a tensor.
            self.global_timestep = tf.Variable(0,
                                               trainable=False,
                                               dtype=tf.int64)
            self.explore = tf.Variable(self.config["explore"],
                                       trainable=False,
                                       dtype=tf.bool)

            # Log device and worker index.
            from ray.rllib.evaluation.rollout_worker import get_global_worker

            worker = get_global_worker()
            worker_idx = worker.worker_index if worker else 0
            if get_gpu_devices():
                logger.info(
                    "TF-eager Policy (worker={}) running on GPU.".format(
                        worker_idx if worker_idx > 0 else "local"))
            else:
                logger.info(
                    "TF-eager Policy (worker={}) running on CPU.".format(
                        worker_idx if worker_idx > 0 else "local"))

            self._is_training = False

            # Only for `config.eager_tracing=True`: A counter to keep track of
            # how many times an eager-traced method (e.g.
            # `self._compute_actions_helper`) has been re-traced by tensorflow.
            # We will raise an error if more than n re-tracings have been
            # detected, since this would considerably slow down execution.
            # The variable below should only get incremented during the
            # tf.function trace operations, never when calling the already
            # traced function after that.
            self._re_trace_counter = 0

            self._loss_initialized = False
            # To ensure backward compatibility:
            # Old way: If `loss` provided here, use as-is (as a function).
            if loss_fn is not None:
                self._loss = loss_fn
            # New way: Convert the overridden `self.loss` into a plain
            # function, so it can be called the same way as `loss` would
            # be, ensuring backward compatibility.
            elif self.loss.__func__.__qualname__ != "Policy.loss":
                self._loss = self.loss.__func__
            # `loss` not provided nor overridden from Policy -> Set to None.
            else:
                self._loss = None

            self.batch_divisibility_req = (get_batch_divisibility_req(self) if
                                           callable(get_batch_divisibility_req)
                                           else
                                           (get_batch_divisibility_req or 1))
            self._max_seq_len = config["model"]["max_seq_len"]

            if get_default_config:
                config = dict(get_default_config(), **config)

            if validate_spaces:
                validate_spaces(self, observation_space, action_space, config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config
            self.dist_class = None
            if action_sampler_fn or action_distribution_fn:
                if not make_model:
                    raise ValueError(
                        "`make_model` is required if `action_sampler_fn` OR "
                        "`action_distribution_fn` is given")
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework=self.framework,
                )
            # Lock used for locking some methods on the object-level.
            # This prevents possible race conditions when calling the model
            # first, then its value function (e.g. in a loss function), in
            # between of which another model call is made (e.g. to compute an
            # action).
            self._lock = threading.RLock()

            # Auto-update model's inference view requirements, if recurrent.
            self._update_model_view_requirements_from_init_state()

            self.exploration = self._create_exploration()
            self._state_inputs = self.model.get_initial_state()
            self._is_recurrent = len(self._state_inputs) > 0

            # Combine view_requirements for Model and Policy.
            self.view_requirements.update(self.model.view_requirements)

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            if optimizer_fn:
                optimizers = optimizer_fn(self, config)
            else:
                optimizers = tf.keras.optimizers.Adam(config["lr"])
            optimizers = force_list(optimizers)
            if getattr(self, "exploration", None):
                optimizers = self.exploration.get_exploration_optimizer(
                    optimizers)

            # The list of local (tf) optimizers (one per loss term).
            self._optimizers: List[LocalOptimizer] = optimizers
            # Backward compatibility: A user's policy may only support a single
            # loss term and optimizer (no lists).
            self._optimizer: LocalOptimizer = optimizers[
                0] if optimizers else None

            self._initialize_loss_from_dummy_batch(
                auto_remove_unneeded_view_reqs=True,
                stats_fn=stats_fn,
            )
            self._loss_initialized = True

            if after_init:
                after_init(self, observation_space, action_space, config)

            # Got to reset global_timestep again after fake run-throughs.
            self.global_timestep.assign(0)
Example #15
    def _setup_graph(self, ob_space, ac_space):
        _, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
        self._model = ModelCatalog.get_torch_model(ob_space, self.logit_dim)
        self.optimizer = torch.optim.Adam(self._model.parameters(), lr=0.0001)
Example #16
        def __init__(self, observation_space, action_space, config):
            assert tf.executing_eagerly()
            self.framework = config.get("framework", "tfe")
            Policy.__init__(self, observation_space, action_space, config)
            self._is_training = False
            self._loss_initialized = False
            self._sess = None

            self._loss = loss_fn
            self.batch_divisibility_req = get_batch_divisibility_req(self) if \
                callable(get_batch_divisibility_req) else \
                (get_batch_divisibility_req or 1)
            self._max_seq_len = config["model"]["max_seq_len"]

            if get_default_config:
                config = dict(get_default_config(), **config)

            if validate_spaces:
                validate_spaces(self, observation_space, action_space, config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config
            self.dist_class = None
            if action_sampler_fn or action_distribution_fn:
                if not make_model:
                    raise ValueError(
                        "`make_model` is required if `action_sampler_fn` OR "
                        "`action_distribution_fn` is given")
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework=self.framework,
                )
            # Auto-update model's inference view requirements, if recurrent.
            self._update_model_inference_view_requirements_from_init_state()

            self.exploration = self._create_exploration()
            self._state_in = [
                tf.convert_to_tensor([s])
                for s in self.model.get_initial_state()
            ]

            # Combine view_requirements for Model and Policy.
            self.view_requirements.update(
                self.model.inference_view_requirements)

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            self._initialize_loss_from_dummy_batch(
                auto_remove_unneeded_view_reqs=True,
                stats_fn=stats_fn,
            )
            self._loss_initialized = True

            if optimizer_fn:
                optimizers = optimizer_fn(self, config)
            else:
                optimizers = tf.keras.optimizers.Adam(config["lr"])
            optimizers = force_list(optimizers)
            if getattr(self, "exploration", None):
                optimizers = self.exploration.get_exploration_optimizer(
                    optimizers)
            # TODO: (sven) Allow tf policy to have more than 1 optimizer.
            #  Just like torch Policy does.
            self._optimizer = optimizers[0] if optimizers else None

            if after_init:
                after_init(self, observation_space, action_space, config)

            # Got to reset global_timestep again after this fake run-through.
            self.global_timestep = 0
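
The constructor above is parameterized by hooks such as `loss_fn`, `stats_fn`, and `optimizer_fn`. As an illustration, here is a sketch of the kind of `loss_fn` a template like this could accept, assuming the common `(policy, model, dist_class, train_batch)` calling convention and standard SampleBatch column names; treat both as assumptions rather than this file's exact contract.

import tensorflow as tf

def example_pg_loss(policy, model, dist_class, train_batch):
    # Hypothetical loss_fn: a plain policy-gradient objective.
    logits, _ = model(train_batch)                   # distribution inputs from the model
    action_dist = dist_class(logits, model)          # wrap them in the action distribution
    logp = action_dist.logp(train_batch["actions"])  # log-prob of the actions actually taken
    advantages = train_batch["advantages"]           # filled in during postprocessing
    return -tf.reduce_mean(logp * advantages)        # maximize advantage-weighted log-prob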
Example #17
    def __init__(self,
                 observation_space,
                 action_space,
                 config,
                 existing_inputs=None):
        """
        Arguments:
            observation_space: Environment observation space specification.
            action_space: Environment action space specification.
            config (dict): Configuration values for PPO graph.
            existing_inputs (list): Optional list of tuples that specify the
                placeholders upon which the graph should be built.
        """
        config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
        self.sess = tf.get_default_session()
        self.action_space = action_space
        self.config = config
        self.kl_coeff_val = self.config["kl_coeff"]
        self.kl_target = self.config["kl_target"]
        dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

        if existing_inputs:
            obs_ph, value_targets_ph, adv_ph, act_ph, \
                logits_ph, vf_preds_ph = existing_inputs[:6]
            existing_state_in = existing_inputs[6:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            obs_ph = tf.placeholder(tf.float32,
                                    name="obs",
                                    shape=(None, ) + observation_space.shape)
            adv_ph = tf.placeholder(tf.float32,
                                    name="advantages",
                                    shape=(None, ))
            act_ph = ModelCatalog.get_action_placeholder(action_space)
            logits_ph = tf.placeholder(tf.float32,
                                       name="logits",
                                       shape=(None, logit_dim))
            vf_preds_ph = tf.placeholder(tf.float32,
                                         name="vf_preds",
                                         shape=(None, ))
            value_targets_ph = tf.placeholder(tf.float32,
                                              name="value_targets",
                                              shape=(None, ))
            existing_state_in = None
            existing_seq_lens = None

        self.loss_in = [
            ("obs", obs_ph),
            ("value_targets", value_targets_ph),
            ("advantages", adv_ph),
            ("actions", act_ph),
            ("logits", logits_ph),
            ("vf_preds", vf_preds_ph),
        ]
        self.model = ModelCatalog.get_model(obs_ph,
                                            logit_dim,
                                            self.config["model"],
                                            state_in=existing_state_in,
                                            seq_lens=existing_seq_lens)

        # KL Coefficient
        self.kl_coeff = tf.get_variable(initializer=tf.constant_initializer(
            self.kl_coeff_val),
                                        name="kl_coeff",
                                        shape=(),
                                        trainable=False,
                                        dtype=tf.float32)

        self.logits = self.model.outputs
        curr_action_dist = dist_cls(self.logits)
        self.sampler = curr_action_dist.sample()
        if self.config["use_gae"]:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            vf_config["use_lstm"] = False
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    obs_ph, 1, vf_config).outputs
            self.value_function = tf.reshape(self.value_function, [-1])
        else:
            self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

        self.loss_obj = PPOLoss(action_space,
                                value_targets_ph,
                                adv_ph,
                                act_ph,
                                logits_ph,
                                vf_preds_ph,
                                curr_action_dist,
                                self.value_function,
                                self.kl_coeff,
                                entropy_coeff=self.config["entropy_coeff"],
                                clip_param=self.config["clip_param"],
                                vf_loss_coeff=self.config["vf_loss_coeff"],
                                use_gae=self.config["use_gae"])

        TFPolicyGraph.__init__(self,
                               observation_space,
                               action_space,
                               self.sess,
                               obs_input=obs_ph,
                               action_sampler=self.sampler,
                               loss=self.loss_obj.loss,
                               loss_inputs=self.loss_in,
                               state_inputs=self.model.state_in,
                               state_outputs=self.model.state_out,
                               seq_lens=self.model.seq_lens,
                               max_seq_len=config["model"]["max_seq_len"])

        self.sess.run(tf.global_variables_initializer())
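
The `kl_coeff` variable created above is the knob the trainer adjusts between SGD rounds so the measured KL tracks `kl_target`. A sketch of the usual adaptive rule, written as a hypothetical standalone helper rather than this class's actual update method:

def update_kl(kl_coeff_val, sampled_kl, kl_target):
    # Common adaptive-KL schedule: raise kl_coeff when the measured KL overshoots
    # the target, relax it when the KL falls well below the target.
    if sampled_kl > 2.0 * kl_target:
        kl_coeff_val *= 1.5
    elif sampled_kl < 0.5 * kl_target:
        kl_coeff_val *= 0.5
    return kl_coeff_val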
Example #18
    def __init__(self,
                 observation_space,
                 action_space,
                 config,
                 existing_inputs=None):
        """
        Arguments:
            observation_space: Environment observation space specification.
            action_space: Environment action space specification.
            config (dict): Configuration values for PPO graph.
            existing_inputs (list): Optional list of tuples that specify the
                placeholders upon which the graph should be built.
        """
        config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
        self.sess = tf.get_default_session()
        self.action_space = action_space
        self.config = config
        self.kl_coeff_val = self.config["kl_coeff"]
        self.kl_target = self.config["kl_target"]
        dist_cls, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

        if existing_inputs:
            obs_ph, value_targets_ph, adv_ph, act_ph, \
                logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \
                existing_inputs[:8]
            existing_state_in = existing_inputs[8:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            obs_ph = tf.placeholder(tf.float32,
                                    name="obs",
                                    shape=(None, ) + observation_space.shape)
            adv_ph = tf.placeholder(tf.float32,
                                    name="advantages",
                                    shape=(None, ))
            act_ph = ModelCatalog.get_action_placeholder(action_space)
            logits_ph = tf.placeholder(tf.float32,
                                       name="logits",
                                       shape=(None, logit_dim))
            vf_preds_ph = tf.placeholder(tf.float32,
                                         name="vf_preds",
                                         shape=(None, ))
            value_targets_ph = tf.placeholder(tf.float32,
                                              name="value_targets",
                                              shape=(None, ))
            prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
            prev_rewards_ph = tf.placeholder(tf.float32, [None],
                                             name="prev_reward")
            existing_state_in = None
            existing_seq_lens = None
        self.observations = obs_ph

        self.loss_in = [
            ("obs", obs_ph),
            ("value_targets", value_targets_ph),
            ("advantages", adv_ph),
            ("actions", act_ph),
            ("logits", logits_ph),
            ("vf_preds", vf_preds_ph),
            ("prev_actions", prev_actions_ph),
            ("prev_rewards", prev_rewards_ph),
        ]
        self.model = ModelCatalog.get_model(
            {
                "obs": obs_ph,
                "prev_actions": prev_actions_ph,
                "prev_rewards": prev_rewards_ph,
                "is_training": self._get_is_training_placeholder(),
            },
            observation_space,
            logit_dim,
            self.config["model"],
            state_in=existing_state_in,
            seq_lens=existing_seq_lens)

        # KL Coefficient
        self.kl_coeff = tf.get_variable(initializer=tf.constant_initializer(
            self.kl_coeff_val),
                                        name="kl_coeff",
                                        shape=(),
                                        trainable=False,
                                        dtype=tf.float32)

        self.logits = self.model.outputs
        curr_action_dist = dist_cls(self.logits)
        self.sampler = curr_action_dist.sample()
        if self.config["use_gae"]:
            if self.config["vf_share_layers"]:
                self.value_function = self.model.value_function()
            else:
                vf_config = self.config["model"].copy()
                # Do not split the last layer of the value function into
                # mean parameters and standard deviation parameters and
                # do not make the standard deviations free variables.
                vf_config["free_log_std"] = False
                vf_config["use_lstm"] = False
                with tf.variable_scope("value_function"):
                    self.value_function = ModelCatalog.get_model(
                        {
                            "obs": obs_ph,
                            "prev_actions": prev_actions_ph,
                            "prev_rewards": prev_rewards_ph,
                            "is_training": self._get_is_training_placeholder(),
                        }, observation_space, 1, vf_config).outputs
                    self.value_function = tf.reshape(self.value_function, [-1])
        else:
            self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

        if self.model.state_in:
            max_seq_len = tf.reduce_max(self.model.seq_lens)
            mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
            mask = tf.reshape(mask, [-1])
        else:
            mask = tf.ones_like(adv_ph)

        self.loss_obj = PPOLoss(action_space,
                                value_targets_ph,
                                adv_ph,
                                act_ph,
                                logits_ph,
                                vf_preds_ph,
                                curr_action_dist,
                                self.value_function,
                                self.kl_coeff,
                                mask,
                                entropy_coeff=self.config["entropy_coeff"],
                                clip_param=self.config["clip_param"],
                                vf_clip_param=self.config["vf_clip_param"],
                                vf_loss_coeff=self.config["vf_loss_coeff"],
                                use_gae=self.config["use_gae"])

        LearningRateSchedule.__init__(self, self.config["lr"],
                                      self.config["lr_schedule"])
        TFPolicyGraph.__init__(self,
                               observation_space,
                               action_space,
                               self.sess,
                               obs_input=obs_ph,
                               action_sampler=self.sampler,
                               loss=self.model.loss() + self.loss_obj.loss,
                               loss_inputs=self.loss_in,
                               state_inputs=self.model.state_in,
                               state_outputs=self.model.state_out,
                               prev_action_input=prev_actions_ph,
                               prev_reward_input=prev_rewards_ph,
                               seq_lens=self.model.seq_lens,
                               max_seq_len=config["model"]["max_seq_len"])

        self.sess.run(tf.global_variables_initializer())
        self.explained_variance = explained_variance(value_targets_ph,
                                                     self.value_function)
        self.stats_fetches = {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "total_loss": self.loss_obj.loss,
            "policy_loss": self.loss_obj.mean_policy_loss,
            "vf_loss": self.loss_obj.mean_vf_loss,
            "vf_explained_var": self.explained_variance,
            "kl": self.loss_obj.mean_kl,
            "entropy": self.loss_obj.mean_entropy
        }
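
At optimization time, the `(key, placeholder)` pairs collected in `self.loss_in` are matched against the columns of a postprocessed sample batch. A minimal sketch of that plumbing, assuming the batch is available as a plain dict of numpy arrays:

def build_feed_dict(loss_in, sample_batch):
    # Illustrative only: map each placeholder to the matching column of a
    # postprocessed sample batch (a dict of numpy arrays keyed by the same names).
    return {placeholder: sample_batch[key] for key, placeholder in loss_in}

# e.g. sess.run(train_op, feed_dict=build_feed_dict(policy.loss_in, batch))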
Example #19
    def __init__(self,
                 obs_space,
                 action_space,
                 config,
                 loss_fn,
                 stats_fn=None,
                 update_ops_fn=None,
                 grad_stats_fn=None,
                 before_loss_init=None,
                 make_model=None,
                 action_sampler_fn=None,
                 existing_inputs=None,
                 existing_model=None,
                 get_batch_divisibility_req=None,
                 obs_include_prev_action_reward=True):
        """Initialize a dynamic TF policy.

        Arguments:
            obs_space (gym.Space): Observation space of the policy.
            action_space (gym.Space): Action space of the policy.
            config (dict): Policy-specific configuration data.
            loss_fn (func): function that returns a loss tensor given the
                policy graph and a dict of experience tensor placeholders
            stats_fn (func): optional function that returns a dict of
                TF fetches given the policy and batch input tensors
            grad_stats_fn (func): optional function that returns a dict of
                TF fetches given the policy and loss gradient tensors
            update_ops_fn (func): optional function that returns a list
                overriding the update ops to run when applying gradients
            before_loss_init (func): optional function to run prior to loss
                init that takes the same arguments as __init__
            make_model (func): optional function that returns a ModelV2 object
                given (policy, obs_space, action_space, config).
                All policy variables should be created in this function. If not
                specified, a default model will be created.
            action_sampler_fn (func): optional function that returns a
                tuple of action and action prob tensors given
                (policy, model, input_dict, obs_space, action_space, config).
                If not specified, a default action distribution will be used.
            existing_inputs (OrderedDict): when copying a policy, this
                specifies an existing dict of placeholders to use instead of
                defining new ones
            existing_model (ModelV2): when copying a policy, this specifies
                an existing model to clone and share weights with
            get_batch_divisibility_req (func): optional function that returns
                the divisibility requirement for sample batches
            obs_include_prev_action_reward (bool): whether to include the
                previous action and reward in the model input

        Attributes:
            config: config of the policy
            model: model instance, if any
            model_out: output tensors of the model
            action_dist: action distribution of the model, if any
            state_in: state input tensors, if any
            state_out: state output tensors, if any
            seq_lens: tensor of sequence lengths
        """
        self.config = config
        self._loss_fn = loss_fn
        self._stats_fn = stats_fn
        self._grad_stats_fn = grad_stats_fn
        self._update_ops_fn = update_ops_fn
        self._obs_include_prev_action_reward = obs_include_prev_action_reward

        # Setup standard placeholders
        prev_actions = None
        prev_rewards = None
        if existing_inputs is not None:
            obs = existing_inputs[SampleBatch.CUR_OBS]
            if self._obs_include_prev_action_reward:
                prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
                prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
        else:
            obs = tf.placeholder(
                tf.float32,
                shape=[None] + list(obs_space.shape),
                name="observation")
            if self._obs_include_prev_action_reward:
                prev_actions = ModelCatalog.get_action_placeholder(
                    action_space)
                prev_rewards = tf.placeholder(
                    tf.float32, [None], name="prev_reward")

        self.input_dict = {
            SampleBatch.CUR_OBS: obs,
            SampleBatch.PREV_ACTIONS: prev_actions,
            SampleBatch.PREV_REWARDS: prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        }
        self.seq_lens = tf.placeholder(
            dtype=tf.int32, shape=[None], name="seq_lens")

        # Setup model
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        if existing_model:
            self.model = existing_model
        elif make_model:
            self.model = make_model(self, obs_space, action_space, config)
        else:
            self.model = ModelCatalog.get_model_v2(
                obs_space,
                action_space,
                logit_dim,
                self.config["model"],
                framework="tf")
        if existing_inputs:
            self.state_in = [
                v for k, v in existing_inputs.items()
                if k.startswith("state_in_")
            ]
            if self.state_in:
                self.seq_lens = existing_inputs["seq_lens"]
        else:
            self.state_in = [
                tf.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
                for s in self.model.get_initial_state()
            ]
        self.model_out, self.state_out = self.model(
            self.input_dict, self.state_in, self.seq_lens)

        # Setup action sampler
        if action_sampler_fn:
            self.action_dist = None
            self.dist_class = None
            action_sampler, action_prob = action_sampler_fn(
                self, self.model, self.input_dict, obs_space, action_space,
                config)
        else:
            self.action_dist = self.dist_class(self.model_out)
            action_sampler = self.action_dist.sample()
            action_prob = self.action_dist.sampled_action_prob()

        # Phase 1 init
        sess = tf.get_default_session() or tf.Session()
        if get_batch_divisibility_req:
            batch_divisibility_req = get_batch_divisibility_req(self)
        else:
            batch_divisibility_req = 1
        TFPolicy.__init__(
            self,
            obs_space,
            action_space,
            sess,
            obs_input=obs,
            action_sampler=action_sampler,
            action_prob=action_prob,
            loss=None,  # dynamically initialized on run
            loss_inputs=[],
            model=self.model,
            state_inputs=self.state_in,
            state_outputs=self.state_out,
            prev_action_input=prev_actions,
            prev_reward_input=prev_rewards,
            seq_lens=self.seq_lens,
            max_seq_len=config["model"]["max_seq_len"],
            batch_divisibility_req=batch_divisibility_req)

        # Phase 2 init
        self._needs_eager_conversion = set()
        self._eager_tensors = {}
        before_loss_init(self, obs_space, action_space, config)
        if not existing_inputs:
            self._initialize_loss()
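
As a concrete illustration of the `make_model` hook described in the docstring above, here is a sketch that simply defers to ModelCatalog, mirroring the default path; a real override would construct its custom ModelV2 here instead. The function name is hypothetical.

from ray.rllib.models import ModelCatalog

def example_make_model(policy, obs_space, action_space, config):
    # Hypothetical make_model hook: mirrors the default path and returns a ModelV2.
    _, logit_dim = ModelCatalog.get_action_dist(action_space, config["model"])
    return ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        framework="tf")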
Example #20
    def __init__(self,
                 observation_space,
                 action_space,
                 config,
                 existing_inputs=None):
        """
        Arguments:
            observation_space: Environment observation space specification.
            action_space: Environment action space specification.
            config (dict): Configuration values for PPO graph.
            existing_inputs (list): Optional list of tuples that specify the
                placeholders upon which the graph should be built.
        """
        self.sess = tf.get_default_session()
        self.action_space = action_space
        self.config = config
        self.kl_coeff_val = self.config["kl_coeff"]
        self.kl_target = self.config["kl_target"]
        dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

        if existing_inputs:
            self.loss_in = existing_inputs
            obs_ph, value_targets_ph, adv_ph, act_ph, \
                logprobs_ph, vf_preds_ph = [ph for _, ph in existing_inputs]
        else:
            obs_ph = tf.placeholder(tf.float32,
                                    name="obs",
                                    shape=(None, ) + observation_space.shape)
            # Targets of the value function.
            value_targets_ph = tf.placeholder(tf.float32,
                                              name="value_targets",
                                              shape=(None, ))
            # Advantage values in the policy gradient estimator.
            adv_ph = tf.placeholder(tf.float32,
                                    name="advantages",
                                    shape=(None, ))
            act_ph = ModelCatalog.get_action_placeholder(action_space)
            # Log probabilities from the policy before the policy update.
            logprobs_ph = tf.placeholder(tf.float32,
                                         name="logprobs",
                                         shape=(None, logit_dim))
            # Value function predictions before the policy update.
            vf_preds_ph = tf.placeholder(tf.float32,
                                         name="vf_preds",
                                         shape=(None, ))
            self.loss_in = [("obs", obs_ph),
                            ("value_targets", value_targets_ph),
                            ("advantages", adv_ph), ("actions", act_ph),
                            ("logprobs", logprobs_ph),
                            ("vf_preds", vf_preds_ph)]
        # TODO(ekl) feed RNN states in here

        # KL Coefficient
        self.kl_coeff = tf.get_variable(initializer=tf.constant_initializer(
            self.kl_coeff_val),
                                        name="kl_coeff",
                                        shape=(),
                                        trainable=False,
                                        dtype=tf.float32)

        self.logits = ModelCatalog.get_model(obs_ph, logit_dim,
                                             self.config["model"]).outputs
        curr_action_dist = dist_cls(self.logits)
        self.sampler = curr_action_dist.sample()
        if self.config["use_gae"]:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    obs_ph, 1, vf_config).outputs
            self.value_function = tf.reshape(self.value_function, [-1])
        else:
            self.value_function = tf.constant("NA")

        self.loss_obj = PPOLoss(action_space,
                                value_targets_ph,
                                adv_ph,
                                act_ph,
                                logprobs_ph,
                                vf_preds_ph,
                                curr_action_dist,
                                self.value_function,
                                self.kl_coeff,
                                entropy_coeff=self.config["entropy_coeff"],
                                clip_param=self.config["clip_param"],
                                vf_loss_coeff=self.config["vf_loss_coeff"],
                                use_gae=self.config["use_gae"])
        self.is_training = tf.placeholder_with_default(True, ())

        TFPolicyGraph.__init__(self,
                               observation_space,
                               action_space,
                               self.sess,
                               obs_input=obs_ph,
                               action_sampler=self.sampler,
                               loss=self.loss_obj.loss,
                               loss_inputs=self.loss_in,
                               is_training=self.is_training)
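
The `advantages`, `value_targets`, and `vf_preds` placeholders above are normally filled during trajectory postprocessing. For reference, a minimal numpy sketch of standard GAE (not this file's postprocessing code), which produces exactly those quantities for a single, non-terminated trajectory with a bootstrap value:

import numpy as np

def compute_gae(rewards, vf_preds, last_value, gamma=0.99, lam=0.95):
    # Standard GAE: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t); advantages are a
    # (gamma * lam)-discounted sum of deltas; value targets = advantages + V(s_t).
    values = np.append(vf_preds, last_value)
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    value_targets = advantages + vf_preds
    return advantages, value_targets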
Example #21
    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        config: TrainerConfigDict,
        *,
        model: Optional[TorchModelV2] = None,
        loss: Optional[Callable[
            [Policy, ModelV2, Type[TorchDistributionWrapper], SampleBatch],
            Union[TensorType, List[TensorType]]]] = None,
        action_distribution_class: Optional[
            Type[TorchDistributionWrapper]] = None,
        action_sampler_fn: Optional[Callable[[TensorType, List[TensorType]],
                                             Tuple[TensorType,
                                                   TensorType]]] = None,
        action_distribution_fn: Optional[
            Callable[[Policy, ModelV2, TensorType, TensorType, TensorType],
                     Tuple[TensorType, Type[TorchDistributionWrapper],
                           List[TensorType]]]] = None,
        max_seq_len: int = 20,
        get_batch_divisibility_req: Optional[Callable[[Policy], int]] = None,
    ):
        """Initializes a TorchPolicy instance.

        Args:
            observation_space: Observation space of the policy.
            action_space: Action space of the policy.
            config: The Policy's config dict.
            model: PyTorch policy module. Given observations as
                input, this module must return a list of outputs where the
                first item is action logits, and the rest can be any value.
            loss: Callable that returns one or more (a list of) scalar loss
                terms.
            action_distribution_class: Class for a torch action distribution.
            action_sampler_fn: A callable returning a sampled action and its
                log-likelihood given Policy, ModelV2, input_dict, state batches
                (optional), explore, and timestep.
                Provide `action_sampler_fn` if you would like to have full
                control over the action computation step, including the
                model forward pass, possible sampling from a distribution,
                and exploration logic.
                Note: If `action_sampler_fn` is given, `action_distribution_fn`
                must be None. If both `action_sampler_fn` and
                `action_distribution_fn` are None, RLlib will simply pass
                inputs through `self.model` to get distribution inputs, create
                the distribution object, sample from it, and apply some
                exploration logic to the results.
                The callable takes as inputs: Policy, ModelV2, input_dict
                (SampleBatch), state_batches (optional), explore, and timestep.
            action_distribution_fn: A callable returning distribution inputs
                (parameters), a dist-class to generate an action distribution
                object from, and internal-state outputs (or an empty list if
                not applicable).
                Provide `action_distribution_fn` if you would like to only
                customize the model forward pass call. The resulting
                distribution parameters are then used by RLlib to create a
                distribution object, sample from it, and execute any
                exploration logic.
                Note: If `action_distribution_fn` is given, `action_sampler_fn`
                must be None. If both `action_sampler_fn` and
                `action_distribution_fn` are None, RLlib will simply pass
                inputs through `self.model` to get distribution inputs, create
                the distribution object, sample from it, and apply some
                exploration logic to the results.
                The callable takes as inputs: Policy, ModelV2, ModelInputDict,
                explore, timestep, is_training.
            max_seq_len: Max sequence length for LSTM training.
            get_batch_divisibility_req: Optional callable that returns the
                divisibility requirement for sample batches given the Policy.
        """
        self.framework = config["framework"] = "torch"
        super().__init__(observation_space, action_space, config)

        # Create multi-GPU model towers, if necessary.
        # - The central main model will be stored under self.model, residing
        #   on self.device (normally, a CPU).
        # - Each GPU will have a copy of that model under
        #   self.model_gpu_towers, matching the devices in self.devices.
        # - Parallelization is done by splitting the train batch and passing
        #   it through the model copies in parallel, then averaging over the
        #   resulting gradients, applying these averages on the main model and
        #   updating all towers' weights from the main model.
        # - In case of just one device (1 (fake or real) GPU or 1 CPU), no
        #   parallelization will be done.

        # If no Model is provided, build a default one here.
        if model is None:
            dist_class, logit_dim = ModelCatalog.get_action_dist(
                action_space, self.config["model"], framework=self.framework)
            model = ModelCatalog.get_model_v2(
                obs_space=self.observation_space,
                action_space=self.action_space,
                num_outputs=logit_dim,
                model_config=self.config["model"],
                framework=self.framework)
            if action_distribution_class is None:
                action_distribution_class = dist_class

        # Get devices to build the graph on.
        worker_idx = self.config.get("worker_index", 0)
        if not config["_fake_gpus"] and \
                ray.worker._mode() == ray.worker.LOCAL_MODE:
            num_gpus = 0
        elif worker_idx == 0:
            num_gpus = config["num_gpus"]
        else:
            num_gpus = config["num_gpus_per_worker"]
        gpu_ids = list(range(torch.cuda.device_count()))

        # Place on one or more CPU(s) when either:
        # - Fake GPU mode.
        # - num_gpus=0 (either set by user or we are in local_mode=True).
        # - No GPUs available.
        if config["_fake_gpus"] or num_gpus == 0 or not gpu_ids:
            logger.info("TorchPolicy (worker={}) running on {}.".format(
                worker_idx if worker_idx > 0 else "local",
                "{} fake-GPUs".format(num_gpus)
                if config["_fake_gpus"] else "CPU"))
            self.device = torch.device("cpu")
            self.devices = [
                self.device for _ in range(int(math.ceil(num_gpus)) or 1)
            ]
            self.model_gpu_towers = [
                model if i == 0 else copy.deepcopy(model)
                for i in range(int(math.ceil(num_gpus)) or 1)
            ]
            if hasattr(self, "target_model"):
                self.target_models = {
                    m: self.target_model
                    for m in self.model_gpu_towers
                }
            self.model = model
        # Place on one or more actual GPU(s), when:
        # - num_gpus > 0 (set by user) AND
        # - local_mode=False AND
        # - actual GPUs available AND
        # - non-fake GPU mode.
        else:
            logger.info("TorchPolicy (worker={}) running on {} GPU(s).".format(
                worker_idx if worker_idx > 0 else "local", num_gpus))
            # We are a remote worker (WORKER_MODE=1):
            # GPUs should be assigned to us by ray.
            if ray.worker._mode() == ray.worker.WORKER_MODE:
                gpu_ids = ray.get_gpu_ids()

            if len(gpu_ids) < num_gpus:
                raise ValueError(
                    "TorchPolicy was not able to find enough GPU IDs! Found "
                    f"{gpu_ids}, but num_gpus={num_gpus}.")

            self.devices = [
                torch.device("cuda:{}".format(i))
                for i, id_ in enumerate(gpu_ids) if i < num_gpus
            ]
            self.device = self.devices[0]
            ids = [id_ for i, id_ in enumerate(gpu_ids) if i < num_gpus]
            self.model_gpu_towers = []
            for i, _ in enumerate(ids):
                model_copy = copy.deepcopy(model)
                self.model_gpu_towers.append(model_copy.to(self.devices[i]))
            if hasattr(self, "target_model"):
                self.target_models = {
                    m: copy.deepcopy(self.target_model).to(self.devices[i])
                    for i, m in enumerate(self.model_gpu_towers)
                }
            self.model = self.model_gpu_towers[0]

        # Lock used for locking some methods on the object-level.
        # This prevents possible race conditions when calling the model
        # first, then its value function (e.g. in a loss function), in
        # between of which another model call is made (e.g. to compute an
        # action).
        self._lock = threading.RLock()

        self._state_inputs = self.model.get_initial_state()
        self._is_recurrent = len(self._state_inputs) > 0
        # Auto-update model's inference view requirements, if recurrent.
        self._update_model_view_requirements_from_init_state()
        # Combine view_requirements for Model and Policy.
        self.view_requirements.update(self.model.view_requirements)

        self.exploration = self._create_exploration()
        self.unwrapped_model = model  # used to support DistributedDataParallel
        # To ensure backward compatibility:
        # Old way: If `loss` provided here, use as-is (as a function).
        if loss is not None:
            self._loss = loss
        # New way: Convert the overridden `self.loss` into a plain function,
        # so it can be called the same way as `loss` would be, ensuring
        # backward compatibility.
        elif self.loss.__func__.__qualname__ != "Policy.loss":
            self._loss = self.loss.__func__
        # `loss` not provided nor overridden from Policy -> Set to None.
        else:
            self._loss = None
        self._optimizers = force_list(self.optimizer())
        # Store, which params (by index within the model's list of
        # parameters) should be updated per optimizer.
        # Maps optimizer idx to set or param indices.
        self.multi_gpu_param_groups: List[Set[int]] = []
        main_params = {p: i for i, p in enumerate(self.model.parameters())}
        for o in self._optimizers:
            param_indices = []
            for pg_idx, pg in enumerate(o.param_groups):
                for p in pg["params"]:
                    param_indices.append(main_params[p])
            self.multi_gpu_param_groups.append(set(param_indices))

        # Create n sample-batch buffers (num_multi_gpu_tower_stacks), each
        # one with m towers (num_gpus).
        num_buffers = self.config.get("num_multi_gpu_tower_stacks", 1)
        self._loaded_batches = [[] for _ in range(num_buffers)]

        self.dist_class = action_distribution_class
        self.action_sampler_fn = action_sampler_fn
        self.action_distribution_fn = action_distribution_fn

        # If set, means we are using distributed allreduce during learning.
        self.distributed_world_size = None

        self.max_seq_len = max_seq_len
        self.batch_divisibility_req = get_batch_divisibility_req(self) if \
            callable(get_batch_divisibility_req) else \
            (get_batch_divisibility_req or 1)
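
To make the `action_distribution_fn` contract described in the docstring more concrete, here is a sketch that only customizes the model forward pass and returns distribution inputs, a distribution class, and state-outs. The exact extra arguments passed to the hook vary between versions, so they are absorbed into `**kwargs` here; the function name is hypothetical.

from ray.rllib.models.torch.torch_action_dist import TorchCategorical

def example_action_distribution_fn(policy, model, input_dict, **kwargs):
    # Hypothetical hook: plain model forward pass, categorical action distribution,
    # and whatever recurrent state the model returns (an empty list if none).
    dist_inputs, state_out = model(input_dict)
    return dist_inputs, TorchCategorical, state_out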
Example #22
    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
        self.config = config
        self.sess = tf.get_default_session()

        # Setup the policy
        self.observations = tf.placeholder(tf.float32, [None] +
                                           list(observation_space.shape))
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        prev_actions = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
        self.model = ModelCatalog.get_model(
            {
                "obs": self.observations,
                "prev_actions": prev_actions,
                "prev_rewards": prev_rewards
            }, observation_space, logit_dim, self.config["model"])
        action_dist = dist_class(self.model.outputs)
        self.vf = self.model.value_function()
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)

        # Setup the policy loss
        if isinstance(action_space, gym.spaces.Box):
            ac_size = action_space.shape[0]
            actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
        elif isinstance(action_space, gym.spaces.Discrete):
            actions = tf.placeholder(tf.int64, [None], name="ac")
        else:
            raise UnsupportedSpaceException(
                "Action space {} is not supported for A3C.".format(
                    action_space))
        advantages = tf.placeholder(tf.float32, [None], name="advantages")
        self.v_target = tf.placeholder(tf.float32, [None], name="v_target")
        self.loss = A3CLoss(action_dist, actions, advantages, self.v_target,
                            self.vf, self.config["vf_loss_coeff"],
                            self.config["entropy_coeff"])

        # Initialize TFPolicyGraph
        loss_in = [
            ("obs", self.observations),
            ("actions", actions),
            ("prev_actions", prev_actions),
            ("prev_rewards", prev_rewards),
            ("advantages", advantages),
            ("value_targets", self.v_target),
        ]
        LearningRateSchedule.__init__(self, self.config["lr"],
                                      self.config["lr_schedule"])
        TFPolicyGraph.__init__(self,
                               observation_space,
                               action_space,
                               self.sess,
                               obs_input=self.observations,
                               action_sampler=action_dist.sample(),
                               loss=self.model.loss() + self.loss.total_loss,
                               loss_inputs=loss_in,
                               state_inputs=self.model.state_in,
                               state_outputs=self.model.state_out,
                               prev_action_input=prev_actions,
                               prev_reward_input=prev_rewards,
                               seq_lens=self.model.seq_lens,
                               max_seq_len=self.config["model"]["max_seq_len"])

        self.stats_fetches = {
            "stats": {
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "policy_loss": self.loss.pi_loss,
                "policy_entropy": self.loss.entropy,
                "grad_gnorm": tf.global_norm(self._grads),
                "var_gnorm": tf.global_norm(self.var_list),
                "vf_loss": self.loss.vf_loss,
                "vf_explained_var": explained_variance(self.v_target, self.vf),
            },
        }

        self.sess.run(tf.global_variables_initializer())
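
The `A3CLoss` object above combines the usual three terms weighted by `vf_loss_coeff` and `entropy_coeff`. A sketch of that combination, written independently of the actual A3CLoss class:

import tensorflow as tf

def a3c_style_loss(logp, advantages, vf, v_target, entropy,
                   vf_loss_coeff=0.5, entropy_coeff=0.01):
    # Standard A3C objective: policy-gradient term, value-function regression term,
    # and an entropy bonus that encourages exploration.
    pi_loss = -tf.reduce_sum(logp * advantages)
    vf_loss = 0.5 * tf.reduce_sum(tf.square(vf - v_target))
    return pi_loss + vf_loss_coeff * vf_loss - entropy_coeff * tf.reduce_sum(entropy)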
Example #23
        def __init__(self, observation_space, action_space, config):
            assert tf.executing_eagerly()
            self.framework = "tf"
            Policy.__init__(self, observation_space, action_space, config)
            self._is_training = False
            self._loss_initialized = False
            self._sess = None

            if get_default_config:
                config = dict(get_default_config(), **config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config
            self.dist_class = None

            if action_sampler_fn:
                if not make_model:
                    raise ValueError("`make_model` is required if "
                                     "`action_sampler_fn` is given")
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework="tf",
                )

            self._state_in = [
                tf.convert_to_tensor(np.array([s]))
                for s in self.model.get_initial_state()
            ]

            input_dict = {
                SampleBatch.CUR_OBS:
                tf.convert_to_tensor(np.array([observation_space.sample()])),
                SampleBatch.PREV_ACTIONS:
                tf.convert_to_tensor([_flatten_action(action_space.sample())]),
                SampleBatch.PREV_REWARDS:
                tf.convert_to_tensor([0.]),
            }
            self.model(input_dict, self._state_in, tf.convert_to_tensor([1]))

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            self._initialize_loss_with_dummy_batch()
            self._loss_initialized = True

            if optimizer_fn:
                self._optimizer = optimizer_fn(self, config)
            else:
                self._optimizer = tf.train.AdamOptimizer(config["lr"])

            if after_init:
                after_init(self, observation_space, action_space, config)
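
The `optimizer_fn` hook called above only needs to return a TF1 optimizer given the policy and its config. A one-line sketch with a hypothetical name, swapping RMSProp in for the default Adam:

import tensorflow as tf

def example_optimizer_fn(policy, config):
    # Hypothetical optimizer_fn: use RMSProp instead of the default Adam.
    return tf.train.RMSPropOptimizer(config["lr"], decay=0.99, epsilon=1e-5)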
Example #24
    def __init__(self,
                 observation_space,
                 action_space,
                 config,
                 existing_inputs=None):
        config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
        assert config["batch_mode"] == "truncate_episodes", \
            "Must use `truncate_episodes` batch mode with V-trace."
        self.config = config
        self.sess = tf.get_default_session()
        self.grads = None

        if isinstance(action_space, gym.spaces.Discrete):
            is_multidiscrete = False
            actions_shape = [None]
            output_hidden_shape = [action_space.n]
        elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
            is_multidiscrete = True
            actions_shape = [None, len(action_space.nvec)]
            output_hidden_shape = action_space.nvec.astype(np.int32)
        else:
            raise UnsupportedSpaceException(
                "Action space {} is not supported for IMPALA.".format(
                    action_space))

        # Create input placeholders
        if existing_inputs:
            actions, dones, behaviour_logits, rewards, observations, \
                prev_actions, prev_rewards = existing_inputs[:7]
            existing_state_in = existing_inputs[7:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            actions = tf.placeholder(tf.int64, actions_shape, name="ac")
            dones = tf.placeholder(tf.bool, [None], name="dones")
            rewards = tf.placeholder(tf.float32, [None], name="rewards")
            behaviour_logits = tf.placeholder(
                tf.float32, [None, sum(output_hidden_shape)],
                name="behaviour_logits")
            observations = tf.placeholder(tf.float32, [None] +
                                          list(observation_space.shape))
            existing_state_in = None
            existing_seq_lens = None

        # Unpack behaviour logits
        unpacked_behaviour_logits = tf.split(behaviour_logits,
                                             output_hidden_shape,
                                             axis=1)

        # Setup the policy
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        prev_actions = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
        self.model = ModelCatalog.get_model(
            {
                "obs": observations,
                "prev_actions": prev_actions,
                "prev_rewards": prev_rewards,
                "is_training": self._get_is_training_placeholder(),
            },
            observation_space,
            action_space,
            logit_dim,
            self.config["model"],
            state_in=existing_state_in,
            seq_lens=existing_seq_lens)
        unpacked_outputs = tf.split(self.model.outputs,
                                    output_hidden_shape,
                                    axis=1)

        dist_inputs = unpacked_outputs if is_multidiscrete else \
            self.model.outputs
        action_dist = dist_class(dist_inputs)

        values = self.model.value_function()
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)

        def make_time_major(tensor, drop_last=False):
            """Swaps batch and trajectory axis.
            Args:
                tensor: A tensor or list of tensors to reshape.
                drop_last: A bool indicating whether to drop the last
                trajectory item.
            Returns:
                res: A tensor with swapped axes or a list of tensors with
                swapped axes.
            """
            if isinstance(tensor, list):
                return [make_time_major(t, drop_last) for t in tensor]

            if self.model.state_init:
                B = tf.shape(self.model.seq_lens)[0]
                T = tf.shape(tensor)[0] // B
            else:
                # Important: chop the tensor into batches at known episode cut
                # boundaries. TODO(ekl) this is kind of a hack
                T = self.config["sample_batch_size"]
                B = tf.shape(tensor)[0] // T
            rs = tf.reshape(tensor,
                            tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))

            # swap B and T axes
            res = tf.transpose(
                rs,
                [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

            if drop_last:
                return res[:-1]
            return res

        if self.model.state_in:
            max_seq_len = tf.reduce_max(self.model.seq_lens) - 1
            mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
            mask = tf.reshape(mask, [-1])
        else:
            mask = tf.ones_like(rewards, dtype=tf.bool)

        # Prepare actions for loss
        loss_actions = actions if is_multidiscrete else tf.expand_dims(actions,
                                                                       axis=1)

        # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
        self.loss = VTraceLoss(
            actions=make_time_major(loss_actions, drop_last=True),
            actions_logp=make_time_major(action_dist.logp(actions),
                                         drop_last=True),
            actions_entropy=make_time_major(action_dist.entropy(),
                                            drop_last=True),
            dones=make_time_major(dones, drop_last=True),
            behaviour_logits=make_time_major(unpacked_behaviour_logits,
                                             drop_last=True),
            target_logits=make_time_major(unpacked_outputs, drop_last=True),
            discount=config["gamma"],
            rewards=make_time_major(rewards, drop_last=True),
            values=make_time_major(values, drop_last=True),
            bootstrap_value=make_time_major(values)[-1],
            valid_mask=make_time_major(mask, drop_last=True),
            vf_loss_coeff=self.config["vf_loss_coeff"],
            entropy_coeff=self.config["entropy_coeff"],
            clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
            clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"])

        # KL divergence between worker and learner logits for debugging
        model_dist = MultiCategorical(unpacked_outputs)
        behaviour_dist = MultiCategorical(unpacked_behaviour_logits)

        kls = model_dist.kl(behaviour_dist)
        if len(kls) > 1:
            self.KL_stats = {}

            for i, kl in enumerate(kls):
                self.KL_stats.update({
                    "mean_KL_{}".format(i):
                    tf.reduce_mean(kl),
                    "max_KL_{}".format(i):
                    tf.reduce_max(kl),
                    "median_KL_{}".format(i):
                    tf.contrib.distributions.percentile(kl, 50.0),
                })
        else:
            self.KL_stats = {
                "mean_KL": tf.reduce_mean(kls[0]),
                "max_KL": tf.reduce_max(kls[0]),
                "median_KL": tf.contrib.distributions.percentile(kls[0], 50.0),
            }

        # Initialize TFPolicyGraph
        loss_in = [
            (SampleBatch.ACTIONS, actions),
            (SampleBatch.DONES, dones),
            (BEHAVIOUR_LOGITS, behaviour_logits),
            (SampleBatch.REWARDS, rewards),
            (SampleBatch.CUR_OBS, observations),
            (SampleBatch.PREV_ACTIONS, prev_actions),
            (SampleBatch.PREV_REWARDS, prev_rewards),
        ]
        LearningRateSchedule.__init__(self, self.config["lr"],
                                      self.config["lr_schedule"])
        TFPolicyGraph.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=observations,
            action_sampler=action_dist.sample(),
            action_prob=action_dist.sampled_action_prob(),
            loss=self.loss.total_loss,
            model=self.model,
            loss_inputs=loss_in,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            prev_action_input=prev_actions,
            prev_reward_input=prev_rewards,
            seq_lens=self.model.seq_lens,
            max_seq_len=self.config["model"]["max_seq_len"],
            batch_divisibility_req=self.config["sample_batch_size"])

        self.sess.run(tf.global_variables_initializer())

        self.stats_fetches = {
            LEARNER_STATS_KEY:
            dict(
                {
                    "cur_lr":
                    tf.cast(self.cur_lr, tf.float64),
                    "policy_loss":
                    self.loss.pi_loss,
                    "entropy":
                    self.loss.entropy,
                    "grad_gnorm":
                    tf.global_norm(self._grads),
                    "var_gnorm":
                    tf.global_norm(self.var_list),
                    "vf_loss":
                    self.loss.vf_loss,
                    "vf_explained_var":
                    explained_variance(
                        tf.reshape(self.loss.vtrace_returns.vs, [-1]),
                        tf.reshape(make_time_major(values, drop_last=True),
                                   [-1])),
                }, **self.KL_stats),
        }
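
The `make_time_major` helper above boils down to reshaping a flat `[B * T, ...]` batch into `[T, B, ...]`. A small numpy check of that axis swap, independent of the TF graph code:

import numpy as np

B, T = 2, 3
flat = np.arange(B * T)                           # [B * T]: trajectory 0 first, then trajectory 1
time_major = flat.reshape(B, T).swapaxes(0, 1)    # -> [T, B]
print(time_major)
# [[0 3]
#  [1 4]
#  [2 5]]
# With drop_last=True the final row, i.e. the last timestep of every trajectory, is removed.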
Example #25
def make_model_and_action_dist(policy, obs_space, action_space, config):
    """create model neural network"""
    policy.device = (torch.device("cuda")
                       if torch.cuda.is_available() else torch.device("cpu"))
    policy.log_stats = config["log_stats"]  # flag to log statistics
    if policy.log_stats:
        policy.stats_dict = {}
        policy.stats_fn = config["stats_fn"]

    # Keys of the observation space that must be used at train and test time ('signal' and 'mask' will be excluded
    # from the actual obs space)
    policy.train_obs_keys = config["train_obs_keys"]
    policy.test_obs_keys = config["test_obs_keys"]

    # Check whether policy observation space is inside a Tuple space
    policy.requires_tupling = False
    if isinstance(action_space, Tuple) and len(action_space.spaces) == 1:
        policy.action_space = action_space.spaces[0]
        action_space = action_space.spaces[0]
        policy.requires_tupling = True
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    # Get real observation space
    if isinstance(obs_space, Box):
        assert hasattr(obs_space, "original_space"), "Invalid observation space"
        obs_space = obs_space.original_space
        if isinstance(obs_space, Tuple):
            obs_space = obs_space.spaces[0]
    assert isinstance(obs_space, Dict), "Invalid observation space"
    policy.has_action_mask = "action_mask" in obs_space.spaces
    assert all([k in obs_space.spaces for k in policy.train_obs_keys]), "Invalid train keys specification"
    assert all([k in obs_space.spaces for k in policy.test_obs_keys]), "Invalid test keys specification"

    # Get observation space used for training
    if config["train_obs_space"] is None:
        train_obs_space = obs_space
    else:
        train_obs_space = config["train_obs_space"]
        if isinstance(train_obs_space, Box):
            assert hasattr(train_obs_space, "original_space"), "Invalid observation space"
            train_obs_space = train_obs_space.original_space
            if isinstance(train_obs_space, Tuple):
                train_obs_space = train_obs_space.spaces[0]

    # Obs spaces used for training and testing
    sp = Dict({
        k: obs_space.spaces[k]
        for k in policy.test_obs_keys
    })

    policy.real_test_obs_space = flatten_space(sp)
    policy.real_test_obs_space.original_space = sp
    model_space = Dict({
        k: obs_space.spaces[k]
        for k in policy.test_obs_keys if k != "signal" and k != "action_mask"
    })


    sp = Dict({
        k: train_obs_space.spaces[k]
        for k in policy.train_obs_keys
    })
    policy.real_train_obs_space = flatten_space(sp)
    policy.real_train_obs_space.original_space = sp
    policy.n_actions = action_space.n
    def update_target():
        pass

    policy.update_target = update_target
    model = FullyConnectedNetwork(
        flatten_space(model_space), action_space, action_space.n,
        name="FcNet", model_config=config["model"]).to(policy.device)
    dist_class, _ = ModelCatalog.get_action_dist(
        action_space, config, framework="torch")
    return model, dist_class
    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):

        model_config = with_base_config(
            base_config=DEFAULT_STRATEGO_MODEL_CONFIG,
            extra_config=model_config)
        TFModelV2.__init__(self, obs_space, action_space, num_outputs,
                           model_config, name)

        print(model_config)

        observation_mode = model_config['custom_options']['observation_mode']
        if observation_mode == PARTIALLY_OBSERVABLE:
            self._obs_key = 'partial_observation'
        elif observation_mode == FULLY_OBSERVABLE:
            self._obs_key = 'full_observation'
        elif observation_mode == BOTH_OBSERVATIONS:
            raise NotImplementedError
        else:
            assert False, "policy observation_mode must be in [PARTIALLY_OBSERVABLE, FULLY_OBSERVABLE, BOTH_OBSERVATIONS]"

        self._action_dist_class, self._logit_dim = ModelCatalog.get_action_dist(
            self.action_space, model_config)

        self.use_lstm = model_config['use_lstm']
        self.fake_lstm = model_config['custom_options'].get('fake_lstm', False)

        self.mask_invalid_actions = model_config['custom_options'][
            'mask_invalid_actions']

        conv_activation = get_activation_fn(
            model_config.get("conv_activation"))
        base_lstm_filters = model_config["custom_options"]['base_lstm_filters']
        base_cnn_filters = model_config["custom_options"]['base_cnn_filters']
        pi_cnn_filters = model_config["custom_options"]['pi_cnn_filters']
        q_cnn_filters = model_config["custom_options"]['q_cnn_filters']

        rows = obs_space.original_space[self._obs_key].shape[0]
        columns = obs_space.original_space[self._obs_key].shape[1]

        if self.use_lstm:
            self._lstm_state_shape = (rows, columns, base_lstm_filters[0][0])

        if self.use_lstm and not self.fake_lstm:
            self._base_model_out_shape = (rows, columns,
                                          base_lstm_filters[0][0])
        else:
            self._base_model_out_shape = (rows, columns,
                                          base_cnn_filters[-1][0])

        if self.use_lstm:
            state_in = [
                tf.keras.layers.Input(shape=self._lstm_state_shape,
                                      name="base_lstm_h"),
                tf.keras.layers.Input(shape=self._lstm_state_shape,
                                      name="base_lstm_c")
            ]
            seq_lens_in = tf.keras.layers.Input(shape=(), name="lstm_seq_in")

            self._obs_inputs = tf.keras.layers.Input(
                shape=(None, *obs_space.original_space[self._obs_key].shape),
                name="observation")
            self._base_model_out = tf.keras.layers.Input(
                shape=self._base_model_out_shape, name="model_out")
        else:
            state_in, seq_lens_in = None, None
            self._obs_inputs = tf.keras.layers.Input(
                shape=obs_space.original_space[self._obs_key].shape,
                name="observation")
            self._base_model_out = tf.keras.layers.Input(
                shape=self._base_model_out_shape, name="model_out")

        def maybe_td(layer):
            if self.use_lstm:
                return tf.keras.layers.TimeDistributed(layer=layer,
                                                       name=f"td_{layer.name}")
            else:
                return layer

        def build_shared_base_layers(prefix: str, obs_in: tf.Tensor,
                                     state_in: tf.Tensor):
            # obs_in = tf.debugging.check_numerics(
            #     obs_in, f"nan found in obs_in", name=None)

            _last_layer = obs_in

            for i, (out_size, kernel, stride) in enumerate(base_cnn_filters):
                _last_layer = maybe_td(
                    tf.keras.layers.Conv2D(filters=out_size,
                                           kernel_size=kernel,
                                           strides=stride,
                                           activation=conv_activation,
                                           padding="same",
                                           name="{}_conv_{}".format(
                                               prefix, i)))(_last_layer)
                # _last_layer = tf.debugging.check_numerics(
                #     _last_layer, f"nan found in _last_layer {i}", name=None)

            base_state_out = state_in
            if self.use_lstm and not self.fake_lstm:
                for i, (out_size, kernel,
                        stride) in enumerate(base_lstm_filters):
                    if i > 0:
                        raise NotImplementedError(
                            "Only single lstm layers are implemented right now"
                        )
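                    # ConvLSTM2D with return_state=True returns
                    # (sequence_output, hidden_state, cell_state), hence the
                    # `_last_layer, *base_state_out` unpacking below.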

                    _last_layer, *base_state_out = tf.keras.layers.ConvLSTM2D(
                        filters=out_size,
                        kernel_size=kernel,
                        strides=stride,
                        activation=conv_activation,
                        padding="same",
                        data_format='channels_last',
                        return_sequences=True,
                        return_state=True,
                        name="{}_convlstm".format(prefix))(
                            inputs=_last_layer,
                            initial_state=state_in,
                            mask=tf.sequence_mask(seq_lens_in))

            return _last_layer, base_state_out

        def build_pi_layers(input_layer):
            _last_layer = input_layer
            for i, (out_size, kernel, stride) in enumerate(pi_cnn_filters):
                _last_layer = tf.keras.layers.Conv2D(
                    filters=out_size,
                    kernel_size=kernel,
                    strides=stride,
                    activation=conv_activation,
                    padding="same",
                    name="{}_conv_{}".format('pi', i))(_last_layer)

            print(
                f"action space n: {action_space.n}, rows: {rows}, "
                f"columns: {columns}, "
                f"filters: {int(action_space.n / (rows * columns))}")

            # Each spatial cell emits action_space.n / (rows * columns)
            # logit channels; flattening the map gives one logit per action.
            unmasked_logits = tf.keras.layers.Conv2D(
                filters=int(action_space.n / (rows * columns)),
                kernel_size=[3, 3],
                strides=1,
                activation=None,
                padding="same",
                name="{}_conv_{}".format('pi', "unmasked_logits"))(_last_layer)
            return unmasked_logits

        def build_q_layers(input_layer, prefix):
            _last_layer = input_layer
            for i, (out_size, kernel, stride) in enumerate(q_cnn_filters):
                _last_layer = tf.keras.layers.Conv2D(
                    filters=out_size,
                    kernel_size=kernel,
                    strides=stride,
                    activation=conv_activation,
                    padding="same",
                    name="{}_conv_{}".format(prefix, i))(_last_layer)

            q_val = tf.keras.layers.Conv2D(
                filters=int(action_space.n / (rows * columns)),
                kernel_size=[3, 3],
                strides=1,
                activation=None,
                padding="same",
                name="{}_conv_{}".format(prefix, "q_out"))(_last_layer)
            return q_val

        base_model_out, state_out = build_shared_base_layers(
            prefix="shared_base", obs_in=self._obs_inputs, state_in=state_in)
        pi_unmasked_logits_out = build_pi_layers(
            input_layer=self._base_model_out)
        q1_out = build_q_layers(input_layer=self._base_model_out, prefix="q1")
        q2_out = build_q_layers(input_layer=self._base_model_out, prefix="q2")

        base_inputs = [self._obs_inputs]
        base_outputs = [base_model_out]
        if self.use_lstm:
            base_inputs += [seq_lens_in, *state_in]
            base_outputs += [*state_out]

        self._base_model = tf.keras.Model(name=f"{name}_base",
                                          inputs=base_inputs,
                                          outputs=base_outputs)

        self.pi_model = tf.keras.Model(name=f"{name}_pi_head",
                                       inputs=[self._base_model_out],
                                       outputs=[pi_unmasked_logits_out])
        self.q1_model = tf.keras.Model(name=f"{name}_q1_head",
                                       inputs=[self._base_model_out],
                                       outputs=[q1_out])
        self.q2_model = tf.keras.Model(name=f"{name}_q2_head",
                                       inputs=[self._base_model_out],
                                       outputs=[q2_out])

        print(self._base_model.summary())
        print(self.pi_model.summary())
        print(self.q1_model.summary())
        print(self.q2_model.summary())

        self.register_variables(self._base_model.variables)
        self.register_variables(self.pi_model.variables)
        self.register_variables(self.q1_model.variables)
        self.register_variables(self.q2_model.variables)

        self.log_alpha = tf.Variable(0.0, dtype=tf.float32, name="log_alpha")
        self.alpha = tf.exp(self.log_alpha)
        self.register_variables([self.log_alpha])
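
A small NumPy illustration (shapes made up) of how the per-cell conv logits produced by build_pi_layers above map onto a flat Discrete action space: each board cell emits action_space.n / (rows * columns) channels, and flattening the map yields one logit per action.

import numpy as np

rows, columns, n_actions = 4, 4, 128
per_cell = n_actions // (rows * columns)                   # 8 channels per cell
conv_logits = np.random.randn(1, rows, columns, per_cell)  # [batch, H, W, C]
flat_logits = conv_logits.reshape(1, -1)                   # [batch, n_actions]
assert flat_logits.shape == (1, n_actions)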
Example #27
        def __init__(self, obs_space, action_space, config):
            if get_default_config:
                config = dict(get_default_config(), **config)
            self.config = config

            if validate_spaces:
                validate_spaces(self, obs_space, action_space, self.config)

            if before_init:
                before_init(self, obs_space, action_space, self.config)

            # Model is customized (use default action dist class).
            if make_model:
                assert make_model_and_action_dist is None, \
                    "Either `make_model` or `make_model_and_action_dist`" \
                    " must be None!"
                self.model = make_model(self, obs_space, action_space, config)
                dist_class, _ = ModelCatalog.get_action_dist(
                    action_space, self.config["model"], framework="torch")
            # Model and action dist class are customized.
            elif make_model_and_action_dist:
                self.model, dist_class = make_model_and_action_dist(
                    self, obs_space, action_space, config)
            # Use default model and default action dist.
            else:
                dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"], framework="torch")
                self.model = ModelCatalog.get_model_v2(
                    obs_space=obs_space,
                    action_space=action_space,
                    num_outputs=logit_dim,
                    model_config=self.config["model"],
                    framework="torch")

            # Make sure, we passed in a correct Model factory.
            assert isinstance(self.model, TorchModelV2), \
                "ERROR: Generated Model must be a TorchModelV2 object!"

            TorchPolicy.__init__(
                self,
                observation_space=obs_space,
                action_space=action_space,
                config=config,
                model=self.model,
                loss=loss_fn,
                action_distribution_class=dist_class,
                action_sampler_fn=action_sampler_fn,
                action_distribution_fn=action_distribution_fn,
                max_seq_len=config["model"]["max_seq_len"],
                get_batch_divisibility_req=get_batch_divisibility_req,
            )

            self.view_requirements.update(
                self.model.inference_view_requirements)

            _before_loss_init = before_loss_init or after_init
            if _before_loss_init:
                _before_loss_init(self, self.observation_space,
                                  self.action_space, config)

            self._initialize_loss_from_dummy_batch(
                auto_remove_unneeded_view_reqs=True,
                stats_fn=stats_fn,
            )

            if _after_loss_init:
                _after_loss_init(self, obs_space, action_space, config)

            # Got to reset global_timestep again after this fake run-through.
            self.global_timestep = 0
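
The before_init, before_loss_init/after_init, and stats_fn names above are user-supplied callables passed to the policy builder; a hedged sketch of what such hooks usually look like (names and contents illustrative):

import torch


def setup_mixins(policy, obs_space, action_space, config):
    # Typically passed as `before_loss_init` (or `after_init`): attach any
    # attributes the loss function needs before the dummy-batch run-through.
    policy.entropy_coeff = config.get("entropy_coeff", 0.0)


def loss_stats(policy, train_batch):
    # Typically passed as `stats_fn`: scalars reported in training results.
    return {"entropy_coeff": torch.tensor(policy.entropy_coeff)}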
Example #28
    def __init__(self,
                 observation_space,
                 action_space,
                 config,
                 existing_inputs=None):
        config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
        assert config["batch_mode"] == "truncate_episodes", \
            "Must use `truncate_episodes` batch mode with V-trace."
        self.config = config
        self.sess = tf.get_default_session()
        self.grads = None

        if isinstance(action_space, gym.spaces.Discrete):
            is_multidiscrete = False
            output_hidden_shape = [action_space.n]
        elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
            is_multidiscrete = True
            output_hidden_shape = action_space.nvec.astype(np.int32)
        elif self.config["vtrace"]:
            raise UnsupportedSpaceException(
                "Action space {} is not supported for APPO + VTrace.".format(
                    action_space))
        else:
            is_multidiscrete = False
            output_hidden_shape = 1

        # Policy network model
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

        # Create input placeholders
        if existing_inputs:
            if self.config["vtrace"]:
                actions, dones, behaviour_logits, rewards, observations, \
                    prev_actions, prev_rewards = existing_inputs[:7]
                existing_state_in = existing_inputs[7:-1]
                existing_seq_lens = existing_inputs[-1]
            else:
                actions, dones, behaviour_logits, rewards, observations, \
                    prev_actions, prev_rewards, adv_ph, value_targets = \
                    existing_inputs[:9]
                existing_state_in = existing_inputs[9:-1]
                existing_seq_lens = existing_inputs[-1]
        else:
            actions = ModelCatalog.get_action_placeholder(action_space)
            dones = tf.placeholder(tf.bool, [None], name="dones")
            rewards = tf.placeholder(tf.float32, [None], name="rewards")
            behaviour_logits = tf.placeholder(
                tf.float32, [None, logit_dim], name="behaviour_logits")
            observations = tf.placeholder(
                tf.float32, [None] + list(observation_space.shape))
            existing_state_in = None
            existing_seq_lens = None

            if not self.config["vtrace"]:
                adv_ph = tf.placeholder(
                    tf.float32, name="advantages", shape=(None, ))
                value_targets = tf.placeholder(
                    tf.float32, name="value_targets", shape=(None, ))
        self.observations = observations

        # Unpack behaviour logits
        unpacked_behaviour_logits = tf.split(
            behaviour_logits, output_hidden_shape, axis=1)

        # Setup the policy
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        prev_actions = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
        self.model = ModelCatalog.get_model(
            {
                "obs": observations,
                "prev_actions": prev_actions,
                "prev_rewards": prev_rewards,
                "is_training": self._get_is_training_placeholder(),
            },
            observation_space,
            action_space,
            logit_dim,
            self.config["model"],
            state_in=existing_state_in,
            seq_lens=existing_seq_lens)
        unpacked_outputs = tf.split(
            self.model.outputs, output_hidden_shape, axis=1)

        dist_inputs = unpacked_outputs if is_multidiscrete else \
            self.model.outputs
        prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \
            behaviour_logits

        action_dist = dist_class(dist_inputs)
        prev_action_dist = dist_class(prev_dist_inputs)

        values = self.model.value_function()
        self.value_function = values
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)

        def make_time_major(tensor, drop_last=False):
            """Swaps batch and trajectory axis.
            Args:
                tensor: A tensor or list of tensors to reshape.
                drop_last: A bool indicating whether to drop the last
                trajectory item.
            Returns:
                res: A tensor with swapped axes or a list of tensors with
                swapped axes.
            """
            if isinstance(tensor, list):
                return [make_time_major(t, drop_last) for t in tensor]

            if self.model.state_init:
                B = tf.shape(self.model.seq_lens)[0]
                T = tf.shape(tensor)[0] // B
            else:
                # Important: chop the tensor into batches at known episode cut
                # boundaries. TODO(ekl) this is kind of a hack
                T = self.config["sample_batch_size"]
                B = tf.shape(tensor)[0] // T
            rs = tf.reshape(tensor,
                            tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))

            # swap B and T axes
            res = tf.transpose(
                rs,
                [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

            if drop_last:
                return res[:-1]
            return res

        if self.model.state_in:
            max_seq_len = tf.reduce_max(self.model.seq_lens) - 1
            mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
            mask = tf.reshape(mask, [-1])
        else:
            mask = tf.ones_like(rewards)

        # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
        if self.config["vtrace"]:
            logger.info("Using V-Trace surrogate loss (vtrace=True)")

            # Prepare actions for loss
            loss_actions = actions if is_multidiscrete else tf.expand_dims(
                actions, axis=1)

            self.loss = VTraceSurrogateLoss(
                actions=make_time_major(loss_actions, drop_last=True),
                prev_actions_logp=make_time_major(
                    prev_action_dist.logp(actions), drop_last=True),
                actions_logp=make_time_major(
                    action_dist.logp(actions), drop_last=True),
                action_kl=prev_action_dist.kl(action_dist),
                actions_entropy=make_time_major(
                    action_dist.entropy(), drop_last=True),
                dones=make_time_major(dones, drop_last=True),
                behaviour_logits=make_time_major(
                    unpacked_behaviour_logits, drop_last=True),
                target_logits=make_time_major(
                    unpacked_outputs, drop_last=True),
                discount=config["gamma"],
                rewards=make_time_major(rewards, drop_last=True),
                values=make_time_major(values, drop_last=True),
                bootstrap_value=make_time_major(values)[-1],
                valid_mask=make_time_major(mask, drop_last=True),
                vf_loss_coeff=self.config["vf_loss_coeff"],
                entropy_coeff=self.config["entropy_coeff"],
                clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
                clip_pg_rho_threshold=self.config[
                    "vtrace_clip_pg_rho_threshold"],
                clip_param=self.config["clip_param"])
        else:
            logger.info("Using PPO surrogate loss (vtrace=False)")
            self.loss = PPOSurrogateLoss(
                prev_actions_logp=make_time_major(
                    prev_action_dist.logp(actions)),
                actions_logp=make_time_major(action_dist.logp(actions)),
                action_kl=prev_action_dist.kl(action_dist),
                actions_entropy=make_time_major(action_dist.entropy()),
                values=make_time_major(values),
                valid_mask=make_time_major(mask),
                advantages=make_time_major(adv_ph),
                value_targets=make_time_major(value_targets),
                vf_loss_coeff=self.config["vf_loss_coeff"],
                entropy_coeff=self.config["entropy_coeff"],
                clip_param=self.config["clip_param"])

        # KL divergence between worker and learner logits for debugging
        model_dist = MultiCategorical(unpacked_outputs)
        behaviour_dist = MultiCategorical(unpacked_behaviour_logits)

        kls = model_dist.kl(behaviour_dist)
        if len(kls) > 1:
            self.KL_stats = {}

            for i, kl in enumerate(kls):
                self.KL_stats.update({
                    "mean_KL_{}".format(i): tf.reduce_mean(kl),
                    "max_KL_{}".format(i): tf.reduce_max(kl),
                    "median_KL_{}".format(i): tf.contrib.distributions.
                    percentile(kl, 50.0),
                })
        else:
            self.KL_stats = {
                "mean_KL": tf.reduce_mean(kls[0]),
                "max_KL": tf.reduce_max(kls[0]),
                "median_KL": tf.contrib.distributions.percentile(kls[0], 50.0),
            }

        # Initialize TFPolicyGraph
        loss_in = [
            ("actions", actions),
            ("dones", dones),
            ("behaviour_logits", behaviour_logits),
            ("rewards", rewards),
            ("obs", observations),
            ("prev_actions", prev_actions),
            ("prev_rewards", prev_rewards),
        ]
        if not self.config["vtrace"]:
            loss_in.append(("advantages", adv_ph))
            loss_in.append(("value_targets", value_targets))
        LearningRateSchedule.__init__(self, self.config["lr"],
                                      self.config["lr_schedule"])
        TFPolicyGraph.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=observations,
            action_sampler=action_dist.sample(),
            action_prob=action_dist.sampled_action_prob(),
            loss=self.loss.total_loss,
            model=self.model,
            loss_inputs=loss_in,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            prev_action_input=prev_actions,
            prev_reward_input=prev_rewards,
            seq_lens=self.model.seq_lens,
            max_seq_len=self.config["model"]["max_seq_len"],
            batch_divisibility_req=self.config["sample_batch_size"])

        self.sess.run(tf.global_variables_initializer())

        values_batched = make_time_major(
            values, drop_last=self.config["vtrace"])
        self.stats_fetches = {
            "stats": dict({
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "policy_loss": self.loss.pi_loss,
                "entropy": self.loss.entropy,
                "grad_gnorm": tf.global_norm(self._grads),
                "var_gnorm": tf.global_norm(self.var_list),
                "vf_loss": self.loss.vf_loss,
                "vf_explained_var": explained_variance(
                    tf.reshape(self.loss.value_targets, [-1]),
                    tf.reshape(values_batched, [-1])),
            }, **self.KL_stats),
        }
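
make_time_major above turns a flat batch-major tensor of shape [B * T, ...] into a time-major [T, B, ...] tensor via reshape and transpose; a small NumPy sketch of the same operation with made-up sizes:

import numpy as np

B, T = 2, 4                      # 2 rollouts, 4 timesteps each
flat = np.arange(B * T)          # batch-major: [0..7]
time_major = flat.reshape(B, T).transpose(1, 0)   # shape [T, B]
print(time_major)
# [[0 4]
#  [1 5]
#  [2 6]
#  [3 7]]
# drop_last=True would additionally slice off the final step: time_major[:-1]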
Example #29
    def __init__(
            self,
            obs_space: gym.spaces.Space,
            action_space: gym.spaces.Space,
            config: TrainerConfigDict,
            loss_fn: Callable[[Policy, ModelV2, type, SampleBatch],
                              TensorType],
            *,
            stats_fn: Optional[Callable[[Policy, SampleBatch],
                                        Dict[str, TensorType]]] = None,
            grad_stats_fn: Optional[
                Callable[[Policy, SampleBatch, ModelGradients],
                         Dict[str, TensorType]]] = None,
            before_loss_init: Optional[Callable[[
                Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict
            ], None]] = None,
            make_model: Optional[Callable[[
                Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict
            ], ModelV2]] = None,
            action_sampler_fn: Optional[Callable[
                [TensorType, List[TensorType]], Tuple[TensorType,
                                                      TensorType]]] = None,
            action_distribution_fn: Optional[
                Callable[[Policy, ModelV2, TensorType, TensorType, TensorType],
                         Tuple[TensorType, type, List[TensorType]]]] = None,
            existing_inputs: Optional[Dict[str, "tf1.placeholder"]] = None,
            existing_model: Optional[ModelV2] = None,
            get_batch_divisibility_req: Optional[Callable[[Policy],
                                                          int]] = None,
            obs_include_prev_action_reward: bool = True):
        """Initialize a dynamic TF policy.

        Arguments:
            obs_space (gym.spaces.Space): Observation space of the
                policy.
            action_space (gym.spaces.Space): Action space of the policy.
            config (TrainerConfigDict): Policy-specific configuration data.
            loss_fn (Callable[[Policy, ModelV2, type, SampleBatch],
                TensorType]): Function that returns a loss tensor for the
                policy graph.
            stats_fn (Optional[Callable[[Policy, SampleBatch],
                Dict[str, TensorType]]]): Optional function that returns a dict
                of TF fetches given the policy and batch input tensors.
            grad_stats_fn (Optional[Callable[[Policy, SampleBatch,
                ModelGradients], Dict[str, TensorType]]]):
                Optional function that returns a dict of TF fetches given the
                policy, sample batch, and loss gradient tensors.
            before_loss_init (Optional[Callable[
                [Policy, gym.spaces.Space, gym.spaces.Space,
                TrainerConfigDict], None]]): Optional function to run prior to
                loss init that takes the same arguments as __init__.
            make_model (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, TrainerConfigDict], ModelV2]]): Optional
                function that returns a ModelV2 object given
                policy, obs_space, action_space, and policy config.
                All policy variables should be created in this function. If not
                specified, a default model will be created.
            action_sampler_fn (Optional[Callable[[Policy, ModelV2, Dict[
                str, TensorType], TensorType, TensorType], Tuple[TensorType,
                TensorType]]]): A callable returning a sampled action and its
                log-likelihood given Policy, ModelV2, input_dict, explore,
                timestep, and is_training.
            action_distribution_fn (Optional[Callable[[Policy, ModelV2,
                Dict[str, TensorType], TensorType, TensorType],
                Tuple[TensorType, type, List[TensorType]]]]): A callable
                returning distribution inputs (parameters), a dist-class to
                generate an action distribution object from, and
                internal-state outputs (or an empty list if not applicable).
                Note: No exploration hooks should be called from within
                `action_distribution_fn`; it should only perform a simple
                forward pass through some model.
                If None, pass inputs through `self.model()` to get distribution
                inputs.
                The callable takes as inputs: Policy, ModelV2, input_dict,
                explore, timestep, is_training.
            existing_inputs (Optional[Dict[str, tf1.placeholder]]): When
                copying a policy, this specifies an existing dict of
                placeholders to use instead of defining new ones.
            existing_model (Optional[ModelV2]): When copying a policy, this
                specifies an existing model to clone and share weights with.
            get_batch_divisibility_req (Optional[Callable[[Policy], int]]):
                Optional callable that returns the divisibility requirement
                for sample batches given the Policy.
            obs_include_prev_action_reward (bool): Whether to include the
                previous action and reward in the model input (default: True).
        """
        self.observation_space = obs_space
        self.action_space = action_space
        self.config = config
        self.framework = "tf"
        self._loss_fn = loss_fn
        self._stats_fn = stats_fn
        self._grad_stats_fn = grad_stats_fn
        self._obs_include_prev_action_reward = obs_include_prev_action_reward

        # Setup standard placeholders
        prev_actions = None
        prev_rewards = None
        if existing_inputs is not None:
            obs = existing_inputs[SampleBatch.CUR_OBS]
            if self._obs_include_prev_action_reward:
                prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
                prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
            action_input = existing_inputs[SampleBatch.ACTIONS]
            explore = existing_inputs["is_exploring"]
            timestep = existing_inputs["timestep"]
        else:
            obs = tf1.placeholder(tf.float32,
                                  shape=[None] + list(obs_space.shape),
                                  name="observation")
            action_input = ModelCatalog.get_action_placeholder(action_space)
            if self._obs_include_prev_action_reward:
                prev_actions = ModelCatalog.get_action_placeholder(
                    action_space, "prev_action")
                prev_rewards = tf1.placeholder(tf.float32, [None],
                                               name="prev_reward")
            explore = tf1.placeholder_with_default(True, (),
                                                   name="is_exploring")
            timestep = tf1.placeholder(tf.int32, (), name="timestep")

        self._input_dict = {
            SampleBatch.CUR_OBS: obs,
            SampleBatch.PREV_ACTIONS: prev_actions,
            SampleBatch.PREV_REWARDS: prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        }
        # Placeholder for RNN time-chunk valid lengths.
        self._seq_lens = tf1.placeholder(dtype=tf.int32,
                                         shape=[None],
                                         name="seq_lens")

        dist_class = dist_inputs = None
        if action_sampler_fn or action_distribution_fn:
            if not make_model:
                raise ValueError(
                    "`make_model` is required if `action_sampler_fn` OR "
                    "`action_distribution_fn` is given")
        else:
            dist_class, logit_dim = ModelCatalog.get_action_dist(
                action_space, self.config["model"])

        # Setup self.model.
        if existing_model:
            self.model = existing_model
        elif make_model:
            self.model = make_model(self, obs_space, action_space, config)
        else:
            self.model = ModelCatalog.get_model_v2(
                obs_space=obs_space,
                action_space=action_space,
                num_outputs=logit_dim,
                model_config=self.config["model"],
                framework="tf")

        # Create the Exploration object to use for this Policy.
        self.exploration = self._create_exploration()

        if existing_inputs:
            self._state_in = [
                v for k, v in existing_inputs.items()
                if k.startswith("state_in_")
            ]
            if self._state_in:
                self._seq_lens = existing_inputs["seq_lens"]
        else:
            self._state_in = [
                tf1.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
                for s in self.model.get_initial_state()
            ]

        # Fully customized action generation (e.g., custom policy).
        if action_sampler_fn:
            sampled_action, sampled_action_logp = action_sampler_fn(
                self,
                self.model,
                obs_batch=self._input_dict[SampleBatch.CUR_OBS],
                state_batches=self._state_in,
                seq_lens=self._seq_lens,
                prev_action_batch=self._input_dict[SampleBatch.PREV_ACTIONS],
                prev_reward_batch=self._input_dict[SampleBatch.PREV_REWARDS],
                explore=explore,
                is_training=self._input_dict["is_training"])
        else:
            # Distribution generation is customized, e.g., DQN, DDPG.
            if action_distribution_fn:
                dist_inputs, dist_class, self._state_out = \
                    action_distribution_fn(
                        self, self.model,
                        obs_batch=self._input_dict[SampleBatch.CUR_OBS],
                        state_batches=self._state_in,
                        seq_lens=self._seq_lens,
                        prev_action_batch=self._input_dict[
                            SampleBatch.PREV_ACTIONS],
                        prev_reward_batch=self._input_dict[
                            SampleBatch.PREV_REWARDS],
                        explore=explore,
                        is_training=self._input_dict["is_training"])
            # Default distribution generation behavior:
            # Pass through model. E.g., PG, PPO.
            else:
                dist_inputs, self._state_out = self.model(
                    self._input_dict, self._state_in, self._seq_lens)

            action_dist = dist_class(dist_inputs, self.model)

            # Using exploration to get final action (e.g. via sampling).
            sampled_action, sampled_action_logp = \
                self.exploration.get_exploration_action(
                    action_distribution=action_dist,
                    timestep=timestep,
                    explore=explore)

        # Phase 1 init.
        sess = tf1.get_default_session() or tf1.Session()
        if get_batch_divisibility_req:
            batch_divisibility_req = get_batch_divisibility_req(self)
        else:
            batch_divisibility_req = 1

        super().__init__(
            observation_space=obs_space,
            action_space=action_space,
            config=config,
            sess=sess,
            obs_input=obs,
            action_input=action_input,  # for logp calculations
            sampled_action=sampled_action,
            sampled_action_logp=sampled_action_logp,
            dist_inputs=dist_inputs,
            dist_class=dist_class,
            loss=None,  # dynamically initialized on run
            loss_inputs=[],
            model=self.model,
            state_inputs=self._state_in,
            state_outputs=self._state_out,
            prev_action_input=prev_actions,
            prev_reward_input=prev_rewards,
            seq_lens=self._seq_lens,
            max_seq_len=config["model"]["max_seq_len"],
            batch_divisibility_req=batch_divisibility_req,
            explore=explore,
            timestep=timestep)

        # Phase 2 init.
        if before_loss_init is not None:
            before_loss_init(self, obs_space, action_space, config)

        if not existing_inputs:
            self._initialize_loss_dynamically()
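
A hedged sketch of an action_distribution_fn matching the call signature used above: it only runs a forward pass and returns (distribution inputs, dist class, state-outs). The Categorical class is assumed to fit this hypothetical policy's action space.

from ray.rllib.models.tf.tf_action_dist import Categorical


def my_action_distribution_fn(policy, model, obs_batch, state_batches=None,
                              seq_lens=None, prev_action_batch=None,
                              prev_reward_batch=None, explore=True,
                              is_training=False, **kwargs):
    # Plain forward pass only; no exploration logic here, as the docstring
    # above requires.
    dist_inputs, state_out = model(
        {"obs": obs_batch, "is_training": is_training},
        state_batches or [], seq_lens)
    return dist_inputs, Categorical, state_out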
Example #30
    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
        self.config = config
        self.sess = tf.get_default_session()

        # Setup the policy
        self.observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        prev_actions = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
        self.model = ModelCatalog.get_model({
            "obs": self.observations,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        }, observation_space, logit_dim, self.config["model"])
        action_dist = dist_class(self.model.outputs)
        self.vf = self.model.value_function()
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)

        # Setup the policy loss
        if isinstance(action_space, gym.spaces.Box):
            ac_size = action_space.shape[0]
            actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
        elif isinstance(action_space, gym.spaces.Discrete):
            actions = tf.placeholder(tf.int64, [None], name="ac")
        else:
            raise UnsupportedSpaceException(
                "Action space {} is not supported for A3C.".format(
                    action_space))
        advantages = tf.placeholder(tf.float32, [None], name="advantages")
        self.v_target = tf.placeholder(tf.float32, [None], name="v_target")
        self.loss = A3CLoss(action_dist, actions, advantages, self.v_target,
                            self.vf, self.config["vf_loss_coeff"],
                            self.config["entropy_coeff"])

        # Initialize TFPolicyGraph
        loss_in = [
            ("obs", self.observations),
            ("actions", actions),
            ("prev_actions", prev_actions),
            ("prev_rewards", prev_rewards),
            ("advantages", advantages),
            ("value_targets", self.v_target),
        ]
        LearningRateSchedule.__init__(self, self.config["lr"],
                                      self.config["lr_schedule"])
        TFPolicyGraph.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=self.observations,
            action_sampler=action_dist.sample(),
            loss=self.model.loss() + self.loss.total_loss,
            loss_inputs=loss_in,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            prev_action_input=prev_actions,
            prev_reward_input=prev_rewards,
            seq_lens=self.model.seq_lens,
            max_seq_len=self.config["model"]["max_seq_len"])

        self.stats_fetches = {
            "stats": {
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "policy_loss": self.loss.pi_loss,
                "policy_entropy": self.loss.entropy,
                "grad_gnorm": tf.global_norm(self._grads),
                "var_gnorm": tf.global_norm(self.var_list),
                "vf_loss": self.loss.vf_loss,
                "vf_explained_var": explained_variance(self.v_target, self.vf),
            },
        }

        self.sess.run(tf.global_variables_initializer())
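
The "advantages" and "value_targets" columns read by loss_in above are normally filled in during trajectory postprocessing; a hedged sketch using RLlib's compute_advantages helper (the bootstrap value is simplified to 0.0 here, and use_gae=True expects value predictions to already be in the batch):

from ray.rllib.evaluation.postprocessing import compute_advantages


def postprocess_advantages(policy, sample_batch, other_agent_batches=None,
                           episode=None):
    last_r = 0.0  # a real policy bootstraps with its value function instead
    return compute_advantages(sample_batch, last_r,
                              policy.config["gamma"],
                              policy.config["lambda"],
                              use_gae=policy.config.get("use_gae", True))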
Example #31
        def __init__(self, observation_space, action_space, config):
            assert tf.executing_eagerly()
            self.framework = config.get("framework", "tfe")
            Policy.__init__(self, observation_space, action_space, config)

            # Log device and worker index.
            from ray.rllib.evaluation.rollout_worker import get_global_worker
            worker = get_global_worker()
            worker_idx = worker.worker_index if worker else 0
            if get_gpu_devices():
                logger.info(
                    "TF-eager Policy (worker={}) running on GPU.".format(
                        worker_idx if worker_idx > 0 else "local"))
            else:
                logger.info(
                    "TF-eager Policy (worker={}) running on CPU.".format(
                        worker_idx if worker_idx > 0 else "local"))

            self._is_training = False
            self._loss_initialized = False

            self._loss = loss_fn
            self.batch_divisibility_req = get_batch_divisibility_req(self) if \
                callable(get_batch_divisibility_req) else \
                (get_batch_divisibility_req or 1)
            self._max_seq_len = config["model"]["max_seq_len"]

            if get_default_config:
                config = dict(get_default_config(), **config)

            if validate_spaces:
                validate_spaces(self, observation_space, action_space, config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config
            self.dist_class = None
            if action_sampler_fn or action_distribution_fn:
                if not make_model:
                    raise ValueError(
                        "`make_model` is required if `action_sampler_fn` OR "
                        "`action_distribution_fn` is given")
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework=self.framework,
                )
            # Lock used for locking some methods on the object-level.
            # This prevents possible race conditions when calling the model
            # first, then its value function (e.g. in a loss function), in
            # between which another model call is made (e.g. to compute an
            # action).
            self._lock = threading.RLock()

            # Auto-update model's inference view requirements, if recurrent.
            self._update_model_view_requirements_from_init_state()

            self.exploration = self._create_exploration()
            self._state_inputs = self.model.get_initial_state()
            self._is_recurrent = len(self._state_inputs) > 0

            # Combine view_requirements for Model and Policy.
            self.view_requirements.update(self.model.view_requirements)

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            if optimizer_fn:
                optimizers = optimizer_fn(self, config)
            else:
                optimizers = tf.keras.optimizers.Adam(config["lr"])
            optimizers = force_list(optimizers)
            if getattr(self, "exploration", None):
                optimizers = self.exploration.get_exploration_optimizer(
                    optimizers)

            # The list of local (tf) optimizers (one per loss term).
            self._optimizers: List[LocalOptimizer] = optimizers
            # Backward compatibility: A user's policy may only support a single
            # loss term and optimizer (no lists).
            self._optimizer: LocalOptimizer = \
                optimizers[0] if optimizers else None

            self._initialize_loss_from_dummy_batch(
                auto_remove_unneeded_view_reqs=True,
                stats_fn=stats_fn,
            )
            self._loss_initialized = True

            if after_init:
                after_init(self, observation_space, action_space, config)

            # Got to reset global_timestep again after fake run-throughs.
            self.global_timestep = 0
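
optimizer_fn above is another user-supplied hook; a minimal sketch (values illustrative) returning a single Keras optimizer, which the policy then wraps in a list:

import tensorflow as tf


def my_optimizer_fn(policy, config):
    # One optimizer per loss term; returning a single optimizer is also fine.
    return tf.keras.optimizers.Adam(learning_rate=config["lr"])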
Example #32
    def __init__(self,
                 action_space,
                 value_targets,
                 advantages,
                 actions,
                 logits,
                 vf_preds,
                 curr_action_dist,
                 value_fn,
                 cur_kl_coeff,
                 valid_mask,
                 entropy_coeff=0,
                 clip_param=0.1,
                 vf_clip_param=0.1,
                 vf_loss_coeff=1.0,
                 use_gae=True):
        """Constructs the loss for Proximal Policy Objective.

        Arguments:
            action_space: Environment action space specification.
            value_targets (Placeholder): Placeholder for target values; used
                for GAE.
            advantages (Placeholder): Placeholder for calculated advantages
                from previous model evaluation.
            actions (Placeholder): Placeholder for actions taken
                from previous model evaluation.
            logits (Placeholder): Placeholder for logits output from
                previous model evaluation.
            vf_preds (Placeholder): Placeholder for value function output
                from previous model evaluation.
            curr_action_dist (ActionDistribution): ActionDistribution
                of the current model.
            value_fn (Tensor): Current value function output Tensor.
            cur_kl_coeff (Variable): Variable holding the current PPO KL
                coefficient.
            valid_mask (Tensor): A bool mask of valid input elements (#2992).
            entropy_coeff (float): Coefficient of the entropy regularizer.
            clip_param (float): Clip parameter
            vf_clip_param (float): Clip parameter for the value function
            vf_loss_coeff (float): Coefficient of the value function loss
            use_gae (bool): If true, use the Generalized Advantage Estimator.
        """

        def reduce_mean_valid(t):
            return tf.reduce_mean(tf.boolean_mask(t, valid_mask))

        dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
        prev_dist = dist_cls(logits)
        # Make loss functions.
        logp_ratio = tf.exp(
            curr_action_dist.logp(actions) - prev_dist.logp(actions))
        action_kl = prev_dist.kl(curr_action_dist)
        self.mean_kl = reduce_mean_valid(action_kl)

        curr_entropy = curr_action_dist.entropy()
        self.mean_entropy = reduce_mean_valid(curr_entropy)

        surrogate_loss = tf.minimum(
            advantages * logp_ratio,
            advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
                                          1 + clip_param))
        self.mean_policy_loss = reduce_mean_valid(-surrogate_loss)

        if use_gae:
            vf_loss1 = tf.square(value_fn - value_targets)
            vf_clipped = vf_preds + tf.clip_by_value(
                value_fn - vf_preds, -vf_clip_param, vf_clip_param)
            vf_loss2 = tf.square(vf_clipped - value_targets)
            vf_loss = tf.maximum(vf_loss1, vf_loss2)
            self.mean_vf_loss = reduce_mean_valid(vf_loss)
            loss = reduce_mean_valid(
                -surrogate_loss + cur_kl_coeff * action_kl +
                vf_loss_coeff * vf_loss - entropy_coeff * curr_entropy)
        else:
            self.mean_vf_loss = tf.constant(0.0)
            loss = reduce_mean_valid(-surrogate_loss +
                                     cur_kl_coeff * action_kl -
                                     entropy_coeff * curr_entropy)
        self.loss = loss
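
A tiny numeric illustration (values made up) of the clipped surrogate objective constructed above with clip_param=0.1: the min over the clipped and unclipped terms keeps the more pessimistic estimate.

import numpy as np

clip_param = 0.1
advantages = np.array([1.0, -1.0])
logp_ratio = np.array([1.3, 0.7])   # exp(new_logp - old_logp)
clipped = np.clip(logp_ratio, 1.0 - clip_param, 1.0 + clip_param)
surrogate = np.minimum(advantages * logp_ratio, advantages * clipped)
print(surrogate)                    # [ 1.1 -0.9]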
Example #33
    def __init__(
            self,
            obs_space: gym.spaces.Space,
            action_space: gym.spaces.Space,
            config: TrainerConfigDict,
            loss_fn: Callable[
                [Policy, ModelV2, Type[TFActionDistribution], SampleBatch],
                TensorType],
            *,
            stats_fn: Optional[Callable[[Policy, SampleBatch],
                                        Dict[str, TensorType]]] = None,
            grad_stats_fn: Optional[
                Callable[[Policy, SampleBatch, ModelGradients],
                         Dict[str, TensorType]]] = None,
            before_loss_init: Optional[Callable[[
                Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict
            ], None]] = None,
            make_model: Optional[Callable[[
                Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict
            ], ModelV2]] = None,
            action_sampler_fn: Optional[Callable[
                [TensorType, List[TensorType]], Tuple[TensorType,
                                                      TensorType]]] = None,
            action_distribution_fn: Optional[
                Callable[[Policy, ModelV2, TensorType, TensorType, TensorType],
                         Tuple[TensorType, type, List[TensorType]]]] = None,
            existing_inputs: Optional[Dict[str, "tf1.placeholder"]] = None,
            existing_model: Optional[ModelV2] = None,
            get_batch_divisibility_req: Optional[Callable[[Policy],
                                                          int]] = None,
            obs_include_prev_action_reward: bool = True):
        """Initialize a dynamic TF policy.

        Args:
            obs_space (gym.spaces.Space): Observation space of the
                policy.
            action_space (gym.spaces.Space): Action space of the policy.
            config (TrainerConfigDict): Policy-specific configuration data.
            loss_fn (Callable[[Policy, ModelV2, Type[TFActionDistribution],
                SampleBatch], TensorType]): Function that returns a loss tensor
                for the policy graph.
            stats_fn (Optional[Callable[[Policy, SampleBatch],
                Dict[str, TensorType]]]): Optional function that returns a dict
                of TF fetches given the policy and batch input tensors.
            grad_stats_fn (Optional[Callable[[Policy, SampleBatch,
                ModelGradients], Dict[str, TensorType]]]):
                Optional function that returns a dict of TF fetches given the
                policy, sample batch, and loss gradient tensors.
            before_loss_init (Optional[Callable[
                [Policy, gym.spaces.Space, gym.spaces.Space,
                TrainerConfigDict], None]]): Optional function to run prior to
                loss init that takes the same arguments as __init__.
            make_model (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, TrainerConfigDict], ModelV2]]): Optional
                function that returns a ModelV2 object given
                policy, obs_space, action_space, and policy config.
                All policy variables should be created in this function. If not
                specified, a default model will be created.
            action_sampler_fn (Optional[Callable[[Policy, ModelV2, Dict[
                str, TensorType], TensorType, TensorType], Tuple[TensorType,
                TensorType]]]): A callable returning a sampled action and its
                log-likelihood given Policy, ModelV2, input_dict, explore,
                timestep, and is_training.
            action_distribution_fn (Optional[Callable[[Policy, ModelV2,
                Dict[str, TensorType], TensorType, TensorType],
                Tuple[TensorType, type, List[TensorType]]]]): A callable
                returning distribution inputs (parameters), a dist-class to
                generate an action distribution object from, and
                internal-state outputs (or an empty list if not applicable).
                Note: No exploration hooks should be called from within
                `action_distribution_fn`; it should only perform a simple
                forward pass through some model.
                If None, pass inputs through `self.model()` to get distribution
                inputs.
                The callable takes as inputs: Policy, ModelV2, input_dict,
                explore, timestep, is_training.
            existing_inputs (Optional[Dict[str, tf1.placeholder]]): When
                copying a policy, this specifies an existing dict of
                placeholders to use instead of defining new ones.
            existing_model (Optional[ModelV2]): When copying a policy, this
                specifies an existing model to clone and share weights with.
            get_batch_divisibility_req (Optional[Callable[[Policy], int]]):
                Optional callable that returns the divisibility requirement for
                sample batches. If None, will assume a value of 1.
            obs_include_prev_action_reward (bool): Whether to include the
                previous action and reward in the model input (default: True).
        """
        self.observation_space = obs_space
        self.action_space = action_space
        self.config = config
        self.framework = "tf"
        self._loss_fn = loss_fn
        self._stats_fn = stats_fn
        self._grad_stats_fn = grad_stats_fn
        self._obs_include_prev_action_reward = obs_include_prev_action_reward

        dist_class = dist_inputs = None
        if action_sampler_fn or action_distribution_fn:
            if not make_model:
                raise ValueError(
                    "`make_model` is required if `action_sampler_fn` OR "
                    "`action_distribution_fn` is given")
        else:
            dist_class, logit_dim = ModelCatalog.get_action_dist(
                action_space, self.config["model"])

        # Setup self.model.
        if existing_model:
            self.model = existing_model
        elif make_model:
            self.model = make_model(self, obs_space, action_space, config)
        else:
            self.model = ModelCatalog.get_model_v2(
                obs_space=obs_space,
                action_space=action_space,
                num_outputs=logit_dim,
                model_config=self.config["model"],
                framework="tf")
        # Auto-update model's inference view requirements, if recurrent.
        self._update_model_inference_view_requirements_from_init_state()

        if existing_inputs:
            self._state_inputs = [
                v for k, v in existing_inputs.items()
                if k.startswith("state_in_")
            ]
            if self._state_inputs:
                self._seq_lens = existing_inputs["seq_lens"]
        else:
            if self.config["_use_trajectory_view_api"]:
                self._state_inputs = [
                    get_placeholder(
                        space=vr.space,
                        time_axis=not isinstance(vr.shift, int),
                    ) for k, vr in
                    self.model.inference_view_requirements.items()
                    if k.startswith("state_in_")
                ]
            else:
                self._state_inputs = [
                    tf1.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
                    for s in self.model.get_initial_state()
                ]

        # Use default settings.
        # Add NEXT_OBS, STATE_IN_0.., and others.
        self.view_requirements = self._get_default_view_requirements()
        # Combine view_requirements for Model and Policy.
        self.view_requirements.update(self.model.inference_view_requirements)

        # Setup standard placeholders.
        if existing_inputs is not None:
            timestep = existing_inputs["timestep"]
            explore = existing_inputs["is_exploring"]
            self._input_dict, self._dummy_batch = \
                self._get_input_dict_and_dummy_batch(
                    self.view_requirements, existing_inputs)
        else:
            action_ph = ModelCatalog.get_action_placeholder(action_space)
            prev_action_ph = ModelCatalog.get_action_placeholder(
                action_space, "prev_action")
            if self.config["_use_trajectory_view_api"]:
                self._input_dict, self._dummy_batch = \
                    self._get_input_dict_and_dummy_batch(
                        self.view_requirements,
                        {SampleBatch.ACTIONS: action_ph,
                         SampleBatch.PREV_ACTIONS: prev_action_ph})
            else:
                self._input_dict = {
                    SampleBatch.CUR_OBS:
                    tf1.placeholder(tf.float32,
                                    shape=[None] + list(obs_space.shape),
                                    name="observation")
                }
                self._input_dict[SampleBatch.ACTIONS] = action_ph
                if self._obs_include_prev_action_reward:
                    self._input_dict.update({
                        SampleBatch.PREV_ACTIONS:
                        prev_action_ph,
                        SampleBatch.PREV_REWARDS:
                        tf1.placeholder(tf.float32, [None],
                                        name="prev_reward"),
                    })
            # Placeholder for (sampling steps) timestep (int).
            timestep = tf1.placeholder_with_default(tf.zeros((),
                                                             dtype=tf.int64),
                                                    (),
                                                    name="timestep")
            # Placeholder for `is_exploring` flag.
            explore = tf1.placeholder_with_default(True, (),
                                                   name="is_exploring")

        # Placeholder for RNN time-chunk valid lengths.
        self._seq_lens = tf1.placeholder(dtype=tf.int32,
                                         shape=[None],
                                         name="seq_lens")
        # Placeholder for `is_training` flag.
        self._input_dict["is_training"] = self._get_is_training_placeholder()

        # Create the Exploration object to use for this Policy.
        self.exploration = self._create_exploration()

        # Fully customized action generation (e.g., custom policy).
        if action_sampler_fn:
            sampled_action, sampled_action_logp = action_sampler_fn(
                self,
                self.model,
                obs_batch=self._input_dict[SampleBatch.CUR_OBS],
                state_batches=self._state_inputs,
                seq_lens=self._seq_lens,
                prev_action_batch=self._input_dict.get(
                    SampleBatch.PREV_ACTIONS),
                prev_reward_batch=self._input_dict.get(
                    SampleBatch.PREV_REWARDS),
                explore=explore,
                is_training=self._input_dict["is_training"])
        else:
            # Distribution generation is customized, e.g., DQN, DDPG.
            if action_distribution_fn:
                dist_inputs, dist_class, self._state_out = \
                    action_distribution_fn(
                        self, self.model,
                        obs_batch=self._input_dict[SampleBatch.CUR_OBS],
                        state_batches=self._state_inputs,
                        seq_lens=self._seq_lens,
                        prev_action_batch=self._input_dict.get(
                            SampleBatch.PREV_ACTIONS),
                        prev_reward_batch=self._input_dict.get(
                            SampleBatch.PREV_REWARDS),
                        explore=explore,
                        is_training=self._input_dict["is_training"])
            # Default distribution generation behavior:
            # Pass through model. E.g., PG, PPO.
            else:
                dist_inputs, self._state_out = self.model(
                    self._input_dict, self._state_inputs, self._seq_lens)

            action_dist = dist_class(dist_inputs, self.model)

            # Using exploration to get final action (e.g. via sampling).
            sampled_action, sampled_action_logp = \
                self.exploration.get_exploration_action(
                    action_distribution=action_dist,
                    timestep=timestep,
                    explore=explore)

        # Phase 1 init.
        sess = tf1.get_default_session() or tf1.Session()

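        # `get_batch_divisibility_req` may be a callable (evaluated on this
        # policy) or a fixed int; fall back to 1 (no requirement) if unset.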
        batch_divisibility_req = get_batch_divisibility_req(self) if \
            callable(get_batch_divisibility_req) else \
            (get_batch_divisibility_req or 1)

        super().__init__(
            observation_space=obs_space,
            action_space=action_space,
            config=config,
            sess=sess,
            obs_input=self._input_dict[SampleBatch.OBS],
            action_input=self._input_dict[SampleBatch.ACTIONS],
            sampled_action=sampled_action,
            sampled_action_logp=sampled_action_logp,
            dist_inputs=dist_inputs,
            dist_class=dist_class,
            loss=None,  # dynamically initialized on run
            loss_inputs=[],
            model=self.model,
            state_inputs=self._state_inputs,
            state_outputs=self._state_out,
            prev_action_input=self._input_dict.get(SampleBatch.PREV_ACTIONS),
            prev_reward_input=self._input_dict.get(SampleBatch.PREV_REWARDS),
            seq_lens=self._seq_lens,
            max_seq_len=config["model"]["max_seq_len"],
            batch_divisibility_req=batch_divisibility_req,
            explore=explore,
            timestep=timestep)

        # Phase 2 init.
        if before_loss_init is not None:
            before_loss_init(self, obs_space, action_space, config)

        # Loss initialization and model/postprocessing test calls.
        if not existing_inputs:
            self._initialize_loss_from_dummy_batch(
                auto_remove_unneeded_view_reqs=True)
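
In RLlib, an `__init__` like the one above is usually not written or called by hand; the class is produced by the `build_tf_policy` template, which supplies `loss_fn`, `before_loss_init`, `action_distribution_fn`, and the other hooks referenced here. A minimal sketch of that usage follows; the loss function is an illustrative policy-gradient-style example (not taken from this file) and assumes a postprocessing step has added "advantages" to the sample batch:

import tensorflow as tf
from ray.rllib.policy.tf_policy_template import build_tf_policy

def example_loss_fn(policy, model, dist_class, train_batch):
    # Build a scalar loss from the placeholder-backed train_batch; this runs
    # once during the dummy-batch loss initialization shown above.
    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    # Assumes a postprocess_fn has added "advantages" to the sample batch.
    return -tf.reduce_mean(
        action_dist.logp(train_batch["actions"]) * train_batch["advantages"])

ExampleTFPolicy = build_tf_policy(
    name="ExampleTFPolicy", loss_fn=example_loss_fn)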
Example #34
        def __init__(self, observation_space, action_space, config):
            assert tf.executing_eagerly()
            self.framework = config.get("framework", "tfe")
            Policy.__init__(self, observation_space, action_space, config)
            self._is_training = False
            self._loss_initialized = False
            self._sess = None

            self._loss = loss_fn
            self.batch_divisibility_req = get_batch_divisibility_req(self) if \
                callable(get_batch_divisibility_req) else \
                (get_batch_divisibility_req or 1)
            self._max_seq_len = config["model"]["max_seq_len"]

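            # Merge the algorithm's default config underneath the user-provided
            # values (user-provided keys take precedence).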
            if get_default_config:
                config = dict(get_default_config(), **config)

            if validate_spaces:
                validate_spaces(self, observation_space, action_space, config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config
            self.dist_class = None
            if action_sampler_fn or action_distribution_fn:
                if not make_model:
                    raise ValueError(
                        "`make_model` is required if `action_sampler_fn` OR "
                        "`action_distribution_fn` is given")
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework=self.framework,
                )
            # Lock used for locking some methods on the object level.
            # This prevents possible race conditions when calling the model
            # first, then its value function (e.g. in a loss function), in
            # between which another model call is made (e.g. to compute an
            # action).
            self._lock = threading.RLock()

            # Auto-update model's inference view requirements, if recurrent.
            self._update_model_view_requirements_from_init_state()

            self.exploration = self._create_exploration()
            self._state_inputs = self.model.get_initial_state()
            self._is_recurrent = len(self._state_inputs) > 0

            # Combine view_requirements for Model and Policy.
            self.view_requirements.update(self.model.view_requirements)

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

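            # Build the optimizer(s): the user-supplied `optimizer_fn` if
            # given, otherwise a default Adam on the configured learning rate;
            # the exploration object may add or wrap optimizers of its own.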
            if optimizer_fn:
                optimizers = optimizer_fn(self, config)
            else:
                optimizers = tf.keras.optimizers.Adam(config["lr"])
            optimizers = force_list(optimizers)
            if getattr(self, "exploration", None):
                optimizers = self.exploration.get_exploration_optimizer(
                    optimizers)
            # TODO: (sven) Allow tf policy to have more than 1 optimizer.
            #  Just like torch Policy does.
            self._optimizer = optimizers[0] if optimizers else None

            self._initialize_loss_from_dummy_batch(
                auto_remove_unneeded_view_reqs=True,
                stats_fn=stats_fn,
            )
            self._loss_initialized = True

            if after_init:
                after_init(self, observation_space, action_space, config)

            # Reset global_timestep again after the fake (dummy-batch)
            # run-throughs.
            self.global_timestep = 0
Example #35
    def __init__(self,
                 observation_space,
                 action_space,
                 config,
                 existing_inputs=None):
        """
        Arguments:
            observation_space: Environment observation space specification.
            action_space: Environment action space specification.
            config (dict): Configuration values for the PPORND graph.
            existing_inputs (list): Optional list of tuples specifying the
                placeholders upon which the graph should be built.
        """
        config = dict(DEFAULT_CONFIG, **config)
        self.sess = tf.get_default_session()
        self.action_space = action_space
        self.config = config
        self.kl_coeff_val = self.config["kl_coeff"]
        self.kl_target = self.config["kl_target"]
        dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

        if existing_inputs:
            obs_ph, value_targets_ph, adv_ph, act_ph, \
                logits_ph, vf_preds_ph = existing_inputs[:6]
            # TODO: add adv_ph_int
            existing_state_in = existing_inputs[6:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            obs_ph = tf.placeholder(tf.float32,
                                    name="obs",
                                    shape=(None, ) + observation_space.shape)
            adv_ph = tf.placeholder(tf.float32,
                                    name="advantages",
                                    shape=(None, ))
            adv_int_ph = tf.placeholder(tf.float32,
                                        name="advantages_int",
                                        shape=(None, ))
            act_ph = ModelCatalog.get_action_placeholder(action_space)
            logits_ph = tf.placeholder(tf.float32,
                                       name="logits",
                                       shape=(None, logit_dim))
            vf_preds_ph = tf.placeholder(tf.float32,
                                         name="vf_preds",
                                         shape=(None, ))
            value_targets_ph = tf.placeholder(tf.float32,
                                              name="value_targets",
                                              shape=(None, ))
            existing_state_in = None
            existing_seq_lens = None
        self.observations = obs_ph

        self.loss_in = [
            ("obs", obs_ph),
            ("value_targets", value_targets_ph),
            ("advantages", adv_ph),
            ("actions", act_ph),
            ("logits", logits_ph),
            ("vf_preds", vf_preds_ph),
        ]
        self.model = ModelCatalog.get_model(obs_ph,
                                            logit_dim,
                                            self.config["model"],
                                            state_in=existing_state_in,
                                            seq_lens=existing_seq_lens)

        # KL Coefficient
        self.kl_coeff = tf.get_variable(
            initializer=tf.constant_initializer(self.kl_coeff_val),
            name="kl_coeff",
            shape=(),
            trainable=False,
            dtype=tf.float32)
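        # Non-trainable scalar holding the current KL-penalty coefficient; it
        # starts at config["kl_coeff"] and is typically adjusted between
        # optimization rounds based on how the measured KL compares to
        # `kl_target`.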

        self.logits = self.model.outputs
        curr_action_dist = dist_cls(self.logits)
        self.sampler = curr_action_dist.sample()
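        # Value function head: either share layers with the policy model or
        # build a separate (non-LSTM) value network; without GAE, a zero
        # baseline is used.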
        if self.config["use_gae"]:
            if self.config["vf_share_layers"]:
                self.value_function = tf.reshape(
                    linear(self.model.last_layer, 1, "value",
                           normc_initializer(1.0)), [-1])
            else:
                vf_config = self.config["model"].copy()
                # Do not split the last layer of the value function into
                # mean parameters and standard deviation parameters and
                # do not make the standard deviations free variables.
                vf_config["free_log_std"] = False
                vf_config["use_lstm"] = False
                with tf.variable_scope("value_function"):
                    self.value_function = ModelCatalog.get_model(
                        obs_ph, 1, vf_config).outputs
                    self.value_function = tf.reshape(self.value_function, [-1])
        else:
            self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

        # TODO: add another head to the policy network for estimating the
        # value of the intrinsic reward

        # RND target network
        with tf.variable_scope("rnd_target"):
            modelconfig = self.config["model"].copy()
            modelconfig["free_log_std"] = False
            modelconfig["use_lstm"] = False
            self.rnd_target = ModelCatalog.get_model(
                obs_ph, self.config["embedding_size"], modelconfig).outputs
            # self.rnd_target = tf.reshape(self.rnd_target, [-1])  # TODO: necessary?

        # RND predictor network
        with tf.variable_scope("rnd_predictor"):
            modelconfig = self.config["model"].copy()
            modelconfig["free_log_std"] = False
            modelconfig["use_lstm"] = False
            self.rnd_predictor = ModelCatalog.get_model(
                obs_ph, self.config["embedding_size"], modelconfig).outputs
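        # In RND, the intrinsic reward is the prediction error of the trained
        # predictor network against the fixed, randomly initialized target
        # network.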

        self.loss_obj = PPORNDLoss(
            action_space,
            value_targets_ph,
            adv_ph,
            adv_int_ph,
            act_ph,
            logits_ph,
            vf_preds_ph,
            curr_action_dist,
            self.value_function,
            self.kl_coeff,
            self.rnd_target,
            self.rnd_predictor,
            # TODO: valid_mask??
            entropy_coeff=self.config["entropy_coeff"],
            clip_param=self.config["clip_param"],
            vf_clip_param=self.config["vf_clip_param"],
            vf_loss_coeff=self.config["vf_loss_coeff"],
            use_gae=self.config["use_gae"])

        LearningRateSchedule.__init__(self, self.config["lr"],
                                      self.config["lr_schedule"])
        TFPolicyGraph.__init__(self,
                               observation_space,
                               action_space,
                               self.sess,
                               obs_input=obs_ph,
                               action_sampler=self.sampler,
                               loss=self.loss_obj.loss,
                               loss_inputs=self.loss_in,
                               state_inputs=self.model.state_in,
                               state_outputs=self.model.state_out,
                               seq_lens=self.model.seq_lens,
                               max_seq_len=config["model"]["max_seq_len"])

        self.sess.run(tf.global_variables_initializer())
        self.explained_variance = explained_variance(value_targets_ph,
                                                     self.value_function)
        self.stats_fetches = {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "total_loss": self.loss_obj.loss,
            "policy_loss": self.loss_obj.mean_policy_loss,
            "vf_loss": self.loss_obj.mean_vf_loss,
            "vf_explained_var": self.explained_variance,
            "kl": self.loss_obj.mean_kl,
            "entropy": self.loss_obj.mean_entropy
        }