Example #1
    def __init__(
            self,
            env_spec,
            hardcoded_q=None,
            scope='policy',
            ent_wt=1.0,
    ):
        """
        :param env_spec: A spec for the env.
        :param hidden_dim: dimension of hidden layer
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        #self.graph = tf.get_default_graph()
        assert isinstance(env_spec.action_space, Discrete)
        Serializable.quick_init(self, locals())
        super(CategoricalSoftQPolicy, self).__init__(env_spec)
        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim
        self.dist = Categorical(action_dim)
        self.ent_wt = ent_wt
        self.hardcoded_q = hardcoded_q

        with tf.variable_scope(scope) as vs:
            self.vs = vs

            self.q_func = tf.get_variable(
                'q_func', shape=(obs_dim, action_dim))

            self.q_func_plc = tf.placeholder(
                tf.float32, shape=(obs_dim, action_dim))
            self.q_func_assgn = tf.assign(self.q_func, self.q_func_plc)
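The placeholder/assign pair above suggests the Q-table is meant to be overwritten from outside the graph, and `ent_wt` suggests a Boltzmann (soft) policy over the Q-values. A minimal sketch of both ideas; the helper names are hypothetical and the softmax-over-Q rule is an assumption, not taken from the snippet:

import numpy as np

def set_q_table(sess, policy, new_q):
    # Hypothetical helper: push an externally computed tabular Q function
    # into the policy via the assign op defined above.
    sess.run(policy.q_func_assgn, feed_dict={policy.q_func_plc: new_q})

def soft_q_probs(q_row, ent_wt):
    # Assumed rule: Boltzmann distribution over Q-values with temperature ent_wt.
    z = q_row / ent_wt
    z = z - np.max(z)          # for numerical stability
    p = np.exp(z)
    return p / p.sum()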
Example #2
File: irl.py Project: pidchen/atari-irl
    def __init__(self,
                 *,
                 name,
                 policy_model,
                 num_envs,
                 env_spec,
                 wrapped_env_action_space,
                 action_space,
                 observation_space,
                 batching_config,
                 init_location=None,
                 encoder=None):
        Serializable.quick_init(self, locals())
        assert isinstance(wrapped_env_action_space, Box)
        self._dist = Categorical(wrapped_env_action_space.shape[0])

        # this is going to be serialized, so we can't add in the envs or
        # wrappers
        self.init_args = dict(name=name,
                              policy_model=policy_model,
                              init_location=init_location)

        ent_coef = 0.01
        vf_coef = 0.5
        max_grad_norm = 0.5
        model_args = dict(policy=policy_model,
                          ob_space=observation_space,
                          ac_space=action_space,
                          nbatch_act=batching_config.nenvs,
                          nbatch_train=batching_config.nbatch_train,
                          nsteps=batching_config.nsteps,
                          ent_coef=ent_coef,
                          vf_coef=vf_coef,
                          max_grad_norm=max_grad_norm)

        self.num_envs = num_envs

        with tf.variable_scope(name) as scope:
            policy = policies.Policy(model_args)
            self.model = policy.model
            self.act_model = self.model.act_model
            self.scope = scope

        StochasticPolicy.__init__(self, env_spec)
        self.name = name

        self.probs = tf.nn.softmax(self.act_model.pd.logits)
        obs_var = self.act_model.X

        self.tensor_values = lambda **kwargs: tf.get_default_session().run(
            self.get_params())

        self._f_dist = tensor_utils.compile_function(inputs=[obs_var],
                                                     outputs=self.probs)

        if init_location:
            data = joblib.load(open(init_location, 'rb'))
            self.restore_from_snapshot(data['policy_params'])
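Since `_f_dist` compiles observations into softmax probabilities, a policy like this can be queried and sampled from with plain numpy. A hypothetical usage sketch (the function name and sampling loop are assumptions, not part of the snippet):

import numpy as np

def sample_actions_sketch(policy, observations):
    # policy._f_dist maps a batch of observations to (batch, n_actions) softmax outputs.
    probs = policy._f_dist(observations)
    actions = np.array([np.random.choice(len(p), p=p) for p in probs])
    return actions, dict(prob=probs)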
    def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        prob_network=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy; if provided, other
        network params are ignored
        :return:
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)
        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        with tf.variable_scope(name):
            if prob_network is None:
                prob_network = self.create_MLP(
                    input_shape=(obs_dim, ),
                    output_dim=env_spec.action_space.n,
                    hidden_sizes=hidden_sizes,
                    name="prob_network",
                )
            self._l_obs, self._l_prob = self.forward_MLP(
                'prob_network',
                prob_network,
                n_hidden=len(hidden_sizes),
                input_shape=(obs_dim, ),
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=tf.nn.softmax,
                reuse=None)

            # Allows feeding a custom input tensor through the same network.
            self._forward_out = lambda x, is_train: self.forward_MLP(
                'prob_network',
                prob_network,
                n_hidden=len(hidden_sizes),
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=tf.nn.softmax,
                input_tensor=x,
                is_training=is_train)[1]

            self._f_prob = tensor_utils.compile_function([self._l_obs],
                                                         L.get_output(
                                                             self._l_prob))

            self._dist = Categorical(env_spec.action_space.n)
    def __init__(
            self,
            name,
            env_spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            prob_network=None,
            grad_step_size=1.0,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy; if provided, other
        network params are ignored
        :param grad_step_size: step size taken in the learner's gradient update; sampled
        uniformly if given as a range, e.g. [0.1, 1]
        :return:
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)
        obs_dim = env_spec.observation_space.flat_dim
        self.action_dim = env_spec.action_space.n
        self.n_hidden = len(hidden_sizes)
        self.hidden_nonlinearity = hidden_nonlinearity
        self.input_shape = (None, obs_dim,)
        self.step_size = grad_step_size

        if prob_network is None:
            self.all_params = self.create_MLP(
                output_dim=self.action_dim,
                hidden_sizes=hidden_sizes,
                name="prob_network",
            )
        self.all_param_vals = None
        self._l_obs, self._l_prob = self.forward_MLP('prob_network', self.all_params,
            n_hidden=len(hidden_sizes), input_shape=(obs_dim,),
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=tf.nn.softmax, reuse=None)

        # Allows feeding a custom input tensor through the same network.
        self._forward_out = lambda x, params, is_train: self.forward_MLP('prob_network', params,
            n_hidden=len(hidden_sizes), hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=tf.nn.softmax, input_tensor=x, is_training=is_train)[1]


        self._init_f_prob = tensor_utils.compile_function(
            [self._l_obs],
            [self._l_prob])
        self._cur_f_prob = self._init_f_prob

        self._dist = Categorical(self.action_dim)
        self._cached_params = {}
        super(MAMLCategoricalMLPPolicy, self).__init__(env_spec)
Example #5
    def init_policy(self):
        output_vec = L.get_output(self._output_vec_layer,
                                  deterministic=True) / self._c
        prob = tf.nn.softmax(output_vec)
        max_qval = tf.reduce_logsumexp(output_vec, [1])

        self._f_prob = tensor_utils.compile_function(
            [self._obs_layer.input_var], prob)
        self._f_max_qvals = tensor_utils.compile_function(
            [self._obs_layer.input_var], max_qval)

        self._dist = Categorical(self._n)
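Ignoring the temperature `self._c`, the two compiled functions above are tied together by the soft-value identity pi = exp(Q - V) with V = logsumexp(Q). A small numpy check of that relationship (toy values, not taken from the snippet):

import numpy as np
from scipy.special import logsumexp, softmax

q = np.array([[1.0, 2.0, 0.5]])                  # toy Q-values: one observation, three actions
v = logsumexp(q, axis=1)                         # soft (maximum-entropy) state value
pi = softmax(q, axis=1)                          # Boltzmann action probabilities
assert np.allclose(pi, np.exp(q - v[:, None]))   # pi = exp(Q - V)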
    def __init__(
        self,
        name,
        env_spec,
        conv_filters,
        conv_filter_sizes,
        conv_strides,
        conv_pads,
        hidden_sizes=[],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.softmax,
        prob_network=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy; if provided, other
        network params are ignored
        :return:
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        self._env_spec = env_spec
        # import pdb; pdb.set_trace()
        if prob_network is None:
            prob_network = ConvNetwork(
                input_shape=env_spec.observation_space.shape,
                output_dim=env_spec.action_space.n,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                name="prob_network",
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        self._f_prob = tensor_utils.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(prob_network.output_layer))

        self._dist = Categorical(env_spec.action_space.n)

        super(CategoricalConvPolicy, self).__init__(env_spec)
        LayersPowered.__init__(self, [prob_network.output_layer])
    def __init__(self,
                 name,
                 env_spec,
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 gating_network=None,
                 input_layer=None,
                 num_options=4,
                 conv_filters=None,
                 conv_filter_sizes=None,
                 conv_strides=None,
                 conv_pads=None,
                 input_shape=None):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param gating_network: optional manually specified gating network for this policy
        :return:
        """
        Serializable.quick_init(self, locals())

        self.num_options = num_options

        assert isinstance(env_spec.action_space, Discrete)

        with tf.variable_scope(name):
            input_layer, output_layer = self.make_network(
                (env_spec.observation_space.flat_dim, ),
                env_spec.action_space.n,
                hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                gating_network=gating_network,
                l_in=input_layer,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                input_shape=input_shape)
            self._l_prob = output_layer
            self._l_obs = input_layer

            self._f_prob = tensor_utils.compile_function(
                [input_layer.input_var], L.get_output(output_layer))

            self._dist = Categorical(env_spec.action_space.n)

            super(CategoricalDecomposedPolicy, self).__init__(env_spec)
            LayersPowered.__init__(self, [output_layer])
Example #8
    def __init__(
            self,
            name,
            env_spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            prob_network=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy; if provided, other
        network params are ignored
        :return:
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        with tf.variable_scope(name):
            if prob_network is None:
                prob_network = MLP(
                    input_shape=(env_spec.observation_space.flat_dim,),
                    output_dim=env_spec.action_space.n,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=tf.nn.softmax,
                    name="prob_network",
                )

            self._l_prob = prob_network.output_layer
            self._l_obs = prob_network.input_layer
            self._f_prob = tensor_utils.compile_function(
                [prob_network.input_layer.input_var],
                L.get_output(prob_network.output_layer)
            )

            self._dist = Categorical(env_spec.action_space.n)

            super(CategoricalMLPPolicy, self).__init__(env_spec)
            LayersPowered.__init__(self, [prob_network.output_layer])
class RecurrentCategorical(Distribution):
    def __init__(self, dim):
        self._cat = Categorical(dim)
        self._dim = dim

    @property
    def dim(self):
        return self._dim

    def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
        """
        Compute the symbolic KL divergence of two categorical distributions
        """
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * T * A
        return tf.reduce_sum(
            old_prob_var * (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)),
            axis=2
        )

    def kl(self, old_dist_info, new_dist_info):
        """
        Compute the KL divergence of two categorical distributions
        """
        old_prob = old_dist_info["prob"]
        new_prob = new_dist_info["prob"]
        return np.sum(
            old_prob * (np.log(old_prob + TINY) - np.log(new_prob + TINY)),
            axis=2
        )

    def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars):
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * T * A
        a_dim = tf.shape(x_var)[2]
        flat_ratios = self._cat.likelihood_ratio_sym(
            tf.reshape(x_var, tf.stack([-1, a_dim])),
            dict(prob=tf.reshape(old_prob_var, tf.stack([-1, a_dim]))),
            dict(prob=tf.reshape(new_prob_var, tf.stack([-1, a_dim])))
        )
        return tf.reshape(flat_ratios, tf.shape(old_prob_var)[:2])

    def entropy(self, dist_info):
        probs = dist_info["prob"]
        return -np.sum(probs * np.log(probs + TINY), axis=2)

    def entropy_sym(self, dist_info_vars):
        probs = dist_info_vars["prob"]
        return -tf.reduce_sum(probs * tf.log(probs + TINY), 2)

    def log_likelihood_sym(self, xs, dist_info_vars):
        probs = dist_info_vars["prob"]
        # Assume layout is N * T * A
        a_dim = tf.shape(probs)[2]
        flat_logli = self._cat.log_likelihood_sym(
            tf.reshape(xs, tf.stack([-1, a_dim])),
            dict(prob=tf.reshape(probs, tf.stack([-1, a_dim])))
        )
        return tf.reshape(flat_logli, tf.shape(probs)[:2])

    def log_likelihood(self, xs, dist_info):
        probs = dist_info["prob"]
        # Assume layout is N * T * A; this is the numpy (non-symbolic) path.
        a_dim = probs.shape[-1]
        flat_logli = self._cat.log_likelihood(
            xs.reshape((-1, a_dim)),
            dict(prob=probs.reshape((-1, a_dim)))
        )
        return flat_logli.reshape(probs.shape[:2])

    @property
    def dist_info_specs(self):
        return [("prob", (self.dim,))]
Example #11
    def __init__(
            self,
            name,
            input_shape,
            output_dim,
            prob_network=None,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            optimizer=None,
            tr_optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            normalize_inputs=True,
            no_initial_trust_region=True,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        """
        Serializable.quick_init(self, locals())

        with tf.variable_scope(name):
            if optimizer is None:
                optimizer = LbfgsOptimizer(name="optimizer")
            if tr_optimizer is None:
                tr_optimizer = ConjugateGradientOptimizer()

            self.input_dim = input_shape[0]
            self.observation_space = Discrete(self.input_dim)
            self.action_space = Discrete(output_dim)


            self.output_dim = output_dim
            self.optimizer = optimizer
            self.tr_optimizer = tr_optimizer

            if prob_network is None:
                prob_network = MLP(
                    input_shape=input_shape,
                    output_dim=output_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=tf.nn.softmax,
                    name="prob_network"
                )

            l_prob = prob_network.output_layer

            LayersPowered.__init__(self, [l_prob])

            xs_var = prob_network.input_layer.input_var
            ys_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="ys")
            old_prob_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="old_prob")

            x_mean_var = tf.get_variable(
                name="x_mean",
                shape=(1,) + input_shape,
                initializer=tf.constant_initializer(0., dtype=tf.float32)
            )
            x_std_var = tf.get_variable(
                name="x_std",
                shape=(1,) + input_shape,
                initializer=tf.constant_initializer(1., dtype=tf.float32)
            )

            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var

            prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var})

            old_info_vars = dict(prob=old_prob_var)
            info_vars = dict(prob=prob_var)

            dist = self._dist = Categorical(output_dim)

            mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

            loss = - tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

            predicted = tensor_utils.to_onehot_sym(tf.argmax(prob_var, axis=1), output_dim)

            self.prob_network = prob_network
            self.f_predict = tensor_utils.compile_function([xs_var], predicted)
            self.f_prob = tensor_utils.compile_function([xs_var], prob_var)
            self.l_prob = l_prob

            self.optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var], inputs=[xs_var, ys_var])
            self.tr_optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var],
                                         inputs=[xs_var, ys_var, old_prob_var],
                                         leq_constraint=(mean_kl, step_size)
                                         )

            self.use_trust_region = use_trust_region
            self.name = name

            self.normalize_inputs = normalize_inputs
            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var
            self.first_optimized = not no_initial_trust_region
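For reference, `f_predict` returns the argmax class as a one-hot vector; a small numpy illustration of that output format (toy probabilities, not from the snippet):

import numpy as np

prob = np.array([[0.1, 0.7, 0.2],
                 [0.5, 0.25, 0.25]])                      # two inputs, three classes
predicted = np.eye(prob.shape[1])[np.argmax(prob, axis=1)]
print(predicted)                                          # [[0. 1. 0.], [1. 0. 0.]]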
class RecurrentCategorical(Distribution):
    def __init__(self, dim):
        self._cat = Categorical(dim)
        self._dim = dim

    @property
    def dim(self):
        return self._dim

    def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
        """
        Compute the symbolic KL divergence of two categorical distributions
        """
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * T * A
        return tf.reduce_sum(
            old_prob_var *
            (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)),
            axis=2)

    def kl(self, old_dist_info, new_dist_info):
        """
        Compute the KL divergence of two categorical distributions
        """
        old_prob = old_dist_info["prob"]
        new_prob = new_dist_info["prob"]
        return np.sum(old_prob *
                      (np.log(old_prob + TINY) - np.log(new_prob + TINY)),
                      axis=2)

    def likelihood_ratio_sym(self, x_var, old_dist_info_vars,
                             new_dist_info_vars):
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * T * A
        a_dim = tf.shape(x_var)[2]
        flat_ratios = self._cat.likelihood_ratio_sym(
            tf.reshape(x_var, tf.stack([-1, a_dim])),
            dict(prob=tf.reshape(old_prob_var, tf.stack([-1, a_dim]))),
            dict(prob=tf.reshape(new_prob_var, tf.stack([-1, a_dim]))))
        return tf.reshape(flat_ratios, tf.shape(old_prob_var)[:2])

    def entropy(self, dist_info):
        probs = dist_info["prob"]
        return -np.sum(probs * np.log(probs + TINY), axis=2)

    def entropy_sym(self, dist_info_vars):
        probs = dist_info_vars["prob"]
        return -tf.reduce_sum(probs * tf.log(probs + TINY), 2)

    def log_likelihood_sym(self, xs, dist_info_vars):
        probs = dist_info_vars["prob"]
        # Assume layout is N * T * A
        a_dim = tf.shape(probs)[2]
        flat_logli = self._cat.log_likelihood_sym(
            tf.reshape(xs, tf.stack([-1, a_dim])),
            dict(prob=tf.reshape(probs, tf.stack((-1, a_dim)))))
        return tf.reshape(flat_logli, tf.shape(probs)[:2])

    def log_likelihood(self, xs, dist_info):
        probs = dist_info["prob"]
        # Assume layout is N * T * A; this is the numpy (non-symbolic) path.
        a_dim = probs.shape[-1]
        flat_logli = self._cat.log_likelihood(
            xs.reshape((-1, a_dim)), dict(prob=probs.reshape((-1, a_dim))))
        return flat_logli.reshape(probs.shape[:2])

    @property
    def dist_info_specs(self):
        return [("prob", (self.dim, ))]