Example #1
    def __call__(self, state):

        with tf.variable_scope(self.name):
            # Net: sigmoid-squashed scalar parameter theta
            self.width = 1
            self.offset = 0
            self.k = 0.07
            self.theta = tf.get_variable(
                "theta",
                dtype=get_default_tf_dtype(),
                shape=(1, 1),
                initializer=tf.initializers.constant(
                    self.from_sigm_to_theta(self.init_theta)),
            )

            theta = self.width * (
                1 / (1 + tf.exp(-self.k * self.theta))) + self.offset

            # For taking actions
            self._pi = tf.concat(
                [
                    tf.tile(theta, (tf.shape(state)[0], 1)),
                    tf.tile(1 - theta, (tf.shape(state)[0], 1)),
                ],
                axis=1,
            )

            self._log_pi = tf.log(self._pi + self.epsilon_small)

        return self._pi, self._log_pi
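
Every example on this page calls a get_default_tf_dtype() helper that is not shown here. A minimal sketch of such a helper, assuming the project standardizes on 64-bit floats (the real implementation may just as well return tf.float32), could be:

import tensorflow as tf

def get_default_tf_dtype():
    # Assumed project-wide default dtype; adjust if the project uses float32.
    return tf.float64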
Example #2
 def __init__(
     self,
     policy,
     inputQ,
     outputQ,
     n_actions,
     obs_size,
     id,
     total_n_samples,
     n_params,
 ):
     # Invoke parent constructor BEFORE doing anything!!
     Process.__init__(self)
     self.dtype = get_default_tf_dtype()
     self.env = Torcs(port=id)
     self.state_tf = tf.placeholder(self.dtype,
                                    (None, self.env.observation_space_size),
                                    name="states")
     self.action_tf = tf.placeholder(self.dtype,
                                     (None, self.env.action_space_size),
                                     name="actions")
     self.policy_tf = None
     self.policy = policy
     self.inputQ = inputQ
     self.outputQ = outputQ
     self.n_actions = n_actions
     self.id = id
     self.total_n_samples = total_n_samples
     self.n_params = n_params
Example #3
 def __init__(self, name="policy", init_theta=np.random.rand()):
     """
     Builds a policy network and returns a node for pi and a node for logpi
     """
     # net params
     super(OneParameterPolicy, self).__init__(name)
     self.sess = None
     self.default_dtype = get_default_tf_dtype()
     self.epsilon_small = 1e-20
     self.action_space = 2
     self.init_theta = init_theta
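
For context, here is a hypothetical usage sketch that ties Example #3 to Example #1 (assuming Example #1 is OneParameterPolicy.__call__ and that the class also provides from_sigm_to_theta); the placeholder shape and dtype are assumptions:

import numpy as np
import tensorflow as tf

state_ph = tf.placeholder(tf.float64, (None, 1), name="state")
policy = OneParameterPolicy(name="policy", init_theta=0.5)
pi, log_pi = policy(state_ph)  # nodes for pi(a|s) and log pi(a|s)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    probs = sess.run(pi, feed_dict={state_ph: np.zeros((3, 1))})
    # probs has shape (3, 2): the row [theta, 1 - theta] tiled per state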
Example #4
 def __init__(self, policy, env, n_traj, inputQ, outputQ, n_actions, obs_size):
     # Invoke parent constructor BEFORE doing anything!!
     Process.__init__(self)
     self.dtype = get_default_tf_dtype()
     self.state_tf = tf.placeholder(self.dtype, (None, obs_size), name="states")
     self.policy = policy
     self.n_traj = n_traj
     self.inputQ = inputQ
     self.outputQ = outputQ
     self.env = copy(env)
     self.n_actions = n_actions
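
The constructors in Examples #2 and #4 only store the worker's state; the Process.run() method that does the sampling is not shown. A rough, hypothetical sketch of such a loop, assuming a gym-style env API, an invented message format on inputQ/outputQ, and a hypothetical sample_action helper:

def run(self):
    while True:
        command, payload = self.inputQ.get()  # e.g. ("sample", policy_params) or ("exit", None)
        if command == "exit":
            break
        trajectories = []
        for _ in range(self.n_traj):
            state = self.env.reset()
            done, traj = False, []
            while not done:
                action = self.sample_action(state, payload)  # hypothetical helper
                next_state, reward, done, _ = self.env.step(action)
                traj.append((state, action, reward))
                state = next_state
            trajectories.append(traj)
        self.outputQ.put(trajectories)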
Example #5
 def __init__(self, name="chain"):
     """
     Parameters
     - input_space: dimension of state vector
     """
     self.sess = None
     self.log_prob = None
     self.default_dtype = get_default_tf_dtype()
     self.name = name
     self.omega_value = 0
     self.width = 1
     self.offset = 0
     self.k = 1
     self.param = 0.5
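
The width, offset, k and param fields parameterize the same sigmoid squashing used in Example #1, mapping an unconstrained parameter into (offset, offset + width). A small numpy check of that mapping, assuming omega = width * sigmoid(k * param) + offset:

import numpy as np

def squash(param, width=1.0, offset=0.0, k=1.0):
    # Maps an unconstrained parameter into (offset, offset + width),
    # mirroring the formula in Example #1.
    return width * (1.0 / (1.0 + np.exp(-k * param))) + offset

print(squash(0.5))    # ~0.622, inside (0, 1)
print(squash(-20.0))  # ~0.0, near the lower bound (offset)
print(squash(20.0))   # ~1.0, near the upper bound (offset + width)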
Example #6
 def __init__(self, state_space, action_space, hidden_layer_size, name="policy"):
     """
     Builds a policy network and returns a node for the gradient and a node for action selection
     Simple network: from state space to action space
     Start from a random policy, all weights equal to 0
     @param state_space: dimension of state space
     @param action_space: dimension of action space
     """
     # net params
     super().__init__(name)
     self.hidden_layer_size = hidden_layer_size
     self.state_space = state_space
     self.action_space = action_space
     self.sess = None
     self.default_dtype = get_default_tf_dtype()
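
The docstring describes a simple state-to-action network whose weights start at 0, so the initial softmax policy is uniform. The actual architecture is not shown on this page; a minimal sketch of a matching __call__, under those assumptions, might be:

import tensorflow as tf

def __call__(self, state):
    # Hypothetical sketch: one hidden layer, zero-initialized weights,
    # softmax over actions (uniform at initialization).
    with tf.variable_scope(self.name):
        hidden = tf.layers.dense(
            state, self.hidden_layer_size, activation=tf.nn.tanh,
            kernel_initializer=tf.zeros_initializer(), name="hidden")
        logits = tf.layers.dense(
            hidden, self.action_space,
            kernel_initializer=tf.zeros_initializer(), name="logits")
        pi = tf.nn.softmax(logits)
        log_pi = tf.log(pi + 1e-20)
    return pi, log_pi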
Example #7
    def __init__(self,
                 model,
                 policy,
                 clip_gradient=False,
                 env=None,
                 n_trajectories=100):

        self.model = model
        self.policy = policy
        self.clip_gradient = clip_gradient
        self.dtype = get_default_tf_dtype()
        self.env = env
        self.global_step = 0
        self.iteration = 0
        self.n_trajectories = n_trajectories
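
Example #7 stores a clip_gradient flag, which presumably toggles gradient clipping wherever the update op is built. A generic TF1 pattern for that (not this project's actual code; the clipping norm is an assumption):

import tensorflow as tf

def build_train_op(loss, optimizer, clip_gradient, clip_norm=5.0):
    # Standard TF1 compute/clip/apply pattern.
    grads_and_vars = optimizer.compute_gradients(loss)
    if clip_gradient:
        grads_and_vars = [(tf.clip_by_norm(g, clip_norm), v)
                          for g, v in grads_and_vars if g is not None]
    return optimizer.apply_gradients(grads_and_vars)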
 def __init__(self, name="cartpole"):
     """
     Parameters
     - input_space: dimension of state vector
     """
     self.sess = None
     self.log_prob = None
     self.default_dtype = get_default_tf_dtype()
     # must be initialized
     self.name = name
     self.x_range = 4.8
     self.theta_range = 180
     # the noise (3sigma) should be inside 10% of the range
     self.x_var = 1e-6  # (self.x_range/(3*1000))**2
     self.theta_var = 1e-6  # (self.theta_range/(3*1000))**2
     self.x_dot_var = 1e-6  # self.x_var/1e-6
     self.theta_dot_var = 1e-6  # self.theta_var/1e-6
     self.action_noise_var = 1e-2
     self.min_omega = 0.1
     self.max_omega = 30
Example #9
 def __init__(self,
              state_space,
              action_space,
              hidden_layer_size,
              name="policy"):
     """
     Builds a policy network and returns a node for the gradient and a node for action selection
     Simple network: from state space to action space
     Start from a random policy, all weights equal to 0
     @param state_space: dimension of state space
     @param actions_space: dimension of action space
     @param trajectory_size: number of trajectories collected for estimating the gradient
     @param checkpoint_file: name of checkpoint file in which to save variables
     @param restore: True if need to restore variables
     """
     # net params
     super().__init__(name)
     self.hidden_layer_size = hidden_layer_size
     self.state_space = state_space
     self.action_space = action_space
     self.sess = None
     self.default_dtype = get_default_tf_dtype()
Example #10
 def __init__(self, state_dim, param_dim, name="NN", training_set_size=4000):
     """
     Fit a NN for predicting the next state distribution
     Output of NN are the parameters of a parametric distribution (Gaussian)
     """
     self.sess = None
     self.log_prob = None
     self.prob = None
     self.dtype = get_default_tf_dtype()
     # must be initialized
     self.name = name
     self.x_range = 4.8
     self.theta_range = 180
     self.XData = None
     self.YData = None
     self.state_dim = state_dim
     self.x_dim = state_dim + param_dim
     self.gp_list = []
     self.training_set_size = training_set_size
     self.global_step = 0
     self.folder = self.name + "NNData" + "/"
     self.min_omega = 0.1
     self.max_omega = 30
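
The docstring says the network outputs the parameters of a Gaussian over the next state. A common way to set that up in TF1 is a dense head producing a per-dimension mean and log-variance, with the log-likelihood as the training signal; this is a sketch of that pattern, not the class's actual graph:

import numpy as np
import tensorflow as tf

def gaussian_head(hidden, state_dim):
    # Mean and log-variance for each state dimension.
    mu = tf.layers.dense(hidden, state_dim, name="mu")
    log_var = tf.layers.dense(hidden, state_dim, name="log_var")
    return mu, log_var

def gaussian_log_prob(next_state, mu, log_var):
    # log N(next_state | mu, exp(log_var)), summed over state dimensions.
    return tf.reduce_sum(
        -0.5 * (np.log(2.0 * np.pi) + log_var
                + tf.square(next_state - mu) / tf.exp(log_var)),
        axis=1)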
Example #11
    def __init__(
        self,
        kappa: float = 1e-3,
        L2_reg_dual: float = 0.0,  # alternatives previously tried: 1e-7, 1e-5
        L2_reg_loss: float = 0.0,
        max_opt_itr: int = 1000,
        tf_optimizer=ScipyOptimizerInterface,
        model: ModelApproximator = None,
        policy: Policy = None,
        env: ConfMDP = None,
        projection_type: Projection = Projection.STATE_KERNEL,  # state kernel or disjoint projection
        training_set_size: int = 5000,
        exact: bool = False,
        restart_fitting: bool = False,
        fit_iterations: int = 40000,
        refit_iterations: int = 1000,
        refit: bool = False,
        refit_every_iterations: int = 100,
        **kwargs,
    ):
        """
        :param kappa: Max KL divergence between new policy and old policy.
        :param L2_reg_dual: Dual regularization
        :param L2_reg_loss: Loss regularization
        :param max_opt_itr: Maximum number of batch optimization iterations.
        :param tf_optimizer: optimizer to use
        :param model: model approximation
        :param policy: policy to be optimized
        :param env: environment
        :param projection_type: type of projection
        :param use_features: whether to use features or not
        :param training_set_size: number of samples in the training set
        :param exact: whether the model approximation is exact or not
        :return:
        """
        self.kappa = kappa
        self.L2_reg_dual = L2_reg_dual
        self.L2_reg_loss = L2_reg_loss
        self.max_opt_itr = max_opt_itr
        self.tf_optimizer = tf_optimizer
        self.model = model
        self.policy = policy
        self.env = env
        self.dtype = get_default_tf_dtype()
        self.epsilon_small = 1e-24
        self.min_eta_inv = 1e-12
        self.projection_type = projection_type
        self.model_L2_reg_loss = 0
        self.policy_L2_reg_loss = L2_reg_loss
        self.write_every = 1
        self.training_set_size = training_set_size
        self.exact = exact
        self.fit_iterations = fit_iterations
        self.refit_iterations = (refit_iterations
                                 if not restart_fitting else fit_iterations)
        self.restart_fitting = restart_fitting
        self.refit = refit
        self.refit_every_iterations = refit_every_iterations
        self.sess = None
        self.summary_writer = None
        self.global_step = 0
        self.iteration = 0

        # ----------------------------------------
        # placeholders
        # ----------------------------------------
        self.observations_ph = None
        self.actions_one_hot_ph = None
        self.kappa_ph = None  # Constraint on the KL divergence: \kappa
        self.actions_ph = None
        self.rewards_ph = None
        self.returns_ph = None
        self.timesteps_ph = None
        self.next_states_ph = None
        self.feat_diff_ph = None
        self.param_eta = None  # Value of \eta
        self.param_eta_inv_ph = None  # Inverse of eta, 1/\eta
        self.policy_tf = None  # \pi(a | s)
        self.model_tf = None  # p_\omega(s'|s,a)
        self.model_logli = None  # log(p_\omega(s' | s,a))
        self.model_policy_loss = None  # Combined model-policy loss
        self.dual = None  # \min_{\eta\in[0, +\infty)} g(\eta) =
        # \eta \log \ev_{S,A,S' \sim d} \left[ \exp\left(\frac{1}{\eta} r(S,A,S') + \kappa \right) \right]
        self.dual_grad = None  # Gradient of the dual
        self.primal = None  # \exp\left(\frac{1}{\eta} r(s,a,s')\right)
        self.model_grad_loss = None  # Gradient of the loss of the model
        self.policy_grad_loss = None  # Gradient of the loss of the policy
        self.model_policy_grad_loss = None  # Gradient of the loss of the model policy
        self.model_loss = None  # Loss of the model
        self.policy_loss = None  # Policy loss
        self.eta = None  # Dual parameter
        self.state_kernel = None  # p_{\theta,\omega}(s'|s) = \sum_a p_\omega(s'|s,a) \pi_\theta(a|s)

        # ----------------------------------------
        # Optimizers
        # ----------------------------------------
        self.state_kernel_proj_opt = None
        self.model_tf_optimizer = None
        self.dual_optimizer = None

        # Summary
        self.summarize = None
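
The comment on self.dual describes a REPS-style dual, g(eta) = eta * log E[exp(r(S,A,S')/eta + kappa)], to be minimized over eta >= 0. A small numpy sketch of that objective over sampled rewards (not the TF graph actually built from these placeholders):

import numpy as np

def dual(eta, rewards, kappa=1e-3):
    # g(eta) = eta * log E[exp(r/eta + kappa)], with a log-sum-exp shift
    # for numerical stability.
    z = rewards / eta + kappa
    z_max = np.max(z)
    return eta * (z_max + np.log(np.mean(np.exp(z - z_max))))

rewards = np.random.randn(1000)
print(dual(eta=1.0, rewards=rewards))  # minimize over eta, e.g. with scipy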