def __init__(self,
                 env_spec: EnvSpec,
                 name_scope: str,
                 name: str,
                 mlp_config: list,
                 state_input: tf.Tensor = None,
                 reuse=False,
                 input_norm: np.ndarray = None,
                 output_norm: np.ndarray = None,
                 output_low: np.ndarray = None,
                 output_high: np.ndarray = None,
                 ):
        """Build the input tensor and MLP graph of a V-value function.

        :param env_spec: environment spec; ``flat_obs_dim`` sizes the state
            placeholder that is created when ``state_input`` is not given.
        :param name_scope: TensorFlow variable scope in which the placeholder
            is created; also forwarded to ``MLP``.
        :param name: name forwarded to ``VValueFunction.__init__``.
        :param mlp_config: layer configuration list forwarded to ``MLP``.
        :param state_input: optional existing tensor used as the network
            input instead of a freshly created placeholder.
        :param reuse: forwarded to ``MLP``.
        :param input_norm: forwarded to ``MLP`` and kept as an attribute.
        :param output_norm: forwarded to ``MLP`` and kept as an attribute.
        :param output_low: forwarded to ``MLP`` and kept as an attribute.
        :param output_high: forwarded to ``MLP`` and kept as an attribute.
        """
        # The scope is entered unconditionally; a placeholder is only created
        # when the caller did not hand in a tensor of its own.
        with tf.variable_scope(name_scope):
            if state_input is None:
                state_input = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, env_spec.flat_obs_dim],
                    name='state_ph')

        # Keyword arguments shared by the network constructor and the
        # parameter container (the very same dict object is stored there).
        net_kwargs = {
            'reuse': reuse,
            'mlp_config': mlp_config,
            'input_norm': input_norm,
            'output_norm': output_norm,
            'output_high': output_high,
            'output_low': output_low,
            'name_scope': name_scope,
        }
        network = MLP(input_ph=state_input, net_name='mlp', **net_kwargs)
        tf_params = ParametersWithTensorflowVariable(
            tf_var_list=network.var_list,
            rest_parameters=net_kwargs,
            name='mlp_v_value_function_tf_param')

        # The value-function base is initialised without parameters; the
        # PlaceholderInput initialiser receives the real parameter object.
        VValueFunction.__init__(self,
                                env_spec=env_spec,
                                state_input=state_input,
                                name=name,
                                parameters=None)
        PlaceholderInput.__init__(self,
                                  inputs=state_input,
                                  parameters=tf_params)

        # Construction arguments kept for later inspection.
        self.name_scope = name_scope
        self.mlp_config = mlp_config
        self.input_norm = input_norm
        self.output_norm = output_norm
        self.output_low = output_low
        self.output_high = output_high

        # Graph handles: the MLP input is the state tensor itself and the
        # value estimate is the MLP output.
        self.state_input = state_input
        self.mlp_input_ph = state_input
        self.mlp_net = network
        self.v_tensor = network.output
    def __init__(self,
                 env_spec: EnvSpec,
                 name_scope: str,
                 name: str,
                 mlp_config: list,
                 learning_rate: float,
                 output_norm: np.ndarray = None,
                 input_norm: np.ndarray = None,
                 output_low: np.ndarray = None,
                 output_high: np.ndarray = None,
                 init_state=None):
        """Build the graph of an MLP dynamics model that predicts a state delta.

        The network input is the concatenation of a state placeholder and an
        action placeholder; its output is added to the state placeholder to
        form ``new_state_output``. The loss and optimizer are obtained from
        ``self._setup_loss()``.

        :param env_spec: environment spec; ``obs_space`` must be a ``Box`` and
            ``flat_obs_dim`` / ``flat_action_dim`` size the placeholders.
        :param name_scope: TensorFlow variable scope used for every op built
            here; also forwarded to ``MLP``.
        :param name: name forwarded to ``GlobalDynamicsModel.__init__`` and
            used as prefix of the parameter container's name.
        :param mlp_config: layer configuration list forwarded to ``MLP``; the
            network output width must equal ``env_spec.flat_obs_dim``.
        :param learning_rate: stored in the parameter container.
        :param output_norm: forwarded to ``MLP``.
        :param input_norm: forwarded to ``MLP`` and stored in the parameters.
        :param output_low: lower bound of the state; stored in the parameters.
        :param output_high: upper bound of the state; stored in the parameters.
            When both bounds are given the MLP output range is set to
            ``[output_low - output_high, output_high - output_low]``; when
            both are omitted no bound is passed to ``MLP``.
        :param init_state: forwarded to ``GlobalDynamicsModel.__init__``.
        :raises TypeError: if ``env_spec.obs_space`` is not a ``Box``, or if
            only one of ``output_low`` / ``output_high`` is supplied.
        """
        if not isinstance(env_spec.obs_space, Box):
            raise TypeError(
                'ContinuousMLPGlobalDynamicsModel only support to predict state that hold space Box type'
            )

        # Both bounds default to None, so the subtraction below must be
        # guarded: previously calling with the defaults raised a TypeError.
        if output_low is None and output_high is None:
            delta_low = None
            delta_high = None
        elif output_low is None or output_high is None:
            raise TypeError(
                'output_low and output_high must be provided together')
        else:
            # The network predicts a state *difference*, whose range is the
            # width of the state range in either direction.
            delta_high = output_high - output_low
            delta_low = output_low - output_high

        GlobalDynamicsModel.__init__(self,
                                     env_spec=env_spec,
                                     parameters=None,
                                     name=name,
                                     init_state=init_state)

        with tf.variable_scope(name_scope):
            state_input = tf.placeholder(shape=[None, env_spec.flat_obs_dim],
                                         dtype=tf.float32,
                                         name='state_ph')
            action_input = tf.placeholder(
                shape=[None, env_spec.flat_action_dim],
                dtype=tf.float32,
                name='action_ph')
            mlp_input_ph = tf.concat([state_input, action_input],
                                     axis=1,
                                     name='state_action_input')
            delta_state_label_ph = tf.placeholder(
                shape=[None, env_spec.flat_obs_dim],
                dtype=tf.float32,
                name='delta_state_label_ph')
        mlp_net = MLP(
            input_ph=mlp_input_ph,
            reuse=False,
            mlp_config=mlp_config,
            input_norm=input_norm,
            output_norm=output_norm,
            # todo have a running-up mean module
            output_high=delta_high,
            output_low=delta_low,
            name_scope=name_scope,
            net_name='mlp')
        assert mlp_net.output.shape[1] == env_spec.flat_obs_dim

        parameters = ParametersWithTensorflowVariable(
            tf_var_list=mlp_net.var_list,
            name=name + '_mlp_continuous_dynamics_model',
            rest_parameters=dict(output_low=output_low,
                                 output_high=output_high,
                                 input_norm=input_norm,
                                 learning_rate=learning_rate))
        with tf.variable_scope(name_scope):
            with tf.variable_scope('train'):
                # Predicted next state = current state + predicted delta.
                new_state_output = mlp_net.output + state_input

        DerivableDynamics.__init__(
            self,
            input_node_dict=dict(state_input=state_input,
                                 action_action_input=action_input),
            output_node_dict=dict(new_state_output=new_state_output))
        PlaceholderInput.__init__(self,
                                  inputs=(state_input, action_input,
                                          delta_state_label_ph),
                                  parameters=parameters)

        self.mlp_config = mlp_config
        self.name_scope = name_scope
        self.action_input = action_input
        self.state_input = state_input
        self.mlp_input_ph = mlp_input_ph
        self.delta_state_label_ph = delta_state_label_ph
        self.new_state_output = new_state_output
        self.mlp_net = mlp_net

        self._status = StatusWithSubInfo(obj=self)

        with tf.variable_scope(name_scope):
            with tf.variable_scope('train'):
                self.loss, self.optimizer, self.optimize_op = self._setup_loss()

        # Global variables created under '<name_scope>/train' together with
        # the optimizer's own variables, de-duplicated and ordered by name.
        train_var_list = get_tf_collection_var_list(
            key=tf.GraphKeys.GLOBAL_VARIABLES,
            scope='{}/train'.format(name_scope)) + self.optimizer.variables()

        self.parameters.set_tf_var_list(
            sorted(set(train_var_list), key=lambda x: x.name))
示例#3
0
    def __init__(self,
                 env_spec: EnvSpec,
                 name: str,
                 name_scope: str,
                 mlp_config: list,
                 input_norm: np.ndarray = None,
                 output_norm: np.ndarray = None,
                 output_low: np.ndarray = None,
                 output_high: np.ndarray = None,
                 reuse=False,
                 distribution_tensors_tuple: tuple = None):
        """Build a diagonal-Gaussian policy whose mean comes from an MLP.

        Two construction modes exist:

        * ``distribution_tensors_tuple`` given: the mean is taken from
          ``distribution_tensors_tuple[0][0]`` and the log-variance from
          ``distribution_tensors_tuple[1][0]``; no MLP and no state
          placeholder are created.
        * otherwise: a state placeholder and an MLP are created, the MLP
          output is the mean, and the log-variance is the sum over axis 0 of
          a variable named ``normal_distribution_variance``.

        In both modes an action placeholder, the standard deviation
        ``exp(logvar / 2)``, the variance ``exp(logvar)``, a
        ``MultivariateNormalDiag`` distribution and a sampled action tensor
        are created under ``name_scope``.

        :param env_spec: environment spec providing ``flat_obs_dim`` and
            ``flat_action_dim``.
        :param name: name forwarded to ``StochasticPolicy.__init__``.
        :param name_scope: TensorFlow variable scope for every op built here.
        :param mlp_config: layer configuration list; the last layer's
            ``N_UNITS`` must equal the flat action dimension.
        :param input_norm: forwarded to ``MLP`` and kept as an attribute.
        :param output_norm: forwarded to ``MLP`` and kept as an attribute.
        :param output_low: forwarded to ``MLP`` and kept as an attribute.
        :param output_high: forwarded to ``MLP`` and kept as an attribute.
        :param reuse: forwarded to ``MLP`` and to the variable scopes.
        :param distribution_tensors_tuple: optional pre-built
            ``(mean, logvar)`` tensors, indexed as described above.
        """
        StochasticPolicy.__init__(self,
                                  env_spec=env_spec,
                                  name=name,
                                  parameters=None)
        obs_dim = env_spec.flat_obs_dim
        action_dim = env_spec.flat_action_dim
        assert action_dim == mlp_config[-1]['N_UNITS']
        self.mlp_config = mlp_config
        self.input_norm = input_norm
        self.output_norm = output_norm
        self.output_low = output_low
        self.output_high = output_high
        self.name_scope = name_scope

        mlp_kwargs = dict(reuse=reuse,
                          input_norm=input_norm,
                          output_norm=output_norm,
                          output_low=output_low,
                          output_high=output_high,
                          mlp_config=mlp_config,
                          name_scope=name_scope)
        ph_inputs = []
        if distribution_tensors_tuple is not None:
            self.mean_output = distribution_tensors_tuple[0][0]
            self.logvar_output = distribution_tensors_tuple[1][0]
            assert list(self.mean_output.shape)[-1] == action_dim
            assert list(self.logvar_output.shape)[-1] == action_dim
            self.mlp_net = None
            # This constructor assigns ``state_input`` only on the MLP branch,
            # yet it is read below when the parameters are assembled. Keep a
            # value a base class may have set, otherwise fall back to None.
            # NOTE(review): whether StochasticPolicy defines ``state_input``
            # is not visible from here - confirm.
            self.state_input = getattr(self, 'state_input', None)
        else:
            with tf.variable_scope(self.name_scope):
                self.state_input = tf.placeholder(shape=[None, obs_dim],
                                                  dtype=tf.float32,
                                                  name='state_ph')
                ph_inputs.append(self.state_input)
            self.mlp_net = MLP(input_ph=self.state_input,
                               net_name='normal_distribution_mlp_policy',
                               **mlp_kwargs)
            self.mean_output = self.mlp_net.output
            with tf.variable_scope(name_scope, reuse=reuse):
                with tf.variable_scope('norm_dist', reuse=reuse):
                    # Number of rows of the log-variance variable, derived
                    # from the width of the second-to-last configured layer.
                    logvar_speed = (10 * self.mlp_config[-2]['N_UNITS']) // 48
                    logvar_output = tf.get_variable(
                        name='normal_distribution_variance',
                        shape=[logvar_speed, self.mlp_config[-1]['N_UNITS']],
                        dtype=tf.float32)
                    # The rows are summed into one log-variance per action
                    # dimension.
                    self.logvar_output = tf.reduce_sum(logvar_output, axis=0)
        with tf.variable_scope(name_scope, reuse=reuse):
            self.action_input = tf.placeholder(shape=[None, action_dim],
                                               dtype=tf.float32,
                                               name='action_ph')
            ph_inputs.append(self.action_input)
            with tf.variable_scope('norm_dist', reuse=reuse):
                self.stddev_output = tf.exp(self.logvar_output / 2.0,
                                            name='std_dev')
                self.var_output = tf.exp(self.logvar_output, name='variance')
                self.action_distribution = tfp.distributions.MultivariateNormalDiag(
                    loc=self.mean_output,
                    scale_diag=self.stddev_output,
                    name='mlp_normal_distribution')
                self.action_output = self.action_distribution.sample()
        # Callables exposed by name for querying distribution information.
        self.dist_info_tensor_op_dict = {
            # todo support more in future
            'prob': self.action_distribution.prob,
            'log_prob': self.action_distribution.log_prob,
            'entropy': self.action_distribution.entropy,
            'kl': self.kl
        }
        var_list = get_tf_collection_var_list(
            scope='{}/norm_dist'.format(name_scope))
        if self.mlp_net:
            var_list += self.mlp_net.var_list

        self.parameters = ParametersWithTensorflowVariable(
            tf_var_list=sorted(set(var_list), key=lambda x: x.name),
            rest_parameters=dict(state_input=self.state_input,
                                 action_input=self.action_input,
                                 **mlp_kwargs),
            name='normal_distribution_mlp_tf_param')
        PlaceholderInput.__init__(self,
                                  parameters=self.parameters,
                                  inputs=tuple(ph_inputs))
示例#4
0
    def __init__(
        self,
        env_spec: EnvSpec,
        name: str,
        name_scope: str,
        mlp_config: list,
        state_input: tf.Tensor = None,
        action_input: tf.Tensor = None,
        reuse=False,
        input_norm: np.ndarray = None,
        output_norm: np.ndarray = None,
        output_low: np.ndarray = None,
        output_high: np.ndarray = None,
    ):
        """Build the input tensors and MLP graph of a Q-value function.

        :param env_spec: environment spec; ``flat_obs_dim`` and
            ``flat_action_dim`` size the placeholders created when the
            corresponding input tensor is not given.
        :param name: name forwarded to ``QValueFunction.__init__`` and used
            to name the parameter container.
        :param name_scope: scope name under which placeholders and the
            concatenated input are created; also forwarded to ``MLP``.
        :param mlp_config: layer configuration list forwarded to ``MLP``.
        :param state_input: optional existing state tensor.
        :param action_input: optional existing action tensor.
        :param reuse: forwarded to ``MLP``.
        :param input_norm: forwarded to ``MLP`` and kept as an attribute.
        :param output_norm: forwarded to ``MLP`` and kept as an attribute.
        :param output_low: forwarded to ``MLP`` and kept as an attribute.
        :param output_high: forwarded to ``MLP`` and kept as an attribute.
        """
        # The name scope is always entered; placeholders are only created
        # for inputs the caller did not provide (state first, then action).
        with tf.name_scope(name_scope):
            if state_input is None:
                state_input = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, env_spec.flat_obs_dim],
                    name='state_ph')
            if action_input is None:
                action_input = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, env_spec.flat_action_dim],
                    name='action_ph')

        # The network consumes state and action joined along axis 1.
        with tf.variable_scope(name_scope):
            joint_input = tf.concat([state_input, action_input],
                                    axis=1,
                                    name='state_action_input')

        net_kwargs = {
            'reuse': reuse,
            'mlp_config': mlp_config,
            'input_norm': input_norm,
            'output_norm': output_norm,
            'output_high': output_high,
            'output_low': output_low,
            'name_scope': name_scope,
        }
        network = MLP(input_ph=joint_input, net_name='mlp', **net_kwargs)

        # The stored parameters are a copy of the network kwargs with the
        # function's name appended as the last entry.
        stored_kwargs = dict(net_kwargs)
        stored_kwargs['name'] = name
        tf_params = ParametersWithTensorflowVariable(
            tf_var_list=network.var_list,
            rest_parameters=stored_kwargs,
            default_save_type='tf',
            name='{}_tf_param'.format(name))

        # The Q-function base is initialised without parameters; the
        # PlaceholderInput initialiser receives the real parameter object.
        QValueFunction.__init__(self,
                                env_spec=env_spec,
                                name=name,
                                action_input=action_input,
                                state_input=state_input,
                                parameters=None)
        PlaceholderInput.__init__(self,
                                  parameters=tf_params,
                                  inputs=joint_input)

        # Construction arguments kept for later inspection.
        self.name_scope = name_scope
        self.mlp_config = mlp_config
        self.input_norm = input_norm
        self.output_norm = output_norm
        self.output_low = output_low
        self.output_high = output_high

        # Graph handles: the Q estimate is the MLP output.
        self.mlp_input_ph = joint_input
        self.mlp_net = network
        self.q_tensor = network.output