def __init__(self,
             env_spec: EnvSpec,
             name_scope: str,
             name: str,
             mlp_config: list,
             state_input: tf.Tensor = None,
             reuse=False,
             input_norm: np.ndarray = None,
             output_norm: np.ndarray = None,
             output_low: np.ndarray = None,
             output_high: np.ndarray = None):
    with tf.variable_scope(name_scope):
        # Reuse an externally supplied state tensor if given; otherwise create
        # a fresh placeholder for the flattened observation.
        state_input = state_input if state_input is not None else tf.placeholder(
            shape=[None, env_spec.flat_obs_dim],
            dtype=tf.float32,
            name='state_ph')
        mlp_input_ph = state_input
    mlp_kwargs = dict(reuse=reuse,
                      mlp_config=mlp_config,
                      input_norm=input_norm,
                      output_norm=output_norm,
                      output_high=output_high,
                      output_low=output_low,
                      name_scope=name_scope)
    mlp_net = MLP(input_ph=mlp_input_ph,
                  net_name='mlp',
                  **mlp_kwargs)
    parameters = ParametersWithTensorflowVariable(
        tf_var_list=mlp_net.var_list,
        rest_parameters=mlp_kwargs,
        name='mlp_v_value_function_tf_param')
    VValueFunction.__init__(self,
                            env_spec=env_spec,
                            state_input=state_input,
                            name=name,
                            parameters=None)
    PlaceholderInput.__init__(self, inputs=mlp_input_ph, parameters=parameters)

    self.name_scope = name_scope
    self.mlp_config = mlp_config
    self.input_norm = input_norm
    self.output_norm = output_norm
    self.output_low = output_low
    self.output_high = output_high
    self.state_input = state_input
    self.mlp_input_ph = mlp_input_ph
    self.mlp_net = mlp_net
    # The scalar state-value estimate V(s) is the MLP's output tensor.
    self.v_tensor = self.mlp_net.output
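# Usage sketch (illustrative only): builds the value function above and
# evaluates V(s) for a batch of observations. The class name `MLPVValueFunc`,
# the `env_spec` object, `obs_batch`, and any mlp_config keys other than
# 'N_UNITS' are assumptions about the surrounding library, not guaranteed by
# this snippet; the last layer must have N_UNITS=1 for a scalar value.
#
# v_func = MLPVValueFunc(env_spec=env_spec,
#                        name_scope='v_func',
#                        name='v_func',
#                        mlp_config=[dict(N_UNITS=16, ...),   # hidden layer
#                                    dict(N_UNITS=1, ...)])   # scalar output
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     values = sess.run(v_func.v_tensor,
#                       feed_dict={v_func.state_input: obs_batch})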
def __init__(self,
             env_spec: EnvSpec,
             name_scope: str,
             name: str,
             mlp_config: list,
             learning_rate: float,
             output_norm: np.ndarray = None,
             input_norm: np.ndarray = None,
             output_low: np.ndarray = None,
             output_high: np.ndarray = None,
             init_state=None):
    if not isinstance(env_spec.obs_space, Box):
        raise TypeError(
            'ContinuousMLPGlobalDynamicsModel only supports predicting states '
            'whose observation space is of type Box')
    GlobalDynamicsModel.__init__(self,
                                 env_spec=env_spec,
                                 parameters=None,
                                 name=name,
                                 init_state=init_state)
    with tf.variable_scope(name_scope):
        state_input = tf.placeholder(shape=[None, env_spec.flat_obs_dim],
                                     dtype=tf.float32,
                                     name='state_ph')
        action_input = tf.placeholder(shape=[None, env_spec.flat_action_dim],
                                      dtype=tf.float32,
                                      name='action_ph')
        # The network consumes the concatenated (state, action) pair and is
        # trained against the observed state difference s' - s.
        mlp_input_ph = tf.concat([state_input, action_input],
                                 axis=1,
                                 name='state_action_input')
        delta_state_label_ph = tf.placeholder(
            shape=[None, env_spec.flat_obs_dim],
            dtype=tf.float32,
            name='delta_state_label_ph')
        # Since the MLP predicts the delta s' - s, its output bounds are the
        # bounds of the difference: [low - high, high - low]. Guard against
        # the None defaults so an unbounded model can still be constructed.
        mlp_net = MLP(
            input_ph=mlp_input_ph,
            reuse=False,
            mlp_config=mlp_config,
            input_norm=input_norm,
            output_norm=output_norm,
            # todo have a running-up mean module
            output_high=output_high - output_low
            if output_high is not None and output_low is not None else None,
            output_low=output_low - output_high
            if output_low is not None and output_high is not None else None,
            name_scope=name_scope,
            net_name='mlp')
        assert mlp_net.output.shape[1] == env_spec.flat_obs_dim
    parameters = ParametersWithTensorflowVariable(
        tf_var_list=mlp_net.var_list,
        name=name + '_mlp_continuous_dynamics_model',
        rest_parameters=dict(output_low=output_low,
                             output_high=output_high,
                             input_norm=input_norm,
                             learning_rate=learning_rate))
    with tf.variable_scope(name_scope):
        with tf.variable_scope('train'):
            # The next-state prediction is the current state plus the
            # predicted delta.
            new_state_output = mlp_net.output + state_input
    DerivableDynamics.__init__(
        self,
        input_node_dict=dict(state_input=state_input,
                             action_input=action_input),
        output_node_dict=dict(new_state_output=new_state_output))
    PlaceholderInput.__init__(
        self,
        inputs=(state_input, action_input, delta_state_label_ph),
        parameters=parameters)

    self.mlp_config = mlp_config
    self.name_scope = name_scope
    self.action_input = action_input
    self.state_input = state_input
    self.mlp_input_ph = mlp_input_ph
    self.delta_state_label_ph = delta_state_label_ph
    self.new_state_output = new_state_output
    self.mlp_net = mlp_net
    self._status = StatusWithSubInfo(obj=self)

    with tf.variable_scope(name_scope):
        with tf.variable_scope('train'):
            self.loss, self.optimizer, self.optimize_op = self._setup_loss()
    # Collect both the train-scope variables and the optimizer's slot
    # variables so they are saved and restored together.
    train_var_list = get_tf_collection_var_list(
        key=tf.GraphKeys.GLOBAL_VARIABLES,
        scope='{}/train'.format(name_scope)) + self.optimizer.variables()
    self.parameters.set_tf_var_list(
        sorted(list(set(train_var_list)), key=lambda x: x.name))
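# Usage sketch (illustrative only): fits the dynamics model on observed
# transitions and queries next-state predictions. The class name
# `ContinuousMLPGlobalDynamicsModel` and the helper arrays (`state_batch`,
# `action_batch`, `next_state_batch`) are assumptions, not part of the code
# above. Note the regression target is the state difference s' - s, matching
# `delta_state_label_ph`.
#
# dyn = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec,
#                                        name_scope='dyn',
#                                        name='dyn',
#                                        mlp_config=mlp_config,
#                                        learning_rate=1e-3)
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     # One training step on delta-state labels.
#     _, loss = sess.run(
#         [dyn.optimize_op, dyn.loss],
#         feed_dict={dyn.state_input: state_batch,
#                    dyn.action_input: action_batch,
#                    dyn.delta_state_label_ph: next_state_batch - state_batch})
#     # Prediction: s + predicted delta.
#     next_states = sess.run(dyn.new_state_output,
#                            feed_dict={dyn.state_input: state_batch,
#                                       dyn.action_input: action_batch})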
def __init__(self,
             env_spec: EnvSpec,
             name: str,
             name_scope: str,
             mlp_config: list,
             input_norm: np.ndarray = None,
             output_norm: np.ndarray = None,
             output_low: np.ndarray = None,
             output_high: np.ndarray = None,
             reuse=False,
             distribution_tensors_tuple: tuple = None):
    StochasticPolicy.__init__(self, env_spec=env_spec, name=name, parameters=None)
    obs_dim = env_spec.flat_obs_dim
    action_dim = env_spec.flat_action_dim
    # The final MLP layer must emit one unit per action dimension (the mean).
    assert action_dim == mlp_config[-1]['N_UNITS']
    self.mlp_config = mlp_config
    self.input_norm = input_norm
    self.output_norm = output_norm
    self.output_low = output_low
    self.output_high = output_high
    self.name_scope = name_scope
    mlp_kwargs = dict(reuse=reuse,
                      input_norm=input_norm,
                      output_norm=output_norm,
                      output_low=output_low,
                      output_high=output_high,
                      mlp_config=mlp_config,
                      name_scope=name_scope)
    ph_inputs = []
    if distribution_tensors_tuple is not None:
        # Reuse externally supplied mean/log-variance tensors (e.g. from a
        # shared network) instead of building a new MLP.
        self.mean_output = distribution_tensors_tuple[0][0]
        self.logvar_output = distribution_tensors_tuple[1][0]
        assert list(self.mean_output.shape)[-1] == action_dim
        assert list(self.logvar_output.shape)[-1] == action_dim
        self.mlp_net = None
        # No state placeholder is created in this branch; set the attribute
        # explicitly so the rest_parameters dict below can reference it.
        self.state_input = None
    else:
        with tf.variable_scope(self.name_scope):
            self.state_input = tf.placeholder(shape=[None, obs_dim],
                                              dtype=tf.float32,
                                              name='state_ph')
            ph_inputs.append(self.state_input)
        self.mlp_net = MLP(input_ph=self.state_input,
                           net_name='normal_distribution_mlp_policy',
                           **mlp_kwargs)
        self.mean_output = self.mlp_net.output
        with tf.variable_scope(name_scope, reuse=reuse):
            with tf.variable_scope('norm_dist', reuse=reuse):
                # The log-variance is a trainable variable independent of the
                # state; logvar_speed replicates it row-wise so its effective
                # learning rate scales with the width of the last hidden layer.
                logvar_speed = (10 * self.mlp_config[-2]['N_UNITS']) // 48
                logvar_output = tf.get_variable(
                    name='normal_distribution_variance',
                    shape=[logvar_speed, self.mlp_config[-1]['N_UNITS']],
                    dtype=tf.float32)
                # self.logvar_output = tf.reduce_sum(logvar_output, axis=0) + self.parameters('log_var_init')
                self.logvar_output = tf.reduce_sum(logvar_output, axis=0)
    with tf.variable_scope(name_scope, reuse=reuse):
        self.action_input = tf.placeholder(shape=[None, action_dim],
                                           dtype=tf.float32,
                                           name='action_ph')
        ph_inputs.append(self.action_input)
        with tf.variable_scope('norm_dist', reuse=reuse):
            # Diagonal Gaussian: stddev = exp(logvar / 2), var = exp(logvar).
            self.stddev_output = tf.exp(self.logvar_output / 2.0, name='std_dev')
            self.var_output = tf.exp(self.logvar_output, name='variance')
            self.action_distribution = tfp.distributions.MultivariateNormalDiag(
                loc=self.mean_output,
                scale_diag=self.stddev_output,
                name='mlp_normal_distribution')
            self.action_output = self.action_distribution.sample()
        self.dist_info_tensor_op_dict = {
            # todo support more in future
            'prob': self.action_distribution.prob,
            'log_prob': self.action_distribution.log_prob,
            'entropy': self.action_distribution.entropy,
            'kl': self.kl
        }
    var_list = get_tf_collection_var_list(
        scope='{}/norm_dist'.format(name_scope))
    if self.mlp_net is not None:
        var_list += self.mlp_net.var_list
    self.parameters = ParametersWithTensorflowVariable(
        tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name),
        rest_parameters=dict(state_input=self.state_input,
                             action_input=self.action_input,
                             **mlp_kwargs),
        name='normal_distribution_mlp_tf_param')
    PlaceholderInput.__init__(self, parameters=self.parameters, inputs=tuple(ph_inputs))
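# Usage sketch (illustrative only): samples actions and their log-probabilities
# from the Gaussian policy above. The class name `NormalDistributionMLPPolicy`
# and the `env_spec`/`obs_batch`/`mlp_config` objects are assumptions; the last
# mlp_config layer's 'N_UNITS' must equal the flat action dimension.
#
# policy = NormalDistributionMLPPolicy(env_spec=env_spec,
#                                      name='policy',
#                                      name_scope='policy',
#                                      mlp_config=mlp_config)
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     actions, log_probs = sess.run(
#         [policy.action_output,
#          policy.action_distribution.log_prob(policy.action_output)],
#         feed_dict={policy.state_input: obs_batch})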
def __init__(self,
             env_spec: EnvSpec,
             name: str,
             name_scope: str,
             mlp_config: list,
             state_input: tf.Tensor = None,
             action_input: tf.Tensor = None,
             reuse=False,
             input_norm: np.ndarray = None,
             output_norm: np.ndarray = None,
             output_low: np.ndarray = None,
             output_high: np.ndarray = None):
    with tf.name_scope(name_scope):
        # Reuse externally supplied tensors when given, so the Q function can
        # be chained after e.g. a policy's action output.
        state_input = state_input if state_input is not None else tf.placeholder(
            shape=[None, env_spec.flat_obs_dim],
            dtype=tf.float32,
            name='state_ph')
        action_input = action_input if action_input is not None else tf.placeholder(
            shape=[None, env_spec.flat_action_dim],
            dtype=tf.float32,
            name='action_ph')
    with tf.variable_scope(name_scope):
        # Q(s, a): the MLP consumes the concatenated state-action pair.
        mlp_input_ph = tf.concat([state_input, action_input],
                                 axis=1,
                                 name='state_action_input')
    mlp_net_kwargs = dict(reuse=reuse,
                          mlp_config=mlp_config,
                          input_norm=input_norm,
                          output_norm=output_norm,
                          output_high=output_high,
                          output_low=output_low,
                          name_scope=name_scope)
    mlp_net = MLP(input_ph=mlp_input_ph,
                  net_name='mlp',
                  **mlp_net_kwargs)
    parameters = ParametersWithTensorflowVariable(
        tf_var_list=mlp_net.var_list,
        rest_parameters=dict(**mlp_net_kwargs, name=name),
        default_save_type='tf',
        name='{}_tf_param'.format(name))
    QValueFunction.__init__(self,
                            env_spec=env_spec,
                            name=name,
                            action_input=action_input,
                            state_input=state_input,
                            parameters=None)
    PlaceholderInput.__init__(self, parameters=parameters, inputs=mlp_input_ph)

    self.mlp_config = mlp_config
    self.input_norm = input_norm
    self.output_norm = output_norm
    self.output_low = output_low
    self.output_high = output_high
    self.name_scope = name_scope
    self.mlp_input_ph = mlp_input_ph
    self.mlp_net = mlp_net
    # The scalar action-value estimate Q(s, a) is the MLP's output tensor.
    self.q_tensor = self.mlp_net.output
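# Usage sketch (illustrative only): evaluates Q(s, a) for a batch, then shows
# how an external action tensor (e.g. a policy's output) can be wired in via
# `action_input`. The class name `MLPQValueFunction` is an assumption, as is
# the assumption that the QValueFunction base class stores `state_input` and
# `action_input` as attributes.
#
# q_func = MLPQValueFunction(env_spec=env_spec,
#                            name='q_func',
#                            name_scope='q_func',
#                            mlp_config=mlp_config)
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     q_values = sess.run(q_func.q_tensor,
#                         feed_dict={q_func.state_input: obs_batch,
#                                    q_func.action_input: action_batch})
#
# # DDPG-style chaining, reusing weights to score the policy's own actions:
# q_of_pi = MLPQValueFunction(env_spec=env_spec,
#                             name='q_func',
#                             name_scope='q_func',
#                             mlp_config=mlp_config,
#                             reuse=True,
#                             state_input=q_func.state_input,
#                             action_input=policy.action_output)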