def __init__(self,
             input_ph: tf.Tensor,
             name_scope: str,
             net_name: str,
             reuse,
             mlp_config: list,
             input_norm: np.ndarray = None,
             output_norm: np.ndarray = None,
             output_low: np.ndarray = None,
             output_high: np.ndarray = None):
    self.input_ph = input_ph
    self.name_scope = name_scope
    self.mlp_config = mlp_config
    self.mlp_net_name = net_name
    self.net, self.output, self.var_list = MLPCreator.create_network_with_tf_layers(
        input=input_ph,
        reuse=reuse,
        network_config=mlp_config,
        tf_var_scope=name_scope,
        net_name=net_name,
        input_norm=input_norm,
        output_high=output_high,
        output_low=output_low,
        output_norm=output_norm)
    for var in self.var_list:
        assert name_scope in var.name
    self._parameters = ParametersWithTensorflowVariable(
        tf_var_list=self.var_list,
        name='parameters_{}'.format(self.mlp_net_name),
        rest_parameters=dict())
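# --- Usage sketch (added; not from the source) --------------------------------
# A minimal sketch of constructing an MLP. The layer-dict keys TYPE/ACT/NAME
# are assumptions; only 'N_UNITS' is confirmed by the asserts in the policy
# constructors below (the last layer's 'N_UNITS' must equal the output dim).
hypothetical_mlp_config = [
    dict(TYPE='DENSE', N_UNITS=64, ACT='RELU', NAME='hidden_1'),
    dict(TYPE='DENSE', N_UNITS=1, ACT='LINEAR', NAME='output'),
]
state_ph = tf.placeholder(shape=[None, 4], dtype=tf.float32, name='state_ph')
mlp = MLP(input_ph=state_ph,
          name_scope='demo_mlp',
          net_name='demo_mlp',
          reuse=False,
          mlp_config=hypothetical_mlp_config)
# mlp.output is the network's output tensor; mlp.var_list holds its variables.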
def __init__(self,
             env_spec: EnvSpec,
             batch_data,
             init_state=None,
             name_scope='gp_dynamics_model',
             name='gp_dynamics_model',
             gp_kernel_type='RBF'):
    if gp_kernel_type not in self.kernel_type_dict.keys():
        raise TypeError('Not supported {} kernel, choose from {}'.format(
            gp_kernel_type, list(self.kernel_type_dict.keys())))
    parameters = ParametersWithTensorflowVariable(
        tf_var_list=[],
        rest_parameters=dict(),
        name='{}_param'.format(name),
        require_snapshot=False)
    super().__init__(env_spec, parameters, init_state, name)
    self.name_scope = name_scope
    state_action_data = np.hstack(
        (batch_data.state_set, batch_data.action_set))
    delta_state_data = batch_data.new_state_set - batch_data.state_set
    with tf.variable_scope(self.name_scope):
        self.mgpr_model = MGPR(name='mgpr',
                               action_dim=env_spec.flat_action_dim,
                               x=state_action_data,
                               y=delta_state_data,
                               state_dim=env_spec.flat_obs_dim)
    var_list = get_tf_collection_var_list(
        key=tf.GraphKeys.GLOBAL_VARIABLES,
        scope=self.name_scope)
    self.parameters.set_tf_var_list(
        tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
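# --- Usage sketch (added; not from the source) --------------------------------
# `batch_data` is duck-typed: the constructor only reads `state_set`,
# `action_set` and `new_state_set`. The class name `GPDynamicsModel`, the
# existing `env_spec`, and the shapes below are assumptions for illustration.
class FakeBatch:
    def __init__(self, n, obs_dim, act_dim):
        self.state_set = np.random.randn(n, obs_dim)
        self.action_set = np.random.randn(n, act_dim)
        # the model is fit on the deltas new_state_set - state_set
        self.new_state_set = self.state_set + 0.01 * np.random.randn(n, obs_dim)

gp_model = GPDynamicsModel(env_spec=env_spec,
                           batch_data=FakeBatch(n=100,
                                                obs_dim=env_spec.flat_obs_dim,
                                                act_dim=env_spec.flat_action_dim))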
def __init__(self,
             env_spec,
             config_or_config_dict: (DictConfig, dict),
             value_func: MLPQValueFunction,
             schedule_param_list=None,
             name: str = 'dqn',
             replay_buffer=None):
    ModelFreeAlgo.__init__(self, env_spec=env_spec, name=name)
    self.config = construct_dict_config(config_or_config_dict, self)
    if replay_buffer:
        # `replay_buffer` is an instance, so check with isinstance, not issubclass
        assert isinstance(replay_buffer, BaseReplayBuffer)
        self.replay_buffer = replay_buffer
    else:
        self.replay_buffer = UniformRandomReplayBuffer(
            limit=self.config('REPLAY_BUFFER_SIZE'),
            action_shape=self.env_spec.action_shape,
            observation_shape=self.env_spec.obs_shape)
    self.q_value_func = value_func
    self.state_input = self.q_value_func.state_input
    self.action_input = self.q_value_func.action_input
    self.update_target_q_every_train = self.config('UPDATE_TARGET_Q_FREQUENCY') \
        if 'UPDATE_TARGET_Q_FREQUENCY' in self.config.config_dict else 1
    self.parameters = ParametersWithTensorflowVariable(
        tf_var_list=[],
        rest_parameters=dict(),
        to_scheduler_param_tuple=schedule_param_list,
        name='{}_param'.format(name),
        source_config=self.config,
        require_snapshot=False)
    with tf.variable_scope(name):
        self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.next_state_input = tf.placeholder(shape=[None, self.env_spec.flat_obs_dim],
                                               dtype=tf.float32)
        self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool)
        self.target_q_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        done = tf.cast(self.done_input, dtype=tf.float32)
        self.target_q_value_func = self.q_value_func.make_copy(
            name_scope='{}_target_q_value_net'.format(name),
            name='{}_target_q_value_net'.format(name),
            reuse=False)
        # one-step TD target: r + (1 - done) * gamma * target Q estimate
        self.predict_q_value = (1. - done) * self.config('GAMMA') * self.target_q_input \
            + self.reward_input
        self.td_error = self.predict_q_value - self.q_value_func.q_tensor
        with tf.variable_scope('train'):
            self.q_value_func_loss, self.optimizer, self.update_q_value_func_op = \
                self._set_up_loss()
            self.update_target_q_value_func_op = self._set_up_target_update()
    # sort for a deterministic variable ordering (set() does not preserve order)
    var_list = get_tf_collection_var_list(
        key=tf.GraphKeys.GLOBAL_VARIABLES,
        scope='{}/train'.format(name)) + self.optimizer.variables()
    self.parameters.set_tf_var_list(
        tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
    MultiPlaceholderInput.__init__(
        self,
        sub_placeholder_input_list=[
            dict(obj=self.q_value_func, attr_name='q_value_func'),
            dict(obj=self.target_q_value_func, attr_name='target_q_value_func')
        ],
        parameters=self.parameters)
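# --- Construction sketch (added; not from the source) -------------------------
# Assumes the enclosing class is named DQN and that `env_spec` and
# `hypothetical_mlp_config` (see the sketch near the top) exist.
# REPLAY_BUFFER_SIZE and GAMMA are read by the constructor above;
# UPDATE_TARGET_Q_FREQUENCY is optional and defaults to 1. The config may
# require further keys, validated elsewhere by construct_dict_config.
q_func = MLPQValueFunction(env_spec=env_spec,
                           name='q_func',
                           name_scope='q_func',
                           mlp_config=hypothetical_mlp_config)
dqn = DQN(env_spec=env_spec,
          config_or_config_dict=dict(REPLAY_BUFFER_SIZE=10000,
                                     GAMMA=0.99,
                                     UPDATE_TARGET_Q_FREQUENCY=50),
          value_func=q_func)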
def __init__(self,
             env_spec: EnvSpec,
             name_scope: str,
             name: str,
             mlp_config: list,
             state_input: tf.Tensor = None,
             reuse=False,
             input_norm: np.ndarray = None,
             output_norm: np.ndarray = None,
             output_low: np.ndarray = None,
             output_high: np.ndarray = None):
    with tf.variable_scope(name_scope):
        state_input = state_input if state_input is not None else tf.placeholder(
            shape=[None, env_spec.flat_obs_dim],
            dtype=tf.float32,
            name='state_ph')
    mlp_input_ph = state_input
    mlp_kwargs = dict(reuse=reuse,
                      mlp_config=mlp_config,
                      input_norm=input_norm,
                      output_norm=output_norm,
                      output_high=output_high,
                      output_low=output_low,
                      name_scope=name_scope)
    mlp_net = MLP(input_ph=mlp_input_ph, net_name='mlp', **mlp_kwargs)
    parameters = ParametersWithTensorflowVariable(
        tf_var_list=mlp_net.var_list,
        rest_parameters=mlp_kwargs,
        name='mlp_v_value_function_tf_param')
    VValueFunction.__init__(self,
                            env_spec=env_spec,
                            state_input=state_input,
                            name=name,
                            parameters=None)
    PlaceholderInput.__init__(self, inputs=mlp_input_ph, parameters=parameters)
    self.name_scope = name_scope
    self.mlp_config = mlp_config
    self.input_norm = input_norm
    self.output_norm = output_norm
    self.output_low = output_low
    self.output_high = output_high
    self.state_input = state_input
    self.mlp_input_ph = mlp_input_ph
    self.mlp_net = mlp_net
    self.v_tensor = self.mlp_net.output
def __init__(self,
             env_spec: EnvSpec,
             name_scope: str,
             name: str,
             mlp_config: list,
             input_norm: np.ndarray = None,
             output_norm: np.ndarray = None,
             output_low: np.ndarray = None,
             output_high: np.ndarray = None,
             reuse=False):
    DeterministicPolicy.__init__(self, env_spec=env_spec, name=name, parameters=None)
    obs_dim = env_spec.flat_obs_dim
    action_dim = env_spec.flat_action_dim
    # the final layer must emit one unit per action dimension
    assert action_dim == mlp_config[-1]['N_UNITS'], \
        'policy output layer N_UNITS={} must equal flat_action_dim={}'.format(
            mlp_config[-1]['N_UNITS'], action_dim)
    with tf.variable_scope(name_scope):
        state_input = tf.placeholder(shape=[None, obs_dim],
                                     dtype=tf.float32,
                                     name='state_ph')
    mlp_kwargs = dict(reuse=reuse,
                      input_norm=input_norm,
                      output_norm=output_norm,
                      output_low=output_low,
                      output_high=output_high,
                      mlp_config=mlp_config,
                      name_scope=name_scope)
    mlp_net = MLP(input_ph=state_input,
                  net_name='deterministic_mlp_policy',
                  **mlp_kwargs)
    PlaceholderInput.__init__(self, parameters=None)
    self.parameters = ParametersWithTensorflowVariable(
        tf_var_list=mlp_net.var_list,
        rest_parameters=mlp_kwargs,
        name='deterministic_mlp_policy_tf_param')
    self.state_input = state_input
    self.mlp_net = mlp_net
    self.action_tensor = mlp_net.output
    self.mlp_config = mlp_config
    self.input_norm = input_norm
    self.output_norm = output_norm
    self.output_low = output_low
    self.output_high = output_high
    self.name_scope = name_scope
def create_tf_parameters(self, name='test_tf_param'):
    with tf.variable_scope(name):
        a = tf.get_variable(shape=[3, 4], dtype=tf.float32, name='var_1')
        b = tf.get_variable(shape=[3, 4], dtype=tf.bool, name='var_2')
    conf = DictConfig(required_key_dict=Foo.required_key_dict,
                      config_dict=dict(var1=1, var2=0.01))
    param = ParametersWithTensorflowVariable(
        tf_var_list=[a, b],
        rest_parameters=dict(var3='sss'),
        name=name,
        source_config=conf,
        require_snapshot=True,
        to_ph_parameter_dict=dict(
            var1=tf.placeholder(shape=(), dtype=tf.int32)))
    return param, locals()
def create_ph(self, name):
    with tf.variable_scope(name):
        a = tf.get_variable(shape=[3, 4], dtype=tf.float32, name='var_1')
    conf = DictConfig(required_key_dict=Foo.required_key_dict,
                      config_dict=dict(var1=1, var2=0.01))
    param = ParametersWithTensorflowVariable(
        tf_var_list=[a],
        rest_parameters=dict(var3='sss'),
        name=name,
        source_config=conf,
        require_snapshot=True,
        to_ph_parameter_dict=dict(
            var1=tf.placeholder(shape=(), dtype=tf.int32)))
    param.init()
    a = PlaceholderInput(parameters=param, inputs=None)
    return a, locals()
def __init__(self,
             env_spec: EnvSpec,
             stochastic_policy: StochasticPolicy,
             config_or_config_dict: (DictConfig, dict),
             value_func: VValueFunction,
             warm_up_trajectories_number=5,
             use_time_index_flag=False,
             name='ppo'):
    ModelFreeAlgo.__init__(self,
                           env_spec=env_spec,
                           name=name,
                           warm_up_trajectories_number=warm_up_trajectories_number)
    self.use_time_index_flag = use_time_index_flag
    self.config = construct_dict_config(config_or_config_dict, self)
    self.policy = stochastic_policy
    self.value_func = value_func
    to_ph_parameter_dict = dict()
    self.trajectory_memory = TrajectoryData(env_spec=env_spec)
    self.transition_data_for_trajectory = TransitionData(env_spec=env_spec)
    self.value_func_train_data_buffer = None
    self.scaler = RunningStandardScaler(dims=self.env_spec.flat_obs_dim)
    if use_time_index_flag:
        # keep the appended time index effectively un-normalized:
        # zero mean, very large variance
        scale_last_time_index_mean = self.scaler._mean
        scale_last_time_index_mean[-1] = 0
        scale_last_time_index_var = self.scaler._var
        scale_last_time_index_var[-1] = 1000 * 1000
        self.scaler.set_param(mean=scale_last_time_index_mean,
                              var=scale_last_time_index_var)
    with tf.variable_scope(name):
        self.advantages_ph = tf.placeholder(tf.float32, (None,), 'advantages')
        self.v_func_val_ph = tf.placeholder(tf.float32, (None,), 'val_val_func')
        dist_info_list = self.policy.get_dist_info()
        self.old_dist_tensor = [
            (tf.placeholder(dtype=dist_info['dtype'],
                            shape=dist_info['shape'],
                            name=dist_info['name']), dist_info['name'])
            for dist_info in dist_info_list
        ]
        self.old_policy = self.policy.make_copy(
            reuse=False,
            name_scope='old_{}'.format(self.policy.name),
            name='old_{}'.format(self.policy.name),
            distribution_tensors_tuple=tuple(self.old_dist_tensor))
        to_ph_parameter_dict['beta'] = tf.placeholder(tf.float32, (), 'beta')
        to_ph_parameter_dict['eta'] = tf.placeholder(tf.float32, (), 'eta')
        to_ph_parameter_dict['kl_target'] = tf.placeholder(tf.float32, (), 'kl_target')
        to_ph_parameter_dict['lr_multiplier'] = tf.placeholder(tf.float32, (), 'lr_multiplier')
    self.parameters = ParametersWithTensorflowVariable(
        tf_var_list=[],
        rest_parameters=dict(advantages_ph=self.advantages_ph,
                             v_func_val_ph=self.v_func_val_ph),
        to_ph_parameter_dict=to_ph_parameter_dict,
        name='ppo_param',
        save_rest_param_flag=False,
        source_config=self.config,
        require_snapshot=False)
    with tf.variable_scope(name):
        with tf.variable_scope('train'):
            self.kl = tf.reduce_mean(self.old_policy.kl(self.policy))
            self.average_entropy = tf.reduce_mean(self.policy.entropy())
            self.policy_loss, self.policy_optimizer, self.policy_update_op = \
                self._setup_policy_loss()
            self.value_func_loss, self.value_func_optimizer, self.value_func_update_op = \
                self._setup_value_func_loss()
        var_list = get_tf_collection_var_list('{}/train'.format(name)) \
            + self.policy_optimizer.variables() \
            + self.value_func_optimizer.variables()
        self.parameters.set_tf_var_list(
            tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
    MultiPlaceholderInput.__init__(
        self,
        sub_placeholder_input_list=[
            dict(obj=self.value_func, attr_name='value_func'),
            dict(obj=self.policy, attr_name='policy')
        ],
        parameters=self.parameters)
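# --- Wiring sketch (added; not from the source) --------------------------------
# PPO pairs a stochastic policy with a state-value function. The class names
# PPO / NormalDistributionMLPPolicy / MLPVValueFunction, the mlp configs, and
# `ppo_config_dict` are assumptions; the required config keys are validated
# elsewhere by construct_dict_config and are not shown in this file. The
# policy config's last layer must have N_UNITS == flat_action_dim.
policy = NormalDistributionMLPPolicy(env_spec=env_spec,
                                     name='ppo_policy',
                                     name_scope='ppo_policy',
                                     mlp_config=hypothetical_policy_mlp_config)
value_func = MLPVValueFunction(env_spec=env_spec,
                               name='ppo_value_func',
                               name_scope='ppo_value_func',
                               mlp_config=hypothetical_value_mlp_config)
ppo = PPO(env_spec=env_spec,
          stochastic_policy=policy,
          value_func=value_func,
          config_or_config_dict=ppo_config_dict)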
def __init__(self,
             env_spec: EnvSpec,
             name: str,
             name_scope: str,
             mlp_config: list,
             state_input: tf.Tensor = None,
             action_input: tf.Tensor = None,
             reuse=False,
             input_norm: np.ndarray = None,
             output_norm: np.ndarray = None,
             output_low: np.ndarray = None,
             output_high: np.ndarray = None):
    with tf.name_scope(name_scope):
        state_input = state_input if state_input is not None else tf.placeholder(
            shape=[None, env_spec.flat_obs_dim],
            dtype=tf.float32,
            name='state_ph')
        action_input = action_input if action_input is not None else tf.placeholder(
            shape=[None, env_spec.flat_action_dim],
            dtype=tf.float32,
            name='action_ph')
    with tf.variable_scope(name_scope):
        mlp_input_ph = tf.concat([state_input, action_input],
                                 axis=1,
                                 name='state_action_input')
    mlp_net_kwargs = dict(reuse=reuse,
                          mlp_config=mlp_config,
                          input_norm=input_norm,
                          output_norm=output_norm,
                          output_high=output_high,
                          output_low=output_low,
                          name_scope=name_scope)
    mlp_net = MLP(input_ph=mlp_input_ph, net_name=name_scope, **mlp_net_kwargs)
    parameters = ParametersWithTensorflowVariable(
        tf_var_list=mlp_net.var_list,
        rest_parameters=dict(**mlp_net_kwargs, name=name),
        default_save_type='tf',
        name='{}_tf_param'.format(name))
    QValueFunction.__init__(self,
                            env_spec=env_spec,
                            name=name,
                            action_input=action_input,
                            state_input=state_input,
                            parameters=None)
    PlaceholderInput.__init__(self, parameters=parameters)
    self.mlp_config = mlp_config
    self.input_norm = input_norm
    self.output_norm = output_norm
    self.output_low = output_low
    self.output_high = output_high
    self.name_scope = name_scope
    self.mlp_input_ph = mlp_input_ph
    self.mlp_net = mlp_net
    self.q_tensor = self.mlp_net.output
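# --- Usage sketch (added; not from the source) --------------------------------
# Reusing `q_func` from the DQN sketch above: make_copy rebuilds the same
# architecture under a fresh scope (e.g. for a target network), and q_tensor
# is the Q(s, a) output computed from the concatenated (state, action) input.
target_q_func = q_func.make_copy(name_scope='target_q_func',
                                 name='target_q_func',
                                 reuse=False)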
def __init__(self,
             env_spec: EnvSpec,
             name_scope: str,
             name: str,
             mlp_config: list,
             learning_rate: float,
             output_norm: np.ndarray = None,
             input_norm: np.ndarray = None,
             output_low: np.ndarray = None,
             output_high: np.ndarray = None,
             init_state=None):
    if not isinstance(env_spec.obs_space, Box):
        raise TypeError(
            'ContinuousMLPGlobalDynamicsModel only supports observation spaces of type Box')
    GlobalDynamicsModel.__init__(self,
                                 env_spec=env_spec,
                                 parameters=None,
                                 name=name,
                                 init_state=init_state)
    with tf.variable_scope(name_scope):
        state_input = tf.placeholder(shape=[None, env_spec.flat_obs_dim],
                                     dtype=tf.float32,
                                     name='state_ph')
        action_input = tf.placeholder(shape=[None, env_spec.flat_action_dim],
                                      dtype=tf.float32,
                                      name='action_ph')
        mlp_input_ph = tf.concat([state_input, action_input],
                                 axis=1,
                                 name='state_action_input')
        delta_state_label_ph = tf.placeholder(shape=[None, env_spec.flat_obs_dim],
                                              dtype=tf.float32,
                                              name='delta_state_label_ph')
    mlp_net = MLP(input_ph=mlp_input_ph,
                  reuse=False,
                  mlp_config=mlp_config,
                  input_norm=input_norm,
                  output_norm=output_norm,
                  # todo have a running-up mean module
                  output_high=output_high - output_low,
                  output_low=output_low - output_high,
                  name_scope=name_scope,
                  net_name='mlp')
    assert mlp_net.output.shape[1] == env_spec.flat_obs_dim
    parameters = ParametersWithTensorflowVariable(
        tf_var_list=mlp_net.var_list,
        name=name + '_mlp_continuous_dynamics_model',
        rest_parameters=dict(output_low=output_low,
                             output_high=output_high,
                             input_norm=input_norm,
                             learning_rate=learning_rate))
    with tf.variable_scope(name_scope):
        with tf.variable_scope('train'):
            new_state_output = mlp_net.output + state_input
    DerivableDynamics.__init__(
        self,
        input_node_dict=dict(state_input=state_input,
                             action_action_input=action_input),
        output_node_dict=dict(new_state_output=new_state_output))
    PlaceholderInput.__init__(self,
                              inputs=(state_input, action_input, delta_state_label_ph),
                              parameters=parameters)
    self.mlp_config = mlp_config
    self.name_scope = name_scope
    self.action_input = action_input
    self.state_input = state_input
    self.mlp_input_ph = mlp_input_ph
    self.delta_state_label_ph = delta_state_label_ph
    self.new_state_output = new_state_output
    self.mlp_net = mlp_net
    self._status = StatusWithSubInfo(obj=self)
    with tf.variable_scope(name_scope):
        with tf.variable_scope('train'):
            self.loss, self.optimizer, self.optimize_op = self._setup_loss()
    train_var_list = get_tf_collection_var_list(
        key=tf.GraphKeys.GLOBAL_VARIABLES,
        scope='{}/train'.format(name_scope)) + self.optimizer.variables()
    self.parameters.set_tf_var_list(
        sorted(list(set(train_var_list)), key=lambda x: x.name))
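# Note on the bound arithmetic above (added): the network predicts a state
# delta, not the next state. If both s and s' lie in [output_low, output_high],
# then s' - s lies in [output_low - output_high, output_high - output_low],
# which is why the swapped differences are passed as the MLP's output bounds.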
def __init__(self,
             env_spec: EnvSpec,
             name: str,
             name_scope: str,
             mlp_config: list,
             input_norm: np.ndarray = None,
             output_norm: np.ndarray = None,
             output_low: np.ndarray = None,
             output_high: np.ndarray = None,
             reuse=False,
             distribution_tensors_tuple: tuple = None):
    StochasticPolicy.__init__(self, env_spec=env_spec, name=name, parameters=None)
    obs_dim = env_spec.flat_obs_dim
    action_dim = env_spec.flat_action_dim
    assert action_dim == mlp_config[-1]['N_UNITS']
    self.mlp_config = mlp_config
    self.input_norm = input_norm
    self.output_norm = output_norm
    self.output_low = output_low
    self.output_high = output_high
    self.name_scope = name_scope
    mlp_kwargs = dict(reuse=reuse,
                      input_norm=input_norm,
                      output_norm=output_norm,
                      output_low=output_low,
                      output_high=output_high,
                      mlp_config=mlp_config,
                      name_scope=name_scope)
    ph_inputs = []
    if distribution_tensors_tuple is not None:
        # reuse externally supplied mean/log-variance tensors (e.g. for the
        # old-policy copy created by PPO); no state placeholder is built here
        self.mean_output = distribution_tensors_tuple[0][0]
        self.logvar_output = distribution_tensors_tuple[1][0]
        assert list(self.mean_output.shape)[-1] == action_dim
        assert list(self.logvar_output.shape)[-1] == action_dim
        self.mlp_net = None
        self.state_input = None
    else:
        with tf.variable_scope(self.name_scope):
            self.state_input = tf.placeholder(shape=[None, obs_dim],
                                              dtype=tf.float32,
                                              name='state_ph')
            ph_inputs.append(self.state_input)
        self.mlp_net = MLP(input_ph=self.state_input,
                           net_name='normal_distribution_mlp_policy',
                           **mlp_kwargs)
        self.mean_output = self.mlp_net.output
        with tf.variable_scope(name_scope, reuse=reuse):
            with tf.variable_scope('norm_dist', reuse=reuse):
                logvar_speed = (10 * self.mlp_config[-2]['N_UNITS']) // 48
                logvar_output = tf.get_variable(
                    name='normal_distribution_variance',
                    shape=[logvar_speed, self.mlp_config[-1]['N_UNITS']],
                    dtype=tf.float32)
                # self.logvar_output = tf.reduce_sum(logvar_output, axis=0) + self.parameters('log_var_init')
                self.logvar_output = tf.reduce_sum(logvar_output, axis=0)
    with tf.variable_scope(name_scope, reuse=reuse):
        self.action_input = tf.placeholder(shape=[None, action_dim],
                                           dtype=tf.float32,
                                           name='action_ph')
        ph_inputs.append(self.action_input)
        with tf.variable_scope('norm_dist', reuse=reuse):
            self.stddev_output = tf.exp(self.logvar_output / 2.0, name='std_dev')
            self.var_output = tf.exp(self.logvar_output, name='variance')
            self.action_distribution = tfp.distributions.MultivariateNormalDiag(
                loc=self.mean_output,
                scale_diag=self.stddev_output,
                name='mlp_normal_distribution')
            self.action_output = self.action_distribution.sample()
    self.dist_info_tensor_op_dict = {
        # todo support more in future
        'prob': self.action_distribution.prob,
        'log_prob': self.action_distribution.log_prob,
        'entropy': self.action_distribution.entropy,
        'kl': self.kl
    }
    var_list = get_tf_collection_var_list(
        scope='{}/norm_dist'.format(name_scope))
    if self.mlp_net:
        var_list += self.mlp_net.var_list
    self.parameters = ParametersWithTensorflowVariable(
        tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name),
        rest_parameters=dict(state_input=self.state_input,
                             action_input=self.action_input,
                             **mlp_kwargs),
        name='normal_distribution_mlp_tf_param')
    PlaceholderInput.__init__(self, parameters=self.parameters, inputs=tuple(ph_inputs))
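# Worked example (added) for the log-variance parameterization above: with a
# second-to-last layer of 64 units, logvar_speed = (10 * 64) // 48 = 13, so
# the log-variance is the column-wise sum over a [13, action_dim] variable.
# Summing several identically-shaped rows is a known trick to enlarge the
# effective step size on the variance parameters relative to the mean network.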
def __init__(self,
             env_spec: EnvSpec,
             config_or_config_dict: (DictConfig, dict),
             value_func: MLPQValueFunction,
             policy: DeterministicMLPPolicy,
             schedule_param_list=None,
             name='ddpg',
             replay_buffer=None):
    """
    :param env_spec: environment specification, such as action space and observation space
    :param config_or_config_dict: configuration dictionary, e.g. learning rate or decay, if any
    :param value_func: value function
    :param policy: agent policy
    :param schedule_param_list: list of parameters (initial value, final value, schedule function) used to schedule the learning process, if any
    :param name: name of the algorithm class instance
    :param replay_buffer: replay buffer, if any
    """
    ModelFreeAlgo.__init__(self, env_spec=env_spec, name=name)
    config = construct_dict_config(config_or_config_dict, self)
    self.config = config
    self.actor = policy
    self.target_actor = self.actor.make_copy(
        name_scope='{}_target_actor'.format(self.name),
        name='{}_target_actor'.format(self.name),
        reuse=False)
    self.critic = value_func
    self.target_critic = self.critic.make_copy(
        name_scope='{}_target_critic'.format(self.name),
        name='{}_target_critic'.format(self.name),
        reuse=False)
    self.state_input = self.actor.state_input
    if replay_buffer:
        # `replay_buffer` is an instance, so check with isinstance, not issubclass
        assert isinstance(replay_buffer, BaseReplayBuffer)
        self.replay_buffer = replay_buffer
    else:
        self.replay_buffer = UniformRandomReplayBuffer(
            limit=self.config('REPLAY_BUFFER_SIZE'),
            action_shape=self.env_spec.action_shape,
            observation_shape=self.env_spec.obs_shape)
    # self.parameters contains all the parameters (variables) of the algorithm
    self.parameters = ParametersWithTensorflowVariable(
        tf_var_list=[],
        rest_parameters=dict(),
        to_scheduler_param_tuple=schedule_param_list,
        name='ddpg_param',
        source_config=config,
        require_snapshot=False)
    self._critic_with_actor_output = self.critic.make_copy(
        reuse=True,
        name='actor_input_{}'.format(self.critic.name),
        state_input=self.state_input,
        action_input=self.actor.action_tensor)
    self._target_critic_with_target_actor_output = self.target_critic.make_copy(
        reuse=True,
        name='target_critic_with_target_actor_output_{}'.format(self.critic.name),
        action_input=self.target_actor.action_tensor)
    with tf.variable_scope(name):
        self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.next_state_input = tf.placeholder(shape=[None, self.env_spec.flat_obs_dim],
                                               dtype=tf.float32)
        self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool)
        self.target_q_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        done = tf.cast(self.done_input, dtype=tf.float32)
        # one-step TD target: r + (1 - done) * gamma * target Q estimate
        self.predict_q_value = (1. - done) * self.config('GAMMA') * self.target_q_input \
            + self.reward_input
        with tf.variable_scope('train'):
            self.critic_loss, self.critic_update_op, self.target_critic_update_op, \
                self.critic_optimizer, self.critic_grads = self._setup_critic_loss()
            self.actor_loss, self.actor_update_op, self.target_actor_update_op, \
                self.action_optimizer, self.actor_grads = self._set_up_actor_loss()
    var_list = get_tf_collection_var_list('{}/train'.format(name)) \
        + self.critic_optimizer.variables() + self.action_optimizer.variables()
    self.parameters.set_tf_var_list(
        tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
    MultiPlaceholderInput.__init__(
        self,
        sub_placeholder_input_list=[
            dict(obj=self.target_actor, attr_name='target_actor'),
            dict(obj=self.actor, attr_name='actor'),
            dict(obj=self.critic, attr_name='critic'),
            dict(obj=self.target_critic, attr_name='target_critic')
        ],
        parameters=self.parameters)
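# --- Wiring sketch (added; not from the source) --------------------------------
# Assumes the enclosing class is named DDPG and that `env_spec` and the mlp
# configs exist. Per the asserts above, the actor config's last layer must
# have N_UNITS == flat_action_dim, while the critic emits a single Q value.
# REPLAY_BUFFER_SIZE and GAMMA are read here; further required keys depend on
# the config's required-key list, which is not shown in this file.
actor = DeterministicMLPPolicy(env_spec=env_spec,
                               name='actor',
                               name_scope='actor',
                               mlp_config=hypothetical_actor_mlp_config)
critic = MLPQValueFunction(env_spec=env_spec,
                           name='critic',
                           name_scope='critic',
                           mlp_config=hypothetical_critic_mlp_config)
ddpg = DDPG(env_spec=env_spec,
            config_or_config_dict=dict(REPLAY_BUFFER_SIZE=100000, GAMMA=0.99),
            value_func=critic,
            policy=actor)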
def __init__(self,
             env_spec: EnvSpec,
             name_scope: str,
             name: str,
             mlp_config: list,
             learning_rate: float,
             state_input_scaler: DataScaler = None,
             action_input_scaler: DataScaler = None,
             output_delta_state_scaler: DataScaler = None,
             init_state=None):
    if not isinstance(env_spec.obs_space, Box):
        raise TypeError(
            'ContinuousMLPGlobalDynamicsModel only supports observation spaces of type Box')
    GlobalDynamicsModel.__init__(self,
                                 env_spec=env_spec,
                                 parameters=None,
                                 name=name,
                                 state_input_scaler=state_input_scaler,
                                 action_input_scaler=action_input_scaler,
                                 init_state=init_state)
    with tf.variable_scope(name_scope):
        state_input = tf.placeholder(shape=[None, env_spec.flat_obs_dim],
                                     dtype=tf.float32,
                                     name='state_ph')
        action_input = tf.placeholder(shape=[None, env_spec.flat_action_dim],
                                      dtype=tf.float32,
                                      name='action_ph')
        mlp_input_ph = tf.concat([state_input, action_input],
                                 axis=1,
                                 name='state_action_input')
        delta_state_label_ph = tf.placeholder(shape=[None, env_spec.flat_obs_dim],
                                              dtype=tf.float32,
                                              name='delta_state_label_ph')
    mlp_net = MLP(input_ph=mlp_input_ph,
                  reuse=False,
                  mlp_config=mlp_config,
                  name_scope=name_scope,
                  net_name='mlp')
    if mlp_net.output.shape[1] != env_spec.flat_obs_dim:
        raise InappropriateParameterSetting(
            "mlp output dims {} != env spec obs dim {}".format(
                mlp_net.output.shape[1], env_spec.flat_obs_dim))
    parameters = ParametersWithTensorflowVariable(
        tf_var_list=mlp_net.var_list,
        name=name + '_mlp_continuous_dynamics_model',
        rest_parameters=dict(learning_rate=learning_rate))
    DifferentiableDynamics.__init__(
        self,
        input_node_dict=dict(state_input=state_input,
                             action_action_input=action_input),
        output_node_dict=dict(delta_state_output=mlp_net.output))
    PlaceholderInput.__init__(self, parameters=parameters)
    self.mlp_config = mlp_config
    self.name_scope = name_scope
    self.action_input = action_input
    self.state_input = state_input
    self.mlp_input_ph = mlp_input_ph
    self.delta_state_label_ph = delta_state_label_ph
    self.delta_state_output = mlp_net.output
    self.mlp_net = mlp_net
    self.output_delta_state_scaler = output_delta_state_scaler \
        if output_delta_state_scaler else IdenticalDataScaler(dims=self.env_spec.flat_obs_dim)
    self._status = StatusWithSubInfo(obj=self)
    with tf.variable_scope(name_scope):
        with tf.variable_scope('train'):
            self.loss, self.optimizer, self.optimize_op = self._setup_loss()
    train_var_list = get_tf_collection_var_list(
        key=tf.GraphKeys.GLOBAL_VARIABLES,
        scope='{}/train'.format(name_scope)) + self.optimizer.variables()
    self.parameters.set_tf_var_list(
        sorted(list(set(train_var_list)), key=lambda x: x.name))
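# --- Construction sketch (added; not from the source) --------------------------
# The scalers are optional; output_delta_state_scaler falls back to
# IdenticalDataScaler as shown above. The MLP's last layer must emit
# env_spec.flat_obs_dim units, or InappropriateParameterSetting is raised.
# `hypothetical_dynamics_mlp_config` is assumed to follow the layer-dict
# format sketched near the top of this file.
dynamics = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec,
                                            name='dynamics',
                                            name_scope='dynamics',
                                            mlp_config=hypothetical_dynamics_mlp_config,
                                            learning_rate=1e-3)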