def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimation
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
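# Invocation sketch (not part of the original file): `main` is typically driven from
# the command line. The hyperparameter values below are illustrative assumptions,
# not defaults taken from this repository.
#
# if __name__ == '__main__':
#     main(env_name='Hopper-v1', num_episodes=1000, gamma=0.995, lam=0.98,
#          kl_targ=0.003, batch_size=20, hid1_mult=10, policy_logvar=-1.0)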
class TrpoModel(TensorflowBasedModel):
    key_list = Config.load_json(file_path=CONFIG_KEY + '/trpoModelKey.json')

    def __init__(self, config, action_bound, obs_bound):
        super().__init__(config=config)
        self.obs_dim = self.config.config_dict['STATE_SPACE']
        self.obs_dim = self.obs_dim[0] + 1
        self.act_dim = self.config.config_dict['ACTION_SPACE'][0]
        with tf.variable_scope(name_or_scope=self.config.config_dict['NAME']):
            self.scaler = Scaler(self.obs_dim)
            self.val_func = NNValueFunction(self.obs_dim,
                                            hid1_mult=self.config.config_dict['HIDDEN_MULTIPLE'],
                                            name_scope=self.config.config_dict['NAME'])
            self.policy = Policy(self.obs_dim,
                                 self.act_dim,
                                 kl_targ=self.config.config_dict['KL_TARG'],
                                 hid1_mult=self.config.config_dict['HIDDEN_MULTIPLE'],
                                 policy_logvar=self.config.config_dict['POLICY_LOGVAR'],
                                 name_scope=self.config.config_dict['NAME'])
        self._real_trajectories = {'observes': [], 'actions': [], 'rewards': [], 'unscaled_obs': []}
        self._cyber_trajectories = {'observes': [], 'actions': [], 'rewards': [], 'unscaled_obs': []}
        self._real_trajectories_memory = deque(maxlen=self.config.config_dict['EPISODE_REAL_MEMORY_SIZE'])
        self._cyber_trajectories_memory = deque(maxlen=self.config.config_dict['EPISODE_CYBER_MEMORY_SIZE'])
        self._real_step_count = 0.0
        self._cyber_step_count = 0.0
        self.action_low = action_bound[0]
        self.action_high = action_bound[1]
        self._env_status = self.config.config_dict['REAL_ENVIRONMENT_STATUS']
        self.real_data_memory = Memory(limit=10000,
                                       action_shape=self.config.config_dict['ACTION_SPACE'],
                                       observation_shape=self.config.config_dict['STATE_SPACE'])
        self.simulation_data_memory = Memory(limit=10000,
                                             action_shape=self.config.config_dict['ACTION_SPACE'],
                                             observation_shape=self.config.config_dict['STATE_SPACE'])

    @property
    def env_status(self):
        return self._env_status

    @env_status.setter
    def env_status(self, new):
        assert (new == self.config.config_dict['REAL_ENVIRONMENT_STATUS']
                or new == self.config.config_dict['CYBER_ENVIRONMENT_STATUS'])
        self._env_status = new
        # TODO change
        if new == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            self.memory = self.real_data_memory
        elif new == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            self.memory = self.simulation_data_memory
        else:
            raise KeyError('Environment status does not exist')

    @property
    def memory_length(self):
        count = 0
        # self._save_trajectories_to_memory(reset_step_count=False)
        for episode in self.trajectories_memory:
            count += len(episode['observes'])
        return count

    @property
    def current_env_status(self):
        if self._env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            return 'REAL_ENVIRONMENT_STATUS'
        elif self._env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            return 'CYBER_ENVIRONMENT_STATUS'

    @property
    def trajectories_memory(self):
        if self._env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            return self._real_trajectories_memory
        elif self._env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            return self._cyber_trajectories_memory

    @property
    def trajectories(self):
        if self._env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            return self._real_trajectories
        elif self._env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            return self._cyber_trajectories

    @trajectories.setter
    def trajectories(self, new_val):
        if self._env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            self._real_trajectories = new_val
        elif self._env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            self._cyber_trajectories = new_val

    @property
    def step_count(self):
        if self._env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            return self._real_step_count
        elif self._env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            return self._cyber_step_count
        else:
            raise KeyError('Environment status does not exist')

    @step_count.setter
    def step_count(self, new_val):
        if self._env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            self._real_step_count = new_val
        elif self._env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            self._cyber_step_count = new_val
        else:
            raise KeyError('Environment status does not exist')

    def copy_model(self, new_model):
        assert isinstance(new_model, type(self))
        self.policy.copy_weight(new_model.policy)
        self.val_func.copy_weight(new_model.val_func)
        from copy import deepcopy
        self.scaler = deepcopy(new_model.scaler)

    def update(self):
        observes, actions, advantages, disc_sum_rew = self._return_train_data()
        # TODO FIX LOGGER AND UPDATE LOG DATA
        loss, entropy, kl, beta, lr_multiplier = self.policy.update(observes=observes,
                                                                    actions=actions,
                                                                    advantages=advantages,
                                                                    logger=None)
        loss_val, exp_var, old_exp_var = self.val_func.fit(x=observes, y=disc_sum_rew, logger=None)
        res_dict = {
            self.name + '_POLICY_LOSS': loss,
            self.name + '_ENTROPY': entropy,
            self.name + '_KL': kl,
            self.name + '_BETA': beta,
            self.name + '_LR_MULTIPLIER': lr_multiplier,
            self.name + '_VAL_FUNCTION_LOSS': loss_val,
            self.name + '_EXP_VAR': exp_var,
            self.name + '_OLD_EXP_VAR': old_exp_var,
            self.name + '_ENV_STATUS': self.current_env_status,
            self.name + '_TRAIN_SAMPLE_COUNT': len(observes)
        }
        self.log_queue.put(res_dict)
        return {
            'VALUE_FUNCTION_LOSS': loss_val,
            'CONTROLLER_LOSS': loss
        }

    def predict(self, obs, step_count=None):
        obs = np.reshape(obs, [1, -1])
        if step_count is not None:
            obs = np.append(obs, [[step_count * self.config.config_dict['INCREMENT_ENV_STEP']]], axis=1)
        else:
            obs = np.append(obs, [[self.step_count * self.config.config_dict['INCREMENT_ENV_STEP']]], axis=1)
        scale, offset = self.scaler.get()
        scale[-1] = 1.0  # don't scale time step feature
        offset[-1] = 0.0  # don't offset time step feature
        obs = (obs - offset) * scale
        action = self.policy.sample(np.reshape(obs, [1, -1])).reshape((1, -1)).astype(np.float32)
        action = np.clip(action, a_min=self.action_low, a_max=self.action_high)
        return action

    def print_log_queue(self, status):
        self.status = status
        while self.log_queue.qsize() > 0:
            log = self.log_queue.get()
            print("%s: Policy Loss %f, Entropy %f, Kl %f, Beta %f, Lr multiplier %f, Val function loss %f, "
                  "Exp var %f, Old exp var %f" %
                  (self.name,
                   log[self.name + '_POLICY_LOSS'],
                   log[self.name + '_ENTROPY'],
                   log[self.name + '_KL'],
                   log[self.name + '_BETA'],
                   log[self.name + '_LR_MULTIPLIER'],
                   log[self.name + '_VAL_FUNCTION_LOSS'],
                   log[self.name + '_EXP_VAR'],
                   log[self.name + '_OLD_EXP_VAR']))
            log['INDEX'] = self.log_print_count
            self.log_print_count += 1
            self.log_file_content.append(log)

    def reset(self):
        self.trajectories = {'observes': [], 'actions': [], 'rewards': [], 'unscaled_obs': []}
        self.step_count = 0

    def init(self):
        self.var_list = self.val_func.var_list + self.policy.var_list
        self.val_func.init()
        self.policy.init()
        self.trajectories = {'observes': [], 'actions': [], 'rewards': [], 'unscaled_obs': []}
        self.step_count = 0
        self.env_status = self.config.config_dict['REAL_ENVIRONMENT_STATUS']
        super().init()

    def store_one_sample(self, state, next_state, action, reward, done, *arg, **kwargs):
        # TODO HOW TO SET AND RESET STEP
        self.memory.append(obs0=state, obs1=next_state, action=action,
                           reward=reward, terminal1=done)
        obs = state.astype(np.float32).reshape((1, -1))
        obs = np.append(obs, [[self.step_count * self.config.config_dict['INCREMENT_ENV_STEP']]],
                        axis=1)  # add time step feature
        self.trajectories['unscaled_obs'].append(obs)
        scale, offset = self.scaler.get()
        scale[-1] = 1.0  # don't scale time step feature
        offset[-1] = 0.0  # don't offset time step feature
        obs = (obs - offset) * scale  # center and scale observations
        self.trajectories['observes'].append(np.reshape(obs, [-1]))
        self.trajectories['actions'].append(np.reshape(action, [-1]))
        self.trajectories['rewards'].append(reward)
        self.step_count += 1
        if done is True:
            self._save_trajectories_to_memory(reset_step_count=True)

    def _return_train_data(self):
        trajectories = list(self.trajectories_memory)
        trpo_main.add_value(trajectories, val_func=self.val_func)
        trpo_main.add_disc_sum_rew(trajectories=trajectories, gamma=self.config.config_dict['GAMMA'])
        trpo_main.add_gae(trajectories=trajectories,
                          gamma=self.config.config_dict['GAMMA'],
                          lam=self.config.config_dict['LAM'])
        observes, actions, advantages, disc_sum_rew = trpo_main.build_train_set(trajectories=trajectories)
        # NO MORE CLEAR OF MEMORY
        if 'NOT_TRPO_CLEAR_MEMORY' in cfg.config_dict and cfg.config_dict['NOT_TRPO_CLEAR_MEMORY'] is True:
            pass
        else:
            self.trajectories_memory.clear()
        return observes, actions, advantages, disc_sum_rew

    def _save_trajectories_to_memory(self, reset_step_count=True):
        if len(self.trajectories['observes']) > 0:
            self.update_scale(unscaled_data=np.array(self.trajectories['unscaled_obs']).squeeze())
            if reset_step_count is True:
                self.step_count = 0
            for key, val in self.trajectories.items():
                self.trajectories[key] = np.array(val)
            self.trajectories_memory.append(self.trajectories)
            self.trajectories = {'observes': [], 'actions': [], 'rewards': [], 'unscaled_obs': []}

    def update_scale(self, unscaled_data):
        self.scaler.update(x=unscaled_data)

    def q_value(self, state, step=0):
        return self.val_func.predict(x=np.array(state),
                                     step=step * self.config.config_dict['INCREMENT_ENV_STEP'])

    def return_most_recent_sample(self, sample_count, env_status, *args, **kwargs):
        if env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            memory = self.real_data_memory
        elif env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            memory = self.simulation_data_memory
        else:
            raise ValueError('Wrong environment status')
        length = memory.nb_entries
        enough_flag = True
        if length < sample_count:
            enough_flag = False
        from src.util.sampler.sampler import SamplerData
        sample_data = SamplerData()
        for i in range(max(0, length - sample_count), length):
            sample_data.append(state=memory.observations0[i],
                               action=memory.actions[i],
                               new_state=memory.observations1[i],
                               done=memory.terminals1[i],
                               reward=memory.rewards[i])
        return sample_data, enough_flag

    def enough_data(self, sample_count, env_status):
        if env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            memory = self.real_data_memory
        elif env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            memory = self.simulation_data_memory
        else:
            raise ValueError('Wrong environment status')
        length = memory.nb_entries
        return length >= sample_count
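# Usage sketch (illustrative, not from the original repository): the rough call pattern
# for TrpoModel over one training iteration. `make_config`, `env`, and `horizon` are
# hypothetical stand-ins for the project's own pipeline objects; only the TrpoModel
# methods shown are defined in the class above.
#
# model = TrpoModel(config=make_config(),
#                   action_bound=(env.action_space.low, env.action_space.high),
#                   obs_bound=None)
# model.init()
# obs = env.reset()
# for _ in range(horizon):
#     act = model.predict(obs)
#     next_obs, reward, done, _ = env.step(act)
#     model.store_one_sample(state=obs, next_state=next_obs, action=act,
#                            reward=reward, done=done)
#     obs = env.reset() if done else next_obs
# model.update()  # TRPO policy + value-function update from the buffered trajectories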
class DynamicsEnvMlpModel(TensorflowBasedModel):
    key_list = Config.load_json(file_path=CONFIG_KEY + '/dynamicsEnvMlpModelKey.json')

    def __init__(self, config, output_bound):
        # TODO THE PLACEHOLDER SHOULD MOVE TO AGENT AND USE IT AS INPUT FOR __init__
        super(DynamicsEnvMlpModel, self).__init__(config)
        with tf.variable_scope(name_or_scope=self.config.config_dict['NAME']):
            self.state_means = tf.placeholder(shape=list(self.config.config_dict['STATE_SPACE']),
                                              dtype=tf.float32,
                                              name='state_means')
            self.state_vars = tf.placeholder(shape=list(self.config.config_dict['STATE_SPACE']),
                                             dtype=tf.float32,
                                             name='state_vars')
            self.action_means = tf.placeholder(shape=list(self.config.config_dict['ACTION_SPACE']),
                                               dtype=tf.float32)
            self.action_vars = tf.placeholder(shape=list(self.config.config_dict['ACTION_SPACE']),
                                              dtype=tf.float32)
            self.output_means = tf.placeholder(shape=list(self.config.config_dict['STATE_SPACE']),
                                               dtype=tf.float32,
                                               name='delta_means')
            self.output_vars = tf.placeholder(shape=list(self.config.config_dict['STATE_SPACE']),
                                              dtype=tf.float32,
                                              name='delta_vars')
            self.state_input = tf.placeholder(shape=[None] + list(self.config.config_dict['STATE_SPACE']),
                                              dtype=tf.float32)
            self.action_input = tf.placeholder(shape=[None] + list(self.config.config_dict['ACTION_SPACE']),
                                               dtype=tf.float32)
            self.state_delta_label = tf.placeholder(shape=[None] + list(self.config.config_dict['STATE_SPACE']),
                                                    dtype=tf.float32)
            self.norm_state_input = (self.state_input - self.state_means) / self.state_vars
            self.norm_action_input = (self.action_input - self.action_means) / self.action_vars
            self.norm_state_delta_label = (self.state_delta_label - self.output_means) / self.output_vars
            self.input = tf.concat(values=[self.norm_state_input, self.norm_action_input], axis=1)
            self.action_scalar = Scaler(obs_dim=self.config.config_dict['ACTION_SPACE'])
            self.state_scalar = Scaler(obs_dim=self.config.config_dict['STATE_SPACE'])
            self.delta_scalar = Scaler(obs_dim=self.config.config_dict['STATE_SPACE'])
            self.net, self.delta_state_output, self.trainable_var_list = \
                NetworkCreator.create_network(input=self.input,
                                              network_config=self.config.config_dict['NET_CONFIG'],
                                              net_name=self.config.config_dict['NAME'])
            # output_low=output_bound[0] - output_bound[1],
            # output_high=output_bound[1] - output_bound[0])
            self.loss, self.optimizer, self.optimize = self.create_training_method()
            self.denorm_delta_state_output = self.delta_state_output * self.output_vars + self.output_means
            self.denorm_state_input = self.norm_state_input * self.state_vars + self.state_means
            self.output = self.state_input + self.denorm_delta_state_output
        self.var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope=self.config.config_dict['NAME'])
        self.variables_initializer = tf.variables_initializer(var_list=self.var_list)

    def create_training_method(self):
        l2_loss = tf.reduce_sum([tf.nn.l2_loss(var) for var in self.trainable_var_list])
        loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(self.norm_state_delta_label - self.delta_state_output),
                          reduction_indices=[1])) + 0.0 * l2_loss
        optimizer = tf.train.AdamOptimizer(learning_rate=self.config.config_dict['LEARNING_RATE'])
        optimize = optimizer.minimize(loss=loss, var_list=self.trainable_var_list)
        return loss, optimizer, optimize

    def update_mean_var(self, state_input, action_input, delta_state_label):
        self.state_scalar.update(x=state_input)
        self.action_scalar.update(x=action_input)
        self.delta_scalar.update(x=delta_state_label)

    def update(self, sess, state_input, action_input, delta_state_label):
        state_input = np.reshape(state_input,
                                 newshape=[-1] + list(self.config.config_dict['STATE_SPACE']))
        action_input = np.reshape(action_input,
                                  newshape=[-1] + list(self.config.config_dict['ACTION_SPACE']))
        delta_state_label = np.reshape(delta_state_label,
                                       newshape=[-1] + list(self.config.config_dict['STATE_SPACE']))
        total_loss = 0.0
        batch_count = len(state_input) // self.config.config_dict['BATCH_SIZE']
        if batch_count <= 0:
            raise ValueError('Batch count is zero, input data size: %d, batch size %d' %
                             (len(state_input), self.config.config_dict['BATCH_SIZE']))
        for j in range(batch_count):
            state_input_j = state_input[self.config.config_dict['BATCH_SIZE'] * j:
                                        self.config.config_dict['BATCH_SIZE'] * (j + 1), :]
            action_input_j = action_input[self.config.config_dict['BATCH_SIZE'] * j:
                                          self.config.config_dict['BATCH_SIZE'] * (j + 1), :]
            delta_state_label_j = delta_state_label[self.config.config_dict['BATCH_SIZE'] * j:
                                                    self.config.config_dict['BATCH_SIZE'] * (j + 1), :]
            _, loss = sess.run(fetches=[self.optimize, self.loss],
                               feed_dict={
                                   self.state_input: state_input_j,
                                   self.action_input: action_input_j,
                                   self.state_delta_label: delta_state_label_j,
                                   self.state_vars: np.sqrt(self.state_scalar.vars),
                                   self.state_means: self.state_scalar.means,
                                   self.action_vars: np.sqrt(self.action_scalar.vars),
                                   self.action_means: self.action_scalar.means,
                                   self.output_means: self.delta_scalar.means,
                                   self.output_vars: np.sqrt(self.delta_scalar.vars)
                               })
            total_loss += loss
        average_loss = total_loss / batch_count
        self.log_queue.put({self.name + '_LOSS': average_loss})
        return average_loss

    def test(self, sess, state_input, action_input, delta_state_label):
        state_input = np.reshape(state_input,
                                 newshape=[-1] + list(self.config.config_dict['STATE_SPACE']))
        action_input = np.reshape(action_input,
                                  newshape=[-1] + list(self.config.config_dict['ACTION_SPACE']))
        delta_state_label = np.reshape(delta_state_label,
                                       newshape=[-1] + list(self.config.config_dict['STATE_SPACE']))
        loss = sess.run(fetches=self.loss,
                        feed_dict={
                            self.state_input: state_input,
                            self.action_input: action_input,
                            self.state_delta_label: delta_state_label,
                            self.state_vars: np.sqrt(self.state_scalar.vars),
                            self.state_means: self.state_scalar.means,
                            self.action_vars: np.sqrt(self.action_scalar.vars),
                            self.action_means: self.action_scalar.means,
                            self.output_means: self.delta_scalar.means,
                            self.output_vars: np.sqrt(self.delta_scalar.vars)
                        })
        self.log_queue.put({self.name + '_LOSS': np.mean(loss)})

    def predict(self, sess, state_input, action_input):
        state_input = np.reshape(state_input,
                                 newshape=[-1] + list(self.config.config_dict['STATE_SPACE']))
        action_input = np.reshape(action_input,
                                  newshape=[-1] + list(self.config.config_dict['ACTION_SPACE']))
        res = sess.run(fetches=[self.output],
                       feed_dict={
                           self.state_input: state_input,
                           self.action_input: action_input,
                           self.state_vars: np.sqrt(self.state_scalar.vars),
                           self.state_means: self.state_scalar.means,
                           self.action_vars: np.sqrt(self.action_scalar.vars),
                           self.action_means: self.action_scalar.means,
                           self.output_means: self.delta_scalar.means,
                           self.output_vars: np.sqrt(self.delta_scalar.vars)
                       })
        return utl.squeeze_array(res, dim=1 + len(self.config.config_dict['STATE_SPACE']))

    def init(self):
        sess = tf.get_default_session()
        sess.run(self.variables_initializer)
        super().init()
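# Usage sketch (illustrative, not from the original repository): the dynamics model is
# trained on (state, action) -> state-delta pairs and then queried for next-state
# predictions. `make_config`, `states`, `actions`, and `next_states` are hypothetical
# placeholders; only the methods shown belong to the class above.
#
# dyn_model = DynamicsEnvMlpModel(config=make_config(), output_bound=None)
# with tf.Session() as sess:
#     dyn_model.init()  # run the variable initializer for this model's scope
#     deltas = next_states - states
#     dyn_model.update_mean_var(state_input=states, action_input=actions,
#                               delta_state_label=deltas)  # refresh normalization stats
#     avg_loss = dyn_model.update(sess, state_input=states, action_input=actions,
#                                 delta_state_label=deltas)  # one pass of minibatch SGD
#     predicted_next = dyn_model.predict(sess, state_input=states, action_input=actions)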