def predict_sym(self, obs_var, act_var):
    """
    Symbolic next-observation prediction: normalizes the (obs, act) input,
    runs it through the MLP to get a normalized delta, then denormalizes the
    delta and adds it to the current observation.
    :param obs_var: (batch_size, obs_space_dims)
    :param act_var: (batch_size, act_space_dims)
    :return: (batch_size, obs_space_dims)
    """
    assert self.normalize_input
    with tf.variable_scope(self.name, reuse=True):
        in_obs_var = normalize(obs_var, mean=self._mean_obs_var, std=self._std_obs_var)
        in_act_var = normalize(act_var, mean=self._mean_act_var, std=self._std_act_var)
        input_var = tf.concat([in_obs_var, in_act_var], axis=1)
        mlp = MLP(
            self.name,
            output_dim=self.obs_space_dims,
            hidden_sizes=self.hidden_sizes,
            hidden_nonlinearity=self.hidden_nonlinearity,
            output_nonlinearity=self.output_nonlinearity,
            input_var=input_var,
            input_dim=self.obs_space_dims + self.action_space_dims,
        )

        delta = denormalize(mlp.output_var, mean=self._mean_delta_var, std=self._std_delta_var)
        pred_obs = delta + obs_var
        # clip the predicted next observation to keep the rollout from diverging
        pred_obs = tf.clip_by_value(pred_obs, -1e2, 1e2)
        return pred_obs
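# Illustrative usage sketch (not part of the original module): assumes a constructed
# dynamics model instance `dynamics_model`, an active `tf.Session` named `sess`, and
# numpy batches `obs` / `acts`; the placeholder names below are hypothetical.
#
#     obs_var = tf.placeholder(tf.float32, shape=(None, dynamics_model.obs_space_dims))
#     act_var = tf.placeholder(tf.float32, shape=(None, dynamics_model.action_space_dims))
#     next_obs_var = dynamics_model.predict_sym(obs_var, act_var)
#     next_obs = sess.run(next_obs_var, feed_dict={obs_var: obs, act_var: acts})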
def distribution_info_sym(self, obs_var):
    """
    Symbolic value-function prediction: normalizes the observation, runs it
    through the value MLP, and denormalizes the scalar output.
    :param obs_var: (batch_size, obs_space_dims)
    :return: dict with key 'mean' -> (batch_size,) value estimates
    """
    with tf.variable_scope(self.name + '/value_function', reuse=True):
        input_var = (obs_var - self._mean_input_var) / (self._std_input_var + 1e-8)
        mlp = MLP(self.name,
                  output_dim=1,
                  hidden_sizes=self.hidden_sizes,
                  hidden_nonlinearity=self.hidden_nonlinearity,
                  output_nonlinearity=self.output_nonlinearity,
                  input_var=input_var,
                  input_dim=self.obs_space_dims)

        output_var = tf.reshape(mlp.output_var, shape=(-1,))
        output_var = output_var * self._std_output_var + self._mean_output_var
        return dict(mean=output_var)
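# Illustrative usage sketch (not part of the original module): `vfun` and `obs_var`
# are hypothetical names for an instance of this value-function class and an existing
# observation tensor; the returned dict exposes the denormalized estimate under 'mean'.
#
#     dist_info = vfun.distribution_info_sym(obs_var)
#     value_estimates = dist_info['mean']  # shape: (batch_size,)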
def predict_batches_sym(self, obs_ph, act_ph):
    """
    The stacked batch is split evenly across the models: model i predicts the next
    observation for the i-th chunk, and the per-model predictions are concatenated
    back into a single batch.
    :param obs_ph: (batch_size, obs_space_dims), batch_size must be divisible by num_models
    :param act_ph: (batch_size, act_space_dims)
    :return: (batch_size, obs_space_dims)
    """
    original_obs = obs_ph
    obs_ph, act_ph = tf.split(obs_ph, self.num_models, axis=0), tf.split(act_ph, self.num_models, axis=0)

    delta_preds = []
    with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
        for i in range(self.num_models):
            with tf.variable_scope('model_{}'.format(i), reuse=True):
                assert self.normalize_input
                in_obs_var = tf_normalize(obs_ph[i], mean=self._mean_obs_var[i], std=self._std_obs_var[i])
                in_act_var = tf_normalize(act_ph[i], mean=self._mean_act_var[i], std=self._std_act_var[i])
                input_var = tf.concat([in_obs_var, in_act_var], axis=1)
                mlp = MLP(
                    self.name + '/model_{}'.format(i),
                    output_dim=self.obs_space_dims,
                    hidden_sizes=self.hidden_sizes,
                    hidden_nonlinearity=self.hidden_nonlinearity,
                    output_nonlinearity=self.output_nonlinearity,
                    input_var=input_var,
                    input_dim=self.obs_space_dims + self.action_space_dims,
                )
                delta_pred = tf_denormalize(mlp.output_var, mean=self._mean_delta_var[i], std=self._std_delta_var[i])
                delta_preds.append(delta_pred)

    delta_preds = tf.concat(delta_preds, axis=0)
    next_obs = original_obs + delta_preds
    return tf.clip_by_value(next_obs, -1e2, 1e2)
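# Illustrative usage sketch (not part of the original module): `ensemble`, `obs_stack_ph`
# and `act_stack_ph` are hypothetical names; the stacked batch size must be divisible by
# num_models because tf.split requires an even split.
#
#     # e.g. num_models = 5 and 20 observations per model -> stacked batch of 100
#     next_obs_var = ensemble.predict_batches_sym(obs_stack_ph, act_stack_ph)
#     # rows 0..19 come from model_0, rows 20..39 from model_1, and so on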
def __init__(self,
             name,
             env,
             hidden_sizes=(500, 500),
             hidden_nonlinearity="tanh",
             output_nonlinearity=None,
             batch_size=500,
             learning_rate=0.001,
             weight_normalization=True,
             normalize_input=True,
             optimizer=tf.train.AdamOptimizer,
             valid_split_ratio=0.2,
             rolling_average_persitency=0.99,
             buffer_size=100000,
             ):
    Serializable.quick_init(self, locals())

    self.normalization = None
    self.normalize_input = normalize_input
    self.use_reward_model = False
    self.buffer_size = buffer_size
    self.name = name
    self.hidden_sizes = hidden_sizes
    self._dataset_train = None
    self._dataset_test = None
    self.next_batch = None
    self.valid_split_ratio = valid_split_ratio
    self.rolling_average_persitency = rolling_average_persitency

    self.hidden_nonlinearity = hidden_nonlinearity = self._activations[hidden_nonlinearity]
    self.output_nonlinearity = output_nonlinearity = self._activations[output_nonlinearity]

    with tf.variable_scope(name):
        self.batch_size = batch_size
        self.learning_rate = learning_rate

        # determine dimensionality of state and action space
        self.obs_space_dims = env.observation_space.shape[0]
        self.action_space_dims = env.action_space.shape[0]

        # placeholders
        self.obs_ph = tf.placeholder(tf.float32, shape=(None, self.obs_space_dims))
        self.act_ph = tf.placeholder(tf.float32, shape=(None, self.action_space_dims))
        self.delta_ph = tf.placeholder(tf.float32, shape=(None, self.obs_space_dims))

        self._create_stats_vars()

        # concatenate action and observation --> NN input
        self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

        # create MLP
        mlp = MLP(name,
                  output_dim=self.obs_space_dims,
                  hidden_sizes=hidden_sizes,
                  hidden_nonlinearity=hidden_nonlinearity,
                  output_nonlinearity=output_nonlinearity,
                  input_var=self.nn_input,
                  input_dim=self.obs_space_dims + self.action_space_dims,
                  weight_normalization=weight_normalization)

        self.delta_pred = mlp.output_var

        # define loss and train_op
        self.loss = tf.reduce_mean(tf.linalg.norm(self.delta_ph - self.delta_pred, axis=-1))
        self.optimizer = optimizer(self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)

        # tensor_utils
        self.f_delta_pred = compile_function([self.obs_ph, self.act_ph], self.delta_pred)

    self._networks = [mlp]
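# Illustrative construction sketch (not part of the original module): the class name
# `MLPDynamicsModel` is assumed here, and `env` is a gym-style environment with Box
# observation and action spaces; all keyword values shown are the defaults above.
#
#     dynamics_model = MLPDynamicsModel(name='dyn_model',
#                                       env=env,
#                                       hidden_sizes=(500, 500),
#                                       hidden_nonlinearity='tanh',
#                                       batch_size=500,
#                                       learning_rate=0.001)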
def predict_sym(self, obs_ph, act_ph, pred_type='rand', perm_dict=None):
    """
    Symbolic next-observation prediction with the model ensemble. The same batch is
    fed into all models; pred_type controls how the per-model predictions are combined:
        - 'rand': each observation is predicted by a randomly chosen model
        - 'mean': average the predictions of all models
        - 'all':  return the predictions of all models
    :param obs_ph: (batch_size, obs_space_dims)
    :param act_ph: (batch_size, act_space_dims)
    :param pred_type: one of 'rand', 'mean', 'all'
    :param perm_dict: optional dict with precomputed 'perm' and 'perm_inv' tensors
    :return: (batch_size, obs_space_dims), or (batch_size, obs_space_dims, num_models) for 'all'
    """
    if pred_type == 'rand':
        # shuffle so that each model ends up predicting a random subset of the batch
        if perm_dict is not None:
            perm = perm_dict['perm']
        else:
            perm = tf.range(0, limit=tf.shape(obs_ph)[0], dtype=tf.int32)
            perm = tf.random.shuffle(perm)
        obs_ph_perm, act_ph_perm = tf.gather(obs_ph, perm), tf.gather(act_ph, perm)

        next_obs_perm = self.predict_batches_sym(obs_ph_perm, act_ph_perm)

        # unshuffle to restore the original ordering
        if perm_dict is not None:
            perm_inv = perm_dict['perm_inv']
        else:
            perm_inv = tf.invert_permutation(perm)
        next_obs = tf.gather(next_obs_perm, perm_inv)
        return next_obs

    delta_preds = []
    with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
        for i in range(self.num_models):
            with tf.variable_scope('model_{}'.format(i), reuse=True):
                assert self.normalize_input
                in_obs_var = tf_normalize(obs_ph, mean=self._mean_obs_var[i], std=self._std_obs_var[i])
                in_act_var = tf_normalize(act_ph, mean=self._mean_act_var[i], std=self._std_act_var[i])
                input_var = tf.concat([in_obs_var, in_act_var], axis=1)
                mlp = MLP(
                    self.name + '/model_{}'.format(i),
                    output_dim=self.obs_space_dims,
                    hidden_sizes=self.hidden_sizes,
                    hidden_nonlinearity=self.hidden_nonlinearity,
                    output_nonlinearity=self.output_nonlinearity,
                    input_var=input_var,
                    input_dim=self.obs_space_dims + self.action_space_dims,
                )
                delta_pred = tf_denormalize(mlp.output_var, mean=self._mean_delta_var[i], std=self._std_delta_var[i])
                delta_preds.append(delta_pred)

    delta_preds = tf.stack(delta_preds, axis=2)  # (batch_size, obs_dims, num_models)
    next_obs = tf.expand_dims(obs_ph, axis=2) + delta_preds

    if pred_type == 'all':
        pass
    elif pred_type == 'mean':
        next_obs = tf.reduce_mean(next_obs, axis=2)
    else:
        raise NotImplementedError("pred_type must be one of ['rand', 'mean', 'all']")

    return tf.clip_by_value(next_obs, -1e2, 1e2)
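# Illustrative usage sketch (not part of the original module): `ensemble`, `obs_ph` and
# `act_ph` are hypothetical names for an instance of this class and its input placeholders;
# the three pred_type modes differ only in how the per-model outputs are combined.
#
#     next_obs_rand = ensemble.predict_sym(obs_ph, act_ph, pred_type='rand')  # (batch, obs_dims)
#     next_obs_mean = ensemble.predict_sym(obs_ph, act_ph, pred_type='mean')  # (batch, obs_dims)
#     next_obs_all = ensemble.predict_sym(obs_ph, act_ph, pred_type='all')    # (batch, obs_dims, num_models)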
def __init__(self,
             name,
             env,
             num_models=5,
             hidden_sizes=(512, 512),
             hidden_nonlinearity='swish',
             output_nonlinearity=None,
             batch_size=500,
             learning_rate=0.001,
             weight_normalization=False,  # Doesn't work
             normalize_input=True,
             optimizer=tf.train.AdamOptimizer,
             valid_split_ratio=0.2,  # 0.1
             rolling_average_persitency=0.99,
             buffer_size=50000,
             loss_str='MSE',
             ):
    Serializable.quick_init(self, locals())

    max_logvar = 1
    min_logvar = 0.1

    self.normalization = None
    self.normalize_input = normalize_input
    self.next_batch = None

    self.valid_split_ratio = valid_split_ratio
    self.rolling_average_persitency = rolling_average_persitency

    self.buffer_size_train = int(buffer_size * (1 - valid_split_ratio))
    self.buffer_size_test = int(buffer_size * valid_split_ratio)
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.num_models = num_models
    self.hidden_sizes = hidden_sizes
    self.name = name
    self._dataset_train = None
    self._dataset_test = None

    # determine dimensionality of state and action space
    self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
    self.action_space_dims = action_space_dims = env.action_space.shape[0]
    self.timesteps_counter = 0
    self.used_timesteps_counter = 0

    self.hidden_nonlinearity = hidden_nonlinearity = self._activations[hidden_nonlinearity]
    self.output_nonlinearity = output_nonlinearity = self._activations[output_nonlinearity]

    """ computation graph for training and simple inference """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        # placeholders
        self.obs_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))
        self.act_ph = tf.placeholder(tf.float32, shape=(None, action_space_dims))
        self.delta_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))

        self._create_stats_vars()

        # concatenate action and observation --> NN input
        self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)
        obs_ph = tf.split(self.nn_input, self.num_models, axis=0)

        # create MLP
        mlps = []
        delta_preds = []
        self.obs_next_pred = []
        for i in range(num_models):
            with tf.variable_scope('model_{}'.format(i), reuse=tf.AUTO_REUSE):
                mlp = MLP(
                    name + '/model_{}'.format(i),
                    output_dim=obs_space_dims,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    input_var=obs_ph[i],
                    input_dim=obs_space_dims + action_space_dims,
                )
                mlps.append(mlp)

            delta_preds.append(mlp.output_var)

        self.delta_pred = tf.stack(delta_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)

        # define loss and train_op
        if loss_str == 'L2':
            self.loss = tf.reduce_mean(tf.linalg.norm(self.delta_ph[:, :, None] - self.delta_pred, axis=1))
        elif loss_str == 'MSE':
            self.loss = tf.reduce_mean((self.delta_ph[:, :, None] - self.delta_pred)**2)
        else:
            raise NotImplementedError
        self.optimizer = optimizer(learning_rate=self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)

        # tensor_utils
        self.f_delta_pred = compile_function([self.obs_ph, self.act_ph], self.delta_pred)

    """ computation graph for inference where each of the models receives a different batch"""
    with tf.variable_scope(name, reuse=True):
        # placeholders
        self.obs_model_batches_stack_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))
        self.act_model_batches_stack_ph = tf.placeholder(tf.float32, shape=(None, action_space_dims))
        self.delta_model_batches_stack_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))

        # split stack into the batches for each model --> assume each model receives a batch of the same size
        self.obs_model_batches = tf.split(self.obs_model_batches_stack_ph, self.num_models, axis=0)
        self.act_model_batches = tf.split(self.act_model_batches_stack_ph, self.num_models, axis=0)
        self.delta_model_batches = tf.split(self.delta_model_batches_stack_ph, self.num_models, axis=0)

        # reuse previously created MLP but each model receives its own batch
        delta_preds = []
        self.obs_next_pred = []
        self.loss_model_batches = []
        self.train_op_model_batches = []
        for i in range(num_models):
            with tf.variable_scope('model_{}'.format(i), reuse=True):
                # concatenate action and observation --> NN input
                nn_input = tf.concat([self.obs_model_batches[i], self.act_model_batches[i]], axis=1)
                mlp = MLP(name + '/model_{}'.format(i),
                          output_dim=obs_space_dims,
                          hidden_sizes=hidden_sizes,
                          hidden_nonlinearity=hidden_nonlinearity,
                          output_nonlinearity=output_nonlinearity,
                          input_var=nn_input,
                          input_dim=obs_space_dims + action_space_dims,
                          weight_normalization=weight_normalization)

            delta_preds.append(mlp.output_var)

            # define loss and train_op
            if loss_str == 'L2':
                loss = tf.reduce_mean(tf.linalg.norm(self.delta_model_batches[i] - mlp.output_var, axis=1))
            elif loss_str == 'MSE':
                loss = tf.reduce_mean((self.delta_model_batches[i] - mlp.output_var)**2)
            else:
                raise NotImplementedError
            self.loss_model_batches.append(loss)
            self.train_op_model_batches.append(optimizer(learning_rate=self.learning_rate).minimize(loss))

        self.delta_pred_model_batches_stack = tf.concat(delta_preds, axis=0)  # shape: (batch_size_per_model*num_models, ndim_obs)

        # tensor_utils
        self.f_delta_pred_model_batches = compile_function(
            [self.obs_model_batches_stack_ph, self.act_model_batches_stack_ph],
            self.delta_pred_model_batches_stack)

    self._networks = mlps
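# Illustrative construction sketch (not part of the original module): the class name
# `MLPDynamicsEnsemble` is assumed here, and `env` is a gym-style environment with Box
# spaces. After construction, `train_op_model_batches` holds one training op per model,
# each fed through the stacked `*_model_batches_stack_ph` placeholders.
#
#     ensemble = MLPDynamicsEnsemble(name='dyn_ensemble',
#                                    env=env,
#                                    num_models=5,
#                                    hidden_sizes=(512, 512),
#                                    hidden_nonlinearity='swish',
#                                    loss_str='MSE')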