def __init__(
        self,
        name,
        env,
        hidden_sizes=(512,),
        cell_type='lstm',
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,
        rolling_average_persitency=0.99,
        backprop_steps=50,
):
    Serializable.quick_init(self, locals())

    self.recurrent = True
    self.normalization = None
    self.normalize_input = normalize_input
    self.next_batch = None

    self.valid_split_ratio = valid_split_ratio
    self.rolling_average_persitency = rolling_average_persitency
    self.backprop_steps = backprop_steps

    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.name = name
    self._dataset_train = None
    self._dataset_test = None

    # determine dimensionality of state and action space
    self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
    self.action_space_dims = action_space_dims = env.action_space.shape[0]

    """ computation graph for training and simple inference """
    with tf.variable_scope(name):
        # placeholders; the two leading None dims are (batch_size, time_steps)
        self.obs_ph = tf.placeholder(tf.float32, shape=(None, None, obs_space_dims), name='obs_ph')
        self.act_ph = tf.placeholder(tf.float32, shape=(None, None, action_space_dims), name='act_ph')
        self.delta_ph = tf.placeholder(tf.float32, shape=(None, None, obs_space_dims), name='delta_ph')

        # concatenate action and observation --> NN input
        self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=2)

        # create RNN
        self.obs_next_pred = []
        with tf.variable_scope('rnn_model'):
            rnn = RNN(
                name,
                output_dim=self.obs_space_dims,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                input_var=self.nn_input,
                input_dim=self.obs_space_dims + self.action_space_dims,
                cell_type=cell_type,
            )

        self.delta_pred = rnn.output_var
        self.hidden_state_ph = rnn.state_var
        self.next_hidden_state_var = rnn.next_state_var
        self.cell = rnn.cell
        self._zero_state = self.cell.zero_state(1, tf.float32)

        self.loss = tf.reduce_mean(tf.square(self.delta_pred - self.delta_ph))

        # gradients are exposed through placeholders so they can be computed in
        # truncated chunks of backprop_steps and applied in a separate run call
        params = list(rnn.get_params().values())
        self._gradients_ph = [tf.placeholder(shape=param.shape, dtype=tf.float32) for param in params]
        self._gradients_vars = tf.gradients(self.loss, params)
        applied_gradients = zip(self._gradients_ph, params)
        self.train_op = optimizer(self.learning_rate).apply_gradients(applied_gradients)

        # tensor_utils
        self.f_delta_pred = tensor_utils.compile_function(
            [self.obs_ph, self.act_ph, self.hidden_state_ph],
            [self.delta_pred, self.next_hidden_state_var])

    self._networks = [rnn]
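# Usage sketch (illustrative, not part of the original file): stepping the
# recurrent model above one transition at a time. The class name
# `RNNDynamicsModel` and the `env` object are assumptions; the attributes
# (`delta_pred`, `hidden_state_ph`, `next_hidden_state_var`, `_zero_state`)
# are the ones defined in the constructor.
#
#   import numpy as np
#   import tensorflow as tf
#
#   model = RNNDynamicsModel('dyn_rnn', env, hidden_sizes=(512,), cell_type='lstm')
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       hidden = sess.run(model._zero_state)                # zero recurrent state, batch size 1
#       obs = np.zeros((1, 1, model.obs_space_dims))        # (batch, time, obs_dim)
#       act = np.zeros((1, 1, model.action_space_dims))     # (batch, time, act_dim)
#       delta, hidden = sess.run(
#           [model.delta_pred, model.next_hidden_state_var],
#           feed_dict={model.obs_ph: obs, model.act_ph: act,
#                      model.hidden_state_ph: hidden})      # delta ~ predicted s' - s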
def __init__(self,
             name,
             env,
             hidden_sizes=(512, 512),
             meta_batch_size=10,
             hidden_nonlinearity=tf.nn.relu,
             output_nonlinearity=None,
             batch_size=500,
             learning_rate=0.001,
             inner_learning_rate=0.1,
             normalize_input=True,
             optimizer=tf.train.AdamOptimizer,
             valid_split_ratio=0.2,
             rolling_average_persitency=0.99,
             ):
    Serializable.quick_init(self, locals())

    self.normalization = None
    self.normalize_input = normalize_input
    self.next_batch = None
    self.meta_batch_size = meta_batch_size

    self.valid_split_ratio = valid_split_ratio
    self.rolling_average_persitency = rolling_average_persitency

    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.inner_learning_rate = inner_learning_rate
    self.name = name
    self._dataset_train = None
    self._dataset_test = None

    self._prev_params = None
    self._adapted_param_values = None

    # determine dimensionality of state and action space
    self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
    self.action_space_dims = action_space_dims = env.action_space.shape[0]

    hidden_nonlinearity = self._activations[hidden_nonlinearity]
    output_nonlinearity = self._activations[output_nonlinearity]

    """ ------------------ Pre-Update Graph + Adaptation ----------------------- """
    with tf.variable_scope(name):
        # placeholders
        self.obs_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))
        self.act_ph = tf.placeholder(tf.float32, shape=(None, action_space_dims))
        self.delta_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))

        # concatenate action and observation --> NN input
        self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

        # create MLP
        mlp = MLP(name,
                  output_dim=obs_space_dims,
                  hidden_sizes=hidden_sizes,
                  hidden_nonlinearity=hidden_nonlinearity,
                  output_nonlinearity=output_nonlinearity,
                  input_var=self.nn_input,
                  input_dim=obs_space_dims + action_space_dims)

        self.delta_pred = mlp.output_var  # shape: (batch_size, obs_space_dims)

        self.loss = tf.reduce_mean(tf.square(self.delta_ph - self.delta_pred))
        self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        self.adaptation_sym = tf.train.GradientDescentOptimizer(self.inner_learning_rate).minimize(self.loss)

        # tensor_utils
        self.f_delta_pred = tensor_utils.compile_function([self.obs_ph, self.act_ph], self.delta_pred)

    """ --------------------------- Meta-training Graph ---------------------------------- """
    # split the batch into meta_batch_size tasks; within each task, the first
    # half of the samples drives the adaptation step (pre) and the second half
    # evaluates the adapted model (post)
    nn_input_per_task = tf.split(self.nn_input, self.meta_batch_size, axis=0)
    delta_per_task = tf.split(self.delta_ph, self.meta_batch_size, axis=0)

    pre_input_per_task, post_input_per_task = zip(*[tf.split(nn_input, 2, axis=0) for nn_input in nn_input_per_task])
    pre_delta_per_task, post_delta_per_task = zip(*[tf.split(delta, 2, axis=0) for delta in delta_per_task])

    pre_losses = []
    post_losses = []
    self._adapted_params = []

    for idx in range(self.meta_batch_size):
        with tf.variable_scope(name + '/pre_model_%d' % idx, reuse=tf.AUTO_REUSE):
            pre_mlp = MLP(name,
                          output_dim=obs_space_dims,
                          hidden_sizes=hidden_sizes,
                          hidden_nonlinearity=hidden_nonlinearity,
                          output_nonlinearity=output_nonlinearity,
                          input_var=pre_input_per_task[idx],
                          input_dim=obs_space_dims + action_space_dims,
                          params=mlp.get_params())

            pre_delta_pred = pre_mlp.output_var
            pre_loss = tf.reduce_mean(tf.square(pre_delta_per_task[idx] - pre_delta_pred))

            # one symbolic inner gradient step yields the adapted parameters
            adapted_params = self._adapt_sym(pre_loss, pre_mlp.get_params())
            self._adapted_params.append(adapted_params)

        with tf.variable_scope(name + '/post_model_%d' % idx, reuse=tf.AUTO_REUSE):
            post_mlp = MLP(name,
                           output_dim=obs_space_dims,
                           hidden_sizes=hidden_sizes,
                           hidden_nonlinearity=hidden_nonlinearity,
                           output_nonlinearity=output_nonlinearity,
                           input_var=post_input_per_task[idx],
                           params=adapted_params,
                           input_dim=obs_space_dims + action_space_dims)
            post_delta_pred = post_mlp.output_var

            post_loss = tf.reduce_mean(tf.square(post_delta_per_task[idx] - post_delta_pred))

            pre_losses.append(pre_loss)
            post_losses.append(post_loss)

    self.pre_loss = tf.reduce_mean(pre_losses)
    self.post_loss = tf.reduce_mean(post_losses)
    self.train_op = optimizer(self.learning_rate).minimize(self.post_loss)

    """ --------------------------- Post-update Inference Graph --------------------------- """
    with tf.variable_scope(name + '_ph_graph'):
        self.post_update_delta = []
        self.network_phs_meta_batch = []

        nn_input_per_task = tf.split(self.nn_input, self.meta_batch_size, axis=0)
        for idx in range(meta_batch_size):
            with tf.variable_scope('task_%i' % idx):
                network_phs = self._create_placeholders_for_vars(mlp.get_params())
                self.network_phs_meta_batch.append(network_phs)

                mlp_meta_batch = MLP(name,
                                     output_dim=obs_space_dims,
                                     hidden_sizes=hidden_sizes,
                                     hidden_nonlinearity=hidden_nonlinearity,
                                     output_nonlinearity=output_nonlinearity,
                                     params=network_phs,
                                     input_var=nn_input_per_task[idx],
                                     input_dim=obs_space_dims + action_space_dims,
                                     )
                self.post_update_delta.append(mlp_meta_batch.output_var)

    self._networks = [mlp]
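# Sketch of the inner-adaptation helper `_adapt_sym` used above (illustrative;
# the helper is defined elsewhere in the class, so this is an assumption about
# its body, not the original code). MAML-style: the adapted parameters are one
# gradient-descent step on the pre-update loss, kept symbolic so that
# `optimizer(...).minimize(self.post_loss)` can differentiate through the step.
#
#   from collections import OrderedDict
#
#   def _adapt_sym(self, loss, params_var):
#       # params_var: OrderedDict mapping param name -> tf.Variable of the pre-update model
#       update_param_keys = list(params_var.keys())
#       grads = tf.gradients(loss, [params_var[key] for key in update_param_keys])
#       # theta' = theta - alpha * grad(L_pre)
#       adapted_params = [params_var[key] - self.inner_learning_rate * grad
#                         for key, grad in zip(update_param_keys, grads)]
#       return OrderedDict(zip(update_param_keys, adapted_params))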
def __init__(
        self,
        name,
        env,
        hidden_sizes=(512, 512),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,
        rolling_average_persitency=0.99,
):
    Serializable.quick_init(self, locals())

    self.normalization = None
    self.normalize_input = normalize_input
    self.next_batch = None

    self.valid_split_ratio = valid_split_ratio
    self.rolling_average_persitency = rolling_average_persitency

    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.name = name
    self._dataset_train = None
    self._dataset_test = None

    # determine dimensionality of state and action space
    self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
    self.action_space_dims = action_space_dims = env.action_space.shape[0]

    hidden_nonlinearity = self._activations[hidden_nonlinearity]
    output_nonlinearity = self._activations[output_nonlinearity]

    with tf.variable_scope(name):
        # placeholders
        self.obs_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))
        self.act_ph = tf.placeholder(tf.float32, shape=(None, action_space_dims))
        self.delta_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))

        # concatenate action and observation --> NN input
        self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

        # create MLP
        with tf.variable_scope('ff_model'):
            mlp = MLP(name,
                      output_dim=obs_space_dims,
                      hidden_sizes=hidden_sizes,
                      hidden_nonlinearity=hidden_nonlinearity,
                      output_nonlinearity=output_nonlinearity,
                      input_var=self.nn_input,
                      input_dim=obs_space_dims + action_space_dims)

        self.delta_pred = mlp.output_var  # shape: (batch_size, obs_space_dims)

        self.loss = tf.reduce_mean(tf.square(self.delta_ph - self.delta_pred))
        self.optimizer = optimizer(self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)

        # tensor_utils
        self.f_delta_pred = tensor_utils.compile_function([self.obs_ph, self.act_ph], self.delta_pred)

    self._networks = [mlp]
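# Usage sketch (illustrative; the class name `MLPDynamicsModel` and the data
# arrays are assumptions): training the feedforward model above is plain
# regression on observation deltas, driven by the `train_op` and `loss` ops
# built in the constructor.
#
#   import numpy as np
#   import tensorflow as tf
#
#   model = MLPDynamicsModel('dyn_mlp', env)
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       feed = {model.obs_ph: obs_batch,                      # (batch_size, obs_space_dims)
#               model.act_ph: act_batch,                      # (batch_size, action_space_dims)
#               model.delta_ph: next_obs_batch - obs_batch}   # regression target: s' - s
#       _, loss = sess.run([model.train_op, model.loss], feed_dict=feed)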