def define_forward_pass(self):
    """Build the policy network and store its outputs in ``self.parameters``.

    When ``self.discrete`` is true, ``self.parameters`` is the single tensor
    returned by ``build_mlp`` under the ``'discrete_logits'`` scope.
    Otherwise it is a ``(mean, logstd)`` tuple, where ``mean`` comes from
    ``build_mlp`` under the ``'continuous_logits'`` scope and ``logstd`` is a
    zero-initialised ``tf.Variable`` of length ``self.ac_dim``.
    """
    # Arguments shared by both network variants.
    mlp_kwargs = dict(
        output_size=self.ac_dim,
        n_layers=self.n_layers,
        size=self.size,
    )

    if not self.discrete:
        action_mean = build_mlp(
            self.observations_pl, scope='continuous_logits', **mlp_kwargs)
        # The log-std is a free variable: it is not a function of the
        # observation input.
        action_logstd = tf.Variable(tf.zeros(self.ac_dim), name='logstd')
        self.parameters = (action_mean, action_logstd)
        return

    self.parameters = build_mlp(
        self.observations_pl, scope='discrete_logits', **mlp_kwargs)
def __init__(
        self,
        ac_dim,
        ob_dim,
        n_layers,
        size,
        learning_rate=1e-4,
        training=True,
        discrete=False,  # unused for now
        nn_baseline=False,  # unused for now
        **kwargs):
    """Store the hyperparameters and create the model, loss and optimizer.

    Args:
        ac_dim: Output size passed to ``build_mlp``.
        ob_dim: Stored on the instance; not used when building the model.
        n_layers: Passed to ``build_mlp``.
        size: Passed to ``build_mlp``.
        learning_rate: Learning rate of the Adam optimizer.
        training: Stored on the instance as ``self.training``.
        discrete: Accepted for interface compatibility; currently unused.
        nn_baseline: Accepted for interface compatibility; currently unused.
        **kwargs: Forwarded to the parent class constructor.
    """
    super().__init__(**kwargs)

    self.ac_dim = ac_dim
    self.ob_dim = ob_dim
    self.n_layers = n_layers
    self.size = size
    self.learning_rate = learning_rate
    self.training = training

    self.model = build_mlp(output_size=self.ac_dim,
                           n_layers=self.n_layers,
                           size=self.size)

    def sum_squared_error(y_true, y_pred):
        # Sum the squared error over the last axis for each sample, then
        # average over the remaining axes.  Without ``axis`` the sum
        # collapses to a scalar, which makes the mean a no-op and the loss
        # proportional to the batch size.
        per_sample = tf.reduce_sum(tf.square(y_true - y_pred), axis=-1)
        return tf.reduce_mean(per_sample)

    self.loss_object = sum_squared_error
    self.optimizer = keras.optimizers.Adam(
        learning_rate=self.learning_rate)
def build_baseline_forward_pass(self):
    """Create the baseline network and keep its squeezed output.

    The network is built by ``build_mlp`` on ``self.observations_pl`` with a
    single output unit under the ``'nn_baseline'`` scope.  ``tf.squeeze``
    then drops every size-1 dimension before the result is stored in
    ``self.baseline_prediction``.
    """
    baseline_out = build_mlp(
        self.observations_pl,
        output_size=1,
        scope='nn_baseline',
        n_layers=self.n_layers,
        size=self.size,
    )
    self.baseline_prediction = tf.squeeze(baseline_out)
def define_forward_pass(self):
    """Build the graph that predicts the next observation.

    Observations and actions are normalized with their respective mean/std
    placeholders, concatenated along axis 1 and fed to ``build_mlp``.  The
    network output is stored as ``self.delta_pred_normalized``, the
    (normalized) predicted change in state ``s' - s``.  It is then
    unnormalized with the delta statistics and added to the raw observation
    to give ``self.next_obs_pred``.
    """
    # Normalize both inputs with the statistics fed through placeholders.
    normalized_inputs = [
        normalize(self.obs_pl, self.obs_mean_pl, self.obs_std_pl),
        normalize(self.acs_pl, self.acs_mean_pl, self.acs_std_pl),
    ]
    network_input = tf.concat(normalized_inputs, axis=1)

    # The network works in normalized delta space.
    self.delta_pred_normalized = build_mlp(
        network_input, self.ob_dim, self.scope, self.n_layers, self.size)

    # Map the prediction back to raw units, then apply it to the raw
    # (unnormalized) observation.
    self.delta_pred_unnormalized = unnormalize(
        self.delta_pred_normalized, self.delta_mean_pl, self.delta_std_pl)
    self.next_obs_pred = self.obs_pl + self.delta_pred_unnormalized
def build_model(self):
    """Create the network and the log-std variable.

    ``self.model`` is whatever ``build_mlp`` returns for the configured
    output size, depth and width.  ``self.logstd`` is a zero-initialised
    ``tf.Variable`` of length ``self.ac_dim``.
    """
    self.model = build_mlp(
        output_size=self.ac_dim,
        n_layers=self.n_layers,
        size=self.size,
    )
    self.logstd = tf.Variable(tf.zeros(self.ac_dim), name='logstd')
def __init__(self, ac_dim, ob_dim, n_layers, size, **kwargs):
    """Create the policy network and store it as ``self.model``.

    Args:
        ac_dim: Output size of the network.
        ob_dim: Observation size; the input shape is ``(ob_dim,)``.
        n_layers: Passed to ``build_mlp``.
        size: Passed to ``build_mlp``.
        **kwargs: Accepted but not used; nothing is forwarded to the parent.
    """
    super().__init__()
    input_shape = (ob_dim, )
    self.model = build_mlp(
        input_shape,
        output_size=ac_dim,
        n_layers=n_layers,
        size=size,
        name='model',
    )
def define_forward_pass(self):
    """Build the continuous policy outputs and store them as a pair.

    ``self.parameters`` becomes ``(mean, logstd)``: ``mean`` is the
    ``build_mlp`` output for ``self.observations_pl`` under the
    ``'continuous_logits'`` scope, and ``logstd`` is a zero-initialised
    ``tf.Variable`` of length ``self.ac_dim``.
    """
    action_mean = build_mlp(
        self.observations_pl,
        output_size=self.ac_dim,
        scope='continuous_logits',
        n_layers=self.n_layers,
        size=self.size,
    )
    # The log-std is a free variable, independent of the observation input.
    action_logstd = tf.Variable(tf.zeros(self.ac_dim), name='logstd')
    self.parameters = (action_mean, action_logstd)
def define_forward_pass(self):
    """Build the graph that maps (observation, action) to the next observation.

    The network is fed the normalized observation and action concatenated
    along axis 1.  Its output, ``self.delta_pred_normalized``, is the
    predicted state change ``s' - s`` in normalized units.  That output is
    unnormalized with the delta statistics and added to the raw observation
    placeholder to form ``self.next_obs_pred``.
    """
    raw_obs = self.obs_pl
    raw_acs = self.acs_pl

    # Normalize each input with its own mean/std placeholders.
    obs_in = normalize(raw_obs, self.obs_mean_pl, self.obs_std_pl)
    acs_in = normalize(raw_acs, self.acs_mean_pl, self.acs_std_pl)

    # Predicted change in observation, in normalized units.
    delta_normalized = build_mlp(
        tf.concat([obs_in, acs_in], axis=1),
        self.ob_dim,
        self.scope,
        self.n_layers,
        self.size,
    )
    self.delta_pred_normalized = delta_normalized

    # Back to raw units, then applied to the raw observation.
    delta_raw = unnormalize(
        delta_normalized, self.delta_mean_pl, self.delta_std_pl)
    self.delta_pred_unnormalized = delta_raw
    self.next_obs_pred = raw_obs + delta_raw
def _build(self):
    """Create the placeholders, the critic network, its loss and update op.

    Notation:
        Symbolic tensors carry the prefix ``sy_`` to distinguish them from
        the numerical values computed later.

        ob  - observation
        ac  - action
        _no - intended shape (batch size /n/, observation dim)
        _na - intended shape (batch size /n/, action dim)
        _n  - intended shape (batch size /n/)

        The batch size /n/ is only known at runtime; until then that axis
        has shape ``None``.

    The critic prediction is regressed onto ``self.sy_target_n`` with a
    mean-squared-error loss that is minimized by Adam.
    """
    placeholders = self.define_placeholders()
    self.sy_ob_no, self.sy_ac_na, self.sy_adv_n = placeholders

    # Critic: a single-output MLP on the observations, with every size-1
    # dimension squeezed away.
    critic_out = build_mlp(
        self.sy_ob_no,
        1,
        "nn_critic",
        n_layers=self.n_layers,
        size=self.size,
    )
    self.critic_prediction = tf.squeeze(critic_out)

    # Regression targets for the critic, fed in at training time.
    self.sy_target_n = tf.placeholder(
        shape=[None], name="critic_target", dtype=tf.float32)

    self.critic_loss = tf.losses.mean_squared_error(
        self.sy_target_n, self.critic_prediction)

    critic_optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.critic_update_op = critic_optimizer.minimize(self.critic_loss)
def __init__(self, hparams):
    """Copy the hyperparameters onto the instance and build the critic net.

    Args:
        hparams: Mapping that must contain the keys ``'ob_dim'``,
            ``'ac_dim'``, ``'discrete'``, ``'size'``, ``'n_layers'``,
            ``'learning_rate'``, ``'num_target_updates'``,
            ``'num_grad_steps_per_target_update'`` and ``'gamma'``.
            Each value is stored as an attribute of the same name.
    """
    super().__init__()

    copied_keys = (
        'ob_dim',
        'ac_dim',
        'discrete',
        'size',
        'n_layers',
        'learning_rate',
        'num_target_updates',
        'num_grad_steps_per_target_update',
        'gamma',
    )
    for key in copied_keys:
        setattr(self, key, hparams[key])

    # Single-output network over observations of shape (ob_dim,).
    self.nn_critic = build_mlp(
        (hparams['ob_dim'], ),
        output_size=1,
        n_layers=hparams['n_layers'],
        size=hparams['size'],
        name='nn_critic',
    )
def __init__(self, env, agent_params, batch_size=500000, **kwargs):
    """Set up the actor, its optimizer, the replay buffer and the baseline.

    Args:
        env: Environment handle, stored as ``self.env``.
        agent_params: Mapping with the keys ``'gamma'``,
            ``'standardize_advantages'``, ``'nn_baseline'``,
            ``'reward_to_go'``, ``'discrete'``, ``'ac_dim'``, ``'ob_dim'``,
            ``'n_layers'``, ``'size'`` and ``'learning_rate'``.
        batch_size: The replay buffer is created with twice this value.
        **kwargs: Accepted but not used.

    ``self.baseline_model`` is ``None`` unless ``agent_params['nn_baseline']``
    is truthy; ``self.baseline_loss`` and ``self.baseline_optimizer`` are
    only created in that case.
    """
    super(PGAgent, self).__init__()

    self.env = env
    self.agent_params = agent_params
    params = self.agent_params

    self.gamma = params['gamma']
    self.standardize_advantages = params['standardize_advantages']
    self.nn_baseline = params['nn_baseline']
    self.reward_to_go = params['reward_to_go']

    # Actor/policy: both policy classes take the same positional arguments.
    policy_cls = (DiscreteMLPPolicy
                  if params['discrete'] else ContinuousMLPPolicy)
    self.actor = policy_cls(params['ac_dim'], params['ob_dim'],
                            params['n_layers'], params['size'])
    self.policy_optimizer = tf.keras.optimizers.Adam(
        learning_rate=params['learning_rate'])

    self.replay_buffer = ReplayBuffer(2 * batch_size)

    self.baseline_model = None
    if not params['nn_baseline']:
        return

    # Optional baseline: a single-output network compiled with an MSE loss
    # and its own Adam optimizer.
    self.baseline_model = build_mlp(
        (params['ob_dim'], ),
        output_size=1,
        n_layers=params['n_layers'],
        size=params['size'],
        name='baseline_model',
    )
    self.baseline_loss = tf.keras.losses.MeanSquaredError()
    self.baseline_optimizer = tf.keras.optimizers.Adam(
        learning_rate=params['learning_rate'])
    self.baseline_model.compile(optimizer=self.baseline_optimizer,
                                loss=self.baseline_loss)
def __init__(
        self,
        ac_dim,
        ob_dim,
        n_layers,
        size,
        discrete=False,  # unused for now
        nn_baseline=False,  # unused for now
        **kwargs):
    """Store the dimensions and create the mean network and noise layer.

    Args:
        ac_dim: Output size of the mean network and size given to the
            noise layer.
        ob_dim: Observation size; the network input shape is ``(ob_dim,)``.
        n_layers: Passed to ``build_mlp``.
        size: Passed to ``build_mlp``.
        discrete: Accepted for interface compatibility; currently unused.
        nn_baseline: Accepted for interface compatibility; currently unused.
        **kwargs: Accepted but not used; nothing is forwarded to the parent.
    """
    super().__init__()

    self.ac_dim, self.ob_dim = ac_dim, ob_dim
    self.n_layers, self.size = n_layers, size

    obs_shape = (self.ob_dim, )
    self.mean = build_mlp(
        obs_shape,
        output_size=self.ac_dim,
        n_layers=self.n_layers,
        size=self.size,
    )
    self.gauss_noise = GaussianNoise(self.ac_dim, name='noise')
def define_forward_pass(self):
    """Build the graph that predicts the next observation.

    Observations and actions are normalized with their mean/std
    placeholders, concatenated along axis 1 and passed through
    ``build_mlp``.  The network output (``self.delta_pred_normalized``) is
    the predicted change in state ``s' - s`` in normalized units; it is
    unnormalized with the delta statistics and added to the raw observation
    to give ``self.next_obs_pred``.
    """
    obs_unnormalized = self.obs_pl
    acs_unnormalized = self.acs_pl

    obs_normalized = normalize(
        obs_unnormalized, self.obs_mean_pl, self.obs_std_pl)
    acs_normalized = normalize(
        acs_unnormalized, self.acs_mean_pl, self.acs_std_pl)

    concatenated_input = tf.concat([obs_normalized, acs_normalized], axis=1)

    # The network works in normalized delta space.
    self.delta_pred_normalized = build_mlp(
        concatenated_input, self.ob_dim, self.scope, self.n_layers, self.size)
    self.delta_pred_unnormalized = unnormalize(
        self.delta_pred_normalized, self.delta_mean_pl, self.delta_std_pl)

    # next_obs_pred is in raw (unnormalized) units.
    self.next_obs_pred = obs_unnormalized + self.delta_pred_unnormalized


def define_train_op(self):
    """Define the scalar training loss and the Adam op that minimizes it.

    The loss is the mean squared error between the normalized delta labels
    and the normalized delta prediction, so both sides are in the same
    units.
    """
    self.delta_labels_normalized = normalize(
        self.delta_labels, self.delta_mean_pl, self.delta_std_pl)

    # Compare prediction and label in normalized delta space, and reduce to
    # a scalar so the optimizer has a single objective to minimize.
    self.loss = tf.reduce_mean(
        tf.square(self.delta_labels_normalized - self.delta_pred_normalized))
    self.train_op = tf.train.AdamOptimizer(
        self.learning_rate).minimize(self.loss)


def get_prediction(self, obs, acs, data_statistics):
    """Run the model and return the predicted next observation(s).

    Args:
        obs: Observation array; a 1-D input is treated as a single sample
            and gets a leading batch axis.
        acs: Action array matching ``obs``; it gets a leading batch axis
            whenever ``obs`` does.
        data_statistics: Mapping with the keys ``"obs_mean"``,
            ``"obs_std"``, ``"acs_mean"``, ``"acs_std"``, ``"delta_mean"``
            and ``"delta_std"``.

    Returns:
        The value of ``self.next_obs_pred`` for the batched inputs.
    """
    if len(obs.shape) > 1:
        observations, actions = obs, acs
    else:
        # Single sample: add the batch axis the placeholders expect.
        observations, actions = obs[None], acs[None]

    feed_dict = {
        self.obs_pl: observations,
        self.acs_pl: actions,
        self.obs_mean_pl: data_statistics["obs_mean"],
        self.acs_mean_pl: data_statistics["acs_mean"],
        self.obs_std_pl: data_statistics["obs_std"],
        self.acs_std_pl: data_statistics["acs_std"],
        self.delta_mean_pl: data_statistics["delta_mean"],
        self.delta_std_pl: data_statistics["delta_std"],
    }
    return self.sess.run(self.next_obs_pred, feed_dict=feed_dict)


def update(self, observations, actions, next_observations, data_statistics):
    """Run one training step and return the loss on this batch.

    Args:
        observations: Batch of current observations.
        actions: Batch of actions taken.
        next_observations: Batch of resulting observations.  Must support
            element-wise subtraction with ``observations`` (e.g. numpy
            arrays).
        data_statistics: Mapping with the keys ``"obs_mean"``,
            ``"obs_std"``, ``"acs_mean"``, ``"acs_std"``, ``"delta_mean"``
            and ``"delta_std"``.

    Returns:
        The value of ``self.loss`` evaluated on this batch.
    """
    feed_dict = {
        self.obs_pl: observations,
        self.acs_pl: actions,
        # The label is the change in state, not the next state itself.
        self.delta_labels: next_observations - observations,
        self.obs_mean_pl: data_statistics["obs_mean"],
        self.acs_mean_pl: data_statistics["acs_mean"],
        self.obs_std_pl: data_statistics["obs_std"],
        self.acs_std_pl: data_statistics["acs_std"],
        self.delta_mean_pl: data_statistics["delta_mean"],
        self.delta_std_pl: data_statistics["delta_std"],
    }
    _, loss = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
    return loss
def define_forward_pass(self):
    """Build the value network and store its output as ``self.values``.

    The network is created by ``build_mlp`` on ``self.observations_pl`` with
    a single output unit under the ``'value'`` scope.
    """
    value_out = build_mlp(
        self.observations_pl,
        output_size=1,
        scope='value',
        n_layers=self.n_layers,
        size=self.size,
    )
    self.values = value_out