# NOTE: helpers referenced below (disc_types, Scheduler, fc, find_trainable_variables,
# kfac, relu_layer, tanh_layer, linear, multionehot, tf, np, joblib) are assumed to be
# imported from the surrounding repo's utility modules; they are not defined in this excerpt.
class Discriminator(object):
    def __init__(self, sess, ob_spaces, ac_spaces, nstack, index,
                 disc_type='decentralized', hidden_size=128, lr_rate=0.01,
                 total_steps=50000, scope="discriminator", kfac_clip=0.001,
                 max_grad_norm=0.5):
        self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear')
        self.disc_type = disc_type
        if disc_type not in disc_types:
            assert False
        self.scope = scope
        self.index = index
        self.sess = sess
        ob_space = ob_spaces[index]
        ac_space = ac_spaces[index]
        self.ob_shape = ob_space.shape[0] * nstack
        nact = ac_space.n
        self.ac_shape = nact * nstack
        self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack
        self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack
        self.hidden_size = hidden_size

        if disc_type == 'decentralized':
            input_shape = self.ob_shape + self.ac_shape
        elif disc_type == 'centralized':
            input_shape = self.all_ob_shape + self.all_ac_shape
        elif disc_type == 'single':
            input_shape = self.all_ob_shape + self.all_ac_shape
        else:
            assert False

        self.g = tf.placeholder(tf.float32, (None, input_shape))
        self.e = tf.placeholder(tf.float32, (None, input_shape))
        self.lr_rate = tf.placeholder(tf.float32, ())
        self.adv = tf.placeholder(tf.float32, ())

        num_outputs = len(ob_spaces) if disc_type == 'centralized' else 1

        logits = self.build_graph(tf.concat([self.g, self.e], axis=0),
                                  num_outputs, reuse=False)
        labels = tf.concat([
            tf.ones([tf.shape(self.g)[0], 1]),
            -tf.ones([tf.shape(self.e)[0], 1])
        ], axis=0)

        g_logits = self.build_graph(self.g, num_outputs, reuse=True)
        e_logits = self.build_graph(self.e, num_outputs, reuse=True)
        # WGAN-style critic losses: push generator logits down, expert logits up.
        self.g_loss = tf.reduce_mean(g_logits)
        self.e_loss = tf.reduce_mean(-e_logits)
        # self.g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        #     logits=g_logits, labels=tf.zeros_like(g_logits)))
        # self.e_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        #     logits=e_logits, labels=tf.ones_like(e_logits)))
        self.total_loss = logits * labels
        # self.total_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        #     logits=logits, labels=labels))

        # Gradient penalty on random interpolates between generator and expert batches.
        epsilon = tf.random_uniform([], 0.0, 1.0)
        ge = self.g * epsilon + self.e * (1 - epsilon)
        gel = self.build_graph(ge, num_outputs, reuse=True)
        ddd = tf.gradients(gel, [ge])
        # tf.gradients returns a list; index it so the norm is the per-sample norm over
        # the feature axis (the original passed the list itself to tf.norm).
        ddd = tf.norm(ddd[0], axis=1)
        self.ddd = tf.reduce_mean(tf.square(ddd - 1.)) * 5

        sample_net = logits + tf.random_normal(tf.shape(logits))
        fisher_loss = -tf.reduce_mean(
            tf.pow(logits - tf.stop_gradient(sample_net), 2))

        self.reward_op = tf.sigmoid(g_logits)
        # self.reward_op = tf.nn.sigmoid_cross_entropy_with_logits(
        #     logits=g_logits, labels=tf.zeros_like(g_logits))

        self.var_list = self.get_trainable_variables()
        params = find_trainable_variables(self.scope)
        grads = tf.gradients(self.total_loss, params)
        # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(
        #     self.total_loss, var_list=self.var_list)
        with tf.variable_scope(self.scope + '/d_optim'):
            # `async` is a reserved keyword from Python 3.7 on; the repo's kfac
            # optimizer takes `async_var` (see the Model class below).
            d_optim = kfac.KfacOptimizer(learning_rate=self.lr_rate,
                                         clip_kl=kfac_clip,
                                         momentum=0.9, kfac_update=1,
                                         epsilon=0.01, stats_decay=0.99,
                                         async_var=0, cold_iter=10,
                                         max_grad_norm=max_grad_norm)
            update_stats_op = d_optim.compute_and_apply_stats(fisher_loss,
                                                              var_list=params)
            train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params)))
        self.q_runner = q_runner
        self.g_optim = tf.train.AdamOptimizer(learning_rate=0.0005).minimize(self.ddd)
        self.d_optim = train_op

        self.saver = tf.train.Saver(self.get_variables())
        self.params_flat = self.get_trainable_variables()

        # self.clip = [tf.assign(v, tf.clip_by_value(v, -0.05, 0.05))
        #              for v in self.get_trainable_variables()]
        # self.clip = tf.group(*self.clip)

    def build_graph(self, x, num_outputs=1, reuse=False):
        with tf.variable_scope(self.scope):
            if reuse:
                tf.get_variable_scope().reuse_variables()
            p_h1 = fc(x, 'fc1', nh=self.hidden_size)
            p_h2 = fc(p_h1, 'fc2', nh=self.hidden_size)
            logits = fc(p_h2, 'out', nh=num_outputs, act=lambda x: x)
        return logits

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_reward(self, obs, acs):
        if len(obs.shape) == 1:
            obs = np.expand_dims(obs, 0)
        if len(acs.shape) == 1:
            acs = np.expand_dims(acs, 0)
        feed_dict = {self.g: np.concatenate([obs, acs], axis=1)}
        return self.sess.run(self.reward_op, feed_dict)

    def train(self, g_obs, g_acs, e_obs, e_acs):
        feed_dict = {
            self.g: np.concatenate([g_obs, g_acs], axis=1),
            self.e: np.concatenate([e_obs, e_acs], axis=1),
            self.lr_rate: self.lr.value()
        }
        loss, _ = self.sess.run([self.total_loss, self.d_optim], feed_dict)
        # Extra gradient-penalty steps on the same batch.
        for _ in range(5):
            self.sess.run(self.g_optim, feed_dict)
        g_loss, e_loss = self.sess.run([self.g_loss, self.e_loss], feed_dict)
        return g_loss, e_loss, None, None

    def restore(self, path):
        print('restoring from:' + path)
        self.saver.restore(self.sess, path)

    def save(self, save_path):
        ps = self.sess.run(self.params_flat)
        joblib.dump(ps, save_path)

    def load(self, load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(self.params_flat, loaded_params):
            restores.append(p.assign(loaded_p))
        self.sess.run(restores)
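# A minimal, standalone sketch (numpy only) of how inputs to the decentralized
# discriminator above are laid out: each row is the agent's stacked observation
# concatenated with a one-hot encoding of its discrete action, giving width
# ob_shape + ac_shape. The `one_hot` helper here is a stand-in for the repo's
# `multionehot`; the names and sizes are illustrative assumptions, not the repo's API.
import numpy as np

def one_hot(actions, n_actions):
    # actions: (batch,) int array -> (batch, n_actions) float array
    out = np.zeros((actions.shape[0], n_actions), dtype=np.float32)
    out[np.arange(actions.shape[0]), actions] = 1.0
    return out

if __name__ == '__main__':
    ob_dim, n_act, batch = 10, 5, 4          # toy sizes, nstack assumed to be 1
    obs = np.random.randn(batch, ob_dim).astype(np.float32)
    acs = one_hot(np.random.randint(n_act, size=batch), n_act)
    disc_input = np.concatenate([obs, acs], axis=1)
    assert disc_input.shape == (batch, ob_dim + n_act)  # matches input_shape above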
class Discriminator(object):
    def __init__(self, sess, ob_spaces, ac_spaces, state_only, discount,
                 nstack, index, disc_type='decentralized', hidden_size=128,
                 lr_rate=0.01, total_steps=50000, scope="discriminator",
                 kfac_clip=0.001, max_grad_norm=0.5, l2_loss_ratio=0.01):
        self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear')
        self.disc_type = disc_type
        self.l2_loss_ratio = l2_loss_ratio
        if disc_type not in disc_types:
            assert False
        self.state_only = state_only
        self.gamma = discount
        self.scope = scope
        self.index = index
        self.sess = sess
        ob_space = ob_spaces[index]
        ac_space = ac_spaces[index]
        self.ob_shape = ob_space.shape[0] * nstack
        self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack
        # Discrete spaces expose `.n`; continuous (Box) spaces expose `.shape`.
        try:
            nact = ac_space.n
        except AttributeError:
            nact = ac_space.shape[0]
        self.ac_shape = nact * nstack
        try:
            self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack
        except AttributeError:
            self.all_ac_shape = sum([ac.shape[0] for ac in ac_spaces]) * nstack
        self.hidden_size = hidden_size

        if disc_type == 'decentralized':
            self.obs = tf.placeholder(tf.float32, (None, self.ob_shape))
            self.nobs = tf.placeholder(tf.float32, (None, self.ob_shape))
            self.act = tf.placeholder(tf.float32, (None, self.ac_shape))
            self.labels = tf.placeholder(tf.float32, (None, 1))
            self.lprobs = tf.placeholder(tf.float32, (None, 1))
        elif disc_type == 'decentralized-all':
            self.obs = tf.placeholder(tf.float32, (None, self.all_ob_shape))
            self.nobs = tf.placeholder(tf.float32, (None, self.all_ob_shape))
            self.act = tf.placeholder(tf.float32, (None, self.all_ac_shape))
            self.labels = tf.placeholder(tf.float32, (None, 1))
            self.lprobs = tf.placeholder(tf.float32, (None, 1))
        else:
            assert False

        self.lr_rate = tf.placeholder(tf.float32, ())

        with tf.variable_scope(self.scope):
            rew_input = self.obs
            if not self.state_only:
                rew_input = tf.concat([self.obs, self.act], axis=1)

            with tf.variable_scope('reward'):
                self.reward = self.relu_net(rew_input, dout=1)
                # self.reward = self.tanh_net(rew_input, dout=1)

            with tf.variable_scope('vfn'):
                self.value_fn_n = self.relu_net(self.nobs, dout=1)
                # self.value_fn_n = self.tanh_net(self.nobs, dout=1)
            with tf.variable_scope('vfn', reuse=True):
                self.value_fn = self.relu_net(self.obs, dout=1)
                # self.value_fn = self.tanh_net(self.obs, dout=1)

            # AIRL discriminator: D = exp(f) / (exp(f) + pi(a|s)), with
            # f = r(s,a) + gamma * V(s') - V(s) and lprobs = log pi(a|s).
            log_q_tau = self.lprobs
            log_p_tau = self.reward + self.gamma * self.value_fn_n - self.value_fn
            log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)
            self.discrim_output = tf.exp(log_p_tau - log_pq)

        self.total_loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) +
                                          (1 - self.labels) * (log_q_tau - log_pq))
        self.var_list = self.get_trainable_variables()
        params = find_trainable_variables(self.scope)
        self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in params]) * self.l2_loss_ratio
        self.total_loss += self.l2_loss

        grads = tf.gradients(self.total_loss, params)
        # fisher_loss = -self.total_loss
        # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(
        #     self.total_loss, var_list=self.var_list)
        with tf.variable_scope(self.scope + '/d_optim'):
            # d_optim = kfac.KfacOptimizer(
            #     learning_rate=self.lr_rate, clip_kl=kfac_clip,
            #     momentum=0.9, kfac_update=1, epsilon=0.01,
            #     stats_decay=0.99, async=0, cold_iter=10,
            #     max_grad_norm=max_grad_norm)
            # update_stats_op = d_optim.compute_and_apply_stats(fisher_loss, var_list=params)
            # train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params)))
            # self.q_runner = q_runner
            d_optim = tf.train.AdamOptimizer(learning_rate=self.lr_rate)
            train_op = d_optim.apply_gradients(list(zip(grads, params)))
        self.d_optim = train_op
        self.saver = tf.train.Saver(self.get_variables())
        self.params_flat = self.get_trainable_variables()

    def relu_net(self, x, layers=2, dout=1, hidden_size=128):
        out = x
        for i in range(layers):
            out = relu_layer(out, dout=hidden_size, name='l%d' % i)
        out = linear(out, dout=dout, name='lfinal')
        return out

    def tanh_net(self, x, layers=2, dout=1, hidden_size=128):
        out = x
        for i in range(layers):
            out = tanh_layer(out, dout=hidden_size, name='l%d' % i)
        out = linear(out, dout=dout, name='lfinal')
        return out

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_reward(self, obs, acs, obs_next, path_probs, discrim_score=False):
        if len(obs.shape) == 1:
            obs = np.expand_dims(obs, 0)
        if len(acs.shape) == 1:
            acs = np.expand_dims(acs, 0)
        if discrim_score:
            feed_dict = {self.obs: obs,
                         self.act: acs,
                         self.nobs: obs_next,
                         self.lprobs: path_probs}
            scores = self.sess.run(self.discrim_output, feed_dict)
            score = np.log(scores + 1e-20) - np.log(1 - scores + 1e-20)
        else:
            feed_dict = {self.obs: obs, self.act: acs}
            score = self.sess.run(self.reward, feed_dict)
        return score

    def train(self, g_obs, g_acs, g_nobs, g_probs, e_obs, e_acs, e_nobs, e_probs):
        labels = np.concatenate((np.zeros([g_obs.shape[0], 1]),
                                 np.ones([e_obs.shape[0], 1])), axis=0)
        feed_dict = {self.obs: np.concatenate([g_obs, e_obs], axis=0),
                     self.act: np.concatenate([g_acs, e_acs], axis=0),
                     self.nobs: np.concatenate([g_nobs, e_nobs], axis=0),
                     self.lprobs: np.concatenate([g_probs, e_probs], axis=0),
                     self.labels: labels,
                     self.lr_rate: self.lr.value()}
        loss, _ = self.sess.run([self.total_loss, self.d_optim], feed_dict)
        return loss

    def restore(self, path):
        print('restoring from:' + path)
        self.saver.restore(self.sess, path)

    def save(self, save_path):
        ps = self.sess.run(self.params_flat)
        joblib.dump(ps, save_path)

    def load(self, load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(self.params_flat, loaded_params):
            restores.append(p.assign(loaded_p))
        self.sess.run(restores)
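# A small numpy sketch (toy values only, no TF) of the identity used by
# get_reward(..., discrim_score=True) above: with D = exp(f) / (exp(f) + q) computed
# via logsumexp, log D - log(1 - D) recovers f - log q, i.e. the AIRL score
# r(s,a) + gamma*V(s') - V(s) - log pi(a|s).
import numpy as np

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    log_p_tau = rng.randn(6, 1)              # stands in for r + gamma*V(s') - V(s)
    log_q_tau = rng.randn(6, 1)              # stands in for log pi(a|s)
    log_pq = np.logaddexp(log_p_tau, log_q_tau)
    discrim_output = np.exp(log_p_tau - log_pq)
    score = np.log(discrim_output + 1e-20) - np.log(1 - discrim_output + 1e-20)
    assert np.allclose(score, log_p_tau - log_q_tau, atol=1e-5)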
# __init__ of a further Discriminator variant (GAIL-style sigmoid cross-entropy
# loss with an Adam update); the rest of this class is not shown in this excerpt.
def __init__(self, sess, ob_spaces, ac_spaces, nstack, index,
             disc_type='decentralized', hidden_size=128, lr_rate=0.01,
             total_steps=50000, scope="discriminator", kfac_clip=0.001,
             max_grad_norm=0.5):
    self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear')
    self.disc_type = disc_type
    if disc_type not in disc_types:
        assert False
    self.scope = scope
    self.index = index
    self.sess = sess
    ob_space = ob_spaces[index]
    ac_space = ac_spaces[index]
    self.ob_shape = ob_space.shape[0] * nstack
    try:
        nact = ac_space.n
    except AttributeError:
        nact = ac_space.shape[0]
    self.ac_shape = nact * nstack
    self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack
    try:
        self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack
    except AttributeError:
        self.all_ac_shape = sum([ac.shape[0] for ac in ac_spaces]) * nstack
    self.hidden_size = hidden_size

    if disc_type == 'decentralized':
        input_shape = self.ob_shape + self.all_ac_shape
    elif disc_type == 'decentralized-all':
        input_shape = self.all_ob_shape + self.all_ac_shape
    else:
        assert False

    self.g = tf.placeholder(tf.float32, (None, input_shape))
    self.e = tf.placeholder(tf.float32, (None, input_shape))
    self.lr_rate = tf.placeholder(tf.float32, ())
    self.adv = tf.placeholder(tf.float32, ())

    num_outputs = 1

    logits = self.build_graph(tf.concat([self.g, self.e], axis=0),
                              num_outputs, reuse=False)
    labels = tf.concat([
        tf.zeros([tf.shape(self.g)[0], 1]),
        tf.ones([tf.shape(self.e)[0], 1])
    ], axis=0)

    g_logits = self.build_graph(self.g, num_outputs, reuse=True)
    e_logits = self.build_graph(self.e, num_outputs, reuse=True)
    self.g_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=g_logits, labels=tf.zeros_like(g_logits)))
    self.e_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=e_logits, labels=tf.ones_like(e_logits)))
    self.total_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
    fisher_loss = -self.total_loss

    # self.reward_op = tf.sigmoid(g_logits) * 2.0 - 1
    # self.reward_op = tf.log(tf.sigmoid(g_logits) + 1e-10)
    # self.reward_op = tf.nn.sigmoid_cross_entropy_with_logits(
    #     logits=g_logits, labels=tf.zeros_like(g_logits))
    # log D - log(1 - D); up to the 1e-10 clamp this equals the raw logit.
    self.reward_op = tf.log(tf.sigmoid(g_logits) + 1e-10) - \
        tf.log(1 - tf.sigmoid(g_logits) + 1e-10)

    self.var_list = self.get_trainable_variables()
    params = find_trainable_variables(self.scope)
    grads = tf.gradients(self.total_loss, params)
    # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(
    #     self.total_loss, var_list=self.var_list)
    with tf.variable_scope(self.scope + '/d_optim'):
        # d_optim = kfac.KfacOptimizer(
        #     learning_rate=self.lr_rate, clip_kl=kfac_clip,
        #     momentum=0.9, kfac_update=1, epsilon=0.01,
        #     stats_decay=0.99, async=0, cold_iter=10,
        #     max_grad_norm=max_grad_norm)
        # update_stats_op = d_optim.compute_and_apply_stats(fisher_loss, var_list=params)
        # train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params)))
        # self.q_runner = q_runner
        d_optim = tf.train.AdamOptimizer(learning_rate=self.lr_rate)
        train_op = d_optim.apply_gradients(list(zip(grads, params)))
    self.d_optim = train_op
    self.saver = tf.train.Saver(self.get_variables())
    self.params_flat = self.get_trainable_variables()
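# A quick numpy check (toy values, no TF) of the reward used above:
# log(sigmoid(l)) - log(1 - sigmoid(l)) collapses to the raw logit l, so this
# reward is simply the discriminator logit up to the 1e-10 clamp.
import numpy as np

if __name__ == '__main__':
    logits = np.linspace(-5.0, 5.0, 11)
    sig = 1.0 / (1.0 + np.exp(-logits))
    reward = np.log(sig + 1e-10) - np.log(1.0 - sig + 1e-10)
    assert np.allclose(reward, logits, atol=1e-6)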
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps,
             nprocs=2, nsteps=200, nstack=1, ent_coef=0.00, vf_coef=0.5,
             vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
             kfac_clip=0.001, lrschedule='linear', identical=None):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=nprocs,
                            inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nbatch = nenvs * nsteps
    self.num_agents = num_agents = len(ob_space)
    self.n_actions = [ac_space[k].n for k in range(self.num_agents)]
    if identical is None:
        identical = [False for _ in range(self.num_agents)]

    scale = [1 for _ in range(num_agents)]
    pointer = [i for i in range(num_agents)]
    h = 0
    for k in range(num_agents):
        if identical[k]:
            scale[h] += 1
        else:
            pointer[h] = k
            h = k
    pointer[h] = num_agents

    A, ADV, R, PG_LR = [], [], [], []
    for k in range(num_agents):
        if identical[k]:
            A.append(A[-1])
            ADV.append(ADV[-1])
            R.append(R[-1])
            PG_LR.append(PG_LR[-1])
        else:
            A.append(tf.placeholder(tf.int32, [nbatch * scale[k]]))
            ADV.append(tf.placeholder(tf.float32, [nbatch * scale[k]]))
            R.append(tf.placeholder(tf.float32, [nbatch * scale[k]]))
            PG_LR.append(tf.placeholder(tf.float32, []))

    pg_loss, entropy, vf_loss, train_loss = [], [], [], []
    self.model = step_model = []
    self.model2 = train_model = []
    self.pg_fisher = pg_fisher_loss = []
    self.logits = logits = []
    sample_net = []
    self.vf_fisher = vf_fisher_loss = []
    self.joint_fisher = joint_fisher_loss = []
    self.lld = lld = []
    self.log_pac = []

    for k in range(num_agents):
        if identical[k]:
            step_model.append(step_model[-1])
            train_model.append(train_model[-1])
        else:
            step_model.append(policy(sess, ob_space[k], ac_space[k], ob_space,
                                     ac_space, nenvs, 1, nstack, reuse=False,
                                     name='%d' % k))
            train_model.append(policy(sess, ob_space[k], ac_space[k], ob_space,
                                      ac_space, nenvs * scale[k], nsteps, nstack,
                                      reuse=True, name='%d' % k))

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model[k].pi, labels=A[k])
        self.log_pac.append(-logpac)

        lld.append(tf.reduce_mean(logpac))
        logits.append(train_model[k].pi)

        pg_loss.append(tf.reduce_mean(ADV[k] * logpac))
        entropy.append(tf.reduce_mean(cat_entropy(train_model[k].pi)))
        pg_loss[k] = pg_loss[k] - ent_coef * entropy[k]
        vf_loss.append(tf.reduce_mean(mse(tf.squeeze(train_model[k].vf), R[k])))
        train_loss.append(pg_loss[k] + vf_coef * vf_loss[k])

        pg_fisher_loss.append(-tf.reduce_mean(logpac))
        sample_net.append(train_model[k].vf +
                          tf.random_normal(tf.shape(train_model[k].vf)))
        vf_fisher_loss.append(-vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model[k].vf - tf.stop_gradient(sample_net[k]), 2)))
        joint_fisher_loss.append(pg_fisher_loss[k] + vf_fisher_loss[k])

    self.policy_params = []
    self.value_params = []

    for k in range(num_agents):
        if identical[k]:
            self.policy_params.append(self.policy_params[-1])
            self.value_params.append(self.value_params[-1])
        else:
            self.policy_params.append(find_trainable_variables("policy_%d" % k))
            self.value_params.append(find_trainable_variables("value_%d" % k))
    self.params = params = [
        a + b for a, b in zip(self.policy_params, self.value_params)
    ]
    params_flat = []
    for k in range(num_agents):
        params_flat.extend(params[k])

    self.grads_check = grads = [
        tf.gradients(train_loss[k], params[k]) for k in range(num_agents)
    ]
    clone_grads = [
        tf.gradients(lld[k], params[k]) for k in range(num_agents)
    ]

    self.optim = optim = []
    self.clones = clones = []
    update_stats_op = []
    train_op, clone_op, q_runner = [], [], []

    for k in range(num_agents):
        if identical[k]:
            optim.append(optim[-1])
            train_op.append(train_op[-1])
            q_runner.append(q_runner[-1])
            clones.append(clones[-1])
            clone_op.append(clone_op[-1])
        else:
            with tf.variable_scope('optim_%d' % k):
                optim.append(kfac.KfacOptimizer(
                    learning_rate=PG_LR[k], clip_kl=kfac_clip,
                    momentum=0.9, kfac_update=1, epsilon=0.01,
                    stats_decay=0.99, async_var=0, cold_iter=10,
                    max_grad_norm=max_grad_norm))
                update_stats_op.append(optim[k].compute_and_apply_stats(
                    joint_fisher_loss, var_list=params[k]))
                train_op_, q_runner_ = optim[k].apply_gradients(
                    list(zip(grads[k], params[k])))
                train_op.append(train_op_)
                q_runner.append(q_runner_)

            with tf.variable_scope('clone_%d' % k):
                clones.append(kfac.KfacOptimizer(
                    learning_rate=PG_LR[k], clip_kl=kfac_clip,
                    momentum=0.9, kfac_update=1, epsilon=0.01,
                    stats_decay=0.99, async_var=0, cold_iter=10,
                    max_grad_norm=max_grad_norm))
                update_stats_op.append(clones[k].compute_and_apply_stats(
                    pg_fisher_loss[k], var_list=self.policy_params[k]))
                clone_op_, q_runner_ = clones[k].apply_gradients(
                    list(zip(clone_grads[k], self.policy_params[k])))
                clone_op.append(clone_op_)

    update_stats_op = tf.group(*update_stats_op)
    train_ops = train_op
    clone_ops = clone_op
    train_op = tf.group(*train_op)
    clone_op = tf.group(*clone_op)

    self.q_runner = q_runner
    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
    self.clone_lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = [rewards[k] - values[k] for k in range(num_agents)]
        for step in range(len(obs)):
            cur_lr = self.lr.value()

        ob = np.concatenate(obs, axis=1)

        td_map = {}
        for k in range(num_agents):
            if identical[k]:
                continue
            new_map = {}
            if num_agents > 1:
                action_v = []
                for j in range(k, pointer[k]):
                    action_v.append(np.concatenate([
                        multionehot(actions[i], self.n_actions[i])
                        for i in range(num_agents) if i != k
                    ], axis=1))
                action_v = np.concatenate(action_v, axis=0)
                new_map.update({train_model[k].A_v: action_v})
                td_map.update({train_model[k].A_v: action_v})

            new_map.update({
                train_model[k].X: np.concatenate(
                    [obs[j] for j in range(k, pointer[k])], axis=0),
                train_model[k].X_v: np.concatenate(
                    [ob.copy() for j in range(k, pointer[k])], axis=0),
                A[k]: np.concatenate(
                    [actions[j] for j in range(k, pointer[k])], axis=0),
                ADV[k]: np.concatenate(
                    [advs[j] for j in range(k, pointer[k])], axis=0),
                R[k]: np.concatenate(
                    [rewards[j] for j in range(k, pointer[k])], axis=0),
                PG_LR[k]: cur_lr / float(scale[k])
            })
            sess.run(train_ops[k], feed_dict=new_map)
            td_map.update(new_map)

            if states[k] != []:
                td_map[train_model[k].S] = states
                td_map[train_model[k].M] = masks

        policy_loss, value_loss, policy_entropy = sess.run(
            [pg_loss, vf_loss, entropy], td_map)
        return policy_loss, value_loss, policy_entropy

    def clone(obs, actions):
        td_map = {}
        cur_lr = self.clone_lr.value()
        for k in range(num_agents):
            if identical[k]:
                continue
            new_map = {}
            new_map.update({
                train_model[k].X: np.concatenate(
                    [obs[j] for j in range(k, pointer[k])], axis=0),
                A[k]: np.concatenate(
                    [actions[j] for j in range(k, pointer[k])], axis=0),
                PG_LR[k]: cur_lr / float(scale[k])
            })
            sess.run(clone_ops[k], feed_dict=new_map)
            td_map.update(new_map)
        lld_loss = sess.run([lld], td_map)
        return lld_loss

    def get_log_action_prob(obs, actions):
        action_prob = []
        for k in range(num_agents):
            if identical[k]:
                continue
            new_map = {
                train_model[k].X: np.concatenate(
                    [obs[j] for j in range(k, pointer[k])], axis=0),
                A[k]: np.concatenate(
                    [actions[j] for j in range(k, pointer[k])], axis=0)
            }
            log_pac = sess.run(self.log_pac[k], feed_dict=new_map)
            if scale[k] == 1:
                action_prob.append(log_pac)
            else:
                log_pac = np.split(log_pac, scale[k], axis=0)
                action_prob += log_pac
        return action_prob

    self.get_log_action_prob = get_log_action_prob

    def get_log_action_prob_step(obs, actions):
        action_prob = []
        for k in range(num_agents):
            action_prob.append(step_model[k].step_log_prob(obs[k], actions[k]))
        return action_prob

    self.get_log_action_prob_step = get_log_action_prob_step

    def save(save_path):
        ps = sess.run(params_flat)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params_flat, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)

    self.train = train
    self.clone = clone
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model

    def step(ob, av, *_args, **_kwargs):
        a, v, s = [], [], []
        obs = np.concatenate(ob, axis=1)
        for k in range(num_agents):
            a_v = np.concatenate([
                multionehot(av[i], self.n_actions[i])
                for i in range(num_agents) if i != k
            ], axis=1)
            a_, v_, s_ = step_model[k].step(ob[k], obs, a_v)
            a.append(a_)
            v.append(v_)
            s.append(s_)
        return a, v, s

    self.step = step

    def value(obs, av):
        v = []
        ob = np.concatenate(obs, axis=1)
        for k in range(num_agents):
            a_v = np.concatenate([
                multionehot(av[i], self.n_actions[i])
                for i in range(num_agents) if i != k
            ], axis=1)
            v_ = step_model[k].value(ob, a_v)
            v.append(v_)
        return v

    self.value = value
    self.initial_state = [step_model[k].initial_state for k in range(num_agents)]
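# A standalone sketch (plain Python, illustrative values) of the `identical` /
# `scale` / `pointer` bookkeeping used in the model __init__ above: agents marked
# identical to their predecessor share placeholders and parameters, the lead agent's
# `scale` counts its group size, and `pointer[k]` marks one past the last agent in
# k's group, so the training loops slice with range(k, pointer[k]).
if __name__ == '__main__':
    identical = [False, True, True, False]   # toy flags for 4 agents
    num_agents = len(identical)
    scale = [1 for _ in range(num_agents)]
    pointer = [i for i in range(num_agents)]
    h = 0
    for k in range(num_agents):
        if identical[k]:
            scale[h] += 1
        else:
            pointer[h] = k
            h = k
    pointer[h] = num_agents
    assert scale == [3, 1, 1, 1]
    assert pointer == [3, 1, 2, 4]
    # agent 0 leads agents 0..2 (range(0, pointer[0])); agent 3 leads only itself.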
class Discriminator(object):
    def __init__(self, sess, ob_spaces, ac_spaces, nstack, index,
                 disc_type='decentralized', hidden_size=128, gp_coef=5,
                 lr_rate=5e-4, total_steps=50000, scope="discriminator"):
        self.lr = Scheduler(v=lr_rate, nvalues=total_steps * 20, schedule='linear')
        self.disc_type = disc_type
        if disc_type not in disc_types:
            assert False
        self.scope = scope
        self.index = index
        self.sess = sess
        ob_space = ob_spaces[index]
        ac_space = ac_spaces[index]
        self.ob_shape = ob_space.shape[0] * nstack
        nact = ac_space.n
        self.ac_shape = nact * nstack
        self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack
        self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack
        self.hidden_size = hidden_size

        if disc_type == 'decentralized':
            input_shape = self.all_ob_shape + self.ac_shape
        elif disc_type == 'centralized':
            input_shape = self.all_ob_shape + self.all_ac_shape
        elif disc_type == 'single':
            input_shape = self.all_ob_shape + self.all_ac_shape
        else:
            assert False

        self.g = tf.placeholder(tf.float32, (None, input_shape))
        self.e = tf.placeholder(tf.float32, (None, input_shape))
        self.lr_rate = tf.placeholder(tf.float32, ())

        num_outputs = len(ob_spaces) if disc_type == 'centralized' else 1

        self.bias = tf.get_variable(name=scope + '_bias', shape=(num_outputs,),
                                    initializer=tf.zeros_initializer,
                                    trainable=False)
        self.bias_ph = tf.placeholder(tf.float32, (num_outputs,))
        self.update_bias = tf.assign(self.bias, self.bias_ph * 0.01 + self.bias * 0.99)

        generator_logits = self.build_graph(self.g, num_outputs, reuse=False)
        expert_logits = self.build_graph(self.e, num_outputs, reuse=True)

        self.generator_loss = tf.reduce_mean(generator_logits, axis=0)
        self.expert_loss = tf.reduce_mean(expert_logits, axis=0)

        ddg = tf.gradients(generator_logits, [self.g])
        ddg = tf.sqrt(tf.reduce_sum(tf.square(ddg[0]), axis=1))
        self.ddg = tf.reduce_mean(tf.square(ddg - 1.))

        dde = tf.gradients(expert_logits, [self.e])
        dde = tf.sqrt(tf.reduce_sum(tf.square(dde[0]), axis=1))
        self.dde = tf.reduce_mean(tf.square(dde - 1.))

        # Gradient penalty on random interpolates between generator and expert batches.
        epsilon = tf.random_uniform([], 0.0, 1.0)
        ge = self.g * epsilon + self.e * (1 - epsilon)
        gel = self.build_graph(ge, num_outputs, reuse=True)
        ddd = tf.gradients(gel, [ge])
        # Index the gradient list so the per-sample norm is over the feature axis,
        # matching the ddg/dde computations above.
        ddd = tf.norm(ddd[0], axis=1)
        self.ddd = tf.reduce_mean(tf.square(ddd - 1.))

        self.total_loss = self.generator_loss - self.expert_loss + \
            gp_coef * self.ddd  # (self.ddg + self.dde)
        self.reward_op = generator_logits

        self.var_list = self.get_trainable_variables()
        self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(
            self.total_loss, var_list=self.var_list)
        self.saver = tf.train.Saver(self.get_variables())

    def build_graph(self, x, num_outputs=1, reuse=False):
        with tf.variable_scope(self.scope):
            if reuse:
                tf.get_variable_scope().reuse_variables()
            p_h1 = fc(x, 'fc1', nh=self.hidden_size)
            p_h2 = fc(p_h1, 'fc2', nh=self.hidden_size)
            p_h3 = fc(p_h2, 'fc3', nh=self.hidden_size)
            logits = fc(p_h3, 'out', nh=num_outputs, act=lambda x: x)
            logits -= self.bias
        return logits

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_reward(self, all_obs, acs):
        if len(all_obs.shape) == 1:
            all_obs = np.expand_dims(all_obs, 0)
        if len(acs.shape) == 1:
            acs = np.expand_dims(acs, 0)
        feed_dict = {self.g: np.concatenate([all_obs, acs], axis=1)}
        return self.sess.run(self.reward_op, feed_dict)

    def train(self, g_all_obs, g_acs, e_all_obs, e_acs):
        feed_dict = {
            self.g: np.concatenate([g_all_obs, g_acs], axis=1),
            self.e: np.concatenate([e_all_obs, e_acs], axis=1),
            self.lr_rate: self.lr.value()
        }
        gl, el, _ = self.sess.run(
            [self.generator_loss, self.expert_loss, self.d_optim], feed_dict)
        # self.sess.run(self.update_bias, feed_dict={self.bias_ph: (gl + el) / 2.0})
        return self.sess.run(
            [self.generator_loss, self.expert_loss, self.ddg, self.dde],
            feed_dict)

    def restore(self, path):
        print('restoring from:' + path)
        self.saver.restore(self.sess, path)
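# A standalone numpy sketch (toy linear critic, illustrative only) of the gradient
# penalty term built above: points are interpolated between the generator and expert
# batches and the critic's input-gradient norm is pushed toward 1. For a linear
# critic f(x) = x @ w the gradient is w everywhere, so the penalty is (||w|| - 1)^2.
import numpy as np

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    batch, dim = 8, 6
    g = rng.randn(batch, dim)                # generator batch
    e = rng.randn(batch, dim)                # expert batch
    w = rng.randn(dim)                       # weights of a toy linear critic
    epsilon = rng.uniform(0.0, 1.0)          # single shared mixing coefficient
    ge = g * epsilon + e * (1 - epsilon)     # interpolated batch, as in the graph above
    gel = ge @ w                             # critic output on the interpolates
    assert gel.shape == (batch,)
    grad = np.tile(w, (batch, 1))            # d(gel)/d(ge) = w for every row
    per_sample_norm = np.linalg.norm(grad, axis=1)
    penalty = np.mean(np.square(per_sample_norm - 1.0))
    assert np.isclose(penalty, (np.linalg.norm(w) - 1.0) ** 2)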