class ThreadTrainer(Thread):
    def __init__(self, server, id):
        super(ThreadTrainer, self).__init__()
        self.setDaemon(True)
        self.id = id
        self.server = server
        self.exit_flag = False
        self.rm = ReplayMemory(
            Config.TRAINING_REPLAY_MEMORY_SIZE,
            (Config.IMAGE_HEIGHT, Config.IMAGE_WIDTH, Config.STACKED_FRAMES),
            (24),
            dtype=np.float32)

    def run(self):
        while not self.exit_flag:
            # Consume batches of transitions produced by the worker processes.
            o_, r_, a_, n_, d_ = self.server.training_q.get()
            for i in range(o_.shape[0]):
                self.rm.enqueue(o_[i, ...], a_[i, ...], r_[i], n_[i, ...], d_[i])
            # Train every 20 insertions once the memory holds enough samples.
            if Config.TRAIN_MODELS and self.rm.i % 20 == 0 and self.rm.n > 200:
                o__, a__, r__, n__, d__ = self.rm.minibatch(
                    size=Config.TRAINING_MIN_BATCH_SIZE)
                self.server.train_model(o__, r__, a__, n__, d__, self.id)
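# The ReplayMemory consumed above is not shown in this section. The class below is a
# minimal, hypothetical ring-buffer sketch matching only how ThreadTrainer uses it:
# enqueue(obs, act, rew, next_obs, done), an insertion index `i`, a fill counter `n`,
# and uniform minibatch sampling. The name SimpleReplayMemory and all internals are
# assumptions; the real implementation may differ.
import numpy as np

class SimpleReplayMemory(object):
    def __init__(self, size, obs_shape, act_shape, dtype=np.float32):
        self.size = size
        self.obs = np.zeros((size,) + tuple(np.atleast_1d(obs_shape)), dtype=dtype)
        self.act = np.zeros((size,) + tuple(np.atleast_1d(act_shape)), dtype=dtype)
        self.rew = np.zeros(size, dtype=dtype)
        self.obs2 = np.zeros_like(self.obs)
        self.done = np.zeros(size, dtype=np.bool_)
        self.i = 0   # next write position (wraps around)
        self.n = 0   # number of valid entries

    def enqueue(self, obs, act, rew, obs2, done):
        self.obs[self.i], self.act[self.i] = obs, act
        self.rew[self.i], self.obs2[self.i], self.done[self.i] = rew, obs2, done
        self.i = (self.i + 1) % self.size
        self.n = min(self.n + 1, self.size)

    def minibatch(self, size):
        idx = np.random.randint(0, self.n, size)
        return (self.obs[idx], self.act[idx], self.rew[idx],
                self.obs2[idx], self.done[idx])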
def test_replay_memory():
    from replay_memory import ReplayMemory
    s = 100
    rm = ReplayMemory(s, 1, 1)
    for i in range(s):
        rm.enqueue(i, i % 3 == 0, i, i, i)
    for i in range(1000):
        o, a, r, o2, t2, info = rm.minibatch(10)
        assert all(o == o2 - 1), "error: o and o2"
        assert all(o != s - 1), "error: o wrap over rm. o = " + str(o)
        assert all(o2 != 0), "error: o2 wrap over rm"
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) self.dimA = dimA[0] self.dimO = dimO[0] tau = FLAGS.tau discount = FLAGS.discount l2norm = FLAGS.l2norm learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma nets = icnn_nets_dm if FLAGS.icnn_opt == 'adam': self.opt = self.adam elif FLAGS.icnn_opt == 'bundle_entropy': self.opt = self.bundle_entropy else: raise RuntimeError("Unrecognized ICNN optimizer: " + FLAGS.icnn_opt) def entropy(x): #the real concave entropy function x_move_reg = tf.clip_by_value((x + 1) / 2, 0.0001, 0.9999) pen = x_move_reg * tf.log(x_move_reg) + ( 1 - x_move_reg) * tf.log(1 - x_move_reg) return -tf.reduce_sum(pen, 1) # init replay memory self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) # start tf session self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions( per_process_gpu_memory_fraction=0.1))) # create tf computational graph self.theta = nets.theta(dimO[0], dimA[0], FLAGS.l1size, FLAGS.l2size, 'theta') self.theta_t, update_t = exponential_moving_averages(self.theta, tau) obs = tf.placeholder(tf.float32, [1] + dimO, "obs") act_test = tf.placeholder(tf.float32, [1] + dimA, "act") # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init, name="noise", trainable=False) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((outheta) * noise_var - \ tf.random_normal(dimA, stddev=ousigma)) act_expl = act_test + noise # test, single sample q function & gradient for bundle method q_test_opt, _, _, _, _ = nets.qfunction(obs, act_test, self.theta, False, False) loss_test = -q_test_opt act_test_grad = tf.gradients(loss_test, act_test)[0] loss_test_entr = -q_test_opt - entropy(act_test) act_test_grad_entr = tf.gradients(loss_test_entr, act_test)[0] # batched q function & gradient for bundle method obs_train2_opt = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs_train2_opt") act_train2_opt = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train2_opt") q_train2_opt, _, _, _, _ = nets.qfunction(obs_train2_opt, act_train2_opt, self.theta_t, True, False) loss_train2 = -q_train2_opt act_train2_grad = tf.gradients(loss_train2, act_train2_opt)[0] loss_train2_entr = -q_train2_opt - entropy(act_train2_opt) act_train2_grad_entr = tf.gradients(loss_train2_entr, act_train2_opt)[0] # training obs_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs_train") act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs_train2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs_train2") act_train2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") q_train, q_train_z1, q_train_z2, q_train_u1, q_train_u2 = nets.qfunction( obs_train, act_train, self.theta, True, True) q_train_entropy = q_train + entropy(act_train) q_train2, _, _, _, _ = nets.qfunction(obs_train2, act_train2, self.theta_t, True, True) q_train2_entropy = q_train2 + entropy(act_train2) # q loss if FLAGS.icnn_opt == 'adam': q_target = tf.select(term2, rew, rew + discount * q_train2) q_target = tf.maximum(q_train - 1., q_target) q_target = tf.minimum(q_train + 1., q_target) q_target = tf.stop_gradient(q_target) td_error = q_train - q_target elif FLAGS.icnn_opt == 'bundle_entropy': q_target = tf.select(term2, rew, rew + discount * q_train2_entropy) q_target = 
tf.maximum(q_train_entropy - 1., q_target) q_target = tf.minimum(q_train_entropy + 1., q_target) q_target = tf.stop_gradient(q_target) td_error = q_train_entropy - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) theta = self.theta # TODO: Replace with something cleaner, this could easily stop working # if the variable names change. wd_q = tf.add_n([ l2norm * tf.nn.l2_loss(var) if var.name[6] == 'W' else 0. for var in theta ]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate) grads_and_vars_q = optim_q.compute_gradients(loss_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_t) summary_writer = tf.train.SummaryWriter( os.path.join(FLAGS.outdir, 'board'), self.sess.graph) summary_list = [] if FLAGS.icnn_opt == 'adam': summary_list.append( tf.scalar_summary('Qvalue', tf.reduce_mean(q_train))) elif FLAGS.icnn_opt == 'bundle_entropy': summary_list.append( tf.scalar_summary('Qvalue', tf.reduce_mean(q_train_entropy))) summary_list.append(tf.scalar_summary('loss', ms_td_error)) summary_list.append(tf.scalar_summary('reward', tf.reduce_mean(rew))) summary_list.append( tf.scalar_summary('cvx_z1', tf.reduce_mean(q_train_z1))) summary_list.append( tf.scalar_summary('cvx_z2', tf.reduce_mean(q_train_z2))) summary_list.append( tf.scalar_summary('cvx_z1_pos', tf.reduce_mean(tf.to_float(q_train_z1 > 1e-15)))) summary_list.append( tf.scalar_summary('cvx_z2_pos', tf.reduce_mean(tf.to_float(q_train_z2 > 1e-15)))) summary_list.append( tf.scalar_summary('noncvx_u1', tf.reduce_mean(q_train_u1))) summary_list.append( tf.scalar_summary('noncvx_u2', tf.reduce_mean(q_train_u2))) summary_list.append( tf.scalar_summary('noncvx_u1_pos', tf.reduce_mean(tf.to_float(q_train_u1 > 1e-15)))) summary_list.append( tf.scalar_summary('noncvx_u2_pos', tf.reduce_mean(tf.to_float(q_train_u2 > 1e-15)))) # tf functions with self.sess.as_default(): self._reset = Fun([], self.ou_reset) self._act_expl = Fun(act_test, act_expl) self._train = Fun( [obs_train, act_train, rew, obs_train2, act_train2, term2], [train_q, loss_q], summary_list, summary_writer) self._opt_test = Fun([obs, act_test], [loss_test, act_test_grad]) self._opt_train = Fun([obs_train2_opt, act_train2_opt], [loss_train2, act_train2_grad]) self._opt_test_entr = Fun([obs, act_test], [loss_test_entr, act_test_grad_entr]) self._opt_train_entr = Fun( [obs_train2_opt, act_train2_opt], [loss_train2_entr, act_train2_grad_entr]) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def bundle_entropy(self, func, obs): act = np.ones((obs.shape[0], self.dimA)) * 0.5 def fg(x): value, grad = func(obs, 2 * x - 1) grad *= 2 return value, grad act = bundle_entropy.solveBatch(fg, act)[0] act = 2 * act - 1 return act def adam(self, func, obs): b1 = 0.9 b2 = 0.999 lam = 0.5 eps = 1e-8 alpha = 0.01 nBatch = obs.shape[0] act = np.zeros((nBatch, self.dimA)) m = np.zeros_like(act) v = np.zeros_like(act) b1t, b2t = 1., 1. act_best, a_diff, f_best = [None] * 3 for i in range(10000): f, g = func(obs, act) if i == 0: act_best = act.copy() f_best = f.copy() else: I = (f < f_best) act_best[I] = act[I] f_best[I] = f[I] m = b1 * m + (1. - b1) * g v = b2 * v + (1. 
- b2) * (g * g) b1t *= b1 b2t *= b2 mhat = m / (1. - b1t) vhat = v / (1. - b2t) prev_act = act.copy() act -= alpha * mhat / (np.sqrt(v) + eps) act = np.clip(act, -1, 1) a_diff_i = np.mean(np.linalg.norm(act - prev_act, axis=1)) a_diff = a_diff_i if a_diff is None else lam * a_diff + ( 1. - lam) * a_diff_i # print(a_diff_i, a_diff, np.sum(f)) if a_diff_i == 0 or a_diff < 1e-2: print(' + ADAM took {} iterations'.format(i)) return act_best print(' + Warning: ADAM did not converge.') return act_best def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): print('--- Selecting action, test={}'.format(test)) obs = np.expand_dims(self.observation, axis=0) if FLAGS.icnn_opt == 'adam': # f = self._opt_test_entr f = self._opt_test elif FLAGS.icnn_opt == 'bundle_entropy': f = self._opt_test else: raise RuntimeError("Unrecognized ICNN optimizer: " + FLAGS.icnn_opt) act = self.opt(f, obs) action = act if test else self._act_expl(act) action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze( action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in xrange(FLAGS.iter): loss = self.train() def train(self): obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) if FLAGS.icnn_opt == 'adam': # f = self._opt_train_entr f = self._opt_train elif FLAGS.icnn_opt == 'bundle_entropy': f = self._opt_train else: raise RuntimeError("Unrecognized ICNN optimizer: " + FLAGS.icnn_opt) print('--- Optimizing for training') act2 = self.opt(f, ob2) _, loss = self._train(obs, act, rew, ob2, act2, term2, log=FLAGS.summary, global_step=self.t) return loss def __del__(self): self.sess.close()
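# For reference, the entropy(x) regularizer used by this agent is the binary entropy
# of each action dimension after rescaling from [-1, 1] to [0, 1], summed over
# dimensions. A NumPy sketch of the same quantity (the helper name action_entropy is
# mine; the clipping mirrors the 0.0001 / 0.9999 bounds in the TF version):
import numpy as np

def action_entropy(x, eps=1e-4):
    p = np.clip((x + 1.0) / 2.0, eps, 1.0 - eps)
    return -np.sum(p * np.log(p) + (1.0 - p) * np.log(1.0 - p), axis=1)

# Entropy is largest at x = 0 (p = 0.5) and shrinks toward the action-box corners:
x = np.array([[0.0, 0.0], [0.9, -0.9]])
print(action_entropy(x))  # approximately [1.386, 0.397]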
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) tau = FLAGS.tau discount = FLAGS.discount l2norm = FLAGS.l2norm learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma nets = naf_nets_dm # init replay memory self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) # start tf session self.sess = tf.Session(config=tf.ConfigProto( inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions(allow_growth=True))) is_training = tf.placeholder(tf.bool) # create tf computational graph self.theta_L = nets.theta(dimO[0], dimA[0] * dimA[0], FLAGS.l1size, FLAGS.l2size, 'theta_L') self.theta_U = nets.theta(dimO[0], dimA[0], FLAGS.l1size, FLAGS.l2size, 'theta_U') self.theta_V = nets.theta(dimO[0], 1, FLAGS.l1size, FLAGS.l2size, 'theta_V') self.theta_Vt, update_Vt = exponential_moving_averages(self.theta_V, tau) obs_single = tf.placeholder(tf.float32, [1] + dimO, "obs-single") act_test = nets.ufunction(obs_single, self.theta_U, False, is_training) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init, name="noise", trainable=False) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((outheta) * noise_var - tf.random_normal(dimA, stddev=ousigma)) act_expl = act_test + noise # training obs_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, 'obs_train') act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") # q lmat = nets.lfunction(obs_train, self.theta_L, False, is_training) uvalue = nets.ufunction(obs_train, self.theta_U, True, is_training) avalue = nets.afunction(act_train, lmat, uvalue, dimA[0]) q_train = nets.qfunction(obs_train, avalue, self.theta_V, False, is_training) # q targets q2 = nets.qfunction(obs2, tf.constant([0.] 
* FLAGS.bsize), self.theta_Vt, True, is_training) q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2)) # q loss td_error = q_train - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) theta = self.theta_L + self.theta_U + self.theta_V wd_q = tf.add_n([l2norm * tf.nn.l2_loss(var) for var in theta]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-4) grads_and_vars_q = optim_q.compute_gradients(loss_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_Vt) summary_writer = tf.train.SummaryWriter(os.path.join(FLAGS.outdir, 'board'), self.sess.graph) summary_list = [] summary_list.append(tf.scalar_summary('Qvalue', tf.reduce_mean(q_train))) summary_list.append(tf.scalar_summary('loss', ms_td_error)) summary_list.append(tf.scalar_summary('reward', tf.reduce_mean(rew))) # tf functions with self.sess.as_default(): self._act_test = Fun([obs_single, is_training], act_test) self._act_expl = Fun([obs_single, is_training], act_expl) self._reset = Fun([], self.ou_reset) self._train = Fun([obs_train, act_train, rew, obs2, term2, is_training], [train_q, loss_q], summary_list, summary_writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) action = self._act_test(obs, False) if test else self._act_expl(obs, False) action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze(action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in range(FLAGS.iter): loss = self.train() def train(self): obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) _, loss = self._train(obs, act, rew, ob2, term2, True, log=FLAGS.summary, global_step=self.t) return loss def __del__(self): self.sess.close()
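# The exploration noise above is a discretized Ornstein-Uhlenbeck process kept in a
# TF variable: noise <- noise - (theta * noise - sigma * N(0, I)), with theta and
# sigma taken from FLAGS.outheta / FLAGS.ousigma. A NumPy sketch of the same update
# (class name OUNoise is mine):
import numpy as np

class OUNoise(object):
    def __init__(self, dim, theta=0.15, sigma=0.2):
        self.theta, self.sigma = theta, sigma
        self.state = np.zeros(dim)

    def reset(self):
        self.state[:] = 0.0

    def sample(self):
        self.state -= self.theta * self.state - self.sigma * np.random.randn(*self.state.shape)
        return self.state

# Usage mirrors act_expl above: act_expl = np.clip(act_test + ou.sample(), -1, 1).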
class Agent: def __init__(self, dimO, dimA): dimA, dimO = dimA[0], dimO[0] self.dimA = dimA self.dimO = dimO tau = FLAGS.tau discount = FLAGS.discount l2norm = FLAGS.l2norm learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma if FLAGS.icnn_opt == 'adam': self.opt = self.adam elif FLAGS.icnn_opt == 'bundle_entropy': self.opt = self.bundle_entropy else: raise RuntimeError("Unrecognized ICNN optimizer: " + FLAGS.icnn_opt) self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions( allow_growth=True))) self.noise = np.zeros(self.dimA) obs = tf.placeholder(tf.float32, [None, dimO], "obs") act = tf.placeholder(tf.float32, [None, dimA], "act") rew = tf.placeholder(tf.float32, [None], "rew") with tf.variable_scope('q'): negQ = self.negQ(obs, act) negQ_entr = negQ - entropy(act) q = -negQ q_entr = -negQ_entr act_grad, = tf.gradients(negQ, act) act_grad_entr, = tf.gradients(negQ_entr, act) obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target") act_target = tf.placeholder(tf.float32, [None, dimA], "act_target") term_target = tf.placeholder(tf.bool, [None], "term_target") with tf.variable_scope('q_target'): negQ_target = self.negQ(obs_target, act_target) negQ_entr_target = negQ_target - entropy(act_target) act_target_grad, = tf.gradients(negQ_target, act_target) act_entr_target_grad, = tf.gradients(negQ_entr_target, act_target) q_target = -negQ_target q_target_entr = -negQ_entr_target if FLAGS.icnn_opt == 'adam': y = tf.select(term_target, rew, rew + discount * q_target_entr) y = tf.maximum(q_entr - 1., y) y = tf.minimum(q_entr + 1., y) y = tf.stop_gradient(y) td_error = q_entr - y elif FLAGS.icnn_opt == 'bundle_entropy': raise RuntimError("Needs checking.") q_target = tf.select(term2, rew, rew + discount * q2_entropy) q_target = tf.maximum(q_entropy - 1., q_target) q_target = tf.minimum(q_entropy + 1., q_target) q_target = tf.stop_gradient(q_target) td_error = q_entropy - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/') loss_q = ms_td_error + l2norm * tf.reduce_sum(regLosses) self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/') self.theta_cvx_ = [ v for v in self.theta_ if 'proj' in v.name and 'W:' in v.name ] self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_] self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_] # self.proj = [v.assign(tf.abs(v)) for v in self.theta_cvx_] self.theta_target_ = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_target/') update_target = [ theta_target_i.assign_sub(tau * (theta_target_i - theta_i)) for theta_i, theta_target_i in zip(self.theta_, self.theta_target_) ] optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate) grads_and_vars_q = optim_q.compute_gradients(loss_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) summary_writer = tf.train.SummaryWriter( os.path.join(FLAGS.outdir, 'board'), self.sess.graph) if FLAGS.icnn_opt == 'adam': tf.scalar_summary('Qvalue', tf.reduce_mean(q)) elif FLAGS.icnn_opt == 'bundle_entropy': tf.scalar_summary('Qvalue', tf.reduce_mean(q_entr)) tf.scalar_summary('loss', ms_td_error) tf.scalar_summary('reward', tf.reduce_mean(rew)) merged = tf.merge_all_summaries() # tf functions with self.sess.as_default(): self._train = Fun( [obs, act, rew, obs_target, act_target, term_target], 
[optimize_q, update_target, loss_q], merged, summary_writer) self._fg = Fun([obs, act], [negQ, act_grad]) self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad]) self._fg_entr = Fun([obs, act], [negQ_entr, act_grad_entr]) self._fg_entr_target = Fun( [obs_target, act_target], [negQ_entr_target, act_entr_target_grad]) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.run(self.makeCvx) self.sess.run([ theta_target_i.assign(theta_i) for theta_i, theta_target_i in zip(self.theta_, self.theta_target_) ]) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def bundle_entropy(self, func, obs): act = np.ones((obs.shape[0], self.dimA)) * 0.5 def fg(x): value, grad = func(obs, 2 * x - 1) grad *= 2 return value, grad act = bundle_entropy.solveBatch(fg, act)[0] act = 2 * act - 1 return act def adam(self, func, obs, plot=False): # if npr.random() < 1./20: # plot = True b1 = 0.9 b2 = 0.999 lam = 0.5 eps = 1e-8 alpha = 0.01 nBatch = obs.shape[0] act = np.zeros((nBatch, self.dimA)) m = np.zeros_like(act) v = np.zeros_like(act) b1t, b2t = 1., 1. act_best, a_diff, f_best = [None] * 3 hist = {'act': [], 'f': [], 'g': []} for i in range(1000): f, g = func(obs, act) if plot: hist['act'].append(act.copy()) hist['f'].append(f) hist['g'].append(g) if i == 0: act_best = act.copy() f_best = f.copy() else: prev_act_best = act_best.copy() I = (f < f_best) act_best[I] = act[I] f_best[I] = f[I] a_diff_i = np.mean( np.linalg.norm(act_best - prev_act_best, axis=1)) a_diff = a_diff_i if a_diff is None \ else lam*a_diff + (1.-lam)*a_diff_i # print(a_diff_i, a_diff, np.sum(f)) if a_diff < 1e-3 and i > 5: print(' + Adam took {} iterations'.format(i)) if plot: self.adam_plot(func, obs, hist) return act_best m = b1 * m + (1. - b1) * g v = b2 * v + (1. - b2) * (g * g) b1t *= b1 b2t *= b2 mhat = m / (1. - b1t) vhat = v / (1. - b2t) act -= alpha * mhat / (np.sqrt(v) + eps) # act = np.clip(act, -1, 1) act = np.clip(act, -1. + 1e-8, 1. - 1e-8) print(' + Warning: Adam did not converge.') if plot: self.adam_plot(func, obs, hist) return act_best def adam_plot(self, func, obs, hist): hist['act'] = np.array(hist['act']).T hist['f'] = np.array(hist['f']).T hist['g'] = np.array(hist['g']).T if self.dimA == 1: xs = np.linspace(-1. + 1e-8, 1. 
- 1e-8, 100) ys = [func(obs[[0], :], [[xi]])[0] for xi in xs] fig = plt.figure() plt.plot(xs, ys) plt.plot(hist['act'][0, 0, :], hist['f'][0, :], label='Adam') plt.legend() fname = os.path.join(FLAGS.outdir, 'adamPlt.png') print("Saving Adam plot to {}".format(fname)) plt.savefig(fname) plt.close(fig) elif self.dimA == 2: assert (False) else: xs = npr.uniform(-1., 1., (5000, self.dimA)) ys = np.array([func(obs[[0], :], [xi])[0] for xi in xs]) epi = np.hstack((xs, ys)) pca = PCA(n_components=2).fit(epi) W = pca.components_[:, :-1] xs_proj = xs.dot(W.T) fig = plt.figure() X = Y = np.linspace(xs_proj.min(), xs_proj.max(), 100) Z = griddata(xs_proj[:, 0], xs_proj[:, 1], ys.ravel(), X, Y, interp='linear') plt.contourf(X, Y, Z, 15) plt.colorbar() adam_x = hist['act'][:, 0, :].T adam_x = adam_x.dot(W.T) plt.plot(adam_x[:, 0], adam_x[:, 1], label='Adam', color='k') plt.legend() fname = os.path.join(FLAGS.outdir, 'adamPlt.png') print("Saving Adam plot to {}".format(fname)) plt.savefig(fname) plt.close(fig) def reset(self, obs): self.noise = np.zeros(self.dimA) self.observation = obs # initial observation def act(self, test=False): with self.sess.as_default(): print('--- Selecting action, test={}'.format(test)) obs = np.expand_dims(self.observation, axis=0) if FLAGS.icnn_opt == 'adam': f = self._fg_entr # f = self._fg elif FLAGS.icnn_opt == 'bundle_entropy': f = self._fg else: raise RuntimeError("Unrecognized ICNN optimizer: " + FLAGS.icnn_opt) tflearn.is_training(False) action = self.opt(f, obs) tflearn.is_training(not test) if not test: self.noise -= FLAGS.outheta*self.noise - \ FLAGS.ousigma*npr.randn(self.dimA) action += self.noise action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze(action, axis=0)) return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in range(FLAGS.iter): loss = self.train() def train(self): with self.sess.as_default(): obs, act, rew, ob2, term2, info = self.rm.minibatch( size=FLAGS.bsize) if FLAGS.icnn_opt == 'adam': # f = self._opt_train_entr f = self._fg_entr_target # f = self._fg_target elif FLAGS.icnn_opt == 'bundle_entropy': f = self._fg_target else: raise RuntimeError("Unrecognized ICNN optimizer: " + FLAGS.icnn_opt) print('--- Optimizing for training') tflearn.is_training(False) act2 = self.opt(f, ob2) tflearn.is_training(True) _, _, loss = self._train(obs, act, rew, ob2, act2, term2, log=FLAGS.summary, global_step=self.t) self.sess.run(self.proj) return loss def negQ(self, x, y, reuse=False): szs = [FLAGS.l1size, FLAGS.l2size] assert (len(szs) >= 1) fc = tflearn.fully_connected bn = tflearn.batch_normalization lrelu = tflearn.activations.leaky_relu if reuse: tf.get_variable_scope().reuse_variables() nLayers = len(szs) us = [] zs = [] z_zs = [] z_ys = [] z_us = [] reg = 'L2' prevU = x for i in range(nLayers): with tf.variable_scope('u' + str(i)) as s: u = fc(prevU, szs[i], reuse=reuse, scope=s, regularizer=reg) if i < nLayers - 1: u = tf.nn.relu(u) if FLAGS.icnn_bn: u = bn(u, reuse=reuse, scope=s, name='bn') variable_summaries(u, suffix='u{}'.format(i)) us.append(u) prevU = u prevU, prevZ = x, y for i in range(nLayers + 1): sz = szs[i] if i < nLayers else 1 z_add = [] if i > 0: with tf.variable_scope('z{}_zu_u'.format(i)) as s: zu_u = fc(prevU, szs[i - 1], reuse=reuse, scope=s, activation='relu', bias=True, regularizer=reg, bias_init=tf.constant_initializer(1.)) 
variable_summaries(zu_u, suffix='zu_u{}'.format(i)) with tf.variable_scope('z{}_zu_proj'.format(i)) as s: z_zu = fc(tf.mul(prevZ, zu_u), sz, reuse=reuse, scope=s, bias=False, regularizer=reg) variable_summaries(z_zu, suffix='z_zu{}'.format(i)) z_zs.append(z_zu) z_add.append(z_zu) with tf.variable_scope('z{}_yu_u'.format(i)) as s: yu_u = fc(prevU, self.dimA, reuse=reuse, scope=s, bias=True, regularizer=reg, bias_init=tf.constant_initializer(1.)) variable_summaries(yu_u, suffix='yu_u{}'.format(i)) with tf.variable_scope('z{}_yu'.format(i)) as s: z_yu = fc(tf.mul(y, yu_u), sz, reuse=reuse, scope=s, bias=False, regularizer=reg) z_ys.append(z_yu) variable_summaries(z_yu, suffix='z_yu{}'.format(i)) z_add.append(z_yu) with tf.variable_scope('z{}_u'.format(i)) as s: z_u = fc(prevU, sz, reuse=reuse, scope=s, bias=True, regularizer=reg, bias_init=tf.constant_initializer(0.)) variable_summaries(z_u, suffix='z_u{}'.format(i)) z_us.append(z_u) z_add.append(z_u) z = tf.add_n(z_add) variable_summaries(z, suffix='z{}_preact'.format(i)) if i < nLayers: # z = tf.nn.relu(z) z = lrelu(z, alpha=FLAGS.lrelu) variable_summaries(z, suffix='z{}_act'.format(i)) zs.append(z) prevU = us[i] if i < nLayers else None prevZ = z z = tf.reshape(z, [-1], name='energies') return z def __del__(self): self.sess.close()
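# Why makeCvx / proj exist: negQ is an input-convex network in the action y, and
# convexity is preserved as long as the weights on the z-path (the 'proj' variables
# collected above) stay nonnegative and the activations are convex and nondecreasing.
# That is why train() ends with self.sess.run(self.proj). A stripped-down sketch of
# the idea, omitting the state-dependent u-path, batch norm, and leaky ReLUs of the
# full negQ (all names below are illustrative):
import numpy as np

def relu(v):
    return np.maximum(v, 0.0)

def icnn_scalar(y, Wy1, b1, Wz, Wy2, b2):
    # Convex in y as long as Wz >= 0 elementwise (the constraint `proj` maintains).
    z1 = relu(y @ Wy1 + b1)
    return z1 @ Wz + y @ Wy2 + b2

# After each optimizer step on the weights, project the z-path weights back to the
# feasible set, Wz = np.maximum(Wz, 0.0), the NumPy analogue of
# self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_].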
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) self.dimA = dimA[0] self.dimO = dimO[0] tau = FLAGS.tau discount = FLAGS.discount l2norm = FLAGS.l2norm learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma nets = icnn_nets_dm # init replay memory self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) # start tf session self.sess = tf.Session(config=tf.ConfigProto( inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.1))) # create tf computational graph self.theta = nets.theta(dimO[0], dimA[0], FLAGS.l1size, FLAGS.l2size, 'theta') self.theta_t, update_t = exponential_moving_averages(self.theta, tau) obs = tf.placeholder(tf.float32, [1] + dimO, "obs") act_test = tf.placeholder(tf.float32, [1] + dimA, "act") # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init, name="noise", trainable=False) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((outheta) * noise_var - tf.random_normal(dimA, stddev=ousigma)) act_expl = act_test + noise # test, single sample q function & gradient for bundle method q_test_opt, cz1, cz2, cz3, _, _, _, _ = nets.qfunction(obs, act_test, self.theta) loss_test = -q_test_opt act_test_grad = tf.gradients(loss_test, act_test)[0] # batched q function & gradient for bundle method obs_train2_opt = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs_train2_opt") act_train2_opt = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train2_opt") q_train2_opt, cz1t, cz2t, cz3t, _, _, _, _ = nets.qfunction(obs_train2_opt, act_train2_opt, self.theta_t) loss_train2 = -q_train2_opt act_train2_grad = tf.gradients(loss_train2, act_train2_opt)[0] # training obs_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs_train") act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs_train2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs_train2") act_train2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") def entropy(x): #the real concave entropy function x_move_reg = tf.clip_by_value((x + 1) / 2, 0.0001, 0.9999) pen = x_move_reg * tf.log(x_move_reg) + (1 - x_move_reg) * tf.log(1 - x_move_reg) return -tf.reduce_sum(pen, 1) q_train, q_train_cz1, q_train_cz2, q_train_cz3, q_train_z1, q_train_z2, q_train_u1, q_train_u2 = nets.qfunction(obs_train, act_train, self.theta) q_train_entropy = q_train + entropy(act_train) q_train2, q_train2_cz1, q_train2_cz2, q_train2_cz3, _, _, _, _ = nets.qfunction(obs_train2, act_train2, self.theta_t) q_train2_entropy = q_train2 + entropy(act_train2) q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q_train2_entropy)) # q loss td_error = q_train_entropy - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) theta = self.theta wd_q = tf.add_n([l2norm * tf.nn.l2_loss(var) for var in theta]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate) grads_and_vars_q = optim_q.compute_gradients(loss_q) grads_and_vars_q_clip = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in grads_and_vars_q] optimize_q = optim_q.apply_gradients(grads_and_vars_q_clip) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_t) summary_writer = tf.train.SummaryWriter(os.path.join(FLAGS.outdir, 'board'), 
self.sess.graph) summary_list = [] summary_list.append(tf.scalar_summary('Qvalue', tf.reduce_mean(q_train_entropy))) summary_list.append(tf.scalar_summary('loss', ms_td_error)) summary_list.append(tf.scalar_summary('reward', tf.reduce_mean(rew))) summary_list.append(tf.scalar_summary('cvx_z1', tf.reduce_mean(q_train_z1))) summary_list.append(tf.scalar_summary('cvx_z2', tf.reduce_mean(q_train_z2))) summary_list.append(tf.scalar_summary('cvx_z1_pos', tf.reduce_mean(tf.to_float(q_train_z1 > 0)))) summary_list.append(tf.scalar_summary('cvx_z2_pos', tf.reduce_mean(tf.to_float(q_train_z2 > 0)))) summary_list.append(tf.scalar_summary('noncvx_u1', tf.reduce_mean(q_train_u1))) summary_list.append(tf.scalar_summary('noncvx_u2', tf.reduce_mean(q_train_u2))) summary_list.append(tf.scalar_summary('noncvx_u1_pos', tf.reduce_mean(tf.to_float(q_train_u1 > 1e-15)))) summary_list.append(tf.scalar_summary('noncvx_u2_pos', tf.reduce_mean(tf.to_float(q_train_u2 > 1e-15)))) # tf functions with self.sess.as_default(): self._cz = Fun([obs], [cz1, cz2, cz3]) self._czt = Fun([obs_train2_opt], [cz1t, cz2t, cz3t]) self._reset = Fun([], self.ou_reset) self._act_expl = Fun(act_test, act_expl) self._train = Fun([obs_train, act_train, rew, obs_train2, act_train2, term2], [train_q, loss_q], summary_list, summary_writer) self._opt_test = Fun([act_test, cz1, cz2, cz3], [loss_test, act_test_grad]) self._opt_train = Fun([act_train2_opt, cz1t, cz2t, cz3t], [loss_train2, act_train2_grad]) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def get_cvx_opt(self, func, cz1, cz2, cz3): act = np.ones((cz1.shape[0], self.dimA)) * 0.5 def fg(x): value, grad = func(2 * x - 1, cz1, cz2, cz3) grad *= 2 return value, grad act = bundle_entropy.solveBatch(fg, act)[0] act = 2 * act - 1 return act def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) cz1, cz2, cz3 = self._cz(obs) act = self.get_cvx_opt(self._opt_test, cz1, cz2, cz3) action = act if test else self._act_expl(act) action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze(action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in xrange(FLAGS.iter): loss = self.train() def train(self): obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) cz1t, cz2t, cz3t = self._czt(ob2) act2 = self.get_cvx_opt(self._opt_train, cz1t, cz2t, cz3t) _, loss = self._train(obs, act, rew, ob2, act2, term2, log=FLAGS.summary, global_step=self.t) return loss def __del__(self): self.sess.close()
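# Note on get_cvx_opt above: the bundle-entropy solver works over actions in [0, 1],
# while the Q-network takes actions in [-1, 1]. The fg wrapper applies the affine map
# a = 2x - 1 and rescales the gradient by the same factor (chain rule), and the
# state-dependent activations cz1..cz3 are computed once per observation and reused
# on every inner iteration. A sketch of the same wrapping around a generic objective
# func(act, *cached) returning (value, grad_wrt_act):
def make_unit_interval_objective(func, *cached):
    def fg(x):                     # x in [0, 1]^dimA
        value, grad = func(2.0 * x - 1.0, *cached)
        return value, 2.0 * grad   # d/dx f(2x - 1) = 2 f'(2x - 1)
    return fg

# get_cvx_opt then solves min_x fg(x) over the box [0, 1]^dimA and maps the solution
# back with act = 2 * x - 1.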
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) nets = nets_dm # init replay memory self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype]) # own replay memory self.replay_memory = deque(maxlen=rm_size) # start tf session self.sess = tf.Session(config=tf.ConfigProto( inter_op_parallelism_threads=threads, log_device_placement=False, allow_soft_placement=True)) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA) self.theta_q = nets.theta_q(dimO, dimA) self.theta_pt, update_pt = exponential_moving_averages(self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages(self.theta_q, tau) obs = tf.placeholder(tf.float32, [None] + dimO, "obs") act_test, sum_p = nets.policy(obs, self.theta_p) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((ou_theta) * noise_var - tf.random_normal(dimA, stddev=ou_sigma)) act_expl = act_test + noise # test q, sum_q = nets.qfunction(obs, act_test, self.theta_q, name= 'q_mu_of_s') # training # policy loss meanq = tf.reduce_mean(q, 0) wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p # policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=lrp) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q optimization act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") # q q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q, name= 'qs_a') # q targets act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt) q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt, name='qsprime_aprime') q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2)) # q_target = tf.stop_gradient(rew + discount * q2) # q loss td_error = q_train - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=lrq) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) # logging log_obs = [] if dimO[0] > 20 else [tf.histogram_summary("obs/" + str(i), obs[:, i]) for i in range(dimO[0])] log_act = [] if dimA[0] > 20 else [tf.histogram_summary("act/inf" + str(i), act_test[:, i]) for i in range(dimA[0])] log_act2 = [] if dimA[0] > 20 else [tf.histogram_summary("act/train" + str(i), act_train[:, i]) for i in range(dimA[0])] log_misc = [sum_p, sum_qq, tf.histogram_summary("td_error", td_error)] log_grad = [grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q)] log_noise = [tf.histogram_summary('noise', noise_var)] log_train = log_obs + log_act + log_act2 + log_misc + log_grad + log_noise merged = tf.merge_all_summaries() # initialize tf log writer self.writer = tf.train.SummaryWriter(FLAGS.outdir + "/tf", self.sess.graph, flush_secs=20) # init replay memory for recording episodes max_ep_length = 10000 self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, 
rm_dtype) # tf functions with self.sess.as_default(): self.act_test = Fun(obs, act_test) self._act_expl = Fun(obs, act_expl) self._reset = Fun([], self.ou_reset) self._train_q = Fun([obs, act_train, rew, obs2, term2], [train_q], log_train, self.writer) self._train_p = Fun([obs], [train_p]) self._train_p = Fun([obs], [train_p], log_obs, self.writer) self._train = Fun([obs, act_train, rew, obs2, term2], [train_p, train_q], merged, self.writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) action = self.act_test(obs) if test else self._act_expl(obs) self.action = np.atleast_1d(np.squeeze(action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False, perform_trainstep= True): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) self.replay_memory.append((obs1, self.action, rew, obs2, term)) if self.t > FLAGS.warmup: # print('warmed up') if perform_trainstep: self.train() # elif FLAGS.warmq and self.rm.n > 1000: # # Train Q on warmup # obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) # self._train_q(obs, act, rew, ob2, term2, log=(np.random.rand() < FLAGS.log), global_step=self.t) # save parameters etc. # if (self.t+45000) % 50000 == 0: # TODO: correct # s = self.saver.save(self.sess,FLAGS.outdir+"f/tf/c",self.t) # print("DDPG Checkpoint: " + s) def train(self): # obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) obs, act, rew, ob2, term2, = self.get_train_batch() log = (np.random.rand() < FLAGS.log) if FLAGS.async: self._train(obs, act, rew, ob2, term2, log=log, global_step=self.t) else: self._train_q(obs, act, rew, ob2, term2, log=log, global_step=self.t) self._train_p(obs, log=log, global_step=self.t) def write_scalar(self, tag, val): s = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)]) self.writer.add_summary(s, self.t) def __del__(self): self.sess.close() def get_train_batch(self): #selecting transitions randomly from the replay memory: indices = np.random.randint(0, len(self.replay_memory), [FLAGS.bsize]) transition_batch = [self.replay_memory[i] for i in indices] states = np.asarray([transition_batch[i][0].squeeze() for i in range(FLAGS.bsize)]) actions = np.asarray([transition_batch[i][1] for i in range(FLAGS.bsize)]) rewards = np.asarray([transition_batch[i][2] for i in range(FLAGS.bsize)]) states_prime = np.asarray([transition_batch[i][3].squeeze() for i in range(FLAGS.bsize)]) term2 = np.asarray([transition_batch[i][4] for i in range(FLAGS.bsize)]) return states, actions, rewards, states_prime, term2
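# The target parameters theta_pt / theta_qt above are soft copies of the online
# parameters refreshed at rate tau alongside each training step. A NumPy sketch of
# that soft (Polyak) update, written in the explicit form the ICNN agent earlier in
# this section uses (theta_target.assign_sub(tau * (theta_target - theta))):
def soft_update(target_params, online_params, tau=1e-3):
    for t, p in zip(target_params, online_params):
        t += tau * (p - t)   # theta_target <- (1 - tau) * theta_target + tau * theta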
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) nets = nets_dm # init replay memory self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype]) # start tf session self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=threads, log_device_placement=False, allow_soft_placement=True)) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA) self.theta_q = nets.theta_q(dimO, dimA) self.theta_pt, update_pt = exponential_moving_averages( self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages( self.theta_q, tau) obs = tf.placeholder(tf.float32, [None] + dimO, "obs") act_test, sum_p = nets.policy(obs, self.theta_p) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub( (FLAGS.ou_theta) * noise_var - tf.random_normal(dimA, stddev=FLAGS.ou_sigma)) act_expl = act_test + noise # test q, sum_q = nets.qfunction(obs, act_test, self.theta_q) # training # policy loss meanq = tf.reduce_mean(q, 0) wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p # policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=lrp) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q optimization act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") term = tf.placeholder(tf.bool, [FLAGS.bsize], "term") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") # q q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q) # q targets act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt) q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt) q_target = tf.stop_gradient(tf.select(term, rew, rew + discount * q2)) # q_target = tf.stop_gradient(rew + discount * q2) # q loss td_error = q_train - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=lrq) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) # logging log_obs = [] if dimO[0] > 20 else [ tf.histogram_summary("obs/" + str(i), obs[:, i]) for i in range(dimO[0]) ] log_act = [] if dimA[0] > 20 else [ tf.histogram_summary("act/inf" + str(i), act_test[:, i]) for i in range(dimA[0]) ] log_act2 = [] if dimA[0] > 20 else [ tf.histogram_summary("act/train" + str(i), act_train[:, i]) for i in range(dimA[0]) ] log_misc = [sum_p, sum_qq, tf.histogram_summary("td_error", td_error)] log_grad = [ grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q) ] log_train = log_obs + log_act + log_act2 + log_misc + log_grad # initialize tf log writer self.writer = tf.train.SummaryWriter(FLAGS.outdir + "/tf", self.sess.graph, flush_secs=20) # init replay memory for recording episodes max_ep_length = 10000 self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, rm_dtype) # tf functions with self.sess.as_default(): self._act_test = Fun(obs, act_test) self._act_expl = Fun(obs, act_expl) self._reset = Fun([], self.ou_reset) self._train_q = Fun([obs, act_train, 
rew, term, obs2], [train_q], log_train, self.writer) self._train_p = Fun([obs], [train_p], log_train, self.writer) self._train = Fun([obs, act_train, rew, term, obs2], [train_p, train_q], log_train, self.writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) action = self._act_test(obs) if test else self._act_expl(obs) self.action = np.atleast_1d(np.squeeze( action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: self.train() elif FLAGS.warmq and self.rm.n > 1000: # Train Q on warmup obs, act, rew, term, ob2, info = self.rm.minibatch( size=FLAGS.bsize) self._train_q(obs, act, rew, term, ob2, log=(np.random.rand() < FLAGS.log), global_step=self.t) # save parameters etc. # if (self.t+45000) % 50000 == 0: # TODO: correct # s = self.saver.save(self.sess,FLAGS.outdir+"f/tf/c",self.t) # print("DDPG Checkpoint: " + s) def train(self): obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) log = (np.random.rand() < FLAGS.log) if FLAGS. async: self._train(obs, act, rew, ob2, term2, log=log, global_step=self.t) else: self._train_q(obs, act, rew, ob2, term2, log=log, global_step=self.t) self._train_p(obs, log=log, global_step=self.t) def write_scalar(self, tag, val): s = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)]) self.writer.add_summary(s, self.t) def __del__(self): self.sess.close()
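# The TD target built with tf.select above is the usual terminal-aware bootstrap:
# y = rew if the next state is terminal, and y = rew + discount * Q'(s2) otherwise,
# i.e. y = rew + discount * (1 - terminal) * Q'(s2). NumPy equivalent (helper name
# td_target is mine):
import numpy as np

def td_target(rew, q2, term, discount=0.99):
    return np.where(term, rew, rew + discount * q2)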
class Agent(object): def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) if len(dimO) > 1: assert len(dimO) == 3 self.use_conv = True nets = ddpg_convnets_dm else: self.use_conv = False nets = ddpg_nets_dm # init replay memory self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) # start tf session self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions( per_process_gpu_memory_fraction=0.1))) # Placeholders input_obs_dim = [None] + dimO obs = tf.placeholder(tf.float32, input_obs_dim, "obs") is_training = tf.placeholder(tf.bool, [], name='is_training') act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") summary_writer = tf.train.SummaryWriter( os.path.join(FLAGS.outdir, 'board'), self.sess.graph) summary_list = [] summary_list.append(tf.scalar_summary('reward', tf.reduce_mean(rew))) self.setup_actor_critic(nets, dimO, dimA, obs, obs2, is_training, rew, term2, act_train) summary_list.extend(self.actor.get_summary()) summary_list.extend(self.critic.get_summary()) # summary_list.append(tf.scalar_summary('Qvalue', tf.reduce_mean(q_train))) # summary_list.append(tf.scalar_summary('loss', ms_td_error)) # tf functions with self.sess.as_default(): train_outputs = self.actor.get_train_outputs( ) + self.critic.get_train_outputs() self._train = Fun([obs, act_train, rew, obs2, term2, is_training], train_outputs, summary_list, summary_writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def setup_actor_critic(self, nets, dimO, dimA, obs, obs2, is_training, rew, term2, act_train): self.actor = Actor(self.use_conv, nets, dimO, dimA, obs, obs2, is_training, self.sess, scope='actor') self.critic = Critic(self.use_conv, nets, dimO, dimA, obs, obs2, rew, term2, is_training, act_train, self.actor, scope='critic') self.actor.compute_loss(self.critic, obs, is_training) def reset(self, obs): self.actor.reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) action = self.actor.act(obs, test) action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze( action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in xrange(FLAGS.iter): loss = self.train() def train(self): obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) _, _, loss = self._train(obs, act, rew, ob2, term2, True, log=FLAGS.summary, global_step=self.t) return loss def __del__(self): self.sess.close()
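# The Actor and Critic classes used by this Agent are defined elsewhere; the skeleton
# below only records the interface the Agent relies on (constructor arguments and the
# methods it calls). Bodies are intentionally left empty -- this is an inferred
# outline, not the actual implementation.
class Actor(object):
    def __init__(self, use_conv, nets, dimO, dimA, obs, obs2, is_training, sess,
                 scope='actor'):
        ...
    def compute_loss(self, critic, obs, is_training): ...
    def get_summary(self): ...        # list of summary ops
    def get_train_outputs(self): ...  # ops handed to Fun(...) for training
    def reset(self): ...              # reset exploration state
    def act(self, obs, test): ...     # action for a single observation

class Critic(object):
    def __init__(self, use_conv, nets, dimO, dimA, obs, obs2, rew, term2,
                 is_training, act_train, actor, scope='critic'):
        ...
    def get_summary(self): ...
    def get_train_outputs(self): ...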
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) nets = ddpg_nets_dm tau = FLAGS.tau discount = FLAGS.discount pl2norm = FLAGS.pl2norm l2norm = FLAGS.l2norm plearning_rate = FLAGS.prate learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma # init replay memory self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) # start tf session self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions( per_process_gpu_memory_fraction=0.1))) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA, FLAGS.l1size, FLAGS.l2size) self.theta_q = nets.theta_q(dimO, dimA, FLAGS.l1size, FLAGS.l2size) self.theta_pt, update_pt = exponential_moving_averages( self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages( self.theta_q, tau) obs = tf.placeholder(tf.float32, [None] + dimO, "obs") act_test = nets.policy(obs, self.theta_p) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((outheta) * noise_var - tf.random_normal(dimA, stddev=ousigma)) act_expl = act_test + noise # test q = nets.qfunction(obs, act_test, self.theta_q) # training # q optimization act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") # policy loss act_train_policy = nets.policy(obs, self.theta_p) q_train_policy = nets.qfunction(obs, act_train_policy, self.theta_q) meanq = tf.reduce_mean(q_train_policy, 0) wd_p = tf.add_n([pl2norm * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p # policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=plearning_rate, epsilon=1e-4) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q q_train = nets.qfunction(obs, act_train, self.theta_q) # q targets act2 = nets.policy(obs2, theta=self.theta_pt) q2 = nets.qfunction(obs2, act2, theta=self.theta_qt) q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2)) # q_target = tf.stop_gradient(rew + discount * q2) # q loss td_error = q_train - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) wd_q = tf.add_n([l2norm * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-4) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) summary_writer = tf.train.SummaryWriter( os.path.join(FLAGS.outdir, 'board'), self.sess.graph) summary_list = [] summary_list.append( tf.scalar_summary('Qvalue', tf.reduce_mean(q_train))) summary_list.append(tf.scalar_summary('loss', ms_td_error)) summary_list.append(tf.scalar_summary('reward', tf.reduce_mean(rew))) # tf functions with self.sess.as_default(): self._act_test = Fun(obs, act_test) self._act_expl = Fun(obs, act_expl) self._reset = Fun([], self.ou_reset) self._train = Fun([obs, act_train, rew, obs2, term2], [train_p, train_q, loss_q], summary_list, 
summary_writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) action = self._act_test(obs) if test else self._act_expl(obs) action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze( action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in xrange(FLAGS.iter): loss = self.train() def train(self): obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) _, _, loss = self._train(obs, act, rew, ob2, term2, log=FLAGS.summary, global_step=self.t) return loss def __del__(self): self.sess.close()
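# The two objectives above are the standard DDPG losses plus L2 weight decay:
#   policy:  loss_p = -mean_b Q(s, pi(s))                (maximize the critic's value)
#   critic:  loss_q =  mean_b (Q(s, a) - y)^2,  y = rew + discount * Q'(s2, pi'(s2)),
# with y set to rew alone on terminal transitions. NumPy sketch of the scalar losses
# given already-evaluated network outputs for one minibatch (function name is mine):
import numpy as np

def ddpg_losses(q_of_pi, q_of_a, rew, q2_target, term, discount=0.99):
    loss_p = -np.mean(q_of_pi)
    y = np.where(term, rew, rew + discount * q2_target)
    loss_q = np.mean((q_of_a - y) ** 2)
    return loss_p, loss_q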
class Agent: def __init__( self, dimO, dimA, nets=nets_dm, tau=.001, # fdsla discount=.99, pl2=.0, ql2=.01, lrp=.0001, lrq=.001, ou_theta=0.15, ou_sigma=0.2, rm_size=500000, rm_dtype='float32', mb_size=32, threads=4, **kwargs): dimA = list(dimA) dimO = list(dimO) # init replay memory self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype]) self.mb_size = mb_size # start tf session self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=threads, log_device_placement=False, allow_soft_placement=True)) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA) self.theta_q = nets.theta_q(dimO, dimA) self.theta_pt, update_pt = exponential_moving_averages( self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages( self.theta_q, tau) obs = tf.placeholder(tf.float32, [None] + dimO, "obs") act_test, sum_p = nets.policy(obs, self.theta_p) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((ou_theta) * noise_var - tf.random_normal(dimA, stddev=ou_sigma)) act_expl = act_test + noise # test q, sum_q = nets.qfunction(obs, act_test, self.theta_q) # training # policy loss meanq = tf.reduce_mean(q, 0) wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p # policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=lrp) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q optimization act_train = tf.placeholder(tf.float32, [None] + dimA, "act_train") rew = tf.placeholder(tf.float32, [None], "rew") obs2 = tf.placeholder(tf.float32, [None] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [None], "term2") # q q, sum_qq = nets.qfunction(obs, act_train, self.theta_q) # q targets act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt) q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt) q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2)) # = tf.stop_gradient(rew + discount * q2) # q loss mb_td_error = tf.square(q - q_target) mean_td_error = tf.reduce_mean(mb_td_error, 0) wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = mean_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=lrq) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) # logging log_obs = [] if dimO[0] > 20 else [ tf.histogram_summary("obs/" + str(i), obs[:, i]) for i in range(dimO[0]) ] log_act = [] if dimA[0] > 20 else [ tf.histogram_summary("act/inf" + str(i), act_test[:, i]) for i in range(dimA[0]) ] log_act2 = [] if dimA[0] > 20 else [ tf.histogram_summary("act/train" + str(i), act_train[:, i]) for i in range(dimA[0]) ] log_misc = [ sum_p, sum_qq, tf.histogram_summary("qfunction/td_error", mb_td_error) ] log_grad = [ grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q) ] log_train = log_obs + log_act + log_act2 + log_misc + log_grad # initialize tf log writer self.writer = tf.train.SummaryWriter("./tf", self.sess.graph, flush_secs=20) # init replay memory for recording episodes max_ep_length = 10000 self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, rm_dtype) # tf functions with self.sess.as_default(): 
self._act_test = Fun(obs, act_test) self._act_expl = Fun(obs, act_expl) self._reset = Fun([], self.ou_reset) self._train = Fun([obs, act_train, rew, obs2, term2], [train_p, train_q], log_train, self.writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint("./tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = np.squeeze(obs) # initial observation def act(self, test=False, logging=False): obs = np.expand_dims(self.observation, axis=0) action = self._act_test(obs) if test else self._act_expl(obs) self.action = np.atleast_1d(np.squeeze( action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): rew = self.reward(rew) # internal reward # TODO: outsource if not test: self.t = self.t + 1 self.rm.enqueue(self.observation, term, self.action, rew) # save parameters etc. if (self.t + 45000) % 50000 == 0: # TODO: correct s = self.saver.save(self.sess, "./tf/c", self.t) print("DDPG Checkpoint: " + s) self.observation = np.squeeze(obs2) # current observation <- obs2 return rew def train(self, logging=False): obs, act, rew, obs2, term2, info = self.rm.minibatch(size=self.mb_size) self._train(obs, act, rew, obs2, term2, log=logging, global_step=self.t) def reward(self, external_reward, logging=False): """ calculate internal reward """ ra = -.1 * np.mean(np.square(self.action)) rint = external_reward + ra if logging: self.write_scalar('reward/ext', external_reward) self.write_scalar('reward/a', ra) self.write_scalar('reward/rint', rint) return rint def write_scalar(self, tag, val): s = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)]) self.writer.add_summary(s, self.t) def __del__(self): self.sess.close()
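# reward() above shapes the environment reward with a quadratic action penalty,
# r_int = r_ext - 0.1 * mean(a^2), discouraging large control signals. Worked example
# for a 2-dimensional action:
import numpy as np

a = np.array([0.8, -0.4])
r_ext = 1.0
r_int = r_ext - 0.1 * np.mean(np.square(a))   # 1.0 - 0.1 * 0.4 = 0.96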
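The Agent class above calls two helpers, exponential_moving_averages and Fun, that are not defined in this file (the Wolpertinger variant further below uses them as well). The following is a minimal sketch of plausible implementations, written against the same pre-1.0 TensorFlow API used above; the names and details are assumptions, not the original code.

import tensorflow as tf

def exponential_moving_averages(theta, tau=0.001):
    # Target-network tracking: shadow copies of theta that move toward theta at rate tau,
    # as used for self.theta_pt / self.theta_qt above.
    ema = tf.train.ExponentialMovingAverage(decay=1.0 - tau)
    update = ema.apply(theta)                    # op that updates the shadow variables
    averages = [ema.average(x) for x in theta]   # the slowly tracking target parameters
    return averages, update

class Fun:
    # Binds placeholders and ops to a plain Python callable; optionally runs summary ops
    # and writes them to a SummaryWriter at the given global step.
    def __init__(self, inputs, outputs, summary_ops=None, summary_writer=None):
        self._inputs = inputs if isinstance(inputs, list) else [inputs]
        self._outputs = outputs if isinstance(outputs, list) else [outputs]
        self._summaries = summary_ops if summary_ops is not None else []
        self._writer = summary_writer
        self._session = tf.get_default_session()

    def __call__(self, *args, **kwargs):
        log = kwargs.get('log', False)
        global_step = kwargs.get('global_step', None)
        feeds = dict(zip(self._inputs, args))
        fetches = list(self._outputs)
        if log:
            fetches = fetches + self._summaries
        results = self._session.run(fetches, feed_dict=feeds)
        if log and self._writer is not None:
            for summary in results[len(self._outputs):]:
                self._writer.add_summary(summary, global_step=global_step)
        return results[0] if len(self._outputs) == 1 else results[:len(self._outputs)]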
class Agent: """ DDPG Agent """ started_train = False def __init__(self, dimO, dimA, custom_policy=False, env_dtype=tf.float32): dimA = list(dimA) dimO = list(dimO) nets = nets_dm self.custom_policy = custom_policy # init replay memory self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[env_dtype]) # start tf session self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=threads, log_device_placement=False, allow_soft_placement=True)) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA) self.theta_q = nets.theta_q(dimO, dimA) self.theta_pt, update_pt = exponential_moving_averages( self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages( self.theta_q, tau) obs = tf.placeholder(env_dtype, [None] + dimO, "obs") is_training = tf.placeholder(tf.bool, name="is_training") # act_test, sum_p = nets.policy(obs, self.theta_p) act_test, sum_p = nets.policy( obs, self.theta_p) if not FLAGS.batch_norm else nets.policy_norm( obs, self.theta_p, is_training) # explore noise_init = tf.zeros([1] + dimA, dtype=env_dtype) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub( (ou_theta) * noise_var - tf.random_normal(dimA, stddev=ou_sigma, dtype=env_dtype)) act_expl = act_test + noise # for Wolpertinger full policy act_cont = tf.placeholder(env_dtype, [None] + dimA, "action_cont_space") # g_actions = tf.placeholder(env_dtype, [FLAGS.knn] + dimA, "knn_actions") g_actions = tf.placeholder(env_dtype, [None] + dimA, "knn_actions") # rew_g = tf.placeholder(env_dtype, [FLAGS.knn] + dimA, "rew") # rew_g = tf.placeholder(env_dtype, [FLAGS.knn], "rew_g") # term_g = tf.placeholder(tf.bool, [FLAGS.knn], "term_g") rew_g = tf.placeholder(env_dtype, [1], "rew_g") term_g = tf.placeholder(tf.bool, [1], "term_g") # g_dot_f = tf.mul(g_actions, act_cont, "g_dot_f") g_dot_f = g_actions q_eval, _ = nets.qfunction( obs, g_dot_f, self.theta_q) if not FLAGS.batch_norm else nets.qfunction_norm( obs, g_dot_f, self.theta_q, is_training, reuse=True) # wolpertinger_policy = tf.stop_gradient( tf.argmax( tf.select(term_g, rew_g, rew_g + discount * q_eval), # dimension=0, name="q_max") ) wolpertinger_policy = tf.stop_gradient( tf.select(term_g, rew_g, rew_g + discount * q_eval)) # test # q, sum_q = nets.qfunction(obs, act_test, self.theta_q) q, sum_q = nets.qfunction( obs, act_test, self.theta_q) if not FLAGS.batch_norm else nets.qfunction_norm( obs, act_test, self.theta_q, is_training) # training # policy loss meanq = tf.reduce_mean(q, 0) wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p #??? 
# policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=lrp) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q optimization act_train = tf.placeholder(env_dtype, [FLAGS.bsize] + dimA, "act_train") g_act_train = tf.placeholder(env_dtype, [FLAGS.bsize] + dimA, "g_act_train") rew = tf.placeholder(env_dtype, [FLAGS.bsize], "rew") obs2 = tf.placeholder(env_dtype, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") # FOR WOLPERTINGER FUNCTIONALITY: evaluate whether the agent is using pure DDPG or DDPG + Wolpertinger tensor_cond = tf.constant(self.custom_policy, dtype=tf.bool, name="is_custom_p") # full_act_policy = tf.cond(tensor_cond, # # lambda: tf.mul(g_act_train, act_train, name="full_act_policy"), # lambda: g_act_train, # lambda: act_train, # ) # q # q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q) # TAKING THE POLICY GRADIENT AT THE ACTUAL OUTPUT OF f q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q) if not FLAGS.batch_norm else \ nets.qfunction_norm(obs, act_train, self.theta_q, is_training, reuse=True) # q_train, sum_qq = nets.qfunction(obs, full_act_policy, self.theta_q) if not FLAGS.batch_norm else \ # nets.qfunction_norm(obs, full_act_policy, self.theta_q, is_training, reuse=True) # q targets # act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt) act2, sum_p2 = nets.policy( obs2, theta=self.theta_pt) if not FLAGS.batch_norm else nets.policy_norm( obs2, theta=self.theta_pt, is_training=is_training, reuse=True) # WOLPERTINGER FUNCTIONALITY: The target action in the Q-update is generated by the full policy and not simply f full_act_policy2 = tf.cond( tensor_cond, # lambda: tf.mul(g_act_train, act2, name="full_act_policy"), lambda: g_act_train, lambda: act2, ) # q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt) # q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt) if not FLAGS.batch_norm else nets.qfunction_norm(obs2, act2, theta=self.theta_qt, is_training=is_training, reuse=True) q2, sum_q2 = nets.qfunction( obs2, full_act_policy2, theta=self.theta_qt ) if not FLAGS.batch_norm else nets.qfunction_norm( obs2, full_act_policy2, theta=self.theta_qt, is_training=is_training, reuse=True) q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2)) # q_target = tf.stop_gradient(rew + discount * q2) # q loss td_error = q_train - q_target # the sign convention is immaterial here: only the square enters the loss below ms_td_error = tf.reduce_mean(tf.square(td_error), 0) wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=lrq) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) # logging log_obs = [] if dimO[0] > 20 else [ tf.histogram_summary("obs/" + str(i), obs[:, i]) for i in range(dimO[0]) ] log_act = [] if dimA[0] > 20 else [ tf.histogram_summary("act/inf" + str(i), act_test[:, i]) for i in range(dimA[0]) ] log_act2 = [] if dimA[0] > 20 else [ tf.histogram_summary("act/train" + str(i), act_train[:, i]) for i in range(dimA[0]) ] log_misc = [sum_p, sum_qq, tf.histogram_summary("td_error", td_error)] log_grad = [ grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q) ] log_train =
log_obs + log_act + log_act2 + log_misc + log_grad # initialize tf log writer self.writer = tf.train.SummaryWriter(FLAGS.outdir + "/tf", self.sess.graph, flush_secs=20) # init replay memory for recording episodes max_ep_length = 10000 self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, env_dtype) # tf functions with self.sess.as_default(): # self._act_test = Fun(obs,act_test) # self._act_expl = Fun(obs,act_expl) # self._reset = Fun([],self.ou_reset) # self._train_q = Fun([obs,act_train,rew,obs2,term2],[train_q],log_train,self.writer) # self._train_p = Fun([obs],[train_p],log_train,self.writer) # self._train = Fun([obs,act_train,rew,obs2,term2],[train_p,train_q],log_train,self.writer) self._act_test = Fun([obs, is_training], act_test) self._act_expl = Fun([obs, is_training], act_expl) self._reset = Fun([], self.ou_reset) self._train_q = Fun( [obs, act_train, g_act_train, rew, obs2, term2, is_training], [train_q], log_train, self.writer) self._train_p = Fun([obs, is_training], [train_p], log_train, self.writer) self._train = Fun( [obs, act_train, g_act_train, rew, obs2, term2, is_training], [train_p, train_q], log_train, self.writer) self._wolpertinger_p = Fun( [obs, act_cont, g_actions, rew_g, term_g, is_training], [wolpertinger_policy]) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: print "==> RESTORING VARIABLES FROM CHECKPOINT: {}".format(ckpt) self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) # action = self._act_test(obs) if test else self._act_expl(obs) action = self._act_test(obs, False) if test else self._act_expl( obs, True) self.action = np.atleast_1d(np.squeeze( action, axis=0)) # TODO: remove this hack return self.action def wolpertinger_policy(self, action_cont, g_actions, rew_g, term_g): obs = np.expand_dims(self.observation, axis=0) action_cont = np.expand_dims(action_cont, axis=0) # rew_g = np.expand_dims(rew_g, axis=0) # return np.asarray( self._wolpertinger_p(obs, action_cont, g_actions, rew_g, term_g) ) i = 0 q_values = [] for g_action in g_actions: g_action = np.expand_dims(g_action, axis=0) q_values.append( self._wolpertinger_p(obs, action_cont, g_action, [rew_g[i]], [term_g[i]])[0]) i += 1 # return self._wolpertinger_p(obs, action_cont, g_actions, rew_g, term_g)[0] return np.argmax(q_values) def observe(self, rew, term, obs2, test=False, g_action=None): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, g_action, rew) if self.t > FLAGS.warmup: self.train() elif FLAGS.warmq and self.rm.n > 1000: # Train Q on warmup obs, act, g_act, rew, ob2, term2, info = self.rm.minibatch( size=FLAGS.bsize) # self._train_q(obs,act,rew,ob2,term2, log = (np.random.rand() < FLAGS.log), global_step=self.t) for i in xrange(FLAGS.iter): self._train_q(obs, act, g_act, rew, ob2, term2, True, log=(np.random.rand() < FLAGS.log), global_step=self.t) # save parameters etc. 
# if (self.t+45000) % 50000 == 0: # TODO: correct # s = self.saver.save(self.sess,FLAGS.outdir+"/tf/c",self.t) # print("DDPG Checkpoint: " + s) def checkpoint_session(self): return self.saver.save(self.sess, FLAGS.outdir + "/tf/c", self.t) def train(self): if not self.started_train: with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f: f.write('===> Warm up complete\n') self.started_train = True obs, act, g_act, rew, ob2, term2, info = self.rm.minibatch( size=FLAGS.bsize) log = (np.random.rand() < FLAGS.log) if FLAGS.async: # self._train(obs,act,rew,ob2,term2, log = log, global_step=self.t) for i in xrange(FLAGS.iter): self._train(obs, act, g_act, rew, ob2, term2, True, log=log, global_step=self.t) else: # self._train_q(obs,act,rew,ob2,term2, log = log, global_step=self.t) # self._train_p(obs, log = log, global_step=self.t) for i in xrange(FLAGS.iter): self._train_q(obs, act, g_act, rew, ob2, term2, True, log=log, global_step=self.t) self._train_p(obs, True, log=log, global_step=self.t) def write_scalar(self, tag, val): s = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)]) self.writer.add_summary(s, self.t) def __del__(self): self.sess.close()
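Both Agent variants above build their gradient logging through a grad_histograms helper that is not defined in this file. Below is a minimal sketch of a compatible implementation using the same pre-1.0 TensorFlow summary API as the rest of the code; the helper body and tag names are assumptions, not the original code.

import tensorflow as tf

def grad_histograms(grads_and_vars):
    # One histogram per variable and per gradient, merged into a single summary,
    # matching how grad_histograms(...) is consumed in log_grad above.
    summaries = []
    for grad, var in grads_and_vars:
        name = var.op.name
        summaries.append(tf.histogram_summary(name + '/value', var))
        if grad is not None:
            summaries.append(tf.histogram_summary(name + '/gradient', grad))
    return tf.merge_summary(summaries)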