def __init__(self, ob_dim, ac_dim):  # pylint: disable=W0613
    # X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2])  # batch of observations
    X = tf.placeholder(tf.float32, shape=[None, ob_dim * 2 + 2])  # batch of observations
    vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
    wd_dict = {}
    h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0,
                         weight_loss_dict=wd_dict))
    h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0,
                         weight_loss_dict=wd_dict))
    vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0,
                    weight_loss_dict=wd_dict)[:, 0]
    sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
    wd_loss = tf.get_collection("vf_losses", None)
    loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
    loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
    self._predict = U.function([X], vpred_n)
    optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001 * (1 - 0.9), momentum=0.9,
                               clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                               async=1, kfac_update=2, cold_iter=50,
                               weight_decay_dict=wd_dict, max_grad_norm=None)
    vf_var_list = []
    for var in tf.trainable_variables():
        if "vf" in var.name:
            vf_var_list.append(var)
    update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
    self.do_update = U.function([X, vtarg_n], update_op)  # pylint: disable=E1101
    U.initialize()  # Initialize uninitialized TF variables
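# A minimal, self-contained restatement of the "sampled Fisher" trick used by loss_sampled
# above (illustrative sketch, assuming TF1 graph mode; the helper name is not part of the
# original code).  For a unit-variance Gaussian output, the squared error against a noisy,
# gradient-stopped copy of the prediction has the same gradient statistics as the negative
# log-likelihood of a sample drawn from the model's own output distribution, which is what
# K-FAC needs to estimate the Fisher/Gauss-Newton curvature.
import tensorflow as tf

def sampled_fisher_loss(pred):
    """Squared error against pred + N(0, 1) noise, with the target treated as a constant."""
    sample = pred + tf.random_normal(tf.shape(pred))  # draw from the model's output distribution
    return tf.reduce_mean(tf.square(pred - tf.stop_gradient(sample)))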
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nsteps=20,
             ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
             kfac_clip=0.001, lrschedule='linear', is_async=True):
    super(Model, self).__init__(name='ACKTRModel')

    nbatch = nenvs * nsteps

    # TODO: PolicyWithValue does this right? Original implementation uses 'nbatch'
    # self.model = step_model = policy(nenvs, 1)
    # self.model2 = train_model = policy(nbatch, nsteps)
    train_model = PolicyWithValue(ac_space, policy, value_network=None, estimate_q=False)

    self.ent_coef = ent_coef
    self.vf_coef = vf_coef
    self.vf_fisher_coef = vf_fisher_coef
    self.kfac_clip = kfac_clip
    self.is_async = is_async
    self.max_grad_norm = max_grad_norm
    self.total_timesteps = total_timesteps

    # TODO: Learning rate schedule and definition of optimizer
    # self.lrschedule = lrschedule
    lrschedule = LinearTimeDecay(initial_learning_rate=lr)  # TODO
    self.optim = kfac.KfacOptimizer(learning_rate=lrschedule, clip_kl=self.kfac_clip,
                                    momentum=0.9, kfac_update=1, epsilon=0.01,
                                    stats_decay=0.99, is_async=self.is_async, cold_iter=10,
                                    max_grad_norm=self.max_grad_norm)

    self.train_model = train_model
    # self.step_model = step_model
    self.step = self.train_model.step
    self.value = self.train_model.value
    self.initial_state = self.train_model.initial_state
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nsteps=20,
             ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
             kfac_clip=0.001, lrschedule='linear', is_async=True):
    self.sess = sess = get_session()
    nbatch = nenvs * nsteps

    with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
        self.model = step_model = policy(nenvs, 1, sess=sess)
        self.model2 = train_model = policy(nenvs * nsteps, nsteps, sess=sess)

    A = train_model.pdtype.sample_placeholder([None])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(A)
    self.logits = train_model.pi

    # training loss
    pg_loss = tf.reduce_mean(ADV * neglogpac)
    entropy = tf.reduce_mean(train_model.pd.entropy())
    pg_loss = pg_loss - ent_coef * entropy
    vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
    train_loss = pg_loss + vf_coef * vf_loss

    # Fisher loss construction
    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
    sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
    self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
        tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
    self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

    self.params = params = find_trainable_variables("acktr_model")
    self.grads_check = grads = tf.gradients(train_loss, params)

    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                                momentum=0.9, kfac_update=1, epsilon=0.01,
                                                stats_decay=0.99, is_async=is_async,
                                                cold_iter=10, max_grad_norm=max_grad_norm)

        # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
        optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
        train_op, q_runner = optim.apply_gradients(list(zip(grads, params)))

    self.q_runner = q_runner
    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = self.lr.value()

        td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards,
                  PG_LR: cur_lr, VF_LR: cur_lr}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks

        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, train_op], td_map)
        return policy_loss, value_loss, policy_entropy

    self.train = train
    self.save = functools.partial(save_variables, sess=sess)
    self.load = functools.partial(load_variables, sess=sess)
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    tf.global_variables_initializer().run(session=sess)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
          animate=False, callback=None, desired_kl=0.002, fname=None):
    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9), momentum=0.9,
                               kfac_update=2, epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                               weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()

    if fname is not None and tf.train.checkpoint_exists(fname):
        load_result = U.load_state(fname)
        logger.log("Model loaded from file {}".format(fname))

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert qr is not None
        enqueue_threads.extend(qr.create_threads(get_session(), coord=coord, start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Save model every 100 iterations
        if fname is not None and (i % 100 == 99):
            U.save_state(fname)
            logger.log("Model saved to file {}".format(fname))
        env.seed()

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env, policy, max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0) and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)

        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)

        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            U.eval(tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)))
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            U.eval(tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)))
        else:
            logger.log("kl just right!")

        logger.record_tabular("EpRewMean",
                              np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular("EpRewSEM",
                              np.std([path["reward"].sum() / np.sqrt(len(paths)) for path in paths]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
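# The advantage estimator built in the loop above is GAE(lambda):
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),   A_t = sum_k (gamma * lam)^k * delta_{t+k},
# which is what common.discount(delta_t, gamma * lam) computes over the TD residuals.
# A standalone NumPy sketch (illustrative only; the helper name is not part of the original code):
import numpy as np

def gae_advantages(rewards, values, gamma, lam):
    """values must contain one extra bootstrap entry, as appended in the loop above."""
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    acc = 0.0
    for t in reversed(range(len(deltas))):
        acc = deltas[t] + gamma * lam * acc
        advantages[t] = acc
    return advantages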
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nsteps=20, nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) logpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) self.logits = logits = train_model.pi ##training loss pg_loss = tf.reduce_mean(ADV * logpac) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) sample_net = train_model.vf + tf.random_normal(tf.shape( train_model.vf)) self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean( tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.params = params = find_trainable_variables("model") self.grads_check = grads = tf.gradients(train_loss, params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ momentum=0.9, kfac_update=1, epsilon=0.01,\ stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads, params))) self.q_runner = q_runner self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, PG_LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.save = save self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, expert_nbatch, total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, vf_expert_coef=0.5 * 0.0, expert_coeff=1.0, exp_adv_est='reward', lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'): # create tf stuff config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) # the actual model nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) A_EXP = tf.placeholder(tf.int32, [expert_nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) ADV_EXP = tf.placeholder(tf.float32, [expert_nbatch]) R = tf.placeholder(tf.float32, [nbatch]) R_EXP = tf.placeholder(tf.float32, [expert_nbatch]) PG_LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) eval_step_model = policy(sess, ob_space, ac_space, 1, 1, reuse=True) train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) expert_train_model = policy(sess, ob_space, ac_space, expert_nbatch, 1, reuse=True) logpac_expert = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=expert_train_model.pi, labels=A_EXP) logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) _, acc = tf.metrics.accuracy(labels=A, predictions=tf.argmax(train_model.pi, 1)) ## training loss pg_loss = tf.reduce_mean(ADV*logpac) pg_expert_loss = tf.reduce_mean(ADV_EXP * logpac_expert) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) vf_expert_loss = tf.reduce_mean(mse(tf.squeeze(expert_train_model.vf), R_EXP)) train_loss = pg_loss + vf_coef * vf_loss + expert_coeff * pg_expert_loss + vf_expert_coef * vf_expert_loss self.check = check = tf.add_check_numerics_ops() ## Fisher loss construction pg_fisher_loss = -tf.reduce_mean(logpac) # + logpac_expert) # pg_expert_fisher_loss = -tf.reduce_mean(logpac_expert) sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) vf_fisher_loss = - vf_fisher_coef * tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) joint_fisher_loss = pg_fisher_loss + vf_fisher_loss params = find_trainable_variables("model") self.grads_check = grads = tf.gradients(train_loss, params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer( learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1, epsilon=0.01, stats_decay=0.99, async=1, cold_iter=20, max_grad_norm=max_grad_norm ) # why is this unused? 
update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads,params))) self.q_runner = q_runner lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values, expert_obs, expert_rewards, expert_actions, expert_values): if exp_adv_est == 'critic': expert_advs = np.clip(expert_rewards - expert_values, a_min=0, a_max=None) elif exp_adv_est == 'reward': expert_advs = expert_rewards elif exp_adv_est == 'simple': expert_advs = np.ones_like(expert_rewards) else: raise ValueError("Unknown expert advantage estimator {}".format(exp_adv_est)) advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X:obs, expert_train_model.X: expert_obs, A_EXP: expert_actions, A:actions, ADV:advs, ADV_EXP: expert_advs, R:rewards, PG_LR:cur_lr, R_EXP: expert_rewards } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, policy_expert_loss, value_loss, policy_entropy, train_accuracy, _, grads_to_check = sess.run( [pg_loss, pg_expert_loss, vf_loss, entropy, acc, train_op, grads], td_map ) for grad in grads_to_check: if np.isnan(grad).any(): print("ojojoj grad is nan") return policy_loss, policy_expert_loss, value_loss, policy_entropy, train_accuracy def save(save_path): print("Writing model to {}".format(save_path)) ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) def eval_step(obs, eval_type): td_map = {eval_step_model.X: [obs]} logits = sess.run(eval_step_model.pi, td_map)[0] if eval_type == 'argmax': act = logits.argmax() if np.random.rand() < 0.01: act = ac_space.sample() return act elif eval_type == 'prob': # probs = func(s[None, :, :, :])[0][0] x = logits e_x = np.exp(x - np.max(x)) probs = e_x / e_x.sum(axis=0) act = np.random.choice(range(probs.shape[-1]), 1, p=probs)[0] return act else: raise ValueError("Unknown eval type {}".format(eval_type)) self.model = step_model self.model2 = train_model self.expert_train_model = expert_train_model self.vf_fisher = vf_fisher_loss self.pg_fisher = pg_fisher_loss self.joint_fisher = joint_fisher_loss self.params = params self.train = train self.save = save self.load = load self.train_model = train_model self.step_model = step_model self.eval_step = eval_step self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess) tf.local_variables_initializer().run(session=sess)
def __init__(self, ob_dim, ac_dim):
    """
    Create an MLP value function.

    :param ob_dim: (int) Observation dimension
    :param ac_dim: (int) Action dimension
    """
    obs_ph = tf.placeholder(tf.float32,
                            shape=[None, ob_dim * 2 + ac_dim * 2 + 2])  # batch of observations
    vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
    wd_dict = {}
    layer_1 = tf.nn.elu(dense(obs_ph, 64, "h1", weight_init=tf_util.normc_initializer(1.0),
                              bias_init=0, weight_loss_dict=wd_dict))
    layer_2 = tf.nn.elu(dense(layer_1, 64, "h2", weight_init=tf_util.normc_initializer(1.0),
                              bias_init=0, weight_loss_dict=wd_dict))
    vpred_n = dense(layer_2, 1, "hfinal", weight_init=tf_util.normc_initializer(1.0),
                    bias_init=0, weight_loss_dict=wd_dict)[:, 0]
    sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
    wd_loss = tf.get_collection("vf_losses", None)
    loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
    loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))

    self._predict = tf_util.function([obs_ph], vpred_n)

    optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001 * (1 - 0.9), momentum=0.9,
                               clip_kl=0.3, epsilon=0.1, stats_decay=0.95, async=1,
                               kfac_update=2, cold_iter=50, weight_decay_dict=wd_dict,
                               max_grad_norm=None)
    vf_var_list = []
    for var in tf.trainable_variables():
        if "vf" in var.name:
            vf_var_list.append(var)

    update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
    self.do_update = tf_util.function([obs_ph, vtarg_n], update_op)  # pylint: disable=E1101
    tf_util.initialize()  # Initialize uninitialized TF variables
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
          animate=False, callback=None, desired_kl=0.002, fname='./training.ckpt'):
    mean_logger = setup_logger("Mean Logger", "log/episode_mean.txt")

    # print("Filter shape: ", env.observation_space.shape)
    space = (env.observation_space.shape[0] * 2, )
    obfilter = ZFilter(space)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')  # 0.03
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9), momentum=0.9,
                               kfac_update=2, epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                               weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()

    # changes
    if fname is not None and tf.train.checkpoint_exists(fname):
        saver = tf.train.Saver()
        saver.restore(tf.get_default_session(), fname)
        logger.log("Model loaded from file {}".format(fname))

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert qr is not None, "QR is None"
        enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True))

    i = 0
    timesteps_so_far = 0
    total_reward = float()
    while True:
        print("Timestep Number: %d of %d" % (timesteps_so_far, num_timesteps))
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Save model every 100 iterations
        if fname is not None and (i % 100 == 0):
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)
            logger.log("Model saved to file {}".format(fname))
        env.seed()

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        terminal_rew = []
        while True:
            path, temp_rew = rollout(env, policy, max_pathlength,
                                     animate=(len(paths) == 0 and (i % 10 == 0) and animate),
                                     obfilter=obfilter)
            paths.append(path)
            terminal_rew.append(np.array(temp_rew))
            n = pathlength(path)
            timesteps_this_batch += n
            if timesteps_this_batch > timesteps_per_batch:
                break
            timesteps_so_far += 1

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)

        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)

        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        terminal_rew = np.array(terminal_rew)
        rew_mean = np.mean([path.sum() for path in terminal_rew])
        rew_sem = np.std([path.sum() / np.sqrt(len(terminal_rew)) for path in terminal_rew])
        len_mean = np.mean([path.shape[0] for path in terminal_rew])

        # rewList = []
        # for path in paths:
        #     trew = []
        #     rew_i = 0
        #     while True:
        #         trew.append(path["reward"][rew_i])
        #         rew_i += 11
        #         if rew_i > (len(path["reward"]) - 1):
        #             break
        #     rewList.append(np.array(trew))
        # rewList = np.array(rewList)
        # rew_mean = np.mean([path.sum() for path in rewList])
        # rew_sem = np.std([path.sum()/np.sqrt(len(rewList)) for path in rewList])
        # len_mean = np.mean([path.shape[0] for path in rewList])

        # rew_mean = np.mean([path["reward"].sum() for path in paths])
        # rew_sem = np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])
        # len_mean = np.mean([pathlength(path) for path in paths])

        total_reward += rew_mean
        logger.record_tabular("EpRewMean", rew_mean)
        logger.record_tabular("EpRewSEM", rew_sem)
        logger.record_tabular("EpLenMean", len_mean)
        logger.record_tabular("TotalRewardMean", total_reward)
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()
        mean_logger.info("Result for episode {} of {}: Sum: {}, Average: {}, Length: {}".format(
            timesteps_so_far, num_timesteps, rew_mean, rew_sem, len_mean))
        i += 1

    if fname is not None:
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        saver = tf.train.Saver()
        saver.save(tf.get_default_session(), fname)
        logger.log("Model saved to file {}".format(fname))
    env.seed()

    coord.request_stop()
    coord.join(enqueue_threads)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, lr=0.03, momentum=0.9): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(lr)), name='stepsize') stepsize_mul = tf.placeholder(tf.float32, shape=None) inputs, loss, loss_sampled = policy.update_info inputs = list(inputs) inputs.append(stepsize_mul) optim = kfac.KfacOptimizer(learning_rate=stepsize * stepsize_mul, cold_lr=stepsize * stepsize_mul *(1-0.9), momentum=momentum, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) grads = optim.compute_gradients(loss, pi_var_list) grads = [g[0] for g in grads] old_var = [ tf.Variable(initial_value=tf.zeros_like(v)) for v in pi_var_list ] old_to_new = tf.group( *[tf.assign(v, o) for v, o in zip(pi_var_list, old_var)]) old_from_new = tf.group( *[tf.assign(o, v) for v, o in zip(pi_var_list, old_var)]) do_old_var = U.function([], old_var) do_pi_var = U.function([], pi_var_list) do_old_from_new = U.function([], old_from_new) with tf.control_dependencies(grads): with tf.control_dependencies([old_to_new]): midpoint_op, q_runner_mid = optim.apply_gradients( list(zip(grads, pi_var_list))) do_midpoint = U.function(inputs, midpoint_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_old_from_new() # print(do_old_var()) do_update(ob_no, action_na, standardized_adv_n, 0.5) do_midpoint(ob_no, action_na, standardized_adv_n, 1.0) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) # if kl > desired_kl * 2: # logger.log("kl too high") # tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() # elif kl < desired_kl / 2: # logger.log("kl too low") # 
tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() # else: # logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, lr=0.03, momentum=0.9): ob_dim, ac_dim = policy.ob_dim, policy.ac_dim dbpi = GaussianMlpPolicy(ob_dim, ac_dim, 'dbp') oldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'oe') dboldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'doi') # with tf.variable_scope('dbp'): # with tf.variable_scope('oe'): # with tf.variable_scope('doi'): pi = policy do_std = U.function([], [pi.std_1a, pi.logstd_1a]) kloldnew = oldpi.pd.kl(pi.pd) dbkloldnew = dboldpi.pd.kl(dbpi.pd) dist = meankl = tf.reduce_mean(kloldnew) dbkl = tf.reduce_mean(dbkloldnew) obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(lr)), name='stepsize') inputs, loss, loss_sampled = policy.update_info var_list = [v for v in tf.global_variables() if "pi" in v.name] db_var_list = [v for v in tf.global_variables() if "dbp" in v.name] old_var_list = [v for v in tf.global_variables() if "oe" in v.name] db_old_var_list = [v for v in tf.global_variables() if "doi" in v.name] print(len(var_list), len(db_var_list), len(old_var_list), len(db_old_var_list)) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(old_var_list, var_list) ]) assign_db = U.function( [], [], updates=[ tf.assign(db, o) for (db, o) in zipsame(db_var_list, var_list) ] + [ tf.assign(dbold, dbnew) for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list) ]) assign_old_eq_newr = U.function( [], [], updates=[ tf.assign(newv, oldv) for (oldv, newv) in zipsame(old_var_list, var_list) ]) # assign_dbr = U.function([], [], updates= # [tf.assign(o, db) for (db, o) in zipsame(db_var_list, var_list)] + # [tf.assign(dbnew, dbold) for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list)]) klgrads = tf.gradients(dist, var_list) dbklgrads = tf.gradients(dbkl, db_var_list) p_grads = [tf.ones_like(v) for v in dbklgrads] get_flat = U.GetFlat(var_list) get_old_flat = U.GetFlat(old_var_list) set_from_flat = U.SetFromFlat(var_list) flat_tangent2 = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan2") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents2 = [] for shape in shapes: sz = U.intprod(shape) tangents2.append(tf.reshape(flat_tangent2[start:start + sz], shape)) start += sz gvp2 = tf.add_n([ tf.reduce_sum(g * tangent2) for (g, tangent2) in zipsame(dbklgrads, tangents2) ]) gvp2_grads = tf.gradients(gvp2, db_var_list) neg_term = tf.add_n([ tf.reduce_sum(g * tangent2) for (g, tangent2) in zipsame(gvp2_grads, tangents2) ]) / 2. 
ng1 = tf.gradients(neg_term, db_var_list) ng2 = tf.gradients(neg_term, db_old_var_list) neg_term_grads = [ a + b for (a, b) in zip(tf.gradients(neg_term, db_var_list), tf.gradients(neg_term, db_old_var_list)) ] neg_term = neg_term_grads # neg_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in neg_term_grads]) pos_term = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(gvp2_grads, p_grads) ]) pos_term_grads = [ a + b for (a, b) in zip(tf.gradients(pos_term, db_var_list), tf.gradients(pos_term, db_old_var_list)) ] pos_term_sum = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(pos_term_grads, tangents2) ]) pos_term_grads = tf.gradients(pos_term_sum, p_grads) pos_term = pos_term_grads # pos_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in pos_term_grads]) geo_term = [(p - n) * 0.5 for p, n in zip(pos_term, neg_term)] optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=momentum, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) grads = optim.compute_gradients(loss, var_list=pi_var_list) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) geo_term = [g1 + g2[0] for g1, g2 in zip(geo_term, grads)] geo_grads = list(zip(geo_term, var_list)) update_geo_op, q_runner_geo = optim.apply_gradients(geo_grads) do_update = U.function(inputs, update_op) inputs_tangent = list(inputs) + [flat_tangent2] do_update_geo = U.function(inputs_tangent, update_geo_op) do_get_geo_term = U.function(inputs_tangent, [ng1, ng2]) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner, q_runner_geo]: assert (qr != None) enqueue_threads.extend( qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) assign_old_eq_new() # set old parameter values to new parameter values assign_db() # Policy update do_update(ob_no, action_na, standardized_adv_n) # ft2 = get_flat() - get_old_flat() # assign_old_eq_newr() # assign back # gnp = do_get_geo_term(ob_no, action_na, standardized_adv_n, ft2) # def check_nan(bs): # return 
[~np.isnan(b).all() for b in bs] # print(gnp[0]) # print('.....asdfasdfadslfkadsjfaksdfalsdkfjaldskf') # print(gnp[1]) # do_update_geo(ob_no, action_na, standardized_adv_n, ft2) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) # if kl > desired_kl * 2: # logger.log("kl too high") # tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() # elif kl < desired_kl / 2: # logger.log("kl too low") # tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() # else: # logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) print(do_std()) if callback: callback() logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, save_path="./", save_after=200, load_path=None, save_rollouts=False): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(U.get_session(), coord=coord, start=True)) if load_path != None: saver = tf.train.Saver() saver.restore(U.get_session(), os.path.join(load_path, "model.ckpt")) obfilter_path = os.path.join(load_path, "obfilter.pkl") with open(obfilter_path, 'rb') as obfilter_input: obfilter = pickle.load(obfilter_input) print("Loaded Model") else: # create saver saver = tf.train.Saver() i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: #path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter) if "jaco" in env.spec.id.lower(): path = rollout(env, policy, max_pathlength, animate=animate, obfilter=obfilter, save_rollouts=save_rollouts) goal_dist = np.linalg.norm(env.env.env.get_body_com("jaco_link_hand") \ - env.env.env.get_body_com("target")) if goal_dist <= 0.12: print("goal_dist {} ; episode added".format(goal_dist)) paths.append(path) else: path = rollout(env, policy, max_pathlength, animate=animate, obfilter=obfilter, save_rollouts=save_rollouts) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break if save_rollouts: # save the rollouts rollouts_path = os.path.join(load_path, "rollouts-v2.pkl") with open(rollouts_path, 'wb') as rollouts_output: pickle.dump(paths, rollouts_output, pickle.HIGHEST_PROTOCOL) sys.exit() # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) logp_n = np.concatenate([path["logp"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, 
oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") U.eval( tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5))) elif kl < desired_kl / 2: logger.log("kl too low") U.eval( tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5))) else: logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() # save model if necessary if i % save_after == 0: save(saver, obfilter, save_path) i += 1
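# The KL-based step-size rule repeated in each of these learn() loops, restated as a small
# helper (illustrative only; the function name is not part of the original code): shrink the
# K-FAC learning rate when the measured policy KL overshoots twice the target, grow it when
# it undershoots half the target, and otherwise leave it unchanged.
def adjust_stepsize(stepsize, kl, desired_kl, min_stepsize=1e-8, max_stepsize=1.0, factor=1.5):
    if kl > desired_kl * 2:
        return max(min_stepsize, stepsize / factor)
    elif kl < desired_kl / 2:
        return min(max_stepsize, stepsize * factor)
    return stepsize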
def __init__(self, policy, nenvs, total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01,
             vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001,
             lrschedule='linear', is_async=True):
    sess = get_session()
    nbatch = nenvs * nsteps

    with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
        # step_model is used for sampling
        step_model = policy(nenvs, 1, sess)
        # train_model is used to train our network
        train_model = policy(nbatch, nsteps, sess)
        eval_model = policy(1, 1, sess)

    # A = train_model.pdtype.sample_placeholder([None])
    # A = tf.placeholder(step_model.action.dtype, step_model.action.shape)
    probs = tf.nn.softmax(step_model.pi)
    class_ind = tf.to_int32(tf.multinomial(tf.log(probs), 1)[0][0])
    self.pg_fisher = pg_fisher_loss = tf.log(probs[0, class_ind])

    # Fisher loss construction
    # self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
    # sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
    # self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
    # self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
    self.joint_fisher = joint_fisher_loss = pg_fisher_loss

    self.params = params = find_trainable_variables("a2c_model")

    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer()
        # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
        stats = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params[:-4])

    def compute_fisher(obs):
        # action = action[:, np.newaxis]
        td_map = {step_model.X: obs, step_model.keep_prob: 1.0}
        fisher = sess.run(stats, td_map)
        return fisher

    self.compute_fisher = compute_fisher
    self.save = functools.partial(save_variables, sess=sess)
    self.load = functools.partial(load_variables, sess=sess)
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    tf.global_variables_initializer().run(session=sess)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, save_model_with_prefix=None, restore_model_from_file=None, outdir="/tmp/rosrl/experiments/continuous/acktr/"): obfilter = ZFilter(env.observation_space.shape) # Risto change max_pathlength = env.max_episode_steps stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async_=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() """ Here we add a possibility to resume from a previously saved model if a model file is provided """ if restore_model_from_file: saver = tf.train.Saver() saver.restore(tf.get_default_session(), restore_model_from_file) logger.log("Loaded model from {}".format(restore_model_from_file)) # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 if save_model_with_prefix: # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/' summary_writer = tf.summary.FileWriter(outdir, graph=tf.get_default_graph()) while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() elif kl < desired_kl / 2: logger.log("kl too low") tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() else: logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", 
np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() """ Save the model at every itteration """ if save_model_with_prefix: if np.mean([path["reward"].sum() for path in paths]) > -50.0: # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/' summary = tf.Summary(value=[ tf.Summary.Value(tag="EpRewMean", simple_value=np.mean([ path["reward"].sum() for path in paths ])) ]) summary_writer.add_summary(summary, i) if not os.path.exists(outdir): os.makedirs(outdir) modelF = outdir + '/' + save_model_with_prefix + "_afterIter_" + str( i) + ".model" U.save_state(modelF) logger.log("Saved model to file :{}".format(modelF)) i += 1 coord.request_stop() coord.join(enqueue_threads)
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nsteps=20, nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) #nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) SUB3 = tf.placeholder(tf.int32, [nbatch]) SUB4 = tf.placeholder(tf.int32, [nbatch]) SUB5 = tf.placeholder(tf.int32, [nbatch]) SUB6 = tf.placeholder(tf.int32, [nbatch]) SUB7 = tf.placeholder(tf.int32, [nbatch]) SUB8 = tf.placeholder(tf.int32, [nbatch]) SUB9 = tf.placeholder(tf.int32, [nbatch]) SUB10 = tf.placeholder(tf.int32, [nbatch]) SUB11 = tf.placeholder(tf.int32, [nbatch]) SUB12 = tf.placeholder(tf.int32, [nbatch]) X0 = tf.placeholder(tf.int32, [nbatch]) Y0 = tf.placeholder(tf.int32, [nbatch]) X1 = tf.placeholder(tf.int32, [nbatch]) Y1 = tf.placeholder(tf.int32, [nbatch]) X2 = tf.placeholder(tf.int32, [nbatch]) Y2 = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub3, labels=SUB3) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub4, labels=SUB4) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub5, labels=SUB5) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub6, labels=SUB6) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub7, labels=SUB7) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub8, labels=SUB8) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub9, labels=SUB9) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub10, labels=SUB10) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub11, labels=SUB11) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub12, labels=SUB12) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x0, labels=X0) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y0, labels=Y0) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x1, labels=X1) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y1, labels=Y1) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x2, labels=X2) \ + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y2, labels=Y2) self.logits = logits = train_model.pi ##training loss pg_loss = tf.reduce_mean(ADV * logpac) * tf.reduce_mean(ADV) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) \ + tf.reduce_mean(cat_entropy(train_model.pi_sub3)) \ + tf.reduce_mean(cat_entropy(train_model.pi_sub4)) \ + tf.reduce_mean(cat_entropy(train_model.pi_sub5)) \ + tf.reduce_mean(cat_entropy(train_model.pi_sub6)) \ + tf.reduce_mean(cat_entropy(train_model.pi_sub7)) \ + tf.reduce_mean(cat_entropy(train_model.pi_sub8)) \ + 
tf.reduce_mean(cat_entropy(train_model.pi_sub9)) \ + tf.reduce_mean(cat_entropy(train_model.pi_sub10)) \ + tf.reduce_mean(cat_entropy(train_model.pi_sub11)) \ + tf.reduce_mean(cat_entropy(train_model.pi_sub12)) \ + tf.reduce_mean(cat_entropy(train_model.pi_x0)) \ + tf.reduce_mean(cat_entropy(train_model.pi_y0)) \ + tf.reduce_mean(cat_entropy(train_model.pi_x1)) \ + tf.reduce_mean(cat_entropy(train_model.pi_y1)) \ + tf.reduce_mean(cat_entropy(train_model.pi_x2)) \ + tf.reduce_mean(cat_entropy(train_model.pi_y2)) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) sample_net = train_model.vf + tf.random_normal(tf.shape( train_model.vf)) self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean( tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.params = params = find_trainable_variables("model") self.grads_check = grads = tf.gradients(train_loss, params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip, \ momentum=0.9, kfac_update=1, epsilon=0.01, \ stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads, params))) self.q_runner = q_runner self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10, sub11, sub12, x0, y0, x1, y1, x2, y2, values): advs = rewards - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = { train_model.X: obs, A: actions, SUB3: sub3, SUB4: sub4, SUB5: sub5, SUB6: sub6, SUB7: sub7, SUB8: sub8, SUB9: sub9, SUB10: sub10, SUB11: sub11, SUB12: sub12, X0: x0, Y0: y0, X1: x1, Y1: y1, X2: x2, Y2: y2, ADV: advs, R: rewards, PG_LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map) print("policy_loss : ", policy_loss, " value_loss : ", value_loss, " entropy : ", entropy) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.save = save self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state print("global_variables_initializer start") tf.global_variables_initializer().run(session=sess) print("global_variables_initializer complete")
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, resume, logdir, agentName, num_timesteps, animate=False, callback=None, desired_kl=0.002): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1.0 - 0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(U.get_session(), coord=coord, start=True)) timesteps_so_far = 0 saver = tf.train.Saver(max_to_keep=10) if resume > 0: saver.restore( tf.get_default_session(), os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName, resume))) ob_filter_path = os.path.join(os.path.abspath(logdir), "{}-{}".format('obfilter', resume)) with open(ob_filter_path, 'rb') as ob_filter_input: obfilter = pickle.load(ob_filter_input) print("Loaded observation filter") iters_so_far = resume print('logdir = ', logdir) logF = open(os.path.join(logdir, 'log.txt'), 'a') logF2 = open(os.path.join(logdir, 'log_it.txt'), 'a') logStats = open(os.path.join(logdir, 'log_stats.txt'), 'a') while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % iters_so_far) save_interval = 5 # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (iters_so_far % save_interval == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2.0: logger.log("kl too high") U.eval( tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5))) elif kl < desired_kl / 2.0: logger.log("kl too low") U.eval( tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5))) else: logger.log("kl just right!") rew_mean = np.mean([path["reward"].sum() for path in paths]) logger.record_tabular("EpRewMean", rew_mean) logger.record_tabular( "EpRewSEM", np.std([ 
path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) logF.write(str(rew_mean) + "\n") logF2.write(str(iters_so_far) + "," + str(rew_mean) + "\n") # json.dump(combined_stats, logStats) logF.flush() logF2.flush() # logStats.flush() if save_interval and (iters_so_far % save_interval == 0 or iters_so_far == 1): saver.save(tf.get_default_session(), os.path.join(logdir, agentName), global_step=iters_so_far) ob_filter_path = os.path.join( os.path.abspath(logdir), "{}-{}".format('obfilter', iters_so_far)) with open(ob_filter_path, 'wb') as ob_filter_output: pickle.dump(obfilter, ob_filter_output, pickle.HIGHEST_PROTOCOL) if callback: callback() logger.dump_tabular() iters_so_far += 1 coord.request_stop() coord.join(enqueue_threads)
def __init__(self, policy, ob_space, ac_space, n_envs, total_timesteps, nprocs=32, n_steps=20,
             ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25,
             max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear'):
    """
    The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144

    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param ob_space: (Gym Space) The observation space
    :param ac_space: (Gym Space) The action space
    :param n_envs: (int) The number of environments
    :param total_timesteps: (int) The total number of timesteps for training the model
    :param nprocs: (int) The number of threads for TensorFlow operations
    :param n_steps: (int) The number of steps to run for each environment
    :param ent_coef: (float) The weight for the entropy loss
    :param vf_coef: (float) The weight for the loss on the value function
    :param vf_fisher_coef: (float) The weight for the Fisher loss on the value function
    :param learning_rate: (float) The initial learning rate for the K-FAC optimizer
    :param max_grad_norm: (float) The clipping value for the maximum gradient
    :param kfac_clip: (float) gradient clipping for the Kullback-Leibler divergence
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    """
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=nprocs,
                            inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    n_batch = n_envs * n_steps

    action_ph = tf.placeholder(tf.int32, [n_batch])
    advs_ph = tf.placeholder(tf.float32, [n_batch])
    rewards_ph = tf.placeholder(tf.float32, [n_batch])
    pg_lr_ph = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(sess, ob_space, ac_space, n_envs, 1, reuse=False)
    self.model2 = train_model = policy(sess, ob_space, ac_space, n_envs * n_steps, n_steps, reuse=True)

    # sparse softmax cross-entropy is -log pi(a|s) for the chosen actions
    logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy, labels=action_ph)
    self.logits = train_model.policy

    # training loss
    pg_loss = tf.reduce_mean(advs_ph * logpac)
    entropy = tf.reduce_mean(calc_entropy(train_model.policy))
    pg_loss = pg_loss - ent_coef * entropy
    vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
    train_loss = pg_loss + vf_coef * vf_loss

    # Fisher loss construction
    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
    sample_net = train_model.value_fn + tf.random_normal(tf.shape(train_model.value_fn))
    self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
        tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
    self.joint_fisher = pg_fisher_loss + vf_fisher_loss

    self.params = params = find_trainable_variables("model")
    self.grads_check = grads = tf.gradients(train_loss, params)

    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(learning_rate=pg_lr_ph, clip_kl=kfac_clip,
                                                momentum=0.9, kfac_update=1, epsilon=0.01,
                                                stats_decay=0.99, async=1, cold_iter=10,
                                                max_grad_norm=max_grad_norm)

        optim.compute_and_apply_stats(self.joint_fisher, var_list=params)
        train_op, q_runner = optim.apply_gradients(list(zip(grads, params)))
    self.q_runner = q_runner
    self.learning_rate = Scheduler(initial_value=learning_rate, n_values=total_timesteps,
                                   schedule=lr_schedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = rewards - values
        for _ in range(len(obs)):
            cur_lr = self.learning_rate.value()

        td_map = {
            train_model.obs_ph: obs,
            action_ph: actions,
            advs_ph: advs,
            rewards_ph: rewards,
            pg_lr_ph: cur_lr
        }
        if states is not None:
            td_map[train_model.states_ph] = states
            td_map[train_model.masks_ph] = masks

        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, train_op], td_map)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        session_params = sess.run(params)
        joblib.dump(session_params, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for param, loaded_p in zip(params, loaded_params):
            restores.append(param.assign(loaded_p))
        sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    tf.global_variables_initializer().run(session=sess)
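# --- Hedged example (not part of the original file) ---------------------------
# Minimal sketch of what the 'linear' option of the Scheduler used above is
# assumed to do: each call to value() returns the current learning rate and
# anneals it linearly from `initial_value` towards 0 over `n_values` calls.
# Illustration only, not the library's implementation (which supports the
# other schedules listed in the docstring as well).
class LinearScheduleSketch:
    def __init__(self, initial_value, n_values):
        self.initial_value = initial_value
        self.n_values = n_values
        self.step = 0

    def value(self):
        frac = max(1.0 - self.step / self.n_values, 0.0)
        self.step += 1
        return self.initial_value * frac

# Usage: lr = LinearScheduleSketch(0.25, 1_000_000); cur_lr = lr.value()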
def learn(env, policy, value_fn, gamma, lam, timesteps_per_batch, num_timesteps,
          animate=False, callback=None, desired_kl=0.002):
    """
    Trains an ACKTR model.

    :param env: (Gym environment) The environment to learn from
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param value_fn: (Object) The value function model to use (MLP, CNN, LSTM, ...)
    :param gamma: (float) The discount value
    :param lam: (float) the GAE lambda, trading off bias and variance in the advantage estimate
    :param timesteps_per_batch: (int) the number of timesteps for each batch
    :param num_timesteps: (int) the total number of timesteps to run
    :param animate: (bool) whether to render the environment
    :param callback: (function) called after every iteration, can be used for logging and saving
    :param desired_kl: (float) the target Kullback-Leibler divergence used to adapt the step size
    """
    obfilter = ZFilter(env.observation_space.shape)
    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9), momentum=0.9,
                               kfac_update=2, epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                               weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)
    do_update = tf_util.function(inputs, update_op)
    tf_util.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for queue_runner in [q_runner, value_fn.q_runner]:
        assert queue_runner is not None
        enqueue_threads.extend(queue_runner.create_threads(tf.get_default_session(), coord=coord, start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env, policy, max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0) and animate),
                           obfilter=obfilter)
            paths.append(path)
            timesteps_this_batch += path["reward"].shape[0]
            timesteps_so_far += path["reward"].shape[0]
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = value_fn.predict(path)
            vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)

        # Update value function
        value_fn.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)

        # Adjust stepsize
        kl_loss = policy.compute_kl(ob_no, oldac_dist)
        if kl_loss > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        elif kl_loss < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular("EpRewSEM",
                              np.std([path["reward"].sum() / np.sqrt(len(paths)) for path in paths]))
        logger.record_tabular("EpLenMean", np.mean([path["reward"].shape[0] for path in paths]))
        logger.record_tabular("KL", kl_loss)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
def __init__(self, x, y_):
    self.x = x  # input placeholder
    x_image = tf.reshape(self.x, [-1, 28, 28, 1])

    # simple two-conv-layer network
    with tf.variable_scope('conv1'):
        W1 = weight_variable([5, 5, 1, 32])
        b1 = bias_variable([32])
        h_conv1 = tf.nn.relu(conv2d(x_image, W1) + b1)
        h_pool1 = max_pool_2_2(h_conv1)

    with tf.variable_scope('conv2'):
        W2 = weight_variable([5, 5, 32, 64])
        b2 = bias_variable([64])
        h_conv2 = tf.nn.relu(conv2d(h_pool1, W2) + b2)
        h_pool2 = max_pool_2_2(h_conv2)

    with tf.variable_scope('fc1'):
        h_pool2_flatten = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
        W3 = weight_variable([7 * 7 * 64, 1024])
        b3 = bias_variable([1024])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flatten, W3) + b3)

    with tf.variable_scope('output'):
        W4 = weight_variable([1024, 10])
        b4 = bias_variable([10])
        self.y = tf.matmul(h_fc1, W4) + b4

    self.var_list = [W1, b1, W2, b2, W3, b3, W4, b4]

    # Alternative fully-connected variant (kept for reference):
    # in_dim = int(x.get_shape()[1])    # 784 for MNIST
    # out_dim = int(y_.get_shape()[1])  # 10 for MNIST
    # self.x = x  # input placeholder
    # # simple 2-layer network
    # W1 = weight_variable([in_dim, 100])
    # b1 = bias_variable([100])
    # W2 = weight_variable([100, out_dim])
    # b2 = bias_variable([out_dim])
    # h1 = tf.nn.relu(tf.matmul(x, W1) + b1)  # hidden layer
    # self.y = tf.matmul(h1, W2) + b2         # output layer
    # self.var_list = [W1, b1, W2, b2]

    # vanilla single-task loss
    self.cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_, logits=self.y))
    self.set_vanilla_loss()

    # performance metrics
    correct_prediction = tf.equal(tf.argmax(self.y, 1), tf.argmax(y_, 1))
    self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    self.ewc_loss = 0
    # self.star_vars = []
    self.F_accum = []
    self.optim = kfac.KfacOptimizer()
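# --- Hedged example (not part of the original file) ---------------------------
# Sketch of how an EWC (elastic weight consolidation) penalty is typically
# assembled from the pieces initialised above: self.F_accum as a per-variable
# diagonal Fisher estimate and anchor weights saved after the previous task.
# The method name `set_ewc_loss`, the `star_vars` attribute (only commented
# out in the class) and the `lam` coefficient are assumptions for illustration.
import numpy as np
import tensorflow as tf

def set_ewc_loss(self, lam=15.0):
    # L_ewc = L_task + (lam / 2) * sum_i F_i * (theta_i - theta_i_star)^2
    self.ewc_loss = self.cross_entropy
    for v in range(len(self.var_list)):
        self.ewc_loss += (lam / 2.0) * tf.reduce_sum(
            tf.multiply(self.F_accum[v].astype(np.float32),
                        tf.square(self.var_list[v] - self.star_vars[v])))
    # Minimizing self.ewc_loss (e.g. with the class's optimizer) then trades off
    # performance on the new task against drift from the anchored parameters.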