def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs, stochastic_policy, save=False, reuse=False): # Setup network # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space U.initialize() policy = build_policy(env, 'mlp', value_network='copy') ob = observation_placeholder(ob_space) with tf.variable_scope('pi'): pi = policy(observ_placeholder=ob) saver = tf.train.Saver() ckpt = tf.train.get_checkpoint_state(load_model_path) saver.restore(U.get_session(), ckpt.model_checkpoint_path) obs_list = [] acs_list = [] len_list = [] ret_list = [] from tqdm import tqdm for _ in tqdm(range(number_trajs)): traj = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy) obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj[ 'ep_len'], traj['ep_ret'] obs_list.append(obs) acs_list.append(acs) len_list.append(ep_len) ret_list.append(ep_ret) if stochastic_policy: print('stochastic policy:') else: print('deterministic policy:') if save: filename = load_model_path.split('/')[-1] + '.' + env.spec.id np.savez(filename, obs=np.array(obs_list), acs=np.array(acs_list), lens=np.array(len_list), rets=np.array(ret_list)) avg_len = sum(len_list) / len(len_list) avg_ret = sum(ret_list) / len(ret_list) # print("Average length:", avg_len) # print("Average return:", avg_ret) return avg_len, avg_ret
def train_copos(args): import baselines.common.tf_util as U sess = U.single_threaded_session() sess.__enter__() if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: rank = 0 configure_logger(args.log_path) else: rank = MPI.COMM_WORLD.Get_rank() configure_logger(args.log_path, format_strs=[]) workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank() #def policy_fn(name, ob_space, ac_space): # return CompatibleMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, # hid_size=32, num_hid_layers=2) set_global_seeds(workerseed) env = build_env(args, normalize_ob=True) #env = gym.make(args.env) #env.seed(workerseed) timesteps_per_batch = 10000 #timesteps_per_batch=2048 beta = -1 if beta < 0: nr_episodes = int(args.num_timesteps) // timesteps_per_batch # Automatically compute beta based on initial entropy and number of iterations policy = build_policy(env, "mlp", value_network='copy', copos=True) ob = observation_placeholder(env.observation_space) with tf.variable_scope("tmp_pi"): tmp_pi = policy(observ_placeholder=ob) sess.run(tf.global_variables_initializer()) tmp_ob = np.zeros((1, ) + env.observation_space.shape) entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob}) beta = 2 * entropy / nr_episodes print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes)) print("Automatically set beta: " + str(beta)) copos_mpi.learn(network='mlp', env=env, seed=args.seed, timesteps_per_batch=timesteps_per_batch, epsilon=0.01, beta=beta, cg_iters=10, cg_damping=0.1, max_timesteps=int(args.num_timesteps), gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) env.close()
def build_model(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, call_staliro=False, **network_kwargs): policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = 1 # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) model = PlayModel(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) return model
def learn(network, env, nsteps=5, total_timesteps=1e6, vf_coef=0.5, ent_coef=0.01, lr=1e-3, epsilon=1e-5, alpha=0.99, gamma=0.99): """ main entrypoint for A2C. train a policy with given network using A2C. """ nenvs = env.num_envs policy = build_policy(env, network, **network_kwargs) model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps) if load_path is not None: model.load(load_path) # instantiate the runner object runner = Runner(env, model, nsteps=nsteps, gamma=gamma) nbatch = nenvs * nsteps for update in range(1, total_timesteps // nbatch + 1): # get mini batch of experiences obs, states, rewards, masks, actions, values = runner.run() policy_loss, value_loss, policy_entropy = model.train( obs, states, rewards, masks, actions, values) return model
def policy_fn(name, ob_space, ac_space, reuse=False): return build_policy(env, 'mlp', value_network='copy')
def learn(env, policy_func, reward_giver, reward_guidance, expert_dataset, rank, pretrained, pretrained_weight, *, g_step, d_step, entcoeff, save_per_iter, ckpt_dir, log_dir, timesteps_per_batch, task_name, gamma, lam, algo, max_kl, cg_iters, cg_damping=1e-2, vf_stepsize=3e-4, d_stepsize=1e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, loss_percent=0.0, callback=None): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space policy = build_policy(env, 'mlp', value_network='copy') ob = observation_placeholder(ob_space) with tf.variable_scope('pi'): pi = policy(observ_placeholder=ob) with tf.variable_scope('oldpi'): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables('pi') # var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")] # vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") # assert len(var_list) == len(vf_var_list) + 1 d_adam = MpiAdam(reward_giver.get_trainable_variables()) guidance_adam = MpiAdam(reward_guidance.get_trainable_variables()) vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables('oldpi'), get_variables('pi')) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) d_adam.sync() guidance_adam.sync() vfadam.sync() if rank == 0: print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, reward_giver, reward_guidance, timesteps_per_batch, stochastic=True, algo=algo, loss_percent=loss_percent) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 g_loss_stats = stats(loss_names) d_loss_stats = stats(reward_giver.loss_name) ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) # if provide pretrained weight if pretrained_weight is not None: U.load_state(pretrained_weight, var_list=pi.get_variables()) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break # Save model if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None: fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) logger.log("********** Iteration %i ************" % iters_so_far) # global flag_render # if iters_so_far > 0 and iters_so_far % 10 ==0: # flag_render = True # else: # flag_render = False def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p # ------------------ Update G ------------------ logger.log("Optimizing Policy...") for _ in range(g_step): with timed("sampling"): seg = seg_gen.__next__() print('rewards', seg['rew']) add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): if hasattr(pi, "ob_rms"): pi.ob_rms.update( mbob) # update running mean/std for policy g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) g_losses = meanlosses for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, reward_giver.loss_name)) ob_expert, ac_expert = expert_dataset.get_next_batch( batch_size=len(ob)) batch_size = 128 d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch with timed("Discriminator"): for (ob_batch, ac_batch) in dataset.iterbatches( (ob, ac), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch( batch_size=batch_size) # update running mean/std for reward_giver if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) *newlosses, g = reward_giver.lossandgrad(ob_batch, ob_expert) d_adam.update(allmean(g), d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) # ------------------ Update Guidance ------------ logger.log("Optimizing Guidance...") logger.log(fmt_row(13, reward_guidance.loss_name)) batch_size = 128 guidance_losses = [ ] # list of tuples, each of which gives the loss for a minibatch with timed("Guidance"): for ob_batch, ac_batch in dataset.iterbatches( (ob, ac), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch( batch_size=batch_size) idx_condition = process_expert(ob_expert, ac_expert) pick_idx = (idx_condition >= loss_percent) # pick_idx = idx_condition ob_expert_p = ob_expert[pick_idx] ac_expert_p = ac_expert[pick_idx] ac_batch_p = [] for each_ob in ob_expert_p: tmp_ac, _, _, _ = pi.step(each_ob, stochastic=True) ac_batch_p.append(tmp_ac) # update running mean/std for reward_giver if hasattr(reward_guidance, "obs_rms"): reward_guidance.obs_rms.update(ob_expert_p) # reward_guidance.train(expert_s=ob_batch_p, agent_a=ac_batch_p, expert_a=ac_expert_p) *newlosses, g = reward_guidance.lossandgrad( ob_expert_p, ac_batch_p, ac_expert_p) guidance_adam.update(allmean(g), d_stepsize) guidance_losses.append(newlosses) logger.log(fmt_row(13, np.mean(guidance_losses, axis=0))) lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) true_rewbuffer.extend(true_rets) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) * g_step iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular()
def learn( *, network, env, timesteps_per_batch, # what to train on epsilon, beta, cg_iters, gamma, lam, # advantage estimation entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint seed=None, callback=None, load_path=None, TRPO=False, **network_kwargs): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() policy = build_policy(env, network, value_network='copy', copos=True, **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space discrete_ac_space = isinstance(ac_space, gym.spaces.Discrete) #ob = U.get_placeholder_cached(name="ob") ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() old_entropy = oldpi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "Entropy"] dist = meankl #all_var_list = pi.get_trainable_variables() #all_var_list = [v for v in all_var_list if v.name.split("/")[0].startswith("pi")] #var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] #vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] all_var_list = get_trainable_variables("pi") var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz #????gvp and fvp??? gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Initialize eta, omega optimizer if discrete_ac_space: init_eta = 1 init_omega = 0.5 eta_omega_optimizer = EtaOmegaOptimizerDiscrete( beta, epsilon, init_eta, init_omega) else: init_eta = 0.5 init_omega = 2.0 #????eta_omega_optimizer details????? eta_omega_optimizer = EtaOmegaOptimizer(beta, epsilon, init_eta, init_omega) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate #print(ob[:20]) #print(ac[:20]) if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "rms"): pi.rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() if TRPO: # # TRPO specific code. # Find correct step size using line search # shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / epsilon) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > epsilon * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) else: # # COPOS specific implementation. # copos_update_dir = stepdir # Split direction into log-linear 'w_theta' and non-linear 'w_beta' parts w_theta, w_beta = pi.split_w(copos_update_dir) tmp_ob = np.zeros( (1, ) + env.observation_space.shape ) # We assume that entropy does not depend on the NN # Optimize eta and omega if discrete_ac_space: entropy = lossbefore[4] #entropy = - 1/timesteps_per_batch * np.sum(np.sum(pi.get_action_prob(ob) * pi.get_log_action_prob(ob), axis=1)) eta, omega = eta_omega_optimizer.optimize( pi.compute_F_w(ob, copos_update_dir), pi.get_log_action_prob(ob), timesteps_per_batch, entropy) else: Waa, Wsa = pi.w2W(w_theta) wa = pi.get_wa(ob, w_beta) varphis = pi.get_varphis(ob) #old_ent = old_entropy.eval({oldpi.ob: tmp_ob})[0] old_ent = lossbefore[4] eta, omega = eta_omega_optimizer.optimize( w_theta, Waa, Wsa, wa, varphis, pi.get_kt(), pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent) logger.log("Initial eta: " + str(eta) + " and omega: " + str(omega)) current_theta_beta = get_flat() prev_theta, prev_beta = pi.all_to_theta_beta( current_theta_beta) if discrete_ac_space: # Do a line search for both theta and beta parameters by adjusting only eta eta = eta_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, epsilon, args, discrete_ac_space) logger.log("Updated eta, eta: " + str(eta)) set_from_flat(pi.theta_beta_to_all(prev_theta, prev_beta)) # Find proper omega for new eta. Use old policy parameters first. eta, omega = eta_omega_optimizer.optimize( pi.compute_F_w(ob, copos_update_dir), pi.get_log_action_prob(ob), timesteps_per_batch, entropy, eta) logger.log("Updated omega, eta: " + str(eta) + " and omega: " + str(omega)) # do line search for ratio for non-linear "beta" parameter values #ratio = beta_ratio_line_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, # epsilon, beta, args) # set ratio to 1 if we do not use beta ratio line search ratio = 1 #print("ratio from line search: " + str(ratio)) cur_theta = (eta * prev_theta + w_theta.reshape(-1, )) / (eta + omega) cur_beta = prev_beta + ratio * w_beta.reshape(-1, ) / eta else: for i in range(2): # Do a line search for both theta and beta parameters by adjusting only eta eta = eta_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, epsilon, args) logger.log("Updated eta, eta: " + str(eta) + " and omega: " + str(omega)) # Find proper omega for new eta. Use old policy parameters first. set_from_flat( pi.theta_beta_to_all(prev_theta, prev_beta)) eta, omega = \ eta_omega_optimizer.optimize(w_theta, Waa, Wsa, wa, varphis, pi.get_kt(), pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent, eta) logger.log("Updated omega, eta: " + str(eta) + " and omega: " + str(omega)) # Use final policy logger.log("Final eta: " + str(eta) + " and omega: " + str(omega)) cur_theta = (eta * prev_theta + w_theta.reshape(-1, )) / (eta + omega) cur_beta = prev_beta + w_beta.reshape(-1, ) / eta set_from_flat(pi.theta_beta_to_all(cur_theta, cur_beta)) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) ##copos specific over if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) #cg over for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) #policy update over with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) print("Reward max: " + str(max(rewbuffer))) print("Reward min: " + str(min(rewbuffer))) logger.record_tabular( "EpLenMean", np.mean(lenbuffer) if np.sum(lenbuffer) != 0.0 else 0.0) logger.record_tabular( "EpRewMean", np.mean(rewbuffer) if np.sum(rewbuffer) != 0.0 else 0.0) logger.record_tabular( "AverageReturn", np.mean(rewbuffer) if np.sum(rewbuffer) != 0.0 else 0.0) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular()
def learn( *, policy_network, classifier_network, env, max_iters, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, expert_trajs_path='./expert_trajs', num_expert_trajs=500, data_subsample_freq=20, g_step=1, d_step=1, classifier_entcoeff=1e-3, num_particles=5, d_stepsize=3e-4, max_episodes=0, total_timesteps=0, # time constraint callback=None, load_path=None, save_path=None, render=False, use_classifier_logsumexp=True, use_reward_logsumexp=False, use_svgd=True, **policy_network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) entcoeff coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' nworkers = MPI.COMM_WORLD.Get_size() if nworkers > 1: raise NotImplementedError rank = MPI.COMM_WORLD.Get_rank() gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.49) U.get_session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)) policy = build_policy(env, policy_network, value_network='copy', **policy_network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) D = build_classifier(env, classifier_network, num_particles, classifier_entcoeff, use_classifier_logsumexp, use_reward_logsumexp) grads_list, vars_list = D.get_grads_and_vars() if use_svgd: optimizer = SVGD( grads_list, vars_list, lambda: tf.train.AdamOptimizer(learning_rate=d_stepsize)) else: optimizer = Ensemble( grads_list, vars_list, lambda: tf.train.AdamOptimizer(learning_rate=d_stepsize)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='yellow')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='blue')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if rank == 0: saver = tf.train.Saver(var_list=get_variables("pi"), max_to_keep=10000) writer = FileWriter(os.path.join(save_path, 'logs')) stats = Statistics( scalar_keys=["average_return", "average_episode_length"]) if load_path is not None: # pi.load(load_path) saver.restore(U.get_session(), load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- if load_path is not None: seg_gen = traj_segment_generator(pi, env, 1, stochastic=False, render=render) else: seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, render=render) seg_gen_e = expert_traj_segment_generator(env, expert_trajs_path, data_subsample_freq, timesteps_per_batch, num_expert_trajs) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # nothing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) if iters_so_far % 500 == 0 and save_path is not None and load_path is None: fname = os.path.join(save_path, 'checkpoints', 'checkpoint') save_state(fname, saver, iters_so_far) with timed("sampling"): seg = seg_gen.__next__() if load_path is not None: iters_so_far += 1 logger.record_tabular("EpRew", int(np.mean(seg["ep_true_rets"]))) logger.record_tabular("EpLen", int(np.mean(seg["ep_lens"]))) logger.dump_tabular() continue seg["rew"] = D.get_reward(seg["ob"], seg["ac"]) add_vtarg_and_adv(seg, gamma, lam) ob, ac, ep_lens, atarg, tdlamret = seg["ob"], seg["ac"], seg[ "ep_lens"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "rms"): pi.rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=1000): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) with timed("sample expert trajectories"): ob_a, ac_a, ep_lens_a = ob, ac, ep_lens seg_e = seg_gen_e.__next__() ob_e, ac_e, ep_lens_e = seg_e["ob"], seg_e["ac"], seg_e["ep_lens"] if hasattr(D, "rms"): obs = np.concatenate([ob_a, ob_e], axis=0) if isinstance(ac_space, spaces.Box): acs = np.concatenate([ac_a, ac_e], axis=0) D.rms.update(np.concatenate([obs, acs], axis=1)) elif isinstance(ac_space, spaces.Discrete): D.rms.update(obs) else: raise NotImplementedError with timed("SVGD"): sess = tf.get_default_session() feed_dict = { D.Xs['a']: ob_a, D.As['a']: ac_a, D.Ls['a']: ep_lens_a, D.Xs['e']: ob_e, D.As['e']: ac_e, D.Ls['e']: ep_lens_e } for _ in range(d_step): sess.run(optimizer.update_op, feed_dict=feed_dict) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_true_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular() stats.add_all_summary( writer, [np.mean(rewbuffer), np.mean(lenbuffer)], iters_so_far) rewbuffer.clear() lenbuffer.clear() return pi
path = ['train_log/RoboSumo-Ant-vs-Ant-v0-2020-04-28-12-07-49-785609', 'train_log/RoboSumo-Ant-vs-Ant-v0-2020-04-28-12-07-49-785609'] ID = [2000, 1000] length = 5000 model_path = [path[0] + '/checkpoints/%.5i' % ID[0], path[1] + '/checkpoints/%.5i' % ID[1]] env = gym.make('RoboSumo-Ant-vs-Ant-v0') env.num_envs = 1 for agent in env.agents: agent._adjust_z = -0.5 # env = VideoRecorder(env, osp.join(path, "videos-%d" % mid), record_video_trigger=lambda x: True, video_length=length) policy = [build_policy(env, 'mlp', num_hidden=64, activation=tf.nn.relu, value_network='copy'), build_policy(env, 'mlp', num_hidden=64, activation=tf.nn.relu, value_network='copy')] ob_space = env.observation_space[0] ac_space = env.action_space[0] from model import Model model_fn = Model model = [model_fn(policy=policy[0], ob_space=ob_space, ac_space=ac_space, nbatch_act=1, nbatch_train=None, nsteps=None, ent_coef=None, vf_coef=None, max_grad_norm=None, trainable=False, model_scope="model_0"), model_fn(policy=policy[1], ob_space=ob_space, ac_space=ac_space, nbatch_act=1, nbatch_train=None, nsteps=None, ent_coef=None, vf_coef=None, max_grad_norm=None, trainable=False, model_scope="model_1")] model[0].load(model_path[0]) model[1].load(model_path[1]) ep_id = 0
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. ''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) logger.log(tf.trainable_variables()) # Load VGG-m Conv Layer parameters load_vggm_conv('checkpoint/ACT1-4.ckpt') if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) if init_fn is not None: init_fn() # Start total timer tfirststart = time.perf_counter() # Dylan, for tensorboard writer = tf.summary.FileWriter(logger.get_dir(), tf.get_default_session().graph) ep_stats = stats( ["Total_timesteps", "EpRewMean", "EpLenMean", "FraRewMean"]) nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run( ) #pylint: disable=E0632 if update % log_interval == 0 and is_mpi_root: logger.info('Done.') epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update_fn is not None: update_fn(update) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update * nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv( 'eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf])) logger.logkv( 'eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf])) logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv('loss/' + lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) if is_mpi_root: EpRewMean = safemean([epinfo['r'] for epinfo in epinfobuf]) EpLenMean = safemean([epinfo['l'] for epinfo in epinfobuf]) ep_stats.add_all_summary( writer, [update * nbatch, EpRewMean, EpLenMean, EpRewMean / EpLenMean], update) return model