def agent(game, n_ep, n_mcts, max_ep_len, lr, c, gamma, data_size, batch_size, temp,
          n_hidden_layers, n_hidden_units, stochastic=False, eval_freq=-1, eval_episodes=100,
          alpha=0.6, out_dir='../', pre_process=None, visualize=False):
    ''' Outer training loop '''
    if pre_process is not None:
        pre_process()

    # tf.reset_default_graph()

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    episode_returns = []  # storage
    timepoints = []

    # Environments
    Env = make_game(game)
    is_atari = is_atari_game(Env)
    mcts_env = make_game(game) if is_atari else None
    online_scores = []
    offline_scores = []

    mcts_params = dict(gamma=gamma)
    if stochastic:
        mcts_params['alpha'] = alpha
        mcts_maker = MCTSStochastic
    else:
        mcts_maker = MCTS

    D = Database(max_size=data_size, batch_size=batch_size)
    model = Model(Env=Env, lr=lr, n_hidden_layers=n_hidden_layers, n_hidden_units=n_hidden_units)
    t_total = 0  # total steps
    R_best = -np.Inf
    a_best = None
    seed_best = None

    with tf.Session() as sess:
        model.sess = sess
        sess.run(tf.global_variables_initializer())

        for ep in range(n_ep):

            ##### Policy evaluation step #####
            if eval_freq > 0 and ep % eval_freq == 0:  # and ep > 0
                print('Evaluating policy for {} episodes!'.format(eval_episodes))
                seed = np.random.randint(1e7)  # draw some Env seed
                Env.seed(seed)
                s = Env.reset()
                mcts = mcts_maker(root_index=s, root=None, model=model,
                                  na=model.action_dim, **mcts_params)
                env_wrapper = EnvEvalWrapper()
                env_wrapper.mcts = mcts
                starting_states = []

                def reset_env():
                    s = Env.reset()
                    env_wrapper.mcts = mcts_maker(root_index=s, root=None, model=model,
                                                  na=model.action_dim, **mcts_params)
                    starting_states.append(s)
                    if env_wrapper.curr_probs is not None:
                        env_wrapper.episode_probabilities.append(env_wrapper.curr_probs)
                    env_wrapper.curr_probs = []
                    return s

                def forward(a, s, r):
                    env_wrapper.mcts.forward(a, s, r)
                    # pass

                env_wrapper.reset = reset_env
                env_wrapper.step = lambda x: Env.step(x)
                env_wrapper.forward = forward
                env_wrapper.episode_probabilities = []
                env_wrapper.curr_probs = None

                def pi_wrapper(ob):
                    if not is_atari:
                        mcts_env = None
                    env_wrapper.mcts.search(n_mcts=n_mcts, c=c, Env=Env, mcts_env=mcts_env)
                    state, pi, V = env_wrapper.mcts.return_results(temp=0)
                    # pi = model.predict_pi(s).flatten()
                    env_wrapper.curr_probs.append(pi)
                    a = np.argmax(pi)
                    return a

                rews, lens = eval_policy(pi_wrapper, env_wrapper,
                                         n_episodes=eval_episodes, verbose=True)
                offline_scores.append([np.min(rews), np.max(rews), np.mean(rews),
                                       np.std(rews), len(rews), np.mean(lens)])
                # if len(rews) < eval_episodes or len(rews) == 0:
                #     print("WTF")
                # if np.std(rews) == 0.:
                #     print("WTF 2")
                np.save(out_dir + '/offline_scores.npy', offline_scores)

            ##### Episode collection step #####
            start = time.time()
            s = Env.reset()
            R = 0.0  # Total return counter
            a_store = []
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            if is_atari:
                mcts_env.reset()
                mcts_env.seed(seed)
            if eval_freq > 0 and ep % eval_freq == 0:
                print("Collecting %d episodes" % eval_freq)
            mcts = mcts_maker(root_index=s, root=None, model=model,
                              na=model.action_dim, **mcts_params)  # the object responsible for MCTS searches

            for t in range(max_ep_len):
                # MCTS step
                if not is_atari:
                    mcts_env = None
                mcts.search(n_mcts=n_mcts, c=c, Env=Env, mcts_env=mcts_env)  # perform a forward search
                if visualize:
                    mcts.visualize()
                state, pi, V = mcts.return_results(temp)  # extract the root output
                D.store((state, V, pi))

                # Make the true step
                a = np.random.choice(len(pi), p=pi)
                a_store.append(a)
                s1, r, terminal, _ = Env.step(a)
                R += r
                t_total += n_mcts  # total number of environment steps (counts the mcts steps)

                if terminal:
                    break
                else:
                    mcts.forward(a, s1, r)

            # Finished episode
            episode_returns.append(R)  # store the total episode return
            online_scores.append(R)
            timepoints.append(t_total)  # store the timestep count of the episode return
            store_safely(out_dir, 'result', {'R': episode_returns, 't': timepoints})
            np.save(out_dir + '/online_scores.npy', online_scores)
            # print('Finished episode {}, total return: {}, total time: {} sec'.format(
            #     ep, np.round(R, 2), np.round((time.time() - start), 1)))

            if R > R_best:
                a_best = a_store
                seed_best = seed
                R_best = R

            # Train
            D.reshuffle()
            try:
                for epoch in range(1):
                    for sb, Vb, pib in D:
                        model.train(sb, Vb, pib)
            except Exception as e:
                print("Something went wrong while training:", e)
            model.save(out_dir + 'model')

    # Return results
    return episode_returns, timepoints, a_best, seed_best, R_best, offline_scores
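# A minimal usage sketch for the training loop above, not part of the original experiments:
# the game id ('CartPole-v0' is assumed to be accepted by make_game), the hyperparameter
# values and the output directory are illustrative placeholders only.
def _demo_agent_training():
    episode_returns, timepoints, a_best, seed_best, R_best, offline_scores = agent(
        game='CartPole-v0',            # assumed to be a valid id for make_game
        n_ep=100,                      # training episodes
        n_mcts=32,                     # MCTS simulations per environment step
        max_ep_len=200,
        lr=1e-3, c=1.5, gamma=1.0,
        data_size=1000, batch_size=32,
        temp=1.0,
        n_hidden_layers=2, n_hidden_units=128,
        stochastic=False,
        eval_freq=10, eval_episodes=20,
        out_dir='results/')
    print('Best training return:', R_best)
    return episode_returns, timepoints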
# Thin closure over eval_policy; it relies on an `env_eval` environment being defined in
# the surrounding (module) scope at call time.
def eval_policy_closure(**args):
    return eval_policy(env_eval, **args)
def agent(game, n_ep, n_mcts, max_ep_len, lr, c, gamma, data_size, batch_size, temp,
          n_hidden_layers, n_hidden_units, stochastic=False, eval_freq=-1, eval_episodes=100,
          alpha=0.6, n_epochs=100, c_dpw=1, numpy_dump_dir='../', pre_process=None,
          visualize=False, game_params=None, parallelize_evaluation=False, mcts_only=False,
          particles=0, show_plots=False, n_workers=1, use_sampler=False, budget=np.inf,
          unbiased=False, biased=False, max_workers=100, variance=False,
          depth_based_bias=False, scheduler_params=None, out_dir=None, render=False,
          second_version=False, third_version=False):
    visualizer = None
    if game_params is None:
        game_params = {}

    # if particles:
    #     parallelize_evaluation = False  # Cannot run parallelized evaluation with particle filtering

    if not mcts_only:
        from mcts import MCTS
        from mcts_dpw import MCTSStochastic
    elif particles:
        if unbiased:
            from particle_filtering.ol_uct import OL_MCTS
        elif biased:
            if second_version:
                from particle_filtering.pf_uct_2 import PFMCTS2 as PFMCTS
            elif third_version:
                from particle_filtering.pf_uct_3 import PFMCTS3 as PFMCTS
            else:
                from particle_filtering.pf_uct import PFMCTS
        else:
            from particle_filtering.pf_mcts_edo import PFMCTS
    else:
        from pure_mcts.mcts import MCTS
        from pure_mcts.mcts_dpw import MCTSStochastic

    if parallelize_evaluation:
        print("The evaluation will be parallel")

    parameter_list = {
        "game": game,
        "n_ep": n_ep,
        "n_mcts": n_mcts,
        "max_ep_len": max_ep_len,
        "lr": lr,
        "c": c,
        "gamma": gamma,
        "data_size": data_size,
        "batch_size": batch_size,
        "temp": temp,
        "n_hidden_layers": n_hidden_layers,
        "n_hidden_units": n_hidden_units,
        "stochastic": stochastic,
        "eval_freq": eval_freq,
        "eval_episodes": eval_episodes,
        "alpha": alpha,
        "n_epochs": n_epochs,
        "out_dir": numpy_dump_dir,
        "pre_process": pre_process,
        "visualize": visualize,
        "game_params": game_params,
        "n_workers": n_workers,
        "use_sampler": use_sampler,
        "variance": variance,
        "depth_based_bias": depth_based_bias,
        "unbiased": unbiased,
        "second_version": second_version,
        "third_version": third_version
    }
    if out_dir is not None:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        with open(os.path.join(out_dir, "parameters.txt"), 'w') as d:
            d.write(json.dumps(parameter_list))

    # logger = Logger(parameter_list, game, show=show_plots)

    if DEBUG_TAXI:
        from utils.visualization.taxi import TaxiVisualizer
        with open(game_params["grid"]) as f:
            m = f.readlines()
            matrix = []
            for r in m:
                row = []
                for ch in r.strip('\n'):
                    row.append(ch)
                matrix.append(row)
            visualizer = TaxiVisualizer(matrix)
            f.close()
            exit()

    ''' Outer training loop '''
    if pre_process is not None:
        pre_process()

    # numpy_dump_dir = logger.numpy_dumps_dir
    #
    # if not os.path.exists(numpy_dump_dir):
    #     os.makedirs(numpy_dump_dir)

    episode_returns = []  # storage
    timepoints = []

    # Environments
    if game == 'Trading-v0':
        game_params['save_dir'] = out_dir  # logger.save_dir
    Env = make_game(game, game_params)
    num_actions = Env.action_space.n

    sampler = None
    if use_sampler and not (unbiased or biased):

        def make_pi(action_space):
            def pi(s):
                return np.random.randint(low=0, high=action_space.n)
            return pi

        def make_env():
            return make_game(game, game_params)

        sampler = ParallelSampler(make_pi=make_pi, make_env=make_env, n_particles=particles,
                                  n_workers=n_workers, seed=10)

    is_atari = is_atari_game(Env)
    mcts_env = make_game(game, game_params) if is_atari else None
    online_scores = []
    offline_scores = []

    # Setup the parameters for generating the search environments
    if game == "RaceStrategy-v1":
        mcts_maker, mcts_params, c_dpw = load_race_agents_config(
            'envs/configs/race_strategy_full.json', gamma)
    else:
        mcts_params = dict(gamma=gamma)
        if particles:
            if not (biased or unbiased):
                mcts_params['particles'] = particles
                mcts_params['sampler'] = sampler
            elif biased:
                mcts_params['alpha'] = alpha
                mcts_maker = PFMCTS
            mcts_params['depth_based_bias'] = depth_based_bias
            if unbiased:
                mcts_params['variance'] = variance
                mcts_maker = OL_MCTS
        elif stochastic:
            mcts_params['alpha'] = alpha
            mcts_params['depth_based_bias'] = depth_based_bias
            mcts_maker = MCTSStochastic
        else:
            mcts_maker = MCTS

    # Prepare the database for storing training data to be sampled
    db = Database(max_size=data_size, batch_size=batch_size)

    # TODO extract dimensions to avoid allocating model
    # Setup the model
    model_params = {
        "Env": Env,
        "lr": lr,
        "n_hidden_layers": n_hidden_layers,
        "n_hidden_units": n_hidden_units,
        "joint_networks": True
    }
    model_wrapper = ModelWrapper(**model_params)

    t_total = 0  # total steps
    R_best = -np.Inf
    a_best = None
    seed_best = None

    # Variables for storing values to be plotted
    avgs = []
    stds = []

    # Run the episodes
    for ep in range(n_ep):

        if DEBUG_TAXI:
            visualizer.reset()

        ##### Policy evaluation step #####
        if eval_freq > 0 and ep % eval_freq == 0:  # and ep > 0
            print('--------------------------------\nEvaluating policy for {} episodes!'.format(eval_episodes))
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            s = Env.reset()

            if parallelize_evaluation:
                penv = None
                pgame = {
                    "game_maker": make_game,
                    "game": game,
                    "game_params": game_params
                }
            else:
                penv = Env
                pgame = None

            model_file = os.path.join(out_dir, "model.h5")
            # model_wrapper.save(model_file)

            if game == "RaceStrategy-v1":
                env_wrapper = RaceWrapper(s, mcts_maker, model_file, model_params, mcts_params,
                                          is_atari, n_mcts, budget, mcts_env, c_dpw, temp,
                                          env=penv, game_maker=pgame, mcts_only=mcts_only,
                                          scheduler_params=scheduler_params)
            else:
                env_wrapper = Wrapper(s, mcts_maker, model_file, model_params, mcts_params,
                                      is_atari, n_mcts, budget, mcts_env, c_dpw, temp,
                                      env=penv, game_maker=pgame, mcts_only=mcts_only,
                                      scheduler_params=scheduler_params)

            # Run the evaluation
            if parallelize_evaluation:
                total_reward, reward_per_timestep, lens, action_counts = \
                    parallelize_eval_policy(env_wrapper, n_episodes=eval_episodes, verbose=False,
                                            max_len=max_ep_len, max_workers=max_workers, out_dir=out_dir)
            else:
                total_reward, reward_per_timestep, lens, action_counts = \
                    eval_policy(env_wrapper, n_episodes=eval_episodes, verbose=False, max_len=max_ep_len,
                                visualize=visualize, out_dir=out_dir, render=render)

            # offline_scores.append([np.min(rews), np.max(rews), np.mean(rews), np.std(rews),
            #                        len(rews), np.mean(lens)])
            offline_scores.append([total_reward, reward_per_timestep, lens, action_counts])

            # np.save(numpy_dump_dir + '/offline_scores.npy', offline_scores)

            # Store and plot data
            avgs.append(np.mean(total_reward))
            stds.append(np.std(total_reward))

            # logger.plot_evaluation_mean_and_variance(avgs, stds)

        ##### Policy improvement step #####
        if not mcts_only:
            start = time.time()
            s = start_s = Env.reset()
            R = 0.0  # Total return counter
            a_store = []
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            if is_atari:
                mcts_env.reset()
                mcts_env.seed(seed)
            if eval_freq > 0 and ep % eval_freq == 0:
                print("\nCollecting %d episodes" % eval_freq)
            mcts = mcts_maker(root_index=s, root=None, model=model_wrapper,
                              na=model_wrapper.action_dim,
                              **mcts_params)  # the object responsible for MCTS searches

            print("\nPerforming MCTS steps\n")

            ep_steps = 0
            start_targets = []

            for st in range(max_ep_len):
                print_step = max(max_ep_len // 10, 1)
                if st % print_step == 0:
                    print('Step ' + str(st + 1) + ' of ' + str(max_ep_len))

                # MCTS step
                if not is_atari:
                    mcts_env = None
                mcts.search(n_mcts=n_mcts, c=c, Env=Env, mcts_env=mcts_env)  # perform a forward search

                if visualize:
                    mcts.visualize()

                state, pi, V = mcts.return_results(temp)  # extract the root output

                # Save targets for starting state to debug
                if np.array_equal(start_s, state):
                    if DEBUG:
                        print("Pi target for starting state:", pi)
                    start_targets.append((V, pi))

                db.store((state, V, pi))

                # Make the true step
                a = np.random.choice(len(pi), p=pi)
                a_store.append(a)
                s1, r, terminal, _ = Env.step(a)

                # Perform command line visualization if necessary
                if DEBUG_TAXI:
                    olds, olda = copy.deepcopy(s1), copy.deepcopy(a)
                    visualizer.visualize_taxi(olds, olda)
                    print("Reward:", r)

                R += r
                t_total += n_mcts  # total number of environment steps (counts the mcts steps)
                ep_steps = st + 1

                if terminal:
                    break  # Stop the episode if we encounter a terminal state
                else:
                    mcts.forward(a, s1, r)  # Otherwise proceed

            # Finished episode
            if DEBUG:
                print("Train episode return:", R)
                print("Train episode actions:", a_store)
            episode_returns.append(R)  # store the total episode return
            online_scores.append(R)
            timepoints.append(t_total)  # store the timestep count of the episode return
            # store_safely(numpy_dump_dir, '/result', {'R': episode_returns, 't': timepoints})
            # np.save(numpy_dump_dir + '/online_scores.npy', online_scores)

            if DEBUG or True:
                print('Finished episode {} in {} steps, total return: {}, total time: {} sec'.format(
                    ep, ep_steps, np.round(R, 2), np.round((time.time() - start), 1)))

            # Plot the online return over training episodes
            # logger.plot_online_return(online_scores)

            if R > R_best:
                a_best = a_store
                seed_best = seed
                R_best = R

            print()

        # Train only if the model has to be used
        if not mcts_only:
            # Train
            try:
                print("\nTraining network")
                ep_V_loss = []
                ep_pi_loss = []

                for _ in range(n_epochs):
                    # Reshuffle the dataset at each epoch
                    db.reshuffle()

                    batch_V_loss = []
                    batch_pi_loss = []

                    # Batch training
                    for sb, Vb, pib in db:
                        if DEBUG:
                            print("sb:", sb)
                            print("Vb:", Vb)
                            print("pib:", pib)

                        loss = model_wrapper.train(sb, Vb, pib)

                        batch_V_loss.append(loss[1])
                        batch_pi_loss.append(loss[2])

                    ep_V_loss.append(mean(batch_V_loss))
                    ep_pi_loss.append(mean(batch_pi_loss))

                # Plot the loss over training epochs
                # logger.plot_loss(ep, ep_V_loss, ep_pi_loss)

            except Exception as e:
                print("Something went wrong while training:", e)

            # model.save(out_dir + 'model')

            # Plot the loss over different episodes
            # logger.plot_training_loss_over_time()

            pi_start = model_wrapper.predict_pi(start_s)
            V_start = model_wrapper.predict_V(start_s)
            print("\nStart policy: ", pi_start)
            print("Start value:", V_start)

            # logger.log_start(ep, pi_start, V_start, start_targets)

    # Return results
    if use_sampler:
        sampler.close()
    return episode_returns, timepoints, a_best, seed_best, R_best, offline_scores
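# A minimal sketch of a search-only run of the agent above: with mcts_only=True and
# unbiased=True the loop skips network training and evaluates the open-loop OL_MCTS variant
# under a fixed simulation budget. The game id, budget, particle count and other values
# below are illustrative assumptions, not settings from the original experiments.
def _demo_mcts_only_evaluation():
    _, _, _, _, _, offline_scores = agent(
        game='Taxi-v2',                 # assumed to be a valid id for make_game
        n_ep=1,                         # a single outer iteration: evaluation only
        n_mcts=100, max_ep_len=200,
        lr=1e-3, c=1.41, gamma=0.99,
        data_size=1000, batch_size=32, temp=1.0,
        n_hidden_layers=2, n_hidden_units=16,
        eval_freq=1, eval_episodes=10,
        mcts_only=True,                 # skip the policy-improvement step entirely
        particles=50, unbiased=True,    # select the open-loop (OL_MCTS) search variant
        budget=5000,                    # per-move simulation budget
        out_dir='results/mcts_only/')
    return offline_scores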
def learn(*,
          network,
          env,
          eval_policy,
          total_timesteps,
          timesteps_per_batch=1024,  # what to train on
          max_kl=0.001,
          cg_iters=10,
          gamma=0.99,
          lam=1.0,  # advantage estimation
          seed=None,
          ent_coef=0.0,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          vf_iters=3,
          max_episodes=0,
          max_iters=0,  # time constraint
          callback=None,
          load_path=None,
          checkpoint_path_in=None,
          checkpoint_dir_out=None,
          checkpoint_freq=100,  # in iterations!
          from_iter=0,
          eval_episodes=20,
          **network_kwargs):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for adam optimizer used to optimize value function loss

    vf_iters                number of iterations of value function optimization per each policy optimization step

    total_timesteps         max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                            and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    cpus_per_worker = 1
    U.get_session(config=tf.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=cpus_per_worker,
        intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    # ob_space = Box(low=-np.inf, high=np.inf, shape=(env.observation_space.n,))
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    # Loading checkpoint
    if checkpoint_path_in is not None and os.path.isfile(checkpoint_path_in):
        pi.load(checkpoint_path_in)
        logger.log('Loaded policy weights from %s' % checkpoint_path_in)

    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g * tangent)
                    for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()

    # s = env.reset()
    # start = time.time()
    # for i in range(10000):
    #     pi.step(s, stochastic=True)
    # duration = time.time() - start
    # print(duration)
    # return

    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, gamma=gamma)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    iters_eval = 0
    all_logs = []
    best_rew = -np.inf
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    online_scores = []
    offline_scores = []

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback:
            callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        if iters_so_far % checkpoint_freq == 0 and checkpoint_dir_out is not None:
            if not os.path.exists(checkpoint_dir_out):
                os.makedirs(checkpoint_dir_out)
            pi.save(os.path.join(checkpoint_dir_out, 'checkpoint_%d' % iters_so_far))
            logger.log('Saved policy weights as %s' % os.path.join(
                checkpoint_dir_out, 'checkpoint_%d.npy' % iters_so_far))

            def pi_wrapper(ob):
                ac, vpred, _, _ = pi.step(ob, stochastic=True)
                return ac

            rew, _, logs, disc_rets, num_stops, avg_damages = eval_policy(
                pi=pi_wrapper, n_episodes=eval_episodes, verbose=True)
            offline_scores.append([np.mean(disc_rets), np.mean(num_stops), np.mean(avg_damages)])
            np.save(os.path.join(checkpoint_dir_out, 'offline_scores.npy'), offline_scores)
            for log in logs:
                log['iter'] = iters_eval
            all_logs = all_logs + logs
            iters_eval += 1

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"):
            pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False, batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        ep_rew_mean = np.mean(rewbuffer)
        online_scores.append(ep_rew_mean)
        np.save(os.path.join(checkpoint_dir_out, 'online_scores.npy'), online_scores)

        # Saving best
        if iters_so_far % checkpoint_freq == 0 and ep_rew_mean > best_rew and checkpoint_dir_out is not None:
            pi.save(os.path.join(checkpoint_dir_out, 'best'))
            best_rew = ep_rew_mean
            logger.log('Saved policy weights as %s' % os.path.join(checkpoint_dir_out, 'best.npy'))

        if rank == 0:
            logger.dump_tabular()

    return pi