def evaluate(env, policy_func, load_model_path, stochastic_policy=False, number_trajs=10):
    """Roll out a saved policy and print its mean episode length and return.

    Args:
        env: Environment providing ``observation_space`` and ``action_space``.
        policy_func: Called as ``policy_func("pi", ob_space, ac_space)`` to
            build the policy network.
        load_model_path: Checkpoint location passed to ``U.load_state``.
        stochastic_policy: Forwarded to the episode generator as
            ``stochastic``; also selects the printed header line.
        number_trajs: Number of trajectories to draw; must be positive.

    Raises:
        ValueError: If ``number_trajs`` is not positive (the averages would
            otherwise divide by zero after all the setup work).
    """
    if number_trajs <= 0:
        raise ValueError("number_trajs must be positive, got %r" % (number_trajs,))

    from gail.trpo_mpi import traj_episode_generator

    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy

    # 1024 is the generator's third positional argument -- presumably the
    # number of timesteps per batch; TODO confirm against the generator.
    ep_gen = traj_episode_generator(pi, env, 1024, stochastic=stochastic_policy)
    U.load_state(load_model_path)

    len_list = []
    ret_list = []
    for _ in tqdm(range(number_trajs)):
        traj = next(ep_gen)
        len_list.append(traj['ep_len'])
        ret_list.append(traj['ep_ret'])

    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    print("Average length:", sum(len_list) / len(len_list))
    print("Average return:", sum(ret_list) / len(ret_list))
def sample_trajectory(load_model_path, max_sample_traj, traj_gen, task_name, sample_stochastic):
    """Sample trajectories from a saved model and pickle them to disk.

    The checkpoint is restored with ``U.load_state``, then
    ``max_sample_traj`` trajectories are drawn from ``traj_gen``. Per-episode
    statistics are logged, and the collected ``ob``/``ac``/``rew``/``ep_ret``
    entries are written to ``"stochastic.<task_name>.pkl"`` or
    ``"deterministic.<task_name>.pkl"`` (relative to the working directory).

    Args:
        load_model_path: Checkpoint location; must not be ``None``.
        max_sample_traj: Number of trajectories to draw; must be positive.
        traj_gen: Iterator yielding dicts with at least the keys ``ob``,
            ``ac``, ``rew``, ``ep_ret`` and ``ep_len``.
        task_name: Base name of the output file.
        sample_stochastic: Selects the ``stochastic.``/``deterministic.``
            file-name prefix.

    Raises:
        ValueError: If ``max_sample_traj`` is not positive (the average
            return would otherwise divide by zero).
    """
    assert load_model_path is not None
    if max_sample_traj <= 0:
        raise ValueError("max_sample_traj must be positive, got %r" % (max_sample_traj,))
    U.load_state(load_model_path)

    sample_trajs = []
    for iters_so_far in range(max_sample_traj):
        logger.log("********** Iteration %i ************" % iters_so_far)
        traj = next(traj_gen)
        ob, ac, rew = traj['ob'], traj['ac'], traj['rew']
        ep_ret, ep_len = traj['ep_ret'], traj['ep_len']
        logger.record_tabular("ep_ret", ep_ret)
        logger.record_tabular("ep_len", ep_len)
        logger.record_tabular("immediate reward", np.mean(rew))
        # Only the root MPI rank flushes the table.
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
        sample_trajs.append({"ob": ob, "ac": ac, "rew": rew, "ep_ret": ep_ret})

    sample_ep_rets = [traj["ep_ret"] for traj in sample_trajs]
    logger.log("Average total return: %f" % (sum(sample_ep_rets) / len(sample_ep_rets)))

    prefix = 'stochastic.' if sample_stochastic else 'deterministic.'
    # Context manager so the file is flushed and closed deterministically.
    with open(prefix + task_name + ".pkl", "wb") as out_file:
        pkl.dump(sample_trajs, out_file)
def evaluate(env, policy_func, load_model_path, timesteps_per_batch, number_trajs=10, stochastic_policy=False):
    """Roll out a saved policy and print its mean episode length and return.

    Args:
        env: Environment providing ``observation_space`` and ``action_space``.
        policy_func: Called as
            ``policy_func("pi", ob_space, ac_space, reuse=False)`` to build
            the policy network.
        load_model_path: Checkpoint location passed to ``U.load_state``.
        timesteps_per_batch: Forwarded positionally to
            ``traj_episode_generator``.
        number_trajs: Number of trajectories to draw; must be positive.
        stochastic_policy: Forwarded to the episode generator as
            ``stochastic``; also selects the printed header line.

    Raises:
        ValueError: If ``number_trajs`` is not positive (the averages would
            otherwise divide by zero after all the setup work).
    """
    if number_trajs <= 0:
        raise ValueError("number_trajs must be positive, got %r" % (number_trajs,))

    from tqdm import tqdm

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=False)
    U.initialize()

    # Prepare for rollouts
    # ----------------------------------------
    ep_gen = traj_episode_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy)
    # Restore after U.initialize() so the checkpoint values are the ones used.
    U.load_state(load_model_path)

    len_list = []
    ret_list = []
    for _ in tqdm(range(number_trajs)):
        traj = next(ep_gen)
        len_list.append(traj['ep_len'])
        ret_list.append(traj['ep_ret'])

    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    print("Average length:", sum(len_list) / len(len_list))
    print("Average return:", sum(ret_list) / len(ret_list))
def load(path):
    """Rebuild an ``ActWrapper`` from a file written in the packed format.

    The file must unpickle to a ``(model_data, act_params)`` pair, where
    ``model_data`` is the bytes of a zip archive containing the saved model
    and ``act_params`` are the keyword arguments for ``build_act``. The
    archive is unpacked into a temporary directory and restored with
    ``load_state`` before the directory is removed.

    Args:
        path: Path of the pickled file.

    Returns:
        ``ActWrapper(act, act_params)`` for the restored model.
    """
    # NOTE(security): unpickling runs arbitrary code from the file -- only
    # load files that come from a trusted source.
    with open(path, "rb") as f:
        model_data, act_params = cloudpickle.load(f)
    act = build_act(**act_params)
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        # Close the archive before the temporary directory is cleaned up;
        # an open handle can block deletion on some platforms.
        with zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED) as archive:
            archive.extractall(td)
        load_state(os.path.join(td, "model"))
    return ActWrapper(act, act_params)
def load(path, num_cpu=16):
    """Rebuild an ``ActWrapper`` from a packed file inside a fresh session.

    The file must unpickle to a ``(model_data, act_params)`` pair, where
    ``model_data`` is the bytes of a zip archive containing the saved model
    and ``act_params`` are the keyword arguments for ``deepq.build_act``.

    Args:
        path: Path of the pickled file.
        num_cpu: Forwarded to ``U.make_session``.

    Returns:
        ``ActWrapper(act, act_params)`` for the restored model.
    """
    # NOTE(security): unpickling runs arbitrary code from the file -- only
    # load files that come from a trusted source.
    with open(path, "rb") as f:
        model_data, act_params = dill.load(f)
    act = deepq.build_act(**act_params)
    sess = U.make_session(num_cpu=num_cpu)
    # The session is entered and intentionally never exited here, so it is
    # still active when the returned wrapper is used by the caller.
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        # Close the archive before the temporary directory is cleaned up;
        # an open handle can block deletion on some platforms.
        with zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED) as archive:
            archive.extractall(td)
        U.load_state(os.path.join(td, "model"))
    return ActWrapper(act, act_params)
def learn(
        env,
        model_path,
        data_path,
        policy_fn,
        *,
        horizon=150,  # timesteps per actor per update
        rolloutSize=50,
        clip_param=0.2,
        entcoeff=0.02,  # clipping parameter epsilon, entropy coeff
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=32,  # optimization hypers
        gamma=0.99,
        lam=0.95,  # advantage estimation
        max_iters=0,  # time constraint
        adam_epsilon=1e-4,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        retrain=False):
    """Train a policy with the clipped PPO surrogate and return it.

    Each iteration collects rollouts with ``sample_trajectory``, appends them
    to the list pickled at ``data_path + 'rollout_data.pkl'``, computes
    advantages with ``add_vtarg_and_adv`` and runs ``optim_epochs`` passes of
    minibatch Adam updates.

    Args:
        env: Environment providing ``observation_space``/``action_space``.
        model_path: Checkpoint restored with ``U.load_state`` when
            ``retrain`` is truthy.
        data_path: Prefix of the rollout pickle file (concatenated as a
            string, so include a trailing separator for a directory).
        policy_fn: Called as ``policy_fn(name, ob_space, ac_space)`` to build
            the "pi" and "oldpi" networks.
        horizon, rolloutSize: Forwarded to ``sample_trajectory``; their
            product with ``max_iters`` is the linear-schedule length.
        clip_param: PPO clipping range around a probability ratio of 1.
        entcoeff: Weight of the entropy bonus.
        optim_epochs, optim_stepsize, optim_batchsize: Optimisation hypers; a
            falsy ``optim_batchsize`` means "use the whole batch".
        gamma, lam: Forwarded to ``add_vtarg_and_adv``.
        max_iters: Iteration limit; ``0`` means no limit.
        adam_epsilon: Epsilon handed to ``MpiAdam``.
        schedule: ``'constant'`` or ``'linear'`` step-size multiplier.
        retrain: Restore ``model_path`` before training when truthy.

    Returns:
        The trained policy object ``pi``.

    Raises:
        ValueError: If ``schedule='linear'`` is combined with a schedule
            length of zero (e.g. the default ``max_iters=0``), which would
            otherwise divide by zero.
        NotImplementedError: For an unknown ``schedule`` value.
    """
    # Setup losses and policy
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=5)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=5)  # rolling buffer for episode rewards

    p = []  # for saving the rollouts

    if retrain:
        print("Retraining the policy from saved path")
        time.sleep(2)
        U.load_state(model_path)

    max_timesteps = int(horizon * rolloutSize * max_iters)
    if schedule == 'linear' and max_timesteps == 0:
        raise ValueError(
            "schedule='linear' needs a non-zero horizon * rolloutSize * max_iters")

    while True:
        if max_iters and iters_so_far >= max_iters:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)
        print("Collecting samples for policy optimization !! ")
        # Rendering is switched on once more than 70 iterations have run.
        render = iters_so_far > 70
        rollouts = sample_trajectory(pi, env, horizon=horizon, rolloutSize=rolloutSize,
                                     stochastic=True, render=render)

        # Save rollouts: the whole history is re-written every iteration.
        p.append({'rollouts': rollouts})
        data_file_name = data_path + 'rollout_data.pkl'
        with open(data_file_name, "wb") as rollout_file:
            pickle.dump(p, rollout_file)

        add_vtarg_and_adv(rollouts, gamma, lam)

        ob, ac, atarg, tdlamret = (rollouts["ob"], rollouts["ac"],
                                   rollouts["adv"], rollouts["tdlamret"])
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    deterministic=pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            for batch in d.iterate_once(optim_batchsize):
                # The last output is the flat gradient; the losses before it
                # are not consumed here.
                *_, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
                                    batch["vtarg"], cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
def learn(env, model_path, data_path, policy_fn, *,
          rolloutSize,
          num_options=4,
          horizon=80,
          clip_param=0.025, ent_coeff=0.01,  # clipping parameter epsilon, entropy coeff
          optim_epochs=10, mainlr=3.25e-4, intlr=1e-4, piolr=1e-4, termlr=5e-7, optim_batchsize=100,  # optimization hypers
          gamma=0.99, lam=0.95,  # advantage estimation
          max_iters=20,  # time constraint
          adam_epsilon=1e-5,
          schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
          retrain=False,
          ):
    """
    Core learning function

    Trains an option-based policy with a clipped PPO surrogate per option,
    plus separate gradient steps for the termination functions, the interest
    functions and the policy over options. Rollouts of every iteration are
    appended to the list pickled at ``data_path + 'rollout_data.pkl'``.

    Args:
        env: Environment providing ``observation_space``/``action_space``.
        model_path: Directory restored with ``U.load_state(model_path + '/')``
            when ``retrain`` is truthy.
        data_path: Prefix of the rollout pickle file (string concatenation).
        policy_fn: Called as
            ``policy_fn(name, ob_space, ac_space, num_options=...)``.
        rolloutSize, horizon: Forwarded to ``sample_trajectory``; their
            product with ``max_iters`` is the linear-schedule length.
        num_options: Number of options.
        clip_param: PPO clipping range (scaled by the schedule multiplier).
        ent_coeff: Weight of the entropy bonus of the intra-option policy.
        optim_epochs: Upper bound on optimisation epochs per option.
        mainlr, intlr, piolr, termlr: Step sizes of the intra-option policy,
            interest-function, option-policy and termination updates.
        optim_batchsize: Minibatch size; options with fewer samples than this
            are skipped for the iteration.
        gamma, lam: Forwarded to ``add_vtarg_and_adv``.
        max_iters: Iteration limit; ``0`` means no limit.
        adam_epsilon: Epsilon handed to ``MpiAdam``.
        schedule: ``'constant'`` or ``'linear'`` step-size multiplier.
        retrain: Restore the saved model before training when truthy.

    Returns:
        The trained policy object ``pi``.

    Raises:
        ValueError: If ``schedule='linear'`` is combined with a schedule
            length of zero, which would otherwise divide by zero.
        NotImplementedError: For an unknown ``schedule`` value.
    """
    # The iteration counter is published as a module-level global.
    global iters_so_far

    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space, num_options=num_options)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space, num_options=num_options)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None])
    op_adv = tf.placeholder(dtype=tf.float32, shape=[None])
    betas = tf.placeholder(dtype=tf.float32, shape=[None])

    ac = pi.pdtype.sample_placeholder([None])

    # Setup losses and stuff
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-ent_coeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]

    term_loss = pi.tpred * term_adv

    activated_options = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    pi_w = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    option_hot = tf.one_hot(option, depth=num_options)

    # Interest loss: the interest output comes from the network, the option
    # weights (pi_w) are fed in.
    int_pi_I = (pi.intfc * activated_options) * pi_w / tf.expand_dims(
        tf.reduce_sum((pi.intfc * activated_options) * pi_w, axis=1), 1)
    int_pi_I = tf.clip_by_value(int_pi_I, 1e-6, 1 - 1e-6)
    int_loss = - tf.reduce_sum(betas * tf.reduce_sum(int_pi_I * option_hot, axis=1) * op_adv)

    # Option-policy loss: the interest values are fed in (intfc), the option
    # probabilities come from the network.
    intfc = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    op_pi_I = (intfc * activated_options) * pi.op_pi / tf.expand_dims(
        tf.reduce_sum((intfc * activated_options) * pi.op_pi, axis=1), 1)
    op_pi_I = tf.clip_by_value(op_pi_I, 1e-6, 1 - 1e-6)
    op_loss = - tf.reduce_sum(betas * tf.reduce_sum(op_pi_I * option_hot, axis=1) * op_adv)

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0))
    op_entropy = -tf.reduce_mean(pi.op_pi * log_pi, reduction_indices=1)
    op_loss -= 0.01 * tf.reduce_sum(op_entropy)

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option],
                             losses + [U.flatgrad(total_loss, var_list)])
    # Separate gradient functions since each part uses a different step size.
    termgrad = U.function([ob, option, term_adv], [U.flatgrad(term_loss, var_list)])
    opgrad = U.function([ob, option, betas, op_adv, intfc, activated_options],
                        [U.flatgrad(op_loss, var_list)])
    intgrad = U.function([ob, option, betas, op_adv, pi_w, activated_options],
                         [U.flatgrad(int_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])

    U.initialize()
    adam.sync()

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=5)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=5)  # rolling buffer for episode rewards

    if retrain:
        print("Retraining to New Task !! ")
        time.sleep(2)
        U.load_state(model_path+'/')

    p = []
    max_timesteps = int(horizon * rolloutSize * max_iters)
    if schedule == 'linear' and max_timesteps == 0:
        raise ValueError(
            "schedule='linear' needs a non-zero horizon * rolloutSize * max_iters")

    while True:
        if max_iters and iters_so_far >= max_iters:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)
        rollouts = sample_trajectory(pi, env, horizon=horizon, rolloutSize=rolloutSize,
                                     render=False)

        # Save rollouts: the whole history is re-written every iteration.
        p.append({'rollouts': rollouts})
        data_file_name = data_path + 'rollout_data.pkl'
        with open(data_file_name, "wb") as rollout_file:
            pickle.dump(p, rollout_file)

        add_vtarg_and_adv(rollouts, gamma, lam, num_options)

        ob, ac, opts, atarg, tdlamret = (rollouts["ob"], rollouts["ac"], rollouts["opts"],
                                         rollouts["adv"], rollouts["tdlamret"])
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        # Optimizing the policy
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("Option- ", opt, " Batch Size: ", indices.size)
            if not indices.size:
                continue

            d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices],
                             vtarg=tdlamret[indices]),
                        shuffle=not pi.recurrent)

            if indices.size < optim_batchsize:
                print("Too few samples for opt - ", opt)
                continue

            # Fewer samples -> fewer epochs. The builtin int replaces the
            # np.int alias, which newer NumPy releases no longer provide.
            optim_epochs_corrected = int(
                np.clip(int(indices.size / optim_batchsize), 1, optim_epochs))
            print("Optim Epochs:", optim_epochs_corrected)
            logger.log("Optimizing...")
            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs_corrected):
                for batch in d.iterate_once(optim_batchsize):
                    # The last output is the flat gradient; the losses before
                    # it are not consumed here.
                    *_, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
                                            batch["vtarg"], cur_lrmult, [opt])
                    adam.update(grads, mainlr * cur_lrmult)

        # The three updates below use the full rollout (all options at once).

        # Optimize termination functions
        termg = termgrad(rollouts["ob"], rollouts['opts'], rollouts["op_adv"])[0]
        adam.update(termg, termlr)

        # Optimize interest functions
        intgrads = intgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"],
                           rollouts["op_adv"], rollouts["op_probs"],
                           rollouts["activated_options"])[0]
        adam.update(intgrads, intlr)

        # Optimize policy over options
        opgrads = opgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"],
                         rollouts["op_adv"], rollouts["intfc"],
                         rollouts["activated_options"])[0]
        adam.update(opgrads, piolr)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
def load(self, load_path):
    """Restore the state saved at ``load_path`` into this object's session."""
    session = self.sess
    tf_util.load_state(load_path, sess=session)
def learn(
        env,
        policy_func,
        discriminator,
        expert_dataset,
        embedding_z,
        pretrained,
        pretrained_weight,
        *,
        g_step,
        d_step,
        timesteps_per_batch,  # what to train on
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        d_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        save_per_iter=100,
        ckpt_dir=None,
        log_dir=None,
        load_model_path=None,
        task_name=None):
    """Adversarial imitation training: TRPO policy steps plus discriminator steps.

    Every iteration performs ``g_step`` TRPO updates of the policy (conjugate
    gradient + backtracking line search, followed by ``vf_iters`` passes of
    value-function regression) and then trains the discriminator on policy
    samples versus batches from ``expert_dataset``.

    Args:
        env: Environment providing ``observation_space``/``action_space``.
        policy_func: Builds the "pi" and "oldpi" networks.
        discriminator: Object providing ``get_trainable_variables``,
            ``loss_name``, ``lossandgrad`` and optionally ``obs_rms``.
        expert_dataset: Provides ``get_next_batch(n)`` returning
            ``(ob_expert, ac_expert)``.
        embedding_z: Forwarded to ``traj_segment_generator`` as ``embedding``.
        pretrained: Not read inside this function.
        pretrained_weight: If not ``None``, loaded into the policy variables
            and the policy is built with ``reuse=True``.
        g_step, d_step: Policy updates per iteration / number of
            discriminator minibatches the policy batch is split into.
        timesteps_per_batch: Forwarded to ``traj_segment_generator``.
        max_kl, cg_iters, cg_damping: TRPO trust-region settings.
        gamma, lam: Forwarded to ``add_vtarg_and_adv``.
        entcoeff: Weight of the entropy bonus.
        vf_stepsize, vf_iters: Value-function Adam step size and pass count.
        d_stepsize: Discriminator Adam step size.
        max_timesteps, max_episodes, max_iters: Stopping criteria; exactly
            one of them must be positive.
        callback: Optional; called as ``callback(locals(), globals())`` at
            the start of each iteration.
        save_per_iter, ckpt_dir, task_name: Checkpoint every
            ``save_per_iter`` iterations to ``ckpt_dir/task_name``.
        log_dir: Directory for the summary writer.
        load_model_path: If not ``None``, a checkpoint restored before
            training starts.

    Returns:
        None.
    """
    # NOTE: local variable names are visible to ``callback`` through
    # locals(), so they are kept stable on purpose.
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    # Variables are split by the second component of their scoped name.
    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("pol")
    ]
    vf_var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("vf")
    ]
    d_adam = MpiAdam(discriminator.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    # Slice the flat tangent vector back into per-variable tensors.
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n(
        [U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg],
                                     losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        # Only rank 0 prints timing information.
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        # Average an array over all MPI workers.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    writer = U.FileWriter(log_dir)
    U.initialize()
    # Start every worker from rank 0's parameters.
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, discriminator,
                                     embedding=embedding_z,
                                     timesteps_per_batch=timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(discriminator.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if a pretrained weight is provided
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())
    # if a model path is provided
    if load_model_path is not None:
        U.load_state(load_model_path)

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name), counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            # fvpargs is bound inside the policy-update loop below.
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            # Every fifth sample is used for the Fisher-vector product.
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new()  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
                # No step was taken, so the losses are the ones measured
                # before the (skipped) update. Without this, meanlosses is
                # unbound or stale when it is logged below.
                meanlosses = lossbefore
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                # Backtracking line search: halve the step up to 10 times.
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                # Periodic check that all workers hold the same parameters.
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                            (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        # This batch is overwritten inside the loop below; the call is kept
        # because it still advances the expert dataset.
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
        batch_size = len(ob) // d_step
        d_losses = []  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches(
                (ob, ac), include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # update running mean/std for discriminator
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = discriminator.lossandgrad(ob_batch, ac_batch,
                                                      ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
            g_loss_stats.add_all_summary(writer, g_losses, iters_so_far)
            d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0),
                                         iters_so_far)
            ep_stats.add_all_summary(writer, [
                np.mean(true_rewbuffer),
                np.mean(rewbuffer),
                np.mean(lenbuffer)
            ], iters_so_far)
def train_maddpg(arglist):
    """Run the MADDPG loop: training, benchmarking or display.

    Everything runs inside ``U.single_threaded_session()``. The mode is
    chosen from ``arglist``:

    * ``arglist.benchmark``: collect per-step info and pickle it to
      ``benchmark_dir + exp_name + '.pkl'`` once more than
      ``benchmark_iters`` steps have run and an episode has ended.
    * ``arglist.display``: render the environment; no updates are made.
    * otherwise: update all trainers every step, save the model every
      ``save_rate`` episodes, and on finishing (more than ``num_episodes``
      episodes) pickle the reward curves, the arguments and, with
      ``real_q_log``, the Q-value statistics into ``plots_dir``.

    Args:
        arglist: Argument namespace; the attributes read here are
            ``scenario``, ``benchmark``, ``num_adversaries``, ``good_policy``,
            ``adv_policy``, ``load_dir``, ``save_dir``, ``display``,
            ``restore``, ``real_q_log``, ``max_episode_len``,
            ``benchmark_iters``, ``benchmark_dir``, ``exp_name``,
            ``save_rate``, ``num_episodes`` and ``plots_dir``. ``load_dir``
            is overwritten with ``save_dir`` when it is empty.
    """
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env,
                                num_adversaries,
                                obs_shape_n,
                                arglist,
                                good_agent_mode=arglist.good_policy,
                                adv_agent_mode=arglist.adv_policy)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=None)
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        # Buffers for the real-Q-value estimate; only created when enabled.
        if arglist.real_q_log:
            world_state_buffer, action_n_buffer, start_episode_step_buffer, obs_n_buffer = [], [], [], []
            q_means, real_means = [], []

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)  # note: unused, never happens
            terminal = (episode_step >= arglist.max_episode_len)
            # From here on `done` is true whenever the episode is cut off.
            done = done or terminal

            if arglist.real_q_log:
                world_state_buffer.append(deepcopy(env.world))
                obs_n_buffer.append(obs_n)
                action_n_buffer.append(action_n)
                start_episode_step_buffer.append(episode_step)

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done, terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)  # add element for next episode
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                # NOTE(review): appends info_n['n'] rather than the loop
                # variable `info`, and agent_info[-1] starts with a single
                # inner list -- confirm this works for more than one agent.
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # Training mode: update every trainer (the returned loss is not
            # used afterwards).
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                # A non-default save_dir gets a per-experiment file name and
                # a step suffix; the default directory is saved to directly.
                if arglist.save_dir != '/tmp/policy/':
                    U.save_state(arglist.save_dir + arglist.exp_name,
                                 saver=saver,
                                 global_step=len(episode_rewards))
                else:
                    U.save_state(
                        arglist.save_dir, saver=saver
                    )
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(
                            train_step, len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:-1]),
                            round(time.time() - t_start, 3)))
                else:
                    # NOTE(review): the per-agent means here include the
                    # newest (just-started) entry ([-save_rate:]), while the
                    # other means exclude it ([-save_rate:-1]) -- confirm.
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(
                            train_step, len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:-1]), [
                                np.mean(rew[-arglist.save_rate:])
                                for rew in agent_rewards
                            ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:-1]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:-1]))

                # Every fifth save point: estimate the real Q-values from the
                # buffered states, then reset the buffers.
                if arglist.real_q_log and (len(episode_rewards) %
                                           (5 * arglist.save_rate) == 0):
                    q_mean, real_mean = calculate_real_q_value(
                        deepcopy(env),
                        trainers,
                        world_state_buffer=world_state_buffer,
                        action_n_buffer=action_n_buffer,
                        obs_n_buffer=obs_n_buffer,
                        start_episode_step_buffer=start_episode_step_buffer,
                        num_start_states=200,
                        args=arglist)
                    world_state_buffer, action_n_buffer, start_episode_step_buffer, obs_n_buffer = [], [], [], []
                    q_means.append(q_mean)
                    real_means.append(real_mean)
                    print('Q-mean: ' + str(q_mean) + ' Real mean: ' +
                          str(real_mean))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                args_file_name = arglist.plots_dir + arglist.exp_name + '_args.pkl'
                with open(args_file_name, 'wb') as fp:
                    pickle.dump(arglist, fp)
                if arglist.real_q_log:
                    real_q_path = arglist.plots_dir + arglist.exp_name + '_q_values.pkl'
                    with open(real_q_path, 'wb') as fp:
                        pickle.dump(
                            {
                                'q_means': q_means,
                                'real_means': real_means
                            }, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
def learn(env, model_path, data_path, policy_fn, model_learning_params, svm_grid_params, svm_params_interest,
          svm_params_guard, *, modes, rolloutSize, num_options=2,
          horizon,  # timesteps per actor per update
          clip_param, ent_coeff=0.02,  # clipping parameter epsilon, entropy coeff
          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=160,  # optimization hypers
          gamma=0.99, lam=0.95,  # advantage estimation
          max_iters=0,  # time constraint
          adam_epsilon=1.2e-4,
          schedule='linear',  # annealing for stepsize parameters (epsilon and adam)
          retrain=False
          ):
    """Train an option-conditioned policy with a clipped-surrogate objective.

    Each iteration collects rollouts with ``sample_trajectory``, appends them
    to ``<data_path>/rollout_data.pkl``, updates the hybrid model from them,
    computes advantages with ``add_vtarg_and_adv`` and then runs minibatch
    Adam updates separately for every option that has enough samples.

    Args:
        env: Environment exposing ``observation_space`` and ``action_space``.
        model_path: Directory holding ``hybrid_model.pkl`` and the saved
            policy state; only read when ``retrain`` is true.
        data_path: Directory that receives ``rollout_data.pkl``.
        policy_fn: Called as ``policy_fn(name, ob_space, ac_space, model,
            num_options)`` to build the new and the old policy.
        model_learning_params, svm_grid_params, svm_params_interest,
        svm_params_guard, modes, rolloutSize: Forwarded to
            ``partialHybridModel`` (``rolloutSize`` is also passed to
            ``sample_trajectory``).
        num_options: Number of options; one optimisation pass per option.
        horizon: Timesteps per actor per update.
        clip_param: Clipping epsilon, scaled by the learning-rate multiplier.
        ent_coeff: Entropy bonus coefficient.
        optim_epochs: Upper bound on optimisation epochs per option.
        optim_stepsize: Adam step size (scaled by the multiplier).
        optim_batchsize: Minibatch size; options with fewer samples are
            skipped for the iteration.
        gamma: Discount passed to ``add_vtarg_and_adv``; ``pi.eps`` is also
            multiplied by it every iteration.
        lam: Lambda passed to ``add_vtarg_and_adv``.
        max_iters: Iteration limit; ``0`` disables the limit.
        adam_epsilon: Epsilon for ``MpiAdam``.
        schedule: ``'constant'`` or ``'linear'`` multiplier schedule.
        retrain: Load a pickled model and saved policy state instead of
            starting from scratch.

    Returns:
        Tuple ``(pi, model)``.

    Raises:
        NotImplementedError: If ``schedule`` is not a supported value.
    """
    ob_space = env.observation_space
    ac_space = env.action_space

    if retrain:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point model_path at files you trust.
        with open(model_path + '/hybrid_model.pkl', 'rb') as model_fp:
            model = pickle.load(model_fp)
        print("Model graph:", model.transitionGraph.nodes)
        print("Model options:", model.transitionGraph.edges)
    else:
        model = partialHybridModel(env, model_learning_params, svm_grid_params, svm_params_interest,
                                   svm_params_guard, horizon, modes, num_options, rolloutSize)

    pi = policy_fn("pi", ob_space, ac_space, model, num_options)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space, model, num_options)  # Network for old policy
    atarg = tf1.placeholder(dtype=tf1.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf1.placeholder(dtype=tf1.float32, shape=[None])  # Empirical return
    lrmult = tf1.placeholder(name='lrmult', dtype=tf1.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    # Define placeholders for computing the advantage
    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    ac = pi.pdtype.sample_placeholder([None])

    # Defining losses for optimization
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf1.reduce_mean(kloldnew)
    meanent = tf1.reduce_mean(ent)
    pol_entpen = (-ent_coeff) * meanent
    ratio = tf1.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf1.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    # PPO's pessimistic surrogate (L^CLIP), negative to convert from a maximization to minimization problem
    pol_surr = - tf1.reduce_mean(tf1.minimum(surr1, surr2))
    vf_loss = tf1.reduce_mean(tf1.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    assign_old_eq_new = U.function([], [], updates=[tf1.assign(oldv, newv) for (oldv, newv) in
                                                    zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=10)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=10)  # rolling buffer for episode rewards

    p = []  # for saving the rollouts

    if retrain:
        print("Retraining to New Task !!")
        time.sleep(2)
        U.load_state(model_path+'/')
        print(pi.eps)

    max_timesteps = int(horizon * rolloutSize * max_iters)

    while True:
        if max_iters and iters_so_far >= max_iters:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            if max_timesteps > 0:
                cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
            else:
                # max_iters == 0 means "no limit": there is no end point to
                # anneal towards, so keep the multiplier at 1.0 instead of
                # dividing by zero.
                cur_lrmult = 1.0
        else:
            raise NotImplementedError

        logger.log("************* Iteration %i *************" % iters_so_far)
        print("Collecting samples for policy optimization !! ")
        render = False

        rollouts = sample_trajectory(pi, model, env, horizon=horizon, rolloutSize=rolloutSize, render=render)

        # Save rollouts (the whole history is rewritten every iteration)
        p.append({'rollouts': rollouts})
        data_file_name = data_path + '/rollout_data.pkl'
        with open(data_file_name, "wb") as data_fp:
            pickle.dump(p, data_fp)

        # Model update
        print("Updating model !!\n")
        model.updateModel(rollouts, pi)
        print("Model graph:", model.transitionGraph.nodes)
        print("Model options:", model.transitionGraph.edges)
        for edge in list(model.transitionGraph.edges):
            src, dst = edge[0], edge[1]
            print(src, " -> ", dst, " : ", model.transitionGraph[src][dst]['weight'])

        add_vtarg_and_adv(rollouts, pi, gamma, lam, num_options)

        # From here on these names refer to rollout arrays, not to the
        # placeholders used above to build the compiled functions.
        ob, ac, opts, atarg, tdlamret = rollouts["seg_obs"], rollouts["seg_acs"], rollouts["des_opts"], rollouts["adv"], rollouts["tdlamret"]
        old_opts = rollouts["seg_opts"]
        similarity = sum(1 for old_opt, new_opt in zip(old_opts, opts) if old_opt == new_opt)
        print("Percentage similarity of options: ", similarity/len(old_opts) * 100)

        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()

        pi.eps = pi.eps * gamma  # reduce exploration

        # Optimizing the policy
        print("\nOptimizing policy !! \n")
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("Option- ", opt, " Batch Size: ", indices.size)
            if not indices.size:
                continue

            d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]),
                        shuffle=not pi.recurrent)

            if indices.size < optim_batchsize:
                print("Too few samples for opt - ", opt)
                continue

            optim_batchsize_corrected = optim_batchsize
            # np.int was removed from NumPy; integer division on the
            # non-negative sample count gives the same truncated value.
            optim_epochs_corrected = int(np.clip(indices.size // optim_batchsize_corrected, 1, optim_epochs))
            print("Optim Epochs:", optim_epochs_corrected)
            logger.log("Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs_corrected):
                batch_losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize_corrected):
                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"],
                                                    cur_lrmult, [opt])
                    if np.isnan(newlosses).any():
                        # Skip minibatches with NaN losses rather than applying them.
                        continue
                    adam.update(grads, optim_stepsize * cur_lrmult)
                    batch_losses.append(newlosses)

            # Report the losses of the last epoch for this option.
            if len(batch_losses) > 0:
                meanlosses, _, _ = mpi_moments(batch_losses, axis=0)
                print("Mean loss ", meanlosses)
                for (lossval, name) in zipsame(meanlosses, loss_names):
                    logger.record_tabular("loss_" + name, lossval)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        # NOTE: checkpointing is disabled in this function; the original
        # (inactive) code was:
        #   if model_path and not retrain:
        #       U.save_state(model_path + '/')
        #       model_file_name = model_path + '/hybrid_model.pkl'
        #       pickle.dump(model, open(model_file_name, "wb"), pickle.HIGHEST_PROTOCOL)
        #       print("Policy and Model saved in - ", model_path)

    return pi, model
def learn(env_id,
          q_func,
          lr=5e-4,
          max_timesteps=10000,
          buffer_size=5000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          train_steps=10,
          learning_starts=500,
          batch_size=32,
          print_freq=10,
          checkpoint_freq=100,
          model_dir=None,
          gamma=1.0,
          target_network_update_freq=50,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          player_processes=None,
          player_connections=None):
    """Run the learner side of a multi-process DQN setup.

    The learner does not step the environment itself. On every iteration it
    receives ``(experiences, reward)`` from each entry of
    ``player_connections``, stores the experiences in the replay buffer,
    performs ``train_steps`` gradient steps, periodically updates the target
    network, checkpoints on improvement and finally notifies the players.

    Args:
        env_id: Passed to ``create_gvgai_environment``; the environment is
            only used to read the observation/action spaces and is closed
            before training starts.
        q_func: Q-network builder forwarded to ``build_train``.
        lr: Learning rate of the Adam optimizer.
        max_timesteps: Number of learner iterations.
        buffer_size: Capacity of a newly created replay buffer.
        exploration_fraction, exploration_final_eps: Define the exploration
            schedule, which this function only uses for logging.
        train_freq: Train on iterations that are a multiple of this value.
        train_steps: Gradient steps per training iteration.
        learning_starts: Minimum number of stored experiences before
            training begins.
        batch_size: Minibatch size sampled from the replay buffer.
        print_freq: Log every ``print_freq`` iterations; ``None`` disables.
        checkpoint_freq: Iteration interval for checkpoint checks.
        model_dir: Directory for ``Policy.pkl``, ``model/`` and the pickled
            replay buffer. Must be a path; ``None`` is not handled.
        gamma: Discount factor.
        target_network_update_freq: Iteration interval for target updates.
        prioritized_replay, prioritized_replay_alpha,
        prioritized_replay_beta0, prioritized_replay_beta_iters,
        prioritized_replay_eps: Prioritized replay configuration.
        param_noise: Forwarded to ``build_train``.
        player_processes: Iterable of processes joined before returning.
        player_connections: Iterable of connections whose ``recv()`` yields
            ``(experiences, reward)``.

    Returns:
        The result of ``act.load(policy_path)``.
    """
    env, _, _ = create_gvgai_environment(env_id)

    # Create all the functions necessary to train the model

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise)

    session = tf.Session()
    session.__enter__()

    policy_path = os.path.join(model_dir, "Policy.pkl")
    model_path = os.path.join(model_dir, "model", "model")
    if os.path.isdir(os.path.join(model_dir, "model")):
        # NOTE(review): on this resume path `act` is whatever build_train
        # returned and is not wrapped in ActWrapper, yet `act.save` and
        # `act.load` are called below - confirm build_train's return type.
        load_state(model_path)
    else:
        act_params = {
            'make_obs_ph': make_obs_ph,
            'q_func': q_func,
            'num_actions': env.action_space.n,
        }
        act = ActWrapper(act, act_params)
        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()
        act.save(policy_path)
        save_state(model_path)
    env.close()

    # Create the replay buffer (reusing a pickled one when present).
    # NOTE(review): pickle.load runs arbitrary code; model_dir must be trusted.
    if prioritized_replay:
        replay_buffer_path = os.path.join(model_dir, "Prioritized_replay.pkl")
        if os.path.isfile(replay_buffer_path):
            with open(replay_buffer_path, 'rb') as input_file:
                replay_buffer = pickle.load(input_file)
        else:
            replay_buffer = PrioritizedReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer_path = os.path.join(model_dir, "Normal_replay.pkl")
        if os.path.isfile(replay_buffer_path):
            with open(replay_buffer_path, 'rb') as input_file:
                replay_buffer = pickle.load(input_file)
        else:
            replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    episode_rewards = list()
    saved_mean_reward = -999999999
    # Start equal to the saved value so the final "improved?" check is simply
    # false when the loop ends before any statistics were computed (early
    # termination or still warming up) instead of raising NameError.
    mean_100ep_reward = saved_mean_reward

    signal.signal(signal.SIGQUIT, signal_handler)
    global terminate_learning

    total_timesteps = 0
    for timestep in range(max_timesteps):
        if terminate_learning:
            break

        for connection in player_connections:
            experiences, reward = connection.recv()
            episode_rewards.append(reward)
            for experience in experiences:
                replay_buffer.add(*experience)
                total_timesteps += 1

        if total_timesteps < learning_starts:
            if timestep % 10 == 0:
                print("not strated yet", flush=True)
            continue

        if timestep % train_freq == 0:
            for _ in range(train_steps):
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(total_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

        if timestep % target_network_update_freq == 0:
            # Update target network periodically.
            update_target()

        # Every entry of episode_rewards is a reward reported by a player,
        # so the most recent one belongs in the window as well.
        mean_100ep_reward = round(np.mean(episode_rewards[-100:]), 1)
        num_episodes = len(episode_rewards)
        if print_freq is not None and timestep % print_freq == 0:
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
            logger.record_tabular(
                "% time spent exploring",
                int(100 * exploration.value(total_timesteps)))
            logger.dump_tabular()

        if timestep % checkpoint_freq == 0 and mean_100ep_reward > saved_mean_reward:
            act.save(policy_path)
            save_state(model_path)
            saved_mean_reward = mean_100ep_reward
            with open(replay_buffer_path, 'wb') as output_file:
                pickle.dump(replay_buffer, output_file,
                            pickle.HIGHEST_PROTOCOL)
        send_message_to_all(player_connections, Message.UPDATE)

    send_message_to_all(player_connections, Message.TERMINATE)
    if mean_100ep_reward > saved_mean_reward:
        act.save(policy_path)
        with open(replay_buffer_path, 'wb') as output_file:
            pickle.dump(replay_buffer, output_file, pickle.HIGHEST_PROTOCOL)
    for player_process in player_processes:
        player_process.join()
    return act.load(policy_path)
def learn_continuous_tasks(env,
                           q_func,
                           env_name,
                           time_stamp,
                           total_num_episodes,
                           num_actions_pad=33,
                           lr=1e-4,
                           grad_norm_clipping=10,
                           max_timesteps=int(1e8),
                           buffer_size=int(1e6),
                           train_freq=1,
                           batch_size=64,
                           print_freq=10,
                           learning_starts=1000,
                           gamma=0.99,
                           target_network_update_freq=500,
                           prioritized_replay_alpha=0.6,
                           prioritized_replay_beta0=0.4,
                           prioritized_replay_beta_iters=2e6,
                           prioritized_replay_eps=int(1e8),
                           num_cpu=16,
                           timesteps_std=1e6,
                           initial_std=0.4,
                           final_std=0.05,
                           eval_freq=100,
                           n_eval_episodes=10,
                           eval_std=0.01,
                           callback=None):
    """Train a branching deepq model to solve continuous control tasks via discretization.

    Current assumptions in the implementation:
    - for solving continuous control domains via discretization (can be
      adjusted to be compatible with naturally discrete-action domains using
      'env.action_space.n')
    - uniform number of sub-actions per action dimension (can be generalized
      to heterogeneous number of sub-actions across branches)

    Exploration uses a greedy policy perturbed by Gaussian noise whose
    standard deviation follows a linear schedule; a prioritized replay buffer
    is always used. Results are written under ``results/`` and the model with
    the best evaluation mean is restored before returning.

    Parameters
    -------
    env : gym.Env
        environment to train on
    q_func:
        model builder forwarded to ``deepq.build_train``
    env_name: str
        used in the names of the result files
    time_stamp: str
        used in the names of the result files
    total_num_episodes: int
        training stops once this many episodes have finished
    num_actions_pad: int
        number of sub-actions per action dimension (= num of discretization
        grains/bars + 1)
    lr: float
        learning rate forwarded to ``deepq.build_train``
    grad_norm_clipping: int
        forwarded to ``deepq.build_train``
    max_timesteps: int
        currently not used by this function
    buffer_size: int
        size of the replay buffer
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        log training progress every `print_freq` finished episodes;
        set to None to disable printing
    learning_starts: int
        number of steps to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial
        value to 1.0.
    prioritized_replay_eps: float
        epsilon to add to the unified TD error for updating priorities.
        Erratum: The camera-ready copy of this paper incorrectly reported
        1e-8. The value used to produce the results is 1e8.
    num_cpu: int
        number of cpus passed to ``U.make_session``
    timesteps_std: int
        number of steps over which the exploration std is annealed
    initial_std: float
        initial std of the Gaussian exploration noise
    final_std: float
        final std of the Gaussian exploration noise
    eval_freq: int
        run an evaluation every `eval_freq` finished episodes
    n_eval_episodes: int
        number of episodes per evaluation
    eval_std: float
        std of the Gaussian noise used during evaluation; 0.0 evaluates the
        greedy actions directly
    callback:
        currently not used by this function

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
    """
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    num_action_grains = num_actions_pad - 1
    num_action_dims = env.action_space.shape[0]
    num_action_streams = num_action_dims
    # total number of network outputs for action branching with one action dimension per branch
    num_actions = num_actions_pad * num_action_streams

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        num_action_streams=num_action_streams,
        batch_size=batch_size,
        learning_rate=lr,
        grad_norm_clipping=grad_norm_clipping,
        gamma=gamma,
        scope="deepq",
        reuse=None)
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
        'num_action_streams': num_action_streams,
    }

    # prioritized_replay: create the replay buffer
    replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                            alpha=prioritized_replay_alpha)
    beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                   initial_p=prioritized_replay_beta0,
                                   final_p=1.0)

    # epsilon_greedy = False: just greedy policy
    exploration = ConstantSchedule(value=0.0)  # greedy policy
    std_schedule = LinearSchedule(schedule_timesteps=timesteps_std,
                                  initial_p=initial_std,
                                  final_p=final_std)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    # Initialize the parameters used for converting branching, discrete action indices to continuous actions
    low = env.action_space.low
    high = env.action_space.high
    actions_range = np.subtract(high, low)

    episode_rewards = []
    reward_sum = 0.0
    num_episodes = 0
    time_steps = [0]
    time_spent_exploring = [0]

    prev_time = time.time()
    n_trainings = 0

    # Set up on-demand rendering of Gym environments using keyboard controls: 'r'ender or 's'top.
    # Only possible when stdin is a terminal; otherwise tcgetattr would raise
    # and abort training (e.g. when running with redirected stdin).
    import fcntl
    import sys
    import termios
    interactive = sys.stdin.isatty()
    if interactive:
        fd = sys.stdin.fileno()
        oldterm = termios.tcgetattr(fd)
        newattr = termios.tcgetattr(fd)
        newattr[3] = newattr[3] & ~termios.ICANON & ~termios.ECHO
    render = False

    displayed_mean_reward = None

    def sample_in_range(a_greedy, index, std):
        """Sample around `a_greedy` until the sample rounds to a valid sub-action index.

        Returns the continuous sample and its (float-valued) sub-action index.
        """
        while True:
            a_stoch = np.random.normal(loc=a_greedy, scale=std)
            # Inverse of `idx / num_action_grains * actions_range + low`.
            # Subtracting `low` (instead of adding `high`) keeps the mapping
            # correct for action bounds that are not symmetric around zero.
            a_idx_stoch = np.rint((a_stoch - low[index]) /
                                  actions_range[index] * num_action_grains)
            if 0 <= a_idx_stoch < num_actions_pad:
                return a_stoch, a_idx_stoch

    def evaluate(step, episode_number):
        """Run evaluation episodes, log the mean return and save the model on improvement."""
        global max_eval_reward_mean, model_saved

        print('Evaluate...')
        eval_reward_sum = 0.0
        # Run evaluation episodes
        for _ in range(n_eval_episodes):
            obs = env.reset()
            done = False
            while not done:
                # Choose action
                action_idxes = np.array(
                    act(np.array(obs)[None], stochastic=False))  # deterministic
                actions_greedy = action_idxes / num_action_grains * actions_range + low

                if eval_std == 0.0:
                    action = actions_greedy
                else:
                    action = [
                        sample_in_range(a_greedy, index, eval_std)[0]
                        for index, a_greedy in enumerate(actions_greedy)
                    ]

                # Step
                obs, rew, done, _ = env.step(action)
                eval_reward_sum += rew

        # Average the rewards and log
        eval_reward_mean = eval_reward_sum / n_eval_episodes
        print(eval_reward_mean, 'over', n_eval_episodes, 'episodes')

        with open("results/{}_{}_eval.csv".format(time_stamp, env_name),
                  "a") as eval_fw:
            eval_writer = csv.writer(
                eval_fw,
                delimiter="\t",
                lineterminator="\n",
            )
            eval_writer.writerow([episode_number, step, eval_reward_mean])

        if max_eval_reward_mean is None or eval_reward_mean > max_eval_reward_mean:
            logger.log(
                "Saving model due to mean eval increase: {} -> {}".format(
                    max_eval_reward_mean, eval_reward_mean))
            # model_file is bound by the enclosing function before the first call.
            U.save_state(model_file)
            model_saved = True
            max_eval_reward_mean = eval_reward_mean

    with tempfile.TemporaryDirectory() as td:
        model_file = os.path.join(td, "model")

        evaluate(0, 0)
        obs = env.reset()

        with open("results/{}_{}.csv".format(time_stamp, env_name), "w") as fw:
            writer = csv.writer(
                fw,
                delimiter="\t",
                lineterminator="\n",
            )

            t = -1
            while True:
                t += 1

                # Select action and update exploration probability
                action_idxes = np.array(
                    act(np.array(obs)[None], update_eps=exploration.value(t)))

                # Convert sub-actions indexes (discrete sub-actions) to continuous controls
                actions_greedy = action_idxes / num_action_grains * actions_range + low

                # epsilon_greedy = False: use Gaussian noise with mean at the
                # greedy action and a std following the schedule
                current_std = std_schedule.value(t)
                action_idx_stoch = []
                action = []
                for index, a_greedy in enumerate(actions_greedy):
                    a_stoch, a_idx_stoch = sample_in_range(a_greedy, index, current_std)
                    action_idx_stoch.append(a_idx_stoch)
                    action.append(a_stoch)

                action_idxes = action_idx_stoch

                new_obs, rew, done, _ = env.step(action)

                # On-demand rendering
                if interactive and (t + 1) % 100 == 0:
                    termios.tcsetattr(fd, termios.TCSANOW, newattr)
                    oldflags = fcntl.fcntl(fd, fcntl.F_GETFL)
                    fcntl.fcntl(fd, fcntl.F_SETFL, oldflags | os.O_NONBLOCK)
                    try:
                        try:
                            c = sys.stdin.read(1)
                            if c == 'r':
                                print()
                                print('Rendering begins...')
                                render = True
                            elif c == 's':
                                print()
                                print('Stop rendering!')
                                render = False
                                env.render(close=True)
                        except IOError:
                            # Best-effort key poll on a non-blocking stdin;
                            # a failed read just means no command this time.
                            pass
                    finally:
                        # Always restore the terminal mode and file flags.
                        termios.tcsetattr(fd, termios.TCSAFLUSH, oldterm)
                        fcntl.fcntl(fd, fcntl.F_SETFL, oldflags)

                # Visualize Gym environment on render
                if render:
                    env.render()

                # Store transition in the replay buffer
                replay_buffer.add(obs, action_idxes, rew, new_obs, float(done))
                obs = new_obs

                reward_sum += rew
                if done:
                    obs = env.reset()
                    time_spent_exploring[-1] = int(100 * exploration.value(t))
                    time_spent_exploring.append(0)
                    episode_rewards.append(reward_sum)
                    time_steps[-1] = t
                    reward_sum = 0.0
                    time_steps.append(0)
                    # Frequently log to file
                    writer.writerow(
                        [len(episode_rewards), t, episode_rewards[-1]])

                if t > learning_starts and t % train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer
                    # prioritized_replay
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                    td_errors = train(obses_t, actions, rewards, obses_tp1,
                                      dones, weights)

                    # prioritized_replay
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

                    n_trainings += 1

                if t > learning_starts and t % target_network_update_freq == 0:
                    # Update target network periodically
                    update_target()

                # Mean over (at most) the last 100 finished episodes.
                if episode_rewards:
                    mean_100ep_reward = np.mean(episode_rewards[-100:])
                else:
                    mean_100ep_reward = 0

                num_episodes = len(episode_rewards)
                if done and print_freq is not None and len(
                        episode_rewards) % print_freq == 0:
                    logger.record_tabular("steps", t)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular("% time spent exploring",
                                          int(100 * exploration.value(t)))
                    current_time = time.time()
                    logger.record_tabular(
                        "trainings per second",
                        n_trainings / (current_time - prev_time))
                    logger.dump_tabular()
                    n_trainings = 0
                    prev_time = current_time

                if t > learning_starts and num_episodes > 100:
                    if displayed_mean_reward is None or mean_100ep_reward > displayed_mean_reward:
                        if print_freq is not None:
                            logger.log("Mean reward increase: {} -> {}".format(
                                displayed_mean_reward, mean_100ep_reward))
                        displayed_mean_reward = mean_100ep_reward

                # Performance evaluation with a greedy policy
                if done and num_episodes % eval_freq == 0:
                    evaluate(t + 1, num_episodes)
                    obs = env.reset()

                # STOP training
                if num_episodes >= total_num_episodes:
                    break

        # Must happen while the temporary directory still exists.
        if model_saved:
            logger.log("Restore model with mean eval: {}".format(
                max_eval_reward_mean))
            U.load_state(model_file)

    data_to_log = {
        'time_steps': time_steps,
        'episode_rewards': episode_rewards,
        'time_spent_exploring': time_spent_exploring
    }

    # Write to file the episodic rewards, number of steps, and the time spent exploring
    with open("results/{}_{}.txt".format(time_stamp, env_name), 'wb') as fp:
        pickle.dump(data_to_log, fp)

    return ActWrapper(act, act_params)