def policy_gradient_optimize_nesterov(mdp, policy, gamma, max_pathlength, timesteps_per_batch, n_iter, stepsize, beta=.95):
    stat2timeseries = defaultdict(list)
    widths = (17, 10, 10, 10, 10)
    print fmt_row(widths, ["EpRewMean", "EpLenMean", "Perplexity", "KLOldNew"])
    fprev_sa = policy.f_sa
    for i in xrange(n_iter):
        total_ts = 0
        paths = []
        while True:
            path = rollout(mdp, policy, max_pathlength)
            paths.append(path)
            total_ts += pathlength(path)
            if total_ts > timesteps_per_batch:
                break

        # get observations:
        obs_no = np.concatenate([path["observations"] for path in paths])

        z_sa = policy.f_sa + beta * (policy.f_sa - fprev_sa)  # Momentum (look-ahead) term
        grad = 0
        for path in paths:
            n = len(path['rewards'])
            q_n = ((path['rewards'] * gamma ** np.arange(n))[::-1].cumsum())[::-1]
            q_n = q_n / gamma ** np.arange(n)  # Biased estimator, but doesn't decay as fast
            grad += softmax_policy_gradient(z_sa, path['observations'], path['actions'], q_n)
        grad = grad / len(paths)

        fprev_sa = policy.f_sa
        policy.f_sa = z_sa + stepsize * grad

        pdists = np.concatenate([path["pdists"] for path in paths])
        kl = policy.compute_kl(pdists, policy.compute_pdists(obs_no)).mean()
        perplexity = np.exp(policy.compute_entropy(pdists).mean())

        stats = {
            "EpRewMean": np.mean([path["rewards"].sum() for path in paths]),
            "EpRewSEM": np.std([path["rewards"].sum() for path in paths]) / np.sqrt(len(paths)),
            "EpLenMean": np.mean([pathlength(path) for path in paths]),
            "Perplexity": perplexity,
            "KLOldNew": kl
        }
        print fmt_row(widths, ['%.3f+-%.3f' % (stats["EpRewMean"], stats['EpRewSEM']),
                               stats['EpLenMean'], stats['Perplexity'], stats['KLOldNew']])
        for (name, val) in stats.items():
            stat2timeseries[name].append(val)
    return stat2timeseries
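# The update above evaluates the policy gradient at a look-ahead point
# z = theta + beta * (theta - theta_prev) and then steps from z, which is the
# Nesterov-style momentum scheme. Below is a minimal standalone sketch of the
# same update rule applied to a toy quadratic objective; the names here are
# illustrative only and are not part of the lab's library. (The sketch does
# gradient *descent*, whereas the policy code does ascent, hence +stepsize*grad.)
import numpy as np

def nesterov_minimize(grad_fn, theta0, stepsize=0.1, beta=0.95, n_iter=100):
    theta_prev = theta0.copy()
    theta = theta0.copy()
    for _ in range(n_iter):
        z = theta + beta * (theta - theta_prev)   # look-ahead point
        theta_prev = theta
        theta = z - stepsize * grad_fn(z)         # descent step taken from z
    return theta

# e.g. minimize f(x) = 0.5 * ||x||^2, whose gradient is x; the result is near zero
print nesterov_minimize(lambda x: x, np.ones(3))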
# FrozenLake is an MDP with finite states and actions that involves navigating across a frozen lake.
# (It's conventionally called a "grid-world" MDP, as the state space consists of points in a 2D grid.)
# Let's look at the docstring for details
print FrozenLake.__doc__
print "-----------------"

class RandomDiscreteActionChooser(Policy):
    def __init__(self, n_actions):
        self.n_actions = n_actions
    def step(self, observation):
        return {"action": np.array([nr.randint(0, self.n_actions)])}

policy = RandomDiscreteActionChooser(mdp.n_actions)
rdata = rollout(mdp, policy, 100)
print rdata

s_n = rdata['observations']  # Vector of states (same as observations since the MDP is fully observed)
a_n = rdata['actions']       # Vector of actions (each is an int in {0,1,2,3})
n = a_n.shape[0]             # Length of trajectory
q_n = np.random.randn(n)     # Returns (random, for the sake of gradient checking)
f_sa = np.random.randn(mdp.n_states, mdp.n_actions)  # Policy parameters (explained shortly)

def softmax_prob(f_na):
    """
    Exponentiate f_na and normalize rows to have sum 1,
    so each row gives a probability distribution over the discrete action set
    """
    prob_nk = np.exp(f_na - f_na.max(axis=1, keepdims=True))
    prob_nk /= prob_nk.sum(axis=1, keepdims=True)
    return prob_nk
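# Quick sanity check of softmax_prob (added here for illustration, not part of the
# original lab code): every row should be a valid probability distribution, i.e.
# nonnegative entries summing to 1, and shifting a row's scores by a constant
# should leave its probabilities unchanged.
f_test = np.random.randn(6, 4)
p_test = softmax_prob(f_test)
assert np.allclose(p_test.sum(axis=1), 1.0)
assert (p_test >= 0).all()
assert np.allclose(softmax_prob(f_test + 10.0), p_test)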
def run_ppo(mdp, policy, gamma, max_pathlength, timesteps_per_batch, n_iter,
            vf=None, lam=1.0, penalty_coeff=1.0, parallel=True, max_kl=0.1):
    """
    mdp : instance of MDP
    policy : instance of PPOPolicy
    max_pathlength : maximum episode length (number of timesteps)
    n_iter : number of batches of PPO
    vf : instance of ValueFunction
    lam : lambda parameter for computing the advantage estimator
        adv_t = \delta_t + (\gamma \lambda) \delta_{t+1} + (\gamma \lambda)^2 \delta_{t+2} + \dots
        as described in http://arxiv.org/abs/1506.05254
    penalty_coeff : each iteration we solve the unconstrained minimization problem
        minimize_{theta} L(theta) + penalty_coeff * KL( thetaold, theta )
    parallel : use python's multiprocessing to parallelize the rollouts
    max_kl : hard limit on the KL divergence between old and new policy for one iteration of PPO
    """
    assert isinstance(policy, PPOPolicy)
    if vf is None:
        vf = NoValueFunction()

    theta = policy.get_parameters_flat()
    seed_iter = itertools.count()
    start_time = time.time()
    numeptotal = 0

    for i in xrange(n_iter):
        print "********** Iteration %i ************" % i
        with Message("Generating paths"):
            total_ts = 0
            paths = []
            if parallel:
                # DO ROLLOUTS IN PARALLEL
                nproc = multiprocessing.cpu_count()
                if sys.platform == "darwin":
                    nproc /= 2  # hyperthreading makes num cpu look twice as high,
                                # but there's no speedup
                # store data in global variables so it's accessible from forked processes
                # (which start when multiprocessing.Pool is created)
                Globals.mdp = mdp
                Globals.policy = policy
                Globals.max_pathlength = max_pathlength
                pool = multiprocessing.Pool(nproc)
                pending = []
                done = False
                while True:
                    if len(pending) < nproc and not done:
                        pending.append(pool.apply_async(rollout1, (seed_iter.next(),)))
                    stillpending = []
                    for job in pending:
                        if job.ready():
                            path = job.get()
                            paths.append(path)
                            total_ts += pathlength(path)
                        else:
                            stillpending.append(job)
                    pending = stillpending
                    if total_ts > timesteps_per_batch:
                        done = True
                        if len(pending) == 0:
                            break
                    time.sleep(.001)
                pool.close()
            else:
                # EQUIVALENT SERIAL CODE
                while True:
                    path = rollout(mdp, policy, max_pathlength)
                    paths.append(path)
                    total_ts += pathlength(path)
                    if total_ts > timesteps_per_batch:
                        break

        with Message("Computing returns and estimating advantage function"):
            allret = []
            allb = []
            for path in paths:
                path["returns"] = discount(path["rewards"], gamma)
                b = vf.predict(path)
                b1 = np.append(b, 0)
                # b1 = np.append(b, 0 if path["terminated"] else b[-1])
                deltas = path["rewards"] + gamma * b1[1:] - b1[:-1]
                path["advantage"] = discount(deltas, gamma * lam)
                allb.append(b)
                allret.append(path["returns"])
            baseline_ev = explained_variance_1d(np.concatenate(allb), np.concatenate(allret))
            # baseline_ev = what fraction of the variance of the returns is explained by the baseline function.
            # It'll be <= 1; if it's <= 0, the baseline is giving predictions worse than a constant function.

        with Message("Updating policy"):
            pdist_np = np.concatenate([path["pdists"] for path in paths])
            obs_no = np.concatenate([path["observations"] for path in paths])
            action_na = np.concatenate([path["actions"] for path in paths])
            # Standardize the advantage function to have mean=0 and std=1
            advantage_n = np.concatenate([path["advantage"] for path in paths])
            advantage_n -= advantage_n.mean()
            advantage_n /= (advantage_n.std() + 1e-8)
            assert obs_no.shape[0] == pdist_np.shape[0] == action_na.shape[0] == advantage_n.shape[0]

            n_train_paths = int(len(paths) * .75)
            train_sli = slice(0, sum(pathlength(path) for path in paths[:n_train_paths]))
            test_sli = slice(train_sli.stop, None)
            # Training/test split
            poar_train, poar_test = [tuple(arr[sli] for arr in (pdist_np, obs_no, action_na, advantage_n))
                                     for sli in (train_sli, test_sli)]

            obj_names = ["L", "KL"]
            obj_before_train = policy.compute_surr_kl(*poar_train)
            obj_before_test = policy.compute_surr_kl(*poar_test)

            def fpen(th):
                thprev = policy.get_parameters_flat()
                policy.set_parameters_flat(th)
                surr, kl = policy.compute_surr_kl(*poar_train)  # pylint: disable=W0640
                out = penalty_coeff * kl - surr
                if kl > max_kl or not np.isfinite(out):
                    out = 1e10
                # testsurr = policy.compute_surr_kl(*poar_test)[0]
                # print "dtheta norm", np.linalg.norm(th - theta), "train lagrangian", out
                # print "testsurr improvement", testsurr - obj_before_test[0]
                policy.set_parameters_flat(thprev)
                return out

            def fgradpen(th):
                thprev = policy.get_parameters_flat()
                policy.set_parameters_flat(th)
                out = -policy.compute_grad_lagrangian(penalty_coeff, *poar_train)  # pylint: disable=W0640
                policy.set_parameters_flat(thprev)
                return out

            theta, _, info = scipy.optimize.fmin_l_bfgs_b(fpen, theta, fprime=fgradpen, maxiter=20)
            del info["grad"]
            print info
            policy.set_parameters_flat(theta)

            obj_after_train = policy.compute_surr_kl(*poar_train)
            obj_after_test = policy.compute_surr_kl(*poar_test)
            delta_train = np.array(obj_after_train) - np.array(obj_before_train)
            delta_test = np.array(obj_after_test) - np.array(obj_before_test)

        with Message("Computing baseline function for next iter"):
            vf.fit(paths)

        with Message("Computing stats"):
            episoderewards = np.array([path["rewards"].sum() for path in paths])
            pathlengths = np.array([pathlength(path) for path in paths])
            entropy = policy.compute_entropy(pdist_np).mean()
            perplexity = np.exp(entropy)

            stats = OrderedDict()
            for (name, dtrain, dtest) in zip(obj_names, delta_train, delta_test):
                stats.update({
                    u"Train_d" + name: dtrain,
                    u"Test_d" + name: dtest,
                    # u"Ratio_" + name: dtest / dtrain
                })
            # stats["Armijo"] = (obj_after_train[0] - obj_before_train[0]) / step.dot(g)
            numeptotal += len(episoderewards)
            stats["NumEpBatch"] = len(episoderewards)
            stats["NumEpTotal"] = numeptotal
            stats["EpRewMean"] = episoderewards.mean()
            stats["EpRewSEM"] = episoderewards.std() / np.sqrt(len(paths))
            stats["EpRewMax"] = episoderewards.max()
            stats["EpLenMean"] = pathlengths.mean()
            stats["EpLenMax"] = pathlengths.max()
            stats["RewPerStep"] = episoderewards.sum() / pathlengths.sum()
            stats["BaselineEV"] = baseline_ev
            stats["Entropy"] = entropy
            stats["Perplexity"] = perplexity
            stats["TimeElapsed"] = time.time() - start_time
        yield stats
def rollout1(seed):
    np.random.seed(seed)
    return rollout(Globals.mdp, Globals.policy, Globals.max_pathlength)
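# rollout1 reads the MDP, policy, and maximum path length from a module-level
# Globals namespace, which run_ppo populates before creating the
# multiprocessing.Pool so that the forked worker processes inherit the data.
# A minimal sketch of such a container is shown below; the lab code defines its
# own equivalent, so this is illustrative only and shouldn't replace it.
class Globals(object):
    mdp = None
    policy = None
    max_pathlength = None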