def setup_es(seed=0, env_id='DirHopper', log_path='/tmp/out', n_cpu=1, **agent_args):
    seed = MPI.COMM_WORLD.Get_rank() * 1000
    assert agent_args is not None
    np.random.seed(seed)
    env = env_selector(env_id, seed)
    env.seed(seed)
    es = ES(env, env_id, **agent_args)
    logger.log('Experiment configuration: {}'.format(str(locals())))
    return es
def test(self, fix_ppo=None, load_theta_path=None, **_):
    def objective(env, theta, pool_rank):
        agent = self.create_agent(env, pool_rank)
        loss_n_params = len(agent.get_loss().get_params_1d())
        agent.get_loss().set_params_1d(theta[:loss_n_params])
        if self._outer_evolve_policy_init:
            agent.pi.set_params_1d(theta[loss_n_params:])
        # Agent lifetime is inner_opt_freq * inner_max_n_epoch
        out = run_batch_rl(env, agent,
                           inner_opt_freq=self._inner_opt_freq,
                           inner_max_n_epoch=self._inner_max_n_epoch,
                           inner_buffer_size=self._inner_buffer_size,
                           pool_rank=0,
                           ppo_factor=1. if fix_ppo else 0.,
                           render=True,
                           verbose=True)

    if load_theta_path is not None:
        try:
            theta = self.load_theta(load_theta_path)
            while True:
                objective(self._env, theta, 0)
        except Exception as e:
            print(e)
    logger.log('Test run finished.')
def train(self, outer_n_epoch, outer_l2, outer_std, outer_learning_rate,
          outer_n_samples_per_ep, n_cpu=None, fix_ppo=None, **_):
    # Requires more than 1 MPI process.
    assert MPI.COMM_WORLD.Get_size() > 1
    assert n_cpu is not None

    if fix_ppo:
        ppo_factor_schedule = PiecewiseSchedule(
            [(0, 1.), (int(outer_n_epoch / 16), 0.5)], outside_value=0.5)
    else:
        ppo_factor_schedule = PiecewiseSchedule(
            [(0, 1.), (int(outer_n_epoch / 8), 0.)], outside_value=0.)

    outer_lr_scheduler = PiecewiseSchedule(
        [(0, outer_learning_rate),
         (int(outer_n_epoch / 2), outer_learning_rate * 0.1)],
        outside_value=outer_learning_rate * 0.1)

    def objective(env, theta, pool_rank):
        agent = self.create_agent(env, pool_rank)
        loss_n_params = len(agent.get_loss().get_params_1d())
        agent.get_loss().set_params_1d(theta[:loss_n_params])
        if self._outer_evolve_policy_init:
            agent.pi.set_params_1d(theta[loss_n_params:])
        # Agent lifetime is inner_opt_freq * inner_max_n_epoch
        return run_batch_rl(env, agent,
                            inner_opt_freq=self._inner_opt_freq,
                            inner_buffer_size=self._inner_buffer_size,
                            inner_max_n_epoch=self._inner_max_n_epoch,
                            pool_rank=pool_rank,
                            ppo_factor=ppo_factor_schedule.value(epoch),
                            epoch=None)

    # Initialize theta.
    theta = self.init_theta(self._env)
    num_params = len(theta)
    logger.log('Theta dim: {}'.format(num_params))

    # Set up outer loop parameter update schedule.
    adam = Adam(shape=(num_params,), beta1=0., stepsize=outer_learning_rate, dtype=np.float32)

    # Set up intra-machine parallelization.
    logger.log('Using {} processes per MPI process.'.format(n_cpu))
    from pathos.multiprocessing import ProcessPool
    pool = ProcessPool(nodes=n_cpu)

    begin_time, best_test_return = time.time(), -np.inf
    for epoch in range(outer_n_epoch):
        # Anneal outer learning rate
        adam.stepsize = outer_lr_scheduler.value(epoch)

        noise = np.random.randn(outer_n_samples_per_ep // NUM_EQUAL_NOISE_VECTORS, num_params)
        noise = np.repeat(noise, NUM_EQUAL_NOISE_VECTORS, axis=0)
        theta_noise = theta[np.newaxis, :] + noise * outer_std
        theta_noise = theta_noise.reshape(MPI.COMM_WORLD.Get_size(), -1)

        # Distributes theta_noise vectors to all nodes.
        logger.log('Scattering all perturbed theta vectors and running inner loops ...')
        recvbuf = np.empty(theta_noise.shape[1], dtype='float')
        MPI.COMM_WORLD.Scatter(theta_noise, recvbuf, root=0)
        theta_noise = recvbuf.reshape(-1, num_params)

        # Noise vectors are scattered, run inner loop, parallelized over `pool_size` processes.
        start_time = time.time()
        pool_size = int(outer_n_samples_per_ep / MPI.COMM_WORLD.Get_size())
        results = pool.amap(objective,
                            [self._env] * pool_size,
                            theta_noise,
                            range(pool_size)).get()

        # Extract relevant results
        returns = [utils.ret_to_obj(r['ep_final_rew']) for r in results]
        update_time = [np.mean(r['update_time']) for r in results]
        env_time = [np.mean(r['env_time']) for r in results]
        ep_length = [np.mean(r['ep_length']) for r in results]
        n_ep = [len(r['ep_length']) for r in results]
        mean_ep_kl = [np.mean(r['ep_kl']) for r in results]
        final_rets = [np.mean(r['ep_return'][-3:]) for r in results]

        # We gather the results at node 0
        recvbuf = np.empty([MPI.COMM_WORLD.Get_size(), 7 * pool_size],  # 7 = number of scalars in results vector
                           dtype='float') if MPI.COMM_WORLD.Get_rank() == 0 else None
        results_processed_arr = np.asarray(
            [returns, update_time, env_time, ep_length, n_ep, mean_ep_kl, final_rets],
            dtype='float').ravel()
        MPI.COMM_WORLD.Gather(results_processed_arr, recvbuf, root=0)

        # Do outer loop update calculations at node 0
        if MPI.COMM_WORLD.Get_rank() == 0:
            end_time = time.time()
            logger.log('All inner loops completed, returns gathered ({:.2f} sec).'.format(
                time.time() - start_time))
            results_processed_arr = recvbuf.reshape(MPI.COMM_WORLD.Get_size(), 7, pool_size)
            results_processed_arr = np.transpose(results_processed_arr, (0, 2, 1)).reshape(-1, 7)
            results_processed = [dict(returns=r[0],
                                      update_time=r[1],
                                      env_time=r[2],
                                      ep_length=r[3],
                                      n_ep=r[4],
                                      mean_ep_kl=r[5],
                                      final_rets=r[6]) for r in results_processed_arr]
            returns = np.asarray([r['returns'] for r in results_processed])

            # ES update
            noise = noise[::NUM_EQUAL_NOISE_VECTORS]
            returns = np.mean(returns.reshape(-1, NUM_EQUAL_NOISE_VECTORS), axis=1)
            theta_grad = relative_ranks(returns).dot(noise) / outer_n_samples_per_ep \
                         - outer_l2 * theta
            theta -= adam.step(theta_grad)

            # Perform `NUM_TEST_SAMPLES` evaluation runs on root 0.
            if epoch % self._outer_plot_freq == 0 or epoch == outer_n_epoch - 1:
                start_test_time = time.time()
                logger.log('Performing {} test runs in parallel on node 0 ...'.format(NUM_TEST_SAMPLES))
                # Evaluation run with current theta
                test_results = pool.amap(
                    objective,
                    [self._env] * NUM_TEST_SAMPLES,
                    theta[np.newaxis, :] + np.zeros((NUM_TEST_SAMPLES, num_params)),
                    range(NUM_TEST_SAMPLES)).get()
                plotting.plot_results(epoch, test_results)
                test_return = np.mean([utils.ret_to_obj(r['ep_return']) for r in test_results])
                if test_return > best_test_return:
                    best_test_return = test_return
                    # Save theta as numpy array.
                    self.save_theta(theta)
                self.save_theta(theta, str(epoch))
                logger.log('Test runs performed ({:.2f} sec).'.format(time.time() - start_test_time))

            logger.logkv('Epoch', epoch)
            utils.log_misc_stats('Obj', logger, returns)
            logger.logkv('PPOFactor', ppo_factor_schedule.value(epoch))
            logger.logkv('EpochTimeSpent(s)', end_time - start_time)
            logger.logkv('TotalTimeSpent(s)', end_time - begin_time)
            logger.logkv('BestTestObjMean', best_test_return)
            logger.dumpkvs()
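# Illustrative sketch (not part of the training path above): the ES step uses
# `relative_ranks`, which is defined elsewhere in this codebase. The helpers below are
# assumed stand-ins showing the usual centered-rank fitness shaping and the shape of the
# resulting rank-weighted, L2-regularized gradient estimate; the names `_centered_ranks_demo`
# and `_es_gradient_demo` are hypothetical.
def _centered_ranks_demo(returns):
    """Map returns to centered ranks in [-0.5, 0.5] (illustrative stand-in only)."""
    ranks = np.empty(len(returns), dtype=np.float32)
    ranks[np.argsort(returns)] = np.arange(len(returns), dtype=np.float32)
    return ranks / (len(returns) - 1) - 0.5


def _es_gradient_demo(theta, noise, returns, outer_l2):
    """Rank-weighted combination of noise vectors minus an L2 penalty on theta."""
    return _centered_ranks_demo(returns).dot(noise) / len(returns) - outer_l2 * theta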
def act_to_env_format(act):
    if np.isnan(act).any() or np.isinf(act).any():
        logger.log("WARNING: nan or inf action {}".format(act))
        return np.zeros_like(act)
    else:
        return act
def update(self, obs, acts, rews, dones, ppo_factor, inner_opt_freq):
    epg_rews = rews
    # Want to zero out rewards to the EPG loss function?
    # epg_rews = np.zeros_like(rews)

    # Calculate auxiliary functions.
    lst_bonus = []
    for rew_bonus_eval in self.lst_rew_bonus_eval:
        lst_bonus.append(rew_bonus_eval.predict(obs).T)
    auxs = np.concatenate(lst_bonus, axis=0)

    traj_raw = np.c_[obs, acts, epg_rews, auxs, dones].astype(np.float32)
    # Update here, since we only have access to these raws at this specific spot.
    self._traj_norm.update(traj_raw)
    traj = self._traj_norm.norm(traj_raw)

    auxs_pad = np.zeros(self._buffer_size - obs.shape[0], dtype=np.float32)
    rew_pad = np.zeros(self._buffer_size - obs.shape[0], dtype=np.float32)
    done_pad = np.zeros(self._buffer_size - obs.shape[0], dtype=np.float32)
    obs_pad = np.zeros((self._buffer_size - obs.shape[0], obs.shape[1]), dtype=np.float32)
    act_pad = np.zeros((self._buffer_size - acts.shape[0], acts.shape[1]), dtype=np.float32)
    pad = np.hstack([obs_pad, act_pad, rew_pad[:, None], auxs_pad[:, None], done_pad[:, None]])
    traj = np.vstack([pad, traj])
    traj[:, obs.shape[1] + acts.shape[1]] = epg_rews
    traj[:, -1] = dones

    # Since the buffer length can be larger than the set of new samples, we truncate the
    # trajectories here for PPO.
    dones = dones[-inner_opt_freq:]
    rews = rews[-inner_opt_freq:]
    acts = acts[-inner_opt_freq:]
    obs = obs[-inner_opt_freq:]
    _obs = traj[-inner_opt_freq:, :obs.shape[1]]
    n = len(obs)

    if self._use_ppo:
        old_params_sym = self._pi_f(_obs)
        vp = np.ravel(self._vf_f(_obs).data)
        old_params = [item.data for item in old_params_sym]
        advs = gamma_expand(
            rews + self._ppo_gam * (1 - dones) * np.append(vp[1:], vp[-1]) - vp,
            self._ppo_gam * self._ppo_lam * (1 - dones))
        vt = advs + vp
        at = (advs - advs.mean()) / advs.std()

    epg_surr_loss = 0.
    pi_params_before = self._pi_f(_obs)

    for _ in range(self.inner_n_opt_steps):
        for idx in np.array_split(np.random.permutation(n), n // self.inner_opt_batch_size):
            # Clear gradients
            for v in self.backprop_params:
                v.cleargrad()

            # Forward pass through loss function.
            # Apply temporal conv to input trajectory
            processed_traj = self._process_trajectory(traj)
            # Compute epg loss value
            epg_surr_loss_sym = self._compute_loss(traj[idx], processed_traj[idx])
            epg_surr_loss += epg_surr_loss_sym.data

            # Add bootstrapping signal if needed.
            if self._use_ppo:
                old_params_idx = [item[idx] for item in old_params]
                ppo_surr_loss = self._compute_ppo_loss(
                    _obs[idx], acts[idx], at[idx], vt[idx], old_params_idx)
                total_surr_loss = epg_surr_loss_sym * (1 - ppo_factor) + ppo_surr_loss * ppo_factor
            else:
                total_surr_loss = epg_surr_loss_sym

            # Backward pass through loss function
            total_surr_loss.backward()
            for v, adam in zip(self.backprop_params, self._lst_adam):
                if np.isnan(v.grad).any() or np.isinf(v.grad).any():
                    logger.log("WARNING: gradient update nan on node {}".format(
                        MPI.COMM_WORLD.Get_rank()))
                else:
                    v.data += adam.step(v.grad)

    pi_params_after = self._pi_f(_obs)

    return epg_surr_loss / (n // self.inner_opt_batch_size) / self.inner_n_opt_steps, \
           np.mean(self.kl(pi_params_before, pi_params_after).data)
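# Illustrative sketch (assumed semantics, not this project's implementation): `gamma_expand`
# above is used as a reverse discounted cumulative sum, y[t] = x[t] + d[t] * y[t + 1]. Fed with
# the TD residuals and the per-step discounts gamma * lam * (1 - dones), this produces
# GAE(lambda) advantages. `_gamma_expand_reference` is a hypothetical reference version.
def _gamma_expand_reference(x, discounts):
    """Backward accumulation y[t] = x[t] + discounts[t] * y[t + 1] (illustrative only)."""
    y = np.zeros_like(x)
    running = 0.
    for t in reversed(range(len(x))):
        running = x[t] + discounts[t] * running
        y[t] = running
    return y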
def main(test):
    d = datetime.datetime.now()
    date = '{}-{}'.format(d.month, d.day)
    time = '{:02d}-{:02d}'.format(d.hour, d.minute)

    # Experiment params
    # -----------------
    env_id = 'DirHopper'
    # Number of noise vector seeds for ES
    outer_n_samples_per_ep = 8
    # Perform policy SGD updates every `inner_opt_freq` steps
    inner_opt_freq = 64
    # Perform `inner_max_n_epoch` total SGD policy updates,
    # so in total `inner_steps` = `inner_opt_freq` * `inner_max_n_epoch`
    inner_max_n_epoch = 128
    # Temporal convolutions slide over buffer of length `inner_buffer_size`
    inner_buffer_size = inner_opt_freq * 8
    # Use PPO bootstrapping?
    ppo = True
    # Evolve policy initialization together with loss function?
    gpi = False
    # Fix PPO alpha (ppo_factor) to 0.5?
    fix_ppo = False
    # Use memory structure?
    mem = False
    # Number of outer loop epochs
    outer_n_epoch = 2000
    # Outer loop theta L2 penalty
    outer_l2 = 0.001
    # Outer loop noise standard deviation
    outer_std = 0.01
    # Outer loop Adam step size
    outer_learning_rate = 1e-2
    # Inner loop batch size per gradient update
    inner_opt_batch_size = 32
    # Number of times to cycle through the sampled dataset in the inner loop
    inner_n_opt_steps = 1
    # Inner loop Adam step size
    inner_lr = 1e-3
    # Plotting frequency in number of outer loop epochs
    plot_freq = 50
    # Maximum number of CPUs used per MPI process
    max_cpu = 2
    # Local experiment log path
    launcher.LOCAL_LOG_PATH = os.path.expanduser("~/EPG_experiments")
    # Where to load theta from for `--test true` purposes
    theta_load_path = '~/EPG_experiments/<path_to_theta.npy>/theta.npy'
    # -----------------

    exp_tag = '{}-{}-{}{}{}{}'.format(
        outer_n_samples_per_ep,
        inner_opt_freq,
        inner_max_n_epoch,
        '-p' if ppo else '',
        '-i' if gpi else '',
        '-f' if fix_ppo else '',
    ).replace('.', '')
    exp_name = '{}-{}-{}'.format(time, env_id.lower(), exp_tag)
    job_name = 'epg-{}--{}'.format(date, exp_name)

    epg_args = dict(
        env_id=env_id,
        n_cpu=max_cpu,
        log_path=os.path.join(launcher.LOCAL_LOG_PATH, date, exp_name),
        load_theta_path=theta_load_path if test else None,
        plot_freq=plot_freq,
        outer_n_epoch=outer_n_epoch,
        outer_l2=outer_l2,
        outer_std=outer_std,
        outer_learning_rate=outer_learning_rate,
        outer_n_samples_per_ep=outer_n_samples_per_ep,
        inner_opt_freq=inner_opt_freq,
        inner_max_n_epoch=inner_max_n_epoch,
        inner_opt_batch_size=inner_opt_batch_size,
        inner_buffer_size=inner_buffer_size,
        inner_n_opt_steps=inner_n_opt_steps,
        inner_lr=inner_lr,
        mem=mem,
        inner_use_ppo=ppo,
        fix_ppo=fix_ppo,
        gpi=gpi,
    )

    mpi_machines = 1
    mpi_proc_per_machine = int(np.ceil(outer_n_samples_per_ep / mpi_machines / float(max_cpu)))

    logger.log('Running experiment {}/{} with {} noise vectors on {} machines with {}'
               ' MPI processes per machine, each using {} pool processes.'.format(
                   date, exp_name, outer_n_samples_per_ep, mpi_machines,
                   mpi_proc_per_machine, max_cpu))

    # Experiment launcher
    launcher.call(job_name=job_name,
                  fn=test_run if test else run,
                  kwargs=epg_args,
                  log_relpath=os.path.join(date, exp_name),
                  mpi_proc_per_machine=mpi_proc_per_machine,
                  mpi_machines=mpi_machines)
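# Worked example of the process budget computed in main() above (illustrative only; the helper
# name `_process_budget_demo` is hypothetical): with the defaults outer_n_samples_per_ep = 8,
# mpi_machines = 1 and max_cpu = 2, this evaluates to ceil(8 / 1 / 2.0) = 4 MPI processes per
# machine, each driving a pool of 2 workers, so all 8 perturbed theta vectors run in parallel.
def _process_budget_demo(outer_n_samples_per_ep=8, mpi_machines=1, max_cpu=2):
    """Reproduce the MPI process budget from main() for a given configuration."""
    return int(np.ceil(outer_n_samples_per_ep / mpi_machines / float(max_cpu)))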
def run_batch_rl(env, agent, inner_opt_freq, inner_max_n_epoch, inner_buffer_size,
                 pool_rank, ppo_factor, epoch=None, render=False, verbose=False):
    from collections import deque
    assert isinstance(inner_opt_freq, int)
    assert isinstance(inner_max_n_epoch, int)
    assert isinstance(inner_buffer_size, int)
    lst_ep_rew, lst_loss, lst_ep_steps, lst_kl = [], [], [], []
    buffer = deque(maxlen=inner_buffer_size)
    n_ep, ep_rew, ep_steps = 0, 0., 0
    tot_update_time, start_env_time = 0., time.time()

    # Assumes meta wrapper used.
    if epoch is not None:
        env.meta_reset(epoch)
        env.seed(epoch)
    else:
        env.meta_reset(pool_rank + utils.get_time_seed())
        env.seed(pool_rank + utils.get_time_seed())

    obs = env.reset()
    n_steps = 0
    for itr in range(inner_max_n_epoch):
        ep_obs = []
        for _ in range(inner_opt_freq):
            obs = obs.astype(np.float32)
            act = agent.act(obs)
            obs_prime, rew, done, _ = env.step(agent.act_to_env_format(act))
            ep_obs.append(obs)
            buffer.append((obs, act, rew, done))
            ep_rew += rew
            ep_steps += 1
            n_steps += 1
            if done:
                obs = env.reset()
                lst_ep_rew.append(ep_rew)
                lst_ep_steps.append(ep_steps)
                if verbose and pool_rank == 0:
                    logger.log('Train run (ep {}, return {:.3f})'.format(n_ep, ep_rew))
                ep_steps, ep_rew = 0, 0.
                n_ep += 1
            else:
                obs = obs_prime

        # This is disabled for now. But it's easy to add an exploration bonus as an additional
        # input to the loss function!
        # for rew_bonus_eval in agent.lst_rew_bonus_eval:
        #     rew_bonus_eval.fit_before_process_samples(obs)

        start_update_time = time.time()
        loss_input = [np.array([e[i] for e in buffer], dtype=np.float32)
                      for i in range(len(buffer[0]))]
        loss_input += [ppo_factor, inner_opt_freq]
        loss, kl = agent.update(*loss_input)
        lst_loss.append(loss)
        lst_kl.append(kl)
        tot_update_time += time.time() - start_update_time

    # Evaluate final policy
    obs, final_rew, ep_counter = env.reset(), [0., 0., 0.], 0
    while ep_counter < 3:
        obs = obs.astype(np.float32)
        act = agent.act(obs)
        obs_prime, rew, done, _ = env.step(agent.act_to_env_format(act))
        final_rew[ep_counter] += rew
        if done:
            obs = env.reset()
            ep_counter += 1
        else:
            obs = obs_prime

    tot_env_time = time.time() - start_env_time - tot_update_time

    if render:
        logger.log('Rendering final policy for 5 episodes ...')
        obs, ep_rew = env.reset(), 0.
        ep_counter = 0
        while ep_counter < 5:
            obs = obs.astype(np.float32)
            act = agent.act(obs)
            obs_prime, rew, done, _ = env.step(agent.act_to_env_format(act))
            env.render()
            ep_rew += rew
            if done:
                logger.log('Test run with final policy (return {:.3f}).'.format(ep_rew))
                time.sleep(2)
                obs, ep_rew = env.reset(), 0.
                ep_counter += 1
            else:
                obs = obs_prime

    return dict(ep_return=np.asarray(lst_ep_rew),
                ep_final_rew=np.asarray(final_rew),
                ep_loss=lst_loss,
                ep_length=lst_ep_steps,
                ep_kl=np.asarray(lst_kl),
                update_time=tot_update_time,
                env_time=tot_env_time)
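# Illustrative sketch of the buffer-to-batch conversion used in run_batch_rl above: the deque
# of (obs, act, rew, done) tuples is transposed into one array per field before being handed
# to agent.update(). The helper below is hypothetical and uses toy shapes (2-D observations,
# 1-D actions) purely to show the resulting array layout.
def _buffer_to_loss_input_demo():
    from collections import deque
    buffer = deque(maxlen=4)
    for t in range(3):
        obs = np.zeros(2, dtype=np.float32)
        act = np.zeros(1, dtype=np.float32)
        buffer.append((obs, act, float(t), False))
    # One array per field: obs -> (3, 2), acts -> (3, 1), rews -> (3,), dones -> (3,)
    return [np.array([e[i] for e in buffer], dtype=np.float32) for i in range(len(buffer[0]))]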