def run(config, seed, device, logdir):
    set_global_seeds(seed)
    print('Initializing...')
    agent = Agent(config, make_env(config, seed), device)
    es = CMAES([config['train.mu0']]*agent.num_params, config['train.std0'],
               {'popsize': config['train.popsize'], 'seed': seed})
    train_logs = []
    checkpoint_count = 0
    with ProcessPoolExecutor(max_workers=config['train.popsize'],
                             initializer=initializer,
                             initargs=(config, seed, device)) as executor:
        print('Finish initialization. Training starts...')
        for generation in range(config['train.generations']):
            start_time = time.perf_counter()
            solutions = es.ask()
            out = list(executor.map(fitness, solutions, chunksize=2))
            Rs, Hs = zip(*out)
            es.tell(solutions, [-R for R in Rs])  # CMA-ES minimizes, so negate the returns
            logger = Logger()
            logger('generation', generation + 1)
            logger('num_seconds', round(time.perf_counter() - start_time, 1))
            logger('Returns', describe(Rs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('Horizons', describe(Hs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('fbest', es.result.fbest)
            train_logs.append(logger.logs)
            if generation == 0 or (generation + 1) % config['log.freq'] == 0:
                logger.dump(keys=None, index=0, indent=0, border='-'*50)
            if (generation + 1) >= int(config['train.generations']*(checkpoint_count/(config['checkpoint.num'] - 1))):
                agent.from_vec(tensorify(es.result.xbest, 'cpu'))
                agent.checkpoint(logdir, generation + 1)
                checkpoint_count += 1
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    return None
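# Hedged sketch (not the repository's actual code) of the per-worker
# initializer()/fitness() pair that the ProcessPoolExecutor above assumes:
# initializer() builds one global agent/environment per worker process, and
# fitness() evaluates a flat parameter vector by rolling a single episode and
# returning its return R and horizon H. All names below, and the assumption of
# a non-vectorized Gym-style env, are illustrative.
_worker_agent = None
_worker_env = None

def initializer(config, seed, device):
    global _worker_agent, _worker_env
    _worker_env = make_env(config, seed)
    _worker_agent = Agent(config, _worker_env, device)

def fitness(solution):
    # Load the candidate parameters into the agent and roll one episode greedily.
    _worker_agent.from_vec(tensorify(solution, 'cpu'))
    observation = _worker_env.reset()
    R, H, done = 0.0, 0, False
    while not done:
        with torch.no_grad():
            action = _worker_agent.choose_action(observation, mode='eval')['action']
        observation, reward, done, info = _worker_env.step(action)
        R += reward
        H += 1
    return R, H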
def eval(self, n=None, **kwargs):
    start_time = perf_counter()
    returns = []
    horizons = []
    for _ in range(self.config['eval.num_episode']):
        observation = self.eval_env.reset()
        for _ in range(self.eval_env.spec.max_episode_steps):
            with torch.no_grad():
                action = self.agent.choose_action(observation, mode='eval')['action']
            next_observation, reward, done, info = self.eval_env.step(action)
            if done[0]:  # [0]: single environment inside the vectorized env
                returns.append(info[0]['episode']['return'])
                horizons.append(info[0]['episode']['horizon'])
                break
            observation = next_observation
    logger = Logger()
    logger('num_seconds', round(perf_counter() - start_time, 1))
    logger('accumulated_trained_timesteps', kwargs['accumulated_trained_timesteps'])
    logger('accumulated_trained_episodes', kwargs['accumulated_trained_episodes'])
    logger('online_return', describe(returns, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('online_horizon', describe(horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
    monitor_env = get_wrapper(self.eval_env, 'VecMonitor')
    logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
    return logger.logs
def train(self, n=None, **kwargs):
    self.agent.train()
    t0 = time.perf_counter()
    D = self.runner(self.agent, self.env, self.config['train.timestep_per_iter'])
    out_agent = self.agent.learn(D)
    logger = Logger()
    logger('train_iteration', n + 1)
    logger('num_seconds', round(time.perf_counter() - t0, 1))
    for key, value in out_agent.items():
        logger(key, value)
    logger('num_trajectories', len(D))
    logger('num_timesteps', sum([traj.T for traj in D]))
    logger('accumulated_trained_timesteps', self.agent.total_timestep)
    logger('return', describe([sum(traj.rewards) for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
    E = [traj[-1].info['episode'] for traj in D if 'episode' in traj[-1].info]
    logger('online_return', describe([e['return'] for e in E], axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('online_horizon', describe([e['horizon'] for e in E], axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('running_return', describe(self.env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('running_horizon', describe(self.env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    return logger
def run(config, seed, device, logdir):
    set_global_seeds(seed)
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK
    print('Initializing...')
    agent = Agent(config, make_env(config, seed, 'eval'), device)
    es = OpenAIES([config['train.mu0']]*agent.num_params, config['train.std0'],
                  {'popsize': config['train.popsize'],
                   'seed': seed,
                   'sigma_scheduler_args': config['train.sigma_scheduler_args'],
                   'lr': config['train.lr'],
                   'lr_decay': config['train.lr_decay'],
                   'min_lr': config['train.min_lr'],
                   'antithetic': config['train.antithetic'],
                   'rank_transform': config['train.rank_transform']})
    train_logs = []
    checkpoint_count = 0
    with Pool(processes=config['train.popsize']//config['train.worker_chunksize']) as pool:
        print('Finish initialization. Training starts...')
        for generation in range(config['train.generations']):
            t0 = time.perf_counter()
            solutions = es.ask()
            data = [(config, seed, device, solution) for solution in solutions]
            out = pool.map(CloudpickleWrapper(fitness), data, chunksize=config['train.worker_chunksize'])
            Rs, Hs = zip(*out)
            es.tell(solutions, [-R for R in Rs])
            logger = Logger()
            logger('generation', generation + 1)
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            logger('Returns', describe(Rs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('Horizons', describe(Hs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('fbest', es.result.fbest)
            train_logs.append(logger.logs)
            if generation == 0 or (generation + 1) % config['log.freq'] == 0:
                logger.dump(keys=None, index=0, indent=0, border='-'*50)
            if (generation + 1) >= int(config['train.generations']*(checkpoint_count/(config['checkpoint.num'] - 1))):
                agent.from_vec(tensorify(es.result.xbest, 'cpu'))
                agent.checkpoint(logdir, generation + 1)
                checkpoint_count += 1
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    return None
def eval(self, n=None, **kwargs):
    t0 = time.perf_counter()
    with torch.no_grad():
        D = self.runner(self.agent, self.eval_env, 10, mode='eval')
    logger = Logger()
    logger('eval_iteration', n + 1)
    logger('num_seconds', round(time.perf_counter() - t0, 1))
    logger('accumulated_trained_timesteps', self.agent.total_timestep)
    logger('online_return', describe([sum(traj.rewards) for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('online_horizon', describe([traj.T for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('running_return', describe(self.eval_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('running_horizon', describe(self.eval_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
    return logger.logs
def learn(self, D, **kwargs):
    # D: list of Trajectory
    logprobs = [torch.cat(traj.get_infos('action_logprob')) for traj in D]
    entropies = [torch.cat(traj.get_infos('entropy')) for traj in D]
    Vs = [torch.cat(traj.get_infos('V')) for traj in D]
    last_Vs = [traj.extra_info['last_info']['V'] for traj in D]
    Qs = [bootstrapped_returns(self.config['agent.gamma'], traj.rewards, last_V, traj.reach_terminal)
          for traj, last_V in zip(D, last_Vs)]
    As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj.rewards, V, last_V, traj.reach_terminal)
          for traj, V, last_V in zip(D, Vs, last_Vs)]

    # Metrics -> Tensor, device
    logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(), [logprobs, entropies, Vs])
    Qs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device), [Qs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean())/(As.std() + 1e-4)
    assert all([x.ndim == 1 for x in [logprobs, entropies, Vs, Qs, As]])

    # Loss
    policy_loss = -logprobs*As.detach()
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, Qs, reduction='none')
    loss = policy_loss + self.config['agent.value_coef']*value_loss + self.config['agent.entropy_coef']*entropy_loss
    loss = loss.mean()
    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
    self.optimizer.step()
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.total_timestep += sum([traj.T for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -out['entropy_loss']
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(Qs, 'float'), y_pred=numpify(Vs, 'float'))
    return out
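# Illustrative re-implementation (not the library code) of the two return
# estimators called above, matching their call signatures
# bootstrapped_returns(gamma, rewards, last_V, reach_terminal) and
# gae(gamma, lam, rewards, Vs, last_V, reach_terminal). The _sketch suffix
# marks these names as assumptions.
import numpy as np

def bootstrapped_returns_sketch(gamma, rewards, last_V, reach_terminal):
    # Discounted returns, bootstrapped with last_V unless the trajectory
    # actually reached a terminal state.
    R = 0.0 if reach_terminal else float(last_V)
    out = []
    for r in reversed(rewards):
        R = r + gamma*R
        out.append(R)
    return np.asarray(out[::-1], dtype=np.float32)

def gae_sketch(gamma, lam, rewards, Vs, last_V, reach_terminal):
    # Generalized Advantage Estimation: discounted, lambda-weighted sum of
    # one-step TD residuals.
    Vs = list(Vs) + [0.0 if reach_terminal else float(last_V)]
    A, out = 0.0, []
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma*Vs[t + 1] - Vs[t]
        A = delta + gamma*lam*A
        out.append(A)
    return np.asarray(out[::-1], dtype=np.float32)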
def train(self, n=None, **kwargs):
    self.agent.train()
    start_time = perf_counter()
    D = self.runner(self.agent, self.env, self.config['train.timestep_per_iter'])
    out_agent = self.agent.learn(D)
    logger = Logger()
    logger('train_iteration', n + 1)
    logger('num_seconds', round(perf_counter() - start_time, 1))
    for key, value in out_agent.items():
        logger(key, value)
    logger('num_trajectories', len(D))
    logger('num_timesteps', sum([len(traj) for traj in D]))
    logger('accumulated_trained_timesteps', self.agent.total_timestep)
    G = [traj.numpy_rewards.sum() for traj in D]
    logger('return', describe(G, axis=-1, repr_indent=1, repr_prefix='\n'))
    infos = [info for info in chain.from_iterable([traj.infos for traj in D]) if 'episode' in info]
    online_returns = [info['episode']['return'] for info in infos]
    online_horizons = [info['episode']['horizon'] for info in infos]
    logger('online_return', describe(online_returns, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('online_horizon', describe(online_horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
    monitor_env = get_wrapper(self.env, 'VecMonitor')
    logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    return logger
def learn(self, D, **kwargs):
    replay = kwargs['replay']
    episode_length = kwargs['episode_length']
    out = {}
    out['actor_loss'] = []
    out['critic_loss'] = []
    Q_vals = []
    for i in range(episode_length):
        observations, actions, rewards, next_observations, masks = replay.sample(self.config['replay.batch_size'])
        # Critic update
        Qs = self.critic(observations, actions).squeeze()
        with torch.no_grad():
            next_Qs = self.critic_target(next_observations, self.actor_target(next_observations)).squeeze()
        targets = rewards + self.config['agent.gamma']*masks*next_Qs.detach()
        critic_loss = F.mse_loss(Qs, targets)
        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        critic_grad_norm = nn.utils.clip_grad_norm_(self.critic.parameters(), self.config['agent.max_grad_norm'])
        self.critic_optimizer.step()

        # Actor update: maximize Q under the current policy
        actor_loss = -self.critic(observations, self.actor(observations)).mean()
        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()
        actor_loss.backward()
        actor_grad_norm = nn.utils.clip_grad_norm_(self.actor.parameters(), self.config['agent.max_grad_norm'])
        self.actor_optimizer.step()

        self.polyak_update_target()
        out['actor_loss'].append(actor_loss)
        out['critic_loss'].append(critic_loss)
        Q_vals.append(Qs)
    out['actor_loss'] = torch.stack(out['actor_loss']).mean().item()
    out['actor_grad_norm'] = actor_grad_norm
    out['critic_loss'] = torch.stack(out['critic_loss']).mean().item()
    out['critic_grad_norm'] = critic_grad_norm
    describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(),
                                     axis=-1, repr_indent=1, repr_prefix='\n')
    out['Q'] = describe_it(Q_vals)
    return out
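# Hedged sketch of the polyak_update_target() call used above and in the
# other off-policy updates below: a soft (Polyak) update of the target
# networks, p_target <- (1 - tau)*p_target + tau*p. The attribute names and
# the 'agent.polyak' config key are assumptions for illustration.
def polyak_update_target_sketch(self):
    tau = self.config['agent.polyak']  # assumed config key for the step size
    for net, target_net in [(self.actor, self.actor_target),
                            (self.critic, self.critic_target)]:
        for p, p_target in zip(net.parameters(), target_net.parameters()):
            p_target.data.mul_(1.0 - tau).add_(tau*p.data)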
def learn(self, D, **kwargs):
    # Compute all metrics, D: list of Trajectory
    Ts = [len(traj) for traj in D]
    behavior_logprobs = [torch.cat(traj.get_all_info('action_logprob')) for traj in D]
    out_agent = self.choose_action(np.concatenate([traj.numpy_observations[:-1] for traj in D], 0))
    logprobs = out_agent['action_logprob'].squeeze()
    entropies = out_agent['entropy'].squeeze()
    Vs = out_agent['V'].squeeze()
    with torch.no_grad():
        last_observations = tensorify(np.concatenate([traj.last_observation for traj in D], 0), self.device)
        last_Vs = self.V_head(self.feature_network(last_observations)).squeeze(-1)
    vs, As = [], []
    for traj, behavior_logprob, logprob, V, last_V in zip(D, behavior_logprobs,
                                                          logprobs.detach().cpu().split(Ts),
                                                          Vs.detach().cpu().split(Ts),
                                                          last_Vs):
        v, A = vtrace(behavior_logprob, logprob, self.gamma, traj.rewards, V, last_V,
                      traj.reach_terminal, self.clip_rho, self.clip_pg_rho)
        vs.append(v)
        As.append(A)

    # Metrics -> Tensor, device
    vs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device), [vs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean())/(As.std() + 1e-8)
    assert all([x.ndimension() == 1 for x in [logprobs, entropies, Vs, vs, As]])

    # Loss
    policy_loss = -logprobs*As
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, vs, reduction='none')
    loss = policy_loss + self.config['agent.value_coef']*value_loss + self.config['agent.entropy_coef']*entropy_loss
    loss = loss.mean()
    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
    self.optimizer.step()
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.total_timestep += sum([len(traj) for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -entropy_loss.mean().item()
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(vs, 'float'), y_pred=numpify(Vs, 'float'))
    return out
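# Illustrative numpy sketch of the vtrace(...) targets computed above,
# following Espeholt et al. (2018). It mirrors the call-site signature but is
# not the library implementation; the truncation level c-bar = 1 and the
# _sketch name are assumptions.
import numpy as np

def vtrace_sketch(behavior_logprob, logprob, gamma, rewards, Vs, last_V,
                  reach_terminal, clip_rho=1.0, clip_pg_rho=1.0):
    rewards = np.asarray(rewards, dtype=np.float32)
    Vs = np.asarray(Vs, dtype=np.float32)
    rho = np.minimum(clip_rho, np.exp(np.asarray(logprob) - np.asarray(behavior_logprob)))
    c = np.minimum(1.0, rho)
    bootstrap = 0.0 if reach_terminal else float(last_V)
    next_Vs = np.append(Vs[1:], bootstrap)
    deltas = rho*(rewards + gamma*next_Vs - Vs)
    # Backward recursion: vs_t - V_t = delta_t + gamma*c_t*(vs_{t+1} - V_{t+1})
    vs_minus_V = np.zeros_like(Vs)
    acc = 0.0
    for t in reversed(range(len(Vs))):
        acc = deltas[t] + gamma*c[t]*acc
        vs_minus_V[t] = acc
    vs = Vs + vs_minus_V
    next_vs = np.append(vs[1:], bootstrap)
    pg_rho = np.minimum(clip_pg_rho, rho)
    As = pg_rho*(rewards + gamma*next_vs - Vs)  # policy-gradient advantages
    return vs, As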
def learn(self, D, **kwargs):
    replay = kwargs['replay']
    episode_length = kwargs['episode_length']
    out = {}
    out['actor_loss'] = []
    out['critic_loss'] = []
    out['alpha_loss'] = []
    Q1_vals = []
    Q2_vals = []
    logprob_vals = []
    for i in range(episode_length):
        observations, actions, rewards, next_observations, masks = replay.sample(self.config['replay.batch_size'])
        # Critic update with clipped double-Q target
        Qs1, Qs2 = self.critic(observations, actions)
        with torch.no_grad():
            _, policy_action, log_pi = self.actor(next_observations)
            next_Qs1, next_Qs2 = self.critic_target(next_observations, policy_action)
            next_Qs = torch.min(next_Qs1, next_Qs2) - self.alpha.detach()*log_pi
            Q_targets = rewards.unsqueeze(-1) + self.config['agent.gamma']*masks.unsqueeze(-1)*next_Qs
        critic_loss = F.mse_loss(Qs1, Q_targets) + F.mse_loss(Qs2, Q_targets)
        self.optimizer_zero_grad()
        critic_loss.backward()
        critic_grad_norm = nn.utils.clip_grad_norm_(self.critic.parameters(), self.config['agent.max_grad_norm'])
        self.critic_optimizer.step()

        if i % self.config['agent.policy_delay'] == 0:
            # Delayed actor update
            _, pi, log_pi = self.actor(observations)
            actor_Qs1, actor_Qs2 = self.critic(observations, pi)
            actor_Qs = torch.min(actor_Qs1, actor_Qs2)
            actor_loss = (self.alpha.detach()*log_pi - actor_Qs).mean()
            self.optimizer_zero_grad()
            actor_loss.backward()
            actor_grad_norm = nn.utils.clip_grad_norm_(self.actor.parameters(), self.config['agent.max_grad_norm'])
            self.actor_optimizer.step()

            # Temperature (alpha) update
            alpha_loss = torch.mean(self.alpha*(-log_pi - self.target_entropy).detach())
            self.optimizer_zero_grad()
            alpha_loss.backward()
            self.log_alpha_optimizer.step()

            self.polyak_update_target()
        out['actor_loss'].append(actor_loss)
        out['alpha_loss'].append(alpha_loss)
        out['critic_loss'].append(critic_loss)
        Q1_vals.append(Qs1)
        Q2_vals.append(Qs2)
        logprob_vals.append(log_pi)
    out['actor_loss'] = torch.tensor(out['actor_loss']).mean().item()
    out['actor_grad_norm'] = actor_grad_norm
    out['critic_loss'] = torch.tensor(out['critic_loss']).mean().item()
    out['critic_grad_norm'] = critic_grad_norm
    describe_it = lambda x: describe(torch.cat(x).detach().cpu().numpy().squeeze(),
                                     axis=-1, repr_indent=1, repr_prefix='\n')
    out['Q1'] = describe_it(Q1_vals)
    out['Q2'] = describe_it(Q2_vals)
    out['logprob'] = describe_it(logprob_vals)
    out['alpha_loss'] = torch.tensor(out['alpha_loss']).mean().item()
    out['alpha'] = self.alpha.item()
    return out
def learn(self, D, **kwargs):
    replay = kwargs['replay']
    episode_length = kwargs['episode_length']
    out = {}
    out['actor_loss'] = []
    out['critic_loss'] = []
    out['alpha_loss'] = []
    Q1_vals = []
    Q2_vals = []
    logprob_vals = []
    for i in range(episode_length):
        observations, actions, rewards, next_observations, masks = replay.sample(self.config['replay.batch_size'])
        Qs1, Qs2 = self.critic(observations, actions)
        Qs1, Qs2 = map(lambda x: x.squeeze(-1), [Qs1, Qs2])
        with torch.no_grad():
            out_actor = self.choose_action(next_observations, mode='train')
            next_actions = out_actor['action']
            next_actions_logprob = out_actor['action_logprob']
            next_Qs1, next_Qs2 = self.critic_target(next_observations, next_actions)
            next_Qs = torch.min(next_Qs1, next_Qs2).squeeze(-1) - self.alpha.detach()*next_actions_logprob
        Q_targets = rewards + self.config['agent.gamma']*masks*next_Qs
        critic_loss = F.mse_loss(Qs1, Q_targets.detach()) + F.mse_loss(Qs2, Q_targets.detach())
        self.optimizer_zero_grad()
        critic_loss.backward()
        critic_grad_norm = nn.utils.clip_grad_norm_(self.critic.parameters(), self.config['agent.max_grad_norm'])
        self.critic_optimizer.step()

        if i % self.config['agent.policy_delay'] == 0:
            out_actor = self.choose_action(observations, mode='train')
            policy_actions = out_actor['action']
            policy_actions_logprob = out_actor['action_logprob']
            actor_Qs1, actor_Qs2 = self.critic(observations, policy_actions)
            actor_Qs = torch.min(actor_Qs1, actor_Qs2).squeeze(-1)
            actor_loss = torch.mean(self.alpha.detach()*policy_actions_logprob - actor_Qs)
            self.optimizer_zero_grad()
            actor_loss.backward()
            actor_grad_norm = nn.utils.clip_grad_norm_(self.actor.parameters(), self.config['agent.max_grad_norm'])
            self.actor_optimizer.step()

            alpha_loss = torch.mean(self.log_alpha*(-policy_actions_logprob - self.target_entropy).detach())
            self.optimizer_zero_grad()
            alpha_loss.backward()
            self.log_alpha_optimizer.step()

            self.polyak_update_target()
        out['actor_loss'].append(actor_loss)
        out['alpha_loss'].append(alpha_loss)
        out['critic_loss'].append(critic_loss)
        Q1_vals.append(Qs1)
        Q2_vals.append(Qs2)
        logprob_vals.append(policy_actions_logprob)
    out['actor_loss'] = torch.tensor(out['actor_loss']).mean().item()
    out['actor_grad_norm'] = actor_grad_norm
    out['critic_loss'] = torch.tensor(out['critic_loss']).mean().item()
    out['critic_grad_norm'] = critic_grad_norm
    describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(),
                                     axis=-1, repr_indent=1, repr_prefix='\n')
    out['Q1'] = describe_it(Q1_vals)
    out['Q2'] = describe_it(Q2_vals)
    out['logprob'] = describe_it(logprob_vals)
    out['alpha_loss'] = torch.tensor(out['alpha_loss']).mean().item()
    out['alpha'] = self.alpha.item()
    return out
def learn(self, D, **kwargs):
    replay = kwargs['replay']
    T = kwargs['T']
    list_actor_loss = []
    list_critic_loss = []
    list_alpha_loss = []
    Q1_vals = []
    Q2_vals = []
    logprob_vals = []
    for i in range(T):
        observations, actions, rewards, next_observations, masks = replay.sample(self.config['replay.batch_size'])
        Qs1, Qs2 = self.critic(observations, actions)
        with torch.no_grad():
            action_dist = self.actor(next_observations)
            next_actions = action_dist.rsample()
            next_actions_logprob = action_dist.log_prob(next_actions).unsqueeze(-1)
            next_Qs1, next_Qs2 = self.critic_target(next_observations, next_actions)
            next_Qs = torch.min(next_Qs1, next_Qs2) - self.alpha.detach()*next_actions_logprob
            targets = rewards + self.config['agent.gamma']*masks*next_Qs
        critic_loss = F.mse_loss(Qs1, targets.detach()) + F.mse_loss(Qs2, targets.detach())
        self.optimizer_zero_grad()
        critic_loss.backward()
        critic_grad_norm = nn.utils.clip_grad_norm_(self.critic.parameters(), self.config['agent.max_grad_norm'])
        self.critic_optimizer.step()

        action_dist = self.actor(observations)
        policy_actions = action_dist.rsample()
        policy_actions_logprob = action_dist.log_prob(policy_actions).unsqueeze(-1)
        actor_Qs1, actor_Qs2 = self.critic(observations, policy_actions)
        actor_Qs = torch.min(actor_Qs1, actor_Qs2)
        actor_loss = torch.mean(self.alpha.detach()*policy_actions_logprob - actor_Qs)
        self.optimizer_zero_grad()
        actor_loss.backward()
        actor_grad_norm = nn.utils.clip_grad_norm_(self.actor.parameters(), self.config['agent.max_grad_norm'])
        self.actor_optimizer.step()

        alpha_loss = torch.mean(self.log_alpha*(-policy_actions_logprob - self.target_entropy).detach())
        self.optimizer_zero_grad()
        alpha_loss.backward()
        self.log_alpha_optimizer.step()

        self.polyak_update_target()
        list_actor_loss.append(actor_loss)
        list_critic_loss.append(critic_loss)
        list_alpha_loss.append(alpha_loss)
        Q1_vals.append(Qs1)
        Q2_vals.append(Qs2)
        logprob_vals.append(policy_actions_logprob)
    self.total_timestep += T

    out = {}
    out['actor_loss'] = torch.tensor(list_actor_loss).mean(0).item()
    out['actor_grad_norm'] = actor_grad_norm
    out['critic_loss'] = torch.tensor(list_critic_loss).mean(0).item()
    out['critic_grad_norm'] = critic_grad_norm
    describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(),
                                     axis=-1, repr_indent=1, repr_prefix='\n')
    out['Q1'] = describe_it(Q1_vals)
    out['Q2'] = describe_it(Q2_vals)
    out['logprob'] = describe_it(logprob_vals)
    out['alpha_loss'] = torch.tensor(list_alpha_loss).mean(0).item()
    out['alpha'] = self.alpha.item()
    return out
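# Hedged sketch (illustrative only) of the entropy-temperature machinery the
# SAC updates above rely on: a learnable log_alpha with alpha = exp(log_alpha),
# its own optimizer, and the common target entropy of -|A|. The class name,
# constructor arguments, and learning rate below are assumptions.
import torch

class EntropyTemperature:
    def __init__(self, action_dim, device, lr=3e-4):
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=lr)
        self.target_entropy = -float(action_dim)  # heuristic from the SAC paper

    @property
    def alpha(self):
        # Exponentiate so alpha stays positive; callers detach() it where a
        # fixed coefficient is needed and keep the graph for the alpha loss.
        return self.log_alpha.exp()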
def learn(self, D, **kwargs):
    replay = kwargs['replay']
    T = kwargs['T']
    list_actor_loss = []
    list_critic_loss = []
    Q1_vals = []
    Q2_vals = []
    for i in range(T):
        observations, actions, rewards, next_observations, masks = replay.sample(self.config['replay.batch_size'])
        Qs1, Qs2 = self.critic(observations, actions)
        with torch.no_grad():
            # Target policy smoothing: add clipped noise to the target action
            next_actions = self.actor_target(next_observations)
            eps = torch.empty_like(next_actions).normal_(0.0, self.config['agent.target_noise'])
            eps = eps.clamp(-self.config['agent.target_noise_clip'], self.config['agent.target_noise_clip'])
            next_actions = torch.clamp(next_actions + eps, -self.max_action, self.max_action)
            next_Qs1, next_Qs2 = self.critic_target(next_observations, next_actions)
            next_Qs = torch.min(next_Qs1, next_Qs2)  # clipped double-Q target
        targets = rewards + self.config['agent.gamma']*masks*next_Qs
        critic_loss = F.mse_loss(Qs1, targets.detach()) + F.mse_loss(Qs2, targets.detach())
        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        critic_grad_norm = nn.utils.clip_grad_norm_(self.critic.parameters(), self.config['agent.max_grad_norm'])
        self.critic_optimizer.step()

        if i % self.config['agent.policy_delay'] == 0:
            # Delayed policy update, using only the first Q-head
            actor_loss = -self.critic.Q1(observations, self.actor(observations)).mean()
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            actor_loss.backward()
            actor_grad_norm = nn.utils.clip_grad_norm_(self.actor.parameters(), self.config['agent.max_grad_norm'])
            self.actor_optimizer.step()
            self.polyak_update_target()
        list_actor_loss.append(actor_loss)
        list_critic_loss.append(critic_loss)
        Q1_vals.append(Qs1)
        Q2_vals.append(Qs2)
    self.total_timestep += T

    out = {}
    out['actor_loss'] = torch.tensor(list_actor_loss).mean(0).item()
    out['actor_grad_norm'] = actor_grad_norm
    out['critic_loss'] = torch.tensor(list_critic_loss).mean(0).item()
    out['critic_grad_norm'] = critic_grad_norm
    describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(),
                                     axis=-1, repr_indent=1, repr_prefix='\n')
    out['Q1'] = describe_it(Q1_vals)
    out['Q2'] = describe_it(Q2_vals)
    return out
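# Hedged sketch of a twin-Q critic exposing the Q1(...) helper used by the
# delayed actor update above; the class name, hidden width, and layer sizes
# are illustrative assumptions.
import torch
import torch.nn as nn

class TwinCritic(nn.Module):
    def __init__(self, obs_dim, action_dim, hidden=256):
        super().__init__()
        self.q1 = nn.Sequential(nn.Linear(obs_dim + action_dim, hidden), nn.ReLU(),
                                nn.Linear(hidden, hidden), nn.ReLU(),
                                nn.Linear(hidden, 1))
        self.q2 = nn.Sequential(nn.Linear(obs_dim + action_dim, hidden), nn.ReLU(),
                                nn.Linear(hidden, hidden), nn.ReLU(),
                                nn.Linear(hidden, 1))

    def forward(self, observations, actions):
        x = torch.cat([observations, actions], dim=-1)
        return self.q1(x), self.q2(x)

    def Q1(self, observations, actions):
        # Only the first head; used for the delayed actor update in TD3.
        return self.q1(torch.cat([observations, actions], dim=-1))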
def learn(self, D, **kwargs):
    # Compute all metrics, D: list of Trajectory
    logprobs = [torch.cat(traj.get_all_info('action_logprob')) for traj in D]
    entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
    Vs = [torch.cat(traj.get_all_info('V')) for traj in D]
    with torch.no_grad():
        last_observations = tensorify(np.concatenate([traj.last_observation for traj in D], 0), self.device)
        last_Vs = self.V_head(self.feature_network(last_observations)).squeeze(-1)
    Qs = [bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
          for traj, last_V in zip(D, last_Vs)]
    As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj, V, last_V)
          for traj, V, last_V in zip(D, Vs, last_Vs)]

    # Metrics -> Tensor, device
    logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(), [logprobs, entropies, Vs])
    Qs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device), [Qs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean())/(As.std() + 1e-8)
    assert all([x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

    # Loss
    policy_loss = -logprobs*As
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, Qs, reduction='none')
    loss = policy_loss + self.config['agent.value_coef']*value_loss + self.config['agent.entropy_coef']*entropy_loss
    loss = loss.mean()
    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.optimizer.step()
    self.total_timestep += sum([len(traj) for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -entropy_loss.mean().item()
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(Qs, 'float'), y_pred=numpify(Vs, 'float'))
    return out
def evaluator(config, logdir, seed, make_env, learner_agent):
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK
    eval_logs = []
    env = make_env(config, seed, 'train')
    agent = Agent(config, env, torch.device('cpu'))
    runner = EpisodeRunner(reset_on_call=True)
    evaluated_steps = config['eval.freq']
    while learner_agent.total_timestep < config['train.timestep']:
        if learner_agent.total_timestep < evaluated_steps:
            time.sleep(1.0)
        else:
            t0 = time.perf_counter()
            agent.load_state_dict(learner_agent.state_dict())  # copy to CPU by default
            with torch.no_grad():
                D = []
                for _ in range(config['eval.num_episode']):
                    D += runner(agent, env, env.spec.max_episode_steps)
            logger = Logger()
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            logger('num_trajectories', len(D))
            logger('num_timesteps', sum([len(traj) for traj in D]))
            logger('accumulated_trained_timesteps', learner_agent.total_timestep)
            infos = [info for info in chain.from_iterable([traj.infos for traj in D]) if 'episode' in info]
            online_returns = [info['episode']['return'] for info in infos]
            online_horizons = [info['episode']['horizon'] for info in infos]
            logger('online_return', describe(online_returns, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('online_horizon', describe(online_horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
            monitor_env = get_wrapper(env, 'VecMonitor')
            logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
            eval_logs.append(logger.logs)
            evaluated_steps += config['eval.freq']
    pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')