def train(self, n=None, **kwargs):
    self.model.train()
    logger = Logger()
    for i, (data, label) in enumerate(self.train_loader):
        start_time = perf_counter()
        data = data.to(self.model.device)
        re_x, mu, logvar = self.model(data)
        out = vae_loss(re_x, data, mu, logvar, 'BCE')
        loss = out['loss']
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        logger('epoch', n)
        self.model.total_iter += 1
        logger('iteration', self.model.total_iter)
        logger('mini-batch', i)
        logger('train_loss', out['loss'].item())
        logger('reconstruction_loss', out['re_loss'].item())
        logger('KL_loss', out['KL_loss'].item())
        logger('num_seconds', round(perf_counter() - start_time, 1))
        if i == 0 or (i + 1) % self.config['log.freq'] == 0:
            logger.dump(keys=None, index=-1, indent=0, border='-'*50)
    mean_loss = np.mean(logger.logs['train_loss'])
    print(f'====> Average loss: {mean_loss}')
    # Use decoder to sample images from standard Gaussian noise
    with torch.no_grad():  # fast, disable grad
        z = torch.randn(64, self.config['nn.z_dim']).to(self.model.device)
        re_x = self.model.decode(z).cpu()
        save_image(re_x.view(64, 1, 28, 28), f'{kwargs["logdir"]}/sample_{n}.png')
    return logger

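# The call to vae_loss above assumes a helper combining a reconstruction term with
# the analytic KL divergence between the diagonal Gaussian posterior and a standard
# normal prior. The snippet does not show that helper, so the sketch below is only an
# illustrative assumption (the name vae_loss_sketch is hypothetical), not the
# library's implementation.
import torch
import torch.nn.functional as F

def vae_loss_sketch(re_x, x, mu, logvar, loss_type='BCE'):
    """Illustrative VAE loss: reconstruction + KL(q(z|x) || N(0, I))."""
    if loss_type == 'BCE':
        # Assumes re_x and x are in [0, 1], e.g. Bernoulli pixel outputs
        re_loss = F.binary_cross_entropy(re_x, x.view_as(re_x), reduction='sum')
    else:
        re_loss = F.mse_loss(re_x, x.view_as(re_x), reduction='sum')
    # Analytic KL term: -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    KL_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return {'loss': re_loss + KL_loss, 're_loss': re_loss, 'KL_loss': KL_loss}
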
def log_eval(self, eval_output, **kwargs):
    # Create evaluation logger
    logger = Logger(name='eval_logger')
    # Unpack evaluation for logging
    D = eval_output['D']
    n = eval_output['n']
    T = eval_output['T']
    # Loggings: use item() to save memory
    # Log something about trajectories
    batch_returns = [sum(trajectory.all_r) for trajectory in D]
    batch_T = [trajectory.T for trajectory in D]
    logger.log('evaluation_iteration', n + 1)
    logger.log('num_trajectories', len(D))
    logger.log('max_allowed_horizon', T)
    logger.log('average_horizon', np.mean(batch_T))
    logger.log('num_timesteps', np.sum(batch_T))
    logger.log('accumulated_trained_timesteps', self.agent.total_T)
    logger.log('average_return', np.mean(batch_returns))
    logger.log('std_return', np.std(batch_returns))
    logger.log('min_return', np.min(batch_returns))
    logger.log('max_return', np.max(batch_returns))
    # Dump loggings
    if n == 0 or (n + 1) % self.config['log.print_interval'] == 0:
        print(color_str('+' * 50, 'yellow', 'bold'))
        logger.dump(keys=None, index=None, indent=0)
        print(color_str('+' * 50, 'yellow', 'bold'))
    return logger.logs

def run(config, seed, device, logdir):
    set_global_seeds(seed)
    print('Initializing...')
    agent = Agent(config, make_env(config, seed), device)
    es = CMAES([config['train.mu0']]*agent.num_params, config['train.std0'],
               {'popsize': config['train.popsize'], 'seed': seed})
    train_logs = []
    checkpoint_count = 0
    with ProcessPoolExecutor(max_workers=config['train.popsize'],
                             initializer=initializer,
                             initargs=(config, seed, device)) as executor:
        print('Finished initialization. Training starts...')
        for generation in range(config['train.generations']):
            start_time = time.perf_counter()
            solutions = es.ask()
            out = list(executor.map(fitness, solutions, chunksize=2))
            Rs, Hs = zip(*out)
            es.tell(solutions, [-R for R in Rs])  # ES minimizes, so negate returns
            logger = Logger()
            logger('generation', generation+1)
            logger('num_seconds', round(time.perf_counter() - start_time, 1))
            logger('Returns', describe(Rs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('Horizons', describe(Hs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('fbest', es.result.fbest)
            train_logs.append(logger.logs)
            if generation == 0 or (generation+1) % config['log.freq'] == 0:
                logger.dump(keys=None, index=0, indent=0, border='-'*50)
            if (generation+1) >= int(config['train.generations']*(checkpoint_count/(config['checkpoint.num'] - 1))):
                agent.from_vec(tensorify(es.result.xbest, 'cpu'))
                agent.checkpoint(logdir, generation+1)
                checkpoint_count += 1
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    return None

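# The loop above relies on ProcessPoolExecutor's initializer/initargs hook plus a
# module-level fitness(solution) -> (return, horizon) function. The self-contained toy
# below only sketches that pattern: the quadratic "fitness" and the _worker_state dict
# are hypothetical stand-ins for the real per-worker env/agent setup and rollout.
from concurrent.futures import ProcessPoolExecutor

_worker_state = {}

def initializer(config, seed, device):
    # Runs once per worker process; expensive setup (env, agent) would go here.
    _worker_state['target'] = config.get('target', 1.0)

def fitness(solution):
    # Evaluate one candidate parameter vector; return (return, horizon).
    R = -sum((x - _worker_state['target'])**2 for x in solution)
    H = len(solution)
    return R, H

if __name__ == '__main__':
    toy_config = {'target': 1.0}
    with ProcessPoolExecutor(max_workers=2, initializer=initializer,
                             initargs=(toy_config, 0, 'cpu')) as executor:
        out = list(executor.map(fitness, [[0.5, 0.5], [1.0, 1.0]], chunksize=2))
        Rs, Hs = zip(*out)
        print(Rs, Hs)
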
def eval(self, n=None, **kwargs):
    start_time = perf_counter()
    returns = []
    horizons = []
    for _ in range(self.config['eval.num_episode']):
        observation = self.eval_env.reset()
        for _ in range(self.eval_env.spec.max_episode_steps):
            with torch.no_grad():
                action = self.agent.choose_action(observation, mode='eval')['action']
            next_observation, reward, done, info = self.eval_env.step(action)
            if done[0]:  # [0] single environment
                returns.append(info[0]['episode']['return'])
                horizons.append(info[0]['episode']['horizon'])
                break
            observation = next_observation
    logger = Logger()
    logger('num_seconds', round(perf_counter() - start_time, 1))
    logger('accumulated_trained_timesteps', kwargs['accumulated_trained_timesteps'])
    logger('accumulated_trained_episodes', kwargs['accumulated_trained_episodes'])
    logger('online_return', describe(returns, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('online_horizon', describe(horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
    monitor_env = get_wrapper(self.eval_env, 'VecMonitor')
    logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
    return logger.logs

def train(self, n=None, **kwargs):
    train_logs, eval_logs = [], []
    checkpoint_count = 0
    for iteration in count():
        if self.agent.total_timestep >= self.config['train.timestep']:
            break
        t0 = time.perf_counter()
        if iteration < self.config['replay.init_trial']:
            [traj] = self.runner(self.random_agent, self.env, 1)
        else:
            [traj] = self.runner(self.agent, self.env, 1, mode='train')
        self.replay.add(traj)
        # Number of gradient updates = collected episode length
        out_agent = self.agent.learn(D=None, replay=self.replay, T=traj.T)
        logger = Logger()
        logger('train_iteration', iteration+1)
        logger('num_seconds', round(time.perf_counter() - t0, 1))
        for key, value in out_agent.items():
            logger(key, value)
        logger('episode_return', sum(traj.rewards))
        logger('episode_horizon', traj.T)
        logger('accumulated_trained_timesteps', self.agent.total_timestep)
        train_logs.append(logger.logs)
        if iteration == 0 or (iteration+1) % self.config['log.freq'] == 0:
            logger.dump(keys=None, index=0, indent=0, border='-'*50)
        if self.agent.total_timestep >= int(self.config['train.timestep']*(checkpoint_count/(self.config['checkpoint.num'] - 1))):
            self.agent.checkpoint(self.logdir, iteration + 1)
            checkpoint_count += 1
        if self.agent.total_timestep >= int(self.config['train.timestep']*(len(eval_logs)/(self.config['eval.num'] - 1))):
            eval_logs.append(self.eval(n=len(eval_logs)))
    return train_logs, eval_logs

def log_eval(self, eval_output, **kwargs):
    D = eval_output['D']
    n = eval_output['n']
    T = eval_output['T']
    num_sec = eval_output['num_sec']
    logger = Logger()
    batch_returns = D.numpy_rewards.sum(1)
    logger('evaluation_iteration', n + 1)
    logger('num_seconds', round(num_sec, 1))
    logger('num_trajectories', D.N)
    logger('max_allowed_horizon', T)
    logger('mean_horizon', D.Ts.mean())
    logger('total_timesteps', D.total_T)
    logger('accumulated_trained_timesteps', self.agent.total_T)
    logger('mean_return', batch_returns.mean())
    logger('std_return', batch_returns.std())
    logger('min_return', batch_returns.min())
    logger('max_return', batch_returns.max())
    print(color_str('+' * 50, 'yellow', 'bold'))
    logger.dump(keys=None, index=None, indent=0)
    print(color_str('+' * 50, 'yellow', 'bold'))
    return logger.logs

def train(self, n=None):
    self.agent.train()
    logger = Logger()
    for i, (data, label) in enumerate(self.agent.train_loader):
        data = data.to(self.agent.device)
        self.agent.optimizer.zero_grad()
        re_x, mu, logvar = self.agent(data)
        out = self.agent.vae_loss(re_x=re_x, x=data, mu=mu, logvar=logvar, loss_type='BCE')
        loss = out['loss']
        loss.backward()
        self.agent.optimizer.step()
        logger('epoch', n)
        logger('iteration', i)
        logger('train_loss', out['loss'].item())
        logger('reconstruction_loss', out['re_loss'].item())
        logger('KL_loss', out['KL_loss'].item())
        if i == 0 or (i + 1) % self.config['log.interval'] == 0:
            print('-' * 50)
            logger.dump(keys=None, index=-1, indent=0)
            print('-' * 50)
    return logger.logs

def log_train(self, train_output, **kwargs):
    D = train_output['D']
    out_agent = train_output['out_agent']
    n = train_output['n']
    num_sec = train_output['num_sec']
    logger = Logger()
    logger('train_iteration', n + 1)  # starts from 1
    logger('num_seconds', round(num_sec, 1))
    for key, value in out_agent.items():
        logger(key, value)
    batch_returns = D.numpy_rewards.sum(1)
    logger('num_trajectories', D.N)
    logger('num_timesteps', D.total_T)
    logger('accumulated_trained_timesteps', self.agent.total_T)
    logger('mean_return', batch_returns.mean())
    logger('std_return', batch_returns.std())
    logger('min_return', batch_returns.min())
    logger('max_return', batch_returns.max())
    monitor_env = get_wrapper(self.runner.env, 'VecMonitor')
    infos = list(filter(lambda info: 'episode' in info, chain.from_iterable(D.infos)))
    if len(infos) > 0:
        online_returns = np.asarray([info['episode']['return'] for info in infos])
        online_horizons = np.asarray([info['episode']['horizon'] for info in infos])
        logger('online_N', len(infos))
        logger('online_mean_return', online_returns.mean())
        logger('online_std_return', online_returns.std())
        logger('online_min_return', online_returns.min())
        logger('online_max_return', online_returns.max())
        logger('online_mean_horizon', online_horizons.mean())
        logger('online_std_horizon', online_horizons.std())
        logger('online_min_horizon', online_horizons.min())
        logger('online_max_horizon', online_horizons.max())
    running_returns = np.asarray(monitor_env.return_queue)
    running_horizons = np.asarray(monitor_env.horizon_queue)
    if running_returns.size > 0 and running_horizons.size > 0:
        logger('running_queue', [len(monitor_env.return_queue), monitor_env.return_queue.maxlen])
        logger('running_mean_return', running_returns.mean())
        logger('running_std_return', running_returns.std())
        logger('running_min_return', running_returns.min())
        logger('running_max_return', running_returns.max())
        logger('running_mean_horizon', running_horizons.mean())
        logger('running_std_horizon', running_horizons.std())
        logger('running_min_horizon', running_horizons.min())
        logger('running_max_horizon', running_horizons.max())
    print('-' * 50)
    logger.dump(keys=None, index=None, indent=0)
    print('-' * 50)
    return logger.logs

def log_eval(self, eval_output):
    # Create evaluation logger
    logger = Logger(name='eval_logger')
    # Unpack evaluation for logging
    D = eval_output['D']
    n = eval_output['n']
    # Compute some metrics
    batch_returns = [sum(trajectory.all_r) for trajectory in D]
    batch_T = [trajectory.T for trajectory in D]
    # Loggings
    # Use item() for tensor to save memory
    logger.log(key='evaluation_iteration', val=n + 1)
    logger.log(key='num_trajectories', val=len(D))
    logger.log(key='max_allowed_horizon', val=self.config['eval:T'])
    logger.log(key='average_horizon', val=np.mean(batch_T))
    logger.log(key='num_timesteps', val=np.sum(batch_T))
    logger.log(key='accumulated_trained_timesteps', val=self.accumulated_trained_timesteps)
    logger.log(key='average_return', val=np.mean(batch_returns))
    logger.log(key='std_return', val=np.std(batch_returns))
    logger.log(key='min_return', val=np.min(batch_returns))
    logger.log(key='max_return', val=np.max(batch_returns))
    # Dump the loggings
    print('-' * 50)
    logger.dump(keys=None, index=None, indent=0)
    print('-' * 50)
    return logger

def train(self, n=None, **kwargs):
    train_logs = []
    eval_logs = []
    eval_togo = 0
    dump_togo = 0
    num_episode = 0
    checkpoint_count = 0
    observation = self.env.reset()
    for i in count():
        if i >= self.config['train.timestep']:
            break
        if i < self.config['replay.init_size']:
            action = [self.env.action_space.sample()]
        else:
            action = self.agent.choose_action(observation, mode='stochastic')['action']
        next_observation, reward, done, info = self.env.step(action)
        eval_togo += 1
        dump_togo += 1
        if done[0]:  # [0] due to single environment
            start_time = perf_counter()
            # NOTE: requires the latest TimeLimit wrapper
            reach_time_limit = info[0].get('TimeLimit.truncated', False)
            reach_terminal = not reach_time_limit
            self.replay.add(observation[0], action[0], reward[0], info[0]['last_observation'], reach_terminal)
            # Update at the end of the episode: one gradient update per collected time step
            out_agent = self.agent.learn(D=None, replay=self.replay, episode_length=info[0]['episode']['horizon'])
            num_episode += 1
            if i >= int(self.config['train.timestep'] * (checkpoint_count / (self.config['checkpoint.num'] - 1))):
                self.agent.checkpoint(self.logdir, num_episode)
                checkpoint_count += 1
            logger = Logger()
            logger('num_seconds', round(perf_counter() - start_time, 1))
            logger('accumulated_trained_timesteps', i + 1)
            logger('accumulated_trained_episodes', num_episode)
            for key, value in out_agent.items():
                logger(key, value)
            logger('episode_return', info[0]['episode']['return'])
            logger('episode_horizon', info[0]['episode']['horizon'])
            train_logs.append(logger.logs)
            if dump_togo >= self.config['log.freq']:
                dump_togo %= self.config['log.freq']
                logger.dump(keys=None, index=0, indent=0, border='-' * 50)
            if eval_togo >= self.config['eval.freq']:
                eval_togo %= self.config['eval.freq']
                eval_logs.append(self.eval(accumulated_trained_timesteps=(i + 1),
                                           accumulated_trained_episodes=num_episode))
        else:
            self.replay.add(observation[0], action[0], reward[0], next_observation[0], done[0])
        observation = next_observation
    return train_logs, eval_logs

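# The TimeLimit.truncated handling above distinguishes genuine terminal states from
# time-limit truncations, because bootstrapping should only be cut off at true terminals.
# The standard one-step target below (a sketch, not code from this snippet; td_target is
# a hypothetical name) shows why the replay buffer stores reach_terminal rather than the
# raw done flag.
def td_target(reward, next_q_value, terminal, gamma=0.99):
    # terminal is True only for genuine environment terminations,
    # not for TimeLimit truncations, so truncated episodes still bootstrap from s'.
    return reward + gamma * (1.0 - float(terminal)) * next_q_value
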
def log_train(self, train_output):
    # Create training logger
    logger = Logger(name='train_logger')
    # Unpack training output for logging
    D = train_output['D']
    out_agent = train_output['out_agent']
    n = train_output['n']
    # Loggings
    # Use item() for tensor to save memory
    logger.log(key='train_iteration', val=n + 1)  # iteration starts from 1
    if self.config['algo:use_lr_scheduler']:
        logger.log(key='current_lr', val=out_agent['current_lr'])
    logger.log(key='loss', val=out_agent['loss'].item())
    policy_loss = torch.stack(out_agent['batch_policy_loss']).mean().item()
    logger.log(key='policy_loss', val=policy_loss)
    entropy_loss = torch.stack(out_agent['batch_entropy_loss']).mean().item()
    logger.log(key='policy_entropy', val=-entropy_loss)  # negation of entropy loss
    value_loss = torch.stack(out_agent['batch_value_loss']).mean().item()
    logger.log(key='value_loss', val=value_loss)
    # Get some data from trajectory list
    batch_returns = [trajectory.all_returns[0] for trajectory in D]
    batch_discounted_returns = [trajectory.all_discounted_returns[0] for trajectory in D]
    num_timesteps = sum([trajectory.T for trajectory in D])
    # Log more information
    logger.log(key='num_trajectories', val=len(D))
    logger.log(key='num_timesteps', val=num_timesteps)
    logger.log(key='accumulated_trained_timesteps', val=self.accumulated_trained_timesteps)
    logger.log(key='average_return', val=np.mean(batch_returns))
    logger.log(key='average_discounted_return', val=np.mean(batch_discounted_returns))
    logger.log(key='std_return', val=np.std(batch_returns))
    logger.log(key='min_return', val=np.min(batch_returns))
    logger.log(key='max_return', val=np.max(batch_returns))
    # Dump the loggings
    print('-' * 50)
    logger.dump(keys=None, index=None, indent=0)
    print('-' * 50)
    return logger

def run(config, seed, device, logdir):
    set_global_seeds(seed)
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK
    print('Initializing...')
    agent = Agent(config, make_env(config, seed, 'eval'), device)
    es = OpenAIES([config['train.mu0']] * agent.num_params, config['train.std0'],
                  {'popsize': config['train.popsize'],
                   'seed': seed,
                   'sigma_scheduler_args': config['train.sigma_scheduler_args'],
                   'lr': config['train.lr'],
                   'lr_decay': config['train.lr_decay'],
                   'min_lr': config['train.min_lr'],
                   'antithetic': config['train.antithetic'],
                   'rank_transform': config['train.rank_transform']})
    train_logs = []
    checkpoint_count = 0
    with Pool(processes=config['train.popsize'] // config['train.worker_chunksize']) as pool:
        print('Finished initialization. Training starts...')
        for generation in range(config['train.generations']):
            t0 = time.perf_counter()
            solutions = es.ask()
            data = [(config, seed, device, solution) for solution in solutions]
            out = pool.map(CloudpickleWrapper(fitness), data, chunksize=config['train.worker_chunksize'])
            Rs, Hs = zip(*out)
            es.tell(solutions, [-R for R in Rs])  # ES minimizes, so negate returns
            logger = Logger()
            logger('generation', generation + 1)
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            logger('Returns', describe(Rs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('Horizons', describe(Hs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('fbest', es.result.fbest)
            train_logs.append(logger.logs)
            if generation == 0 or (generation + 1) % config['log.freq'] == 0:
                logger.dump(keys=None, index=0, indent=0, border='-' * 50)
            if (generation + 1) >= int(config['train.generations'] * (checkpoint_count / (config['checkpoint.num'] - 1))):
                agent.from_vec(tensorify(es.result.xbest, 'cpu'))
                agent.checkpoint(logdir, generation + 1)
                checkpoint_count += 1
    pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
    return None

class ESMaster(BaseESMaster):
    @property
    def _num_params(self):
        worker = ESWorker()
        worker._prepare(self.config)
        num_params = worker.agent.num_params
        del worker
        return num_params

    def make_es(self, config):
        if self.config['es.algo'] == 'CMAES':
            es = CMAES(mu0=[self.config['es.mu0']] * self._num_params,
                       std0=self.config['es.std0'],
                       popsize=self.config['es.popsize'])
        elif self.config['es.algo'] == 'OpenAIES':
            es = OpenAIES(mu0=[self.config['es.mu0']] * self._num_params,
                          std0=self.config['es.std0'],
                          popsize=self.config['es.popsize'],
                          std_decay=0.999,
                          min_std=0.01,
                          lr=1e-1,
                          lr_decay=0.99,
                          min_lr=1e-3,
                          antithetic=True,
                          rank_transform=True)
        self.logger = Logger()
        return es

    def process_es_result(self, result):
        best_f_val = result['best_f_val']
        best_return = -best_f_val
        self.logger('generation', self.generation + 1)
        self.logger('best_return', best_return)
        if self.generation == 0 or (self.generation + 1) % self.config['log.interval'] == 0:
            print('-' * 50)
            self.logger.dump(keys=None, index=-1, indent=0)
            print('-' * 50)
        # Save the loggings and final parameters
        if (self.generation + 1) == self.config['train.num_iteration']:
            pickle_dump(obj=self.logger.logs, f=self.logdir / 'result', ext='.pkl')
            np.save(self.logdir / 'trained_param', result['best_param'])

def log_train(self, train_output):
    # Create training logger
    logger = Logger(name='train_logger')
    # Unpack training output for logging
    D = train_output['D']
    out_agent = train_output['out_agent']
    n = train_output['n']
    # Loggings
    # Use item() for tensor to save memory
    logger.log(key='train_iteration', val=n + 1)  # iteration starts from 1
    if self.config['algo:use_lr_scheduler']:
        logger.log(key='current_lr', val=out_agent['current_lr'])
    logger.log(key='loss', val=out_agent['loss'].item())
    policy_loss = torch.stack(out_agent['batch_policy_loss']).mean().item()
    logger.log(key='policy_loss', val=policy_loss)
    entropy_loss = torch.stack(out_agent['batch_entropy_loss']).mean().item()
    logger.log(key='policy_entropy', val=-entropy_loss)  # negation of entropy loss
    value_loss = torch.stack(out_agent['batch_value_loss']).mean().item()
    logger.log(key='value_loss', val=value_loss)
    # Get some data from segment list
    all_immediate_reward = [segment.all_r for segment in D]
    num_timesteps = sum([segment.T for segment in D])
    # Log more information
    logger.log(key='num_segments', val=sum([len(segment.split_transitions) for segment in D]))
    logger.log(key='num_timesteps', val=num_timesteps)
    logger.log(key='accumulated_trained_timesteps', val=self.accumulated_trained_timesteps)
    logger.log(key='average_immediate_reward', val=np.mean(all_immediate_reward))
    logger.log(key='std_immediate_reward', val=np.std(all_immediate_reward))
    logger.log(key='min_immediate_reward', val=np.min(all_immediate_reward))
    logger.log(key='max_immediate_reward', val=np.max(all_immediate_reward))
    # Dump the loggings
    print('-' * 50)
    logger.dump(keys=None, index=None, indent=0)
    print('-' * 50)
    return logger

def eval(self, n=None, **kwargs):
    t0 = time.perf_counter()
    with torch.no_grad():
        D = self.runner(self.agent, self.eval_env, 10, mode='eval')
    logger = Logger()
    logger('eval_iteration', n+1)
    logger('num_seconds', round(time.perf_counter() - t0, 1))
    logger('accumulated_trained_timesteps', self.agent.total_timestep)
    logger('online_return', describe([sum(traj.rewards) for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('online_horizon', describe([traj.T for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('running_return', describe(self.eval_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('running_horizon', describe(self.eval_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
    return logger.logs

def log_train(self, train_output, **kwargs):
    logger = Logger()
    D = train_output['D']
    out_agent = train_output['out_agent']
    n = train_output['n']
    logger('train_iteration', n + 1)  # starts from 1
    logger('params', self.agent.policy.state_dict())
    logger('num_segments', D.N)
    logger('num_timesteps', D.total_T)
    logger('accumulated_trained_timesteps', self.agent.total_T)
    print('-' * 50)
    logger.dump(keys=None, index=None, indent=0)
    print('-' * 50)
    return logger.logs

def log_train(self, train_output, **kwargs):
    # Create training logger
    logger = Logger(name='train_logger')
    # Unpack training output for logging
    D = train_output['D']
    out_agent = train_output['out_agent']
    n = train_output['n']
    # Loggings: use item() to save memory
    logger.log('train_iteration', n + 1)  # iteration starts from 1
    if self.config['algo.use_lr_scheduler']:
        logger.log('current_lr', out_agent['current_lr'])
    logger.log('loss', out_agent['loss'])
    logger.log('policy_loss', out_agent['policy_loss'])
    logger.log('policy_entropy', -out_agent['entropy_loss'])  # negated entropy loss is the entropy
    logger.log('value_loss', out_agent['value_loss'])
    # Log something about trajectories
    batch_returns = [sum(trajectory.all_r) for trajectory in D]
    batch_discounted_returns = [trajectory.all_discounted_returns[0] for trajectory in D]
    num_timesteps = sum([trajectory.T for trajectory in D])
    logger.log('num_trajectories', len(D))
    logger.log('num_timesteps', num_timesteps)
    logger.log('accumulated_trained_timesteps', self.agent.total_T)
    logger.log('average_return', np.mean(batch_returns))
    logger.log('average_discounted_return', np.mean(batch_discounted_returns))
    logger.log('std_return', np.std(batch_returns))
    logger.log('min_return', np.min(batch_returns))
    logger.log('max_return', np.max(batch_returns))
    # Dump loggings
    if n == 0 or (n + 1) % self.config['log.print_interval'] == 0:
        print('-' * 50)
        logger.dump(keys=None, index=None, indent=0)
        print('-' * 50)
    return logger.logs

def train(self, n=None):
    self.agent.train()  # set to training mode
    # Create a logger
    train_output = Logger()
    # Iterate over data batches for one epoch
    for i, (data, label) in enumerate(self.train_loader):
        # Put data to device
        data = data.to(self.device)
        # Zero-out gradient buffer
        self.optimizer.zero_grad()
        # Forward pass of data
        re_x, mu, logvar = self.agent(data)
        # Calculate loss
        out = self.agent.calculate_loss(re_x=re_x, x=data, mu=mu, logvar=logvar, loss_type='BCE')
        loss = out['loss']
        # Backward pass to calculate gradients
        loss.backward()
        # Take a gradient step
        self.optimizer.step()
        # Record train output
        train_output.log('epoch', n)
        train_output.log('iteration', i)
        train_output.log('train_loss', out['loss'].item())  # item() saves memory
        train_output.log('reconstruction_loss', out['re_loss'].item())
        train_output.log('KL_loss', out['KL_loss'].item())
        # Dump logging
        if i == 0 or (i + 1) % self.config['log.interval'] == 0:
            print('-' * 50)
            train_output.dump(keys=None, index=-1, indent=0)
            print('-' * 50)
    return train_output.logs

def log_train(self, train_output, **kwargs):
    # Unpack
    D = train_output['D']
    out_agent = train_output['out_agent']
    n = train_output['n']
    # Loggings
    logger = Logger(name='train_logger')
    logger.log('train_iteration', n + 1)  # starts from 1
    if self.config['algo.use_lr_scheduler']:
        logger.log('current_lr', out_agent['current_lr'])
    logger.log('loss', out_agent['loss'])
    logger.log('policy_loss', out_agent['policy_loss'])
    logger.log('policy_entropy', -out_agent['entropy_loss'])  # entropy: negative entropy loss
    logger.log('value_loss', out_agent['value_loss'])
    all_immediate_reward = [segment.all_r for segment in D]
    num_timesteps = sum([segment.T for segment in D])
    logger.log('num_segments', len(D))
    logger.log('num_subsegments', sum([len(segment.trajectories) for segment in D]))
    logger.log('num_timesteps', num_timesteps)
    logger.log('accumulated_trained_timesteps', self.agent.total_T)
    logger.log('average_immediate_reward', np.mean(all_immediate_reward))
    logger.log('std_immediate_reward', np.std(all_immediate_reward))
    logger.log('min_immediate_reward', np.min(all_immediate_reward))
    logger.log('max_immediate_reward', np.max(all_immediate_reward))
    # Dump loggings
    if n == 0 or (n + 1) % self.config['log.print_interval'] == 0:
        print('-' * 50)
        logger.dump(keys=None, index=None, indent=0)
        print('-' * 50)
    return logger.logs

class ESMaster(BaseESMaster):
    def _network_size(self):
        worker = ESWorker()
        tmp_agent = worker.init(seed=0, config=self.config)
        num_params = worker.network.num_params
        del worker, tmp_agent
        return num_params

    def make_es(self, config):
        es = CMAES(mu0=[self.config['es.mu0']]*self._network_size(),
                   std0=self.config['es.std0'],
                   popsize=self.config['es.popsize'])
        self.logger = Logger()
        return es

    def _process_es_result(self, result):
        best_f_val = result['best_f_val']
        best_return = -best_f_val  # negate to get back reward
        # Logging
        self.logger.log('generation', self.generation)
        self.logger.log('best_return', best_return)
        if self.generation == 0 or (self.generation+1) % self.config['log.interval'] == 0:
            print('-'*50)
            self.logger.dump(keys=None, index=-1, indent=0)
            print('-'*50)
        # Save the loggings and final parameters
        if (self.generation+1) == self.num_iteration:
            pickle_dump(obj=self.logger.logs, f=self.logdir/'result', ext='.pkl')
            np.save(self.logdir/'trained_param', result['best_param'])

def test_logger(self):
    logger = Logger(name='logger')
    logger.log('iteration', 1)
    logger.log('learning_rate', 1e-3)
    logger.log('training_loss', 0.12)
    logger.log('evaluation_loss', 0.14)
    logger.log('iteration', 2)
    logger.log('learning_rate', 5e-4)
    logger.log('training_loss', 0.11)
    logger.log('evaluation_loss', 0.13)
    logger.log('iteration', 3)
    logger.log('learning_rate', 1e-4)
    logger.log('training_loss', 0.09)
    logger.log('evaluation_loss', 0.10)

    # Test dump: it only prints to stdout, so there is nothing to assert on
    logger.dump()
    logger.dump(keys=None, index=None, indent=1)
    logger.dump(keys=None, index=None, indent=2)
    logger.dump(keys=['iteration', 'evaluation_loss'], index=None, indent=0)
    logger.dump(keys=None, index=0, indent=0)
    logger.dump(keys=None, index=2, indent=0)
    logger.dump(keys=None, index=[0, 2], indent=0)
    logger.dump(keys=['iteration', 'training_loss'], index=[0, 2], indent=0)

    # Test save function
    file = './test_logger_file'
    logger.save(file=file)
    assert os.path.exists(file)

    # Load file
    logging = Logger.load(file)
    assert len(logging) == 4
    assert 'iteration' in logging
    assert 'learning_rate' in logging
    assert 'training_loss' in logging
    assert 'evaluation_loss' in logging
    assert np.allclose(logging['iteration'], [1, 2, 3])
    assert np.allclose(logging['learning_rate'], [1e-3, 5e-4, 1e-4])
    assert np.allclose(logging['training_loss'], [0.12, 0.11, 0.09])
    assert np.allclose(logging['evaluation_loss'], [0.14, 0.13, 0.1])

    # Delete the temporary logger file
    os.unlink(file)

def test_logger():
    logger = Logger()
    logger('iteration', 1)
    logger('learning_rate', 1e-3)
    logger('train_loss', 0.12)
    logger('eval_loss', 0.14)
    logger('iteration', 2)
    logger('learning_rate', 5e-4)
    logger('train_loss', 0.11)
    logger('eval_loss', 0.13)
    logger('iteration', 3)
    logger('learning_rate', 1e-4)
    logger('train_loss', 0.09)
    logger('eval_loss', 0.10)

    def check(logs):
        assert len(logs) == 4
        assert list(logs.keys()) == ['iteration', 'learning_rate', 'train_loss', 'eval_loss']
        assert logs['iteration'] == [1, 2, 3]
        assert np.allclose(logs['learning_rate'], [1e-3, 5e-4, 1e-4])
        assert np.allclose(logs['train_loss'], [0.12, 0.11, 0.09])
        assert np.allclose(logs['eval_loss'], [0.14, 0.13, 0.10])

    check(logger.logs)

    logger.dump()
    logger.dump(border='-'*50)
    logger.dump(keys=['iteration'])
    logger.dump(keys=['iteration', 'train_loss'])
    logger.dump(index=0)
    logger.dump(index=[1, 2])
    logger.dump(index=0)
    logger.dump(keys=['iteration', 'eval_loss'], index=1)
    logger.dump(keys=['iteration', 'learning_rate'], indent=1)
    logger.dump(keys=['iteration', 'train_loss'], index=[0, 2], indent=1, border='#'*50)

    f = Path('./logger_file')
    logger.save(f)
    f = f.with_suffix('.pkl')
    assert f.exists()
    logs = pickle_load(f)
    check(logs)
    f.unlink()
    assert not f.exists()

    logger.clear()
    assert len(logger.logs) == 0

def evaluator(config, logdir, seed, make_env, learner_agent):
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK
    eval_logs = []
    env = make_env(config, seed, 'train')
    agent = Agent(config, env, torch.device('cpu'))
    runner = EpisodeRunner(reset_on_call=True)
    evaluated_steps = config['eval.freq']
    while learner_agent.total_timestep < config['train.timestep']:
        if learner_agent.total_timestep < evaluated_steps:
            time.sleep(1.0)
        else:
            t0 = time.perf_counter()
            agent.load_state_dict(learner_agent.state_dict())  # copy to CPU by default
            with torch.no_grad():
                D = []
                for _ in range(config['eval.num_episode']):
                    D += runner(agent, env, env.spec.max_episode_steps)
            logger = Logger()
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            logger('num_trajectories', len(D))
            logger('num_timesteps', sum([len(traj) for traj in D]))
            logger('accumulated_trained_timesteps', learner_agent.total_timestep)
            infos = [info for info in chain.from_iterable([traj.infos for traj in D]) if 'episode' in info]
            online_returns = [info['episode']['return'] for info in infos]
            online_horizons = [info['episode']['horizon'] for info in infos]
            logger('online_return', describe(online_returns, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('online_horizon', describe(online_horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
            monitor_env = get_wrapper(env, 'VecMonitor')
            logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger.dump(keys=None, index=0, indent=0, border=color_str('+' * 50, color='green'))
            eval_logs.append(logger.logs)
            evaluated_steps += config['eval.freq']
    pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')