def test_pickle_yaml(self):
    # Create some data
    a = {'one': 1, 'two': [2, 3]}
    b = {'three': 3, 'four': [4, 5]}
    c = [a, b]

    def _check(x):
        assert isinstance(x, list)
        assert len(x) == 2
        assert all([isinstance(i, dict) for i in x])
        assert list(x[0].keys()) == ['one', 'two']
        assert list(x[1].keys()) == ['three', 'four']
        assert list(x[0].values()) == [1, [2, 3]]
        assert list(x[1].values()) == [3, [4, 5]]

    # Pickle
    pickle_dump(c, '.tmp_pickle')
    _check(pickle_load('.tmp_pickle.pkl'))
    # Remove the file
    os.unlink('.tmp_pickle.pkl')

    # Yaml
    yaml_dump(c, '.tmp_yaml')
    _check(yaml_load('.tmp_yaml.yml'))
    # Remove the file
    os.unlink('.tmp_yaml.yml')
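
# The test above assumes dump helpers that append a file extension to the given
# path and load helpers that read the resulting file back. Below is a minimal
# sketch of such helpers, inferred from how the test calls them; it is an
# assumption for illustration, not necessarily the actual implementation in
# this codebase. Note that ``sort_keys=False`` is needed so YAML preserves the
# insertion order of dictionary keys, as the test expects.
import pickle

import yaml


def pickle_dump(obj, f, ext='.pkl'):
    r"""Serialize an object to ``f + ext`` with pickle (sketch)."""
    with open(str(f) + ext, 'wb') as fp:
        pickle.dump(obj, fp)


def pickle_load(f):
    r"""Load a pickled object from the file ``f`` (sketch)."""
    with open(str(f), 'rb') as fp:
        return pickle.load(fp)


def yaml_dump(obj, f, ext='.yml'):
    r"""Serialize an object to ``f + ext`` as YAML (sketch)."""
    with open(str(f) + ext, 'w') as fp:
        yaml.safe_dump(obj, fp, default_flow_style=False, sort_keys=False)


def yaml_load(f):
    r"""Load a YAML file ``f`` into Python objects (sketch)."""
    with open(str(f), 'r') as fp:
        return yaml.safe_load(fp)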
def save_running_average(self, f):
    r"""Save the running averages for observation and reward in a dictionary by pickling.

    It saves the mean and standard deviation of the observation running average
    and the standard deviation of the reward running average.

    A dictionary with keys 'obs_avg' and 'r_avg' will be created. Each key
    contains the sub-keys ['mu', 'sigma'].

    Args:
        f (str): saving path
    """
    # Get running average dictionary
    out = self.running_averages
    # Pickle it
    pickle_dump(obj=out, f=f, ext='.pkl')
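
# Hypothetical usage sketch: how the pickled running averages could be inspected
# or restored later. The dictionary layout follows the docstring above
# ({'obs_avg': {'mu': ..., 'sigma': ...}, 'r_avg': {'mu': ..., 'sigma': ...}});
# the file name and the exact restore workflow are assumptions for illustration.
running_averages = pickle_load('running_avg.pkl')  # hypothetical file name
obs_mu = running_averages['obs_avg']['mu']          # mean of observation running average
obs_sigma = running_averages['obs_avg']['sigma']    # std of observation running average
r_sigma = running_averages['r_avg']['sigma']        # std of reward running average
# These constants can then be passed to VecStandardize (constant_obs_mean/constant_obs_std)
# when re-creating an evaluation environment, as done in the training code further below.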
def save_configs(self, f, method='pickle'):
    r"""Save the list of configurations returned from :meth:`make_configs`.

    Args:
        f (str): file path
        method (str): the method to save the list of configurations.
            Either 'pickle' or 'yaml'.
    """
    assert isinstance(method, str)
    methods = ['pickle', 'yaml']
    assert method in methods, f'expected {methods}, got {method}'

    if method == 'pickle':
        pickle_dump(obj=self.configs, f=f, ext='.pkl')
    elif method == 'yaml':
        yaml_dump(obj=self.configs, f=f, ext='.yml')
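
# Hypothetical usage sketch of `save_configs`: persist the list of configurations
# produced by `make_configs` and read it back for inspection. The `Experiment`
# class name and the file names are assumptions for illustration only.
experiment = Experiment()                              # hypothetical subclass defining make_configs()
experiment.save_configs('configs', method='pickle')    # writes configs.pkl
experiment.save_configs('configs', method='yaml')      # writes configs.yml, human-readable
configs = pickle_load('configs.pkl')                   # list of configuration dictionaries
print(len(configs), 'configurations saved')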
def _process_es_result(self, result):
    best_f_val = result['best_f_val']
    best_return = -best_f_val  # negate to get back the reward (the ES solver minimizes f)

    # Logging
    self.logger.log('generation', self.generation)
    self.logger.log('best_return', best_return)

    if self.generation == 0 or (self.generation + 1) % self.config['log.interval'] == 0:
        print('-'*50)
        self.logger.dump(keys=None, index=-1, indent=0)
        print('-'*50)

    # Save the loggings and final parameters
    if (self.generation + 1) == self.num_iteration:
        pickle_dump(obj=self.logger.logs, f=self.logdir/'result', ext='.pkl')
        np.save(self.logdir/'trained_param', result['best_param'])
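
# Illustrative sketch of the `result` dictionary consumed by `_process_es_result`.
# The field names follow the code above; the concrete values are made up.
# Because the ES solver minimizes a fitness value f, the best return is
# recovered by negating `best_f_val`.
import numpy as np

result = {'best_f_val': -123.4,        # minimized objective (negative episode return)
          'best_param': np.zeros(10)}  # flat vector of the best solution found
best_return = -result['best_f_val']    # 123.4, the (maximized) episode return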
def __call__(self, config, seed, device_str):
    # Set random seeds
    set_global_seeds(seed)
    # Create device
    device = torch.device(device_str)
    # Use log dir for current job (run_experiment)
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    # Make environments (VecEnv) for training and evaluation
    env = make_vec_env(vec_env_class=SerialVecEnv,
                       make_env=make_gym_env,
                       env_id=config['env.id'],
                       num_env=config['train.N'],  # batch size for multiple environments
                       init_seed=seed)
    eval_env = make_vec_env(vec_env_class=SerialVecEnv,
                            make_env=make_gym_env,
                            env_id=config['env.id'],
                            num_env=1,
                            init_seed=seed)
    if config['env.standardize']:  # wrap with VecStandardize for running averages of observations and rewards
        env = VecStandardize(venv=env,
                             use_obs=True,
                             use_reward=True,
                             clip_obs=10.,
                             clip_reward=10.,
                             gamma=0.99,
                             eps=1e-8)
        eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation !!!
                                  use_obs=True,
                                  use_reward=False,  # do not process rewards, no training
                                  clip_obs=env.clip_obs,
                                  clip_reward=env.clip_reward,
                                  gamma=env.gamma,
                                  eps=env.eps,
                                  constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                  constant_obs_std=env.obs_runningavg.sigma)
    env_spec = EnvSpec(env)

    # Create policy
    network = Network(config=config, env_spec=env_spec)
    if env_spec.control_type == 'Discrete':
        policy = CategoricalPolicy(config=config,
                                   network=network,
                                   env_spec=env_spec,
                                   learn_V=True)
    elif env_spec.control_type == 'Continuous':
        policy = GaussianPolicy(config=config,
                                network=network,
                                env_spec=env_spec,
                                learn_V=True,
                                min_std=config['agent.min_std'],
                                std_style=config['agent.std_style'],
                                constant_std=config['agent.constant_std'],
                                std_state_dependent=config['agent.std_state_dependent'],
                                init_std=config['agent.init_std'])
    network = network.to(device)

    # Create optimizer and learning rate scheduler
    optimizer = optim.Adam(policy.network.parameters(), lr=config['algo.lr'])
    if config['algo.use_lr_scheduler']:
        if 'train.iter' in config:  # iteration-based training
            max_epoch = config['train.iter']
        elif 'train.timestep' in config:  # timestep-based training
            max_epoch = config['train.timestep'] + 1  # +1 to avoid 0.0 lr in final iteration
        lambda_f = lambda epoch: 1 - epoch/max_epoch  # decay learning rate for each training epoch
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

    # Create agent
    kwargs = {'device': device}
    if config['algo.use_lr_scheduler']:
        kwargs['lr_scheduler'] = lr_scheduler
    agent = A2CAgent(config=config,
                     policy=policy,
                     optimizer=optimizer,
                     **kwargs)

    # Create runner
    runner = SegmentRunner(agent=agent,
                           env=env,
                           gamma=config['algo.gamma'])
    eval_runner = TrajectoryRunner(agent=agent,
                                   env=eval_env,
                                   gamma=1.0)

    # Create engine
    engine = Engine(agent=agent,
                    runner=runner,
                    config=config,
                    eval_runner=eval_runner)

    # Training and evaluation
    train_logs = []
    eval_logs = []
    for i in count():  # incremental iteration
        if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
            break
        elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
            break

        # Train and evaluate
        train_output = engine.train(n=i)

        # Logging
        if i == 0 or (i+1) % config['log.record_interval'] == 0 or (i+1) % config['log.print_interval'] == 0:
            train_log = engine.log_train(train_output)

            with torch.no_grad():  # disable grad, save memory
                eval_output = engine.eval(n=i)
            eval_log = engine.log_eval(eval_output)

            if i == 0 or (i+1) % config['log.record_interval'] == 0:  # record loggings
                train_logs.append(train_log)
                eval_logs.append(eval_log)

    # Save all loggings
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')

    return None
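
# Hypothetical sketch of how this algorithm's __call__ might be driven: one call
# per (config, seed) pair, each on its own device. The `Algorithm` class name and
# the concrete config values are assumptions for illustration; the config keys
# and the `(config, seed, device_str)` call signature come from the code above.
algorithm = Algorithm()
config = {'log.dir': 'logs', 'ID': 0,
          'env.id': 'CartPole-v1', 'env.standardize': True,
          'train.N': 16, 'train.iter': 1000,
          'algo.lr': 1e-3, 'algo.gamma': 0.99, 'algo.use_lr_scheduler': True,
          'log.record_interval': 10, 'log.print_interval': 10}
for seed in [1, 2, 3]:
    algorithm(config=config, seed=seed, device_str='cpu')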
def __call__(self, config, seed, device_str):
    set_global_seeds(seed)
    device = torch.device(device_str)
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    # Environment related
    env = make_vec_env(vec_env_class=SerialVecEnv,
                       make_env=make_gym_env,
                       env_id=config['env.id'],
                       num_env=config['train.N'],  # batched environment
                       init_seed=seed,
                       rolling=True)
    eval_env = make_vec_env(vec_env_class=SerialVecEnv,
                            make_env=make_gym_env,
                            env_id=config['env.id'],
                            num_env=config['eval.N'],
                            init_seed=seed,
                            rolling=False)
    if config['env.standardize']:  # running averages of observation and reward
        env = VecStandardize(venv=env,
                             use_obs=True,
                             use_reward=False,  # A2C
                             clip_obs=10.,
                             clip_reward=10.,
                             gamma=0.99,
                             eps=1e-8)
        eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation !!!
                                  use_obs=True,
                                  use_reward=False,  # do not process rewards, no training
                                  clip_obs=env.clip_obs,
                                  clip_reward=env.clip_reward,
                                  gamma=env.gamma,
                                  eps=env.eps,
                                  constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                  constant_obs_std=env.obs_runningavg.sigma)
    env_spec = EnvSpec(env)

    # Network and policy
    if config['network.recurrent']:
        network = LSTM(config=config, device=device, env_spec=env_spec)
    else:
        network = Network(config=config, device=device, env_spec=env_spec)
    if env_spec.control_type == 'Discrete':
        policy = CategoricalPolicy(config=config,
                                   network=network,
                                   env_spec=env_spec,
                                   device=device,
                                   learn_V=True)
    elif env_spec.control_type == 'Continuous':
        policy = GaussianPolicy(config=config,
                                network=network,
                                env_spec=env_spec,
                                device=device,
                                learn_V=True,
                                min_std=config['agent.min_std'],
                                std_style=config['agent.std_style'],
                                constant_std=config['agent.constant_std'],
                                std_state_dependent=config['agent.std_state_dependent'],
                                init_std=config['agent.init_std'])

    # Optimizer and learning rate scheduler
    optimizer = optim.Adam(policy.network.parameters(), lr=config['algo.lr'])
    if config['algo.use_lr_scheduler']:
        if 'train.iter' in config:  # iteration-based
            max_epoch = config['train.iter']
        elif 'train.timestep' in config:  # timestep-based
            max_epoch = config['train.timestep'] + 1  # avoid zero lr in final iteration
        lambda_f = lambda epoch: 1 - epoch/max_epoch
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

    # Agent
    kwargs = {'device': device}
    if config['algo.use_lr_scheduler']:
        kwargs['lr_scheduler'] = lr_scheduler
    agent = A2CAgent(config=config,
                     policy=policy,
                     optimizer=optimizer,
                     **kwargs)

    # Runner
    runner = SegmentRunner(agent=agent,
                           env=env,
                           gamma=config['algo.gamma'])
    eval_runner = TrajectoryRunner(agent=agent,
                                   env=eval_env,
                                   gamma=1.0)

    # Engine
    engine = Engine(agent=agent,
                    runner=runner,
                    config=config,
                    eval_runner=eval_runner)

    # Training and evaluation
    train_logs = []
    eval_logs = []
    if config['network.recurrent']:
        rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
    for i in count():
        if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
            break
        elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
            break

        if config['network.recurrent']:
            if isinstance(rnn_states_buffer, list):  # LSTM: [h, c]
                rnn_states_buffer = [buf.detach() for buf in rnn_states_buffer]
            else:
                rnn_states_buffer = rnn_states_buffer.detach()
            agent.policy.rnn_states = rnn_states_buffer

        train_output = engine.train(n=i)

        # Logging
        if i == 0 or (i+1) % config['log.record_interval'] == 0 or (i+1) % config['log.print_interval'] == 0:
            train_log = engine.log_train(train_output)

            if config['network.recurrent']:
                rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner

            with torch.no_grad():  # disable grad, save memory
                eval_output = engine.eval(n=i)
            eval_log = engine.log_eval(eval_output)

            if i == 0 or (i+1) % config['log.record_interval'] == 0:
                train_logs.append(train_log)
                eval_logs.append(eval_log)

    # Save all loggings
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')

    return None
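
# A minimal, self-contained sketch (an assumption, not taken from this codebase)
# of the hidden-state handling pattern used above: carry the LSTM states across
# segments, but detach them before each update so backpropagation is truncated
# at segment boundaries instead of reaching through the entire rollout history.
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=4, hidden_size=8)
h = torch.zeros(1, 1, 8)
c = torch.zeros(1, 1, 8)
for segment in range(3):
    # Detach: keep the state values, drop the graph built by previous segments
    h, c = h.detach(), c.detach()
    x = torch.randn(5, 1, 4)      # a 5-step segment of observations
    out, (h, c) = lstm(x, (h, c))
    loss = out.pow(2).mean()      # stand-in for the actual A2C loss
    loss.backward()               # gradients stop at the detached states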