def init(self, seed, config):
    # Make environment
    # Remember to seed it in each working function!
    self.env = make_vec_env(vec_env_class=SerialVecEnv,
                            make_env=make_gym_env,
                            env_id=config['env.id'],
                            num_env=1,
                            init_seed=seed)
    self.env_spec = EnvSpec(self.env)

    # Make agent
    self.network = Network(config=config, env_spec=self.env_spec)
    if self.env_spec.control_type == 'Discrete':
        self.policy = CategoricalPolicy(config=config,
                                        network=self.network,
                                        env_spec=self.env_spec)
    elif self.env_spec.control_type == 'Continuous':
        self.policy = GaussianPolicy(config=config,
                                     network=self.network,
                                     env_spec=self.env_spec)
    self.agent = Agent(policy=self.policy, config=config)
def init(self, seed, config):
    # Make environment
    # Remember to seed it in each working function!
    self.env = make_vec_env(vec_env_class=SerialVecEnv,
                            make_env=make_gym_env,
                            env_id=config['env.id'],
                            num_env=config['train.N'],
                            init_seed=seed,
                            rolling=False)
    self.env_spec = EnvSpec(self.env)

    # Make agent
    if config['network.recurrent']:
        self.network = LSTM(config=config, env_spec=self.env_spec)
    else:
        self.network = Network(config=config, env_spec=self.env_spec)
    if self.env_spec.control_type == 'Discrete':
        self.policy = CategoricalPolicy(config=config,
                                        network=self.network,
                                        env_spec=self.env_spec,
                                        device=None)
    elif self.env_spec.control_type == 'Continuous':
        self.policy = GaussianPolicy(config=config,
                                     network=self.network,
                                     env_spec=self.env_spec,
                                     device=None)
    self.agent = Agent(config=config, policy=self.policy)
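For orientation, a sketch of the flat config dictionary these init variants read from; the keys are exactly the ones looked up above, while the values and the worker instance are illustrative assumptions, not defaults from the source.

# Illustrative only: keys come from the lookups above, values are assumed.
config = {
    'env.id': 'Pendulum-v0',     # assumed Gym environment id
    'train.N': 16,               # number of batched environments
    'network.recurrent': False,  # choose Network vs. LSTM
}
worker.init(seed=0, config=config)  # 'worker': hypothetical instance of the class above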
def __call__(self, config, seed, device_str):
    # Set random seeds
    set_global_seeds(seed)
    # Create device
    device = torch.device(device_str)
    # Use log dir for current job (run_experiment)
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    # Make environment (VecEnv) for training and evaluating
    env = make_vec_env(vec_env_class=SerialVecEnv,
                       make_env=make_gym_env,
                       env_id=config['env.id'],
                       num_env=config['train.N'],  # batch size for multiple environments
                       init_seed=seed)
    eval_env = make_vec_env(vec_env_class=SerialVecEnv,
                            make_env=make_gym_env,
                            env_id=config['env.id'],
                            num_env=1,
                            init_seed=seed)
    if config['env.standardize']:  # wrap with VecStandardize for running averages of observations and rewards
        env = VecStandardize(venv=env,
                             use_obs=True,
                             use_reward=True,
                             clip_obs=10.,
                             clip_reward=10.,
                             gamma=0.99,
                             eps=1e-8)
        eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation!!!
                                  use_obs=True,
                                  use_reward=False,  # do not process rewards, no training
                                  clip_obs=env.clip_obs,
                                  clip_reward=env.clip_reward,
                                  gamma=env.gamma,
                                  eps=env.eps,
                                  constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                  constant_obs_std=env.obs_runningavg.sigma)
    env_spec = EnvSpec(env)

    # Create policy
    network = Network(config=config, env_spec=env_spec)
    if env_spec.control_type == 'Discrete':
        policy = CategoricalPolicy(config=config,
                                   network=network,
                                   env_spec=env_spec,
                                   learn_V=True)
    elif env_spec.control_type == 'Continuous':
        policy = GaussianPolicy(config=config,
                                network=network,
                                env_spec=env_spec,
                                learn_V=True,
                                min_std=config['agent.min_std'],
                                std_style=config['agent.std_style'],
                                constant_std=config['agent.constant_std'],
                                std_state_dependent=config['agent.std_state_dependent'],
                                init_std=config['agent.init_std'])
    network = network.to(device)

    # Create optimizer and learning rate scheduler
    optimizer = optim.Adam(policy.network.parameters(), lr=config['algo.lr'])
    if config['algo.use_lr_scheduler']:
        if 'train.iter' in config:  # iteration-based training
            max_epoch = config['train.iter']
        elif 'train.timestep' in config:  # timestep-based training
            max_epoch = config['train.timestep'] + 1  # +1 to avoid 0.0 lr in final iteration
        lambda_f = lambda epoch: 1 - epoch/max_epoch  # decay learning rate for each training epoch
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

    # Create agent
    kwargs = {'device': device}
    if config['algo.use_lr_scheduler']:
        kwargs['lr_scheduler'] = lr_scheduler
    agent = A2CAgent(config=config, policy=policy, optimizer=optimizer, **kwargs)

    # Create runner
    runner = SegmentRunner(agent=agent, env=env, gamma=config['algo.gamma'])
    eval_runner = TrajectoryRunner(agent=agent, env=eval_env, gamma=1.0)

    # Create engine
    engine = Engine(agent=agent, runner=runner, config=config, eval_runner=eval_runner)

    # Training and evaluation
    train_logs = []
    eval_logs = []
    for i in count():  # incremental iteration
        if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
            break
        elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
            break
        # Train and evaluate
        train_output = engine.train(n=i)
        # Logging
        if i == 0 or (i+1) % config['log.record_interval'] == 0 or (i+1) % config['log.print_interval'] == 0:
            train_log = engine.log_train(train_output)
            with torch.no_grad():  # disable grad, save memory
                eval_output = engine.eval(n=i)
                eval_log = engine.log_eval(eval_output)
            if i == 0 or (i+1) % config['log.record_interval'] == 0:  # record loggings
                train_logs.append(train_log)
                eval_logs.append(eval_log)

    # Save all loggings
    pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
    pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')

    return None
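One detail worth noting in the scheduler setup above: with timestep-based training, max_epoch is train.timestep + 1, so the linear factor 1 - epoch/max_epoch stays strictly positive. A self-contained sketch of the same LambdaLR schedule, using standard PyTorch but toy values of my own choosing:

import torch.nn as nn
import torch.optim as optim

# Toy module and optimizer just to demonstrate the schedule; values are assumed.
model = nn.Linear(4, 2)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

max_epoch = 100 + 1                           # i.e. train.timestep + 1
lambda_f = lambda epoch: 1 - epoch/max_epoch  # linear decay of the lr multiplier
lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

for epoch in range(100):
    optimizer.step()       # parameter update would happen here
    lr_scheduler.step()    # lr becomes 1e-3 * (1 - epoch/max_epoch)
# In the final iteration the factor is 1 - 100/101 = 1/101, never exactly zero.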
def __call__(self, config, seed, device_str):
    set_global_seeds(seed)
    device = torch.device(device_str)
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    # Environment related
    env = make_vec_env(vec_env_class=SerialVecEnv,
                       make_env=make_gym_env,
                       env_id=config['env.id'],
                       num_env=config['train.N'],  # batched environment
                       init_seed=seed,
                       rolling=True)
    eval_env = make_vec_env(vec_env_class=SerialVecEnv,
                            make_env=make_gym_env,
                            env_id=config['env.id'],
                            num_env=config['eval.N'],
                            init_seed=seed,
                            rolling=False)
    if config['env.standardize']:  # running averages of observation and reward
        env = VecStandardize(venv=env,
                             use_obs=True,
                             use_reward=False,  # A2C
                             clip_obs=10.,
                             clip_reward=10.,
                             gamma=0.99,
                             eps=1e-8)
        eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation!!!
                                  use_obs=True,
                                  use_reward=False,  # do not process rewards, no training
                                  clip_obs=env.clip_obs,
                                  clip_reward=env.clip_reward,
                                  gamma=env.gamma,
                                  eps=env.eps,
                                  constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                  constant_obs_std=env.obs_runningavg.sigma)
    env_spec = EnvSpec(env)

    # Network and policy
    if config['network.recurrent']:
        network = LSTM(config=config, device=device, env_spec=env_spec)
    else:
        network = Network(config=config, device=device, env_spec=env_spec)
    if env_spec.control_type == 'Discrete':
        policy = CategoricalPolicy(config=config,
                                   network=network,
                                   env_spec=env_spec,
                                   device=device,
                                   learn_V=True)
    elif env_spec.control_type == 'Continuous':
        policy = GaussianPolicy(config=config,
                                network=network,
                                env_spec=env_spec,
                                device=device,
                                learn_V=True,
                                min_std=config['agent.min_std'],
                                std_style=config['agent.std_style'],
                                constant_std=config['agent.constant_std'],
                                std_state_dependent=config['agent.std_state_dependent'],
                                init_std=config['agent.init_std'])

    # Optimizer and learning rate scheduler
    optimizer = optim.Adam(policy.network.parameters(), lr=config['algo.lr'])
    if config['algo.use_lr_scheduler']:
        if 'train.iter' in config:  # iteration-based
            max_epoch = config['train.iter']
        elif 'train.timestep' in config:  # timestep-based
            max_epoch = config['train.timestep'] + 1  # avoid zero lr in final iteration
        lambda_f = lambda epoch: 1 - epoch/max_epoch
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

    # Agent
    kwargs = {'device': device}
    if config['algo.use_lr_scheduler']:
        kwargs['lr_scheduler'] = lr_scheduler
    agent = A2CAgent(config=config, policy=policy, optimizer=optimizer, **kwargs)

    # Runner
    runner = SegmentRunner(agent=agent, env=env, gamma=config['algo.gamma'])
    eval_runner = TrajectoryRunner(agent=agent, env=eval_env, gamma=1.0)

    # Engine
    engine = Engine(agent=agent, runner=runner, config=config, eval_runner=eval_runner)

    # Training and evaluation
    train_logs = []
    eval_logs = []
    if config['network.recurrent']:
        rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
    for i in count():
        if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
            break
        elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
            break
        if config['network.recurrent']:
            # Detach carried recurrent states: truncate backprop at segment boundaries
            if isinstance(rnn_states_buffer, list):  # LSTM: [h, c]
                rnn_states_buffer = [buf.detach() for buf in rnn_states_buffer]
            else:
                rnn_states_buffer = rnn_states_buffer.detach()
            agent.policy.rnn_states = rnn_states_buffer
        train_output = engine.train(n=i)

        # Logging
        if i == 0 or (i+1) % config['log.record_interval'] == 0 or (i+1) % config['log.print_interval'] == 0:
            train_log = engine.log_train(train_output)
            if config['network.recurrent']:
                rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
            with torch.no_grad():  # disable grad, save memory
                eval_output = engine.eval(n=i)
                eval_log = engine.log_eval(eval_output)
            if i == 0 or (i+1) % config['log.record_interval'] == 0:
                train_logs.append(train_log)
                eval_logs.append(eval_log)

    # Save all loggings
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')

    return None
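The detach-and-restore handling of rnn_states_buffer above implements truncated backpropagation through time across segments. A minimal standalone sketch of the same pattern with a plain nn.LSTM; none of these names are lagom API, and the shapes and loss are toy assumptions:

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=8, hidden_size=16)
h = torch.zeros(1, 4, 16)  # (num_layers, batch, hidden)
c = torch.zeros(1, 4, 16)

for segment in range(5):
    # Detach carried states so gradients do not flow into previous segments
    h, c = h.detach(), c.detach()
    x = torch.randn(10, 4, 8)       # one segment of inputs: (seq, batch, features)
    out, (h, c) = lstm(x, (h, c))
    loss = out.pow(2).mean()        # dummy loss for illustration
    lstm.zero_grad()
    loss.backward()                 # BPTT stops at the segment boundary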
def test_gaussian_policy(self, network_type):
    env_spec = self.make_env_spec()
    device = torch.device('cpu')

    def _create_net(env_spec, device):
        if network_type == 'FC':
            config = {}
            network = Network(config=config, env_spec=env_spec, device=device)
            assert network.num_params == 64
        elif network_type == 'LSTM':
            config = {'network.rnn_size': 16}
            network = LSTM(config=config, env_spec=env_spec, device=device)
        return network

    if network_type == 'FC':
        config = {}
    elif network_type == 'LSTM':
        config = {'network.rnn_size': 16}
    network = _create_net(env_spec, device)

    high = np.unique(env_spec.action_space.high).item()
    low = np.unique(env_spec.action_space.low).item()

    def _check_policy(policy):
        assert hasattr(policy, 'config')
        assert hasattr(policy, 'network')
        assert hasattr(policy, 'env_spec')
        assert hasattr(policy, 'observation_space')
        assert hasattr(policy, 'action_space')
        assert hasattr(policy, 'device')
        assert hasattr(policy, 'recurrent')
        if network_type == 'FC':
            assert not policy.recurrent
        elif network_type == 'LSTM':
            assert policy.recurrent
            rnn_states = policy.rnn_states
            assert isinstance(rnn_states, list) and len(rnn_states) == 2
            h0, c0 = rnn_states
            assert list(h0.shape) == [3, 16] and list(c0.shape) == list(h0.shape)
            assert np.allclose(h0.detach().numpy(), 0.0)
            assert np.allclose(c0.detach().numpy(), 0.0)
        assert hasattr(policy, 'min_std')
        assert hasattr(policy, 'std_style')
        assert hasattr(policy, 'constant_std')
        assert hasattr(policy, 'std_state_dependent')
        assert hasattr(policy, 'init_std')
        if network_type == 'FC':
            assert hasattr(policy.network, 'layers')
            assert len(policy.network.layers) == 1
        elif network_type == 'LSTM':
            assert hasattr(policy.network, 'rnn')
        assert hasattr(policy.network, 'mean_head')
        assert hasattr(policy.network, 'logvar_head')
        assert hasattr(policy.network, 'value_head')
        assert hasattr(policy.network, 'device')
        assert policy.network.mean_head.weight.numel() + policy.network.mean_head.bias.numel() == 17
        assert policy.network.mean_head.weight.abs().min().item() <= 0.01  # 0.01 scale for action head
        assert np.allclose(policy.network.mean_head.bias.detach().numpy(), 0.0)
        assert policy.network.value_head.weight.numel() + policy.network.value_head.bias.numel() == 16 + 1
        assert policy.network.value_head.weight.abs().max().item() >= 0.1  # roughly +/- 0.3 to 0.5
        assert np.allclose(policy.network.value_head.bias.detach().numpy(), 0.0)

        obs = torch.from_numpy(np.array(env_spec.env.reset())).float()
        out_policy = policy(obs,
                            out_keys=['action', 'action_logprob', 'state_value', 'entropy', 'perplexity'],
                            info={})
        if network_type == 'LSTM':
            new_rnn_states = policy.rnn_states
            assert isinstance(new_rnn_states, list) and len(new_rnn_states) == 2
            h_new, c_new = new_rnn_states
            assert list(h_new.shape) == [3, 16] and list(c_new.shape) == [3, 16]
            assert not np.allclose(h_new.detach().numpy(), 0.0)
            assert not np.allclose(c_new.detach().numpy(), 0.0)

            mask = torch.ones(3, 16)*1000
            mask[1].fill_(0.0)
            out_policy = policy(obs,
                                out_keys=['action', 'action_logprob', 'state_value', 'entropy', 'perplexity'],
                                info={'mask': mask})
            c = policy.rnn_states[1]
            assert c[0].max().item() >= 1.0 and c[2].max().item() >= 1.0
            assert c[1].max().item() <= 0.5

        assert isinstance(out_policy, dict)
        assert 'action' in out_policy
        assert list(out_policy['action'].shape) == [3, 1]
        assert torch.all(out_policy['action'] <= high)
        assert torch.all(out_policy['action'] >= low)
        assert 'action_logprob' in out_policy
        assert list(out_policy['action_logprob'].shape) == [3]
        assert 'state_value' in out_policy
        assert list(out_policy['state_value'].shape) == [3]
        assert 'entropy' in out_policy
        assert list(out_policy['entropy'].shape) == [3]
        assert 'perplexity' in out_policy
        assert list(out_policy['perplexity'].shape) == [3]

    # test default without learn_V
    tmp = GaussianPolicy(config=config, network=network, env_spec=env_spec, device=device)
    assert not hasattr(tmp.network, 'value_head')

    # min_std
    network = _create_net(env_spec, device)
    policy = GaussianPolicy(config=config,
                            network=network,
                            env_spec=env_spec,
                            device=device,
                            learn_V=True,
                            min_std=1e-06,
                            std_style='exp',
                            constant_std=None,
                            std_state_dependent=True,
                            init_std=None)
    _check_policy(policy)
    if network_type == 'FC':
        assert policy.network.num_params - 98 == 17
        assert isinstance(policy.network.logvar_head, nn.Linear)
        assert isinstance(policy.network.value_head, nn.Linear)

    # std_style
    network = _create_net(env_spec, device)
    policy = GaussianPolicy(config=config,
                            network=network,
                            env_spec=env_spec,
                            device=device,
                            learn_V=True,
                            min_std=1e-06,
                            std_style='softplus',
                            constant_std=None,
                            std_state_dependent=True,
                            init_std=None)
    _check_policy(policy)
    if network_type == 'FC':
        assert policy.network.num_params - 98 == 17
        assert isinstance(policy.network.logvar_head, nn.Linear)
        assert isinstance(policy.network.value_head, nn.Linear)

    # constant_std
    network = _create_net(env_spec, device)
    policy = GaussianPolicy(config=config,
                            network=network,
                            env_spec=env_spec,
                            device=device,
                            learn_V=True,
                            min_std=1e-06,
                            std_style='exp',
                            constant_std=0.1,
                            std_state_dependent=False,
                            init_std=None)
    _check_policy(policy)
    if network_type == 'FC':
        assert policy.network.num_params - 98 == 0
        assert torch.is_tensor(policy.network.logvar_head)
        assert policy.network.logvar_head.allclose(torch.tensor(-4.6052))

    # std_state_dependent and init_std
    network = _create_net(env_spec, device)
    policy = GaussianPolicy(config=config,
                            network=network,
                            env_spec=env_spec,
                            device=device,
                            learn_V=True,
                            min_std=1e-06,
                            std_style='exp',
                            constant_std=None,
                            std_state_dependent=False,
                            init_std=0.5)
    _check_policy(policy)
    if network_type == 'FC':
        assert policy.network.num_params - 98 == 1
        assert policy.network.logvar_head.allclose(torch.tensor(-1.3863))
        assert isinstance(policy.network.logvar_head, nn.Parameter)
        assert policy.network.logvar_head.requires_grad == True
def test_gaussian_policy(self):
    env_spec = self.make_env_spec()
    network = Network(env_spec=env_spec)
    assert network.num_params == 64

    high = np.unique(env_spec.action_space.high).item()
    low = np.unique(env_spec.action_space.low).item()

    def _check_policy(policy):
        assert hasattr(policy, 'config')
        assert hasattr(policy, 'network')
        assert hasattr(policy, 'env_spec')
        assert hasattr(policy, 'min_std')
        assert hasattr(policy, 'std_style')
        assert hasattr(policy, 'constant_std')
        assert hasattr(policy, 'std_state_dependent')
        assert hasattr(policy, 'init_std')
        assert hasattr(policy.network, 'layers')
        assert hasattr(policy.network, 'mean_head')
        assert hasattr(policy.network, 'logvar_head')
        assert hasattr(policy.network, 'value_head')
        assert len(policy.network.layers) == 1
        assert policy.network.mean_head.weight.numel() + policy.network.mean_head.bias.numel() == 17
        assert policy.network.mean_head.weight.abs().min().item() <= 0.01  # 0.01 scale for action head
        assert np.allclose(policy.network.mean_head.bias.detach().numpy(), 0.0)
        assert policy.network.value_head.weight.numel() + policy.network.value_head.bias.numel() == 16 + 1
        assert policy.network.value_head.weight.abs().max().item() >= 0.1  # roughly +/- 0.3 to 0.5
        assert np.allclose(policy.network.value_head.bias.detach().numpy(), 0.0)

        obs = torch.from_numpy(np.array(env_spec.env.reset())).float()
        out_policy = policy(obs,
                            out_keys=['action', 'action_logprob', 'state_value', 'entropy', 'perplexity'])
        assert isinstance(out_policy, dict)
        assert 'action' in out_policy
        assert list(out_policy['action'].shape) == [1, 1]
        assert torch.all(out_policy['action'] <= high)
        assert torch.all(out_policy['action'] >= low)
        assert 'action_logprob' in out_policy
        assert list(out_policy['action_logprob'].shape) == [1]
        assert 'state_value' in out_policy
        assert list(out_policy['state_value'].shape) == [1]
        assert 'entropy' in out_policy
        assert list(out_policy['entropy'].shape) == [1]
        assert 'perplexity' in out_policy
        assert list(out_policy['perplexity'].shape) == [1]

    # test default without learn_V
    tmp = GaussianPolicy(config=None, network=network, env_spec=env_spec)
    assert not hasattr(tmp.network, 'value_head')

    # min_std
    network = Network(env_spec=env_spec)
    policy = GaussianPolicy(config=None,
                            network=network,
                            env_spec=env_spec,
                            learn_V=True,
                            min_std=1e-06,
                            std_style='exp',
                            constant_std=None,
                            std_state_dependent=True,
                            init_std=None)
    _check_policy(policy)
    assert policy.network.num_params - 98 == 17
    assert isinstance(policy.network.logvar_head, nn.Linear)
    assert isinstance(policy.network.value_head, nn.Linear)

    # std_style
    network = Network(env_spec=env_spec)
    policy = GaussianPolicy(config=None,
                            network=network,
                            env_spec=env_spec,
                            learn_V=True,
                            min_std=1e-06,
                            std_style='softplus',
                            constant_std=None,
                            std_state_dependent=True,
                            init_std=None)
    _check_policy(policy)
    assert policy.network.num_params - 98 == 17
    assert isinstance(policy.network.logvar_head, nn.Linear)
    assert isinstance(policy.network.value_head, nn.Linear)

    # constant_std
    network = Network(env_spec=env_spec)
    policy = GaussianPolicy(config=None,
                            network=network,
                            env_spec=env_spec,
                            learn_V=True,
                            min_std=1e-06,
                            std_style='exp',
                            constant_std=0.1,
                            std_state_dependent=True,
                            init_std=None)
    _check_policy(policy)
    assert policy.network.num_params - 98 == 0
    assert torch.is_tensor(policy.network.logvar_head)
    assert policy.network.logvar_head.allclose(torch.tensor(-4.6052))

    # std_state_dependent and init_std
    network = Network(env_spec=env_spec)
    policy = GaussianPolicy(config=None,
                            network=network,
                            env_spec=env_spec,
                            learn_V=True,
                            min_std=1e-06,
                            std_style='exp',
                            constant_std=None,
                            std_state_dependent=False,
                            init_std=0.5)
    _check_policy(policy)
    assert policy.network.num_params - 98 == 1
    assert isinstance(policy.network.logvar_head, nn.Parameter)
    assert policy.network.logvar_head.requires_grad == True
    assert policy.network.logvar_head.allclose(torch.tensor(-1.3863))
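The constants asserted against logvar_head in both test versions are plain log-variances: the head evidently parameterizes log(std**2), so constant_std=0.1 corresponds to log(0.01) and init_std=0.5 to log(0.25). A quick check:

import math

print(math.log(0.1**2))  # -4.605170..., matches the constant_std=0.1 assertion
print(math.log(0.5**2))  # -1.386294..., matches the init_std=0.5 assertion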