Example #1
File: algo.py  Project: wolegechu/lagom
 def init(self, seed, config):
     # Make environment
     # Remember to seed it in each worker function!
     self.env = make_vec_env(vec_env_class=SerialVecEnv, 
                             make_env=make_gym_env, 
                             env_id=config['env.id'], 
                             num_env=1, 
                             init_seed=seed)
     self.env_spec = EnvSpec(self.env)
     
     # Make agent
     self.network = Network(config=config, env_spec=self.env_spec)
     if self.env_spec.control_type == 'Discrete':
         self.policy = CategoricalPolicy(config=config, network=self.network, env_spec=self.env_spec)
     elif self.env_spec.control_type == 'Continuous':
         self.policy = GaussianPolicy(config=config, network=self.network, env_spec=self.env_spec)
     self.agent = Agent(policy=self.policy, config=config)
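The snippet above dispatches on env_spec.control_type to choose the policy head. A minimal sketch of that dispatch, assuming the lagom-style CategoricalPolicy/GaussianPolicy constructors and EnvSpec shown in these examples are already imported (the helper name make_policy is illustrative, not part of the library):

def make_policy(config, network, env_spec):
    # Pick the policy head that matches the action space type.
    if env_spec.control_type == 'Discrete':
        return CategoricalPolicy(config=config, network=network, env_spec=env_spec)
    elif env_spec.control_type == 'Continuous':
        return GaussianPolicy(config=config, network=network, env_spec=env_spec)
    raise ValueError(f'unsupported control type: {env_spec.control_type}')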
Example #2
 def init(self, seed, config):
     # Make environment
     # Remember to seed it in each worker function!
     self.env = make_vec_env(vec_env_class=SerialVecEnv, 
                             make_env=make_gym_env, 
                             env_id=config['env.id'], 
                             num_env=config['train.N'], 
                             init_seed=seed, 
                             rolling=False)
     self.env_spec = EnvSpec(self.env)
     
     # Make agent
     if config['network.recurrent']:
         self.network = LSTM(config=config, env_spec=self.env_spec)
     else:
         self.network = Network(config=config, env_spec=self.env_spec)
     if self.env_spec.control_type == 'Discrete':
         self.policy = CategoricalPolicy(config=config, network=self.network, env_spec=self.env_spec, device=None)
     elif self.env_spec.control_type == 'Continuous':
         self.policy = GaussianPolicy(config=config, network=self.network, env_spec=self.env_spec, device=None)
     self.agent = Agent(config=config, policy=self.policy)
Example #3
File: algo.py  Project: wolegechu/lagom
    def __call__(self, config, seed, device_str):
        # Set random seeds
        set_global_seeds(seed)
        # Create device
        device = torch.device(device_str)
        # Use log dir for current job (run_experiment)
        logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

        # Make environment (VecEnv) for training and evaluating
        env = make_vec_env(
            vec_env_class=SerialVecEnv,
            make_env=make_gym_env,
            env_id=config['env.id'],
            num_env=config['train.N'],  # batch size for multiple environments
            init_seed=seed)
        eval_env = make_vec_env(vec_env_class=SerialVecEnv,
                                make_env=make_gym_env,
                                env_id=config['env.id'],
                                num_env=1,
                                init_seed=seed)
        if config['env.standardize']:  # wrap with VecStandardize for running averages of observations and rewards
            env = VecStandardize(venv=env,
                                 use_obs=True,
                                 use_reward=True,
                                 clip_obs=10.,
                                 clip_reward=10.,
                                 gamma=0.99,
                                 eps=1e-8)
            eval_env = VecStandardize(
                venv=eval_env,  # remember to synchronize running averages during evaluation !!!
                use_obs=True,
                use_reward=False,  # do not process rewards, no training
                clip_obs=env.clip_obs,
                clip_reward=env.clip_reward,
                gamma=env.gamma,
                eps=env.eps,
                constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                constant_obs_std=env.obs_runningavg.sigma)
        env_spec = EnvSpec(env)

        # Create policy
        network = Network(config=config, env_spec=env_spec)
        if env_spec.control_type == 'Discrete':
            policy = CategoricalPolicy(config=config,
                                       network=network,
                                       env_spec=env_spec,
                                       learn_V=True)
        elif env_spec.control_type == 'Continuous':
            policy = GaussianPolicy(
                config=config,
                network=network,
                env_spec=env_spec,
                learn_V=True,
                min_std=config['agent.min_std'],
                std_style=config['agent.std_style'],
                constant_std=config['agent.constant_std'],
                std_state_dependent=config['agent.std_state_dependent'],
                init_std=config['agent.init_std'])
        network = network.to(device)

        # Create optimizer and learning rate scheduler
        optimizer = optim.Adam(policy.network.parameters(),
                               lr=config['algo.lr'])
        if config['algo.use_lr_scheduler']:
            if 'train.iter' in config:  # iteration-based training
                max_epoch = config['train.iter']
            elif 'train.timestep' in config:  # timestep-based training
                max_epoch = config['train.timestep'] + 1  # +1 to avoid 0.0 lr in final iteration
            lambda_f = lambda epoch: 1 - epoch / max_epoch  # decay learning rate for each training epoch
            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                       lr_lambda=lambda_f)

        # Create agent
        kwargs = {'device': device}
        if config['algo.use_lr_scheduler']:
            kwargs['lr_scheduler'] = lr_scheduler
        agent = A2CAgent(config=config,
                         policy=policy,
                         optimizer=optimizer,
                         **kwargs)

        # Create runner
        runner = SegmentRunner(agent=agent,
                               env=env,
                               gamma=config['algo.gamma'])
        eval_runner = TrajectoryRunner(agent=agent, env=eval_env, gamma=1.0)

        # Create engine
        engine = Engine(agent=agent,
                        runner=runner,
                        config=config,
                        eval_runner=eval_runner)

        # Training and evaluation
        train_logs = []
        eval_logs = []

        for i in count():  # incremental iteration
            if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
                break
            elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
                break

            # train and evaluation
            train_output = engine.train(n=i)

            # logging
            if i == 0 or (i + 1) % config['log.record_interval'] == 0 or (i + 1) % config['log.print_interval'] == 0:
                train_log = engine.log_train(train_output)

                with torch.no_grad():  # disable grad, save memory
                    eval_output = engine.eval(n=i)
                eval_log = engine.log_eval(eval_output)

                if i == 0 or (i + 1) % config['log.record_interval'] == 0:  # record loggings
                    train_logs.append(train_log)
                    eval_logs.append(eval_log)

        # Save all loggings
        pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
        pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')

        return None
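The learning-rate schedule above is plain PyTorch: LambdaLR multiplies the base lr by 1 - epoch/max_epoch, so it decays linearly towards zero. A self-contained sketch of the same decay outside lagom (the toy model and max_epoch value are illustrative):

import torch.nn as nn
import torch.optim as optim

model = nn.Linear(4, 2)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
max_epoch = 100
lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 1 - epoch / max_epoch)

for epoch in range(max_epoch):
    optimizer.step()     # placeholder for one training iteration
    lr_scheduler.step()  # afterwards lr = 1e-3 * (1 - (epoch + 1) / max_epoch)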
Example #4
 def __call__(self, config, seed, device_str):
     set_global_seeds(seed)
     device = torch.device(device_str)
     logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)
     
     # Environment related
     env = make_vec_env(vec_env_class=SerialVecEnv, 
                        make_env=make_gym_env, 
                        env_id=config['env.id'], 
                        num_env=config['train.N'],  # batched environment
                        init_seed=seed, 
                        rolling=True)
     eval_env = make_vec_env(vec_env_class=SerialVecEnv, 
                             make_env=make_gym_env, 
                             env_id=config['env.id'], 
                             num_env=config['eval.N'], 
                             init_seed=seed, 
                             rolling=False)
     if config['env.standardize']:  # running averages of observation and reward
         env = VecStandardize(venv=env, 
                              use_obs=True, 
                              use_reward=False,  # A2C
                              clip_obs=10., 
                              clip_reward=10., 
                              gamma=0.99, 
                              eps=1e-8)
         eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation !!!
                                   use_obs=True, 
                                   use_reward=False,  # do not process rewards, no training
                                   clip_obs=env.clip_obs, 
                                   clip_reward=env.clip_reward, 
                                   gamma=env.gamma, 
                                   eps=env.eps, 
                                   constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                   constant_obs_std=env.obs_runningavg.sigma)
     env_spec = EnvSpec(env)
     
     # Network and policy
     if config['network.recurrent']:
         network = LSTM(config=config, device=device, env_spec=env_spec)
     else:
         network = Network(config=config, device=device, env_spec=env_spec)
     if env_spec.control_type == 'Discrete':
         policy = CategoricalPolicy(config=config, 
                                    network=network, 
                                    env_spec=env_spec, 
                                    device=device,
                                    learn_V=True)
     elif env_spec.control_type == 'Continuous':
         policy = GaussianPolicy(config=config, 
                                 network=network, 
                                 env_spec=env_spec, 
                                 device=device,
                                 learn_V=True,
                                 min_std=config['agent.min_std'], 
                                 std_style=config['agent.std_style'], 
                                 constant_std=config['agent.constant_std'],
                                 std_state_dependent=config['agent.std_state_dependent'],
                                 init_std=config['agent.init_std'])
     
     # Optimizer and learning rate scheduler
     optimizer = optim.Adam(policy.network.parameters(), lr=config['algo.lr'])
     if config['algo.use_lr_scheduler']:
         if 'train.iter' in config:  # iteration-based
             max_epoch = config['train.iter']
         elif 'train.timestep' in config:  # timestep-based
             max_epoch = config['train.timestep'] + 1  # avoid zero lr in final iteration
         lambda_f = lambda epoch: 1 - epoch/max_epoch
         lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)
     
     # Agent
     kwargs = {'device': device}
     if config['algo.use_lr_scheduler']:
         kwargs['lr_scheduler'] = lr_scheduler
     agent = A2CAgent(config=config, 
                      policy=policy, 
                      optimizer=optimizer, 
                      **kwargs)
     
     # Runner
     runner = SegmentRunner(agent=agent, 
                            env=env, 
                            gamma=config['algo.gamma'])
     eval_runner = TrajectoryRunner(agent=agent, 
                                    env=eval_env, 
                                    gamma=1.0)
     
     # Engine
     engine = Engine(agent=agent, 
                     runner=runner, 
                     config=config, 
                     eval_runner=eval_runner)
     
     # Training and evaluation
     train_logs = []
     eval_logs = []
     
     if config['network.recurrent']:
         rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
     
     for i in count():
         if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
             break
         elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
             break
         
         if config['network.recurrent']:
             if isinstance(rnn_states_buffer, list):  # LSTM: [h, c]
                 rnn_states_buffer = [buf.detach() for buf in rnn_states_buffer]
             else:
                 rnn_states_buffer = rnn_states_buffer.detach()
             agent.policy.rnn_states = rnn_states_buffer
             
         train_output = engine.train(n=i)
         
         # Logging
         if i == 0 or (i+1) % config['log.record_interval'] == 0 or (i+1) % config['log.print_interval'] == 0:
             train_log = engine.log_train(train_output)
             
             if config['network.recurrent']:
                 rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
                 
             with torch.no_grad():  # disable grad, save memory
                 eval_output = engine.eval(n=i)
             eval_log = engine.log_eval(eval_output)
             
             if i == 0 or (i+1) % config['log.record_interval'] == 0:
                 train_logs.append(train_log)
                 eval_logs.append(eval_log)
     
     # Save all loggings
     pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
     pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')
     
     return None
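The rnn_states_buffer handling above detaches the recurrent state at each segment boundary so backpropagation is truncated to the current segment. A plain-PyTorch sketch of that detach pattern (the function name and shapes are illustrative, not lagom API):

import torch

def detach_rnn_states(states):
    # An LSTM keeps [h, c]; a single-tensor state covers GRU-style networks.
    if isinstance(states, list):
        return [s.detach() for s in states]
    return states.detach()

h = torch.zeros(3, 16, requires_grad=True)
c = torch.zeros(3, 16, requires_grad=True)
h, c = detach_rnn_states([h, c])  # gradients no longer flow into earlier segments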
Example #5
    def test_gaussian_policy(self, network_type):
        env_spec = self.make_env_spec()
        device = torch.device('cpu')
        def _create_net(env_spec, device):
            if network_type == 'FC':
                config = {}
                network = Network(config=config, env_spec=env_spec, device=device)
                assert network.num_params == 64
            elif network_type == 'LSTM':
                config = {'network.rnn_size': 16}
                network = LSTM(config=config, env_spec=env_spec, device=device)
                
            return network
        
        if network_type == 'FC':
            config = {}
        elif network_type == 'LSTM':
            config = {'network.rnn_size': 16}
        
        network = _create_net(env_spec, device)
        
        high = np.unique(env_spec.action_space.high).item()
        low = np.unique(env_spec.action_space.low).item()
        
        def _check_policy(policy):
            assert hasattr(policy, 'config')
            assert hasattr(policy, 'network')
            assert hasattr(policy, 'env_spec')
            assert hasattr(policy, 'observation_space')
            assert hasattr(policy, 'action_space')
            assert hasattr(policy, 'device')
            assert hasattr(policy, 'recurrent')
            if network_type == 'FC':
                assert not policy.recurrent
            elif network_type == 'LSTM':
                assert policy.recurrent
                rnn_states = policy.rnn_states
                assert isinstance(rnn_states, list) and len(rnn_states) == 2
                h0, c0 = rnn_states
                assert list(h0.shape) == [3, 16] and list(c0.shape) == list(h0.shape)
                assert np.allclose(h0.detach().numpy(), 0.0)
                assert np.allclose(c0.detach().numpy(), 0.0)
            assert hasattr(policy, 'min_std')
            assert hasattr(policy, 'std_style')
            assert hasattr(policy, 'constant_std')
            assert hasattr(policy, 'std_state_dependent')
            assert hasattr(policy, 'init_std')
            
            if network_type == 'FC':
                assert hasattr(policy.network, 'layers')
                assert len(policy.network.layers) == 1
            elif network_type == 'LSTM':
                assert hasattr(policy.network, 'rnn')
            assert hasattr(policy.network, 'mean_head')
            assert hasattr(policy.network, 'logvar_head')
            assert hasattr(policy.network, 'value_head')
            assert hasattr(policy.network, 'device')
            
            assert policy.network.mean_head.weight.numel() + policy.network.mean_head.bias.numel() == 17
            assert policy.network.mean_head.weight.abs().min().item() <= 0.01  # 0.01 scale for action head
            assert np.allclose(policy.network.mean_head.bias.detach().numpy(), 0.0)
            assert policy.network.value_head.weight.numel() + policy.network.value_head.bias.numel() == 16+1
            assert policy.network.value_head.weight.abs().max().item() >= 0.1  # roughly +- 0.3 - 0.5
            assert np.allclose(policy.network.value_head.bias.detach().numpy(), 0.0)

            obs = torch.from_numpy(np.array(env_spec.env.reset())).float()
            out_policy = policy(obs, 
                                out_keys=['action', 'action_logprob', 'state_value', 'entropy', 'perplexity'], 
                                info={})
            
            if network_type == 'LSTM':
                new_rnn_states = policy.rnn_states
                assert isinstance(new_rnn_states, list) and len(new_rnn_states) == 2
                h_new, c_new = new_rnn_states
                assert list(h_new.shape) == [3, 16] and list(c_new.shape) == [3, 16]
                assert not np.allclose(h_new.detach().numpy(), 0.0)
                assert not np.allclose(c_new.detach().numpy(), 0.0)
                
                mask = torch.ones(3, 16)*1000
                mask[1] = mask[1].fill_(0.0)
                out_policy = policy(obs, 
                                    out_keys=['action', 'action_logprob', 'state_value', 'entropy', 'perplexity'], 
                                    info={'mask': mask})
                c = policy.rnn_states[1]
                assert c[0].max().item() >= 1.0 and c[2].max().item() >= 1.0
                assert c[1].max().item() <= 0.5
                
            assert isinstance(out_policy, dict)
            assert 'action' in out_policy
            assert list(out_policy['action'].shape) == [3, 1]
            assert torch.all(out_policy['action'] <= high)
            assert torch.all(out_policy['action'] >= low)
            assert 'action_logprob' in out_policy
            assert list(out_policy['action_logprob'].shape) == [3]
            assert 'state_value' in out_policy
            assert list(out_policy['state_value'].shape) == [3]
            assert 'entropy' in out_policy
            assert list(out_policy['entropy'].shape) == [3]
            assert 'perplexity' in out_policy
            assert list(out_policy['perplexity'].shape) == [3]
        
        # test default without learn_V
        tmp = GaussianPolicy(config=config, network=network, env_spec=env_spec, device=device)
        assert not hasattr(tmp.network, 'value_head')
        
        # min_std
        network = _create_net(env_spec, device)
        policy = GaussianPolicy(config=config, 
                                network=network, 
                                env_spec=env_spec, 
                                device=device,
                                learn_V=True,
                                min_std=1e-06, 
                                std_style='exp', 
                                constant_std=None, 
                                std_state_dependent=True, 
                                init_std=None)
        _check_policy(policy)
        if network_type == 'FC':
            assert policy.network.num_params - 98 == 17
        assert isinstance(policy.network.logvar_head, nn.Linear)
        assert isinstance(policy.network.value_head, nn.Linear)
        
        # std_style
        network = _create_net(env_spec, device)
        policy = GaussianPolicy(config=config, 
                                network=network, 
                                env_spec=env_spec, 
                                device=device,
                                learn_V=True,
                                min_std=1e-06, 
                                std_style='softplus', 
                                constant_std=None, 
                                std_state_dependent=True, 
                                init_std=None)
        _check_policy(policy)
        if network_type == 'FC':
            assert policy.network.num_params - 98 == 17
        assert isinstance(policy.network.logvar_head, nn.Linear)
        assert isinstance(policy.network.value_head, nn.Linear)
        
        # constant_std
        network = _create_net(env_spec, device)
        policy = GaussianPolicy(config=config, 
                                network=network, 
                                env_spec=env_spec, 
                                device=device,
                                learn_V=True,
                                min_std=1e-06, 
                                std_style='exp', 
                                constant_std=0.1, 
                                std_state_dependent=False, 
                                init_std=None)
        _check_policy(policy)
        if network_type == 'FC':
            assert policy.network.num_params - 98 == 0
        assert torch.is_tensor(policy.network.logvar_head)
        assert policy.network.logvar_head.allclose(torch.tensor(-4.6052))
        
        # std_state_dependent and init_std
        network = _create_net(env_spec, device)
        policy = GaussianPolicy(config=config, 
                                network=network, 
                                env_spec=env_spec, 
                                device=device,
                                learn_V=True,
                                min_std=1e-06, 
                                std_style='exp', 
                                constant_std=None, 
                                std_state_dependent=False, 
                                init_std=0.5)
        _check_policy(policy)
        if network_type == 'FC':
            assert policy.network.num_params - 98 == 1
            assert policy.network.logvar_head.allclose(torch.tensor(-1.3863))
        assert isinstance(policy.network.logvar_head, nn.Parameter)
        assert policy.network.logvar_head.requires_grad == True
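The constants -4.6052 and -1.3863 asserted above are consistent with the head storing a log-variance, i.e. log(std**2); a quick check under that assumption:

import math

print(math.log(0.1 ** 2))  # about -4.6052, matches constant_std=0.1
print(math.log(0.5 ** 2))  # about -1.3863, matches init_std=0.5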
Example #6
    def test_gaussian_policy(self):
        env_spec = self.make_env_spec()
        network = Network(env_spec=env_spec)

        assert network.num_params == 64

        high = np.unique(env_spec.action_space.high).item()
        low = np.unique(env_spec.action_space.low).item()

        def _check_policy(policy):
            assert hasattr(policy, 'config')
            assert hasattr(policy, 'network')
            assert hasattr(policy, 'env_spec')
            assert hasattr(policy, 'min_std')
            assert hasattr(policy, 'std_style')
            assert hasattr(policy, 'constant_std')
            assert hasattr(policy, 'std_state_dependent')
            assert hasattr(policy, 'init_std')

            assert hasattr(policy.network, 'layers')
            assert hasattr(policy.network, 'mean_head')
            assert hasattr(policy.network, 'logvar_head')
            assert hasattr(policy.network, 'value_head')
            assert len(policy.network.layers) == 1
            assert policy.network.mean_head.weight.numel() + policy.network.mean_head.bias.numel() == 17
            assert policy.network.mean_head.weight.abs().min().item() <= 0.01  # 0.01 scale for action head
            assert np.allclose(policy.network.mean_head.bias.detach().numpy(), 0.0)
            assert policy.network.value_head.weight.numel() + policy.network.value_head.bias.numel() == 16 + 1
            assert policy.network.value_head.weight.abs().max().item() >= 0.1  # roughly +- 0.3 - 0.5
            assert np.allclose(policy.network.value_head.bias.detach().numpy(), 0.0)

            obs = torch.from_numpy(np.array(env_spec.env.reset())).float()
            out_policy = policy(obs,
                                out_keys=[
                                    'action', 'action_logprob', 'state_value',
                                    'entropy', 'perplexity'
                                ])

            assert isinstance(out_policy, dict)
            assert 'action' in out_policy
            assert list(out_policy['action'].shape) == [1, 1]
            assert torch.all(out_policy['action'] <= high)
            assert torch.all(out_policy['action'] >= low)
            assert 'action_logprob' in out_policy
            assert list(out_policy['action_logprob'].shape) == [1]
            assert 'state_value' in out_policy
            assert list(out_policy['state_value'].shape) == [1]
            assert 'entropy' in out_policy
            assert list(out_policy['entropy'].shape) == [1]
            assert 'perplexity' in out_policy
            assert list(out_policy['perplexity'].shape) == [1]

        # test default without learn_V
        tmp = GaussianPolicy(config=None, network=network, env_spec=env_spec)
        assert not hasattr(tmp.network, 'value_head')

        # min_std
        network = Network(env_spec=env_spec)
        policy = GaussianPolicy(config=None,
                                network=network,
                                env_spec=env_spec,
                                learn_V=True,
                                min_std=1e-06,
                                std_style='exp',
                                constant_std=None,
                                std_state_dependent=True,
                                init_std=None)
        _check_policy(policy)
        assert policy.network.num_params - 98 == 17
        assert isinstance(policy.network.logvar_head, nn.Linear)
        assert isinstance(policy.network.value_head, nn.Linear)

        # std_style
        network = Network(env_spec=env_spec)
        policy = GaussianPolicy(config=None,
                                network=network,
                                env_spec=env_spec,
                                learn_V=True,
                                min_std=1e-06,
                                std_style='softplus',
                                constant_std=None,
                                std_state_dependent=True,
                                init_std=None)
        _check_policy(policy)
        assert policy.network.num_params - 98 == 17
        assert isinstance(policy.network.logvar_head, nn.Linear)
        assert isinstance(policy.network.value_head, nn.Linear)

        # constant_std
        network = Network(env_spec=env_spec)
        policy = GaussianPolicy(config=None,
                                network=network,
                                env_spec=env_spec,
                                learn_V=True,
                                min_std=1e-06,
                                std_style='exp',
                                constant_std=0.1,
                                std_state_dependent=True,
                                init_std=None)
        _check_policy(policy)
        assert policy.network.num_params - 98 == 0
        assert torch.is_tensor(policy.network.logvar_head)
        assert policy.network.logvar_head.allclose(torch.tensor(-4.6052))

        # std_state_dependent and init_std
        network = Network(env_spec=env_spec)
        policy = GaussianPolicy(config=None,
                                network=network,
                                env_spec=env_spec,
                                learn_V=True,
                                min_std=1e-06,
                                std_style='exp',
                                constant_std=None,
                                std_state_dependent=False,
                                init_std=0.5)
        _check_policy(policy)
        assert policy.network.num_params - 98 == 1
        assert isinstance(policy.network.logvar_head, nn.Parameter)
        assert policy.network.logvar_head.requires_grad == True
        assert policy.network.logvar_head.allclose(torch.tensor(-1.3863))
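The parameter-count deltas asserted in this test follow from the 16-unit feature layer and 1-dimensional action implied by the head sizes above: a state-dependent log-variance head is a Linear(16, 1) with 16 weights + 1 bias (+17 parameters), a learned scalar log-std adds one parameter (+1), and a constant std adds none (+0). A small check of that arithmetic, assuming those layer shapes:

logvar_head = 16 * 1 + 1   # std_state_dependent=True           -> num_params grows by 17
scalar_logstd = 1          # init_std given, state-independent   -> grows by 1
constant = 0               # constant_std fixed, nothing learned -> grows by 0
assert (logvar_head, scalar_logstd, constant) == (17, 1, 0)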