Пример #1
0
    def eval(self, n):
        """Evaluate the agent on the evaluation environment.

        Args:
            n: current iteration number, recorded in the returned dict.

        Returns:
            dict with keys 'D' (collected episodes), 'n' (iteration),
            'T' (episode horizon) and 'num_sec' (wall-clock seconds).
        """
        self.agent.eval()  # switch agent to evaluation mode

        start_time = time()

        if self.config['env.standardize']:
            # Wrap the eval env so it reuses the training env's running
            # averages as constants; rewards are left untouched (no training).
            eval_env = VecStandardize(
                venv=self.eval_env,
                use_obs=True,
                use_reward=False,  # do not process rewards, no training
                clip_obs=self.runner.env.clip_obs,
                clip_reward=self.runner.env.clip_reward,
                gamma=self.runner.env.gamma,
                eps=self.runner.env.eps,
                constant_obs_mean=self.runner.env.obs_runningavg.mu,
                constant_obs_std=self.runner.env.obs_runningavg.sigma)
        else:
            # Bug fix: previously `eval_env` was undefined here, raising a
            # NameError whenever standardization was disabled.
            eval_env = self.eval_env
        eval_runner = EpisodeRunner(self.config, self.agent, eval_env)
        T = eval_env.T
        D = eval_runner(T)

        eval_output = {}
        eval_output['D'] = D
        eval_output['n'] = n
        eval_output['T'] = T
        eval_output['num_sec'] = time() - start_time

        return eval_output
Пример #2
0
def test_get_wrapper(env_id):
    """get_wrapper must locate each layer of a wrapped VecEnv by class name."""
    env = make_vec_env(SerialVecEnv, make_gym_env, env_id, 3, 0)
    env = VecStandardize(env)
    env = VecClipAction(env)

    # Every wrapper in the stack (and the base VecEnv) should be retrievable.
    for wrapper_name in ('VecClipAction', 'VecStandardize', 'SerialVecEnv'):
        wrapper = get_wrapper(env, wrapper_name)
        assert wrapper.__class__.__name__ == wrapper_name
        del wrapper
Пример #3
0
    def _prepare(self, config):
        """Build the training env, env spec, device and agent from config."""
        base_env = make_vec_env(SerialVecEnv, make_gym_env, config['env.id'],
                                config['train.N'], 0)
        wrapped_env = VecClipAction(base_env)
        if config['env.standardize']:
            # Running-average standardization of observations only.
            standardize_options = dict(use_obs=True,
                                       use_reward=False,
                                       clip_obs=10.0,
                                       clip_reward=10.0,
                                       gamma=0.99,
                                       eps=1e-08)
            wrapped_env = VecStandardize(wrapped_env, **standardize_options)
        self.env = wrapped_env
        self.env_spec = EnvSpec(self.env)

        self.device = torch.device('cpu')

        self.agent = Agent(config, self.env_spec, self.device)
Пример #4
0
        def algorithm(config, seed, device):
            """Train an agent on CraftingEnv and periodically dump logs/params.

            Args:
                config: configuration dict; expects 'log.dir', 'ID',
                    'env.count', 'train.iter' and 'log.interval'.
                seed: master seed, also used to derive per-env seeds.
                device: torch device, forwarded to the agent.
            """
            logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)
            seeder = Seeder(seed)
            # Fix: loop variable renamed so it no longer shadows the `seed`
            # parameter; build constructors with a comprehension.
            env_seeds = seeder(size=config['env.count'])
            env_constructors = [partial(CraftingEnv, env_seed)
                                for env_seed in env_seeds]
            env = VecStandardize(SerialVecEnv(env_constructors),
                                 clip_reward=100.0)
            env_spec = EnvSpec(env)

            agent = Agent(config, env_spec, device)
            runner = RollingSegmentRunner(config, agent, env)
            engine = Engine(agent, runner, env)

            for i in range(config['train.iter']):
                training_result = engine.train(i)
                print(f'Training iteration {i} complete.')
                if i % config['log.interval'] == 0:
                    logs = engine.log_train(training_result)
                    pickle_dump(obj=logs, f=logdir / f'iter_{i}_train_logs', ext='.pkl')
                    # Overwrites the same file each time: keeps latest params only.
                    torch.save(engine.agent.policy.state_dict(),
                               logdir / 'trained_params')
Пример #5
0
from dial_control_rl import agent
from dial_control_rl.env import CraftingEnv

from lagom.utils import Seeder
from lagom.envs.vec_env import SerialVecEnv, VecStandardize
from lagom.envs import EnvSpec
from functools import partial

# NOTE: a previous `env = CraftingEnv()` line was removed here — its result
# was immediately overwritten by the vectorized env below (dead assignment).
seeder = Seeder(0)
seeds = seeder(size=1)
env_constructors = [partial(CraftingEnv, s) for s in seeds]
env = VecStandardize(SerialVecEnv(env_constructors), clip_reward=100.0)
env_spec = EnvSpec(env)

# `params` is expected to be defined earlier (loaded checkpoint) — not shown
# in this fragment; verify against the full script.
policy = agent.Policy({'algo.rl': 0}, env_spec, torch.device('cpu'))
policy.load_state_dict(params)
policy = policy.double()


def V(x):
    """Return the state-value V(x) (first batch element) from the policy."""
    policy_output = policy(torch.tensor(x), ['V'])
    values = policy_output['V']
    return values[0]


def Q(x):
    out = policy(torch.tensor(x), ['action_dist'])
    out = out['action_dist']
Пример #6
0
    def __call__(self, config, seed, device):
        """Run one training job: build envs, train, periodically log and
        (optionally) evaluate, then pickle the logs under
        ``<log.dir>/<ID>/<seed>/``.

        Args:
            config: configuration dict for this run.
            seed: global random seed for this run.
            device: torch device, forwarded to the agent.

        Returns:
            None; results are persisted via ``pickle_dump``.
        """
        set_global_seeds(seed)
        logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

        # Optionally wrap observations with remaining-time information.
        if config['env.time_aware_obs']:
            kwargs = {'extra_wrapper': [TimeAwareObservation]}
        else:
            kwargs = {}
        env = make_vec_env(SerialVecEnv,
                           make_gym_env,
                           config['env.id'],
                           config['train.N'],
                           seed,
                           monitor=True,
                           **kwargs)
        # A separate evaluation env is built only when requested.
        if config['eval.independent']:
            eval_env = make_vec_env(SerialVecEnv, make_gym_env,
                                    config['env.id'], config['eval.N'], seed)
        if config['env.clip_action']:
            env = VecClipAction(env)
            if config['eval.independent']:
                eval_env = VecClipAction(eval_env)
        if config[
                'env.standardize']:  # running averages of observation and reward
            env = VecStandardize(
                venv=env,
                use_obs=True,
                use_reward=False,  # A2C specific 
                clip_obs=10.,
                clip_reward=10.,
                gamma=0.99,
                eps=1e-8)
        env_spec = EnvSpec(env)

        agent = Agent(config, env_spec, device)

        runner = RollingSegmentRunner(config, agent, env)

        if config['eval.independent']:
            engine = Engine(agent, runner, config, eval_env=eval_env)
        else:
            engine = Engine(agent, runner, config)

        train_logs = []
        eval_logs = []
        # Train until either the iteration budget or the timestep budget is
        # reached — whichever key the config provides.
        for i in count():
            if 'train.iter' in config and i >= config[
                    'train.iter']:  # enough iterations
                break
            elif 'train.timestep' in config and agent.total_T >= config[
                    'train.timestep']:  # enough timesteps
                break

            train_output = engine.train(i)

            # Log on the first iteration, then every log.interval-th one.
            if i == 0 or (i + 1) % config['log.interval'] == 0:
                train_log = engine.log_train(train_output)
                train_logs.append(train_log)

                if config['eval.independent']:
                    with torch.no_grad():  # disable grad, save memory
                        eval_output = engine.eval(n=i)
                    eval_log = engine.log_eval(eval_output)
                    eval_logs.append(eval_log)

        pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
        pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')

        return None
Пример #7
0
    def __call__(self, config, seed, device_str):
        """Run a single A2C training job for the given config and seed.

        Builds train/eval environments (optionally standardized, with the
        eval env reusing the train env's running averages), policy,
        optimizer (optionally with linear lr decay), agent, runners and
        engine; trains until the iteration or timestep budget is exhausted,
        logging/evaluating periodically, and pickles the collected logs
        under ``<log.dir>/<ID>/<seed>/``.
        """
        # Set random seeds
        set_global_seeds(seed)
        # Create device
        device = torch.device(device_str)
        # Use log dir for current job (run_experiment)
        logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

        # Make environment (VecEnv) for training and evaluating
        env = make_vec_env(
            vec_env_class=SerialVecEnv,
            make_env=make_gym_env,
            env_id=config['env.id'],
            num_env=config['train.N'],  # batch size for multiple environments
            init_seed=seed)
        eval_env = make_vec_env(vec_env_class=SerialVecEnv,
                                make_env=make_gym_env,
                                env_id=config['env.id'],
                                num_env=1,
                                init_seed=seed)
        if config[
                'env.standardize']:  # wrap with VecStandardize for running averages of observation and rewards
            env = VecStandardize(venv=env,
                                 use_obs=True,
                                 use_reward=True,
                                 clip_obs=10.,
                                 clip_reward=10.,
                                 gamma=0.99,
                                 eps=1e-8)
            eval_env = VecStandardize(
                venv=
                eval_env,  # remember to synchronize running averages during evaluation !!!
                use_obs=True,
                use_reward=False,  # do not process rewards, no training
                clip_obs=env.clip_obs,
                clip_reward=env.clip_reward,
                gamma=env.gamma,
                eps=env.eps,
                constant_obs_mean=env.obs_runningavg.
                mu,  # use current running average as constant
                constant_obs_std=env.obs_runningavg.sigma)
        env_spec = EnvSpec(env)

        # Create policy: categorical for discrete control, Gaussian otherwise
        network = Network(config=config, env_spec=env_spec)
        if env_spec.control_type == 'Discrete':
            policy = CategoricalPolicy(config=config,
                                       network=network,
                                       env_spec=env_spec,
                                       learn_V=True)
        elif env_spec.control_type == 'Continuous':
            policy = GaussianPolicy(
                config=config,
                network=network,
                env_spec=env_spec,
                learn_V=True,
                min_std=config['agent.min_std'],
                std_style=config['agent.std_style'],
                constant_std=config['agent.constant_std'],
                std_state_dependent=config['agent.std_state_dependent'],
                init_std=config['agent.init_std'])
        network = network.to(device)

        # Create optimizer and learning rate scheduler
        optimizer = optim.Adam(policy.network.parameters(),
                               lr=config['algo.lr'])
        if config['algo.use_lr_scheduler']:
            if 'train.iter' in config:  # iteration-based training
                max_epoch = config['train.iter']
            elif 'train.timestep' in config:  # timestep-based training
                max_epoch = config[
                    'train.timestep'] + 1  # +1 to avoid 0.0 lr in final iteration
            lambda_f = lambda epoch: 1 - epoch / max_epoch  # decay learning rate for each training epoch
            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                       lr_lambda=lambda_f)

        # Create agent (scheduler is passed through only when enabled)
        kwargs = {'device': device}
        if config['algo.use_lr_scheduler']:
            kwargs['lr_scheduler'] = lr_scheduler
        agent = A2CAgent(config=config,
                         policy=policy,
                         optimizer=optimizer,
                         **kwargs)

        # Create runner
        runner = SegmentRunner(agent=agent,
                               env=env,
                               gamma=config['algo.gamma'])
        eval_runner = TrajectoryRunner(agent=agent, env=eval_env, gamma=1.0)

        # Create engine
        engine = Engine(agent=agent,
                        runner=runner,
                        config=config,
                        eval_runner=eval_runner)

        # Training and evaluation
        train_logs = []
        eval_logs = []

        for i in count():  # incremental iteration
            if 'train.iter' in config and i >= config[
                    'train.iter']:  # enough iterations
                break
            elif 'train.timestep' in config and agent.total_T >= config[
                    'train.timestep']:  # enough timesteps
                break

            # train and evaluation
            train_output = engine.train(n=i)

            # logging: triggered by either the record or the print interval
            if i == 0 or (i + 1) % config['log.record_interval'] == 0 or (
                    i + 1) % config['log.print_interval'] == 0:
                train_log = engine.log_train(train_output)

                with torch.no_grad():  # disable grad, save memory
                    eval_output = engine.eval(n=i)
                eval_log = engine.log_eval(eval_output)

                if i == 0 or (i + 1) % config[
                        'log.record_interval'] == 0:  # record loggings
                    train_logs.append(train_log)
                    eval_logs.append(eval_log)

        # Save all loggings
        pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
        pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')

        return None
Пример #8
0
 def __call__(self, config, seed, device_str):
     """Run one A2C training job, optionally with a recurrent (LSTM) network.

     Builds rolling train env and non-rolling eval env (optionally
     standardized, eval reusing train running averages), network/policy,
     optimizer with optional linear lr decay, agent, runners and engine;
     trains until the iteration/timestep budget is exhausted, detaching RNN
     states between iterations, and pickles logs under
     ``<log.dir>/<ID>/<seed>/``.
     """
     set_global_seeds(seed)
     device = torch.device(device_str)
     logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)
     
     # Environment related
     env = make_vec_env(vec_env_class=SerialVecEnv, 
                        make_env=make_gym_env, 
                        env_id=config['env.id'], 
                        num_env=config['train.N'],  # batched environment
                        init_seed=seed, 
                        rolling=True)
     eval_env = make_vec_env(vec_env_class=SerialVecEnv, 
                             make_env=make_gym_env, 
                             env_id=config['env.id'], 
                             num_env=config['eval.N'], 
                             init_seed=seed, 
                             rolling=False)
     if config['env.standardize']:  # running averages of observation and reward
         env = VecStandardize(venv=env, 
                              use_obs=True, 
                              use_reward=False,  # A2C
                              clip_obs=10., 
                              clip_reward=10., 
                              gamma=0.99, 
                              eps=1e-8)
         eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation !!!
                                   use_obs=True, 
                                   use_reward=False,  # do not process rewards, no training
                                   clip_obs=env.clip_obs, 
                                   clip_reward=env.clip_reward, 
                                   gamma=env.gamma, 
                                   eps=env.eps, 
                                   constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                   constant_obs_std=env.obs_runningavg.sigma)
     env_spec = EnvSpec(env)
     
     # Network and policy: LSTM when recurrent, plain feed-forward otherwise
     if config['network.recurrent']:
         network = LSTM(config=config, device=device, env_spec=env_spec)
     else:
         network = Network(config=config, device=device, env_spec=env_spec)
     if env_spec.control_type == 'Discrete':
         policy = CategoricalPolicy(config=config, 
                                    network=network, 
                                    env_spec=env_spec, 
                                    device=device,
                                    learn_V=True)
     elif env_spec.control_type == 'Continuous':
         policy = GaussianPolicy(config=config, 
                                 network=network, 
                                 env_spec=env_spec, 
                                 device=device,
                                 learn_V=True,
                                 min_std=config['agent.min_std'], 
                                 std_style=config['agent.std_style'], 
                                 constant_std=config['agent.constant_std'],
                                 std_state_dependent=config['agent.std_state_dependent'],
                                 init_std=config['agent.init_std'])
     
     # Optimizer and learning rate scheduler
     optimizer = optim.Adam(policy.network.parameters(), lr=config['algo.lr'])
     if config['algo.use_lr_scheduler']:
         if 'train.iter' in config:  # iteration-based
             max_epoch = config['train.iter']
         elif 'train.timestep' in config:  # timestep-based
             max_epoch = config['train.timestep'] + 1  # avoid zero lr in final iteration
         lambda_f = lambda epoch: 1 - epoch/max_epoch
         lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)
     
     # Agent (scheduler passed through only when enabled)
     kwargs = {'device': device}
     if config['algo.use_lr_scheduler']:
         kwargs['lr_scheduler'] = lr_scheduler
     agent = A2CAgent(config=config, 
                      policy=policy, 
                      optimizer=optimizer, 
                      **kwargs)
     
     # Runner
     runner = SegmentRunner(agent=agent, 
                            env=env, 
                            gamma=config['algo.gamma'])
     eval_runner = TrajectoryRunner(agent=agent, 
                                    env=eval_env, 
                                    gamma=1.0)
     
     # Engine
     engine = Engine(agent=agent, 
                     runner=runner, 
                     config=config, 
                     eval_runner=eval_runner)
     
     # Training and evaluation
     train_logs = []
     eval_logs = []
     
     if config['network.recurrent']:
         rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
     
     for i in count():
         if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
             break
         elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
             break
         
         # Detach the RNN states so gradients do not flow across iterations.
         if config['network.recurrent']:
             if isinstance(rnn_states_buffer, list):  # LSTM: [h, c]
                 rnn_states_buffer = [buf.detach() for buf in rnn_states_buffer]
             else:
                 rnn_states_buffer = rnn_states_buffer.detach()
             agent.policy.rnn_states = rnn_states_buffer
             
         train_output = engine.train(n=i)
         
         # Logging: triggered by either the record or the print interval
         if i == 0 or (i+1) % config['log.record_interval'] == 0 or (i+1) % config['log.print_interval'] == 0:
             train_log = engine.log_train(train_output)
             
             # Snapshot RNN states before evaluation perturbs them.
             if config['network.recurrent']:
                 rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
                 
             with torch.no_grad():  # disable grad, save memory
                 eval_output = engine.eval(n=i)
             eval_log = engine.log_eval(eval_output)
             
             if i == 0 or (i+1) % config['log.record_interval'] == 0:
                 train_logs.append(train_log)
                 eval_logs.append(eval_log)
     
     # Save all loggings
     pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
     pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')
     
     return None
Пример #9
0
    def test_vec_standardize(self, vec_env_class):
        """Exercise VecStandardize end to end: default settings, tight
        clipping, disabled standardization, invalid gamma, and constant
        mean/std overrides.
        """
        venv = make_vec_env(vec_env_class, make_gym_env, 'CartPole-v1', 5, 1,
                            True)
        venv = VecStandardize(venv,
                              use_obs=True,
                              use_reward=True,
                              clip_obs=10.,
                              clip_reward=10.,
                              gamma=0.99,
                              eps=1e-8)
        assert isinstance(venv, VecEnvWrapper) and isinstance(
            venv, VecStandardize)
        obs = venv.reset()
        # reset() alone should already update the observation running average
        assert not np.allclose(venv.obs_runningavg.mu, 0.0)
        assert not np.allclose(venv.obs_runningavg.sigma, 0.0)
        a = [1] * 5
        [venv.step(a) for _ in range(20)]
        # 5 envs x (1 reset + 20 steps) obs samples; rewards only from steps
        assert venv.obs_runningavg.N == 5 + 5 * 20
        assert venv.reward_runningavg.N == 5 * 20
        assert not np.allclose(venv.obs_runningavg.mu, 0.0)
        assert not np.allclose(venv.obs_runningavg.sigma, 0.0)
        running_avg = venv.running_averages
        assert isinstance(running_avg, dict)
        assert len(
            running_avg
        ) == 2 and 'obs_avg' in running_avg and 'r_avg' in running_avg
        assert 'mu' in running_avg['obs_avg'] and 'sigma' in running_avg[
            'obs_avg']
        assert not np.allclose(running_avg['obs_avg']['mu'], 0.0)
        assert not np.allclose(running_avg['obs_avg']['sigma'], 0.0)
        # reward average only tracks sigma, not mu
        assert 'mu' not in running_avg['r_avg']
        assert 'sigma' in running_avg['r_avg']
        assert not np.allclose(running_avg['r_avg']['sigma'], 0.0)

        del venv, obs, a

        # other settings: clipping
        venv = make_vec_env(vec_env_class, make_gym_env, 'CartPole-v1', 5, 1,
                            True)
        venv = VecStandardize(venv,
                              use_obs=True,
                              use_reward=True,
                              clip_obs=0.01,
                              clip_reward=0.0001,
                              gamma=0.99,
                              eps=1e-8)
        obs = venv.reset()
        # tiny clip range forces every observation onto the clip boundary
        assert np.allclose(np.abs(np.asarray(obs)), 0.01)

        running_avg = venv.running_averages
        assert isinstance(running_avg, dict)
        assert len(
            running_avg
        ) == 2 and 'obs_avg' in running_avg and 'r_avg' in running_avg
        assert 'mu' in running_avg['obs_avg'] and 'sigma' in running_avg[
            'obs_avg']
        assert not np.allclose(running_avg['obs_avg']['mu'], 0.0)
        assert not np.allclose(running_avg['obs_avg']['sigma'], 0.0)
        assert 'mu' not in running_avg['r_avg']
        assert 'sigma' in running_avg['r_avg']
        # no reward seen yet before the first step, so sigma is still unset
        assert running_avg['r_avg']['sigma'] is None

        a = [1] * 5
        obs, rewards, _, _ = venv.step(a)
        assert rewards.max() == 0.0001

        del venv, obs, a

        # other settings: turn off use_obs
        venv = make_vec_env(vec_env_class, make_gym_env, 'CartPole-v1', 5, 1,
                            True)
        venv = VecStandardize(venv,
                              use_obs=False,
                              use_reward=False,
                              clip_obs=0.001,
                              clip_reward=0.0001,
                              gamma=0.99,
                              eps=1e-8)
        obs = venv.reset()
        # standardization disabled: raw values exceed the (unused) clip range
        assert np.asarray(obs).max() > 0.001
        a = [1] * 5
        obs, rewards, _, _ = venv.step(a)
        assert np.asarray(rewards).max() >= 0.0001

        del venv, obs, a

        # other settings: gamma
        venv = make_vec_env(vec_env_class, make_gym_env, 'CartPole-v1', 5, 1,
                            True)
        with pytest.raises(AssertionError):
            venv = VecStandardize(
                venv,
                use_obs=False,
                use_reward=False,
                clip_obs=0.001,
                clip_reward=0.0001,
                gamma=1.0,  # not allowed
                eps=1e-8)

        del venv

        # other settings: constant value

        venv = make_vec_env(vec_env_class, make_gym_env, 'CartPole-v1', 5, 1,
                            True)
        venv = VecStandardize(venv,
                              use_obs=True,
                              use_reward=True,
                              clip_obs=10.,
                              clip_reward=10.,
                              gamma=0.99,
                              eps=1e-8,
                              constant_obs_mean=np.array([5.] * 4),
                              constant_obs_std=np.array([1.] * 4),
                              constant_reward_std=np.array(1000))

        obs = venv.reset()
        # CartPole obs are near zero, so subtracting mean 5 pushes them below -4
        assert obs.min() < -4.0
        a = [1] * 5
        obs, rewards, _, _ = venv.step(a)
        assert rewards.min() <= 0.01