Example #1
    def __init__(self, obs_space):
        self.seed = 0
        self.rmsprop_epsilon = 1e-5
        self.gamma = 0.99
        self.use_gae = False
        self.tau = 0.95
        self.num_envs = 8
        self.lr = 7e-4
        self.weight_decay = 0.0
        self.max_grad_norm = 0.5
        self.alpha = 0.99
        self.update_steps = 5

        # Set a random seed used in ChainerRL.
        # If you use more than one process, the results will no longer be
        # deterministic even with the same random seed.
        misc.set_random_seed(self.seed)

        # Set different random seeds for different subprocesses.
        # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
        # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
        process_seeds = np.arange(self.num_envs) + self.seed * self.num_envs
        assert process_seeds.max() < 2**32

        action_space = 1

        # Switch policy types according to the action space type
        model = A2CGaussian(obs_space, action_space)

        optimizer = chainer.optimizers.RMSprop(self.lr,
                                               eps=self.rmsprop_epsilon,
                                               alpha=self.alpha)
        optimizer.setup(model)
        optimizer.add_hook(
            chainer.optimizer.GradientClipping(self.max_grad_norm))
        if self.weight_decay > 0:
            optimizer.add_hook(NonbiasWeightDecay(self.weight_decay))

        self.agent = A2C(model,
                         optimizer,
                         gamma=self.gamma,
                         num_processes=self.num_envs,
                         update_steps=self.update_steps,
                         use_gae=self.use_gae,
                         tau=self.tau)
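The constructor above only builds self.agent; it never runs training. A minimal sketch of how such an agent could be driven, mirroring the batch-training call used in Example #2 below (the AgentBuilder class name and the make_batch_env helper are assumptions for illustration, not part of this snippet):

# Hypothetical usage; AgentBuilder stands in for the class that owns __init__ above,
# and make_batch_env is assumed to build a vectorized env as in Example #2.
builder = AgentBuilder(obs_space)
experiments.train_agent_batch_with_evaluation(
    agent=builder.agent,
    env=make_batch_env(test=False),
    eval_env=make_batch_env(test=True),
    steps=10**6,
    eval_n_steps=None,
    eval_n_episodes=10,
    eval_interval=10**5,
    outdir='results',
    log_interval=1000,
)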
Example #2
def main():

    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--arch',
                        type=str,
                        default='Gaussian',
                        choices=('FFSoftmax', 'FFMellowmax', 'Gaussian'))
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--update-steps', type=int, default=5)
    parser.add_argument('--log-interval', type=int, default=1000)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-5)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        help='discount factor')
    parser.add_argument('--use-gae',
                        action='store_true',
                        default=False,
                        help='use generalized advantage estimation')
    parser.add_argument('--tau',
                        type=float,
                        default=0.95,
                        help='gae parameter')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--max-grad-norm',
                        type=float,
                        default=0.5,
                        help='maximum norm used for gradient clipping')
    parser.add_argument('--alpha',
                        type=float,
                        default=0.99,
                        help='RMSprop optimizer alpha')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--num-envs', type=int, default=1)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(process_idx=0, test=False)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to the action space type
    if args.arch == 'Gaussian':
        model = A2CGaussian(obs_space.low.size, action_space.low.size)
    elif args.arch == 'FFSoftmax':
        model = A2CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A2CFFMellowmax(obs_space.low.size, action_space.n)

    optimizer = chainer.optimizers.RMSprop(args.lr,
                                           eps=args.rmsprop_epsilon,
                                           alpha=args.alpha)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm))
    if args.weight_decay > 0:
        optimizer.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a2c.A2C(model,
                    optimizer,
                    gamma=args.gamma,
                    gpu=args.gpu,
                    num_processes=args.num_envs,
                    update_steps=args.update_steps,
                    use_gae=args.use_gae,
                    tau=args.tau)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            log_interval=args.log_interval,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
        )
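For reference on the --use-gae and --tau options above, a minimal NumPy sketch of generalized advantage estimation; it is illustrative only (episode boundaries are ignored) and is not the A2C implementation itself:

import numpy as np

def compute_gae(rewards, values, gamma=0.99, tau=0.95):
    # values must contain one extra bootstrap entry: len(values) == len(rewards) + 1.
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        # GAE accumulates discounted residuals with decay gamma * tau.
        gae = delta + gamma * tau * gae
        advantages[t] = gae
    return advantages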
Example #3
def main():

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--replay-start-size', type=int, default=10000)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.set_defaults(use_sdl=False)
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = acer.ACERSharedModel(
            shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    else:
        model = acer.ACERSharedModel(
            shared=links.NIPSDQNHead(),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=4e-3, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)
    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      beta=args.beta,
                      phi=dqn_phi)

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = ale.ALE(args.rom,
                      use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test,
                      seed=env_seed)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=args.max_episode_len,
                                      global_step_hooks=[lr_decay_hook])
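The lr_decay_hook above is built from experiments.LinearInterpolationHook(steps, start, stop, setter). Roughly, at every global step the hook computes a linearly interpolated value and hands it to the setter; a simplified sketch of that interpolation (an approximation, not the library code):

def interpolated_value(step, total_steps, start_value, stop_value):
    # Linear schedule: start_value at step 0, stop_value at total_steps and beyond.
    fraction = min(float(step) / total_steps, 1.0)
    return start_value + (stop_value - start_value) * fraction

# The hook then calls the setter, e.g.
#   lr_setter(env, agent, interpolated_value(step, args.steps, args.lr, 0))
# which writes the decayed learning rate into agent.optimizer.lr.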
Example #4
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--env', type=str, default='Hopper-v1')
    parser.add_argument('--arch',
                        type=str,
                        default='FFGaussian',
                        choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian'))
    parser.add_argument('--normalize-obs', action='store_true')
    parser.add_argument('--bound-mean', action='store_true')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--steps', type=int, default=10**6)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')

    parser.add_argument('--update-interval', type=int, default=2048)
    parser.add_argument('--batchsize', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--entropy-coef', type=float, default=0.0)
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(test):
        env = gym.make(args.env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if args.reward_scale_factor and not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to the action space type
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFGaussian':
        model = A3CFFGaussian(obs_space.low.size,
                              action_space,
                              bound_mean=args.bound_mean,
                              normalize_obs=args.normalize_obs)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = PPO(
        model,
        opt,
        gpu=args.gpu,
        phi=phi,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps_vf=None,
        entropy_coef=args.entropy_coef,
        standardize_advantages=args.standardize_advantages,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        # Linearly decay the clipping parameter to zero
        def clip_eps_setter(env, agent, value):
            agent.clip_eps = value

        clip_eps_decay_hook = experiments.LinearInterpolationHook(
            args.steps, 0.2, 0, clip_eps_setter)

        experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(False),
            eval_env=make_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit,
            step_hooks=[
                lr_decay_hook,
                clip_eps_decay_hook,
            ],
        )
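For context on the clip_eps that clip_eps_decay_hook anneals from 0.2 to 0: PPO maximizes a clipped surrogate objective over the probability ratio between the new and old policies. A NumPy sketch of that objective (illustrative only, not the agent's internal code):

import numpy as np

def clipped_surrogate(prob_ratio, advantage, clip_eps=0.2):
    # L^CLIP = E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)]
    clipped_ratio = np.clip(prob_ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    return np.minimum(prob_ratio * advantage, clipped_ratio * advantage).mean()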
Example #5
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--n-times-replay', type=int, default=8)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-frequency', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    n_hidden_channels = 200

    model = acer.ACERSeparateModel(
        pi=links.Sequence(
            L.Linear(obs_space.low.size, n_hidden_channels), F.relu,
            L.Linear(n_hidden_channels, action_space.n, wscale=1e-3),
            SoftmaxDistribution),
        q=links.Sequence(
            L.Linear(obs_space.low.size, n_hidden_channels), F.relu,
            L.Linear(n_hidden_channels, action_space.n, wscale=1e-3),
            DiscreteActionValue),
    )

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    replay_buffer = EpisodicReplayBuffer(10**5 // args.processes)
    agent = acer.DiscreteACER(model,
                              opt,
                              t_max=args.t_max,
                              gamma=0.99,
                              replay_buffer=replay_buffer,
                              n_times_replay=args.n_times_replay,
                              beta=args.beta,
                              phi=phi)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_frequency=args.eval_frequency,
                                      max_episode_len=timestep_limit)
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default='results')
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--update-steps', type=int, default=5)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        help='discount factor')
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-5)
    parser.add_argument('--use-gae',
                        action='store_true',
                        default=False,
                        help='use generalized advantage estimation')
    parser.add_argument('--tau',
                        type=float,
                        default=0.95,
                        help='gae parameter')
    parser.add_argument('--alpha',
                        type=float,
                        default=0.99,
                        help='RMSprop optimizer alpha')
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--max-grad-norm',
                        type=float,
                        default=40,
                        help='maximum norm used for gradient clipping')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--num-envs', type=int, default=1)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(0, test=False)

    n_actions = sample_env.action_space.n

    model = A2CFF(n_actions)
    optimizer = rmsprop_async.RMSpropAsync(lr=args.lr,
                                           eps=args.rmsprop_epsilon,
                                           alpha=args.alpha)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm))
    if args.weight_decay > 0:
        optimizer.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = a2c.A2C(
        model,
        optimizer,
        gamma=args.gamma,
        gpu=args.gpu,
        num_processes=args.num_envs,
        update_steps=args.update_steps,
        phi=phi,
        use_gae=args.use_gae,
        tau=args.tau,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )
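The phi passed to a2c.A2C above is not defined in this excerpt. In ChainerRL's Atari examples it is conventionally a feature extractor that casts the stacked uint8 frames to float32 and rescales them to [0, 1]; a plausible definition under that assumption:

import numpy as np

def phi(x):
    # Assumed preprocessing: convert stacked uint8 screens to float32 in [0, 1].
    return np.asarray(x, dtype=np.float32) / 255.0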
Example #7
def main():

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.set_defaults(use_sdl=False)
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = A3CLSTM(n_actions)
    else:
        model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(np.zeros((4, 84, 84), dtype=np.float32)[None],
                                name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph([model(fake_obs)],
                                                os.path.join(
                                                    args.outdir, 'model'))

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = a3c.A3C(model,
                    opt,
                    t_max=args.t_max,
                    gamma=0.99,
                    beta=args.beta,
                    phi=dqn_phi)
    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = ale.ALE(args.rom,
                      use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test,
                      seed=env_seed)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=args.max_episode_len,
                                      global_step_hooks=[lr_decay_hook])
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('config', help='YAML config file')
    parser.add_argument('outdir',
                        type=str,
                        help='directory to put training log')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger_level', type=int, default=logging.INFO)
    args = parser.parse_args()
    print_args(args)

    # init a logger
    logging.basicConfig(level=args.logger_level)

    # load yaml config file
    with open(args.config) as f:
        config = yaml.safe_load(f)

    # set random seed
    misc.set_random_seed(config['seed'])

    # create directory to put the results
    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    # save config file to outdir
    with open(os.path.join(args.outdir, 'config.yaml'), 'w') as f:
        yaml.dump(config, f, indent=4, default_flow_style=False)

    # define func to create env, target data sampler, and models
    if config['problem'] == 'toy':
        assert config['imsize'] == 3, 'invalid imsize'
        assert config['in_channel'] == 1, 'invalid in_channel'

        def make_env(process_idx, test):
            env = ToyEnv(config['imsize'])
            return env

        gen = SpiralToyModel(config['imsize'], config['conditional'])
        dis = SpiralToyDiscriminator(config['imsize'], config['conditional'])

        if config['conditional']:
            train_patterns = [(1, 4, 7), (0, 1, 2), (3, 4, 5), (2, 5, 8)]
            test_patterns = [(6, 7, 8)]
        else:
            train_patterns = [(1, 4, 7)]
            test_patterns = train_patterns

        dataset = ToyDataset(config['imsize'], train_patterns, test_patterns)

    else:
        # my paint env
        def make_env(process_idx, test):
            env = MyPaintEnv(max_episode_steps=config['max_episode_steps'],
                             imsize=config['imsize'],
                             pos_resolution=config['pos_resolution'],
                             brush_info_file=config['brush_info_file'])
            return env

        # generator
        gen = SpiralModel(config['imsize'], config['conditional'])
        dis = SpiralDiscriminator(config['imsize'], config['conditional'])

        if config['problem'] == 'mnist':
            single_label = config['mnist_target_label'] is not None
            dataset = MnistDataset(config['imsize'], single_label,
                                   config['mnist_target_label'],
                                   config['mnist_binarization'])
        elif config['problem'] == 'emnist':
            dataset = EMnistDataset(config['emnist_gz_images'],
                                    config['emnist_gz_labels'],
                                    config['emnist_single_label'])
        elif config['problem'] == 'jikei':
            dataset = JikeiDataset(config['jikei_npz'])
        elif config['problem'] == 'quickdraw':
            dataset = QuickdrawDataset(config['quickdraw_npz'])
        else:
            raise NotImplementedError()

    # initialize optimizers
    gen_opt = chainer.optimizers.Adam(alpha=config['lr'], beta1=0.5)
    dis_opt = chainer.optimizers.Adam(alpha=config['lr'], beta1=0.5)

    gen_opt.setup(gen)
    dis_opt.setup(dis)

    gen_opt.add_hook(chainer.optimizer.GradientClipping(40))
    dis_opt.add_hook(chainer.optimizer.GradientClipping(40))
    if config['weight_decay'] > 0:
        gen_opt.add_hook(NonbiasWeightDecay(config['weight_decay']))
        dis_opt.add_hook(NonbiasWeightDecay(config['weight_decay']))

    # init a SPIRAL agent
    agent = SPIRAL(
        generator=gen,
        discriminator=dis,
        gen_optimizer=gen_opt,
        dis_optimizer=dis_opt,
        dataset=dataset,
        conditional=config['conditional'],
        reward_mode=config['reward_mode'],
        imsize=config['imsize'],
        max_episode_steps=config['max_episode_steps'],
        rollout_n=config['rollout_n'],
        gamma=config['gamma'],
        beta=config['beta'],
        gp_lambda=config['gp_lambda'],
        lambda_R=config['lambda_R'],
        staying_penalty=config['staying_penalty'],
        empty_drawing_penalty=config['empty_drawing_penalty'],
        n_save_final_obs_interval=config['n_save_final_obs_interval'],
        outdir=args.outdir)

    # load from a snapshot
    if args.load:
        agent.load(args.load)

    # training mode
    max_episode_len = config['max_episode_steps'] * config['rollout_n']
    steps = config['processes'] * config['n_update'] * max_episode_len

    save_interval = config['processes'] * config[
        'n_save_interval'] * max_episode_len
    eval_interval = config['processes'] * config[
        'n_eval_interval'] * max_episode_len

    step_hook = SpiralStepHook(config['max_episode_steps'], save_interval,
                               args.outdir)

    if config['processes'] == 1:
        # run in a single process to make debugging easier
        agent.process_idx = 0
        env = make_env(0, False)

        experiments.train_agent_with_evaluation(
            agent=agent,
            outdir=args.outdir,
            env=env,
            steps=steps,
            eval_n_runs=config['eval_n_runs'],
            eval_interval=eval_interval,
            max_episode_len=max_episode_len,
            step_hooks=[step_hook],
            save_best_so_far_agent=False)
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=config['processes'],
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=steps,
                                      eval_n_runs=config['eval_n_runs'],
                                      eval_interval=eval_interval,
                                      max_episode_len=max_episode_len,
                                      global_step_hooks=[step_hook],
                                      save_best_so_far_agent=False)
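The YAML file loaded above is not shown. Collecting the keys this script actually reads gives roughly the structure below, written as a Python dict for illustration; every value is a placeholder, not a recommended setting:

config = {
    'seed': 0,
    'problem': 'toy',          # 'toy', 'mnist', 'emnist', 'jikei', or 'quickdraw'
    'imsize': 3,
    'in_channel': 1,           # checked only for the toy problem
    'conditional': False,
    'lr': 1e-4,
    'weight_decay': 0.0,
    'max_episode_steps': 10,
    'rollout_n': 1,
    'gamma': 0.99,
    'beta': 1e-2,
    'gp_lambda': 10.0,
    'lambda_R': 1.0,
    'staying_penalty': 0.0,
    'empty_drawing_penalty': 0.0,
    'n_save_final_obs_interval': 1000,
    'reward_mode': '...',      # passed straight to SPIRAL; valid values depend on the agent
    'processes': 1,
    'n_update': 10**4,
    'n_save_interval': 100,
    'n_eval_interval': 100,
    'eval_n_runs': 10,
    # Needed only for the non-toy (MyPaintEnv) branch:
    # 'pos_resolution', 'brush_info_file', and the dataset-specific keys
    # 'mnist_target_label', 'mnist_binarization', 'emnist_gz_images',
    # 'emnist_gz_labels', 'emnist_single_label', 'jikei_npz', 'quickdraw_npz'.
}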
Example #9
    def __init__(self, args, sample_env):
        obs_space = sample_env.observation_space
        action_space = sample_env.action_space

        # Normalize observations based on their empirical mean and variance
        obs_normalizer = chainerrl.links.EmpiricalNormalization(
            obs_space.low.size, clip_threshold=5)

        # Switch policy types according to the action space type
        if args.arch == 'FFSoftmax':
            #model = A3CFFSoftmax(obs_space.low.size, action_space.n)
            model = A3CFFSoftmax(obs_space.low.size,
                                 sample_env.env_prop.get_softmax_layer_size(),
                                 n_hidden_channels=600,
                                 beta=cfg.beta)
        elif args.arch == 'FFMellowmax':
            model = A3CFFMellowmax(obs_space.low.size, action_space.n)
        elif args.arch == 'FFGaussian':
            model = A3CFFGaussian(obs_space.low.size,
                                  action_space,
                                  bound_mean=args.bound_mean,
                                  n_hidden_channels=cfg.n_hidden_channels)
        elif args.arch == 'FFParamSoftmax':
            model = A3CFFParamSoftmax(
                obs_space.low.size,
                sample_env.env_prop.get_pre_output_layer_size(),
                sample_env.env_prop.get_parametric_segments(),
                sample_env.env_prop.get_parametric_softmax_segments_sizes(),
                n_hidden_channels=600,
                beta=cfg.beta)
        else:
            raise NotImplementedError

        opt = chainer.optimizers.Adam(alpha=args.adam_lr, eps=1e-5)
        opt.setup(model)
        if args.weight_decay > 0:
            opt.add_hook(NonbiasWeightDecay(args.weight_decay))

        # a workaround for saving obs_normalizer
        # see https://github.com/chainer/chainerrl/issues/376
        if 'obs_normalizer' not in PPO.saved_attributes:
            PPO.saved_attributes.append('obs_normalizer')

        agent = PPO(
            model,
            opt,
            obs_normalizer=obs_normalizer,
            gpu=args.gpu,
            phi=lambda x: x.astype(np.float32, copy=False),
            gamma=args.ppo_gamma,
            lambd=args.ppo_lambda,
            update_interval=args.ppo_update_interval,
            minibatch_size=args.batchsize,
            epochs=args.epochs,
            clip_eps_vf=None,
            entropy_coef=args.entropy_coef,
            standardize_advantages=args.standardize_advantages,
        )

        if args.load:
            agent.load(args.load)

        self._agent = agent
Example #10
def main():
    import logging

    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--arch',
                        type=str,
                        default='FFSoftmax',
                        choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian'))
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--frame-buffer-length', type=int, default=4)
    parser.add_argument('--render-b2w', action='store_true', default=False)
    # Additional params
    parser.add_argument('--min_reward', type=float, default=sys.float_info.min)

    args = parser.parse_args()

    def f_trim_reward(x, min_reward=args.min_reward):
        # Zero out rewards below the threshold; log any non-zero reward kept.
        if x < min_reward:
            x = 0
        elif x != 0:
            print('non-zero reward passed through:', x)
        return x

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: f_trim_reward(x) * args.reward_scale_factor)
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)

        env = env_gym_chainer.GymEnvironment(
            env,
            res_width=X_SHAPE,
            res_height=Y_SHAPE,
            agent_history_length=args.frame_buffer_length,
            render=args.render_b2w)

        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to the action space type
    model = A3CFF(action_space.n)

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    # Clipping by gradient norm (changed from 40 to 10)
    # opt.add_hook(chainer.optimizer.GradientClipping(10))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = SaliencyA3C(model,
                        opt,
                        t_max=args.t_max,
                        gamma=0.99,
                        beta=args.beta,
                        phi=phi)
    if args.load:
        agent.load(args.load)

    if args.demo:
        with chainer.using_config("train", False):
            env = make_env(0, True)
            eval_stats = experiments.eval_performance(
                env=env,
                agent=agent,
                n_runs=args.eval_n_runs,
                max_episode_len=timestep_limit)
            print('n_runs: {} mean: {} median: {} stdev {}'.format(
                args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
                eval_stats['stdev']))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)
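In the make_env above, training rewards pass through two env modifiers: first make_reward_filtered with f_trim_reward(x) * args.reward_scale_factor, then make_reward_clipped(env, -1, 1). Assuming each modifier wraps the previous one's step function, the effective transform seen by the agent is roughly:

def transformed_reward(x, min_reward, scale):
    # 1. f_trim_reward: zero out rewards below the threshold.
    x = 0 if x < min_reward else x
    # 2. Scale by --reward-scale-factor.
    x = x * scale
    # 3. make_reward_clipped(env, -1, 1): clip to [-1, 1].
    return max(-1.0, min(1.0, x))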
Example #11
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--env', type=str, default='Hopper-v2')
    parser.add_argument('--num-envs', type=int, default=1)
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=10**6)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--window-size', type=int, default=100)

    parser.add_argument('--update-interval', type=int, default=2048)
    parser.add_argument('--log-interval', type=int, default=1000)
    parser.add_argument('--batchsize', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--entropy-coef', type=float, default=0.0)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    # Only for getting timesteps, and obs-action spaces
    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size,
                                                            clip_threshold=5)

    winit_last = chainer.initializers.LeCunNormal(1e-2)

    # Switch policy types according to the action space type
    if isinstance(action_space, gym.spaces.Discrete):
        n_actions = action_space.n
        policy = chainer.Sequential(
            L.Linear(None, 64),
            F.tanh,
            L.Linear(None, 64),
            F.tanh,
            L.Linear(None, n_actions, initialW=winit_last),
            chainerrl.distribution.SoftmaxDistribution,
        )
    elif isinstance(action_space, gym.spaces.Box):
        action_size = action_space.low.size
        policy = chainer.Sequential(
            L.Linear(None, 64),
            F.tanh,
            L.Linear(None, 64),
            F.tanh,
            L.Linear(None, action_size, initialW=winit_last),
            chainerrl.policies.GaussianHeadWithStateIndependentCovariance(
                action_size=action_size,
                var_type='diagonal',
                var_func=lambda x: F.exp(2 * x),  # Parameterize log std
                var_param_init=0,  # log std = 0 => std = 1
            ),
        )
    else:
        print("""\
This example only supports gym.spaces.Box or gym.spaces.Discrete action spaces."""
              )  # NOQA
        return

    vf = chainer.Sequential(
        L.Linear(None, 64),
        F.tanh,
        L.Linear(None, 64),
        F.tanh,
        L.Linear(None, 1),
    )

    # Combine a policy and a value function into a single model
    model = chainerrl.links.Branched(policy, vf)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = PPO(
        model,
        opt,
        obs_normalizer=obs_normalizer,
        gpu=args.gpu,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps_vf=None,
        entropy_coef=args.entropy_coef,
        standardize_advantages=args.standardize_advantages,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_batch_env(True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(False),
            eval_env=make_batch_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            return_window_size=args.window_size,
            max_episode_len=timestep_limit,
            save_best_so_far_agent=False,
            step_hooks=[
                lr_decay_hook,
            ],
        )
Example #12
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2 ** 31

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2 ** 31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_env_check():
        # Use different random seeds for train and test envs
        env_seed = args.seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=True,
            clip_rewards=True)
        env.seed(int(env_seed))
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv(
            [functools.partial(make_env, idx, test)
             for idx in range(args.num_envs)])

    sample_env = make_env(0, test=False)

    n_actions = sample_env.action_space.n

    model = A2CFF(n_actions)
    optimizer = rmsprop_async.RMSpropAsync(lr=args.lr,
                                           eps=args.rmsprop_epsilon,
                                           alpha=args.alpha)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm))
    if args.weight_decay > 0:
        optimizer.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = a2c.A2C(
        model, optimizer, gamma=args.gamma,
        gpu=args.gpu,
        num_processes=args.num_envs,
        update_steps=args.update_steps,
        phi=phi,
        use_gae=args.use_gae,
        tau=args.tau,
    )

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_frequency,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
            log_type=args.log_type
        )
    elif args.mode == 'check':
        return tools.make_video.check(env=make_env_check(), agent=agent,
                                      save_mp4=args.save_mp4)

    elif args.mode == 'growth':
        return tools.make_video.growth(env=make_env_check(), agent=agent,
                                       outdir=args.outdir, max_num=args.max_frames,
                                       save_mp4=args.save_mp4)
Example #13
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--outdir', type=str, default='a3c_training', help='Directory path to save output files. If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='TTT-A3C-v0')
    parser.add_argument('--seed', type=int, default=17, help='Random seed [0, 2 ** 32)')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=5*10 ** 5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--eval-interval', type=int, default=10 ** 5)
    parser.add_argument('--arch', type=str, default='FFSoftmax', choices=('FFSoftmax',))
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e0)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=1*1e-4)
    parser.add_argument('--weight-decay', type=float, default=0)
    parser.add_argument('--logger-level', type=int, default=logging.ERROR)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be deterministic even with the same random seed.
    misc.set_random_seed(args.seed)
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)

        # NOTE: uncomment the next line to start from a pretrained agent
        # env.set_agent(gym_ttt.pretrained_agent.get_pretrained_agent("./"))
        return env

    sample_env = gym.make(args.env)

    # number of steps after which an episode is ended (whether the game is over or not)
    timestep_limit = sample_env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')

    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Initialize the NN and the optimizer
    model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.8, beta=args.beta)
    if args.load:
        agent.load(args.load)

    # draw the policy and state value network
    fake_obs = np.zeros((1, 3, 3), dtype=np.float32)
    chainerrl.misc.draw_computational_graph(
        [agent.model.pi_and_v(fake_obs)[0]],
        os.path.join(args.outdir, 'model_pi'))
    chainerrl.misc.draw_computational_graph(
        [agent.model.pi_and_v(fake_obs)[1]],
        os.path.join(args.outdir, 'model_v'))

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env             = env,
            agent           = agent,
            n_steps         = None,
            n_episodes      = args.eval_n_runs,
            max_episode_len = timestep_limit)

        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))

    else:
        experiments.train_agent_async(
            agent           = agent,
            outdir          = args.outdir,
            processes       = args.processes,
            make_env        = make_env,
            profile         = args.profile,
            steps           = args.steps,
            eval_n_runs     = args.eval_n_runs,
            eval_interval   = args.eval_interval,
            max_episode_len = timestep_limit)
Example #14
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one process (i.e. processes > 1),
    # the results will no longer be deterministic
    # even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    n_actions = gym.make(args.env).action_space.n

    if args.use_lstm:
        model = A3CLSTM(n_actions)
    else:
        model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(np.zeros((4, 84, 84), dtype=np.float32)[None],
                                name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph([model(fake_obs)],
                                                os.path.join(
                                                    args.outdir, 'model'))

    # Start from args.lr so the initial value matches the decay schedule below
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    def phi(x):
        # Feature extractor
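        # Scale uint8 Atari frames from [0, 255] to float32 values in [0, 1]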
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(model,
                    opt,
                    t_max=args.t_max,
                    gamma=0.99,
                    beta=args.beta,
                    phi=phi)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_env_check():
        # Use different random seeds for train and test envs
        env_seed = args.seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=True,
                                           clip_rewards=True)
        env.seed(int(env_seed))
        return env

    if args.load_agent:
        agent.load(args.load_agent)

    if (args.mode == 'train'):
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_frequency,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
            log_type=args.log_type)
    elif (args.mode == 'check'):
        return tools.make_video.check(env=make_env_check(),
                                      agent=agent,
                                      save_mp4=args.save_mp4)

    elif (args.mode == 'growth'):
        return tools.make_video.growth(env=make_env_check(),
                                       agent=agent,
                                       outdir=args.outdir,
                                       max_num=args.max_frames,
                                       save_mp4=args.save_mp4)
Example #15
    # Define agents to be used
    model = A3Cagent(obs_size, n_actions, args.nHidden)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(np.zeros(obs_size, dtype=np.float32)[None], name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph(
                [model(fake_obs)],
                os.path.join(args.outdir, 'model'))

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.eps, alpha=args.alpha)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(args.gclipping))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

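    # Cast observations to the model's float32 dtype, avoiding a copy when possible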
    phi = lambda x: x.astype(np.float32, copy=False)

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=args.gamma,
                    beta=args.beta, phi=phi)

    lr_decay_hook = experiments.LinearInterpolationHook(args.steps, args.lr, 0, lr_setter)

    training = experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.threads,
            make_env=make_env,
            profile=False,
            steps=args.steps,
Example #16
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--env', type=str, default='Hopper-v2')
    parser.add_argument('--num-envs', type=int, default=1)
    parser.add_argument('--arch',
                        type=str,
                        default='FFGaussian',
                        choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian'))
    parser.add_argument('--bound-mean', action='store_true')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=10**6)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--window-size', type=int, default=100)

    parser.add_argument('--update-interval', type=int, default=2048)
    parser.add_argument('--log-interval', type=int, default=1000)
    parser.add_argument('--batchsize', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--entropy-coef', type=float, default=0.0)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            # Bind idx as a default argument; otherwise every lambda would see
            # the final loop value when it is called later (late binding).
            (lambda idx=idx: make_env(idx, test))
            for idx in range(args.num_envs)
        ])

    # Only used to get the timestep limit and the observation/action spaces
    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Normalize observations based on their empirical mean and variance
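    # clip_threshold=5 clips each normalized observation dimension to [-5, 5]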
    obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size,
                                                            clip_threshold=5)

    # Switch policy types according to action space types
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFGaussian':
        model = A3CFFGaussian(obs_space.low.size,
                              action_space,
                              bound_mean=args.bound_mean)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = PPO(
        model,
        opt,
        obs_normalizer=obs_normalizer,
        gpu=args.gpu,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps_vf=None,
        entropy_coef=args.entropy_coef,
        standardize_advantages=args.standardize_advantages,
    )

    if args.load:
        agent.load(args.load)

    # Linearly decay the learning rate to zero
    def lr_setter(env, agent, value):
        agent.optimizer.alpha = value

    lr_decay_hook = experiments.LinearInterpolationHook(
        args.steps, args.lr, 0, lr_setter)

    # Linearly decay the clipping parameter to zero
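    # (PPO clips the probability ratio to [1 - clip_eps, 1 + clip_eps], so a
    # smaller clip_eps makes policy updates more conservative.)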
    def clip_eps_setter(env, agent, value):
        agent.clip_eps = value

    clip_eps_decay_hook = experiments.LinearInterpolationHook(
        args.steps, 0.2, 0, clip_eps_setter)

    experiments.train_agent_batch_with_evaluation(
        agent=agent,
        env=make_batch_env(False),
        eval_env=make_batch_env(True),
        outdir=args.outdir,
        steps=args.steps,
        eval_n_runs=args.eval_n_runs,
        eval_interval=args.eval_interval,
        log_interval=args.log_interval,
        return_window_size=args.window_size,
        max_episode_len=timestep_limit,
        save_best_so_far_agent=False,
        step_hooks=[
            lr_decay_hook,
            clip_eps_decay_hook,
        ],
    )
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--enemy_step_width', default=2, type=int)
    parser.add_argument('-l',
                        '--level',
                        default=0,
                        type=int,
                        help='difficulty')
    parser.add_argument('-o', '--out_dir', default=None)
    parser.add_argument('-p', '--player_step_width', default=4, type=int)
    parser.add_argument('-r', '--random_seed', default=None, type=int)
    args = parser.parse_args()

    print(argv2line(sys.argv))
    print()
    print_args(args)
    print()

    if args.out_dir is None:
        out_dir = 'results_' + dt.now().strftime('%Y%m%d%H%M%S')
    else:
        out_dir = args.out_dir
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    assert os.path.isdir(out_dir)

    np.random.seed(args.random_seed)
    envs = [
        DanmakuEnv(level=args.level, random_seed=rs)
        for rs in np.random.randint(np.iinfo(np.uint32).max, size=N_PROCESSES)
    ]
    obs_space = envs[0].observation_space
    action_space = envs[0].action_space

    model = Model(obs_space.shape[0], action_space.n)
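    # One dummy forward pass so lazily-initialized link parameters get their shapes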
    model(
        np.random.uniform(
            size=obs_space.shape).astype('float32')[np.newaxis, :, :, :])

    opt = RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(GradientClipping(40))
    opt.add_hook(NonbiasWeightDecay(1e-4))

    agents = [
        A3C(model, opt, t_max=10, gamma=0.99, beta=1e-2, process_idx=idx)
        for idx in range(N_PROCESSES)
    ]

    states = [env.reset() for env in envs]
    rewards = [0.] * N_PROCESSES
    episode_count = 0
    step_count = 0
    print('episode: {0:06d}'.format(episode_count + 1))

    while step_count < N_STEPS:
        results = [
            train_one_step(idx, env, agent, state, reward)
            for idx, env, agent, state, reward in zip(range(N_PROCESSES), envs,
                                                      agents, states, rewards)
        ]
        states = [result[1] for result in results]
        rewards = [result[2] for result in results]

        step_count += 1

        print(dt.now())
        print('passed steps: {0:07d}'.format(step_count))
        print('statistics: {}'.format(agents[0].get_statistics()))

        if envs[0].t == 0:
            episode_count += 1
            print('episode: {0:06d}'.format(episode_count + 1))

        if step_count % SAVE_INTERVAL == 0:
            save_agent_path = os.path.join(
                out_dir, 'agent_step_{0:07d}_'.format(step_count) +
                dt.now().strftime('%Y%m%d%H%M%S'))
            agents[0].save(save_agent_path)
            print('save ' + save_agent_path)
Example #18
def main():

    # Prevent numpy from using multiple threads
    os.environ['OMP_NUM_THREADS'] = '1'

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--replay-start-size', type=int, default=10000)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-frequency', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.set_defaults(use_sdl=False)
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = acer.ACERSharedModel(
            shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    else:
        model = acer.ACERSharedModel(
            shared=links.NIPSDQNHead(),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=4e-3, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
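    # Split the replay capacity across worker processes so total replay memory
    # stays around 10**6 transitions regardless of the number of processes.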
    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)
    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      beta=args.beta,
                      phi=dqn_phi)

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        env = ale.ALE(args.rom,
                      use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)

        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_frequency=args.eval_frequency,
                                      max_episode_len=args.max_episode_len)
Example #19
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--arch',
                        type=str,
                        default='FFSoftmax',
                        choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian'))
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to action space types
    if args.arch == 'LSTMGaussian':
        model = A3CLSTMGaussian(obs_space.low.size, action_space.low.size)
    elif args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_steps=None,
                                      eval_n_episodes=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)
Example #20
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(process_idx=0, test=False)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to action space types
    if args.arch == 'Gaussian':
        model = A2CGaussian(obs_space.low.size, action_space.low.size)
    elif args.arch == 'FFSoftmax':
        model = A2CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A2CFFMellowmax(obs_space.low.size, action_space.n)

    optimizer = chainer.optimizers.RMSprop(args.lr,
                                           eps=args.rmsprop_epsilon,
                                           alpha=args.alpha)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm))
    if args.weight_decay > 0:
        optimizer.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a2c.A2C(model,
                    optimizer,
                    gamma=args.gamma,
                    gpu=args.gpu,
                    num_processes=args.num_envs,
                    update_steps=args.update_steps,
                    use_gae=args.use_gae,
                    tau=args.tau)
    if args.load_agent:
        agent.load(args.load_agent)

    if (args.mode == 'train'):
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            log_interval=args.log_interval,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_freq,
            log_type=args.log_type)
    elif (args.mode == 'check'):
        from matplotlib import animation
        import matplotlib.pyplot as plt

        frames = []
        env = make_env(process_idx=0, test=True)
        for i in range(3):
            obs = env.reset()
            done = False
            R = 0
            t = 0
            while not done and t < 200:
                frames.append(env.render(mode='rgb_array'))
                action = agent.act(obs)
                obs, r, done, _ = env.step(action)
                R += r
                t += 1
            print('test episode:', i, 'R:', R)
            agent.stop_episode()
        env.close()

        from IPython.display import HTML
        plt.figure(figsize=(frames[0].shape[1] / 72.0,
                            frames[0].shape[0] / 72.0),
                   dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(),
                                       animate,
                                       frames=len(frames),
                                       interval=50)
        anim.save(args.save_mp4)
        return anim
Example #21
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int, default=4)  # increase for more asynchronous workers
    parser.add_argument('--outdir', type=str, default='a3c_training', help='Directory path to save output files. If it does not exist, it will be created.')  # set directory to which output files will be written
    parser.add_argument('--env', type=str, default='1DIsing-A3C-v0')  # specify environment to explore

    parser.add_argument('--steps', type=int, default=1 * 10 ** 7)  # maximum number of steps before training ends
    parser.add_argument('--eval-interval', type=int, default=10**4)  # frequency at which the agent will be evaluated
    parser.add_argument('--eval-n-runs', type=int, default=10)  # number of evaluation runs per evaluation
    parser.add_argument('--arch', type=str, default='FFSoftmax', choices=('FFSoftmax',))  # NN to use for policy and state value estimates
    parser.add_argument('--t-max', type=int, default=5)  # increase for later truncation of the sum
    parser.add_argument('--beta', type=float, default=1e-2)    # increase for more exploration
    parser.add_argument('--gamma', type=float, default=0.99)    # increase for less discount of future rewards
    parser.add_argument('--lr', type=float, default=1 * 1e-4)  # decrease for slower learning rate
    parser.add_argument('--weight-decay', type=float, default=0)  # turn on to get weight decay

    parser.add_argument('--seed', type=int, default=17, help='Random seed [0, 2 ** 32)')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e0)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--logger-level', type=int, default=logging.ERROR)  # set to logging.DEBUG for (much more) information
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.max_episode_steps
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    model = A3CFFSoftmax(obs_space.low.size, action_space.n)

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=args.gamma, beta=args.beta)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
Example #22
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    n_actions = gym.make(args.env).action_space.n

    if args.use_lstm:
        model = acer.ACERSharedModel(
            shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    else:
        model = acer.ACERSharedModel(
            shared=links.NIPSDQNHead(),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=4e-3, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      beta=args.beta,
                      phi=phi)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_env_check():
        # Use different random seeds for train and test envs
        env_seed = args.seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=True,
                                           clip_rewards=True)
        env.seed(int(env_seed))
        return env

    if args.load_agent:
        agent.load(args.load_agent)

    if (args.mode == 'train'):

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
    elif (args.mode == 'check'):
        return tools.make_video.check(env=make_env_check(),
                                      agent=agent,
                                      save_mp4=args.save_mp4)

    elif (args.mode == 'growth'):
        return tools.make_video.growth(env=make_env_check(),
                                       agent=agent,
                                       outdir=args.outdir,
                                       max_num=args.max_frames,
                                       save_mp4=args.save_mp4)
Example #23
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--arch',
                        type=str,
                        default='FFSoftmax',
                        choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian'))
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to action space types
    if args.arch == 'LSTMGaussian':
        model = A3CLSTMGaussian(obs_space.low.size, action_space.low.size)
    elif args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model,
                    opt,
                    t_max=args.t_max,
                    gamma=0.99,
                    beta=args.beta,
                    phi=phi)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)
Example #24
def main():

    # Prevent numpy from using multiple threads
    os.environ['OMP_NUM_THREADS'] = '1'

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.set_defaults(use_sdl=False)
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = A3CLSTM(n_actions)
    else:
        model = A3CFF(n_actions)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = a3c.A3C(model,
                    opt,
                    t_max=args.t_max,
                    gamma=0.99,
                    beta=args.beta,
                    phi=dqn_phi)
    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        env = ale.ALE(args.rom,
                      use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=args.max_episode_len,
                                      global_step_hooks=[lr_decay_hook])
Example #25
def main():

    # Prevent numpy from using multiple threads
    os.environ['OMP_NUM_THREADS'] = '1'

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('rom', type=str)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=2.5e-4)

    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')

    # In the original paper, the agent runs in 8 environments in parallel
    # and samples 128 steps per environment.
    # Here, a single environment samples 128 * 8 steps instead.
    parser.add_argument('--update-interval', type=int, default=128 * 8)

    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--epochs', type=int, default=3)
    parser.set_defaults(use_sdl=False)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    model = A3CFF(n_actions)
    opt = chainer.optimizers.Adam(alpha=args.lr)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = PPO(
        model,
        opt,
        gpu=args.gpu,
        phi=dqn_phi,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps=0.1,
        clip_eps_vf=None,
        standardize_advantages=args.standardize_advantages,
    )
    if args.load:
        agent.load(args.load)

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = 2**31 - 1 - args.seed if test else args.seed
        env = ale.ALE(args.rom,
                      use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test,
                      seed=env_seed)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
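            # Chainer's Adam takes its learning rate as `alpha`, so the hook
            # updates agent.optimizer.alpha (RMSprop-based examples set `lr`).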
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        # Linearly decay the clipping parameter to zero
        def clip_eps_setter(env, agent, value):
            agent.clip_eps = value

        clip_eps_decay_hook = experiments.LinearInterpolationHook(
            args.steps, 0.1, 0, clip_eps_setter)

        experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(False),
            eval_env=make_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=args.max_episode_len,
            step_hooks=[
                lr_decay_hook,
                clip_eps_decay_hook,
            ],
        )
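
As a sanity check on the PPO defaults above (update interval of 128 * 8 transitions, minibatch size 32, 3 epochs), the number of gradient steps performed per update is plain arithmetic; the snippet below is an illustration, not chainerrl code:

# Gradient steps per PPO update implied by the defaults above (illustration only).
update_interval = 128 * 8      # transitions gathered before each update
minibatch_size = 32
epochs = 3

minibatches_per_epoch = update_interval // minibatch_size    # 32
gradient_steps_per_update = minibatches_per_epoch * epochs   # 96
print(gradient_steps_per_update)
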
Example #26
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument(
        '--max-episode-len',
        type=int,
        default=5 * 60 * 60 // 4,  # 5 minutes with 60/4 fps
        help='Maximum number of steps for each episode.')
    parser.add_argument('--lr', type=float, default=2.5e-4)

    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')

    # In the original paper, the agent runs in 8 environments in parallel
    # and samples 128 steps per environment. Here a single environment is
    # used, so sample 128 * 8 steps per update instead.
    parser.add_argument('--update-interval', type=int, default=128 * 8)

    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--epochs', type=int, default=3)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n

    model = A3CFF(n_actions)
    opt = chainer.optimizers.Adam(alpha=args.lr)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    def phi(x):
        # Feature extractor: cast uint8 screen frames to float32 in [0, 1]
        return np.asarray(x, dtype=np.float32) / 255

    agent = PPO(
        model,
        opt,
        gpu=args.gpu,
        phi=phi,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps=0.1,
        clip_eps_vf=None,
        standardize_advantages=args.standardize_advantages,
    )
    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        # Linearly decay the clipping parameter to zero
        def clip_eps_setter(env, agent, value):
            agent.clip_eps = max(value, 1e-8)

        clip_eps_decay_hook = experiments.LinearInterpolationHook(
            args.steps, 0.1, 0, clip_eps_setter)

        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            eval_env=eval_env,
            outdir=args.outdir,
            steps=args.steps,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            train_max_episode_len=args.max_episode_len,
            save_best_so_far_agent=False,
            step_hooks=[
                lr_decay_hook,
                clip_eps_decay_hook,
            ],
        )
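
The max(value, 1e-8) floor in clip_eps_setter above keeps PPO's clipping range strictly positive even at the final step, where the linear schedule would otherwise reach exactly zero. A small sketch of the resulting schedule (illustration only; clip_eps_at is a hypothetical helper, assuming the default of 10**7 steps):

# Clipping-parameter schedule implied by the decay hook above (illustration only).
def clip_eps_at(step, total_steps=10**7, start=0.1, floor=1e-8):
    value = start * (1.0 - min(float(step) / total_steps, 1.0))  # linear decay to 0
    return max(value, floor)

print(clip_eps_at(0))           # 0.1
print(clip_eps_at(5 * 10**6))   # 0.05
print(clip_eps_at(10**7))       # 1e-08: floored, never exactly zero
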
Example #27
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--processes', type=int, default=16)
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = gym.make(args.env).action_space.n

    model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(np.zeros((4, 84, 84), dtype=np.float32)[None],
                                name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph([model(fake_obs)],
                                                os.path.join(
                                                    args.outdir, 'model'))

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    def phi(x):
        # Feature extractor: cast uint8 screen frames to float32 in [0, 1]
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(model,
                    opt,
                    t_max=args.t_max,
                    gamma=0.99,
                    beta=args.beta,
                    phi=phi)

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_steps=args.eval_n_steps,
                                                  n_episodes=None)
        print('n_steps: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_steps, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
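
To make the seed bookkeeping in the example above concrete, the per-process train and test seeds for seed=0 and 4 processes evaluate as follows (plain NumPy, illustration only):

# Per-process seed derivation used above, evaluated for seed=0, processes=4.
import numpy as np

seed, processes = 0, 4
process_seeds = np.arange(processes) + seed * processes  # [0 1 2 3]
train_seeds = process_seeds
test_seeds = 2**31 - 1 - process_seeds                   # [2147483647 2147483646 2147483645 2147483644]
print(train_seeds, test_seeds)
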
Example #28
0
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('algo', default='ppo', choices=['ppo', 'gail', 'airl'], type=str)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--env', type=str, default='Hopper-v2')
    parser.add_argument('--arch', type=str, default='FFGaussian',
                        choices=('FFSoftmax', 'FFMellowmax',
                                 'FFGaussian'))
    parser.add_argument('--bound-mean', action='store_true')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=10 ** 6)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--load_demo', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')

    parser.add_argument('--update-interval', type=int, default=2048)
    parser.add_argument('--batchsize', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--entropy-coef', type=float, default=0.0)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if not (args.demo and args.load):
        args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(
        obs_space.low.size, clip_threshold=5)

    # Switch policy types according to the action space type
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFGaussian':
        model = A3CFFGaussian(obs_space.low.size, action_space,
                              bound_mean=args.bound_mean)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    if args.algo == 'ppo':
        agent = PPO(model, opt,
                    obs_normalizer=obs_normalizer,
                    gpu=args.gpu,
                    update_interval=args.update_interval,
                    minibatch_size=args.batchsize, epochs=args.epochs,
                    clip_eps_vf=None, entropy_coef=args.entropy_coef,
                    standardize_advantages=args.standardize_advantages,
                    )
    elif args.algo == 'gail':
        import numpy as np
        from irl.gail import GAIL
        from irl.gail import Discriminator
        demonstrations = np.load(args.load_demo)
        D = Discriminator(gpu=args.gpu)
        agent = GAIL(demonstrations=demonstrations, discriminator=D,
                     model=model, optimizer=opt,
                     obs_normalizer=obs_normalizer,
                     gpu=args.gpu,
                     update_interval=args.update_interval,
                     minibatch_size=args.batchsize, epochs=args.epochs,
                     clip_eps_vf=None, entropy_coef=args.entropy_coef,
                     standardize_advantages=args.standardize_advantages,)
    elif args.algo == 'airl':
        import numpy as np
        from irl.airl import AIRL as Agent
        from irl.airl import Discriminator
        # obs_normalizer = None
        demonstrations = np.load(args.load_demo)
        D = Discriminator(gpu=args.gpu)
        agent = Agent(demonstrations=demonstrations, discriminator=D,
                      model=model, optimizer=opt,
                      obs_normalizer=obs_normalizer,
                      gpu=args.gpu,
                      update_interval=args.update_interval,
                      minibatch_size=args.batchsize, epochs=args.epochs,
                      clip_eps_vf=None, entropy_coef=args.entropy_coef,
                      standardize_advantages=args.standardize_advantages,)

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
        outdir = args.load if args.load else args.outdir
        save_agent_demo(make_env(False), agent, outdir)
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        # Linearly decay the clipping parameter to zero
        def clip_eps_setter(env, agent, value):
            agent.clip_eps = max(value, 1e-8)

        clip_eps_decay_hook = experiments.LinearInterpolationHook(
            args.steps, 0.2, 0, clip_eps_setter)

        experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(False),
            eval_env=make_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            train_max_episode_len=timestep_limit,
            save_best_so_far_agent=False,
            step_hooks=[
                lr_decay_hook,
                clip_eps_decay_hook,
            ],
        )
        save_agent_demo(make_env(False), agent, args.outdir)
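
The EmpiricalNormalization link used above standardizes observations with running estimates of their mean and variance and clips the result to a threshold (clip_threshold=5 here). The sketch below shows the idea in plain NumPy; it is an illustration, not chainerrl.links.EmpiricalNormalization, and the names RunningNormalizer, clip and eps are made up for this example:

# Minimal running-normalization sketch (illustration only).
import numpy as np

class RunningNormalizer(object):

    def __init__(self, size, clip=5.0, eps=1e-2):
        self.mean = np.zeros(size, dtype=np.float64)
        self.var = np.zeros(size, dtype=np.float64)
        self.count = 0
        self.clip = clip
        self.eps = eps

    def update(self, batch):
        # Welford-style incremental update of mean and (population) variance.
        for x in np.asarray(batch, dtype=np.float64):
            self.count += 1
            delta = x - self.mean
            self.mean += delta / self.count
            self.var += (delta * (x - self.mean) - self.var) / self.count

    def __call__(self, x):
        z = (np.asarray(x, dtype=np.float64) - self.mean) / np.sqrt(self.var + self.eps)
        return np.clip(z, -self.clip, self.clip)
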
Example #29
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--replay-start-size', type=int, default=10000)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = gym.make(args.env).action_space.n

    if args.use_lstm:
        model = acer.ACERSharedModel(
            shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    else:
        model = acer.ACERSharedModel(
            shared=links.NIPSDQNHead(),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=4e-3, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
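    # Split the replay capacity across the worker processes so that, together,
    # they hold roughly 10**6 transitions.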
    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)

    def phi(x):
        # Feature extractor: cast uint8 screen frames to float32 in [0, 1]
        return np.asarray(x, dtype=np.float32) / 255

    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      beta=args.beta,
                      phi=phi)

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('config', nargs='?', help='YAML config file',
                        default='settings/photo_enhancement.yaml')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--load_generator', type=str, default='generator_weights_for_demo.npz')
    parser.add_argument('--logger_level', type=int, default=logging.INFO)
    parser.add_argument('--file_name', type=str, default='images/demo_2.jpeg')
    args = parser.parse_args()
    print_args(args)

    # init a logger
    logging.basicConfig(level=args.logger_level)

    # load yaml config file
    with open(args.config) as f:
        config = yaml.safe_load(f)

    # set random seed
    misc.set_random_seed(config['seed'])

    # define func to create env, target data sampler, and models
    if config['problem'] == 'photo_enhancement':
        def make_env(process_idx, test):
            assert test, "error: test should be True"
            env = PhotoEnhancementEnvDemo(batch_size=1,
                                          max_episode_steps=config['max_episode_steps'],
                                          imsize=config['imsize'],
                                          file_name=args.file_name)
            return env

        sample_env = make_env(0, True)

        gen = SpiralModel(config['imsize'], sample_env.num_parameters, config['L_stages'], config['conditional'])
        dis = SpiralDiscriminator(config['imsize'], config['conditional'])

        dataset = PhotoEnhancementDataset()

    else:
        raise NotImplementedError()

    # initialize optimizers
    gen_opt = chainer.optimizers.Adam(alpha=config['lr'], beta1=0.5)
    dis_opt = chainer.optimizers.Adam(alpha=config['lr'], beta1=0.5)

    gen_opt.setup(gen)
    dis_opt.setup(dis)

    gen_opt.add_hook(chainer.optimizer.GradientClipping(40))
    dis_opt.add_hook(chainer.optimizer.GradientClipping(40))
    if config['weight_decay'] > 0:
        gen_opt.add_hook(NonbiasWeightDecay(config['weight_decay']))
        dis_opt.add_hook(NonbiasWeightDecay(config['weight_decay']))

    # Load the generator's weights (required for the demo)
    assert args.load_generator, 'error: specify the weight of the model'
    serializers.load_npz(args.load_generator, gen)

    # Initialize a SPIRAL agent
    agent = SPIRAL(
        generator=gen,
        discriminator=dis,
        gen_optimizer=gen_opt,
        dis_optimizer=dis_opt,
        dataset=dataset,
        conditional=config['conditional'],
        reward_mode=config['reward_mode'],
        imsize=config['imsize'],
        max_episode_steps=config['max_episode_steps'],
        rollout_n=config['rollout_n'],
        gamma=config['gamma'],
        alpha=config['alpha'],
        beta=config['beta'],
        L_stages=config['L_stages'],
        U_update=config['U_update'],
        gp_lambda=config['gp_lambda'],
        n_save_final_obs_interval=config['n_save_final_obs_interval'],
        outdir=None,
        act_deterministically=True
    )

    # Derive step and interval settings (mirrors the training configuration)
    max_episode_len = config['max_episode_steps'] * config['rollout_n']
    steps = config['processes'] * config['n_update'] * max_episode_len

    save_interval = config['processes'] * config['n_save_interval'] * max_episode_len
    eval_interval = config['processes'] * config['n_eval_interval'] * max_episode_len

    step_hook = SpiralStepHook(config['max_episode_steps'], save_interval, None)

    env = make_env(0, True)

    with chainer.using_config('train', False):
        eval_stats = experiments.evaluator.run_evaluation_episodes(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=1,
            max_episode_len=1)