示例#1
0
def simulation(clients, passed_config):
    """
    Performs a simulation of Pig Chase game based on passed config
    :param clients: not used, just to have uniform interface
    :param passed_config: type str, the name of the config to use 
    """
    sim_config = Config(passed_config)
    experiment_cfg = sim_config.get_section('EXPERIMENT')
    results_dir = os.path.join(get_results_path(),
                               'simulation_{}_{}_{}'.format(sim_config.get_str('BASIC', 'learner'),
                                                            sim_config.get_str('BASIC', 'network'),
                                                            datetime.utcnow().isoformat()[:-4]))
    experiment_cfg["outdir"] = results_dir
    sim_config.copy_config(results_dir)
    opponent = PigChaseChallengeAgent(name="Agent_1", p_focused=0.95)
    agent_env = getattr(env_simulator, sim_config.get_str('BASIC', 'simulator'))(opponent=opponent,
                                                                                 **sim_config.get_section(
                                                                                     'SIMULATOR'))
    learner = create_value_based_learner(passed_config)
    logger.log(msg='Experiment parameters {}'.format(
        ' '.join([name + ':' + str(value) for name, value in experiment_cfg.items()])),
        level=logging.INFO)
    logger.log(msg='Starting experiment, calling chainerrl function.',
               level=logging.INFO)
    experiments.train_agent_with_evaluation(agent=learner,
                                            env=agent_env,
                                            **experiment_cfg)
示例#2
0
    def _test_abc(self,
                  steps=100000,
                  require_success=True,
                  gpu=-1,
                  load_model=False):

        if self.recurrent and gpu >= 0:
            self.skipTest(
                'NStepLSTM does not support double backprop with GPU.')
        if self.recurrent and chainer.__version__ == '7.0.0b3':
            self.skipTest(
                'chainer==7.0.0b3 has a bug in double backrop of LSTM.'
                ' See https://github.com/chainer/chainer/pull/8037')

        env, _ = self.make_env_and_successful_return(test=False)
        test_env, successful_return = self.make_env_and_successful_return(
            test=True)

        agent = self.make_agent(env, gpu)

        if load_model:
            print('Load agent from', self.agent_dirname)
            agent.load(self.agent_dirname)

        max_episode_len = None if self.episodic else 2

        # Train
        train_agent_with_evaluation(
            agent=agent,
            env=env,
            eval_env=test_env,
            steps=steps,
            outdir=self.tmpdir,
            eval_interval=200,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=successful_return,
            train_max_episode_len=max_episode_len,
        )

        agent.stop_episode()

        # Test
        n_test_runs = 10
        eval_returns = run_evaluation_episodes(
            test_env,
            agent,
            n_steps=None,
            n_episodes=n_test_runs,
            max_episode_len=max_episode_len,
        )
        if require_success:
            n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
            self.assertEqual(n_succeeded, n_test_runs)

        # Save
        agent.save(self.agent_dirname)
示例#3
0
    def _test_abc(self,
                  steps=1000000,
                  require_success=True,
                  gpu=-1,
                  load_model=False):

        env, _ = self.make_env_and_successful_return(test=False)
        test_env, successful_return = self.make_env_and_successful_return(
            test=True)

        agent = self.make_agent(env, gpu)

        if load_model:
            print('Load agent from', self.agent_dirname)
            agent.load(self.agent_dirname)

        max_episode_len = None if self.episodic else 2

        # Train
        train_agent_with_evaluation(
            agent=agent,
            env=env,
            eval_env=test_env,
            steps=steps,
            outdir=self.tmpdir,
            eval_interval=200,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=successful_return,
            train_max_episode_len=max_episode_len,
        )

        agent.stop_episode()

        # Test
        n_test_runs = 5
        for _ in range(n_test_runs):
            total_r = 0.0
            obs = test_env.reset()
            done = False
            reward = 0.0
            while not done:
                action = agent.act(obs)
                obs, reward, done, _ = test_env.step(action)
                total_r += reward
            agent.stop_episode()

            if require_success:
                self.assertAlmostEqual(total_r, successful_return)

        # Save
        agent.save(self.agent_dirname)
示例#4
0
    def _test_training(self,
                       gpu,
                       steps=5000,
                       load_model=False,
                       require_success=True):

        random_seed.set_random_seed(1)
        logging.basicConfig(level=logging.DEBUG)

        env = self.make_env_and_successful_return(test=False)[0]
        test_env, successful_return = self.make_env_and_successful_return(
            test=True)
        agent = self.make_agent(env, gpu)

        if load_model:
            print('Load agent from', self.agent_dirname)
            agent.load(self.agent_dirname)
            agent.replay_buffer.load(self.rbuf_filename)

        # Train
        train_agent_with_evaluation(agent=agent,
                                    env=env,
                                    steps=steps,
                                    outdir=self.tmpdir,
                                    eval_interval=200,
                                    eval_n_steps=None,
                                    eval_n_episodes=5,
                                    successful_score=1,
                                    eval_env=test_env)

        agent.stop_episode()

        # Test
        n_test_runs = 5
        for _ in range(n_test_runs):
            total_r = 0.0
            obs = test_env.reset()
            done = False
            reward = 0.0
            while not done:
                action = agent.act(obs)
                obs, reward, done, _ = test_env.step(action)
                total_r += reward
            agent.stop_episode()
            if require_success:
                self.assertAlmostEqual(total_r, successful_return)

        # Save
        agent.save(self.agent_dirname)
        agent.replay_buffer.save(self.rbuf_filename)
    def _test_abc(self,
                  steps=100000,
                  require_success=True,
                  gpu=-1,
                  load_model=False):

        env, _ = self.make_env_and_successful_return(test=False)
        test_env, successful_return = self.make_env_and_successful_return(
            test=True)

        agent = self.make_agent(env, gpu)

        if load_model:
            print('Load agent from', self.agent_dirname)
            agent.load(self.agent_dirname)

        max_episode_len = None if self.episodic else 2

        # Train
        train_agent_with_evaluation(
            agent=agent,
            env=env,
            eval_env=test_env,
            steps=steps,
            outdir=self.tmpdir,
            eval_interval=200,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=successful_return,
            train_max_episode_len=max_episode_len,
        )

        agent.stop_episode()

        # Test
        n_test_runs = 5
        eval_returns = run_evaluation_episodes(
            test_env,
            agent,
            n_steps=None,
            n_episodes=n_test_runs,
            max_episode_len=max_episode_len,
        )
        if require_success:
            n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
            self.assertEqual(n_succeeded, n_test_runs)

        # Save
        agent.save(self.agent_dirname)
示例#6
0
    def _test_abc(self,
                  steps=1000000,
                  require_success=True,
                  gpu=-1,
                  load_model=False):

        env, _ = self.make_env_and_successful_return(test=False)
        test_env, successful_return = self.make_env_and_successful_return(
            test=True)
        agent = self.make_agent(env, gpu)

        if load_model:
            print('Load agent from', self.agent_dirname)
            agent.load(self.agent_dirname)

        # Train
        train_agent_with_evaluation(agent=agent,
                                    env=env,
                                    steps=steps,
                                    outdir=self.tmpdir,
                                    eval_interval=200,
                                    eval_n_runs=50,
                                    successful_score=1,
                                    eval_env=test_env)

        agent.stop_episode()

        # Test
        n_test_runs = 100
        n_succeeded = 0
        for _ in range(n_test_runs):
            total_r = 0.0
            obs = test_env.reset()
            done = False
            reward = 0.0
            while not done:
                action = agent.act(obs)
                obs, reward, done, _ = test_env.step(action)
                total_r += reward
            agent.stop_episode()
            if np.isclose(total_r, successful_return):
                n_succeeded += 1

        if require_success:
            self.assertGreater(n_succeeded, 0.8 * n_test_runs)

        # Save
        agent.save(self.agent_dirname)
    def train(self, episodes):
        """
        Trains the model for given number of episodes.
        """

        progress_bar = ProgressBar(self.pbar, episodes)

        experiments.train_agent_with_evaluation(
            self.agent,
            self.env,
            steps=episodes,  # Train the agent for 2000 steps
            eval_n_steps=None,  # We evaluate for episodes, not time
            eval_n_episodes=10,  # 10 episodes are sampled for each evaluation
            train_max_episode_len=100,  # Maximum length of each episode
            eval_interval=self.
            log_interval,  # Evaluate the agent after every 1000 steps
            step_hooks=[progress_bar],  # add hooks
            logger=self.logger,
            outdir=self.save_path)  # Save everything to 'supervisor' directory
示例#8
0
def value_based_experiment(clients, passed_config):
    rvb_config = Config(os.path.join(get_config_dir(), passed_config))
    results_dir = os.path.join(get_results_path(),
                               'simulation_{}_{}_{}'.format(rvb_config.get_str('BASIC', 'learner'),
                                                            rvb_config.get_str('BASIC', 'network'),
                                                            datetime.utcnow().isoformat()[:-6]))
    experiment_cfg = rvb_config.get_section('EXPERIMENT')

    experiment_cfg["outdir"] = results_dir
    opponent = PigChaseChallengeAgent(name="Agent_1")
    agent_st_build = CustomStateBuilder()
    opponent_st_build = PigChaseSymbolicStateBuilder()
    opponent_env = PigChaseEnvironment(remotes=clients,
                                       state_builder=opponent_st_build,
                                       role=0,
                                       randomize_positions=True)

    agent_env = PigChaseEnvironment(remotes=clients,
                                    state_builder=agent_st_build,
                                    role=1,
                                    randomize_positions=True)

    env = SingleEnvWrapper(agent_env=agent_env,
                           opponent_env=opponent_env,
                           opponent=opponent,
                           reward_norm=ENV_CAUGHT_REWARD)

    learner = create_value_based_learner(passed_config)

    logger.log(msg='Experiment parameters {}'.format(
        ' '.join(['{}:{}'.format(name, str(value)) for name, value in experiment_cfg.items()])),
        level=logging.INFO)

    logger.log(msg='Starting experiment, calling chainerrl function.',
               level=logging.INFO)

    experiments.train_agent_with_evaluation(agent=learner,
                                            env=env,
                                            **experiment_cfg)
示例#9
0
		replay_start_size=replay_start_size,
		target_update_interval=target_update_interval,
		update_interval=update_interval,
		phi=phi, 
		target_update_method=target_update_method,
		soft_update_tau=soft_update_tau,
		episodic_update_len=16
	)

	
# Start tensorboard writer
writer = SummaryWriter('runs/'+datetime.now().strftime('%B%d  %H:%M:%S'))

# Start training
experiments.train_agent_with_evaluation(
		agent=agent, 
		env=env, 
		eval_env=env,
		steps=steps,
		eval_n_runs=eval_n_runs, 
		eval_interval=eval_interval,
		outdir=outdir, 
		max_episode_len=timestep_limit
	)
	
# Draw the computational graph and save it in the output directory.
chainerrl.misc.draw_computational_graph(
	[q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
	os.path.join(outdir, 'model')
	)
示例#10
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=10**6,
                        help='Timesteps after which we stop ' +
                        'annealing exploration rate')
    parser.add_argument('--final-epsilon',
                        type=float,
                        default=0.1,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon',
                        type=float,
                        default=0.05,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--arch',
                        type=str,
                        default='doubledqn',
                        choices=['nature', 'nips', 'dueling', 'doubledqn'],
                        help='Network architecture to use.')
    parser.add_argument('--steps',
                        type=int,
                        default=5 * 10**7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=5 * 10**4,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=1 * 10**4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--eval-interval',
                        type=int,
                        default=10**5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval',
                        type=int,
                        default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta',
                        dest='clip_delta',
                        action='store_false')
    parser.set_defaults(clip_delta=True)

    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--lr',
                        type=float,
                        default=2.5e-4,
                        help='Learning rate.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = links.Sequence(links.NatureDQNHead(), L.Linear(512, n_actions),
                            DiscreteActionValue)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(lr=args.lr,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  clip_delta=args.clip_delta,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            eval_env=eval_env,
        )
示例#11
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.1)
    parser.add_argument('--eval-epsilon', type=float, default=0.05)
    parser.add_argument('--arch',
                        type=str,
                        default='nature',
                        choices=['nature', 'nips', 'dueling'])
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument(
        '--max-episode-len',
        type=int,
        default=5 * 60 * 60 // 4,  # 5 minutes with 60/4 fps
        help='Maximum number of steps for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval', type=int, default=10**4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--activation', type=str, default='relu')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta',
                        dest='clip_delta',
                        action='store_false')
    parser.set_defaults(clip_delta=True)
    parser.add_argument('--agent',
                        type=str,
                        default='DQN',
                        choices=['DQN', 'DoubleDQN', 'PAL'])
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            misc.env_modifiers.make_rendered(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    activation = parse_activation(args.activation)
    q_func = parse_arch(args.arch, n_actions, activation)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(lr=2.5e-4,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  clip_delta=args.clip_delta,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # In testing DQN, randomly select 5% of actions
        eval_explorer = explorers.ConstantEpsilonGreedy(
            args.eval_epsilon, lambda: np.random.randint(n_actions))
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_explorer=eval_explorer,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=eval_env,
        )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.1)
    parser.add_argument('--eval-epsilon', type=float, default=0.05)
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval', type=int, default=10**4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = chainerrl.links.Sequence(
        chainerrl.links.NatureDQNHead(),
        chainerrl.q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
            None,
            n_actions,
            n_atoms,
            v_min,
            v_max,
            n_hidden_channels=0,
            n_hidden_layers=0),
    )

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as https://arxiv.org/abs/1707.06887
    opt = chainer.optimizers.Adam(2.5e-4, eps=1e-2 / args.batch_size)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = chainerrl.agents.CategoricalDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='mean',
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_steps=None,
                                                  n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            eval_env=eval_env,
        )
示例#13
0
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='dqn_out')
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10**4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=10**5)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--episodic-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval', type=int, default=10**2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=10**4)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-3)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def make_env(for_eval):
        env = gym.make(args.env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not for_eval:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if ((args.render_eval and for_eval)
                or (args.render_train and not for_eval)):
            misc.env_modifiers.make_rendered(env)
        return env

    env = make_env(for_eval=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size,
            n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
            action_space.sample)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10**5
    if args.episodic_replay:
        if args.minibatch_size is None:
            args.minibatch_size = 4
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                // args.update_interval
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                // args.update_interval
            rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity,
                                                         betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    def phi(obs):
        return obs.astype(np.float32)

    agent = DQN(q_func,
                opt,
                rbuf,
                gpu=args.gpu,
                gamma=args.gamma,
                explorer=explorer,
                replay_start_size=args.replay_start_size,
                target_update_interval=args.target_update_interval,
                update_interval=args.update_interval,
                phi=phi,
                minibatch_size=args.minibatch_size,
                target_update_method=args.target_update_method,
                soft_update_tau=args.soft_update_tau,
                episodic_update=args.episodic_replay,
                episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(for_eval=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            max_episode_len=timestep_limit)
示例#14
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--steps',
                        type=int,
                        default=5 * 10**7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=5 * 10**4,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--n-best-episodes', type=int, default=30)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=None),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, 0.05)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = links.Sequence(links.NatureDQNHead(), L.Linear(512, n_actions),
                            DiscreteActionValue)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as the Nature paper
    opt = optimizers.RMSpropGraves(lr=2.5e-4,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.1,
        decay_steps=10**6,
        random_action_func=lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=10**4,
                  clip_delta=True,
                  update_interval=4,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_steps=args.eval_n_steps,
                                                  n_episodes=None)
        print('n_episodes: {} mean: {} median: {} stdev {}'.format(
            eval_stats['episodes'], eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # run 30 evaluation episodes, each capped at 5 mins of play
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=4500,
            logger=None)
        with open(os.path.join(args.outdir, 'bestscores.json'), 'w') as f:
            json.dump(stats, f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
示例#15
0
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='CartPole-v1')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=1000)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=10**8)
    parser.add_argument('--replay-start-size', type=int, default=50)
    parser.add_argument('--target-update-interval', type=int, default=100)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=1000)
    parser.add_argument('--n-hidden-channels', type=int, default=12)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--minibatch-size', type=int, default=32)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1.0)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        env = gym.make(args.env)
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if ((args.render_eval and test) or (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = env.observation_space.low.size
    action_space = env.action_space

    hidden_size = 64
    q_func = chainerrl.agents.iqn.ImplicitQuantileQFunction(
        psi=chainerrl.links.Sequence(
            L.Linear(obs_size, hidden_size),
            F.relu,
        ),
        phi=chainerrl.links.Sequence(
            chainerrl.agents.iqn.CosineBasisLinear(64, hidden_size),
            F.relu,
        ),
        f=L.Linear(hidden_size, env.action_space.n),
    )
    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(args.start_epsilon,
                                                  args.end_epsilon,
                                                  args.final_exploration_steps,
                                                  action_space.sample)

    opt = optimizers.Adam(1e-3)
    opt.setup(q_func)

    rbuf_capacity = 50000  # 5 * 10 ** 5
    rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = chainerrl.agents.IQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
    )

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            max_episode_len=timestep_limit)
示例#16
0
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('algo', default='ppo', choices=['ppo', 'gail', 'airl'], type=str)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--env', type=str, default='Hopper-v2')
    parser.add_argument('--arch', type=str, default='FFGaussian',
                        choices=('FFSoftmax', 'FFMellowmax',
                                 'FFGaussian'))
    parser.add_argument('--bound-mean', action='store_true')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=10 ** 6)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--load_demo', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')

    parser.add_argument('--update-interval', type=int, default=2048)
    parser.add_argument('--batchsize', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--entropy-coef', type=float, default=0.0)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if not (args.demo and args.load):
        args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(
        obs_space.low.size, clip_threshold=5)

    # Switch policy types accordingly to action space types
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFGaussian':
        model = A3CFFGaussian(obs_space.low.size, action_space,
                              bound_mean=args.bound_mean)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    if args.algo == 'ppo':
        agent = PPO(model, opt,
                    obs_normalizer=obs_normalizer,
                    gpu=args.gpu,
                    update_interval=args.update_interval,
                    minibatch_size=args.batchsize, epochs=args.epochs,
                    clip_eps_vf=None, entropy_coef=args.entropy_coef,
                    standardize_advantages=args.standardize_advantages,
                    )
    elif args.algo == 'gail':
        import numpy as np
        from irl.gail import GAIL
        from irl.gail import Discriminator
        demonstrations = np.load(args.load_demo)
        D = Discriminator(gpu=args.gpu)
        agent = GAIL(demonstrations=demonstrations, discriminator=D,
                     model=model, optimizer=opt,
                     obs_normalizer=obs_normalizer,
                     gpu=args.gpu,
                     update_interval=args.update_interval,
                     minibatch_size=args.batchsize, epochs=args.epochs,
                     clip_eps_vf=None, entropy_coef=args.entropy_coef,
                     standardize_advantages=args.standardize_advantages,)
    elif args.algo == 'airl':
        import numpy as np
        from irl.airl import AIRL as Agent
        from irl.airl import Discriminator
        # obs_normalizer = None
        demonstrations = np.load(args.load_demo)
        D = Discriminator(gpu=args.gpu)
        agent = Agent(demonstrations=demonstrations, discriminator=D,
                      model=model, optimizer=opt,
                      obs_normalizer=obs_normalizer,
                      gpu=args.gpu,
                      update_interval=args.update_interval,
                      minibatch_size=args.batchsize, epochs=args.epochs,
                      clip_eps_vf=None, entropy_coef=args.entropy_coef,
                      standardize_advantages=args.standardize_advantages,)

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
        outdir = args.load if args.load else args.outdir
        save_agent_demo(make_env(False), agent, outdir)
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        # Linearly decay the clipping parameter to zero
        def clip_eps_setter(env, agent, value):
            agent.clip_eps = max(value, 1e-8)

        clip_eps_decay_hook = experiments.LinearInterpolationHook(
            args.steps, 0.2, 0, clip_eps_setter)

        experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(False),
            eval_env=make_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            train_max_episode_len=timestep_limit,
            save_best_so_far_agent=False,
            step_hooks=[
                lr_decay_hook,
                clip_eps_decay_hook,
            ],
        )
        save_agent_demo(make_env(False), agent, args.outdir)
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='CartPole-v1')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=1000)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=10**8)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--episodic-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=50)
    parser.add_argument('--target-update-interval', type=int, default=100)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=1000)
    parser.add_argument('--n-hidden-channels', type=int, default=12)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1.0)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        env = gym.make(args.env)
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if ((args.render_eval and test) or (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = env.observation_space.low.size
    action_space = env.action_space

    n_atoms = 51
    v_max = 500
    v_min = 0

    n_actions = action_space.n
    q_func = q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
        obs_size,
        n_actions,
        n_atoms,
        v_min,
        v_max,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(args.start_epsilon,
                                                  args.end_epsilon,
                                                  args.final_exploration_steps,
                                                  action_space.sample)

    opt = optimizers.Adam(1e-3)
    opt.setup(q_func)

    rbuf_capacity = 50000  # 5 * 10 ** 5
    if args.episodic_replay:
        if args.minibatch_size is None:
            args.minibatch_size = 4
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                // args.update_interval
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                // args.update_interval
            rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity,
                                                         betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = chainerrl.agents.CategoricalDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
        episodic_update=args.episodic_replay,
        episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            max_episode_len=timestep_limit)
示例#18
0
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--env', type=str, default='Hopper-v1')
    parser.add_argument('--arch',
                        type=str,
                        default='FFGaussian',
                        choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian'))
    parser.add_argument('--normalize-obs', action='store_true')
    parser.add_argument('--bound-mean', action='store_true')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--steps', type=int, default=10**6)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')

    parser.add_argument('--update-interval', type=int, default=2048)
    parser.add_argument('--batchsize', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--entropy-coef', type=float, default=0.0)
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(test):
        env = gym.make(args.env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if args.reward_scale_factor and not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types accordingly to action space types
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFGaussian':
        model = A3CFFGaussian(obs_space.low.size,
                              action_space,
                              bound_mean=args.bound_mean,
                              normalize_obs=args.normalize_obs)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = PPO(
        model,
        opt,
        gpu=args.gpu,
        phi=phi,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps_vf=None,
        entropy_coef=args.entropy_coef,
        standardize_advantages=args.standardize_advantages,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        # Linearly decay the clipping parameter to zero
        def clip_eps_setter(env, agent, value):
            agent.clip_eps = value

        clip_eps_decay_hook = experiments.LinearInterpolationHook(
            args.steps, 0.2, 0, clip_eps_setter)

        experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(False),
            eval_env=make_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit,
            step_hooks=[
                lr_decay_hook,
                clip_eps_decay_hook,
            ],
        )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('rom', type=str)
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.1)
    parser.add_argument('--eval-epsilon', type=float, default=0.05)
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval', type=int, default=10**4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    # In training, life loss is considered as terminal states
    env = ale.ALE(args.rom, use_sdl=args.use_sdl, seed=train_seed)
    misc.env_modifiers.make_reward_clipped(env, -1, 1)
    # In testing, an episode is terminated  when all lives are lost
    eval_env = ale.ALE(args.rom,
                       use_sdl=args.use_sdl,
                       treat_life_lost_as_terminal=False,
                       seed=test_seed)

    n_actions = env.number_of_actions

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = chainerrl.links.Sequence(
        chainerrl.links.NatureDQNHead(),
        chainerrl.q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
            None,
            n_actions,
            n_atoms,
            v_min,
            v_max,
            n_hidden_channels=0,
            n_hidden_layers=0),
    )

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as https://arxiv.org/abs/1707.06887
    opt = chainer.optimizers.Adam(2.5e-4, eps=1e-2 / args.batch_size)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))
    agent = chainerrl.agents.CategoricalDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='mean',
        phi=dqn_phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # In testing DQN, randomly select 5% of actions
        eval_explorer = explorers.ConstantEpsilonGreedy(
            args.eval_epsilon, lambda: np.random.randint(n_actions))
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_explorer=eval_explorer,
            save_best_so_far_agent=False,
            eval_env=eval_env)
示例#20
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=10**6,
                        help='Timesteps after which we stop ' +
                        'annealing exploration rate')
    parser.add_argument('--final-epsilon',
                        type=float,
                        default=0.01,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon',
                        type=float,
                        default=0.001,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--steps',
                        type=int,
                        default=5 * 10**7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=5 * 10**4,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=3 * 10**4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--demo-n-episodes', type=int, default=30)
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--eval-interval',
                        type=int,
                        default=250000,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval',
                        type=int,
                        default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--lr',
                        type=float,
                        default=2.5e-4,
                        help='Learning rate.')
    parser.add_argument('--recurrent',
                        action='store_true',
                        default=False,
                        help='Use a recurrent model. See the code for the'
                        ' model definition.')
    parser.add_argument('--flicker',
                        action='store_true',
                        default=False,
                        help='Use so-called flickering Atari, where each'
                        ' screen is blacked out with probability 0.5.')
    parser.add_argument('--no-frame-stack',
                        action='store_true',
                        default=False,
                        help='Disable frame stacking so that the agent can'
                        ' only see the current screen.')
    parser.add_argument('--episodic-update-len',
                        type=int,
                        default=10,
                        help='Maximum length of sequences for updating'
                        ' recurrent models')
    parser.add_argument('--batch-size',
                        type=int,
                        default=32,
                        help='Number of transitions (in a non-recurrent case)'
                        ' or sequences (in a recurrent case) used for an'
                        ' update.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            flicker=args.flicker,
            frame_stack=not args.no_frame_stack,
        )
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)
    print('Observation space', env.observation_space)
    print('Action space', env.action_space)

    n_actions = env.action_space.n
    if args.recurrent:
        # Q-network with LSTM
        q_func = chainerrl.links.StatelessRecurrentSequential(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            functools.partial(F.reshape, shape=(-1, 3136)),
            F.relu,
            L.NStepLSTM(1, 3136, 512, 0),
            L.Linear(None, n_actions),
            DiscreteActionValue,
        )
        # Replay buffer that stores whole episodes
        rbuf = replay_buffer.EpisodicReplayBuffer(10**6)
    else:
        # Q-network without LSTM
        q_func = chainer.Sequential(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            functools.partial(F.reshape, shape=(-1, 3136)),
            L.Linear(None, 512),
            F.relu,
            L.Linear(None, n_actions),
            DiscreteActionValue,
        )
        # Replay buffer that stores transitions separately
        rbuf = replay_buffer.ReplayBuffer(10**6)

    # Draw the computational graph and save it in the output directory.
    fake_obss = np.zeros(env.observation_space.shape, dtype=np.float32)[None]
    if args.recurrent:
        fake_out, _ = q_func(fake_obss, None)
    else:
        fake_out = q_func(fake_obss)
    chainerrl.misc.draw_computational_graph([fake_out],
                                            os.path.join(args.outdir, 'model'))

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    opt = chainer.optimizers.Adam(1e-4, eps=1e-4)
    opt.setup(q_func)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = chainerrl.agents.DoubleDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='mean',
        phi=phi,
        minibatch_size=args.batch_size,
        episodic_update_len=args.episodic_update_len,
        recurrent=args.recurrent,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.demo_n_episodes,
        )
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.demo_n_episodes,
            eval_stats['mean'],
            eval_stats['median'],
            eval_stats['stdev'],
        ))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
        )
示例#21
0
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10**4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=10**5)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval', type=int, default=10**2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=10**4)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-3)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if ((args.render_eval and test) or (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size,
            n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
            action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
            // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity,
                                                     betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = DQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            train_max_episode_len=timestep_limit)
示例#22
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out_dir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int, default=10 ** 5,
                        help='Timesteps after which we stop ' +
                        'annealing exploration rate')
    parser.add_argument('--final-epsilon', type=float, default=0.1,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon', type=float, default=0.05,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--steps', type=int, default=10 ** 6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--max-episode-len', type=int,
                        default=30 * 60 * 60 // 4,  # 30 minutes with 60/4 fps
                        help='Maximum number of timesteps for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=1000,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int, default=1 * 10 ** 4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--eval-interval', type=int, default=10 ** 5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval', type=int, default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--lr', type=float, default=2.5e-4,
                        help='Learning rate.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    experiments.set_log_base_dir(args.out_dir)
    print('Output files are saved in {}'.format(args.out_dir))

    env = make_env(env_seed=args.seed)

    n_actions = env.action_space.n
    
    q_func = links.Sequence(
        links.NatureDQNHead(n_input_channels=3),
        L.Linear(512, n_actions),
        DiscreteActionValue
    )

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon,
        args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        x = x.transpose(2, 0, 1)
        return np.asarray(x, dtype=np.float32) / 255

    agent = agents.DQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='sum',
        phi=phi
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.out_dir,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=env,
        )
示例#23
0
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--processes', type=int, default=8)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--batchsize', type=int, default=10)
    parser.add_argument('--rollout-len', type=int, default=10)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--n-times-replay', type=int, default=1)
    parser.add_argument('--replay-start-size', type=int, default=10000)
    parser.add_argument('--t-max', type=int, default=None)
    parser.add_argument('--tau', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--train-async', action='store_true', default=False)
    parser.add_argument('--prioritized-replay',
                        action='store_true',
                        default=False)
    parser.add_argument('--disable-online-update',
                        action='store_true',
                        default=False)
    parser.add_argument('--backprop-future-values',
                        action='store_true',
                        default=True)
    parser.add_argument('--no-backprop-future-values',
                        action='store_false',
                        dest='backprop_future_values')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use async training (--train-async), the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    if args.train_async:
        # Set different random seeds for different subprocesses.
        # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
        # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
        process_seeds = np.arange(args.processes) + args.seed * args.processes
        assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        if args.train_async:
            process_seed = int(process_seeds[process_idx])
            env_seed = 2**32 - 1 - process_seed if test else process_seed
        else:
            env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types accordingly to action space types
    if isinstance(action_space, gym.spaces.Box):
        model = chainerrl.agents.pcl.PCLSeparateModel(
            pi=chainerrl.policies.FCGaussianPolicy(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high,
                var_wscale=1e-3,
                var_bias=1,
                var_type='diagonal',
            ),
            v=chainerrl.v_functions.FCVFunction(
                obs_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
            ))
    else:
        model = chainerrl.agents.pcl.PCLSeparateModel(
            pi=chainerrl.policies.FCSoftmaxPolicy(
                obs_space.low.size,
                action_space.n,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers),
            v=chainerrl.v_functions.FCVFunction(
                obs_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
            ),
        )

    if not args.train_async and args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    if args.train_async:
        opt = rmsprop_async.RMSpropAsync(lr=args.lr, alpha=0.99)
    else:
        opt = chainer.optimizers.Adam(alpha=args.lr)
    opt.setup(model)

    if args.prioritized_replay:
        replay_buffer = \
            chainerrl.replay_buffer.PrioritizedEpisodicReplayBuffer(
                capacity=5 * 10 ** 3,
                uniform_ratio=0.1,
                default_priority_func=exp_return_of_episode,
                wait_priority_after_sampling=False,
                return_sample_weights=False)
    else:
        replay_buffer = chainerrl.replay_buffer.EpisodicReplayBuffer(
            capacity=5 * 10**3)

    agent = chainerrl.agents.PCL(
        model,
        opt,
        replay_buffer=replay_buffer,
        t_max=args.t_max,
        gamma=0.99,
        tau=args.tau,
        phi=lambda x: x.astype(np.float32, copy=False),
        rollout_len=args.rollout_len,
        n_times_replay=args.n_times_replay,
        replay_start_size=args.replay_start_size,
        batchsize=args.batchsize,
        train_async=args.train_async,
        disable_online_update=args.disable_online_update,
        backprop_future_values=args.backprop_future_values,
    )
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        if args.train_async:
            experiments.train_agent_async(agent=agent,
                                          outdir=args.outdir,
                                          processes=args.processes,
                                          make_env=make_env,
                                          profile=args.profile,
                                          steps=args.steps,
                                          eval_n_runs=args.eval_n_runs,
                                          eval_interval=args.eval_interval,
                                          max_episode_len=timestep_limit)
        else:
            experiments.train_agent_with_evaluation(
                agent=agent,
                env=make_env(0, test=False),
                eval_env=make_env(0, test=True),
                outdir=args.outdir,
                steps=args.steps,
                eval_n_runs=args.eval_n_runs,
                eval_interval=args.eval_interval,
                max_episode_len=timestep_limit)
示例#24
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument(
        '--env',
        type=str,
        choices=[
            'Pendulum-v0', 'AntBulletEnv-v0', 'HalfCheetahBulletEnv-v0',
            'HumanoidBulletEnv-v0', 'HopperBulletEnv-v0',
            'Walker2DBulletEnv-v0'
        ],
        help=
        'OpenAI Gym env and Pybullet (roboschool) env to perform algorithm on.'
    )
    parser.add_argument('--num-envs',
                        type=int,
                        default=1,
                        help='Number of envs run in parallel.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--load',
                        type=str,
                        default='',
                        help='Directory to load agent from.')
    parser.add_argument(
        '--expert-num-episode',
        type=int,
        default=0,
        help='the number of expert trajectory, if 0, no create demo mode.')
    parser.add_argument('--steps',
                        type=int,
                        default=10**6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--eval-n-runs',
                        type=int,
                        default=10,
                        help='Number of episodes run for each evaluation.')
    parser.add_argument('--eval-interval',
                        type=int,
                        default=5000,
                        help='Interval in timesteps between evaluations.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=10000,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--batch-size',
                        type=int,
                        default=256,
                        help='Minibatch size')
    parser.add_argument('--render',
                        action='store_true',
                        help='Render env states in a GUI window.')
    parser.add_argument('--demo',
                        action='store_true',
                        help='Just run evaluation, not training.')
    parser.add_argument('--monitor',
                        action='store_true',
                        help='Wrap env with gym.wrappers.Monitor.')
    parser.add_argument('--log-interval',
                        type=int,
                        default=1000,
                        help='Interval in timesteps between outputting log'
                        ' messages during training')
    parser.add_argument('--logger-level',
                        type=int,
                        default=logging.INFO,
                        help='Level of the root logger.')
    parser.add_argument('--policy-output-scale',
                        type=float,
                        default=1.,
                        help='Weight initialization scale of polity output.')
    parser.add_argument('--debug', action='store_true', help='Debug mode.')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    if args.debug:
        chainer.set_debug(True)
    if args.expert_num_episode == 0:
        args.outdir = experiments.prepare_output_dir(
            args,
            args.outdir,
            argv=sys.argv,
            time_format=f'{args.env}_{args.seed}')
    else:
        args.outdir = experiments.prepare_output_dir(
            args,
            args.outdir,
            argv=sys.argv,
            time_format=f'{args.env}_{args.expert_num_episode}expert')
        args.replay_start_size = 1e8
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Unwrap TimiLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)

        if isinstance(env.observation_space, Box):
            # Cast observations to float32 because our model uses float32
            env = chainerrl.wrappers.CastObservationToFloat32(env)
        else:
            env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
                args.env, max_frames=None),
                                               episode_life=not test,
                                               clip_rewards=not test)

        if isinstance(env.action_space, Box):
            # Normalize action space to [-1, 1]^n
            env = wrappers.NormalizeActionSpace(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx, env in enumerate(range(args.num_envs))
        ])

    sample_env = make_env(process_idx=0, test=False)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    if isinstance(obs_space, Box):
        head = network.FCHead()
        phi = lambda x: x

    else:
        head = network.CNNHead(n_input_channels=4)
        phi = lambda x: np.asarray(x, dtype=np.float32) / 255

    if isinstance(action_space, Box):
        action_size = action_space.low.size
        policy = network.GaussianPolicy(copy.deepcopy(head), action_size)
        q_func1 = network.QSAFunction(copy.deepcopy(head), action_size)
        q_func2 = network.QSAFunction(copy.deepcopy(head), action_size)

        def burnin_action_func():
            """Select random actions until model is updated one or more times."""
            return np.random.uniform(action_space.low,
                                     action_space.high).astype(np.float32)

    else:
        action_size = action_space.n

        policy = network.SoftmaxPolicy(copy.deepcopy(head), action_size)
        q_func1 = network.QSFunction(copy.deepcopy(head), action_size)
        q_func2 = network.QSFunction(copy.deepcopy(head), action_size)

        def burnin_action_func():
            return np.random.randint(0, action_size)

    policy_optimizer = optimizers.Adam(3e-4).setup(policy)
    q_func1_optimizer = optimizers.Adam(3e-4).setup(q_func1)
    q_func2_optimizer = optimizers.Adam(3e-4).setup(q_func2)

    # Draw the computational graph and save it in the output directory.
    # fake_obs = chainer.Variable(
    #     policy.xp.zeros_like(obs_space.low, dtype=np.float32)[None],
    #     name='observation')
    # fake_action = chainer.Variable(
    #     policy.xp.zeros_like(action_space.low, dtype=np.float32)[None],
    #     name='action')
    # chainerrl.misc.draw_computational_graph(
    #     [policy(fake_obs)], os.path.join(args.outdir, 'policy'))
    # chainerrl.misc.draw_computational_graph(
    #     [q_func1(fake_obs, fake_action)], os.path.join(args.outdir, 'q_func1'))
    # chainerrl.misc.draw_computational_graph(
    #     [q_func2(fake_obs, fake_action)], os.path.join(args.outdir, 'q_func2'))

    rbuf = replay_buffer.ReplayBuffer(10**6)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = sac.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        is_discrete=isinstance(action_space, Discrete),
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        phi=phi,
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size if isinstance(action_space, Box) else
        -np.log((1.0 / action_size)) * 0.98,
        temperature_optimizer=chainer.optimizers.Adam(3e-4),
    )

    if len(args.load) > 0:
        agent.load(args.load, args.expert_num_episode == 0)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_env(process_idx=0, test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    elif args.expert_num_episode > 0:
        episode_r = 0
        env = sample_env
        episode_len = 0
        t = 0
        logger = logging.getLogger(__name__)
        episode_results = []
        try:
            for ep in range(args.expert_num_episode):
                obs = env.reset()
                r = 0
                while True:
                    # a_t
                    action = agent.act_and_train(obs, r)
                    # o_{t+1}, r_{t+1}
                    obs, r, done, info = env.step(action)
                    t += 1
                    episode_r += r
                    episode_len += 1
                    reset = (episode_len == timestep_limit
                             or info.get('needs_reset', False))
                    if done or reset:
                        agent.stop_episode_and_train(obs, r, done=done)
                        logger.info('outdir:%s step:%s episode:%s R:%s',
                                    args.outdir, t, ep, episode_r)
                        episode_results.append(episode_r)
                        episode_r = 0
                        episode_len = 0
                        break
            logger.info('mean: %s',
                        sum(episode_results) / len(episode_results))
        except (Exception, KeyboardInterrupt):
            raise

        # Save
        save_name = os.path.join(
            os.path.join('demos', f'{args.expert_num_episode}_episode'),
            args.env)
        makedirs(save_name, exist_ok=True)
        agent.replay_buffer.save(os.path.join(save_name, 'replay'))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(process_idx=0, test=False),
            eval_env=make_env(process_idx=0, test=True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            # log_interval=args.log_interval,
            train_max_episode_len=timestep_limit,
            eval_max_episode_len=timestep_limit,
        )
示例#25
0
def main():

    # Prevent numpy from using multiple threads
    os.environ['OMP_NUM_THREADS'] = '1'

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('rom', type=str)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=2.5e-4)

    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')

    # In the original paper, agent runs in 8 environments parallely
    # and samples 128 steps per environment.
    # Sample 128 * 8 steps, instead.
    parser.add_argument('--update-interval', type=int, default=128 * 8)

    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--epochs', type=int, default=3)
    parser.set_defaults(use_sdl=False)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    model = A3CFF(n_actions)
    opt = chainer.optimizers.Adam(alpha=args.lr)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = PPO(
        model,
        opt,
        gpu=args.gpu,
        phi=dqn_phi,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps=0.1,
        clip_eps_vf=None,
        standardize_advantages=args.standardize_advantages,
    )
    if args.load:
        agent.load(args.load)

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = 2**31 - 1 - args.seed if test else args.seed
        env = ale.ALE(args.rom,
                      use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test,
                      seed=env_seed)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        # Linearly decay the clipping parameter to zero
        def clip_eps_setter(env, agent, value):
            agent.clip_eps = value

        clip_eps_decay_hook = experiments.LinearInterpolationHook(
            args.steps, 0.1, 0, clip_eps_setter)

        experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(False),
            eval_env=make_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=args.max_episode_len,
            step_hooks=[
                lr_decay_hook,
                clip_eps_decay_hook,
            ],
        )
示例#26
0
文件: dqn.py 项目: yue123161/DQEAF
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=123,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--final-exploration-steps',
                        type=int, default=10 ** 4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=50000)
    parser.add_argument('--prioritized-replay', action='store_true', default=False)
    parser.add_argument('--episodic-replay', action='store_true', default=False)
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval', type=int, default=10 ** 2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=50)
    parser.add_argument('--eval-interval', type=int, default=10 ** 3)
    parser.add_argument('--n-hidden-channels', type=int, default=512)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true', default=True)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-3)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        ENV_NAME = 'malware-test-v0' if test else 'malware-v0'
        env = gym.make(ENV_NAME)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        # if not test:
        #     misc.env_modifiers.make_reward_filtered(
        #         env, lambda x: x * args.reward_scale_factor)
        if ((args.render_eval and test) or
                (args.render_train and not test)):
            misc.env_modifiers.make_rendered(env)
        return env

    env = make_env(test=False)
    timestep_limit = 80
    obs_space = env.observation_space
    obs_size = obs_space.shape[0]
    action_space = env.action_space

    n_actions = action_space.n
    q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
    if args.gpu >= 0:
        q_func.to_gpu(args.gpu)

    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
            action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    if args.gpu < 0:
        chainerrl.misc.draw_computational_graph(
            [q_func(np.zeros_like(obs_space, dtype=np.float32)[None])],
            os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 5
    if args.episodic_replay:
        if args.minibatch_size is None:
            args.minibatch_size = 4
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                        // args.update_interval
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                        // args.update_interval
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    def phi(obs):
        return obs.astype(np.float32)

    agent = DoubleDQN(q_func, opt, rbuf, gamma=args.gamma,
                      explorer=explorer, replay_start_size=args.replay_start_size,
                      target_update_interval=args.target_update_interval,
                      update_interval=args.update_interval,
                      phi=phi, minibatch_size=args.minibatch_size,
                      target_update_method=args.target_update_method,
                      soft_update_tau=args.soft_update_tau,
                      episodic_update=args.episodic_replay, episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        q_hook = PlotHook('Average Q Value')
        loss_hook = PlotHook('Average Loss', plot_index=1)

        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval,
            outdir=args.outdir, eval_env=eval_env,
            max_episode_len=timestep_limit,
            step_hooks=[q_hook, loss_hook],
            successful_score=7
        )
示例#27
0
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='Humanoid-v2')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10**6)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument('--n-hidden-channels', type=int, default=300)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--replay-start-size', type=int, default=5000)
    parser.add_argument('--n-update-times', type=int, default=1)
    parser.add_argument('--target-update-interval', type=int, default=1)
    parser.add_argument('--target-update-method',
                        type=str,
                        default='soft',
                        choices=['hard', 'soft'])
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--gamma', type=float, default=0.995)
    parser.add_argument('--minibatch-size', type=int, default=200)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--use-bn', action='store_true', default=False)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def reward_filter(r):
        return r * args.reward_scale_factor

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = np.asarray(env.observation_space.shape).prod()
    action_space = env.action_space

    action_size = np.asarray(action_space.shape).prod()
    if args.use_bn:
        q_func = q_functions.FCBNLateActionSAQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            normalize_input=True)
        pi = policy.FCBNDeterministicPolicy(
            obs_size,
            action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low,
            max_action=action_space.high,
            bound_action=True,
            normalize_input=True)
    else:
        q_func = q_functions.FCSAQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        pi = policy.FCDeterministicPolicy(
            obs_size,
            action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low,
            max_action=action_space.high,
            bound_action=True)
    model = DDPGModel(q_func=q_func, policy=pi)
    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.ReplayBuffer(5 * 10**5)

    def random_action():
        a = action_space.sample()
        if isinstance(a, np.ndarray):
            a = a.astype(np.float32)
        return a

    ou_sigma = (action_space.high - action_space.low) * 0.2
    explorer = explorers.AdditiveOU(sigma=ou_sigma)
    agent = DDPG(model,
                 opt_a,
                 opt_c,
                 rbuf,
                 gamma=args.gamma,
                 explorer=explorer,
                 replay_start_size=args.replay_start_size,
                 target_update_method=args.target_update_method,
                 target_update_interval=args.target_update_interval,
                 update_interval=args.update_interval,
                 soft_update_tau=args.soft_update_tau,
                 n_times_update=args.n_update_times,
                 gpu=args.gpu,
                 minibatch_size=args.minibatch_size)

    if len(args.load) > 0:
        agent.load(args.load)

    eval_env = make_env(test=True)
    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_env=eval_env,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            train_max_episode_len=timestep_limit)
示例#28
0
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='out')
    parser.add_argument('--env', type=str, default='Humanoid-v1')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10**6)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument('--n-hidden-channels', type=int, default=300)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--replay-start-size', type=int, default=5000)
    parser.add_argument('--n-update-times', type=int, default=1)
    parser.add_argument('--target-update-frequency', type=int, default=1)
    parser.add_argument('--target-update-method',
                        type=str,
                        default='soft',
                        choices=['hard', 'soft'])
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-frequency', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-frequency', type=int, default=10**5)
    parser.add_argument('--gamma', type=float, default=0.995)
    parser.add_argument('--minibatch-size', type=int, default=200)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--use-bn', action='store_true', default=False)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def reward_filter(r):
        return r * args.reward_scale_factor

    def make_env():
        env = gym.make(args.env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        misc.env_modifiers.make_reward_filtered(env, reward_filter)
        if args.render:
            misc.env_modifiers.make_rendered(env)

        def __exit__(self, *args):
            pass

        env.__exit__ = __exit__
        return env

    env = make_env()
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = np.asarray(env.observation_space.shape).prod()
    action_space = env.action_space

    action_size = np.asarray(action_space.shape).prod()
    if args.use_bn:
        q_func = q_functions.FCBNLateActionSAQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            normalize_input=True)
        pi = policy.FCBNDeterministicPolicy(
            obs_size,
            action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low,
            max_action=action_space.high,
            bound_action=True,
            normalize_input=True)
    else:
        q_func = q_functions.FCSAQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        pi = policy.FCDeterministicPolicy(
            obs_size,
            action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low,
            max_action=action_space.high,
            bound_action=True)
    model = DDPGModel(q_func=q_func, policy=pi)
    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.ReplayBuffer(5 * 10**5)

    def phi(obs):
        return obs.astype(np.float32)

    def random_action():
        a = action_space.sample()
        if isinstance(a, np.ndarray):
            a = a.astype(np.float32)
        return a

    ou_sigma = (action_space.high - action_space.low) * 0.2
    explorer = explorers.AdditiveOU(sigma=ou_sigma)
    agent = DDPG(model,
                 opt_a,
                 opt_c,
                 rbuf,
                 gamma=args.gamma,
                 explorer=explorer,
                 replay_start_size=args.replay_start_size,
                 target_update_method=args.target_update_method,
                 target_update_frequency=args.target_update_frequency,
                 update_frequency=args.update_frequency,
                 soft_update_tau=args.soft_update_tau,
                 n_times_update=args.n_update_times,
                 phi=phi,
                 gpu=args.gpu,
                 minibatch_size=args.minibatch_size)
    agent.logger.setLevel(logging.DEBUG)

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        mean, median, stdev = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev'.format(
            args.eval_n_runs, mean, median, stdev))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_frequency=args.eval_frequency,
            outdir=args.outdir,
            max_episode_len=timestep_limit)
示例#29
0
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--outdir', type=str, default='results')
    parser.add_argument('--beta', type=float, default=1e-4)
    parser.add_argument('--batchsize', type=int, default=10)
    parser.add_argument('--steps', type=int, default=10**5)
    parser.add_argument('--eval-interval', type=int, default=10**4)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(test):
        env = gym.make(args.env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    train_env = make_env(test=False)
    timestep_limit = train_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = train_env.observation_space
    action_space = train_env.action_space

    # Switch policy types accordingly to action space types
    if isinstance(action_space, gym.spaces.Box):
        model = chainerrl.policies.FCGaussianPolicyWithFixedCovariance(
            obs_space.low.size,
            action_space.low.size,
            var=0.1,
            n_hidden_channels=200,
            n_hidden_layers=2,
            nonlinearity=chainer.functions.leaky_relu,
        )
    else:
        model = chainerrl.policies.FCSoftmaxPolicy(
            obs_space.low.size,
            action_space.n,
            n_hidden_channels=200,
            n_hidden_layers=2,
            nonlinearity=chainer.functions.leaky_relu,
        )

    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    opt = chainer.optimizers.Adam(alpha=args.lr)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(1))

    agent = chainerrl.agents.REINFORCE(model,
                                       opt,
                                       beta=args.beta,
                                       phi=phi,
                                       batchsize=args.batchsize)
    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=train_env,
            eval_env=eval_env,
            outdir=args.outdir,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
示例#30
0
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--beta', type=float, default=1e-4)
    parser.add_argument('--batchsize', type=int, default=10)
    parser.add_argument('--steps', type=int, default=10 ** 5)
    parser.add_argument('--eval-interval', type=int, default=10 ** 4)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    train_env = make_env(test=False)
    timestep_limit = train_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = train_env.observation_space
    action_space = train_env.action_space

    # Switch policy types accordingly to action space types
    if isinstance(action_space, gym.spaces.Box):
        model = chainerrl.policies.FCGaussianPolicyWithFixedCovariance(
            obs_space.low.size,
            action_space.low.size,
            var=0.1,
            n_hidden_channels=200,
            n_hidden_layers=2,
            nonlinearity=chainer.functions.leaky_relu,
        )
    else:
        model = chainerrl.policies.FCSoftmaxPolicy(
            obs_space.low.size,
            action_space.n,
            n_hidden_channels=200,
            n_hidden_layers=2,
            nonlinearity=chainer.functions.leaky_relu,
        )

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [model(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    opt = chainer.optimizers.Adam(alpha=args.lr)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(1))

    agent = chainerrl.agents.REINFORCE(
        model, opt, beta=args.beta, batchsize=args.batchsize)
    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=train_env,
            eval_env=eval_env,
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            train_max_episode_len=timestep_limit)