Example #1
def play(env, session, timesteps_num):
    def stopping_criterion(env, t):
        return env.stop_criterion(t)

    ##########################
    # learning rate schedule #
    ##########################
    iterations_num = float(timesteps_num) / 4.0  # one learning iteration per learn_freq=4 env steps
    lr_multiplier = 1.0
    lr_schedule = utils.PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier),
         (iterations_num / 10, 1e-4 * lr_multiplier),
         (iterations_num / 2, 5e-5 * lr_multiplier)],
        outside_value=5e-5 * lr_multiplier)
    #################
    # set optimizer #
    #################
    OptimizerSpec = namedtuple('OptimizerSpec',
                               ['constructor', 'kwargs', 'lr_schedule'])
    optimizer = OptimizerSpec(constructor=tf.train.AdamOptimizer,
                              kwargs=dict(epsilon=1e-4),
                              lr_schedule=lr_schedule)
    ########################
    # exploration schedule #
    ########################
    exploration_schedule = utils.PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (iterations_num / 2, 0.01)], outside_value=0.01)
    #################
    # play the game #
    #################
    '''
    worker_max_num = multiprocessing.cpu_count()
    numworkers = 10
    assert numworkers < worker_max_num
    for i in range(numworkers):
        # args must be a tuple; (str(i)) is just a string
        t = threading.Thread(target=work, args=(str(i),))
        t.start()
    '''
    dqn_worker(env=env,
               name='dqn_worker',
               optimizer_spec=optimizer,
               session=session,
               exploration=exploration_schedule,
               replay_buffer_size=1000000,
               batch_size=32,
               gamma=0.99,
               learn_start=50000,
               learn_freq=4,
               history_frames_num=4,
               target_update_freq=10000,
               grad_norm_clipping=10,
               stop_criterion=stopping_criterion)
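
Every example in this collection builds its epsilon or learning-rate decay from utils.PiecewiseSchedule, whose definition is not shown. As a point of reference, here is a minimal sketch of such a schedule, modeled on the OpenAI Baselines implementation: linear interpolation between the given (t, value) endpoints, and outside_value for any t beyond them. The actual utils class used above may differ in detail.

class PiecewiseSchedule:
    def __init__(self, endpoints, outside_value=None):
        # endpoints: list of (t, value) pairs, sorted by t
        times = [t for t, _ in endpoints]
        assert times == sorted(times)
        self._endpoints = endpoints
        self._outside_value = outside_value

    def value(self, t):
        # Linearly interpolate between the two endpoints bracketing t.
        for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return l + alpha * (r - l)
        # t falls outside every segment: fall back to outside_value.
        assert self._outside_value is not None
        return self._outside_value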
Example #2
def main():
    args = get_args()

    env = make_atari_env(args.env, args.seed)
    benchmark_env = make_atari_env(args.env, args.seed + 1)

    optimizer = tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=1e-4)

    exploration_schedule = utils.PiecewiseSchedule(
        [
            (0, 1.0),
            (args.prepopulate, 1.0),
            (args.prepopulate + args.explore_time, args.final_eps),
        ],
        outside_value=args.final_eps,
    )

    if not args.legacy:
        assert args.train_freq == 4  # Training frequency is undefined for DQN(lambda)
        replay_memory = make_replay_memory(args.return_est, args.mem_size,
                                           args.history_len, args.discount,
                                           args.cache_size, args.block_size,
                                           args.priority)
    else:
        assert args.cache_size == 80000  # Cache-related args are undefined for legacy DQN
        assert args.priority == 0.0
        assert args.block_size == 100
        replay_memory = make_legacy_replay_memory(args.return_est,
                                                  args.mem_size,
                                                  args.history_len,
                                                  args.discount)

    with utils.make_session(args.seed) as session:
        dqn.learn(
            session,
            env,
            benchmark_env,
            atari_cnn,
            replay_memory,
            optimizer,
            exploration_schedule,
            args.timesteps,
            args.batch_size,
            args.prepopulate,
            args.update_freq,
            train_freq=args.train_freq,
            grad_clip=args.grad_clip,
            log_every_n_steps=1000,
        )
    env.close()
Example #3
def create_scheduler(type='medium'):
    if type == 'none':
        return None
    if type == 'linear':
        return utils.LinearSchedule(200000, 1.0, 0.0)
    if type == 'medium':
        endpoints = [(0, 0), (2000, 0.1), (7000, 0.25), (40000, 0.5), (200000, 1.0)]
    elif type == 'high':
        endpoints = [(0, 0), (3000, 0.1), (15000, 0.25), (80000, 0.5), (500000, 1.0)]
    elif type == 'low':
        endpoints = [(0, 0), (1000, 0.1), (3000, 0.25), (20000, 0.5), (100000, 1.0)]
    elif type == 'tiny':
        endpoints = [(0, 0), (1000, 0.1), (2000, 0.25), (5000, 0.5), (20000, 1.0)]
    elif type == 'exp':
        endpoints = [(0, 0), (1000, 0.01), (5000, 0.1), (10000, 0.5), (20000, 0.75), (50000, 0.9), (100000, 0.95), (200000, 1.0)]
    else:
        # without this, an unknown type would hit a NameError on 'endpoints' below
        raise ValueError('unknown scheduler type: {}'.format(type))
    print('Building PiecewiseSchedule with <endpoints> = {}'.format(endpoints))
    scheduler = utils.PiecewiseSchedule(endpoints, outside_value=1.0)
    return scheduler
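
A hypothetical usage of create_scheduler, assuming the Baselines-style value(t) accessor sketched under Example #1:

scheduler = create_scheduler('medium')
print(scheduler.value(0))        # 0.0 at step 0
print(scheduler.value(20000))    # ~0.35, interpolated between (7000, 0.25) and (40000, 0.5)
print(scheduler.value(10 ** 6))  # 1.0 beyond the last endpoint (outside_value)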
Example #4
def main():
    args = get_args()
    utils.set_global_seeds(args.seed)

    env = make_atari_env(args.env, args.seed)
    benchmark_env = make_atari_env(args.env, args.seed + 1)

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-4)

    n_timesteps = 10000000
    learning_starts = 50000
    exploration_schedule = utils.PiecewiseSchedule(
                               [(0, 1.0), (learning_starts, 1.0), (learning_starts + 1e6, 0.1)],
                               outside_value=0.1,
                           )

    replay_memory = NStepReplayMemory(
                        size=1000000,
                        history_len=args.history_len,
                        discount=0.99,
                        nsteps=args.nsteps,
                    )

    q_func = AtariRecurrentConvNet() if args.recurrent else AtariConvNet()

    dqn.learn(
        env,
        benchmark_env,
        q_func,
        replay_memory,
        optimizer=optimizer,
        exploration=exploration_schedule,
        max_timesteps=n_timesteps,
        batch_size=32,
        learning_starts=learning_starts,
        learning_freq=4,
        target_update_freq=10000,
        grad_clip=40.,
        log_every_n_steps=50000,
    )
    env.close()
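
NStepReplayMemory above is parameterized by nsteps, i.e. how many real rewards are accumulated before bootstrapping from the target network. The memory class itself is not shown; this is only a sketch of the standard n-step target it implies (the function name and signature are illustrative, not the library's API):

def nstep_target(rewards, bootstrap_value, discount=0.99):
    # rewards: [r_t, ..., r_{t+n-1}] observed over the next n steps;
    # bootstrap_value: max_a Q_target(s_{t+n}, a) from the target network.
    # Folding right-to-left yields
    #   G_t = r_t + discount * r_{t+1} + ... + discount ** (n - 1) * r_{t+n-1}
    #         + discount ** n * bootstrap_value
    g = bootstrap_value
    for r in reversed(rewards):
        g = r + discount * g
    return g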
Example #5
def main():
    seed = 0
    utils.set_global_seeds(seed)

    name = 'CartPole-v0'
    env = make_continuouscontrol_env(name, seed)
    benchmark_env = make_continuouscontrol_env(name, seed + 1)

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)

    n_timesteps = 500000
    learning_starts = 50000
    exploration_schedule = utils.PiecewiseSchedule(
        [(0, 1.0), (learning_starts, 1.0), (learning_starts + 3e5, 0.1)],
        outside_value=0.1,
    )

    replay_memory = NStepReplayMemory(
        size=500000,
        history_len=1,
        discount=0.99,
        nsteps=1,
    )

    dqn.learn(
        env,
        benchmark_env,
        CartPoleNet(),
        replay_memory,
        optimizer=optimizer,
        exploration=exploration_schedule,
        max_timesteps=n_timesteps,
        batch_size=32,
        learning_starts=learning_starts,
        learning_freq=4,
        target_update_freq=10000,
        log_every_n_steps=10000,
    )
    env.close()
Example #6
def main():
    seed = 0
    name = 'CartPole-v0'
    env = make_gym_env(name, seed)
    benchmark_env = make_gym_env(name, seed + 1)

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)

    prepopulate = 50000
    exploration_schedule = utils.PiecewiseSchedule(
        [(0, 1.0), (prepopulate, 1.0), (prepopulate + 3e5, 0.1)],
        outside_value=0.1,
    )

    replay_memory = make_replay_memory(return_est='nstep-5',
                                       capacity=500000,
                                       history_len=1,
                                       discount=0.99,
                                       cache_size=80000,
                                       block_size=100,
                                       priority=0.0)

    with utils.make_session(seed) as session:
        dqn.learn(
            session,
            env,
            benchmark_env,
            cartpole_mlp,
            replay_memory,
            optimizer=optimizer,
            exploration=exploration_schedule,
            max_timesteps=500000,
            batch_size=32,
            prepopulate=prepopulate,
            target_update_freq=10000,
            train_freq=4,
            log_every_n_steps=10000,
        )
    env.close()
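
In every example the exploration schedule is consumed by an epsilon-greedy policy inside the learner, whose loop is not shown here. A minimal sketch of that action-selection step, assuming a per-action q_values array and the value(t) accessor from above:

import numpy as np

def epsilon_greedy(q_values, t, exploration_schedule, rng=np.random):
    # With probability eps (annealed by the schedule), act uniformly at
    # random; otherwise act greedily on the current Q estimates.
    eps = exploration_schedule.value(t)
    if rng.random_sample() < eps:
        return rng.randint(len(q_values))
    return int(np.argmax(q_values))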
Example #7
def play(env, session, timesteps_num):
    ###################
    # build q network #
    ###################
    def build_cnn(inputs, act_num, scope, reuse=False):
        # standard DQN conv net (Mnih et al., 2015): three conv layers + two FC layers
        with tf.variable_scope(scope, reuse=reuse):
            out = inputs
            with tf.variable_scope('convnet'):
                out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu)
                out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu)
                out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu)
            out = layers.flatten(out)
            with tf.variable_scope('action_value'):
                out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu)
                out = layers.fully_connected(out, num_outputs=act_num, activation_fn=None)
            return out

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return utils.get_wrapper_by_name(env, "Monitor").get_total_steps() >= timesteps_num
    ##########################
    # learning rate schedule #
    ##########################
    iterations_num = float(timesteps_num) / 4.0  # one learning iteration per learn_freq=4 env steps
    lr_multiplier = 1.0
    lr_schedule = utils.PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (iterations_num / 10, 1e-4 * lr_multiplier),
        (iterations_num / 2, 5e-5 * lr_multiplier)
    ], outside_value=5e-5 * lr_multiplier)
    #################
    # set optimizer #
    #################
    OptimizerSpec = namedtuple('OptimizerSpec', ['constructor', 'kwargs', 'lr_schedule'])
    optimizer = OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )
    ########################
    # exploration schedule #
    ########################
    exploration_schedule = utils.PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (iterations_num / 2, 0.01)
    ], outside_value=0.01)
    #################
    # play the game #
    #################
    agent.learn_by_dqn(env=env,
                       q_net=build_cnn,
                       optimizer_spec=optimizer,
                       session=session,
                       exploration=exploration_schedule,
                       replay_buffer_size=1000000,
                       batch_size=32,
                       gamma=0.99,
                       learn_start=50000,
                       learn_freq=4,
                       history_frames_num=4,
                       target_update_freq=10000,
                       grad_norm_clipping=10,
                       stop_criterion=stopping_criterion
                       )
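
The OptimizerSpec namedtuple in Examples #1 and #7 defers optimizer construction so the training loop can feed a rate from lr_schedule at every update. agent.learn_by_dqn is not shown, so this is only a hypothetical sketch of how such a spec is typically consumed in TF1, assuming the optimizer, session, and iterations_num from the example above are in scope (the toy loss stands in for the real Bellman error):

w = tf.Variable(0.0)
loss = tf.square(w - 1.0)  # toy stand-in for the DQN loss
lr_placeholder = tf.placeholder(tf.float32, (), name='learning_rate')
train_op = optimizer.constructor(learning_rate=lr_placeholder,
                                 **optimizer.kwargs).minimize(loss)
session.run(tf.global_variables_initializer())
for t in range(int(iterations_num)):
    session.run(train_op,
                feed_dict={lr_placeholder: optimizer.lr_schedule.value(t)})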