Example #1
def atari_learn(env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(
        env,
        q_func=atari_model,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        # learning_starts=50000,
        learning_starts=50,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10)
    env.close()
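Every example on this page drives the exploration epsilon (and often the learning rate) with PiecewiseSchedule, which linearly interpolates between the listed (timestep, value) endpoints and returns outside_value beyond them. As a point of reference, here is a minimal, self-contained sketch of that behavior; it is not the actual dqn_utils class, whose interface may differ slightly.

# Minimal sketch of PiecewiseSchedule-style behavior: linear interpolation
# between (t, value) endpoints, with a fixed value outside the listed range.
class SimplePiecewiseSchedule(object):
    def __init__(self, endpoints, outside_value=None):
        # endpoints must be sorted by timestep, e.g. [(0, 1.0), (1e6, 0.1)]
        self._endpoints = endpoints
        self._outside_value = outside_value

    def value(self, t):
        for (l_t, l_v), (r_t, r_v) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return l_v + alpha * (r_v - l_v)
        # t is before the first or at/after the last endpoint
        return self._outside_value


# e.g. epsilon = 1.0 at t=0, annealed linearly to 0.1 at t=1e6
eps = SimplePiecewiseSchedule([(0, 1.0), (1e6, 0.1)], outside_value=0.1)
print(eps.value(5e5))  # ~0.55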
def lander_learn(
        env,
        session,
        num_timesteps,
        # YOUR OWN CODE
        seed,
        doubleQ=True,
        exp_name='doubleQ',
        schedule='PiecewiseSchedule',
        rew_file='lander_test.pk1'):
    # optimizer = lander_optimizer()
    # stopping_criterion = lander_stopping_criterion(num_timesteps)
    # exploration_schedule = lander_exploration_schedule(num_timesteps)

    dqn.learn(
        env=env,
        session=session,
        exploration=lander_exploration_schedule(num_timesteps, schedule),
        stopping_criterion=lander_stopping_criterion(num_timesteps),
        double_q=doubleQ,
        # YOUR OWN CODE
        rew_file=rew_file,
        seed=seed,
        env_name='LunarLander-v2',
        exp_name=exp_name,
        **lander_kwargs())
    env.close()
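The lander_learn variants on this page call helpers that none of the snippets define: lander_optimizer, lander_stopping_criterion, lander_exploration_schedule, lander_kwargs, and lander_model. A hedged reconstruction, inferred mainly from the explicit LunarLander settings in Example #24, could look like the following; the exact values and signatures vary between repositories (the snippet above, for instance, passes an extra schedule argument to lander_exploration_schedule).

def lander_optimizer():
    # Adam with a constant 1e-3 learning rate (values taken from Example #24)
    return dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                             kwargs={},
                             lr_schedule=ConstantSchedule(1e-3))

def lander_stopping_criterion(num_timesteps):
    def stopping_criterion(env, t):
        # t counts wrapped-env steps, not underlying-env steps
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps
    return stopping_criterion

def lander_exploration_schedule(num_timesteps):
    # anneal epsilon from 1.0 to 0.02 over the first 10% of training
    return PiecewiseSchedule([
        (0, 1.0),
        (num_timesteps * 0.1, 0.02),
    ], outside_value=0.02)

def lander_kwargs():
    return dict(q_func=lander_model,  # small MLP; see the sketch under Example #24
                optimizer_spec=lander_optimizer(),
                replay_buffer_size=50000,
                batch_size=32,
                gamma=1.00,
                learning_starts=1000,
                learning_freq=1,
                frame_history_len=1,
                target_update_freq=3000,
                grad_norm_clipping=10,
                lander=True)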
Example #3
def learn(env, session, args):
    if args.env == 'PongNoFrameskip-v4':
        lr_schedule = ConstantSchedule(1e-4)
        optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                      kwargs=dict(epsilon=1e-4),
                                      lr_schedule=lr_schedule)
        limit = max(int(args.num_steps / 2), 2e6)
        exploration_schedule = PiecewiseSchedule([
            (0, 1.00),
            (1e6, 0.10),
            (limit, 0.01),
        ],
                                                 outside_value=0.01)
        dqn.learn(env=env,
                  q_func=atari_model,
                  optimizer_spec=optimizer,
                  session=session,
                  exploration=exploration_schedule,
                  replay_buffer_size=1000000,
                  batch_size=32,
                  gamma=0.99,
                  learning_starts=50000,
                  learning_freq=4,
                  frame_history_len=4,
                  target_update_freq=10000,
                  grad_norm_clipping=10,
                  double_q=args.double_q,
                  logdir=args.logdir,
                  max_steps=args.num_steps)
    elif args.env == 'CartPole-v0':
        lr_schedule = ConstantSchedule(5e-4)
        optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                      kwargs=dict(epsilon=1e-4),
                                      lr_schedule=lr_schedule)
        exploration_schedule = PiecewiseSchedule([
            (0, 1.00),
            (5e4, 0.10),
            (1e5, 0.02),
        ],
                                                 outside_value=0.02)
        dqn.learn(env=env,
                  q_func=cartpole_model,
                  optimizer_spec=optimizer,
                  session=session,
                  exploration=exploration_schedule,
                  replay_buffer_size=10000,
                  batch_size=100,
                  gamma=0.99,
                  learning_starts=1000,
                  learning_freq=4,
                  frame_history_len=1,
                  target_update_freq=500,
                  grad_norm_clipping=10,
                  double_q=args.double_q,
                  logdir=args.logdir,
                  max_steps=args.num_steps,
                  cartpole=True)
    else:
        raise ValueError(args.env)
    env.close()
Example #4
def atari_learn(env, session, num_timesteps):
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier),
         (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)],
        outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdadeltaOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([(0, 1.0), (1e6, 0.1),
                                              (num_iterations / 2, 0.01)],
                                             outside_value=0.01)
    dqn.learn(env=env,
              q_func=atari_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_q=True)
    env.close()
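atari_model is the Q-network passed as q_func in most of these examples, but none of the snippets show it. Below is a minimal sketch of the usual Nature-DQN convnet it implements; this is a sketch only, and layer sizes may not match every repository.

import tensorflow as tf

def atari_model(img_in, num_actions, scope, reuse=False):
    # img_in: batch of uint8 frames, stacked along the channel axis
    with tf.variable_scope(scope, reuse=reuse):
        out = tf.cast(img_in, tf.float32) / 255.0
        out = tf.layers.conv2d(out, filters=32, kernel_size=8, strides=4,
                               activation=tf.nn.relu)
        out = tf.layers.conv2d(out, filters=64, kernel_size=4, strides=2,
                               activation=tf.nn.relu)
        out = tf.layers.conv2d(out, filters=64, kernel_size=3, strides=1,
                               activation=tf.nn.relu)
        out = tf.layers.flatten(out)
        out = tf.layers.dense(out, units=512, activation=tf.nn.relu)
        # one Q-value per discrete action
        return tf.layers.dense(out, units=num_actions, activation=None)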
def atari_learn(
        env,
        session,
        num_timesteps,
        # YOUR OWN CODE
        seed,
        doubleQ=True,
        exp_name='doubleQ',
        rew_file='ram_test.pk1'):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 0.2),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(
        env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=1,
        target_update_freq=10000,
        grad_norm_clipping=10,
        # YOUR OWN CODE
        double_q=doubleQ,
        rew_file=rew_file,
        seed=seed,
        env_name='Pong-ram-v0',
        exp_name=exp_name)
    env.close()
def atari_learn(env,
                session,
                num_timesteps,
                double_q,
                explore,
                env_name,
                ex2=ex2,
                coef=coef):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    # therefore, the exploration rate gradually decreases
    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    # TODO: note the double_q argument passed through here
    dqn.learn(env=env,
              q_func=atari_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_q=double_q,
              rew_file='./pkl/' + env_name + '_' +
              time.strftime("%d-%m-%Y_%H-%M-%S") + '.pkl',
              explore=explore,
              ex2=ex2,
              coef=coef)
    env.close()
Example #7
def atari_learn(env,
                session,
                num_timesteps,
                lr_multiplier):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
        outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01
    )

    if not (os.path.exists('data')):
        os.makedirs('data')
    logdir = os.path.join('data', 'PongNoFrameskip-v4')
    if not (os.path.exists(logdir)):
        os.makedirs(logdir)

    dqn.learn(
        env=env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10,
        double_q=False,
        # rew_file='%s_lr_%s.pkl' % (os.path.join(logdir, time.strftime("%d-%m-%Y_%H-%M-%S")), str(lr_multiplier))
        rew_file='%s_lr_%s.pkl' % (os.path.join(logdir, time.strftime("%d-%m-%Y_%H-%M-%S")), 'vanilla')
    )
    env.close()
Example #8
def atari_learn(env,
                session,
                args,
                num_timesteps):
    logdir = os.path.join('data', args.exp_name)
    # if not os.path.exists(logdir):
    #     os.makedirs(logdir)
    
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
                                         (0,                   1e-4 * lr_multiplier),
                                         (num_iterations / 10, 1e-4 * lr_multiplier),
                                         (num_iterations / 2,  5e-5 * lr_multiplier),
                                    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01
    )

    dqn.learn(
        env=env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=args.gamma,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10,
        double_q=args.double_q,
        logdir=logdir
    )
    env.close()
Example #9
def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        if (t % 10000 == 0):
            print("get_total_steps:" +
                  str(get_wrapper_by_name(env, "Monitor").get_total_steps()) +
                  ", t:" + str(t) + ", num_timesteps:" + str(num_timesteps))
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    exploration_schedule2 = PiecewiseSchedule([
        (0, 1.0),
        (2e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                              outside_value=0.01)

    dqn.learn(
        env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,  #pipaek
        stopping_criterion=stopping_criterion,
        #replay_buffer_size=1000000,
        replay_buffer_size=2000000,  #pipaek
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10)
    env.close()
def cartpole_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    # lr_multiplier = 1.0
    # lr_multiplier = 0.1
    # lr_schedule = PiecewiseSchedule([
    # (0,                   1e-4 * lr_multiplier),
    # (num_iterations / 2,  1e-5 * lr_multiplier),
    # ],
    # outside_value=5e-5 * lr_multiplier)
    lr_schedule = InverseSchedule(initial_p=0.1, gamma=0.6)

    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.GradientDescentOptimizer,
        # constructor=tf.train.AdamOptimizer,
        # kwargs=dict(epsilon=1e-4),
        kwargs=dict(),
        # constructor=tf.train.RMSPropOptimizer,
        # kwargs=dict(epsilon=1e-1),
        lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            # (0.2 * num_timesteps, 0.9),
            # (0.5 * num_timesteps, 0.5),
            (0.1 * num_timesteps, 0.1),
        ],
        outside_value=0.01)

    dqn.learn(
        env,
        q_func=cartpole_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=100000,
        batch_size=256,
        gamma=0.99,
        learning_starts=2000,
        learning_freq=1,
        frame_history_len=4,
        target_update_freq=1000,
        grad_norm_clipping=1000,
    )
    env.close()
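Nearly every stopping_criterion above calls get_wrapper_by_name(env, "Monitor") so that the step count is read off the Monitor wrapper rather than the (possibly frame-skipped) underlying env. A sketch of what such a helper typically does, assuming standard gym wrappers:

import gym

def get_wrapper_by_name(env, classname):
    # Walk down the chain of gym wrappers until one whose class name
    # contains `classname` (e.g. "Monitor") is found.
    current = env
    while True:
        if classname in current.__class__.__name__:
            return current
        if isinstance(current, gym.Wrapper):
            current = current.env
        else:
            raise ValueError("Couldn't find wrapper named %s" % classname)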
Example #11
def lander_learn(env, session, num_timesteps, seed):
    optimizer = lander_optimizer()
    stopping_criterion = lander_stopping_criterion(num_timesteps)
    exploration_schedule = lander_exploration_schedule(num_timesteps)
    dqn.learn(env=env,
              session=session,
              exploration=lander_exploration_schedule(num_timesteps),
              stopping_criterion=lander_stopping_criterion(num_timesteps),
              double_q=True,
              **lander_kwargs())
    env.close()
Example #12
def arm_learn(env,
              session,
              scope_name,
              num_timesteps,
              spec_file=None,
              exp_dir=None):
    # # This is just a rough estimate
    # num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_timesteps / 40, 1e-4 * lr_multiplier),
        (num_timesteps / 8, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(t):
        return t >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (num_timesteps / 20, 0.3),
        (num_timesteps / 10, 0.1),
        (num_timesteps / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(env,
              q_func=arm_model,
              optimizer_spec=optimizer,
              session=session,
              scope_name=scope_name,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=2000,
              learning_freq=1,
              frame_history_len=1,
              target_update_freq=500,
              grad_norm_clipping=10,
              log_every_n_steps=500,
              spec_file=spec_file,
              exp_dir=exp_dir)

    ep_rew = env.get_episode_rewards()
    ep_len = env.get_episode_lengths()

    return ep_rew, ep_len
def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 0.2),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    # Initialize Logging Dir
    data_path = osp.join(osp.dirname(osp.realpath(__file__)), 'data')

    if not (osp.exists(data_path)):
        os.makedirs(data_path)
    logdir = 'dqn_' + env.spec.id + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = osp.join(data_path, logdir)

    dqn.learn(env=env,
              q_func=atari_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=1,
              target_update_freq=10000,
              grad_norm_clipping=10,
              logdir=logdir)
    env.close()
Example #14
def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    if REW_FILE == 'LinearSchedule':
        exploration_schedule = LinearSchedule(num_iterations,
                                              final_p=0.01,
                                              initial_p=1.0)
    elif REW_FILE == 'ConstantSchedule':
        exploration_schedule = ConstantSchedule(0.05)
    else:
        exploration_schedule = PiecewiseSchedule([
            (0, 1.0),
            (num_iterations / 5, 0.1),
            (num_iterations / 2, 0.01),
        ],
                                                 outside_value=0.01)

    dqn.learn(env=env,
              q_func=atari_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=100000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_q=True,
              rew_file=REW_FILE)
    env.close()
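The REW_FILE branch above switches between LinearSchedule, ConstantSchedule, and PiecewiseSchedule for exploration. For reference, minimal sketches of the first two (the dqn_utils versions may differ in detail):

class ConstantSchedule(object):
    def __init__(self, value):
        self._value = value

    def value(self, t):
        # same epsilon at every timestep
        return self._value


class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # linearly anneal from initial_p to final_p over schedule_timesteps,
        # then stay at final_p
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)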
def lander_learn(env, session, discount, num_timesteps, batch_size, double):
    optimizer = lander_optimizer()
    stopping_criterion = lander_stopping_criterion(num_timesteps)
    exploration_schedule = lander_exploration_schedule(num_timesteps)

    dqn.learn(env=env,
              session=session,
              exploration=lander_exploration_schedule(num_timesteps),
              stopping_criterion=lander_stopping_criterion(num_timesteps),
              batch_size=batch_size,
              gamma=discount,
              double_q=double,
              **lander_kwargs())
    env.close()
def atari_learn(env, session, discount, num_timesteps, batch_size, double,
                target_update_freq, **kwargs):
    # [Mehran Shakerinava] change end
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(
        env=env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        grad_norm_clipping=10,
        # [Mehran Shakerinava] change begin
        target_update_freq=target_update_freq,
        batch_size=batch_size,
        gamma=discount,
        double_q=double
        # [Mehran Shakerinava] change end
    )
    env.close()
Example #17
def knapsack_learn(env,
                   session,
                   num_timesteps,
                   lr_multiplier=1.0,
                   target_update_freq=10000,
                   exp_name='Knapsack_DQN',
                   boltzmann_exploration=False):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(env,
              q_func=knapsack_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=512,
              gamma=1,
              learning_starts=5000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=target_update_freq,
              grad_norm_clipping=10,
              exp_name=exp_name,
              boltzmann_exploration=boltzmann_exploration)
    env.close()
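Example #17 exposes a boltzmann_exploration flag alongside the usual epsilon schedule. A hedged sketch of the action-selection step such a flag presumably switches inside dqn.learn (illustrative only; the real internals are not shown in these snippets):

import numpy as np

def select_action(q_values, eps, boltzmann=False):
    # q_values: 1-D array of Q(s, a) for the current observation
    q_values = np.asarray(q_values, dtype=np.float64)
    if boltzmann:
        # Boltzmann (softmax) exploration: sample actions in proportion to
        # exp(Q), so better actions are chosen more often but not always
        probs = np.exp(q_values - np.max(q_values))
        probs /= probs.sum()
        return int(np.random.choice(len(q_values), p=probs))
    # epsilon-greedy: random action with probability eps (taken from the
    # exploration schedule), greedy action otherwise
    if np.random.rand() < eps:
        return int(np.random.randint(len(q_values)))
    return int(np.argmax(q_values))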
Example #18
def main():
    args = get_args()

    env = make_atari_env(args.env, args.seed)
    benchmark_env = make_atari_env(args.env, args.seed + 1)

    optimizer = tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=1e-4)

    exploration_schedule = utils.PiecewiseSchedule(
        [
            (0, 1.0),
            (args.prepopulate, 1.0),
            (args.prepopulate + args.explore_time, args.final_eps),
        ],
        outside_value=args.final_eps,
    )

    if not args.legacy:
        assert args.train_freq == 4  # Training frequency is undefined for DQN(lambda)
        replay_memory = make_replay_memory(args.return_est, args.mem_size,
                                           args.history_len, args.discount,
                                           args.cache_size, args.block_size,
                                           args.priority)
    else:
        assert args.cache_size == 80000  # Cache-related args are undefined for legacy DQN
        assert args.priority == 0.0
        assert args.block_size == 100
        replay_memory = make_legacy_replay_memory(args.return_est,
                                                  args.mem_size,
                                                  args.history_len,
                                                  args.discount)

    with utils.make_session(args.seed) as session:
        dqn.learn(
            session,
            env,
            benchmark_env,
            atari_cnn,
            replay_memory,
            optimizer,
            exploration_schedule,
            args.timesteps,
            args.batch_size,
            args.prepopulate,
            args.update_freq,
            train_freq=args.train_freq,
            grad_clip=args.grad_clip,
            log_every_n_steps=1000,
        )
    env.close()
Example #19
def atari_learn(env,
                session,
                num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0 
    lr_schedule = PiecewiseSchedule([
                                         (0,                   1e-4 * lr_multiplier),
                                         (num_iterations / 10, 1e-4 * lr_multiplier),
                                         (num_iterations / 2,  5e-5 * lr_multiplier),
                                    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 0.2),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01
    )

    dqn.learn(
        env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=1,
        target_update_freq=10000,
        grad_norm_clipping=10
    )
    env.close()
Example #20
def atari_learn(env,
                num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    LEARNING_RATE = 5e-5
    lr_multiplier = 3.0
    lr_schedule = PiecewiseSchedule([
                                         (0,                   1e-4 * lr_multiplier),
                                         (num_iterations / 10, 1e-4 * lr_multiplier),
                                         (num_iterations / 2,  5e-5 * lr_multiplier),
                                    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=optim.Adam,
        kwargs=dict(lr=LEARNING_RATE, eps=1e-4)
    )


    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01
    )

    dqn.learn(
        env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10,
        num_target_values=10
    )
    env.close()
def lander_learn(env, session, num_timesteps, seed):

    optimizer = lander_optimizer()
    stopping_criterion = lander_stopping_criterion(num_timesteps)
    exploration_schedule = lander_exploration_schedule(num_timesteps)

    dqn.learn(env=env,
              session=session,
              exploration=lander_exploration_schedule(num_timesteps),
              stopping_criterion=lander_stopping_criterion(num_timesteps),
              double_q=True,
              save_name='lander_ddqn_replay500000_target5000',
              save=True,
              test=False,
              **lander_kwargs())
    env.close()
Example #22
def atari_learn(env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return env.get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            # (0, 1.0),
            # (1e6, 0.1),
            (0, 0.4),
            (5e6, 0.1),
            (num_iterations / 2, 0.01),
        ],
        outside_value=0.01)
    lr_schedule = dict(milestones=[num_iterations / 2], gamma=0.5)

    dqn.learn(
        env=env,
        lr_schedule=lr_schedule,
        load_path='model/step_2400000.pth.tar',
        # load_path=None,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=700000,
        # replay_buffer_size=70000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        # learning_starts=50,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10,
        double_q=True)
    env.close()
Example #23
def lander_learn(env, session, num_timesteps, seed, double_q, explore):

    optimizer = lander_optimizer()
    stopping_criterion = lander_stopping_criterion(num_timesteps)
    exploration_schedule = lander_exploration_schedule(num_timesteps)

    dqn.learn(
        env=env,
        session=session,
        exploration=lander_exploration_schedule(num_timesteps),
        stopping_criterion=lander_stopping_criterion(num_timesteps),
        # double_q=True,
        double_q=double_q,
        rew_file='./pkl/lander_' + time.strftime("%d-%m-%Y_%H-%M-%S") + '.pkl',
        explore=explore,
        **lander_kwargs())
    env.close()
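Many examples on this page toggle double_q (or doubleQ). A minimal sketch of the bootstrap value this flag typically changes inside dqn.learn; tensor names and structure here are illustrative, not the actual implementation.

import tensorflow as tf

def bootstrap_q(q_tp1_online, q_tp1_target, num_actions, double_q):
    # q_tp1_online / q_tp1_target: [batch, num_actions] Q-values at the next
    # state s', from the online and target networks respectively.
    if double_q:
        # Double DQN: select the action with the online network, evaluate it
        # with the target network; this reduces the overestimation bias of
        # taking a plain max over noisy Q-values.
        best_a = tf.argmax(q_tp1_online, axis=1)
        return tf.reduce_sum(q_tp1_target * tf.one_hot(best_a, num_actions),
                             axis=1)
    # Vanilla DQN: bootstrap from the target network's own maximum.
    return tf.reduce_max(q_tp1_target, axis=1)

# The regression target is then  y = r + gamma * (1 - done) * bootstrap_q(...)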
Example #24
def atari_learn(env, session, num_timesteps, result_dir):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    lander_optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                         kwargs={},
                                         lr_schedule=ConstantSchedule(1e-3))

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1),
        (num_timesteps * 0.1, 0.02),
    ],
                                             outside_value=0.02)

    dqn.learn(
        env=env,
        q_func=lander_model,
        optimizer_spec=lander_optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=50000,
        batch_size=32,
        gamma=1,
        learning_starts=1000,
        learning_freq=1,
        frame_history_len=1,
        target_update_freq=3000,
        grad_norm_clipping=10,
        lander=True,
        rew_file=osp.join(result_dir, 'episode_rewards.pkl'),
    )
    env.close()
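Example #24 passes lander_model as q_func. For the low-dimensional LunarLander observation this is normally just a small fully connected network; a hedged sketch (exact widths vary by repository):

import tensorflow as tf

def lander_model(obs, num_actions, scope, reuse=False):
    # obs: [batch, obs_dim] float observations (no pixels, no frame stacking)
    with tf.variable_scope(scope, reuse=reuse):
        out = tf.layers.dense(obs, units=64, activation=tf.nn.relu)
        out = tf.layers.dense(out, units=64, activation=tf.nn.relu)
        # one Q-value per discrete action
        return tf.layers.dense(out, units=num_actions, activation=None)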
Example #25
def lander_learn(env, session, num_timesteps, seed):
    # Initialize Logging Dir
    data_path = osp.join(osp.dirname(osp.realpath(__file__)), 'data')

    if not (osp.exists(data_path)):
        os.makedirs(data_path)
    logdir = 'dqn_' + env.spec.id + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = osp.join(data_path, logdir)

    dqn.learn(env=env,
              session=session,
              exploration=lander_exploration_schedule(num_timesteps),
              stopping_criterion=lander_stopping_criterion(num_timesteps),
              double_q=False,
              logdir=logdir,
              **lander_kwargs())
    env.close()
def lander_learn(env, session, seed, exp_name, num_timesteps, double_q,
                 replay_buffer_size):

    optimizer = lander_optimizer()
    stopping_criterion = lander_stopping_criterion(num_timesteps)
    exploration_schedule = lander_exploration_schedule(num_timesteps)

    dqn.learn(env=env,
              session=session,
              exp_name=exp_name,
              seed=seed,
              exploration=lander_exploration_schedule(num_timesteps),
              stopping_criterion=lander_stopping_criterion(num_timesteps),
              double_q=double_q,
              replay_buffer_size=replay_buffer_size,
              **lander_kwargs())
    env.close()
Example #27
def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(
        env,
        q_func=atari_model,  # the Q-network defined above
        optimizer_spec=optimizer,  # named tuple: optimizer constructor, kwargs, lr_schedule
        session=session,  # the tf session
        exploration=exploration_schedule,  # epsilon-greedy schedule
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,  # max number of transitions kept in the replay buffer
        batch_size=32,  # minibatch size for each gradient step
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10)
    env.close()
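As the comment in Example #27 notes, optimizer_spec is just a named tuple bundling the optimizer constructor, its keyword arguments, and a learning-rate schedule; dqn.learn builds the optimizer from it using the scheduled learning rate at each step. A sketch of the usual definition:

import collections

# constructor: e.g. tf.train.AdamOptimizer
# kwargs:      extra constructor arguments, e.g. dict(epsilon=1e-4)
# lr_schedule: schedule object whose .value(t) gives the learning rate at step t
OptimizerSpec = collections.namedtuple(
    "OptimizerSpec", ["constructor", "kwargs", "lr_schedule"])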
Example #28
def run(worker_id, monitor, args):
    task_id, env_code, agent = args
    env = gym.make(env_code)
    _, reward_history, _ = dqn.learn(env,
                                     agent,
                                     monitor=monitor,
                                     worker_id=worker_id)
    env.close()
    return reward_history
Example #29
def atari_learn(env, session, num_timesteps, model, double_q, logdir):

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_timesteps / 10, 1e-4 * lr_multiplier),
        (num_timesteps / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= 4 * num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_timesteps / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(env=env,
              q_func=globals()["atari_model_" + model],
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_q=double_q,
              rew_file=None,
              logdir=logdir)
    env.close()
Example #30
def knapsack_learn(env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return False

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(env,
              q_func=knapsack_model,
              nn_size=3,
              n_hidden_units=128,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_DQN=True,
              n_steps_ahead=3)
    env.close()
Example #31
def game_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        pass

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(env,
              q_func=cnn_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=None,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10)
    env.close()