Example #1
def learn(env, session, args):
    if args.env == 'PongNoFrameskip-v4':
        lr_schedule = ConstantSchedule(1e-4)
        optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                      kwargs=dict(epsilon=1e-4),
                                      lr_schedule=lr_schedule)
        limit = max(int(args.num_steps / 2), 2e6)
        exploration_schedule = PiecewiseSchedule([
            (0, 1.00),
            (1e6, 0.10),
            (limit, 0.01),
        ],
                                                 outside_value=0.01)
        dqn.learn(env=env,
                  q_func=atari_model,
                  optimizer_spec=optimizer,
                  session=session,
                  exploration=exploration_schedule,
                  replay_buffer_size=1000000,
                  batch_size=32,
                  gamma=0.99,
                  learning_starts=50000,
                  learning_freq=4,
                  frame_history_len=4,
                  target_update_freq=10000,
                  grad_norm_clipping=10,
                  double_q=args.double_q,
                  logdir=args.logdir,
                  max_steps=args.num_steps)
    elif args.env == 'CartPole-v0':
        lr_schedule = ConstantSchedule(5e-4)
        optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                      kwargs=dict(epsilon=1e-4),
                                      lr_schedule=lr_schedule)
        exploration_schedule = PiecewiseSchedule([
            (0, 1.00),
            (5e4, 0.10),
            (1e5, 0.02),
        ],
                                                 outside_value=0.02)
        dqn.learn(env=env,
                  q_func=cartpole_model,
                  optimizer_spec=optimizer,
                  session=session,
                  exploration=exploration_schedule,
                  replay_buffer_size=10000,
                  batch_size=100,
                  gamma=0.99,
                  learning_starts=1000,
                  learning_freq=4,
                  frame_history_len=1,
                  target_update_freq=500,
                  grad_norm_clipping=10,
                  double_q=args.double_q,
                  logdir=args.logdir,
                  max_steps=args.num_steps,
                  cartpole=True)
    else:
        raise ValueError(args.env)
    env.close()
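The ConstantSchedule and PiecewiseSchedule objects passed as exploration and lr_schedule above expose a single value(t) method. The sketch below is a minimal re-implementation for illustration only, assuming the usual semantics of the course dqn_utils / baselines-style helpers these scripts import: PiecewiseSchedule linearly interpolates between its (t, value) endpoints and returns outside_value everywhere else, while ConstantSchedule always returns the same value.

class ConstantSchedule(object):
    def __init__(self, value):
        self._value = value

    def value(self, t):
        # same value at every timestep
        return self._value


class PiecewiseSchedule(object):
    def __init__(self, endpoints, outside_value=None):
        # endpoints: list of (t, value) pairs with strictly increasing t
        self._endpoints = endpoints
        self._outside_value = outside_value

    def value(self, t):
        # linearly interpolate between the two endpoints that bracket t
        for (l_t, l_v), (r_t, r_v) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return l_v + alpha * (r_v - l_v)
        # t falls outside all listed endpoints
        return self._outside_value


# e.g. the Pong exploration schedule above keeps epsilon at 1.0 at t=0, anneals
# it to 0.1 by 1e6 steps and to 0.01 by `limit`, then holds it at 0.01.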
Example #2
def atari_learn(env, session, num_timesteps):
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier),
         (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)],
        outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdadeltaOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([(0, 1.0), (1e6, 0.1),
                                              (num_iterations / 2, 0.01)],
                                             outside_value=0.01)
    dqn.learn(env=env,
              q_func=atari_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_q=True)
    env.close()
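The stopping criteria in these examples compare the Monitor wrapper's step counter against num_timesteps. Below is a sketch of the get_wrapper_by_name helper they rely on; this assumes the common dqn_utils implementation, which simply walks the chain of gym wrappers until it finds one whose class name matches, and may not match each project's exact code.

import gym


def get_wrapper_by_name(env, classname):
    # walk the wrapper chain around env until a wrapper whose class name
    # contains `classname` is found, e.g. the gym Monitor wrapper
    currentenv = env
    while True:
        if classname in currentenv.__class__.__name__:
            return currentenv
        elif isinstance(currentenv, gym.Wrapper):
            currentenv = currentenv.env
        else:
            raise ValueError("Couldn't find wrapper named %s" % classname)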
Example #3
def lander_optimizer():
    return dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        lr_schedule=ConstantSchedule(1e-3),
        # lr_schedule=ConstantSchedule(0.5e-3),
        # lr_schedule=ConstantSchedule(0.1e-3),
        # lr_schedule=ConstantSchedule(0.01e-3),
        kwargs={})
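Across the TensorFlow examples, dqn.OptimizerSpec is used as a small container for the optimizer constructor, its keyword arguments, and a learning-rate schedule. A plausible definition is sketched below as an assumption; each project defines its own in its dqn/dqn_utils module, and the PyTorch variant in the next example swaps lr_schedule for lr_lambda.

from collections import namedtuple

# assumed field layout; dqn.learn would presumably query lr_schedule.value(t)
# each step and feed the result into constructor(learning_rate=..., **kwargs)
OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"])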
Example #4
def lander_optimizer():
    lr_schedule = ConstantSchedule(1e-3)
    lr_lambda = lambda t: lr_schedule.value(t)
    return dqn.OptimizerSpec(
        constructor=torch.optim.Adam,
        lr_lambda=lr_lambda,
        kwargs={}
    )
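How dqn.learn consumes the lr_lambda field of this PyTorch OptimizerSpec is not shown here. One plausible way, sketched purely as an assumption (build_optimizer and base_lr are names introduced for illustration), is to build the optimizer with a base learning rate of 1.0 and wrap it in torch.optim.lr_scheduler.LambdaLR so the schedule's value(t) becomes the effective learning rate.

import torch


def build_optimizer(params, spec, base_lr=1.0):
    # LambdaLR multiplies base_lr by spec.lr_lambda(step); with base_lr=1.0 the
    # effective learning rate equals the schedule value. scheduler.step() must
    # be called once per training iteration to advance the step counter.
    optimizer = spec.constructor(params, lr=base_lr, **spec.kwargs)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=spec.lr_lambda)
    return optimizer, scheduler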
def atari_learn(env,
                session,
                num_timesteps,
                double_q,
                explore,
                env_name,
                ex2=ex2,
                coef=coef):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    # the exploration rate therefore decreases gradually over training
    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    # TODO: pay attention to the double_q argument passed here
    dqn.learn(env=env,
              q_func=atari_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_q=double_q,
              rew_file='./pkl/' + env_name + '_' +
              time.strftime("%d-%m-%Y_%H-%M-%S") + '.pkl',
              explore=explore,
              ex2=ex2,
              coef=coef)
    env.close()
def atari_learn(
        env,
        session,
        num_timesteps,
        # YOUR OWN CODE
        seed,
        doubleQ=True,
        exp_name='doubleQ',
        rew_file='ram_test.pk1'):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 0.2),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(
        env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=1,
        target_update_freq=10000,
        grad_norm_clipping=10,
        # YOUR OWN CODE
        double_q=doubleQ,
        rew_file=rew_file,
        seed=seed,
        env_name='Pong-ram-v0',
        exp_name=exp_name)
    env.close()
Example #7
def atari_learn(env,
                session,
                num_timesteps,
                lr_multiplier):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
        outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01
    )

    if not (os.path.exists('data')):
        os.makedirs('data')
    logdir = os.path.join('data', 'PongNoFrameskip-v4')
    if not (os.path.exists(logdir)):
        os.makedirs(logdir)

    dqn.learn(
        env=env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10,
        double_q=False,
        # rew_file='%s_lr_%s.pkl' % (os.path.join(logdir, time.strftime("%d-%m-%Y_%H-%M-%S")), str(lr_multiplier))
        rew_file='%s_lr_%s.pkl' % (os.path.join(logdir, time.strftime("%d-%m-%Y_%H-%M-%S")), 'vanilla')
    )
    env.close()
Example #8
def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        if (t % 10000 == 0):
            print("get_total_steps:" +
                  str(get_wrapper_by_name(env, "Monitor").get_total_steps()) +
                  ", t:" + str(t) + ", num_timesteps:" + str(num_timesteps))
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    exploration_schedule2 = PiecewiseSchedule([
        (0, 1.0),
        (2e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                              outside_value=0.01)

    dqn.learn(
        env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,  #pipaek
        stopping_criterion=stopping_criterion,
        #replay_buffer_size=1000000,
        replay_buffer_size=2000000,  #pipaek
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10)
    env.close()
Example #9
def atari_learn(env,
                session,
                args,
                num_timesteps):
    logdir = os.path.join('data', args.exp_name)
    #if not(os.path.exists(logdir)):
        #os.makedirs(logdir)
    
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
                                         (0,                   1e-4 * lr_multiplier),
                                         (num_iterations / 10, 1e-4 * lr_multiplier),
                                         (num_iterations / 2,  5e-5 * lr_multiplier),
                                    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01
    )

    dqn.learn(
        env=env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=args.gamma,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10,
        double_q=args.double_q,
        logdir=logdir
    )
    env.close()
Example #10
def cartpole_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    # lr_multiplier = 1.0
    # lr_multiplier = 0.1
    # lr_schedule = PiecewiseSchedule([
    # (0,                   1e-4 * lr_multiplier),
    # (num_iterations / 2,  1e-5 * lr_multiplier),
    # ],
    # outside_value=5e-5 * lr_multiplier)
    lr_schedule = InverseSchedule(initial_p=0.1, gamma=0.6)

    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.GradientDescentOptimizer,
        # constructor=tf.train.AdamOptimizer,
        # kwargs=dict(epsilon=1e-4),
        kwargs=dict(),
        # constructor=tf.train.RMSPropOptimizer,
        # kwargs=dict(epsilon=1e-1),
        lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            # (0.2 * num_timesteps, 0.9),
            # (0.5 * num_timesteps, 0.5),
            (0.1 * num_timesteps, 0.1),
        ],
        outside_value=0.01)

    dqn.learn(
        env,
        q_func=cartpole_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=100000,
        batch_size=256,
        gamma=0.99,
        learning_starts=2000,
        learning_freq=1,
        frame_history_len=4,
        target_update_freq=1000,
        grad_norm_clipping=1000,
    )
    env.close()
Example #11
File: run_dqn.py Project: cog-isa/HRL-grid
def arm_learn(env,
              session,
              scope_name,
              num_timesteps,
              spec_file=None,
              exp_dir=None):
    # # This is just a rough estimate
    # num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_timesteps / 40, 1e-4 * lr_multiplier),
        (num_timesteps / 8, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(t):
        return t >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (num_timesteps / 20, 0.3),
        (num_timesteps / 10, 0.1),
        (num_timesteps / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(env,
              q_func=arm_model,
              optimizer_spec=optimizer,
              session=session,
              scope_name=scope_name,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=2000,
              learning_freq=1,
              frame_history_len=1,
              target_update_freq=500,
              grad_norm_clipping=10,
              log_every_n_steps=500,
              spec_file=spec_file,
              exp_dir=exp_dir)

    ep_rew = env.get_episode_rewards()
    ep_len = env.get_episode_lengths()

    return ep_rew, ep_len
Example #12
def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 0.2),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    # Initialize Logging Dir
    data_path = osp.join(osp.dirname(osp.realpath(__file__)), 'data')

    if not (osp.exists(data_path)):
        os.makedirs(data_path)
    logdir = 'dqn_' + env.spec.id + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = osp.join(data_path, logdir)

    dqn.learn(env=env,
              q_func=atari_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=1,
              target_update_freq=10000,
              grad_norm_clipping=10,
              logdir=logdir)
    env.close()
Example #13
def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    if REW_FILE == 'LinearSchedule':
        exploration_schedule = LinearSchedule(num_iterations,
                                              final_p=0.01,
                                              initial_p=1.0)
    elif REW_FILE == 'ConstantSchedule':
        exploration_schedule = ConstantSchedule(0.05)
    else:
        exploration_schedule = PiecewiseSchedule([
            (0, 1.0),
            (num_iterations / 5, 0.1),
            (num_iterations / 2, 0.01),
        ],
                                                 outside_value=0.01)

    dqn.learn(env=env,
              q_func=atari_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=100000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_q=True,
              rew_file=REW_FILE)
    env.close()
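For reference, the LinearSchedule selected above when REW_FILE == 'LinearSchedule' is assumed to follow the usual baselines-style interface: epsilon is annealed linearly from initial_p to final_p over schedule_timesteps and then held at final_p. A minimal sketch under that assumption:

class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # fraction of the annealing period that has elapsed, capped at 1.0
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)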
def atari_learn(env, session, discount, num_timesteps, batch_size, double,
                target_update_freq, **kwargs):
    # [Mehran Shakerinava] change end
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(
        env=env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        grad_norm_clipping=10,
        # [Mehran Shakerinava] change begin
        target_update_freq=target_update_freq,
        batch_size=batch_size,
        gamma=discount,
        double_q=double
        # [Mehran Shakerinava] change end
    )
    env.close()
Example #15
def knapsack_learn(env,
                   session,
                   num_timesteps,
                   lr_multiplier=1.0,
                   target_update_freq=10000,
                   exp_name='Knapsack_DQN',
                   boltzmann_exploration=False):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(env,
              q_func=knapsack_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=512,
              gamma=1,
              learning_starts=5000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=target_update_freq,
              grad_norm_clipping=10,
              exp_name=exp_name,
              boltzmann_exploration=boltzmann_exploration)
    env.close()
Example #16
def atari_learn(env,                
                num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    LEARNING_RATE = 5e-5
    lr_multiplier = 3.0
    lr_schedule = PiecewiseSchedule([
                                         (0,                   1e-4 * lr_multiplier),
                                         (num_iterations / 10, 1e-4 * lr_multiplier),
                                         (num_iterations / 2,  5e-5 * lr_multiplier),
                                    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=optim.Adam,
        kwargs=dict(lr=LEARNING_RATE, eps=1e-4)
    )


    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01
    )

    dqn.learn(
        env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10,
        num_target_values=10
    )
    env.close()
Example #17
def atari_learn(env, session, num_timesteps, result_dir):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    lander_optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                         kwargs={},
                                         lr_schedule=ConstantSchedule(1e-3))

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1),
        (num_timesteps * 0.1, 0.02),
    ],
                                             outside_value=0.02)

    dqn.learn(
        env=env,
        q_func=lander_model,
        optimizer_spec=lander_optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=50000,
        batch_size=32,
        gamma=1,
        learning_starts=1000,
        learning_freq=1,
        frame_history_len=1,
        target_update_freq=3000,
        grad_norm_clipping=10,
        lander=True,
        rew_file=osp.join(result_dir, 'episode_rewards.pkl'),
    )
    env.close()
Example #18
def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(
        env,
        q_func=atari_model,  # the Q-network defined above
        optimizer_spec=optimizer,  # named tuple containing the optimizer constructor, lr_schedule, etc.
        session=session,  # the tf session
        exploration=exploration_schedule,  # epsilon-greedy exploration schedule
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,  # replay buffer capacity in transitions
        batch_size=32,  # minibatch size for each gradient step
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10)
    env.close()
Example #19
def atari_learn(env, session, num_timesteps, model, double_q, logdir):

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_timesteps / 10, 1e-4 * lr_multiplier),
        (num_timesteps / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= 4 * num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_timesteps / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(env=env,
              q_func=globals()["atari_model_" + model],
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_q=double_q,
              rew_file=None,
              logdir=logdir)
    env.close()
Example #20
def smb_learn(model_name, env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    cont_train_model(
        model_name,
        env,
        optimizer_spec=optimizer,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=125000,
        batch_size=32,
        gamma=0.99,  # 0.99
        learning_starts=30000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10)
    env.close()
Example #21
def knapsack_learn(env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return False

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(env,
              q_func=knapsack_model,
              nn_size=3,
              n_hidden_units=128,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_DQN=True,
              n_steps_ahead=3)
    env.close()
Example #22
def game_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        pass

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(env,
              q_func=cnn_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=None,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10)
    env.close()
Example #23
def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([(0, 1e-4 * lr_multiplier),
                                     (num_iterations / 10, 1e-4 * lr_multiplier),
                                     (num_iterations / 2,  5e-5 * lr_multiplier)],
                                      outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    # Create action exploration/exploitation policy
    policy = LinearAnnealedPolicy(session=session, env=env, num_iterations=num_iterations)

    dqn.learn(
        env,
        policy=policy,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        #exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10
    )
    env.close()
Example #24
def tt_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        return t >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)

    dqn.learn(env,
              q_func=tt_model_dqn,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_net=True)
Example #25
def atari_learn(env, session, num_timesteps, env_test):
    # TODO: principle of Adam and more parameters
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=FLAGS.lr_schedule)

    # TODO: t input is not used here
    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    # TODO: better hyper parameters here
    if FLAGS.tabular:
        model = tabular_model
    else:
        model = atari_model
    dqn.learn(
        env,
        q_func=model,
        optimizer_spec=optimizer,
        session=session,
        exploration=FLAGS.exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=FLAGS.replay_buffer_size,
        batch_size=FLAGS.batch_size,
        gamma=FLAGS.discount_factor,
        learning_starts=FLAGS.learning_starts,
        learning_freq=FLAGS.learning_freq,
        frame_history_len=FLAGS.frame_history_len,
        target_update_freq=FLAGS.target_update_freq,
        grad_norm_clipping=10,
        env_test=env_test,
    )

    if env is not None:
        env.close()
Example #26
def atari_learn(env,
                env_test,
                session,
                num_timesteps=2e7,
                learning_rate=None,
                exploration=None,
                dqn_config=None):
    '''
    Fill in the hyperparameters before running DQN.
    :param env: OpenAI Gym env
    :param session: TensorFlow session
    :param num_timesteps: int
    :param learning_rate: piecewise schedule
    :param exploration: piecewise schedule
    :param dqn_config: dict whose entries override the parameters above
    :return: None
    '''


    replay_buffer_size = 1000000
    batch_size = 32
    gamma = 0.99
    learning_starts = 50000
    learning_freq = 4
    frame_history_len = 4
    target_update_freq = 10000
    grad_norm_clipping = 10
    eval_obs_array = None
    room_q_interval = 1e5
    epoch_size = 5e3
    config_name = None


    if dqn_config:
        if 'num_timesteps' in dqn_config:
            num_timesteps = dqn_config['num_timesteps']
        if 'replay_buffer_size' in dqn_config:
            replay_buffer_size = dqn_config['replay_buffer_size']
        if 'batch_size' in dqn_config:
            batch_size = dqn_config['batch_size']
        if 'gamma' in dqn_config:
            gamma = dqn_config['gamma']
        if 'learning_starts' in dqn_config:
            learning_starts = dqn_config['learning_starts']
        if 'learning_freq' in dqn_config:
            learning_freq = dqn_config['learning_freq']
        if 'frame_history_len' in dqn_config:
            frame_history_len = dqn_config['frame_history_len']
        if 'target_update_freq' in dqn_config:
            target_update_freq = dqn_config['target_update_freq']
        if 'grad_norm_clipping' in dqn_config:
            grad_norm_clipping = dqn_config['grad_norm_clipping']
        if 'learning_rate' in dqn_config:
            learning_rate = dqn_config['learning_rate']
        if 'exploration' in dqn_config:
            exploration = dqn_config['exploration']
        if 'eval_obs_array' in dqn_config:
            eval_obs_array = dqn_config['eval_obs_array']
        if 'room_q_interval' in dqn_config:
            room_q_interval = dqn_config['room_q_interval']
        if 'epoch_size' in dqn_config:
            epoch_size = dqn_config['epoch_size']
        if 'config_name' in dqn_config:
            config_name = dqn_config['config_name']


    # log_dir = __cur_dir + 'logs/' + config_name + '_' + time + '/'
    cur_time = time.strftime("%m_%d_%y_%H:%M:%S", time.localtime(time.time()))
    log_dir = __cur_dir + 'logs/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    if config_name != None:
        log_dir = log_dir + config_name + '_' + cur_time + '/'
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        dqn_network_dir = log_dir + 'dqn/'
        if not os.path.exists(dqn_network_dir):
            os.makedirs(dqn_network_dir)
        pkl_dir = log_dir + 'pkl/'
        if not os.path.exists(pkl_dir):
            os.makedirs(pkl_dir)
    else:
        log_dir = None
        print("config_name not specified! info may not be logged in this run.")


    # This is just a rough estimate
    num_iterations = float(num_timesteps) / learning_freq

    if learning_rate != None:
        lr_schedule = learning_rate
    else:
        lr_multiplier = 1.0
        lr_schedule = PiecewiseSchedule([
            (0,                   1e-4 * lr_multiplier),
            (num_iterations / 10, 1e-4 * lr_multiplier),
            (num_iterations / 2,  5e-5 * lr_multiplier),
        ],
            outside_value=5e-5 * lr_multiplier)

    if exploration != None:
        exploration_schedule = exploration
    else:
        exploration_schedule = PiecewiseSchedule(
            [
                (0, 1.0),
                (1e6, 0.1),
                (num_iterations / 2, 0.01),
            ], outside_value=0.01
        )

    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps


    dqn.learn(
        env,
        env_test,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=replay_buffer_size,
        batch_size=batch_size,
        gamma=gamma,
        learning_starts=learning_starts,
        learning_freq=learning_freq,
        frame_history_len=frame_history_len,
        target_update_freq=target_update_freq,
        grad_norm_clipping=grad_norm_clipping,
        eval_obs_array=eval_obs_array,
        room_q_interval=room_q_interval,
        epoch_size=epoch_size,
        log_dir=log_dir
    )
    env.close()
    env_test.close()
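Since every hyperparameter above can be overridden through dqn_config, a hypothetical configuration dict is sketched below. The keys mirror the ones checked in the function; the values are placeholders for illustration, not settings used by the original project.

example_dqn_config = {
    'num_timesteps': int(1e7),
    'replay_buffer_size': 500000,
    'batch_size': 64,
    'gamma': 0.99,
    'learning_starts': 20000,
    'learning_freq': 4,
    'frame_history_len': 4,
    'target_update_freq': 5000,
    'grad_norm_clipping': 10,
    'learning_rate': ConstantSchedule(1e-4),  # any object with a value(t) method
    'exploration': PiecewiseSchedule([(0, 1.0), (1e6, 0.1)], outside_value=0.1),
    'eval_obs_array': None,        # passed through to dqn.learn unchanged
    'room_q_interval': 1e5,        # passed through to dqn.learn unchanged
    'epoch_size': 5e3,             # passed through to dqn.learn unchanged
    'config_name': 'example_run',  # names the log directory for this run
}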
Example #27
def atari_learn(env, session, num_timesteps, exper_name=None):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    # default exploration schedule
    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ],
                                             outside_value=0.01)
    # # no exploration
    # exploration_schedule = PiecewiseSchedule(
    #     [
    #         (0, 0),
    #         (num_iterations, 0),
    #     ], outside_value=0
    # )
    # # Only explore in beginning
    # exploration_schedule = PiecewiseSchedule(
    #     [
    #         (0, 1.0),
    #         (1e6, 0.1),
    #         (1e6+2, 0),
    #         (num_iterations, 0),
    #     ], outside_value=0
    # )
    # exploration_schedule = PiecewiseSchedule(
    #     [
    #         (0, 0.5),
    #         (1e6, 0.1),
    #         (1e6+2, 0),
    #         (num_iterations, 0),
    #     ], outside_value=0
    # )
    # exploration_schedule = PiecewiseSchedule(
    #     [
    #         (0, 0.1),
    #         (1e6, 0.1),
    #         (1e6+2, 0),
    #         (num_iterations, 0),
    #     ], outside_value=0
    # )

    dqn.learn(env,
              q_func=atari_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10)
    env.close()
def main():
    num_timesteps = 5000
    num_simulations = 20  # I will restart the simulation periodically so it can learn from a fresh start
    num_iterations = num_timesteps * num_simulations
    printMyRoute(num_timesteps)
    sumoBinary = r"D:\InstalledProgram\SUMO\bin\sumo-gui.exe"
    #sumoBinary = r"D:\InstalledProgram\SUMO\bin\sumo.exe"

    traci.start([
        sumoBinary, "-c", "conf2.sumocfg", "--tripinfo-output",
        "Testtripinfo.xml", "--no-step-log", "--time-to-teleport", "-1"
    ])
    seed = random.randint(0, 9999)
    print('random seed = %d' % seed)
    session = get_session()
    rew_file = 'testreward.pkl'

    lr_multiplier = 1
    num_hidden = 100
    gamma = 0.95
    learning_freq = 10
    target_update_freq = 100
    explor1 = num_timesteps
    explor2 = num_timesteps * (num_simulations - 1)
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            (explor1, 0.1),
            (explor2, 0.01),
        ],
        outside_value=0.  # No exploration towards the end so we can see the true reward
    )

    #dqnlearn(
    dqnlearn(
        num_timesteps=num_timesteps,
        num_hidden=num_hidden,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        replay_buffer_size=200000,
        batch_size=32,
        gamma=gamma,
        learning_starts=100,
        learning_freq=learning_freq,
        frame_history_len=1,  # no frame history, we look only last data
        target_update_freq=target_update_freq,
        grad_norm_clipping=10,
        rew_file=rew_file,
        double_q=True  # True
    )
Example #29
def main():
    num_timesteps = 50000  # this isn't actually the number of steps we end up with; look into that
    num_iterations = float(num_timesteps)
    printMyRoute(num_timesteps)
    #sumoBinary = r"D:\InstalledProgram\SUMO\bin\sumo-gui.exe"
    sumoBinary = r"D:\InstalledProgram\SUMO\bin\sumo.exe"

    # Run training
    f = open('triedParameters.txt', 'a+')
    ntries = 100
    for ntry in range(ntries):
        # hyperparameter random search
        lr_multiplier = 10**random.uniform(-2, 2)  #1.0
        num_hidden = random.randint(30, 100)
        # horizon is roughly 1/(1-gamma); we get a reward every 6 seconds,
        # so gamma = 0.9 corresponds to a ~60-second horizon
        gamma = random.uniform(0.8, 0.95)
        learning_freq = random.randint(1, 20)
        target_update_freq = random.randint(10, 200)
        explor1 = random.randint(100, 1000)
        explor2 = int(num_iterations * random.uniform(0.1, 0.9))
        if explor2 < explor1:
            explor2 = (num_iterations + explor1) / 2
        print(ntry,
              lr_multiplier,
              num_hidden,
              gamma,
              learning_freq,
              target_update_freq,
              explor1,
              explor2,
              file=f)

        traci.start([
            sumoBinary, "-c", "conf2.sumocfg", "--tripinfo-output",
            "tripinfo" + str(ntry) + ".xml", "--no-step-log",
            "--time-to-teleport", "-1"
        ])
        seed = random.randint(0, 9999)
        print('random seed = %d' % seed)
        session = get_session()
        rew_file = 'reward' + str(ntry) + '.pkl'

        lr_schedule = PiecewiseSchedule([
            (0, 1e-4 * lr_multiplier),
            (num_iterations / 10, 1e-4 * lr_multiplier),
            (num_iterations / 2, 5e-5 * lr_multiplier),
        ],
                                        outside_value=5e-5 * lr_multiplier)
        optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                      kwargs=dict(epsilon=1e-4),
                                      lr_schedule=lr_schedule)

        exploration_schedule = PiecewiseSchedule(
            [
                (0, 1.0),
                (explor1, 0.1),
                (explor2, 0.01),
            ],
            outside_value=0.  # No exploration towards the end so we can see the true reward
        )

        #dqnlearn(
        dqnlearn(
            num_timesteps=num_timesteps,
            num_hidden=num_hidden,
            optimizer_spec=optimizer,
            session=session,
            exploration=exploration_schedule,
            replay_buffer_size=200000,
            batch_size=32,
            gamma=gamma,
            learning_starts=100,
            learning_freq=learning_freq,
            frame_history_len=1,  # no frame history, we look only last data
            target_update_freq=target_update_freq,
            grad_norm_clipping=10,
            rew_file=rew_file,
            double_q=True  # True
        )
Example #30
def transfer_learn(env,
                   env_test,
                   env_test1,
                   session,
                   num_timesteps=2e7,
                   learning_rate=None,
                   learning_rate_term=None,
                   exploration=None,
                   dqn_config=None):
    '''
    Fill in the hyperparameters before running DQN.
    :param env: OpenAI Gym env
    :param session: TensorFlow session
    :param num_timesteps: int
    :param learning_rate: piecewise schedule
    :param exploration: piecewise schedule
    :param dqn_config: dict whose entries override the parameters above
    :return: None
    '''

    replay_buffer_size = 1000000
    batch_size = 32
    gamma = 0.99
    learning_starts = 50000
    learning_freq = 4
    frame_history_len = 4
    target_update_freq = 10000
    grad_norm_clipping = 10
    eval_obs_array = None
    room_q_interval = 1e5
    epoch_size = 5e4
    config_name = None
    transfer_config = None
    source_dirs = []
    #term_optimizer =

    if dqn_config:
        if 'num_timesteps' in dqn_config:
            num_timesteps = dqn_config['num_timesteps']
        if 'replay_buffer_size' in dqn_config:
            replay_buffer_size = dqn_config['replay_buffer_size']
        if 'batch_size' in dqn_config:
            batch_size = dqn_config['batch_size']
        if 'gamma' in dqn_config:
            gamma = dqn_config['gamma']
        if 'learning_starts' in dqn_config:
            learning_starts = dqn_config['learning_starts']
        if 'learning_freq' in dqn_config:
            learning_freq = dqn_config['learning_freq']
        if 'frame_history_len' in dqn_config:
            frame_history_len = dqn_config['frame_history_len']
        if 'target_update_freq' in dqn_config:
            target_update_freq = dqn_config['target_update_freq']
        if 'grad_norm_clipping' in dqn_config:
            grad_norm_clipping = dqn_config['grad_norm_clipping']
        if 'learning_rate' in dqn_config:
            learning_rate = dqn_config['learning_rate']
        if 'exploration' in dqn_config:
            exploration = dqn_config['exploration']
        if 'eval_obs_array' in dqn_config:
            eval_obs_array = dqn_config['eval_obs_array']
        if 'room_q_interval' in dqn_config:
            room_q_interval = dqn_config['room_q_interval']
        if 'epoch_size' in dqn_config:
            epoch_size = dqn_config['epoch_size']
        if 'config_name' in dqn_config:
            config_name = dqn_config['config_name']
        if 'transfer_config' in dqn_config:
            transfer_config = dqn_config['transfer_config']

    if transfer_config:
        if 'source_dirs' in transfer_config:
            source_dirs = transfer_config['source_dirs']
        if 'learning_rate_term' in transfer_config:
            learning_rate_term = transfer_config['learning_rate_term']

    if len(source_dirs) == 0:
        print('no source policies provided! check your config.')

    # log_dir = __cur_dir + 'logs/' + config_name + '_' + time + '/'
    cur_time = time.strftime("%m_%d_%y_%H:%M:%S", time.localtime(time.time()))
    log_dir = __cur_dir + 'logs/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    if config_name != None:
        log_dir = log_dir + config_name + '_' + cur_time + '/'
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        dqn_network_dir = log_dir + 'dqn/'
        if not os.path.exists(dqn_network_dir):
            os.makedirs(dqn_network_dir)
        pkl_dir = log_dir + 'pkl/'
        if not os.path.exists(pkl_dir):
            os.makedirs(pkl_dir)
        tfb_dir = log_dir + 'tfb/'
        if not os.path.exists(tfb_dir):
            os.makedirs(tfb_dir)
    else:
        log_dir = None
        print("config_name not specified! info may not be logged in this run.")

    # This is just a rough estimate
    num_iterations = float(num_timesteps) / learning_freq

    if learning_rate != None:
        lr_schedule = learning_rate
    else:
        lr_multiplier = 1.0
        lr_schedule = PiecewiseSchedule([
            (0, 1e-4 * lr_multiplier),
            (num_iterations / 10, 1e-4 * lr_multiplier),
            (num_iterations / 2, 5e-5 * lr_multiplier),
        ],
                                        outside_value=5e-5 * lr_multiplier)

    lr_schedule_omega = PiecewiseSchedule([
        (0, 2e-4),
        (num_iterations / 2, 1e-4),
        (num_iterations * 3 / 4, 5e-5),
    ],
                                          outside_value=5e-5)

    if learning_rate_term is not None:
        lr_schedule_term = learning_rate_term
    else:
        lr_schedule_term = PiecewiseSchedule([
            (0, 2.5e-4),
            (num_iterations / 10, 1e-4),
            (num_iterations * 3 / 4, 5e-5),
        ],
                                             outside_value=5e-5)

    if exploration != None:
        exploration_schedule = exploration
    else:
        exploration_schedule = PiecewiseSchedule([
            (0, 1.0),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ],
                                                 outside_value=0.01)

    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    optimizer_omega = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),  # not for SGD
        #kwargs=dict(),  # for SGD
        lr_schedule=lr_schedule_omega)

    optimizer_term = dqn.OptimizerSpec(
        constructor=tf.train.GradientDescentOptimizer,
        #kwargs=dict(epsilon=1e-4), not for SGD
        kwargs=dict(),
        lr_schedule=lr_schedule_term)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps
        # return False

    # init sources and primitive options
    options = []
    options += [
        Source(dqn_config, env, tf.train.get_checkpoint_state(d))
        for d in source_dirs
    ]
    for action in range(env.action_space.n):
        options.append(PrimitiveOption(action))

    dqn.learn(env,
              env_test,
              env_test1,
              transfer_model,
              optimizer,
              optimizer_omega,
              optimizer_term,
              session=session,
              options=options,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=replay_buffer_size,
              batch_size=batch_size,
              gamma=gamma,
              learning_starts=learning_starts,
              learning_freq=learning_freq,
              frame_history_len=frame_history_len,
              target_update_freq=target_update_freq,
              grad_norm_clipping=grad_norm_clipping,
              eval_obs_array=eval_obs_array,
              room_q_interval=room_q_interval,
              epoch_size=epoch_size,
              log_dir=log_dir,
              transfer_config=transfer_config)
    env.close()
    env_test.close()
    env_test1.close()