Example #1
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--load', type=str)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1
    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        if isinstance(tmp_env.action_space, gym.spaces.Box):
            num_actions = tmp_env.action_space.shape[0]
        else:
            num_actions = tmp_env.action_space.n
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        reward_preprocess = lambda r: r / 10.0
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    else:
        constants = atari_constants
        num_actions = tmp_env.action_space.n
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]
        def state_preprocess(state):
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0
        reward_preprocess = lambda r: np.clip(r, -1.0, 1.0)
        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda s: np.transpose(s, [1, 2, 0])

    # flag for a continuous action space
    continuous = isinstance(tmp_env.action_space, gym.spaces.Box)
    upper_bound = tmp_env.action_space.high if continuous else None

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    model = make_network(
        constants.CONVS, constants.FCS, use_lstm=constants.LSTM,
        padding=constants.PADDING, continuous=continuous)

    # learning rate with decay operation
    if constants.LR_DECAY == 'linear':
        lr = LinearScheduler(constants.LR, constants.FINAL_STEP, 'lr')
        epsilon = LinearScheduler(
            constants.EPSILON, constants.FINAL_STEP, 'epsilon')
    else:
        lr = ConstantScheduler(constants.LR, 'lr')
        epsilon = ConstantScheduler(constants.EPSILON, 'epsilon')

    agent = Agent(
        model,
        num_actions,
        nenvs=constants.ACTORS,
        lr=lr,
        epsilon=epsilon,
        gamma=constants.GAMMA,
        lam=constants.LAM,
        lstm_unit=constants.LSTM_UNIT,
        value_factor=constants.VALUE_FACTOR,
        entropy_factor=constants.ENTROPY_FACTOR,
        time_horizon=constants.TIME_HORIZON,
        batch_size=constants.BATCH_SIZE,
        grad_clip=constants.GRAD_CLIP,
        state_shape=state_shape,
        epoch=constants.EPOCH,
        phi=phi,
        use_lstm=constants.LSTM,
        continuous=continuous,
        upper_bound=upper_bound
    )

    saver = tf.train.Saver()
    if args.load:
        saver.restore(sess, args.load)

    # create environments
    envs = []
    for i in range(constants.ACTORS):
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env, noop_max=30)
            env = MaxAndSkipEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=reward_preprocess,
            s_preprocess=state_preprocess
        ) 
        envs.append(wrapped_env)
    batch_env = BatchEnvWrapper(envs)

    if not args.load:
        # initialize variables only when no checkpoint was restored above,
        # otherwise the restored weights would be overwritten
        sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    logger = TfBoardLogger(summary_writer)
    logger.register('reward', dtype=tf.float32)
    end_episode = lambda r, s, e: logger.plot('reward', r, s)

    def after_action(state, reward, global_step, local_step):
        if global_step % 10 ** 6 == 0:
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=global_step)

    trainer = BatchTrainer(
        env=batch_env,
        agent=agent,
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,
        after_action=after_action,
        end_episode=end_episode,
        training=not args.demo
    )
    trainer.start()
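
Note: a minimal sketch (not part of the example above) of what the Atari `phi` does, assuming the usual 4-frame window of 84x84 grayscale frames for STATE_WINDOW and STATE_SHAPE:

import numpy as np

window = np.zeros((4, 84, 84), dtype=np.float32)  # (window_size, H, W)
phi = lambda s: np.transpose(s, [1, 2, 0])        # same transpose as in the example
print(phi(window).shape)                          # (84, 84, 4): channels-last input for the CNN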
Example #2
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongDeterministic-v4')
    parser.add_argument('--threads', type=int, default=8)
    parser.add_argument('--load', type=str)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--record', action='store_true')
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1
    # box environment
    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        actions = range(tmp_env.action_space.n)
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    # atari environment
    else:
        constants = atari_constants
        actions = get_action_space(env_name)
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]
        def state_preprocess(state):
            # atari specific preprocessing
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0
        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda s: np.transpose(s, [1, 2, 0])

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    model = make_network(
        constants.CONVS, constants.FCS,
        lstm=constants.LSTM, padding=constants.PADDING)

    # share one optimizer (RMSProp or Adam) across all worker threads
    lr = tf.Variable(constants.LR)
    decayed_lr = tf.placeholder(tf.float32)
    decay_lr_op = lr.assign(decayed_lr)
    if constants.OPTIMIZER == 'rmsprop':
        optimizer = tf.train.RMSPropOptimizer(lr, decay=0.99, epsilon=0.1)
    else:
        optimizer = tf.train.AdamOptimizer(lr)

    master = make_agent(
        model, actions, optimizer, state_shape, phi, 'global', constants)

    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
    saver = tf.train.Saver(global_vars)

    agents = []
    envs = []
    for i in range(args.threads):
        name = 'worker{}'.format(i)
        agent = make_agent(
            model, actions, optimizer, state_shape, phi, name, constants)
        agents.append(agent)
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=lambda r: np.clip(r, -1, 1),
            s_preprocess=state_preprocess
        )
        envs.append(wrapped_env)

    sess.run(tf.global_variables_initializer())
    # restore the global network after initialization so that the loaded
    # weights are not overwritten by the initializer
    if args.load:
        saver.restore(sess, args.load)

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    tflogger = TfBoardLogger(summary_writer)
    tflogger.register('reward', dtype=tf.float32)
    tflogger.register('eval_reward', dtype=tf.float32)
    end_episode = lambda r, gs, s, ge, e: tflogger.plot('reward', r, gs)

    def after_action(state, reward, shared_step, global_step, local_step):
        if constants.LR_DECAY == 'linear':
            decay = 1.0 - (float(shared_step) / constants.FINAL_STEP)
            if decay < 0.0:
                decay = 0.0
            sess.run(decay_lr_op, feed_dict={decayed_lr: constants.LR * decay})
        if shared_step % 10 ** 6 == 0:
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=shared_step)

    trainer = AsyncTrainer(
        envs=envs,
        agents=agents,
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,
        after_action=after_action,
        end_episode=end_episode,
        training=not args.demo,
        n_threads=args.threads
    )
    trainer.start()
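
Note: a standalone sketch of the linear learning-rate decay that `after_action` feeds into `decay_lr_op`; the base rate and final step below are assumed values, the real ones come from `constants.LR` and `constants.FINAL_STEP`:

def decayed_learning_rate(shared_step, base_lr=7e-4, final_step=10**7):
    # mirrors the decay computed inside after_action
    decay = max(0.0, 1.0 - float(shared_step) / final_step)
    return base_lr * decay

print(decayed_learning_rate(0))           # 0.0007
print(decayed_learning_rate(5 * 10**6))   # 0.00035
print(decayed_learning_rate(10**7))       # 0.0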
Example #3
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--load', type=str) # path to a checkpoint to restore
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true') # demo mode: run without training
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1
    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        if isinstance(tmp_env.action_space, gym.spaces.Box):
            num_actions = tmp_env.action_space.shape[0] # continuous action space: the action dimensionality
        else:
            num_actions = tmp_env.action_space.n # discrete action space: the number of selectable actions
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        reward_preprocess = lambda r: r / 10.0
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    else:
        constants = atari_constants
        num_actions = tmp_env.action_space.n
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]
        def state_preprocess(state):
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0
        reward_preprocess = lambda r: np.clip(r, -1.0, 1.0)
        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda s: np.transpose(s, [1, 2, 0]) # a transformation function

    # flag for a continuous action space
    continuous = isinstance(tmp_env.action_space, gym.spaces.Box) # a Box action space means continuous actions
    upper_bound = tmp_env.action_space.high if continuous else None

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    # make_network returns a callable that builds the network graph:
    # an MLP for continuous action spaces, a CNN for discrete (Atari) ones
    model = make_network(
        constants.CONVS, constants.FCS, use_lstm=constants.LSTM,
        padding=constants.PADDING, continuous=continuous)

    # learning rate with decay operation
    if constants.LR_DECAY == 'linear':
        lr = LinearScheduler(constants.LR, constants.FINAL_STEP, 'lr')
        epsilon = LinearScheduler(
            constants.EPSILON, constants.FINAL_STEP, 'epsilon')
    else:
        lr = ConstantScheduler(constants.LR, 'lr')
        epsilon = ConstantScheduler(constants.EPSILON, 'epsilon')

    agent = Agent(
        model, # the network-building callable from make_network
        num_actions,
        nenvs=constants.ACTORS,
        lr=lr,
        epsilon=epsilon,
        gamma=constants.GAMMA,
        lam=constants.LAM,
        lstm_unit=constants.LSTM_UNIT,
        value_factor=constants.VALUE_FACTOR,
        entropy_factor=constants.ENTROPY_FACTOR,
        time_horizon=constants.TIME_HORIZON,
        batch_size=constants.BATCH_SIZE,
        grad_clip=constants.GRAD_CLIP,
        state_shape=state_shape,
        epoch=constants.EPOCH,
        phi=phi,
        use_lstm=constants.LSTM,
        continuous=continuous,
        upper_bound=upper_bound
    )

    saver = tf.train.Saver(max_to_keep=5)
    if args.load:
        saver.restore(sess, args.load)
    else:
        # initialize variables only when no checkpoint is restored;
        # otherwise the restored weights would be overwritten
        sess.run(tf.global_variables_initializer())

    # create environments
    envs = []
    for i in range(constants.ACTORS): # one environment per actor
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env, noop_max=30)
            env = MaxAndSkipEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=reward_preprocess,
            s_preprocess=state_preprocess
        ) 
        envs.append(wrapped_env)
    batch_env = BatchEnvWrapper(envs) # steps the whole list of environments as a batch

    # note: tf.global_variables_initializer() is intentionally not run here;
    # running it after saver.restore() would overwrite the loaded checkpoint

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    logger = TfBoardLogger(summary_writer)
    logger.register('reward', dtype=tf.float32)
    end_episode = lambda r, s, e: logger.plot('reward', r, s) # log the episode reward

    def after_action(state, reward, global_step, local_step):
        # after each action, decide whether to save the model
        # (demo mode never saves parameters)
        if global_step % 10 ** 5 <= 10 and not args.demo:
            # save roughly every 10**5 steps; an exact `% 10**5 == 0` check
            # could be missed because global_step may skip over exact multiples
            path = os.path.join(outdir, 'model.ckpt')
            print('model saved, global step: {}'.format(global_step))
            saver.save(sess, path, global_step=global_step)

    trainer = BatchTrainer(
        env=batch_env,
        agent=agent, # the Agent instance created above
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP, # total time-step limit for training
        after_action=after_action, # callback function after an action
        end_episode=end_episode,
        training=not args.demo # with --demo the policy and value networks are not updated
    )
    trainer.start()
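
Note: the window check in `after_action` exists because `global_step` can advance by more than one per call and may skip over exact multiples of 10**5. A minimal sketch with a hypothetical stride of 6 (the real increment depends on the trainer and the number of actors):

stride = 6  # hypothetical increment of global_step per call
steps = list(range(0, 10**5 + stride, stride))
print(10**5 in steps)                           # False: the exact multiple is skipped
print(any(s % 10**5 <= 10 for s in steps[1:]))  # True: the window check still fires near it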