Example #1
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongDeterministic-v4')
    parser.add_argument('--threads', type=int, default=8)
    parser.add_argument('--load', type=str)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--record', action='store_true')
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1
    # box environment
    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        actions = range(tmp_env.action_space.n)
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    # atari environment
    else:
        constants = atari_constants
        actions = get_action_space(env_name)
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]
        def state_preprocess(state):
            # atari specific preprocessing
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0
        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda s: np.transpose(s, [1, 2, 0])

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    model = make_network(
        constants.CONVS, constants.FCS,
        lstm=constants.LSTM, padding=constants.PADDING)

    # share a single optimizer instance across all worker threads
    lr = tf.Variable(constants.LR)
    decayed_lr = tf.placeholder(tf.float32)
    decay_lr_op = lr.assign(decayed_lr)
    if constants.OPTIMIZER == 'rmsprop':
        optimizer = tf.train.RMSPropOptimizer(lr, decay=0.99, epsilon=0.1)
    else:
        optimizer = tf.train.AdamOptimizer(lr)


    master = make_agent(
        model, actions, optimizer, state_shape, phi, 'global', constants)

    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
    saver = tf.train.Saver(global_vars)
    if args.load:
        saver.restore(sess, args.load)

    agents = []
    envs = []
    for i in range(args.threads):
        name = 'worker{}'.format(i)
        agent = make_agent(
            model, actions, optimizer, state_shape, phi, name, constants)
        agents.append(agent)
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=lambda r: np.clip(r, -1, 1),
            s_preprocess=state_preprocess
        )
        envs.append(wrapped_env)

    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    tflogger = TfBoardLogger(summary_writer)
    tflogger.register('reward', dtype=tf.float32)
    tflogger.register('eval_reward', dtype=tf.float32)
    end_episode = lambda r, gs, s, ge, e: tflogger.plot('reward', r, gs)

    def after_action(state, reward, shared_step, global_step, local_step):
        if constants.LR_DECAY == 'linear':
            decay = 1.0 - (float(shared_step) / constants.FINAL_STEP)
            if decay < 0.0:
                decay = 0.0
            sess.run(decay_lr_op, feed_dict={decayed_lr: constants.LR * decay})
        if shared_step % 10 ** 6 == 0:
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=shared_step)

    trainer = AsyncTrainer(
        envs=envs,
        agents=agents,
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,
        after_action=after_action,
        end_episode=end_episode,
        training=not args.demo,
        n_threads=args.threads
    )
    trainer.start()
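
All of these examples lean on a small EnvWrapper utility that applies optional preprocessing around a gym environment. Its implementation is not shown on this page; the following is only a minimal sketch consistent with how it is called above (constructor taking env, r_preprocess, s_preprocess), and the project's actual class may differ.

class EnvWrapper:
    """Sketch of the wrapper interface used in these examples (assumed):
    applies optional state/reward preprocessing callables around a gym env."""

    def __init__(self, env, r_preprocess=None, s_preprocess=None):
        self.env = env
        self.r_preprocess = r_preprocess
        self.s_preprocess = s_preprocess
        # expose the underlying spaces, since the examples read them directly
        self.observation_space = env.observation_space
        self.action_space = env.action_space

    def reset(self):
        state = self.env.reset()
        return self.s_preprocess(state) if self.s_preprocess else state

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        if self.s_preprocess:
            state = self.s_preprocess(state)
        if self.r_preprocess:
            reward = self.r_preprocess(reward)
        return state, reward, done, info

    def render(self):
        self.env.render()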
Example #2
def main():
    date = datetime.now().strftime("%Y%m%d%H%M%S")
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--log', type=str, default=date)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--load-constants', type=str, default=None)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--record', action='store_true')
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__),
                          'results/{}'.format(args.log))
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    logdir = os.path.join(os.path.dirname(__file__),
                          'logs/{}'.format(args.log))

    # load constant settings
    if args.load_constants is not None:
        restored_constants = restore_constants(args.load_constants)
        # save constant settings
        dump_constants(restored_constants,
                       os.path.join(outdir, 'constants.json'))
    else:
        # dump the module-level default constants
        dump_constants(constants, os.path.join(outdir, 'constants.json'))

    env = EnvWrapper(env=gym.make(args.env),
                     r_preprocess=lambda r: r * constants.REWARD_SCALE)

    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]

    sess = tf.Session()
    sess.__enter__()

    actor = make_actor_network(constants.ACTOR_HIDDENS)
    critic = make_critic_network(constants.CRITIC_HIDDENS)
    value = make_value_network(constants.VALUE_HIDDENS)
    replay_buffer = ReplayBuffer(constants.BUFFER_SIZE)

    agent = Agent(actor,
                  critic,
                  value,
                  obs_dim,
                  n_actions,
                  replay_buffer,
                  batch_size=constants.BATCH_SIZE,
                  action_scale=env.action_space.high,
                  gamma=constants.GAMMA,
                  tau=constants.TAU,
                  actor_lr=constants.ACTOR_LR,
                  critic_lr=constants.CRITIC_LR,
                  value_lr=constants.VALUE_LR,
                  reg_factor=constants.REG_FACTOR)

    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    if args.load is not None:
        saver.restore(sess, args.load)

    # tensorboard logger
    train_writer = tf.summary.FileWriter(logdir, sess.graph)
    tflogger = TfBoardLogger(train_writer)
    tflogger.register('reward', dtype=tf.float32)
    tflogger.register('eval_reward', dtype=tf.float32)
    # json logger
    jsonlogger = JsonLogger(os.path.join(outdir, 'reward.json'))

    def end_episode(reward, step, episode):
        tflogger.plot('reward', reward, step)
        jsonlogger.plot(reward=reward, step=step, episode=episode)

    def after_action(state, reward, global_step, local_step):
        if global_step > 0 and global_step % constants.MODEL_SAVE_INTERVAL == 0:
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=global_step)

    evaluator = Evaluator(env=copy.deepcopy(env),
                          state_shape=[obs_dim],
                          state_window=1,
                          eval_episodes=10,
                          recorder=Recorder(outdir) if args.record else None,
                          record_episodes=3)
    should_eval = lambda step, episode: step > 0 and step % 10000 == 0
    end_eval = lambda s, e, r: tflogger.plot('eval_reward', np.mean(r), s)

    trainer = Trainer(env=env,
                      agent=agent,
                      render=args.render,
                      state_shape=[obs_dim],
                      state_window=1,
                      final_step=constants.FINAL_STEP,
                      end_episode=end_episode,
                      after_action=after_action,
                      training=not args.demo,
                      evaluator=evaluator,
                      should_eval=should_eval,
                      end_eval=end_eval)

    trainer.start()
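
JsonLogger only appears here through jsonlogger.plot(reward=..., step=..., episode=...). A minimal stand-in with that interface, assumed to append one JSON object per call, might look like this; the project's real logger may format its output differently.

import json


class JsonLogger:
    """Sketch of the JSON logging interface used above (assumed):
    appends one JSON object per plot() call to the given file."""

    def __init__(self, path):
        self.path = path

    def plot(self, **kwargs):
        with open(self.path, 'a') as f:
            f.write(json.dumps(kwargs) + '\n')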
Example #3
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--load', type=str)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1
    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        if isinstance(tmp_env.action_space, gym.spaces.Box):
            num_actions = tmp_env.action_space.shape[0]
        else:
            num_actions = tmp_env.action_space.n
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        reward_preprocess = lambda r: r / 10.0
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    else:
        constants = atari_constants
        num_actions = tmp_env.action_space.n
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]
        def state_preprocess(state):
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0
        reward_preprocess = lambda r: np.clip(r, -1.0, 1.0)
        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda s: np.transpose(s, [1, 2, 0])

    # flag of continuous action space
    continuous = isinstance(tmp_env.action_space, gym.spaces.Box)
    upper_bound = tmp_env.action_space.high if continuous else None

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    model = make_network(
        constants.CONVS, constants.FCS, use_lstm=constants.LSTM,
        padding=constants.PADDING, continuous=continuous)

    # learning rate with decay operation
    if constants.LR_DECAY == 'linear':
        lr = LinearScheduler(constants.LR, constants.FINAL_STEP, 'lr')
        epsilon = LinearScheduler(
            constants.EPSILON, constants.FINAL_STEP, 'epsilon')
    else:
        lr = ConstantScheduler(constants.LR, 'lr')
        epsilon = ConstantScheduler(constants.EPSILON, 'epsilon')

    agent = Agent(
        model,
        num_actions,
        nenvs=constants.ACTORS,
        lr=lr,
        epsilon=epsilon,
        gamma=constants.GAMMA,
        lam=constants.LAM,
        lstm_unit=constants.LSTM_UNIT,
        value_factor=constants.VALUE_FACTOR,
        entropy_factor=constants.ENTROPY_FACTOR,
        time_horizon=constants.TIME_HORIZON,
        batch_size=constants.BATCH_SIZE,
        grad_clip=constants.GRAD_CLIP,
        state_shape=state_shape,
        epoch=constants.EPOCH,
        phi=phi,
        use_lstm=constants.LSTM,
        continuous=continuous,
        upper_bound=upper_bound
    )

    saver = tf.train.Saver()
    if args.load:
        saver.restore(sess, args.load)

    # create environments
    envs = []
    for i in range(constants.ACTORS):
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env, noop_max=30)
            env = MaxAndSkipEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=reward_preprocess,
            s_preprocess=state_preprocess
        ) 
        envs.append(wrapped_env)
    batch_env = BatchEnvWrapper(envs)

    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    logger = TfBoardLogger(summary_writer)
    logger.register('reward', dtype=tf.float32)
    end_episode = lambda r, s, e: logger.plot('reward', r, s)

    def after_action(state, reward, global_step, local_step):
        if global_step % 10 ** 6 == 0:
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=global_step)

    trainer = BatchTrainer(
        env=batch_env,
        agent=agent,
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,
        after_action=after_action,
        end_episode=end_episode,
        training=not args.demo
    )
    trainer.start()
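
The LinearScheduler and ConstantScheduler objects are only constructed in this example, so their interface is not visible here. Assuming a value(step) accessor and the same linear decay formula used in the shared-optimizer examples (#1 and #6), a plain-Python sketch could be the following; it is an assumption, not the project's actual code.

class ConstantScheduler:
    """Sketch (assumed interface): always returns the initial value."""

    def __init__(self, value, name):
        self.initial_value = value
        self.name = name

    def value(self, step):
        return self.initial_value


class LinearScheduler(ConstantScheduler):
    """Sketch (assumed interface): decays linearly to zero at final_step,
    mirroring the decay used in the after_action callbacks of #1 and #6."""

    def __init__(self, value, final_step, name):
        super().__init__(value, name)
        self.final_step = final_step

    def value(self, step):
        decay = max(0.0, 1.0 - float(step) / self.final_step)
        return self.initial_value * decay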
Example #4
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--load', type=str)  # path of a saved checkpoint to restore
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')  # run without training
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1
    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        if isinstance(tmp_env.action_space, gym.spaces.Box):
            # continuous action space: num_actions is the number of action dimensions
            num_actions = tmp_env.action_space.shape[0]
        else:
            # discrete action space: num_actions is the number of selectable actions
            num_actions = tmp_env.action_space.n
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        reward_preprocess = lambda r: r / 10.0
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    else:
        constants = atari_constants
        num_actions = tmp_env.action_space.n
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]
        def state_preprocess(state):
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0
        reward_preprocess = lambda r: np.clip(r, -1.0, 1.0)
        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda s: np.transpose(s, [1, 2, 0])  # state transformation applied before the network input

    # flag of continuous action space
    continuous = isinstance(tmp_env.action_space, gym.spaces.Box)  # gym.spaces.Box indicates a continuous action space
    upper_bound = tmp_env.action_space.high if continuous else None

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    # make_network returns a function that builds the model graph:
    # an MLP for continuous action spaces, a CNN for discrete ones
    model = make_network(
        constants.CONVS, constants.FCS, use_lstm=constants.LSTM,
        padding=constants.PADDING, continuous=continuous)

    # learning rate with decay operation
    if constants.LR_DECAY == 'linear':
        lr = LinearScheduler(constants.LR, constants.FINAL_STEP, 'lr')
        epsilon = LinearScheduler(
            constants.EPSILON, constants.FINAL_STEP, 'epsilon')
    else:
        lr = ConstantScheduler(constants.LR, 'lr')
        epsilon = ConstantScheduler(constants.EPSILON, 'epsilon')

    agent = Agent(
        model,
        num_actions,
        nenvs=constants.ACTORS,
        lr=lr,
        epsilon=epsilon,
        gamma=constants.GAMMA,
        lam=constants.LAM,
        lstm_unit=constants.LSTM_UNIT,
        value_factor=constants.VALUE_FACTOR,
        entropy_factor=constants.ENTROPY_FACTOR,
        time_horizon=constants.TIME_HORIZON,
        batch_size=constants.BATCH_SIZE,
        grad_clip=constants.GRAD_CLIP,
        state_shape=state_shape,
        epoch=constants.EPOCH,
        phi=phi,
        use_lstm=constants.LSTM,
        continuous=continuous,
        upper_bound=upper_bound
    )

    saver = tf.train.Saver(max_to_keep=5)
    if args.load:
        saver.restore(sess, args.load)
    else:
        # only initialize variables when not restoring a checkpoint,
        # so the loaded weights are not overwritten
        sess.run(tf.global_variables_initializer())
    # create environments
    envs = []
    for i in range(constants.ACTORS):  # one environment per actor
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env, noop_max=30)
            env = MaxAndSkipEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=reward_preprocess,
            s_preprocess=state_preprocess
        ) 
        envs.append(wrapped_env)
    batch_env = BatchEnvWrapper(envs)  # batch all actor environments together

    # note: global_variables_initializer() is intentionally not run here,
    # because it would overwrite a restored checkpoint (see above)

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    logger = TfBoardLogger(summary_writer)
    logger.register('reward', dtype=tf.float32)
    end_episode = lambda r, s, e: logger.plot('reward', r, s)  # log the episode reward

    def after_action(state, reward, global_step, local_step):
        # save the model roughly every 10 ** 5 steps; an exact equality check is
        # unreliable because global_step advances by more than one per call, so a
        # small window after each multiple is used instead. Demo mode does not save.
        if global_step % 10 ** 5 <= 10 and not args.demo:
            path = os.path.join(outdir, 'model.ckpt')
            print('model saved, global step:{}'.format(global_step))
            saver.save(sess, path, global_step=global_step)

    trainer = BatchTrainer(
        env=batch_env,
        agent=agent,  # Agent instance
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,  # total time step limit
        after_action=after_action,  # callback invoked after every action
        end_episode=end_episode,
        training=not args.demo  # --demo runs the agent without training
    )
    trainer.start()
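
The annotations in this example point out that global_step does not necessarily land on exact multiples of 10 ** 5, which is why the saving condition uses a small window. An alternative sketch that saves exactly once per interval by remembering the last interval saved; make_after_action and its internals are illustrative names introduced here, not part of the project.

import os


def make_after_action(saver, sess, outdir, interval=10 ** 5):
    """Sketch: build an after_action callback that saves once whenever
    global_step crosses into a new multiple of `interval`."""
    state = {'last_saved': 0}

    def after_action(obs, reward, global_step, local_step):
        current_interval = global_step // interval
        if current_interval > state['last_saved']:
            state['last_saved'] = current_interval
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=global_step)

    return after_action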
Example #5
def main():
    date = datetime.now().strftime("%Y%m%d%H%M%S")
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongDeterministic-v4')
    parser.add_argument('--outdir', type=str, default=date)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--record', action='store_true')
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.outdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env = gym.make(args.env)

    # box environment
    if len(env.observation_space.shape) == 1:
        constants = box_constants
        actions = range(env.action_space.n)
        state_shape = [env.observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda state: state
        # (window_size, dim) -> (dim, window_size)
        phi = lambda state: np.transpose(state, [1, 0])
    # atari environment
    else:
        constants = atari_constants
        actions = get_action_space(args.env)
        state_shape = [84, 84, constants.STATE_WINDOW]

        def state_preprocess(state):
            # grayscale, resize, then crop the 84x84 play area
            # (note: cv2.resize takes (width, height))
            state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
            state = cv2.resize(state, (210, 160))
            state = cv2.resize(state, (84, 110))
            state = state[18:102, :]
            return np.array(state, dtype=np.float32) / 255.0

        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda state: np.transpose(state, [1, 2, 0])

    # save constant variables
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    # exploration
    if constants.EXPLORATION_TYPE == 'linear':
        duration = constants.EXPLORATION_DURATION
        explorer = LinearDecayExplorer(final_exploration_step=duration)
    else:
        explorer = ConstantExplorer(constants.EXPLORATION_EPSILON)

    # optimizer
    if constants.OPTIMIZER == 'adam':
        optimizer = tf.train.AdamOptimizer(constants.LR)
    else:
        optimizer = tf.train.RMSPropOptimizer(learning_rate=constants.LR,
                                              momentum=constants.MOMENTUM,
                                              epsilon=constants.EPSILON)

    # wrap gym environment
    env = EnvWrapper(env,
                     s_preprocess=state_preprocess,
                     r_preprocess=lambda r: np.clip(r, -1, 1))

    replay_buffer = ReplayBuffer(constants.REPLAY_BUFFER_SIZE)

    sess = tf.Session()
    sess.__enter__()

    model = make_cnn(convs=constants.CONVS, hiddens=constants.FCS)

    agent = Agent(model,
                  actions,
                  state_shape,
                  replay_buffer,
                  explorer,
                  optimizer,
                  gamma=constants.GAMMA,
                  grad_norm_clipping=constants.GRAD_CLIPPING,
                  phi=phi,
                  learning_starts=constants.LEARNING_START_STEP,
                  batch_size=constants.BATCH_SIZE,
                  train_freq=constants.UPDATE_INTERVAL,
                  target_network_update_freq=constants.TARGET_UPDATE_INTERVAL)

    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    if args.load is not None:
        saver.restore(sess, args.load)

    train_writer = tf.summary.FileWriter(logdir, sess.graph)
    tflogger = TfBoardLogger(train_writer)
    tflogger.register('reward', dtype=tf.float32)
    tflogger.register('eval_reward', dtype=tf.float32)
    jsonlogger = JsonLogger(os.path.join(outdir, 'reward.json'))

    # callback on the end of episode
    def end_episode(reward, step, episode):
        tflogger.plot('reward', reward, step)
        jsonlogger.plot(reward=reward, step=step, episode=episode)

    def after_action(state, reward, global_step, local_step):
        if global_step > 0 and global_step % constants.MODEL_SAVE_INTERVAL == 0:
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=global_step)

    evaluator = Evaluator(env=copy.deepcopy(env),
                          state_shape=state_shape[:-1],
                          state_window=constants.STATE_WINDOW,
                          eval_episodes=constants.EVAL_EPISODES,
                          recorder=Recorder(outdir) if args.record else None,
                          record_episodes=constants.RECORD_EPISODES)
    should_eval = lambda step, episode: step > 0 and step % constants.EVAL_INTERVAL == 0
    end_eval = lambda s, e, r: tflogger.plot('eval_reward', np.mean(r), s)

    trainer = Trainer(env=env,
                      agent=agent,
                      render=args.render,
                      state_shape=state_shape[:-1],
                      state_window=constants.STATE_WINDOW,
                      final_step=constants.FINAL_STEP,
                      after_action=after_action,
                      end_episode=end_episode,
                      training=not args.demo,
                      evaluator=evaluator,
                      should_eval=should_eval,
                      end_eval=end_eval)
    trainer.start()
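
Examples #1, #3, and #4 call atari_preprocess(state, constants.STATE_SHAPE) instead of spelling the steps out. Below is only a sketch consistent with the inline preprocessing shown in this example and in #7 (grayscale followed by a resize); the project's actual helper may crop or scale differently.

import cv2


def atari_preprocess(state, state_shape):
    # assumed helper: grayscale the RGB frame, then resize to the configured
    # frame size, e.g. (84, 84); the real implementation may differ
    state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
    return cv2.resize(state, tuple(state_shape))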
Example #6
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongDeterministic-v4')
    parser.add_argument('--threads', type=int, default=8)
    parser.add_argument('--load', type=str)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--record', action='store_true')
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    # box environment
    if len(tmp_env.observation_space.shape) == 1:
        observation_space = tmp_env.observation_space
        constants = box_constants
        actions = range(tmp_env.action_space.n)
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    # atari environment
    else:
        constants = atari_constants
        actions = range(tmp_env.action_space.n)
        state_shape = constants.STATE_SHAPE + [3]
        def state_preprocess(state):
            # atari specific preprocessing
            state = cv2.resize(state, tuple(constants.STATE_SHAPE))
            state = np.array(state, dtype=np.float32)
            return state / 255.0
        phi = lambda s: s[0]  # take a single RGB frame from the state window

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    # share a single optimizer instance across all worker threads
    lr = tf.Variable(constants.LR)
    decayed_lr = tf.placeholder(tf.float32)
    decay_lr_op = lr.assign(decayed_lr)
    if constants.OPTIMIZER == 'rmsprop':
        optimizer = tf.train.RMSPropOptimizer(lr, decay=0.99, epsilon=0.1)
    else:
        optimizer = tf.train.AdamOptimizer(lr)

    master = make_agent(
        actions, optimizer, state_shape, phi, 'global', constants)

    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
    saver = tf.train.Saver(global_vars)
    if args.load:
        saver.restore(sess, args.load)

    agents = []
    envs = []
    for i in range(args.threads):
        name = 'worker{}'.format(i)
        agent = make_agent(
            actions, optimizer, state_shape, phi, name, constants)
        agents.append(agent)
        env = gym.make(args.env)
        env.seed(i)
        env = NoopResetEnv(env)
        env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=lambda r: np.clip(r, -1, 1),
            s_preprocess=state_preprocess
        )
        envs.append(wrapped_env)

    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    tflogger = TfBoardLogger(summary_writer)
    tflogger.register('reward', dtype=tf.float32)
    tflogger.register('eval_reward', dtype=tf.float32)
    end_episode = lambda r, gs, s, ge, e: tflogger.plot('reward', r, gs)

    def after_action(state, reward, shared_step, global_step, local_step):
        if constants.LR_DECAY == 'linear':
            decay = 1.0 - (float(shared_step) / constants.FINAL_STEP)
            if decay < 0.0:
                decay = 0.0
            sess.run(decay_lr_op, feed_dict={decayed_lr: constants.LR * decay})
        if shared_step % 10 ** 6 == 0:
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=shared_step)

    trainer = AsyncTrainer(
        envs=envs,
        agents=agents,
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,
        after_action=after_action,
        end_episode=end_episode,
        training=not args.demo,
        n_threads=args.threads
    )
    trainer.start()
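
TfBoardLogger is used in every example through register(name, dtype=...) and plot(name, value, step). A minimal TF1-style sketch with that interface, writing plain scalar summaries, is shown below; this is an assumption, and the real class presumably builds summary ops from the registered dtype.

import tensorflow as tf


class TfBoardLogger:
    """Sketch of the TensorBoard logging interface used above (assumed):
    register() declares a scalar tag, plot() writes it at a given step."""

    def __init__(self, writer):
        self.writer = writer
        self.tags = set()

    def register(self, name, dtype=tf.float32):
        # the dtype argument is kept only for interface compatibility
        self.tags.add(name)

    def plot(self, name, value, step):
        assert name in self.tags, 'unregistered tag: {}'.format(name)
        summary = tf.Summary(
            value=[tf.Summary.Value(tag=name, simple_value=float(value))])
        self.writer.add_summary(summary, step)
        self.writer.flush()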
Example #7
def main():
    date = datetime.now().strftime("%Y%m%d%H%M%S")
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--outdir', type=str, default=date)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--eval-render', action='store_true')
    parser.add_argument('--record', action='store_true')
    parser.add_argument('--demo', action='store_true')
    args = parser.parse_args()

    # learned model path settings
    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.outdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    # log path settings
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env = gym.make(args.env)

    # box environment
    if len(env.observation_space.shape) == 1:
        constants = box_constants
        actions = range(env.action_space.n)
        state_shape = [env.observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda state: state
        # (window_size, dim) -> (dim, window_size)
        phi = lambda state: np.transpose(state, [1, 0])
    # atari environment
    else:
        constants = atari_constants
        actions = get_action_space(args.env)
        state_shape = [84, 84, constants.STATE_WINDOW]
        def state_preprocess(state):
            state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
            state = cv2.resize(state, (84, 84))
            return np.array(state, dtype=np.float32) / 255.0
        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda state: np.transpose(state, [1, 2, 0])

    # save constant variables
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    # exploration
    if constants.EXPLORATION_TYPE == 'linear':
        duration = constants.EXPLORATION_DURATION
        explorer = LinearDecayExplorer(final_exploration_step=duration)
    else:
        explorer = ConstantExplorer(constants.EXPLORATION_EPSILON)

    # wrap gym environment
    env = EnvWrapper(
        env,
        s_preprocess=state_preprocess,
        r_preprocess=lambda r: np.clip(r, -1, 1)
    )

    # create encoder network
    network = make_network(
        constants.CONVS,
        constants.FCS,
        constants.DND_KEY_SIZE
    )

    replay_buffer = NECReplayBuffer(constants.REPLAY_BUFFER_SIZE)

    sess = tf.Session()
    sess.__enter__()

    # create DNDs
    dnds = []
    for i in range(len(actions)):
        dnd = DND(
            constants.DND_KEY_SIZE,
            constants.DND_CAPACITY,
            constants.DND_P,
            device=constants.DEVICES[i],
            scope='dnd{}'.format(i)
        )
        dnd._init_vars()
        dnds.append(dnd)

    # create NEC agent
    agent = Agent(
        network,
        dnds,
        actions,
        state_shape,
        replay_buffer,
        explorer,
        constants,
        phi=phi,
        run_options=run_options,
        run_metadata=run_metadata
    )

    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    if args.load is not None:
        saver.restore(sess, args.load)

    # tensorboard logger
    train_writer = tf.summary.FileWriter(logdir, sess.graph)
    tflogger = TfBoardLogger(train_writer)
    tflogger.register('reward', dtype=tf.float32)
    tflogger.register('eval_reward', dtype=tf.float32)
    # json logger
    trainlogger = JsonLogger(os.path.join(outdir, 'train.json'))
    evallogger = JsonLogger(os.path.join(outdir, 'evaluation.json'))

    # callback on the end of episode
    def end_episode(reward, step, episode):
        tflogger.plot('reward', reward, step)
        trainlogger.plot(reward=reward, step=step, episode=episode)

    evaluator = Evaluator(
        env=copy.deepcopy(env),
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        eval_episodes=constants.EVAL_EPISODES,
        recorder=Recorder(outdir) if args.record else None,
        record_episodes=constants.RECORD_EPISODES,
        render=args.eval_render
    )
    def should_eval(step, episode):
        return step > 0 and step % constants.EVAL_INTERVAL == 0
    def end_eval(step, episode, rewards):
        mean_rewards = np.mean(rewards)
        tflogger.plot('eval_reward', mean_rewards, step)
        evallogger.plot(reward=mean_rewards, step=step, episode=episode)

    trainer = Trainer(
        env=env,
        agent=agent,
        render=args.render,
        state_shape=state_shape[:-1],  # drop the frame-stack dimension
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,
        end_episode=end_episode,
        training=not args.demo,
        evaluator=evaluator,
        should_eval=should_eval,
        end_eval=end_eval
    )
    trainer.start()
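
Finally, every example persists its hyperparameters with dump_constants, and Example #2 optionally reads them back with restore_constants. The pair below is a rough sketch only, assuming the constants objects are modules whose uppercase attributes are the settings; the project's actual helpers may serialize things differently.

import json
import types


def dump_constants(constants, path):
    """Sketch (assumed): serialize the uppercase attributes of a constants
    module/object to a JSON file."""
    values = {k: getattr(constants, k) for k in dir(constants) if k.isupper()}
    with open(path, 'w') as f:
        json.dump(values, f, indent=2)


def restore_constants(path):
    """Sketch (assumed): load the JSON back into an attribute-style object."""
    with open(path, 'r') as f:
        values = json.load(f)
    return types.SimpleNamespace(**values)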