Пример #1
0
def create_doom(env_id,
                client_id,
                envWrap=True,
                record=False,
                outdir=None,
                noLifeReward=False,
                acRepeat=0,
                **_):
    """Create a wrapped ppaquette Doom environment selected by keywords in ``env_id``.

    Args:
        env_id: Free-form name; it is matched case-insensitively against the
            keywords 'labyrinth', 'single', 'fix', 'very', 'sparse' and 'new'
            to pick one of the registered 'ppaquette/...-v0' ids.
        client_id: Worker index (anything ``int()`` accepts). The call sleeps
            ``client_id * 10`` seconds before creating the environment.
        envWrap: When truthy, seed the env and add the BufferedObsEnv and
            SkipEnv wrappers with a (42, 42) shape.
        record: When truthy and ``outdir`` is not None, wrap the env in
            ``gym.wrappers.Monitor`` writing to ``outdir``.
        outdir: Directory handed to the Monitor wrapper.
        noLifeReward: When truthy, add ``env_wrapper.NoNegativeRewardEnv``.
        acRepeat: Frame-skip value; any value that is not > 0 falls back to 4.
        **_: Extra keyword arguments are accepted and ignored.

    Returns:
        The environment after the Vectorize -> DiagnosticsInfo -> Unvectorize
        chain has been applied.
    """
    from ppaquette_gym_doom import wrappers

    def _registered_id(raw):
        # Map the caller's loose name onto a registered id; first match wins.
        key = raw.lower()
        if 'labyrinth' in key:
            if 'single' in key:
                return 'ppaquette/LabyrinthSingle-v0'
            if 'fix' in key:
                return 'ppaquette/LabyrinthManyFixed-v0'
            return 'ppaquette/LabyrinthMany-v0'
        if 'very' in key:
            return 'ppaquette/DoomMyWayHomeFixed15-v0'
        if 'sparse' in key:
            return 'ppaquette/DoomMyWayHomeFixed-v0'
        if 'fix' in key:
            if '1' in raw or '2' in raw:
                # The last two characters of the name become the id suffix.
                return 'ppaquette/DoomMyWayHomeFixed' + raw[-2:] + '-v0'
            if 'new' in key:
                return 'ppaquette/DoomMyWayHomeFixedNew-v0'
            return 'ppaquette/DoomMyWayHomeFixed-v0'
        return 'ppaquette/DoomMyWayHome-v0'

    env_id = _registered_id(env_id)

    # VizDoom workaround: launching several vizdoom processes at the same
    # time makes the program hang, so each client waits its own delay first.
    client_id = int(client_id)
    time.sleep(client_id * 10)
    env = gym.make(env_id)

    # Build the three wrapper factories, then apply them innermost-first:
    # action space, then resolution, then playing mode.
    set_mode = wrappers.SetPlayingMode('algo')
    set_resolution = wrappers.SetResolution('160x120')
    to_discrete = wrappers.ToDiscrete('minimal')
    env = to_discrete(env)
    env = set_resolution(env)
    env = set_mode(env)

    if record and outdir is not None:
        env = gym.wrappers.Monitor(env, outdir, force=True)

    if envWrap:
        if acRepeat > 0:
            repeat = acRepeat
        else:
            repeat = 4
        env.seed(None)
    # The reward filter is applied whether or not envWrap is set; when both
    # are set it sits between the seeding and the observation buffer.
    if noLifeReward:
        env = env_wrapper.NoNegativeRewardEnv(env)
    if envWrap:
        env = env_wrapper.BufferedObsEnv(env, skip=repeat, shape=(42, 42))
        env = env_wrapper.SkipEnv(env, skip=repeat)

    return Unvectorize(DiagnosticsInfo(Vectorize(env)))
Пример #2
0
def create_mario(env_id,
                 client_id,
                 envWrap=True,
                 record=False,
                 outdir=None,
                 noLifeReward=False,
                 acRepeat=0,
                 **_):
    """Create a wrapped ppaquette Super Mario environment.

    Args:
        env_id: Level name. If it contains '-v' (case-insensitive) it is
            prefixed with 'ppaquette/'; otherwise
            'ppaquette/SuperMarioBros-1-3-v0' is used.
        client_id: Worker index (anything ``int()`` accepts). The call sleeps
            ``client_id * 50`` seconds before creating the environment.
        envWrap: When truthy, seed the env and add BufferedObsEnv (and
            SkipEnv when the frame skip is above 1) with a (42, 42) shape.
        record: When truthy and ``outdir`` is not None, wrap the env in
            ``gym.wrappers.Monitor`` writing to ``outdir``.
        outdir: Directory handed to the Monitor wrapper.
        noLifeReward: When truthy, add ``env_wrapper.NoNegativeRewardEnv``.
        acRepeat: Frame-skip value; any value that is not > 0 falls back
            to 4. Ids containing "1-1" always use 6.
        **_: Extra keyword arguments are accepted and ignored.

    Returns:
        The environment after the Vectorize -> DiagnosticsInfo -> Unvectorize
        chain has been applied.
    """
    from ppaquette_gym_super_mario import wrappers

    # Observation shape noted by the original author for the default
    # level: (224, 256, 3) = (h, w, c).
    env_id = ('ppaquette/' + env_id
              if '-v' in env_id.lower()
              else 'ppaquette/SuperMarioBros-1-3-v0')

    # Start-up workaround: launching several instances at the same time
    # makes the program hang, so each client waits its own delay first.
    # see: https://github.com/ppaquette/gym-super-mario/tree/master/ppaquette_gym_super_mario
    client_id = int(client_id)
    time.sleep(client_id * 50)
    env = gym.make(env_id)

    # Build both wrapper factories, then apply action space before mode.
    set_mode = wrappers.SetPlayingMode('algo')
    to_discrete = wrappers.ToDiscrete()
    env = to_discrete(env)
    env = set_mode(env)
    env = env_wrapper.MarioEnv(env, tilesEnv=False)

    if record and outdir is not None:
        env = gym.wrappers.Monitor(env, outdir, force=True)

    if envWrap:
        if acRepeat > 0:
            repeat = acRepeat
        else:
            repeat = 4
        if "1-1" in env_id:
            # Level ids containing "1-1" use 6 regardless of acRepeat.
            repeat = 6
        env.seed(None)
    # The reward filter is applied whether or not envWrap is set; when both
    # are set it sits between the seeding and the observation buffer.
    if noLifeReward:
        env = env_wrapper.NoNegativeRewardEnv(env)
    if envWrap:
        env = env_wrapper.BufferedObsEnv(env,
                                         skip=repeat,
                                         shape=(42, 42),
                                         maxFrames=False)
        if repeat > 1:
            env = env_wrapper.SkipEnv(env, skip=repeat)

    # The environment is returned open; closing it is left to the caller.
    return Unvectorize(DiagnosticsInfo(Vectorize(env)))
Пример #3
0
def create_doom(record=False, outdir=None):
    """Create the 'ppaquette/DoomMyWayHome-v0' environment with fixed wrappers.

    The env is put in 'algo' playing mode at 160x120 resolution with the
    'minimal' discrete action set, then seeded and wrapped in
    NoNegativeRewardEnv and BufferedObsEnv (skip=1, shape (42, 42)).

    Args:
        record: When truthy and ``outdir`` is given, wrap the env in
            ``gym.wrappers.Monitor`` so episodes are written to ``outdir``.
        outdir: Directory handed to the Monitor wrapper. Recording is skipped
            when this is None, matching the other environment factories,
            instead of passing None to Monitor as its directory.

    Returns:
        The wrapped environment.
    """
    from ppaquette_gym_doom import wrappers
    import env_wrapper
    env = gym.make('ppaquette/DoomMyWayHome-v0')
    modewrapper = wrappers.SetPlayingMode('algo')
    obwrapper = wrappers.SetResolution('160x120')
    acwrapper = wrappers.ToDiscrete('minimal')
    # Applied innermost-first: action space, then resolution, then mode.
    env = modewrapper(obwrapper(acwrapper(env)))

    # Only record when there is a directory to record into.
    if record and outdir is not None:
        env = gym.wrappers.Monitor(env, outdir, force=True)

    env.seed(None)
    env = env_wrapper.NoNegativeRewardEnv(env)
    env = env_wrapper.BufferedObsEnv(env, skip=1, shape=(42, 42))
    return env
Пример #4
0
# Script: build a Super Mario 1-1 environment and prepare the output
# directory for the frame-dumping loop that follows.
outputdir = './gray42/'
env_id = 'ppaquette/SuperMarioBros-1-1-v0'
env = gym.make(env_id)
modewrapper = wrappers.SetPlayingMode('algo')
acwrapper = wrappers.ToDiscrete()
# Applied innermost-first: discrete action space, then playing mode.
env = modewrapper(acwrapper(env))
env = env_wrapper.MarioEnv(env)

# Both names refer to the same (42, 42) tuple; freshape feeds DQNObsEnv and
# fshape feeds BufferedObsEnv.
freshape = fshape = (42, 42)
env.seed(None)
env = env_wrapper.NoNegativeRewardEnv(env)
env = env_wrapper.DQNObsEnv(env, shape=freshape)
env = env_wrapper.BufferedObsEnv(env,
                                 n=4,
                                 skip=1,
                                 shape=fshape,
                                 channel_last=True)
# NOTE(review): the loop below multiplies observations by 255 before saving,
# so this wrapper presumably scales them into [0, 1] - confirm in env_wrapper.
env = env_wrapper.EltwiseScaleObsEnv(env)

start = time.time()
episodes = 0
maxepisodes = 1
env.reset()
imCount = 1
# Create the directory for the first episode's frames (ep_01).
utils.mkdir_p(outputdir + '/ep_%02d/' % (episodes + 1))
while (1):
    obs, reward, done, info = env.step(env.action_space.sample())
    print(outputdir)
    Image.fromarray(
        (255 * obs).astype('uint8')).save(outputdir + '/ep_%02d/%06d.jpg' %
Пример #5
0
def train():
    """Run the curiosity-driven training loop and save a checkpoint at the end.

    Everything except the loop counters comes from names defined outside
    this function: MAX_EPISODES, MAX_EP_STEPS, N_trials, N_vals, ITA,
    MEMORY_CAPACITY, BATCH_SIZE, VAR_MIN, TIME_STEP, frame_skip, fshape,
    env_id, MODE, n_model, path, the models (actor, critic, generator,
    discriminator, LSTM_unit), the replay memory M, and sess / saver.

    For each episode a fresh gym environment is created and stepped for at
    most MAX_EP_STEPS steps. Each transition is stored in M, and once
    M.pointer exceeds MEMORY_CAPACITY every step also trains the generator,
    discriminator, critic and actor on a sampled batch. When the episode
    index equals N_vals[pointer], the episode's metrics are written to row 0
    of the J_* arrays.

    NOTE(review): J_D_loss and J_G_loss are locals that are never returned,
    while J_Curious and J_r are not assigned here and so must exist in an
    outer scope - confirm which arrays are meant to persist.
    """
    var = 2.
    pointer = 0
    # timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
    # if timestep_limit is None: timestep_limit = env.spec.timestep_limit
    J_D_loss = np.zeros((N_trials, len(N_vals)))
    J_G_loss = np.zeros((N_trials, len(N_vals)))
    for ep in range(MAX_EPISODES):

        # Per-episode accumulators.
        ep_reward = 0
        ep_curious_reward = 0
        D_loss = 0
        G_loss = 0
        l_2_loss = 0
        # if M.pointer > MEMORY_CAPACITY:
        # for t in range(ITER_D_Training):
        #     b_s, b_a, b_r, b_curious_r, b_s_ = M.sample(BATCH_SIZE)
        #     generator.learn(b_s, b_a)
        #     b_g = generator.predict_batch(b_s, b_a)
        #     discriminator.learn(b_g, b_s_, b_s, b_a)
        #     if t%10 ==0:
        #         one_step_D_loss = discriminator.eval(b_g, b_s_, b_s, b_a)
        #         one_step_G_loss = generator.eval(b_s, b_a)
        #         print('Ep:', ep,
        #               '|D Training Step:%i'% int(t),
        #               '| D loss: %f' % float(one_step_D_loss),
        #               '| G loss: %f' % float(one_step_G_loss),
        #               )
        # # for t in range(ITER_G_Training):
        #     b_s, b_a, b_r, b_curious_r, b_s_ = M.sample(BATCH_SIZE)
        #
        #     if t % 10 == 0:
        #         one_step_G_loss = generator.eval(b_s, b_a)
        #         print('Ep:', ep,
        #               '|G Training Step:%i' % int(t),
        #               '| G loss: %f' % float(one_step_G_loss),
        #               )

        # A new environment (and a fresh initial LSTM state) is created for
        # every episode; it is closed when the episode ends below.
        lstm_state = LSTM_unit.get_initial_state()
        env = gym.make(env_id)
        env = env_wrapper.BufferedObsEnv(env,
                                         n=TIME_STEP,
                                         skip=frame_skip,
                                         shape=fshape,
                                         channel_last=False)
        s = env.reset()
        # Append a trailing axis of size 1 to the observation.
        s = np.expand_dims(s, -1)
        for t in range(MAX_EP_STEPS):
            a = actor.choose_action(s, lstm_state, var)
            g = generator.predict(s, lstm_state, a)
            # NOTE(review): _step is unpacked into five values here, so the
            # wrapped env must return a 5-tuple - confirm against env_wrapper.
            s_, r, done, info, _ = env._step(a)
            s_ = np.expand_dims(s_, -1)
            # Curiosity reward: first element of the discriminator's output,
            # scaled by ITA.
            curious_r = ITA * discriminator.determine(s, lstm_state, a, g)[0]
            new_lstm_state = LSTM_unit.get_state(s, lstm_state)
            one_step_l_2_loss = discriminator.observe_and_compare(
                s_,
                g,
            )
            l_2_loss += one_step_l_2_loss
            M.store_transition(s, lstm_state, a, r, curious_r, s_,
                               new_lstm_state)
            # Learning only starts once the memory pointer has passed
            # MEMORY_CAPACITY.
            if M.pointer > MEMORY_CAPACITY:

                # for i in range(ITER_train_G):
                #     b_s, b_a, b_r, b_curious_r, b_s_ = M.sample(BATCH_SIZE)
                #     generator.learn(b_s, b_a)
                b_s, b_lstm_s, b_a, b_r, b_curious_r, b_s_, b_lstm_s_ = M.sample(
                    BATCH_SIZE)

                generator.learn(b_s, b_lstm_s, b_a)
                b_g = generator.predict_batch(b_s, b_lstm_s, b_a)
                discriminator.learn(b_g, b_s_, b_s, b_lstm_s, b_a)
                # Learn the minibatch
                # b_curious_r = discriminator.determine_batch(b_s, b_lstm_s, b_a, b_g)
                # The critic is trained on the stored curiosity reward; the
                # sampled extrinsic reward b_r is not used in any update.
                critic.learn(b_s, b_lstm_s, b_a, b_curious_r, b_s_, b_lstm_s_)
                actor.learn(b_s, b_lstm_s)
                one_step_D_loss = discriminator.eval(b_g, b_s_, b_s, b_lstm_s,
                                                     b_a)
                D_loss += one_step_D_loss
                one_step_G_loss = generator.eval(b_s, b_lstm_s, b_a)
                G_loss += one_step_G_loss
                if t % 10 == 0:
                    print(
                        'Ep:',
                        ep,
                        '|Step:%i' % int(t),
                        '| Curious_R: %f' % float(curious_r),
                        '| Prediction_error: %f' % float(one_step_l_2_loss),
                        '| D loss: %f' % float(one_step_D_loss),
                        '| G loss: %f' % float(one_step_G_loss),
                    )
            s = s_
            lstm_state = new_lstm_state
            ep_reward += r
            ep_curious_reward += curious_r

            # Episode ends on the last allowed step, when the env reports
            # done, or when info['life'] is 0 or info['time'] is at most 1.
            if t == MAX_EP_STEPS - 1 or done or info[
                    'life'] == 0 or info['time'] <= 1:
                # if done:
                # Convert the zero-based step index into a step count so the
                # averages below divide by the number of steps taken.
                t = t + 1
                result = '| done' if done else '| ----'
                print(
                    'Ep:',
                    ep,
                    result,
                    '| R: %i' % int(ep_reward),
                    '| Curious_R: %f' % float(ep_curious_reward),
                    '| D_loss: %f' % float(D_loss / t),
                    '| G_loss: %f' % float(G_loss / t),
                    '| Prediction_error: %f' % float(l_2_loss / t),
                    '| Explore: %.2f' % var,
                )
                env.close()
                # Decay the exploration value once per episode, floored at
                # VAR_MIN.
                var = max([var * .9999, VAR_MIN])
                break

        # Record metrics for the episodes listed in N_vals; t here is the
        # step count set just before the break above.
        # NOTE(review): t is unbound if MAX_EP_STEPS is 0 - confirm it is
        # always positive.
        if ep == N_vals[pointer]:

            # evaluate the minibatch
            J_D_loss[0, pointer] = D_loss / t
            J_G_loss[0, pointer] = G_loss / t
            J_Curious[0, pointer] = ep_curious_reward
            J_r[0, pointer] = ep_reward
            # Stop advancing at the last entry so N_vals[pointer] stays valid.
            if pointer < N_vals.__len__() - 1:
                pointer += 1

    # Replace any existing directory at `path` with an empty one, then save.
    # NOTE(review): the checkpoint is written under './' + MODE[n_model],
    # which is presumably the same directory as `path` - confirm.
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.mkdir(path)
    ckpt_path = os.path.join('./' + MODE[n_model], 'Curious_GAN.ckpt')
    save_path = saver.save(sess, ckpt_path, write_meta_graph=False)
    print("\nSave Model %s\n" % save_path)
Пример #6
0
    # Training configuration constants.
    MEMORY_CAPACITY = 5000
    BATCH_SIZE = 32
    VAR_MIN = 1
    RENDER = False
    LOAD = False
    MODE = ['easy', 'hard']
    # Index into MODE.
    n_model = 1
    ITA = 1  # Curious coefficient

    # Use acRepeat as the frame skip when it is positive, otherwise 4.
    frame_skip = acRepeat if acRepeat > 0 else 4
    # NOTE(review): the lock is handed to env.configure, presumably to
    # serialise emulator start-up across processes - confirm against the
    # environment package.
    lock = multiprocessing.Lock()
    env = gym.make(env_id)
    env.configure(lock=lock)
    env = env_wrapper.BufferedObsEnv(env,
                                     n=TIME_STEP,
                                     skip=frame_skip,
                                     shape=fshape,
                                     channel_last=False)

    STATE_DIM = env.observation_space.shape
    ACTION_DIM = env.action_space.shape

    sess = tf.Session()
    # Placeholder of shape [None, TIME_STEP, *fshape, 1].
    with tf.name_scope("S"):
        S = tf.placeholder(tf.float32,
                           shape=[None, TIME_STEP, *fshape, 1],
                           name="s")
    # Same layout as S but with a fixed length of 1 on the second axis.
    with tf.name_scope("single_S_"):
        single_S_ = tf.placeholder(tf.float32,
                                   shape=[None, 1, *fshape, 1],
                                   name="single_s_")
Пример #7
0
def create_ple_env(env_id, record=False, outdir=None, **_):
    """Create ``env_id`` with BufferedObsEnv and SkipEnv wrappers (skip of 4).

    Args:
        env_id: Id passed straight to ``gym.make``.
        record: Accepted but not used by this function.
        outdir: Accepted but not used by this function.
        **_: Extra keyword arguments are accepted and ignored.

    Returns:
        The environment wrapped in BufferedObsEnv (shape (42, 42)) and then
        SkipEnv.
    """
    base_env = gym.make(env_id)
    buffered_env = env_wrapper.BufferedObsEnv(base_env,
                                              skip=4,
                                              shape=(42, 42))
    return env_wrapper.SkipEnv(buffered_env, skip=4)
Пример #8
0
def create_mario(env_id,
                 client_id,
                 envWrap=True,
                 record=False,
                 outdir=None,
                 noLifeReward=False,
                 acRepeat=0,
                 **_):
    """Create a wrapped ppaquette Super Mario environment (level 1-1 by default).

    Args:
        env_id: Level name. If it contains '-v' (case-insensitive) it is
            prefixed with 'ppaquette/'; otherwise
            'ppaquette/SuperMarioBros-1-1-v0' is used.
        client_id: Worker index (anything ``int()`` accepts). The call sleeps
            ``client_id * 50`` seconds before creating the environment.
        envWrap: When truthy, seed the env and add BufferedObsEnv (and
            SkipEnv when the frame skip is above 1) with a (42, 42) shape.
        record: When truthy and ``outdir`` is not None, wrap the env in
            ``gym.wrappers.Monitor`` writing to ``outdir``.
        outdir: Directory handed to the Monitor wrapper.
        noLifeReward: When truthy, add ``env_wrapper.NoNegativeRewardEnv``.
        acRepeat: Frame-skip value; any value that is not > 0 falls back to 6.
        **_: Extra keyword arguments are accepted and ignored.

    Returns:
        The environment after the Vectorize -> DiagnosticsInfo -> Unvectorize
        chain has been applied.
    """
    from ppaquette_gym_super_mario import wrappers

    # Observation shape noted by the original author for the default
    # level: (224, 256, 3) = (h, w, c).
    env_id = ('ppaquette/' + env_id
              if '-v' in env_id.lower()
              else 'ppaquette/SuperMarioBros-1-1-v0')

    # Start-up workaround: launching several instances at the same time
    # makes the program hang, so each client waits its own delay first.
    # see: https://github.com/ppaquette/gym-super-mario/tree/master/ppaquette_gym_super_mario
    client_id = int(client_id)
    time.sleep(client_id * 50)
    env = gym.make(env_id)

    # Build both wrapper factories, then apply action space before mode.
    set_mode = wrappers.SetPlayingMode('algo')
    to_discrete = wrappers.ToDiscrete()
    env = to_discrete(env)
    env = set_mode(env)
    env = env_wrapper.MarioEnv(env)

    if record and outdir is not None:
        env = gym.wrappers.Monitor(env, outdir, force=True)

    if envWrap:
        if acRepeat > 0:
            repeat = acRepeat
        else:
            repeat = 6
        env.seed(None)
    # The reward filter is applied whether or not envWrap is set; when both
    # are set it sits between the seeding and the observation buffer.
    if noLifeReward:
        env = env_wrapper.NoNegativeRewardEnv(env)
    if envWrap:
        env = env_wrapper.BufferedObsEnv(env,
                                         skip=repeat,
                                         shape=(42, 42),
                                         maxFrames=False)
        if repeat > 1:
            env = env_wrapper.SkipEnv(env, skip=repeat)

    # The environment is returned open; closing it is left to the caller.
    return Unvectorize(DiagnosticsInfo(Vectorize(env)))


# def DiagnosticsInfo(env, *args, **kwargs):
#     return vectorized.VectorizeFilter(env, DiagnosticsInfoI, *args, **kwargs)

# class DiagnosticsInfoI(vectorized.Filter):
#     def __init__(self, log_interval=503):
#         super(DiagnosticsInfoI, self).__init__()

#         self._episode_time = time.time()
#         self._last_time = time.time()
#         self._local_t = 0
#         self._log_interval = log_interval
#         self._episode_reward = 0
#         self._episode_length = 0
#         self._all_rewards = []
#         self._num_vnc_updates = 0
#         self._last_episode_id = -1

#     def _after_reset(self, observation):
#         logger.info('Resetting environment logs')
#         self._episode_reward = 0
#         self._episode_length = 0
#         self._all_rewards = []
#         return observation

#     def _after_step(self, observation, reward, done, info):
#         to_log = {}
#         if self._episode_length == 0:
#             self._episode_time = time.time()

#         self._local_t += 1
#         if info.get("stats.vnc.updates.n") is not None:
#             self._num_vnc_updates += info.get("stats.vnc.updates.n")

#         if self._local_t % self._log_interval == 0:
#             cur_time = time.time()
#             elapsed = cur_time - self._last_time
#             fps = self._log_interval / elapsed
#             self._last_time = cur_time
#             cur_episode_id = info.get('vectorized.episode_id', 0)
#             to_log["diagnostics/fps"] = fps
#             if self._last_episode_id == cur_episode_id:
#                 to_log["diagnostics/fps_within_episode"] = fps
#             self._last_episode_id = cur_episode_id
#             if info.get("stats.gauges.diagnostics.lag.action") is not None:
#                 to_log["diagnostics/action_lag_lb"] = info["stats.gauges.diagnostics.lag.action"][0]
#                 to_log["diagnostics/action_lag_ub"] = info["stats.gauges.diagnostics.lag.action"][1]
#             if info.get("reward.count") is not None:
#                 to_log["diagnostics/reward_count"] = info["reward.count"]
#             if info.get("stats.gauges.diagnostics.clock_skew") is not None:
#                 to_log["diagnostics/clock_skew_lb"] = info["stats.gauges.diagnostics.clock_skew"][0]
#                 to_log["diagnostics/clock_skew_ub"] = info["stats.gauges.diagnostics.clock_skew"][1]
#             if info.get("stats.gauges.diagnostics.lag.observation") is not None:
#                 to_log["diagnostics/observation_lag_lb"] = info["stats.gauges.diagnostics.lag.observation"][0]
#                 to_log["diagnostics/observation_lag_ub"] = info["stats.gauges.diagnostics.lag.observation"][1]

#             if info.get("stats.vnc.updates.n") is not None:
#                 to_log["diagnostics/vnc_updates_n"] = info["stats.vnc.updates.n"]
#                 to_log["diagnostics/vnc_updates_n_ps"] = self._num_vnc_updates / elapsed
#                 self._num_vnc_updates = 0
#             if info.get("stats.vnc.updates.bytes") is not None:
#                 to_log["diagnostics/vnc_updates_bytes"] = info["stats.vnc.updates.bytes"]
#             if info.get("stats.vnc.updates.pixels") is not None:
#                 to_log["diagnostics/vnc_updates_pixels"] = info["stats.vnc.updates.pixels"]
#             if info.get("stats.vnc.updates.rectangles") is not None:
#                 to_log["diagnostics/vnc_updates_rectangles"] = info["stats.vnc.updates.rectangles"]
#             if info.get("env_status.state_id") is not None:
#                 to_log["diagnostics/env_state_id"] = info["env_status.state_id"]

#         if reward is not None:
#             self._episode_reward += reward
#             if observation is not None:
#                 self._episode_length += 1
#             self._all_rewards.append(reward)

#         if done:
#             logger.info('True Game terminating: env_episode_reward=%s episode_length=%s', self._episode_reward, self._episode_length)
#             total_time = time.time() - self._episode_time
#             to_log["global/episode_reward"] = self._episode_reward
#             to_log["global/episode_length"] = self._episode_length
#             to_log["global/episode_time"] = total_time
#             to_log["global/reward_per_time"] = self._episode_reward / total_time
#             self._episode_reward = 0
#             self._episode_length = 0
#             self._all_rewards = []

#         if 'distance' in info: to_log['distance'] = info['distance']  # mario
#         if 'POSITION_X' in info:  # doom
#             to_log['POSITION_X'] = info['POSITION_X']
#             to_log['POSITION_Y'] = info['POSITION_Y']
#         return observation, reward, done, to_log