Example #1
class Env(object):
    def __init__(self, act_space, act_repeats, frames, state_size, game):
        self.act_space = act_space
        self.act_repeats = act_repeats
        self.act_repeat = random.choice(self.act_repeats)
        self.frames = frames
        self.state_size = state_size
        self.game = game

        self.max_pos = -10000

        self.count = 0

        env = gym_super_mario_bros.make(game)
        if self.act_space == 7:
            self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        elif self.act_space == 12:
            self.env = JoypadSpace(env, COMPLEX_MOVEMENT)

        s_t = self.resize_image(self.env.reset())

        self.s_t = np.tile(s_t, [1, 1, frames])
        self.s = [self.s_t]

        self.a_t = random.randint(0, act_space - 1)
        self.a = [self.a_t]
        self.a_logits = []
        self.r = [0]
        self.pos = []

        self.v_cur = []

        state_in = np.zeros(self.state_size, dtype=np.float32)
        self.state_in = [state_in]

        self.done = False

    def step(self, a, a_logits, state_in):
        self.count += 1
        if self.count % self.act_repeat == 0:
            self.a_t = a
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        self.env.render()
        if not gdone:
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        r_t /= 15.
        s_t1 = self.resize_image(s_t1)
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)

        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.a_logits.append(a_logits)
        self.r.append(r_t)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        if (len(self.pos) > 500) and (info["x_pos"] - self.pos[-500] < 5) and (
                self.pos[-500] - info["x_pos"] < 5):
            done = True
        self.done = done

        self.state_in.append(state_in)

    def update_v(self, v_cur):
        self.v_cur.append(v_cur)

    def reset(self, force=False):
        if self.done or force:
            max_pos = self.max_pos
            self.max_pos = -10000
            logging.info("  Max Position  %s : %d" % (self.game, max_pos))
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)

            s_t = self.resize_image(self.env.reset())

            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]

            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.a_logits = []
            self.r = [0]
            self.pos = []

            self.v_cur = []

            state_in = np.zeros(self.state_size, dtype=np.float32)
            self.state_in = [state_in]

            self.done = False

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_max_pos(self):
        return self.max_pos

    def reset_max_pos(self):
        self.max_pos = -10000

    def get_state_in(self):
        return self.state_in[-1]

    def get_history(self, force=False):
        if self.done or force:
            if self.done:
                gaes = get_gaes(None, self.r, self.v_cur, self.v_cur[1:] + [0],
                                0.99, 0.95)[0]
                seg = Seg(self.s, self.a, self.a_logits, self.r, gaes,
                          self.v_cur, self.state_in)
                return seg
            if force and len(self.r) > 1:
                gaes = get_gaes(None, self.r[:-1], self.v_cur[:-1],
                                self.v_cur[1:], 0.99, 0.95)[0]
                seg = Seg(self.s[:-1], self.a[:-1], self.a_logits[:-1],
                          self.r[:-1], gaes, self.v_cur[:-1],
                          self.state_in[:-1])
                return seg
        return None

    @staticmethod
    def resize_image(image, size=84):
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image)
        image = image / 255.
        image = np.array(image, np.float32)
        return image[:, :, None]
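A minimal usage sketch for the rollout wrapper above, assuming the imports the class relies on (gym_super_mario_bros, JoypadSpace, numpy, random, PIL.Image, get_gaes, Seg) are present in the module; the action, logits, value, and state_size values below are placeholders for a real policy/critic.

env = Env(act_space=7, act_repeats=[4], frames=4,
          state_size=256, game="SuperMarioBros-1-1-v0")
for _ in range(8):
    a = random.randint(0, env.act_space - 1)        # stand-in for a sampled policy action
    a_logits = np.zeros(env.act_space, np.float32)  # stand-in for the policy logits
    env.step(a, a_logits, env.get_state_in())
    env.update_v(0.0)                               # stand-in for the critic's value estimate
seg = env.get_history(force=True)  # Seg(s, a, a_logits, r, gaes, v_cur, state_in) or None
env.reset(force=True)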
Example #2
from nes_py.wrappers import JoypadSpace
import gym
from Contra.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY
import time
"""
    pip install gym-contra
    https://github.com/OuYanghaoyue/gym_contra
"""

env = gym.make('Contra-v0')
env = JoypadSpace(env, RIGHT_ONLY)

print("actions", env.action_space)
print("observation_space ", env.observation_space.shape[0])

done = False
env.reset()
for step in range(5000):
    if done:
        print("Over")
        break
    time.sleep(0.01)

    action = env.action_space.sample()
    print("action", action)
    state, reward, done, info = env.step(action)
    env.render()

env.close()
Example #3
def train():
    # Hyper parameters
    cfg = DictConfig({
        "epochs": 1,
        "lr": 1e-4,
        "use_extrinsic": True,
        "max_episode_len": 1000,
        "min_progress": 15,
        "frames_per_state": 3,
        "action_repeats": 6,
        "gamma_q": 0.85,
        "epsilon_random": 0.1,  # Sample random action with epsilon probability
        "epsilon_greedy_switch": 1000,
        "q_loss_weight": 0.01,
        "inverse_loss_weight": 0.5,
        "forward_loss_weight": 0.5,
        "intrinsic_weight": 1.0,
        "extrinsic_weight": 1.0,
    })

    # ---- setting up variables -----

    q_model = MarioModel(cfg.frames_per_state)
    icm_model = MarioICM(cfg.frames_per_state)

    optim = torch.optim.Adam(list(q_model.parameters()) +
                             list(icm_model.parameters()),
                             lr=cfg.lr)

    replay = ExperienceReplay(buffer_size=500, batch_size=100)
    env = gym_super_mario_bros.make("SuperMarioBros-v0")
    env = JoypadSpace(env, COMPLEX_MOVEMENT)

    # Counters and stats
    last_x_pos = 0
    current_episode = 0
    global_step = 0
    current_step = 0
    cumulative_reward = 0

    ep_rewards = []

    # ----- training loop ------

    for epoch in range(cfg.epochs):
        state = env.reset()
        done = False

        # Monte Carlo loop
        while not done:

            # ------------ Q Learning --------------

            if current_step == 0:
                state = prepare_initial_state(env.render("rgb_array"))
            else:
                state = prepare_multi_state(state, env.render("rgb_array"))

            q_values = q_model(state)
            action = sample_action(
                q_values,
                cfg.epsilon_random,
                apply_epsilon=global_step > cfg.epsilon_greedy_switch,
            )

            action_count = 0
            state2 = None
            while True:
                state2_, reward, done, info = env.step(action)
                if state2 is None:
                    state2 = state2_
                env.render()
                if action_count >= cfg.action_repeats or done:
                    break
                action_count += 1
            state2 = prepare_multi_state(state, state2)

            # Add intrinsic reward
            intrinsic_reward = get_intrinsic_reward(state, action, state2,
                                                    icm_model)
            print("in reward", intrinsic_reward.item())
            print("ex reward", reward)

            reward = (cfg.intrinsic_weight *
                      intrinsic_reward) + (cfg.extrinsic_weight * reward)

            q_loss = get_q_loss(q_values[0][action], reward, q_model, state2,
                                cfg.gamma_q)

            replay.add(state, action, reward, state2)
            state = state2

            # ------------- ICM -------------------

            state1_batch, action_batch, reward_batch, state2_batch = replay.get_batch(
            )

            action_pred, state2_encoded, state2_pred = icm_model(
                state1_batch, action_batch, state2_batch)

            inverse_loss = F.cross_entropy(action_pred, action_batch)
            forward_loss = F.mse_loss(state2_pred, state2_encoded)

            # ------------ Learning ------------

            final_loss = ((cfg.q_loss_weight * q_loss) +
                          (cfg.inverse_loss_weight * inverse_loss) +
                          (cfg.forward_loss_weight * forward_loss))

            optim.zero_grad()
            final_loss.backward()
            optim.step()

            # ------------ updates --------------

            # TODO: add loss scalars
            print("--------loss: ", final_loss.item())

            max_episode_len_reached = current_step >= cfg.max_episode_len
            no_progress = False  # TODO: flag episodes where x_pos advances less than cfg.min_progress

            done = done or max_episode_len_reached or no_progress

            if done:
                if max_episode_len_reached:
                    # TODO: Add scalar: 'max episode len reached' current_episode, auto
                    pass
                elif no_progress:
                    # TODO: Add scalar: 'no progress' current_episode, auto
                    pass

                # TODO: add scalar: 'episode len' current_step, current_episode
                # TODO: Plot cumulative reward for each episode
                # TODO: Plot the x_pos after the episode
                # TODO: Plot total sum of rewards for each episode
                # TODO: Every n episodes store save the video -> imageio.mimwrite('gameplay.mp4', renders: ndArray of frames, fps=30)

                current_step = -1
                current_episode += 1

            global_step += 1
            current_step += 1
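train() depends on helpers (MarioModel, MarioICM, ExperienceReplay, prepare_initial_state, prepare_multi_state, sample_action, get_q_loss, get_intrinsic_reward) that are defined elsewhere. Purely as a sketch of the usual ICM formulation, and assuming icm_model returns (action_pred, state2_encoded, state2_pred) as in the batched call above, get_intrinsic_reward could be implemented as the forward-model prediction error:

import torch
import torch.nn.functional as F

def get_intrinsic_reward(state1, action, state2, icm_model, eta=1.0):
    # Hypothetical helper: intrinsic reward = eta * forward-model prediction
    # error on the encoded next state, computed without tracking gradients.
    with torch.no_grad():
        action_t = torch.as_tensor([int(action)])
        _, state2_encoded, state2_pred = icm_model(state1, action_t, state2)
    return eta * F.mse_loss(state2_pred, state2_encoded)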
Example #4
File: env.py Project: zhenchangXia/MORL
class MoMarioEnv(Process):
    def __init__(self, args, env_idx, child_conn, history_size=4, h=84, w=84):
        super(MoMarioEnv, self).__init__()
        self.daemon = True
        self.env = JoypadSpace(gym_super_mario_bros.make(args.env_id),
                               SIMPLE_MOVEMENT)

        self.is_render = args.render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.n_mo = 5
        self.morall = np.zeros(self.n_mo)
        self.recent_rlist = deque(maxlen=100)
        self.recent_morlist = deque(maxlen=100)
        self.child_conn = child_conn
        self.life_done = args.life_done
        self.single_stage = args.single_stage
        self.stage_bonus = 0

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MoMarioEnv, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)

            if self.single_stage and info["flag_get"]:
                self.stage_bonus = 10000
                done = True
            # ---- Construct Multi-Objective Reward -------------------------------------
            # components: [x_pos, time, death, coin, enemy]
            moreward = []
            # 1. x position
            xpos_r = info["x_pos"] - self.x_pos
            self.x_pos = info["x_pos"]
            # resolve an issue where after death the x position resets
            if xpos_r < -5:
                xpos_r = 0
            moreward.append(xpos_r)

            # 2. time penalty
            time_r = info["time"] - self.time
            self.time = info["time"]
            # time is always decreasing
            if time_r > 0:
                time_r = 0
            moreward.append(time_r)

            # 3. death
            if self.lives > info['life']:
                death_r = -25
            else:
                death_r = 0
            moreward.append(death_r)

            # 4. coin
            coin_r = (info['coins'] - self.coin) * 100
            self.coin = info['coins']
            moreward.append(coin_r)

            # 5. enemy
            enemy_r = info['score'] - self.score
            if coin_r > 0 or done:
                enemy_r = 0
            self.score = info['score']
            moreward.append(enemy_r)

            ############################################################################

            if self.life_done:
                # when Mario loses life, changes the state to the terminal
                # state.
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                # normal terminal state
                force_done = done

            # reward range -15 ~ 15
            r = reward / 15
            self.rall += reward

            self.morall += np.array(moreward)
            mor = np.array(moreward) * self.n_mo / 15

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            score = info['score'] + self.stage_bonus

            if done:
                self.recent_rlist.append(self.rall)
                self.recent_morlist.append(self.morall)
                print(
                    "[Episode {}({})]\tStep: {}\tScore: {}\tMoReward: {}\tRecent MoReward: {}\tcoin: {}\tcurrent x:{}"
                    .format(self.episode, self.env_idx, self.steps,
                            score, self.morall,
                            np.mean(self.recent_morlist,
                                    axis=0), info['coins'], info['x_pos']))

                self.history = self.reset()

            self.child_conn.send(
                [self.history[:, :, :], r, force_done, done, mor, score])

    def reset(self):
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.stage_bonus = 0
        self.morall = np.zeros(self.n_mo)
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        x = np.float32(x) * (1.0 / 255.0)

        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
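A minimal parent-process sketch showing how MoMarioEnv is driven over a Pipe; the args object here is a stand-in for the project's argparse namespace.

from multiprocessing import Pipe
from types import SimpleNamespace

args = SimpleNamespace(env_id="SuperMarioBros-v0", render=False,
                       life_done=True, single_stage=True)
parent_conn, child_conn = Pipe()
worker = MoMarioEnv(args, env_idx=0, child_conn=child_conn)
worker.start()       # run() loops: recv action -> step env -> send results
parent_conn.send(0)  # an action index under SIMPLE_MOVEMENT
history, r, force_done, done, mor, score = parent_conn.recv()
worker.terminate()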
Example #5
class ICMTrainer:
    """
    Compose encoder, forward/inverse, and q_model into single trainer entity
    """
    def __init__(self):
        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        self.env = JoypadSpace(self.env, COMPLEX_MOVEMENT)

        self.replay = SMBExperienceReplay(buffer_size=BUFFER_SIZE,
                                          batch_size=BATCH_SIZE)
        self.q_model = Qnetwork()
        self.encoder = Phi()
        self.forward_model = Fnet()
        self.inverse_model = Gnet()
        all_model_params = list(self.q_model.parameters()) + list(
            self.encoder.parameters())
        all_model_params += list(self.forward_model.parameters()) + \
                            list(self.inverse_model.parameters())
        self.opt = optim.Adam(lr=0.001, params=all_model_params)

    @staticmethod
    def combined_loss(q_loss, inverse_loss, forward_loss):
        """
        overall loss fn
        lambda*Qloss + (1-beta)*forward_loss + beta*inverse_loss
        """
        loss_ = (1 - BETA) * inverse_loss
        loss_ += BETA * forward_loss
        loss_ = loss_.sum() / loss_.flatten().shape[0]
        loss = loss_ + LAMBDA * q_loss
        return loss

    def icm_loss(self,
                 state1,
                 action,
                 state2,
                 forward_scale=1.,
                 inverse_scale=1e4):
        """ calculate forward and inverse model losses for ICM """
        fwd_loss_fn = nn.MSELoss(reduction='none')
        inverse_loss_fn = nn.CrossEntropyLoss(reduction='none')

        # encode input states
        state1_hat = self.encoder(state1)
        state2_hat = self.encoder(state2)
        # detach so the forward-model loss does not back-propagate into the encoder
        state2_hat_pred = self.forward_model(state1_hat.detach(),
                                             action.detach())
        forward_pred_err = fwd_loss_fn(
            state2_hat_pred, state2_hat.detach()).sum(dim=1).unsqueeze(dim=1)
        forward_pred_err *= forward_scale
        pred_action = self.inverse_model(state1_hat, state2_hat)
        inverse_pred_err = inverse_loss_fn(
            pred_action,
            action.detach().flatten()).unsqueeze(dim=1)
        inverse_pred_err *= inverse_scale
        return forward_pred_err, inverse_pred_err

    def batch_forward_pass(self, use_extrinsic=True):
        """ single forward pass that generates forward err, inverse err and q_loss"""
        # pylint: disable=E1101
        state1_batch, action_batch, reward_batch, state2_batch = self.replay.get_batch(
        )
        # reshape action/reward batches to be compatible with models
        action_batch = action_batch.view(action_batch.shape[0], 1)
        reward_batch = reward_batch.view(reward_batch.shape[0], 1)

        # run ICM
        forward_pred_err, inverse_pred_err = self.icm_loss(
            state1_batch, action_batch, state2_batch)
        # scale forward pred err using the Eta parameter
        i_reward = (1. / ETA) * forward_pred_err
        reward = i_reward.detach()
        if use_extrinsic:  # whether to include explicit rewards in training
            reward += reward_batch
        # discount expected values for next state
        qvals = self.q_model(state2_batch)
        reward += GAMMA * torch.max(qvals)
        reward_pred = self.q_model(state1_batch)
        reward_target = reward_pred.clone()
        # build (row, action) indices so only the Q-value of the taken action is overwritten
        indices = torch.stack(
            (torch.arange(action_batch.shape[0]), action_batch.squeeze()),
            dim=0)
        indices = indices.tolist()
        reward_target[indices] = reward.squeeze()

        q_loss = 1e5 * nn.MSELoss()(F.normalize(reward_pred),
                                    F.normalize(reward_target.detach()))
        return forward_pred_err, inverse_pred_err, q_loss

    def repeat_action(self, action):
        """ given action,
        repeat it specified times,
        and return combined state and rewards """
        state_deque = deque(maxlen=FRAMES_PER_STATE)
        sum_rewards = 0
        for _ in range(ACTION_REPEATS):
            state2, e_reward_, done, info = self.env.step(action)
            if done:
                break
            sum_rewards += e_reward_
            downscaled_state2 = downscale_img(state2, to_gray=True)
            # pylint: disable=E1101
            prepped_state2 = torch.from_numpy(downscaled_state2). \
                float().unsqueeze(dim=0)
            state_deque.append(prepped_state2)
        return state_deque, done, sum_rewards, info

    def train(self):
        """ full training loop """
        self.env.reset()
        state1 = prepare_initial_state(self.env.render('rgb_array'))
        losses = []
        ep_lengths = []
        episode_length = 0
        last_x_pos = self.env.env.env._x_position
        for i in range(TRAINING_STEPS):
            self.opt.zero_grad()
            episode_length += 1
            q_val_pred = self.q_model(state1)
            if i > SWITCH_TO_EPS_GREEDY:
                action = int(sample_action(q_val_pred, EPS))
            else:
                action = int(sample_action(q_val_pred))

            state_deque, done, extrinsic_reward, info = self.repeat_action(
                action)
            # pylint: disable=E1101
            state2 = torch.stack(list(state_deque), dim=1)
            self.replay.add_memory(
                state1,
                action,
                extrinsic_reward,  # summed across repeated actions
                state2)
            if i % MAX_EPISODE_LEN == 0 and i != 0:
                if (info['x_pos'] - last_x_pos) < MIN_PROGRESS:
                    done = True
                else:
                    last_x_pos = info['x_pos']
            if done:
                print("Episode over.")
                ep_lengths.append(info['x_pos'])
                self.env.reset()
                state1 = prepare_initial_state(self.env.render('rgb_array'))
                last_x_pos = self.env.env.env._x_position
                episode_length = 0
            else:
                state1 = state2
            # Enter mini-batch training
            if len(self.replay.memory) < BATCH_SIZE:
                continue

            forward_pred_err, inverse_pred_err, q_loss \
                = self.batch_forward_pass(use_extrinsic=False)
            loss = self.combined_loss(q_loss, inverse_pred_err,
                                      forward_pred_err)
            loss_list = (q_loss.mean(), forward_pred_err.flatten().mean(),
                         inverse_pred_err.flatten().mean(), episode_length)
            if i % 250 == 0:
                print("Epoch {}, Loss: {}".format(i, loss))
                print(
                    "Forward loss: {} \n Inverse loss: {} \n Qloss: {}".format(
                        forward_pred_err.mean(), inverse_pred_err.mean(),
                        q_loss.mean()))
                print(info)
            losses.append(loss_list)
            loss.backward()
            self.opt.step()
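ICMTrainer reads a number of module-level constants that are not part of this snippet; the values below are illustrative placeholders only, not the project's actual settings.

# Illustrative values only; the original module defines its own constants.
BUFFER_SIZE = 500
BATCH_SIZE = 100
FRAMES_PER_STATE = 3
ACTION_REPEATS = 6
ETA = 1.0                   # scales forward-model error into intrinsic reward
BETA = 0.2                  # balances inverse vs. forward loss
LAMBDA = 0.1                # weight of the Q-learning loss
GAMMA = 0.9                 # discount for the bootstrapped Q target
EPS = 0.1                   # epsilon for epsilon-greedy sampling
TRAINING_STEPS = 5000
SWITCH_TO_EPS_GREEDY = 1000
MAX_EPISODE_LEN = 100
MIN_PROGRESS = 15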
class MarioEnvironment(Process):
    def __init__(
            self,
            env_id,
            is_render,
            env_idx,
            child_conn,
            history_size=4,
            life_done=True,
            h=84,
            w=84, movement=COMPLEX_MOVEMENT, sticky_action=True,
            p=0.25):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        self.env = JoypadSpace(
            gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)

        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.life_done = life_done

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()

            obs, reward, done, info = self.env.step(action)

            # when Mario loses life, changes the state to the terminal
            # state.
            if self.life_done:
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                force_done = done

            # reward range -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward

            r = log_reward

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1
            self.max_pos = max(self.max_pos, info['x_pos'])

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}  Stage: {} current x:{}   max x:{}".format(
                        self.episode,
                        self.env_idx,
                        self.steps,
                        self.rall,
                        np.mean(
                            self.recent_rlist),
                        info['stage'],
                        info['x_pos'],
                        self.max_pos))

                self.history = self.reset()

            self.child_conn.send([self.history[:, :, :], r, force_done, done, log_reward])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))

        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
Example #7
File: Test_R2D3.py Project: hybug/test_ppo
class Env(object):
    def __init__(self, game, **kwargs):
        self.act_space = kwargs.get("act_space")
        self.state_size = kwargs.get("state_size")
        self.burn_in = kwargs.get("burn_in")
        self.seqlen = kwargs.get("seqlen")
        self.n_step = kwargs.get("n_step")
        self.use_soft = kwargs.get("use_soft")
        self.frames = kwargs.get("frames")
        self.sample_epsilon_per_step = kwargs.get("sample_epsilon_per_step")

        self.epsilon = np.power(0.4, random.uniform(4, 8))
        self.game = game

        self.count = 0
        self.count_maxpos = []

        env = gym_super_mario_bros.make(game)
        if self.act_space == 7:
            self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        elif self.act_space == 12:
            self.env = JoypadSpace(env, COMPLEX_MOVEMENT)

        self.max_pos = -10000
        self.done = True
        self.reset()

    def step(self, a, state_in):
        maxpos = self.reset()

        self.count += 1
        if not self.use_soft:
            if self.sample_epsilon_per_step:
                self.epsilon = np.power(0.4, random.uniform(4, 8))
            if random.random() < self.epsilon:
                a = random.randint(0, self.act_space - 1)
        self.a_t = a
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        self.env.render()
        if not gdone:
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        s_t1 = self.resize_image(s_t1)
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)

        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.r.append(r_t)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        if (len(self.pos) > 100) and (info["x_pos"] - self.pos[-100] < 5) and (
                self.pos[-100] - info["x_pos"] < 5):
            done = True
        self.done = done
        if self.done:
            self.mask.append(0)
        else:
            self.mask.append(1)

        self.state_in.append(state_in)
        """
        get segs
        """
        # segs = self.get_history()
        #
        # return segs
        return maxpos

    def reset(self):
        if self.done:
            self.count_maxpos.append(self.max_pos)
            print(self.game, self.max_pos, len(self.count_maxpos[1:]),
                  np.mean(self.count_maxpos[1:]))
            self.epsilon = np.power(0.4, random.uniform(4, 8))

            self.count = 0

            s_t = self.resize_image(self.env.reset())

            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]

            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.r = [0]
            self.mask = [1]

            self.max_pos = -10000
            self.pos = []

            state_in = np.zeros(self.state_size, dtype=np.float32)
            self.state_in = [state_in]

            self.done = False
            return self.count_maxpos
        return None

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_reward(self):
        return self.r[-1]

    def get_max_pos(self):
        return self.max_pos

    def get_state_in(self):
        return self.state_in[-1]

    @staticmethod
    def resize_image(image, size=84):
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image, np.uint8)
        return image[:, :, None]
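A minimal usage sketch for the R2D3-style Env above; the state_size and the sequence hyperparameters are illustrative assumptions, not values from the project.

env = Env("SuperMarioBros-1-1-v0", act_space=7, state_size=256, frames=4,
          burn_in=40, seqlen=40, n_step=5, use_soft=False,
          sample_epsilon_per_step=True)
for _ in range(10):
    a = random.randint(0, 6)                 # stand-in for the agent's chosen action
    maxpos_log = env.step(a, env.get_state_in())
    if maxpos_log is not None:               # only non-None right after an episode ends
        print("max positions so far:", maxpos_log)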
Example #8
class MarioEnvironment(dm_env.Environment):
    def __init__(
        self,
        skip_frames: int = 3,
        img_rescale_pc: float = 0.4,
        stack_func: Optional[Callable[[List[np.ndarray]],
                                      np.ndarray]] = np.hstack,
        stack_mode: str = "all",
        grayscale: bool = True,
        black_background: bool = True,
        in_game_score_weight: float = 0.01,
        movement_type: str = "simple",
        world_and_level: Optional[Tuple[int, int]] = None,
        idle_frames_threshold: Optional[int] = 1250,
        colorful_rendering: bool = True,
    ) -> None:
        assert stack_mode in ("first_and_last", "all")
        self._stack_mode = stack_mode

        env_name = (f"SuperMarioBros" if world_and_level is None else
                    "SuperMarioBros-%d-%d" % world_and_level)
        env_name += f"-v{int(black_background)}"
        self._smb_env = gym_super_mario_bros.make(env_name)
        self._smb_env = JoypadSpace(self._smb_env,
                                    MOVEMENTS_TYPES[movement_type])

        self._actions_queue = []
        self._colorful_env = None
        if (grayscale or black_background) and colorful_rendering:
            self._colorful_env = gym_super_mario_bros.make(
                "SuperMarioBros-%d-%d-v0" % world_and_level)
            self._colorful_env = JoypadSpace(self._colorful_env,
                                             MOVEMENTS_TYPES[movement_type])

        self._stack_func = stack_func
        self._grayscale = grayscale

        self._score_weight = in_game_score_weight
        self._idle_frames_threshold = idle_frames_threshold

        self._last_score = 0
        self._last_x = 40
        self._idle_counter = 0

        self._rescale_pc = img_rescale_pc
        self._skip_frames = skip_frames

        self._obs_shape = self.reset().observation.shape
        self._num_actions = self._smb_env.action_space.n

    def reset(self):
        """ Returns the first `TimeStep` of a new episode. """
        self._smb_env.reset()
        self._last_score = 0
        self._last_x = 40
        self._idle_counter = 0

        self._actions_queue = []
        if self._colorful_env is not None:
            self._colorful_env.reset()

        return dm_env.restart(self.step(0).observation)

    def _is_idle(self, info):
        if self._idle_frames_threshold is None:
            return False

        x = info["x_pos"]
        delta_x = x - self._last_x
        self._last_x = x

        if abs(delta_x) < 1:
            self._idle_counter += 1
            return self._idle_counter > self._idle_frames_threshold

        self._idle_counter = 0
        return False

    def step(self, action) -> TimeStep:
        """ Updates the environment's state. """
        # NOTE:
        # The gym_super_mario_bros environment reuses the numpy array it
        # returns as observation. When stacking observations, this might be
        # a source of bugs (all observations in the stack might be representing
        # the same, final frame!), so always copy the arrays when doing that.
        # The observation arrays are already being copied inside
        # `self._preprocess_img`, so no explicit copying is needed here.

        action = int(action)
        initial_img, total_reward, done, info = self._smb_env.step(action)
        self._actions_queue.append(action)
        done = done or self._is_idle(info)

        # Skipping frames:
        if self._skip_frames > 0:
            imgs = [self._process_img(initial_img)]
            skip_count = 0
            while skip_count < self._skip_frames:
                skip_count += 1
                if not done:
                    last_img, reward, done, info = self._smb_env.step(action)
                    self._actions_queue.append(action)
                    done = done or self._is_idle(info)
                    total_reward += reward
                else:
                    last_img = np.zeros_like(initial_img)

                if self._stack_mode == "all" or skip_count == self._skip_frames:
                    imgs.append(self._process_img(last_img))

            obs = self._stack_func(imgs)
        # Single frame:
        else:
            obs = self._process_img(initial_img)

        score_diff = info["score"] - self._last_score
        self._last_score = info["score"]
        total_reward = np.float64(total_reward +
                                  self._score_weight * score_diff)

        if done:
            return dm_env.termination(reward=total_reward, observation=obs)
        return dm_env.transition(reward=total_reward, observation=obs)

    def observation_spec(self):
        return dm_env.specs.BoundedArray(shape=self._obs_shape,
                                         dtype=np.float32,
                                         name="image",
                                         minimum=0,
                                         maximum=1)

    def action_spec(self):
        return dm_env.specs.DiscreteArray(dtype=np.int32,
                                          name="action",
                                          num_values=self._num_actions)

    def _process_img(self, img):
        img = np.divide(img, 255)
        img = img[50:, :, :]

        if abs(self._rescale_pc - 1) > 1e-2:
            img = rescale(img, scale=self._rescale_pc, multichannel=True)

        if self._grayscale:
            img = img @ RGB2GRAY_COEFFICIENTS

        return img.astype(np.float32, copy=True)

    def render(self, mode="human", return_all_imgs=False):
        if return_all_imgs:
            assert self._colorful_env is not None and mode == "rgb_array", (
                "The option 'return_all_imgs' is valid only when using "
                "colorful rendering and rgb array mode!")

        # Regular rendering:
        if self._colorful_env is None:
            return self._smb_env.render(mode)

        # Colorful rendering:
        img_list = []
        for action in self._actions_queue:
            self._colorful_env.step(action)
            if return_all_imgs:
                # NOTE: make sure a copy of the returned rgb array is made!
                img_list.append(self._colorful_env.render(mode).copy())

        self._actions_queue = []
        return img_list if return_all_imgs else self._colorful_env.render(mode)

    def plot_obs(self, obs):
        plt.imshow(obs, cmap="gray" if self._grayscale else None)
        plt.show()

    def close(self):
        self._smb_env.close()
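A short interaction sketch against the dm_env interface above; it assumes MOVEMENTS_TYPES and RGB2GRAY_COEFFICIENTS are defined elsewhere in the module, as the class requires.

import numpy as np

env = MarioEnvironment(world_and_level=(1, 1), colorful_rendering=False)
timestep = env.reset()
while not timestep.last():
    action = np.random.randint(env.action_spec().num_values)
    timestep = env.step(action)
print("final reward:", timestep.reward)
env.close()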
Example #9
class DQLMarioAgent(DQLAgent.DQLAgent):
    def __init__(self, action_type, batch_size, model_type, success_margin,
                 success_score, memory_size, record_video, target_model,
                 project, wrapper_type):
        super().__init__(action_type, batch_size, model_type, success_margin,
                         success_score, memory_size, record_video,
                         target_model, project)

        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        if wrapper_type == 'COMPLEX':
            self.env = JoypadSpace(self.env, COMPLEX_MOVEMENT)  # -> 12
        elif wrapper_type == 'SIMPLE':
            self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)  # -> 7
        else:
            self.env = JoypadSpace(self.env, RIGHT_ONLY)  # -> 5

        self.action_size = self.env.action_space.n
        self.num_states = 1
        self.state_single_size = 80
        self.state_size = (self.state_single_size, self.state_single_size)
        self.action = self.env.action_space.sample()
        self.first_last_x_pos = self.env.env.env._x_position
        self.max_distance = self.first_last_x_pos

        self.DLModel = NNModel.DLModel(env=self.env,
                                       action_size=self.action_size,
                                       state_size=self.state_single_size,
                                       states=self.num_states,
                                       model_type=model_type,
                                       output_dir=self.others_dir)

    def append_new_frame(self):
        """ Save generated env's frame """
        self.renders.append(
            img_as_ubyte(
                resize(self.env.render(mode='rgb_array'), (480, 480, 3))))

    def get_first_state(self):
        """
        :return: initial state
        """
        first_state = self.env.reset()
        return Utils.prepare_initial_state(first_state,
                                           self.state_size,
                                           channels=1)

    def get_first_x_pos(self):
        """
        :return: initial x position
        """
        return self.first_last_x_pos

    def prepare_state(self, next_state, channels=1):
        """
        Remove upper image info, reduce channels, and reduce image size
        :param channels: number of layers
        :param next_state: state to process
        :return: preprocessed image generated by env
        """
        return Utils.prepare_initial_state(next_state,
                                           self.state_size,
                                           channels=channels)

    def reset_max_distance(self):
        """ Reset episode max distance """
        self.max_distance = self.first_last_x_pos

    def update_max_distance(self, dist):
        """
        Update episode max distance
        :param dist: new distance
        :return: new max distance
        """
        if dist > self.max_distance:
            self.max_distance = dist
        return self.max_distance