Example #1
def main():
    #env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v1')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    timestart = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H:%M:%S')
    # env = VideoRecorderWrapper(env, PROJ_DIR + "/../video", str(timestart), 50)

    env = VideoRecorderWrapper(env, PROJ_DIR + "/../video/final", str(timestart), 1)
    env = DownsampleEnv(env, (84, 84))
    env = PenalizeDeathEnv(env, penalty=-25)
    env = FrameStackEnv(env, 4)
    # good
    #act = deepq.load(PROJ_DIR+"/../models/mario_model_2018-08-12-13:00:58.pkl")

    # better
    act = deepq.load(PROJ_DIR + "/../models/mario_model_2018-08-12-19:21:50.pkl")
    
    episode = 0
    while True:
        obs, done = env.reset(), False
        stepnr = 0
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])

            if stepnr % 20 == 0:
                plot_obs(obs)

            episode_rew += rew
            stepnr += 1
        print("Episode reward", episode_rew, episode)
        episode = episode+1
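
A note on the wrappers used above: VideoRecorderWrapper, DownsampleEnv, PenalizeDeathEnv and FrameStackEnv come from the example's own project and are not shown here. As a rough illustration of what the downsampling step does, a minimal observation wrapper for the old gym API could look like the sketch below (the class name and exact behaviour are assumptions, not the project's implementation).

import cv2
import gym
import numpy as np


class SimpleDownsampleEnv(gym.ObservationWrapper):
    """Hypothetical stand-in: grayscale and resize each frame to (height, width)."""

    def __init__(self, env, shape=(84, 84)):
        super(SimpleDownsampleEnv, self).__init__(env)
        self._shape = shape
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(shape[0], shape[1], 1), dtype=np.uint8)

    def observation(self, frame):
        # Convert the RGB frame to grayscale and shrink it to the target shape.
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        frame = cv2.resize(frame, (self._shape[1], self._shape[0]))
        return frame[:, :, None]
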
Example #2
	def run(self, solution, level, render, mode):
		env = gym_super_mario_bros.make(level)
		env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

		done = True
		reason_finish = "no_more_commands"

		pos = 0
		total_r = 0

		for step in range(len(solution)):
			if done:
				state = env.reset()

			state, reward, done, info = env.step(solution[pos])
			pos+=1


			if reward == -15:  # died
				reason_finish = "death"
				break

			if mode == "level" and info['flag_get'] == True:
				reason_finish = "win"
				break

			total_r = total_r + reward
			if render == "true":
				env.render()


		env.close()
		return total_r, pos, info, reason_finish
Example #3
def main(path="./models/deepq/mario_reward_1736.7.pkl"):
    step_mul = 16
    steps = 200

    FLAGS = flags.FLAGS
    flags.DEFINE_string("env", "SuperMarioBros-v0", "RL environment to train.")
    flags.DEFINE_string("algorithm", "deepq", "RL algorithm to use.")

    FLAGS(sys.argv)
    # 1. Create gym environment
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

    act = deepq.load(path)
    nstack = 4
    nh, nw, nc = env.observation_space.shape
    history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

    obs, done = env.reset(), False
    # history = update_history(history, obs)
    episode_rew = 0
    while not done:
        env.render()
        action = act([obs])[0]
        obs, rew, done, _ = env.step(action)
        # history = update_history(history, obs)
        episode_rew += rew
        print("action : %s reward : %s" % (action, rew))

    print("Episode reward", episode_rew)
Example #4
class Environment:

    actionMap = {
        0: 'NOOP',
        1: 'Right',
        2: 'Right-Jump',
        3: 'Right-Sprint',
        4: 'Right-Jump-Sprint',
        5: 'Jump',
        6: 'Left'
    }

    def __init__(self, rows=19, columns=16, verbose=True, raw=True, variant=1):
        self.verbose = verbose
        self.raw = raw
        self.variant = variant
        self.img2state = Img2State(rows=19, columns=16)
        self.game = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make('SuperMarioBros-v3'), SIMPLE_MOVEMENT)
        self.state = self.img2state.transfrom(self.game.reset(),
                                              raw=self.raw,
                                              variant=self.variant)
        self.reward = 0

        # Actions
        self.A = list(Environment.actionMap.keys())

    def step(self, action: int):
        if action not in self.A:
            raise Exception('Wrong Action...')

        state, self.reward, done, info = self.game.step(action)
        self.state = self.img2state.transfrom(state,
                                              raw=self.raw,
                                              variant=self.variant)

        if done and self.state[8]:
            self.reward = 100
        elif self.state[8]:
            self.reward = 30
        elif self.state[9]:
            self.reward = 15

        if self.verbose:
            self.game.render()

        return done

    def reset(self):
        self.state = self.img2state.transfrom(self.game.reset(),
                                              raw=self.raw,
                                              variant=self.variant)
        self.reward = 0
Example #5
class Environment(threading.Thread):
    stop_signal = False

    def __init__(self,
                 render=False,
                 eps_start=EPS_START,
                 eps_end=EPS_STOP,
                 eps_steps=EPS_STEPS):
        threading.Thread.__init__(self)
        self.render = render

        # Make the super mario gym environment and apply wrappers
        self.env = gym.make(ENV)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.env = preprocess.GrayScaleImage(self.env,
                                             height=HIGHT,
                                             width=WIDTH,
                                             grayscale=True)
        # self.env = wrappers.Monitor(self.env, "./Super_Mario_AI/videos", force = True, write_upon_reset=True)
        self.agent = Agent(TEMPERATURE)

    def runEpisode(self):
        s = self.env.reset()
        R = 0
        while True:
            time.sleep(THREAD_DELAY)  # yield

            if self.render: self.env.render()

            a = self.agent.act(s)
            s_, r, done, info = self.env.step(a)

            if done:  # terminal state
                s_ = None

            self.agent.train(s, a, r, s_)

            s = s_
            R += r

            if done or self.stop_signal:
                break

        print("Total R:", R)

    def run(self):
        while not self.stop_signal:
            self.runEpisode()

    def stop(self):
        self.stop_signal = True
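
A hypothetical driver for this threaded worker, in the usual A3C style, is sketched below; the worker count and run time are arbitrary, and it assumes the constants missing from the excerpt (ENV, EPS_START, THREAD_DELAY, ...) and the Agent class are defined elsewhere in the original file.

import time

# Hypothetical usage: start a few workers, let them train, then stop them.
envs = [Environment(render=False) for _ in range(4)]
for e in envs:
    e.start()

time.sleep(30)      # let the workers collect experience for a while

for e in envs:
    e.stop()
for e in envs:
    e.join()
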
Example #6
class MarioEnv:
    def __init__(self, os='mac', display=False):
        self.display = display
        if os == 'mac' or os == 'linux':
            env = gym_super_mario_bros.make('SuperMarioBros-v0')
            self.env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
        else:
            raise Exception("bad os")
        self.act_dim = self.env.action_space.n
        self.obs_dim = (1, 128, 128)
        print("env created with act_dim", self.act_dim, "obs_dim",
              self.obs_dim)
        self.transform = transforms.Compose([
            transforms.ToTensor(),  # chain 2 transforms together using list.
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])

    def reset(self):
        state = self.env.reset()
        return self.__resize_image(state)

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        if reward == 0:
            reward = -0.5
        state_t = self.__resize_image(state)
        return state_t, \
               np.reshape(reward, -1), \
               np.reshape(done, -1)

    def close(self):
        self.env.close()

    def __resize_image(self, state):
        state_new = cv2.resize(state, (128, 128))
        img = Image.fromarray(state_new)
        state_t = self.transform(img)[0, :, :].unsqueeze(0)
        state_t = state_t.float().to(DEVICE)
        return state_t.unsqueeze(0)

    def render(self):
        if self.display:
            self.env.render()
Example #7
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

    done = True
    max_step = 5000
    print(env.observation_space.shape)
    # On Windows, ascii=True keeps the progress bar from wrapping onto new lines
    qbar = tqdm(total=max_step, ascii=True)
    for step in range(max_step):
        qbar.update()
        if done:
            state = env.reset()
        action = get_action(state, env.action_space)
        state, reward, done, info = env.step(action)
        if done:
            print(str(step) + " 英雄请卷土重来" + str(info))
        env.render()
    env.close()
    qbar.close()
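
get_action is not defined in this snippet. For a random-play baseline it can simply sample from the action space; the stand-in below is an assumption about what the original helper does.

def get_action(state, action_space):
    # Hypothetical stand-in: ignore the state and pick a random discrete action.
    return action_space.sample()
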
Example #8
class MarioEnv(Process):
    def __init__(self,
                 env_id,
                 idx,
                 child_conn,
                 queue,
                 n_step,
                 is_render=False):
        super(MarioEnv, self).__init__()

        self.idx = idx
        self.env_id = env_id

        self.child_conn = child_conn
        self.queue = queue
        self.is_render = is_render
        self.n_step = n_step
        self.steps = 0
        self.episodes = 0
        self.accum_reward = 0
        self.transition = []

    def run(self):
        super(MarioEnv, self).run()

        self.env = gym_super_mario_bros.make(self.env_id)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.reset()
        print('[ Worker %2d ] ' % (self.idx), end='')
        print('Playing <', self.env_id, '>')

        self.request_action(0, False)

        while True:
            action = self.child_conn.recv()
            next_state, reward, done, info = self.env.step(action)
            self.steps += 1
            self.accum_reward += reward
            next_state = rgb2dataset(next_state)

            if self.is_render and self.idx == 0:
                self.env.render()

            # make a transition
            self.transition.append(next_state)
            if len(self.transition) > 4:
                self.transition.pop(0)

            if done:
                self.send_result(info['x_pos'])
                self.reset()
                self.request_action(reward, True)
            else:
                self.request_action(reward, False)

    def reset(self):
        state = self.env.reset()
        state = rgb2dataset(state)
        self.transition.clear()
        self.transition.append(state)

        self.steps = 0
        self.episodes += 1
        self.accum_reward = 0

    def request_action(self, reward, done):
        self.queue.put([self.idx, "OnStep", [self.transition, reward, done]])

    def send_result(self, x_pos):
        self.queue.put([
            self.idx, "Result",
            [self.episodes, self.steps, self.accum_reward, x_pos]
        ])
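
rgb2dataset is project-specific and not shown. Judging by how the frames are stacked into the 4-frame transition buffer, it likely converts an RGB frame into a small grayscale array; a minimal stand-in (the 84x84 shape and normalization are assumptions) might be:

import cv2
import numpy as np


def rgb2dataset(frame, h=84, w=84):
    # Grayscale, resize, and scale pixel values into [0, 1].
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    return cv2.resize(gray, (w, h)).astype(np.float32) / 255.0
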
Example #9
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    movement.append(['B'])
    movement.append(['down'])
    movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    #channels is acting as the number of frames in history
    #if resize_height and height are different, assert final_height < resize_height and image will be cropped
    channels = 4
    width = 84
    resize_height = 110
    final_height = 84
    size = [channels, final_height, width]

    batch_size = 32
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'
    epsilon = 1
    gamma = 0.9

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))
    target_model.load_state_dict(torch.load(model_file))

    lr = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    with open(total_reward_file, 'w') as f:
        f.write('Reward\tSteps\n')

    max_steps = 5000
    num_eps = 1000

    data = dataset(replay_capacity, batch_size, replay_dir, 1, size)

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0

        episode_reward = 0

        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            if reward > 0:
                reward = 1
            else:
                reward = -1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width],
                                    final_height)
            next_state = torch.cat((state[1:, :, :], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            train(model, device, optimizer,
                  data.get_batch(model, device, gamma))

            state = next_state

            env.render()
            #time.sleep(0.03)

            if done:
                with open(total_reward_file, 'a') as f:
                    f.write('{}\t{}\n'.format(episode_reward, step))

                break

        epsilon -= (1 / num_eps)
        if episode % 10 == 0:
            target_model.load_state_dict(model.state_dict())

            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)

    env.close()
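
preprocess is not shown in this example. Based on the comments above (resize to [resize_height, width], then crop to final_height) and on how the result is concatenated along dimension 0, a rough stand-in could be the following; cropping from the bottom and the exact normalization are assumptions.

import cv2
import numpy as np
import torch


def preprocess(frame, resize_shape, final_height):
    """Grayscale, resize, crop to final_height rows, return a (1, H, W) float tensor."""
    resize_height, width = resize_shape
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (width, resize_height))
    cropped = resized[resize_height - final_height:, :]   # assumption: keep the bottom rows
    return torch.from_numpy(np.float32(cropped) / 255.0).unsqueeze(0)
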
Example #10
class Simulator:
    def __init__(self, movements, max_steps):
        """
        Creates a new Simulator.
        The Simulator lets individuals play the game and assigns their resulting fitness to them.
        :param movements: a list of movements the individuals are allowed to make
        :param max_steps: the maximum number of simulation steps an individual is allowed to use
        """
        self.movements = movements
        self.max_steps = max_steps

        # TODO: maybe find a better name than "env_expanded"?
        self.env_expanded = gym_super_mario_bros.SuperMarioBrosEnv(
            frames_per_step=1, rom_mode='vanilla')
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env_expanded,
                                                 self.movements)
        # self.env.metadata['video.frames_per_second'] = 120
        # self.env_expanded.metadata['video.frames_per_second'] = 120

        self._log = logging.getLogger('MLProject.Simulator')

    def _simulate_individual(self, individual: Individual, render):
        """
        Simulates a single individual and assigns its fitness score.
        This involves letting the individual play a game of Mario,
        and assigning the resulting fitness to the individual.
        :param individual:
        """
        state = self.env.reset()

        x_pos = 0
        last_x_pos = 0
        reward_final = 0
        accumulated_fitness = 0
        died = False

        last_fps_time = time.time()
        frames = 0
        steps_standing_still = 0
        number_of_steps_standing_still_before_kill = 200

        for step in range(self.max_steps):
            self.state_downscaled = get_sensor_map(self.env_expanded)

            action = individual.agent.act(self.state_downscaled)
            # print('\r', _vectofixedstr(action, 12), end=' ')
            action = np.argmax(action)

            state, reward, done, info = self.env.step(action)

            if info['flag_get']:
                accumulated_fitness += x_pos

            x_pos = info['x_pos'] + accumulated_fitness

            reward_final += reward

            # Check whether Mario has effectively stood still since the last step

            if last_x_pos - 1 <= x_pos <= last_x_pos + 1:
                steps_standing_still += 1
                if steps_standing_still >= number_of_steps_standing_still_before_kill:
                    break
            else:
                steps_standing_still = 0

            last_x_pos = x_pos

            if render:
                self.env.render()

            if info["life"] <= 2:
                died = True
                break

            # now = time.time()
            frames += 1
            """
            if now - last_fps_time >= 1:
                fps = frames / (now - last_fps_time)
                self._log.debug('FPS: {}'.format(fps))
                last_fps_time = now
                frames = 0
            """

        fps = frames / (time.time() - last_fps_time)
        self._log.debug('Steps per second: {:.2f}'.format(fps))

        individual.fitness = x_pos
        # individual.fitness = reward_final

        if died:
            self._log.debug(
                'Individual {} died. It achieved fitness {}'.format(
                    individual.id, individual.fitness))
        else:
            self._log.debug(
                'Individual {} ran out of simulation steps. It achieved fitness {}'
                .format(individual.id, individual.fitness))

    def simulate_generation(self, generation: Generation, render=True):
        """
        Simulates the whole generation and assigns each individual a fitness score.
        :param generation:
        :param render:
        """
        for individual in generation.individuals:
            self._simulate_individual(individual, render)

    def shutdown(self):
        """
        Does nothing. Needed for compatibility with ParallelSimulator
        """
        pass
Example #11
        print("Made save path at: {}".format(save_dir))
    save_path = save_dir / AGENT_FILENAME

    if Path.is_file(save_path):
        print("Loading saved agent...")
        agent.load(save_path)

    done = False
    batch_size = 32

    for e in range(1, EPISODES + 1):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        time = 0
        while True:
            env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done or time >= 500:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, time, agent.epsilon))
                break
            time += 1
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        if e % 10 == 0:
            agent.save(save_path)
Example #12
class MarioEnvironment(Process):
    def __init__(self,
                 env_id,
                 is_render,
                 env_idx,
                 child_conn,
                 history_size=4,
                 h=84,
                 w=84):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        self.env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(env_id), movement)

        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)

            if life_done:
                # when Mario loses life, changes the state to the terminal
                # state.
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                # normal terminal state
                force_done = done

            # reward range -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward

            r = log_reward

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}  Stage: {} current x:{}   max x:{}"
                    .format(self.episode, self.env_idx, self.steps, self.rall,
                            np.mean(self.recent_rlist), info['stage'],
                            info['x_pos'], self.max_pos))

                self.history = self.reset()
            else:
                self.child_conn.send(
                    [self.history[:, :, :], r, False, done, log_reward])

    def reset(self):
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        x = np.float32(x) * (1.0 / 255.0)

        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
Example #13
class MarioEnvironment(Process):
    def __init__(
            self,
            env_id,
            is_render,
            env_idx,
            child_conn,
            history_size=4,
            life_done=False,
            h=84,
            w=84, movement=COMPLEX_MOVEMENT, sticky_action=True,
            p=0.25):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        self.env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)

        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.life_done = life_done
        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()

            # sticky action
            if self.sticky_action:
                if np.random.rand() <= self.p:
                    action = self.last_action
                self.last_action = action

            # 4 frame skip
            reward = 0.0
            done = None
            for i in range(4):
                obs, r, done, info = self.env.step(action)
                if self.is_render:
                    self.env.render()
                reward += r
                if done:
                    break

            # when Mario loses life, changes the state to the terminal
            # state.
            if self.life_done:
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                force_done = done

            # reward range -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward

            r = int(info.get('flag_get', False))

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}  Stage: {} current x:{}   max x:{}".format(
                        self.episode,
                        self.env_idx,
                        self.steps,
                        self.rall,
                        np.mean(
                            self.recent_rlist),
                        info['stage'],
                        info['x_pos'],
                        self.max_pos))

                self.history = self.reset()

            self.child_conn.send([self.history[:, :, :], r, force_done, done, log_reward])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))

        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
Example #14
                    n_y=env.action_space.n,
                    learning_rate=0.01,
                    reward_decay=0.99)

for episodes in range(EPISODES):

    observation = env.reset()

    observation = np.array(observation).reshape(1, 240, 256, 3)

    episode_reward = 0
    print("episode", episodes)

    while True:

        if RENDER_ENV: env.render()

        action = PG.choose_action(observation)

        next_state, reward, done, info = env.step(action)

        PG.store_transition(next_state, action, reward)

        episode_rewards_sum = sum(PG.episode_rewards)

        if done:
            episode_rewards_sum = sum(PG.episode_rewards)
            rewards.append(episode_rewards_sum)
            print(episode_rewards_sum)
            max_reward_so_far = np.amax(rewards)
Example #15
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT


env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
print(env.observation_space)

print(env.get_action_meanings())
print(env.get_keys_to_action())

done = True
totalReward = 0
maxReward = 0
for step in range(1000):
    if done:
        state = env.reset()
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    #print(f"State: {state.shape} {state}")
    #print(f"Info: {info}")
    totalReward += reward
    if totalReward > maxReward:
        maxReward = totalReward
    print(f"{step} ({400 - info['time']}s): {action} -> {reward} (total: {totalReward}, max: {maxReward})")
    env.render(mode='human')


env.close()
Example #16
class MarioEnv(Process):
    def __init__(self,
                 env_id,
                 idx,
                 child_conn,
                 queue,
                 s_dim,
                 a_dim,
                 g_net,
                 g_opt,
                 update_iter=10,
                 is_render=False,
                 use_cuda=False):
        super(MarioEnv, self).__init__()

        self.idx = idx
        self.env_id = env_id

        self.child_conn = child_conn
        self.queue = queue
        self.is_render = is_render
        # self.n_step = n_step
        self.update_iter = update_iter
        self.steps = 0
        self.episodes = 0
        self.accum_reward = 0
        self.transition = []

        self.use_cuda = use_cuda
        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        self.s_dim = s_dim
        self.a_dim = a_dim
        self.g_net = g_net
        self.g_opt = g_opt

        self.buffer_state = []
        self.buffer_action = []
        self.buffer_reward = []

    def run(self):
        super(MarioEnv, self).run()

        self.model = A3C(
            self.s_dim,
            self.a_dim,
            gamma=0.95,
            epsilon_start=1.0,
            epsilon_end=0.1,
            epsilon_length=100000,
            use_cuda=self.use_cuda,
        )
        self.model.l_net.load_state_dict(self.g_net.state_dict())

        self.env = gym_super_mario_bros.make(self.env_id)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.reset()
        print('[ Worker %2d ] ' % (self.idx), end='')
        print('Playing <', self.env_id, '>')

        while True:
            if len(self.transition) != 4:
                action = self.model.get_action(self.transition, is_random=True)
            else:
                action = self.model.get_action(self.transition,
                                               is_random=False)

            next_state, reward, done, info = self.env.step(action)
            self.steps += 1
            self.accum_reward += reward
            next_state = rgb2dataset(next_state)

            if self.is_render and self.idx == 0:
                self.env.render()

            self.buffer_state.append(self.transition)
            self.buffer_action.append(action)
            self.buffer_reward.append(reward)

            if len(self.buffer_state
                   ) > 0 and self.steps % self.update_iter == 0:
                next_transition = self.transition[1:]
                next_transition.append(next_state)

                self.train(next_transition, done)

                self.buffer_state.clear()
                self.buffer_action.clear()
                self.buffer_reward.clear()

            # make a transition
            self.transition.append(next_state)
            if len(self.transition) > 4:
                self.transition.pop(0)

            if done:
                self.send_result(info['x_pos'])
                self.reset()

    def reset(self):
        state = self.env.reset()
        state = rgb2dataset(state)
        self.transition.clear()
        self.transition.append(state)

        self.steps = 0
        self.episodes += 1
        self.accum_reward = 0

    def send_result(self, x_pos):
        self.queue.put([
            self.idx, "Result",
            [self.episodes, self.steps, self.accum_reward, x_pos]
        ])

    def train(self, next_transition, done):
        if done:
            v_s_ = 0.
        else:
            _, v = self.model.l_net.forward(
                torch.Tensor([next_transition]).to(self.device))
            v_s_ = v.cpu().detach().numpy()[0][0]

        prob, v = self.model.l_net.forward(
            torch.Tensor(self.buffer_state).to(self.device))

        buffer_v_target = []
        for r in self.buffer_reward[::-1]:
            v_s_ = r + self.model.gamma * v_s_
            buffer_v_target.append(v_s_)
        buffer_v_target.reverse()
        buffer_v_target = torch.Tensor(np.array(buffer_v_target)).to(
            self.device)
        buffer_action = torch.Tensor(np.array(self.buffer_action)).to(
            self.device)

        # Construct the loss function
        td_error = buffer_v_target - v
        loss_critic = td_error.pow(2)

        dist = torch.distributions.Categorical(prob)
        loss_actor = -dist.log_prob(buffer_action) * td_error.detach()

        loss = (loss_critic + loss_actor).mean()

        self.g_opt.zero_grad()
        loss.backward()
        for lp, gp in zip(self.model.l_net.parameters(),
                          self.g_net.parameters()):
            gp._grad = lp.grad.clone().cpu()
        self.g_opt.step()

        self.model.l_net.load_state_dict(self.g_net.state_dict())
Example #17
class MoMarioEnv(Process):
    def __init__(self, args, env_idx, child_conn, history_size=4, h=84, w=84):
        super(MoMarioEnv, self).__init__()
        self.daemon = True
        self.env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(args.env_id), SIMPLE_MOVEMENT)

        self.is_render = args.render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.n_mo = 5
        self.morall = np.zeros(self.n_mo)
        self.recent_rlist = deque(maxlen=100)
        self.recent_morlist = deque(maxlen=100)
        self.child_conn = child_conn
        self.life_done = args.life_done
        self.single_stage = args.single_stage
        self.stage_bonus = 0

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MoMarioEnv, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)

            if self.single_stage and info["flag_get"]:
                self.stage_bonus = 10000
                done = True
            # Construct the multi-objective reward ####################################
            # [x_pos, time, death, coin, enemy]
            moreward = []
            # 1. x position
            xpos_r = info["x_pos"] - self.x_pos
            self.x_pos = info["x_pos"]
            # resolve an issue where after death the x position resets
            if xpos_r < -5:
                xpos_r = 0
            moreward.append(xpos_r)

            # 2. time penalty
            time_r = info["time"] - self.time
            self.time = info["time"]
            # time is always decreasing
            if time_r > 0:
                time_r = 0
            moreward.append(time_r)

            # 3. death
            if self.lives > info['life']:
                death_r = -25
            else:
                death_r = 0
            moreward.append(death_r)

            # 4. coin
            coin_r = (info['coins'] - self.coin) * 100
            self.coin = info['coins']
            moreward.append(coin_r)

            # 5. enemy
            enemy_r = info['score'] - self.score
            if coin_r > 0 or done:
                enemy_r = 0
            self.score = info['score']
            moreward.append(enemy_r)

            ############################################################################

            if self.life_done:
                # when Mario loses life, changes the state to the terminal
                # state.
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                # normal terminal state
                force_done = done

            # reward range -15 ~ 15
            r = reward / 15
            self.rall += reward

            self.morall += np.array(moreward)
            mor = np.array(moreward) * self.n_mo / 15

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            score = info['score'] + self.stage_bonus

            if done:
                self.recent_rlist.append(self.rall)
                self.recent_morlist.append(self.morall)
                print(
                    "[Episode {}({})]\tStep: {}\tScore: {}\tMoReward: {}\tRecent MoReward: {}\tcoin: {}\tcurrent x:{}"
                    .format(self.episode, self.env_idx, self.steps,
                            score, self.morall,
                            np.mean(self.recent_morlist,
                                    axis=0), info['coins'], info['x_pos']))

                self.history = self.reset()

            self.child_conn.send(
                [self.history[:, :, :], r, force_done, done, mor, score])

    def reset(self):
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.stage_bonus = 0
        self.morall = np.zeros(self.n_mo)
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        x = np.float32(x) * (1.0 / 255.0)

        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
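
For context, a hypothetical parent-side driver for this Process-based environment could look like the sketch below; the argparse-style args fields mirror how args is used in __init__ above, but their values and the single-step interaction are assumptions.

from argparse import Namespace
from multiprocessing import Pipe

# Hypothetical arguments; the field names follow how args is accessed above.
args = Namespace(env_id='SuperMarioBros-v0', render=False,
                 life_done=True, single_stage=True)

parent_conn, child_conn = Pipe()
worker = MoMarioEnv(args, env_idx=0, child_conn=child_conn)
worker.start()

parent_conn.send(0)                                   # send an action index
history, r, force_done, done, mor, score = parent_conn.recv()
print(history.shape, r, force_done, done, mor, score)
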
Example #18
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    #movement.append(['B'])
    #movement.append(['down'])
    #movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    #channels is acting as the number of frames in history
    #if resize_height and height are different, assert final_height < resize_height and image will be cropped
    channels = 3
    frames = 4
    width = 128
    resize_height = 180
    final_height = 128
    bottom_chop = 15
    size = [channels * frames, final_height, width]

    batch_size = 16
    replay_capacity = 100000
    replay_dir = '/home-local/bayrakrg/mario_replay/'
    start_epsilon = 1.0
    stop_epsilon = 0.01
    epsilon_decay = 0.00005
    gamma = 0.75

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))
    target_model.load_state_dict(torch.load(model_file))

    lr = 0.0001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    with open(total_reward_file, 'w') as f:
        f.write('Reward\tSteps\n')

    max_steps = 500
    num_eps = 10000

    data = dataset(replay_capacity, batch_size, replay_dir, 1, size)

    tau = 0
    max_tau = 10000
    decay_step = 0

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width, 3], final_height,
                           bottom_chop)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0

        for step in range(max_steps):
            tau += 1
            decay_step += 1

            epsilon = stop_epsilon + (start_epsilon - stop_epsilon) * np.exp(
                -epsilon_decay * decay_step)

            if random.random() < epsilon:
                action = random.randint(0, len(movement) - 1)
            else:
                q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            if step == max_steps - 1:
                reward -= 10

            if reward > 0:
                reward = 1
            else:
                reward = -1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width, 3],
                                    final_height, bottom_chop)
            next_state = torch.cat((state[3:, :, :], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            train(model, device, optimizer,
                  data.get_batch(model, target_model, device, gamma))

            state = next_state

            env.render()

            if tau > max_tau:
                target_model.load_state_dict(model.state_dict())
                tau = 0

            if done:
                break

        with open(total_reward_file, 'a') as f:
            f.write('{}\t{}\n'.format(episode_reward, step))

        if episode % 5 == 0:
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)

    env.close()
Example #19
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    #movement.append(['B'])
    #movement.append(['down'])
    #movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    #channels is acting as the number of frames in history
    #if resize_height and height are different, assert final_height < resize_height and image will be cropped
    channels = 4
    # width = 84
    # resize_height = 110
    # final_height = 84
    width=128
    resize_height = 168
    final_height = 128
    size = [channels, final_height, width]

    batch_size = 16
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'

    gamma = 0.95

    start_epsilon = 0.3
    stop_epsilon = 0.01
    epsilon_decay = 0.00025

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    data_file = 'data_loader'
    model_file = 'mario_agent'
    continue_train = True
    model.load_state_dict(torch.load(model_file))

    if continue_train:
        target_model.load_state_dict(torch.load(model_file))

    lr = 0.00005
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file ='total_reward.txt'


    if not continue_train:
        with open(total_reward_file, 'w') as f:
            f.write('Reward\tSteps\n')


    max_steps = 5000
    num_eps = 5000

    if continue_train:
        with open(data_file, 'rb') as f:
            data = pickle.load(f)
            data.batch_size = batch_size
    else:
        data = dataset(replay_capacity, batch_size, replay_dir, size)

        #initialize memory with 100 experiences
        done = True
        for i in range(100):
            if done:
                state = env.reset()
                state = preprocess(state, [resize_height, width], final_height)
                state = torch.cat((state, state, state, state))

            action = random.randint(0,len(movement)-1)
            next_state, reward, done, info = env.step(int(action))

            # if reward>0:
            #     reward = 1
            # else:
            #     reward = -1
            reward /= 15
            if reward == 0:
                reward = -0.1

            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:, :, :], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)

            state = next_state


    tau = 0
    max_tau = 2000
    decay_step = 0
    farthest = 3000
    cur_x = 1

    #training loop
    for episode in range(num_eps):
        print('Episode {}'.format(episode+1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0

        episode_reward = 0

        for step in range(max_steps):
            tau += 1


            #epsilon = stop_epsilon+(start_epsilon - stop_epsilon)*np.exp(-epsilon_decay*decay_step)
            epsilon = start_epsilon * np.exp(1-(1/(cur_x/farthest)))
            if epsilon < stop_epsilon:
                epsilon = stop_epsilon

            if random.random() < epsilon:
                action = random.randint(0,len(movement)-1)
            else:
                q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            cur_x = info['x_pos']

            if cur_x > farthest:
                farthest = cur_x

            # if reward > 0:
            #     reward = 1
            # else:
            #     reward = -1

            reward /= 15
            if reward == 0:
                reward = -0.1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:,:,:], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            batch = data.get_batch(model, target_model, device, gamma)
            loss, abs_err = train(model, device, optimizer, batch)

            data.update_batch(batch['idx'], np.squeeze(torch.Tensor.numpy(abs_err)))

            state = next_state

            env.render()
            #time.sleep(0.03)

            if tau > max_tau:
                target_model.load_state_dict(model.state_dict())
                tau = 0

            if done:
                break

        decay_step += step
        with open(total_reward_file, 'a') as f:
            f.write('{}\t{}\n'.format(episode_reward, step))

        if episode % 5 == 0:
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)
            with open(data_file, 'wb') as f:
                pickle.dump(data, f)


    env.close()
Example #20
def process_image(image, x, y, h, w):
    image = image[y:y + h, x:x + w]
    image = convert_to_gray_scale(image)
    return image


# array = csv_train.get_values()
x_list = []
done = True
action = 0
for step in range(10):
    if done:
        state = env.reset()
    state, reward, done, info = env.step(1)
    env.render()
    print(info['x_pos'])
    x = info['x_pos'] - 22
    y = env.unwrapped._y_position
    print(env.unwrapped._y_position)
    print(env.unwrapped._x_position)
    h = 40  # 52
    w = 60  # 20
    print(env.observation_space.shape)
    x_list.append(info['x_pos'])
    image = env.render('rgb_array')
    image = image[y:y + h, x:x + w]
    # image = cv2.resize(image, dsize=(128, 120), interpolation=cv2.INTER_CUBIC)
    print(image.shape)
    image = convert_to_gray_scale(image)
    print(image.shape)
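
convert_to_gray_scale is not defined in this snippet; with OpenCV it is typically a one-liner, so the stand-in below is an assumption about the intended behaviour.

import cv2


def convert_to_gray_scale(image):
    # Collapse the RGB channels into a single grayscale channel.
    return cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
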
Example #21
    # Build Bellman equation for the Q function
    inputs[i:i + 1] = np.expand_dims(state, axis=0)
    targets[i] = model.predict(state)
    Q_sa = model.predict(state_new)

    if done:
        targets[i, action] = reward
    else:
        targets[i, action] = reward + gamma * np.max(Q_sa)

# Train network to output the Q function
    model.train_on_batch(inputs, targets)
print('Learning Finished')

# THIRD STEP: Play!

observation = env.reset()
obs = np.expand_dims(observation, axis=0)
state = np.stack((obs, obs), axis=1)
done = False
tot_reward = 0.0
while not done:
    env.render()  # Uncomment to see game running
    Q = model.predict(state)
    action = np.argmax(Q)
    observation, reward, done, info = env.step(action)
    obs = np.expand_dims(observation, axis=0)
    state = np.append(np.expand_dims(obs, axis=0), state[:, :1, :], axis=1)
    tot_reward += reward
print('Game ended! Total reward: {}'.format(tot_reward))
Example #22
def replay_genome(genome, movements, gen):
    env_expanded = gym_super_mario_bros.SuperMarioBrosEnv(frames_per_step=1,
                                                          rom_mode='vanilla')
    env = BinarySpaceToDiscreteSpaceEnv(env_expanded, movements)

    print('Number of genes: ', len(genome.connection_genes))
    for gene in genome.connection_genes:
        print(gene.in_node, gene.out_node, gene.weight, gene.innovation_number,
              gene.type, gene.enabled)

    done = True
    unticked = 0
    tick_interval = 1 / 30
    last_tick_time = time.time()

    fps = 0
    frames = 0
    last_fps_time = time.time()

    for _ in range(500000):

        unticked += time.time() - last_tick_time
        last_tick_time = time.time()
        ticked = False

        # while unticked >= tick_interval:
        if done:
            state = env.reset()

        state_downscaled = get_sensor_map(env_expanded)

        action = genome.calculate_action(state_downscaled)

        # print('\rFPS: {:.3f}'.format(fps), end=' ')
        # print(vectofixedstr(action, 10), end=' ')
        action = np.argmax(action)
        print('\rtaking action', movements[action], end='', flush=True)

        state, reward, done, info = env.step(action)

        #filename = get_path_of('all_pictures/mario/')
        #imsave(filename + 'mario_' + str(_) + '.png', state)

        save_state = np.full((13, 10, 3), 255, dtype=np.int)

        COLORS = [[250, 250, 250], [0, 0, 0], [196, 0, 0], [0, 0, 196]]

        for i in range(13):
            for j in range(10):
                if state_downscaled[(i, j)] == -1:
                    save_state[(i, j)] = COLORS[3]
                elif state_downscaled[(i, j)] == 0:
                    save_state[(i, j)] = COLORS[0]
                else:
                    save_state[(i, j)] = COLORS[1]

        save_state[(7, 2)] = COLORS[2]

        # filename = get_path_of('all_pictures/input_downscaled/')
        # imsave(filename + 'state_' + str(_) + '.png', save_state.astype(np.uint8))

        # make_controller(movements[action], _, gen)

        env.render()

        if info["life"] <= 2:
            died = True
            break

        ticked = True
        frames += 1
        unticked -= tick_interval

        # if ticked:
        #     now = time.time()
        #     if now - last_fps_time >= 1:
        #         fps = frames / (now - last_fps_time)
        #         last_fps_time = now
        #         frames = 0
        # else:
        #     time.sleep(0.001)

    env.close()
Example #23
class MarioEnv(Process):
    def __init__(self,
                 env_id,
                 idx,
                 child_conn,
                 queue,
                 n_step,
                 is_render=False):
        super(MarioEnv, self).__init__()

        self.idx = idx
        self.env_id = env_id

        self.child_conn = child_conn
        self.queue = queue
        self.is_render = is_render
        self.n_step = n_step
        self.steps = 0
        self.episodes = 0
        self.accum_reward = 0
        self.transition = None
        self.prev_xpos = 0
        self.prev_life = 0

    def run(self):
        super(MarioEnv, self).run()

        self.env = gym_super_mario_bros.make(self.env_id)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.reset()
        print('[ Worker %2d ] ' % (self.idx), end='')
        print('Playing <', self.env_id, '>')

        self.request_action(0, False)

        while True:
            action = self.child_conn.recv()
            #            print(SIMPLE_MOVEMENT[action])
            next_state, reward, done, info = self.env.step(action)

            force_done = False
            if reward == -15:
                force_done = True

            self.steps += 1
            self.accum_reward += reward
            next_state = rgb2dataset(next_state)

            if self.is_render and self.idx == 0:
                self.env.render()

            # make a transition
            self.transition[:3, :, :] = self.transition[1:, :, :]
            self.transition[3, :, :] = next_state

            if done:
                self.send_result(self.prev_xpos)

                self.reset()
                self.request_action(reward, force_done)
            else:
                self.request_action(reward, force_done)
            self.prev_xpos = info['x_pos']

    def reset(self):
        state = self.env.reset()
        state = rgb2dataset(state)

        self.transition = np.zeros([4, 84, 84])
        self.transition[-1, :] = state

        self.steps = 0
        self.episodes += 1
        self.accum_reward = 0

    def request_action(self, reward, done):
        self.queue.put([self.idx, "OnStep", [self.transition, reward, done]])

    def send_result(self, x_pos):
        self.queue.put([
            self.idx, "Result",
            [self.episodes, self.steps, self.accum_reward, x_pos]
        ])
Example #24
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    #movement.append(['B'])
    #movement.append(['down'])
    #movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    #channels is acting as the number of frames in history
    #if resize_height and height are different, assert final_height < resize_height and image will be cropped
    channels = 3
    frames = 4
    width = 128
    resize_height = 180
    final_height = 128
    bottom_chop = 15

    epsilon = 0.0

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)

    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))

    max_steps = 5000
    num_eps = 1

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width, 3], final_height,
                           bottom_chop)
        state = torch.cat((state, state, state, state))
        action = 0

        episode_reward = 0

        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            if reward > 0:
                reward = 1
            else:
                reward = -1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width, 3],
                                    final_height, bottom_chop)
            next_state = torch.cat((state[3:, :, :], next_state))

            state = next_state

            env.render()
            time.sleep(0.03)

            if done:
                break

    env.close()
Example #25
class Agent:
    def __init__(self, level_name):
        self.level_name = level_name
        # setup environment
        self.env = gym_super_mario_bros.make(level_name)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        # one hot encoded version of our actions
        self.possible_actions = np.array(
            np.identity(self.env.action_space.n, dtype=int).tolist())

        # reset graph
        tf.reset_default_graph()

        # instantiate the DQNetwork
        self.DQNetwork = DQNetwork(state_size, action_size, learning_rate)

        # instantiate memory
        self.memory = Memory(max_size=memory_size)

        # initialize deque with zero images
        self.stacked_frames = deque(
            [np.zeros((100, 128), dtype=np.int) for i in range(stack_size)],
            maxlen=4)

        for i in range(pretrain_length):
            # If it's the first step
            if i == 0:
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

            # Get next state, the rewards, done by taking a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)

            # stack the frames
            next_state, self.stacked_frames = stack_frames(
                self.stacked_frames, next_state, False)

            # if the episode is finished (we're dead)
            if done:
                # we finished the episode
                next_state = np.zeros(state.shape)

                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # start a new episode
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
            else:
                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # our new state is now the next_state
                state = next_state

        # saver will help us save our model
        self.saver = tf.train.Saver()

        # setup tensorboard writer
        self.writer = tf.summary.FileWriter("logs/")

        # losses
        tf.summary.scalar("Loss", self.DQNetwork.loss)

        self.write_op = tf.summary.merge_all()

    def predict_action(self, sess, explore_start, explore_stop, decay_rate,
                       decay_step, state, actions):
        # first we randomize a number
        exp_exp_tradeoff = np.random.rand()

        explore_probability = explore_stop + (
            explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

        if explore_probability > exp_exp_tradeoff:
            # make a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
        else:
            # estimate the Q values for the current state
            Qs = sess.run(self.DQNetwork.output,
                          feed_dict={
                              self.DQNetwork.inputs_:
                              state.reshape((1, *state.shape))
                          })

            # take the biggest Q value (= best action)
            choice = np.argmax(Qs)
            action = self.possible_actions[choice]

        return action, choice, explore_probability

    def play_notebook(self):
        import matplotlib.pyplot as plt
        # imports to render env to gif
        from JSAnimation.IPython_display import display_animation
        from matplotlib import animation
        from IPython.display import display

        # http://mckinziebrandon.me/TensorflowNotebooks/2016/12/21/openai.html
        def display_frames_as_gif(frames):
            """
            Displays a list of frames as a gif, with controls
            """
            #plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
            patch = plt.imshow(frames[0])
            plt.axis('off')

            def animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(),
                                           animate,
                                           frames=len(frames),
                                           interval=50)
            display(display_animation(anim, default_mode='loop'))

        frames = []
        with tf.Session() as sess:
            total_test_rewards = []

            # Load the model
            self.saver.restore(sess, "models/{0}.ckpt".format(self.level_name))

            for episode in range(1):
                total_rewards = 0

                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from the Q-network:
                    # estimate the Q values for the current state
                    Qs = sess.run(self.DQNetwork.output,
                                  feed_dict={self.DQNetwork.inputs_: state})

                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)

                    #Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    frames.append(self.env.render(mode='rgb_array'))

                    total_rewards += reward

                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break

                    next_state, self.stacked_frames = stack_frames(
                        self.stacked_frames, next_state, False)
                    state = next_state

            self.env.close()

        display_frames_as_gif(frames)

    def play(self):
        with tf.Session() as sess:
            total_test_rewards = []

            # Load the model
            self.saver.restore(sess, "models/{0}.ckpt".format(self.level_name))

            for episode in range(1):
                total_rewards = 0

                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from the Q-network:
                    # estimate the Q values for the current state
                    Qs = sess.run(self.DQNetwork.output,
                                  feed_dict={self.DQNetwork.inputs_: state})

                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)

                    #Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    self.env.render()

                    total_rewards += reward

                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break

                    next_state, self.stacked_frames = stack_frames(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()

    def train(self):
        with tf.Session() as sess:
            # initialize the variables
            sess.run(tf.global_variables_initializer())

            # initialize decay rate (that will be used to reduce epsilon)
            decay_step = 0

            # initialize loss so the logging line below is defined even if the
            # very first step of the first episode ends the episode
            loss = 0.0

            for episode in range(total_episodes):
                # set step to 0
                step = 0

                # initialize rewards of episode
                episode_rewards = []

                # make a new episode and observe the first state
                state = self.env.reset()

                # remember that stack frame function
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

                print("Episode:", episode)

                while step < max_steps:
                    step += 1
                    #print("step:", step)

                    # increase decay_step
                    decay_step += 1

                    # predict an action
                    action, choice, explore_probability = self.predict_action(
                        sess, explore_start, explore_stop, decay_rate,
                        decay_step, state, self.possible_actions)

                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)

                    if episode_render:
                        self.env.render()

                    # add the reward to total reward
                    episode_rewards.append(reward)

                    # the game is finished
                    if done:
                        print("done")
                        # the episode ends so no next state
                        next_state = np.zeros((110, 84), dtype=int)

                        next_state, self.stacked_frames = stack_frames(
                            self.stacked_frames, next_state, False)

                        # set step = max_steps to end episode
                        step = max_steps

                        # get total reward of the episode
                        total_reward = np.sum(episode_rewards)

                        print("Episode:", episode, "Total reward:",
                              total_reward, "Explore P:", explore_probability,
                              "Training Loss:", loss)

                        #rewards_list.append((episode, total_reward))

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add(
                            (state, action, reward, next_state, done))
                    else:
                        # stack frame of the next state
                        next_state, self.stacked_frames = stack_frames(
                            self.stacked_frames, next_state, False)

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add(
                            (state, action, reward, next_state, done))

                        # s_{i} := s_{i+1}
                        state = next_state

                    ### Learning part
                    # obtain random mini-batch from memory
                    batch = self.memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch],
                                              ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])

                    target_Qs_batch = []

                    # get Q values for next_state
                    Qs_next_state = sess.run(
                        self.DQNetwork.output,
                        feed_dict={self.DQNetwork.inputs_: next_states_mb})

                    # set Q_target = r if the episode ends at s_{i+1},
                    # otherwise Q_target = r + gamma * max_a' Q(s_{i+1}, a')
                    for i in range(len(batch)):
                        terminal = dones_mb[i]

                        # if we are in a terminal state, the target is just the reward
                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + gamma * np.max(
                                Qs_next_state[i])
                            target_Qs_batch.append(target)

                    targets_mb = np.array([each for each in target_Qs_batch])

                    loss, _ = sess.run(
                        [self.DQNetwork.loss, self.DQNetwork.optimizer],
                        feed_dict={
                            self.DQNetwork.inputs_: states_mb,
                            self.DQNetwork.target_Q: targets_mb,
                            self.DQNetwork.actions_: actions_mb
                        })

                    # write tf summaries
                    summary = sess.run(self.write_op,
                                       feed_dict={
                                           self.DQNetwork.inputs_: states_mb,
                                           self.DQNetwork.target_Q: targets_mb,
                                           self.DQNetwork.actions_: actions_mb
                                       })
                    self.writer.add_summary(summary, episode)
                    self.writer.flush()

                # save model every 5 episodes
                if episode % 5 == 0:
                    self.saver.save(sess,
                                    "models/{0}.cpkt".format(self.level_name))
                    print("Model Saved")
Пример #26
0
def train(num_episodes, episode_length, learning_rate, scenario="deathmatch.cfg", map_path='map02', render=True):
        # discount factor
        discount_factor = 0.99
        # note: this overrides the learning_rate argument passed in
        learning_rate = 0.01
        # how often to store experiences into the buffer
        update_frequency = 5
        store_frequency  = 50

        # how often to print output
        print_frequency = 1000

        # initialize the running total reward and total loss
        total_reward = 0
        total_loss = 0
        old_q_value = 0

        # initialize lists that store per-episode rewards and losses
        rewards = []
        losses = []


        env = gym_super_mario_bros.make('SuperMarioBros-v0')
        env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

        env.reset()
        # the DRQN output size should match the discrete action space (COMPLEX_MOVEMENT has 12 actions)
        actionDRQN = DRQN((240, 256, 3), env.action_space.n, learning_rate)
        targetDRQN = DRQN((240, 256, 3), env.action_space.n, learning_rate)

        # experience replay buffer
        experiences = ExperienceReplay(1000000)

        # saver for checkpointing the action network
        saver = tf.train.Saver({v.name: v for v in actionDRQN.parameters}, max_to_keep=1)


        # start training
        # frequencies (in frames) for sampling from and storing into the buffer
        sample = 10
        store = 100

        with tf.Session() as sess:

            # initialize all TensorFlow variables
            sess.run(tf.global_variables_initializer())
            for episode in range(num_episodes):
                # start a new episode and observe the first state
                state = env.reset()

                for frame in range(episode_length):
                    env.render()
                    # choose an action from the action network for the current frame
                    action = actionDRQN.prediction.eval(feed_dict={actionDRQN.input: state})
                    # step the environment with the chosen action
                    next_state, reward, done, info = env.step(action)
                    # update the running reward
                    total_reward += reward

                    state = next_state
                    # break when the episode is over
                    if done:
                        break
                    # store the transition in the buffer
                    if (frame % store) == 0:
                        experiences.appendToBuffer((state, action, reward))

                    # draw a sample from the buffer
                    if (frame % sample) == 0:
                        memory = experiences.sample(1)
                        mem_frame = memory[0][0]
                        mem_reward = memory[0][2]

                        # train: Q values for the current frame and the sampled frame
                        Q1 = actionDRQN.output.eval(feed_dict={actionDRQN.input: state})
                        Q2 = targetDRQN.output.eval(feed_dict={targetDRQN.input: mem_frame})

                        # current learning rate
                        learning_rate = actionDRQN.learning_rate.eval()

                        # compute the target Q value
                        Qtarget = old_q_value + learning_rate * (mem_reward + discount_factor * Q2 - old_q_value)

                        # update the old Q value
                        old_q_value = Qtarget

                        # compute the loss
                        loss = actionDRQN.loss.eval(feed_dict={actionDRQN.target_vector: Qtarget, actionDRQN.input: mem_frame})

                        # update the running loss
                        total_loss += loss

                        # update both networks
                        actionDRQN.update.run(feed_dict={actionDRQN.target_vector: Qtarget, actionDRQN.input: mem_frame})
                        targetDRQN.update.run(feed_dict={targetDRQN.target_vector: Qtarget, targetDRQN.input: mem_frame})
                        rewards.append((episode, total_reward))
                        losses.append((episode, total_loss))

                        total_reward = 0
                        total_loss = 0
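
`ExperienceReplay` is likewise not shown in this example. Judging only from the `appendToBuffer` and `sample` calls above, it is presumably a simple bounded buffer along these lines (a sketch, not the original class):

import random
from collections import deque

class ExperienceReplay:
    def __init__(self, buffer_size):
        # bounded FIFO storage: old transitions fall out once the buffer is full
        self.buffer = deque(maxlen=buffer_size)

    def appendToBuffer(self, transition):
        self.buffer.append(transition)

    def sample(self, n):
        # draw n transitions uniformly at random
        return random.sample(self.buffer, n)
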
Пример #27
0
    def run(self):
        global episode
        env = gym_super_mario_bros.make('SuperMarioBros-1-1-v3')
        env = BinarySpaceToDiscreteSpaceEnv(env, REALLY_COMPLEX_MOVEMENT)
        step = 0

        while episode < EPISODES:
            done = False

            max_x = 40
            no_progress = 0
            score = 0
            state = env.reset()

            # Build the initial history by stepping with the no-op action
            for _ in range(5):
                next_state = state
                state, _, _, _ = env.step(0)

            state = preprocess(state)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 88, 128, 4))

            while not done:
                # Rendering code
                # Seems to be causing error in Mac OS
                if self.thread_num == 1:
                    env.render()
                step += 1
                self.t += 1

                step_reward = 0

                action, policy = self.get_action(history)

                # Taking 6 steps with the selected action
                # Mimicking frame skip
                for _ in range(6):
                    next_state, reward, done, info = env.step(action)
                    score += reward
                    step_reward += reward
                    if done:
                        break

                # Kill Mario if Mario is making no progress for 10 seconds
                x_now = info.get('x_pos')
                # Handling exception x_pos = 65535
                if x_now == 65535:
                    x_now = max_x
                if max_x <= x_now:
                    max_x = x_now
                    no_progress = 0
                else:
                    no_progress += 1
                if no_progress == 150:
                    done = True
                    reward -= 1
                    step_reward -= 1
                    score -= 1
                    print("#", self.thread_num, " STUCK")

                # Preprocess the next state
                next_state = preprocess(next_state)
                next_state = np.reshape([next_state], (1, 88, 128, 1))
                next_history = np.append(next_state,
                                         history[:, :, :, :3],
                                         axis=3)

                # Average policy max value
                self.avg_p_max += np.amax(
                    self.actor.predict(np.float32(history / 255.)))

                # Appending sample
                self.append_sample(history, action, step_reward)
                history = next_history
                if self.t >= self.t_max or done:
                    #if done:
                    self.train_model(done)
                    self.update_local_model()
                    self.t = 0

                if done:
                    # Recording training information

                    episode += 1
                    print("#", self.thread_num,
                          "  episode:", episode, "  score:",
                          format(score,
                                 '.2f'), "  step:", step, "max_x :", max_x)

                    stats = [score, self.avg_p_max / float(step), step]
                    for i in range(len(stats)):
                        self.sess.run(self.update_ops[i],
                                      feed_dict={
                                          self.summary_placeholders[i]:
                                          float(stats[i])
                                      })
                    summary_str = self.sess.run(self.summary_op)
                    self.summary_writer.add_summary(summary_str, episode + 1)
                    self.avg_p_max = 0
                    self.avg_loss = 0
                    step = 0
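
`REALLY_COMPLEX_MOVEMENT` is not one of the action lists shipped with gym_super_mario_bros (those are RIGHT_ONLY, SIMPLE_MOVEMENT, and COMPLEX_MOVEMENT), so it must be a custom list defined elsewhere in this project. A plausible sketch in the same button-combination format as the built-in lists:

# hypothetical custom action set in the same format as COMPLEX_MOVEMENT
REALLY_COMPLEX_MOVEMENT = [
    ['NOOP'],
    ['right'], ['right', 'A'], ['right', 'B'], ['right', 'A', 'B'],
    ['left'], ['left', 'A'], ['left', 'B'], ['left', 'A', 'B'],
    ['down'],
    ['up'],
    ['A'], ['B'],
]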