Example #1
    def play_hill_climber(self, env, steps):
        _NOP = 0
        steps = 100
        env = JoypadSpace(env, self.actions)

        change_button_interval = 6  # every 6 steps
        actions_in_sequence = int(steps / change_button_interval) + 1

        best_action_sequence = [
            self.sample_no_start(env) for _ in range(actions_in_sequence)
        ]
        env.reset()
        best_score = self.evaluate_action_sequence(env, steps,
                                                   change_button_interval,
                                                   best_action_sequence)
        while True:
            env.reset()
            new_action_sequence = self.get_modified_actions(
                env, best_action_sequence, 0.2)
            new_score = self.evaluate_action_sequence(env, steps,
                                                      change_button_interval,
                                                      new_action_sequence)
            print('eval seq:', new_action_sequence)
            print('got score:', new_score, 'vs best score:', best_score)
            if new_score > best_score:
                best_score, best_action_sequence = new_score, new_action_sequence

        env.close()
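The helpers called here (sample_no_start, evaluate_action_sequence, get_modified_actions) are not shown in this snippet. A minimal sketch of what they might look like, inferred only from the call sites; the bodies below are assumptions, not the original project's code:

    # Hypothetical sketch of the missing helpers, assumed to be methods of the
    # same class; `random` is assumed to be imported at module level.
    def sample_no_start(self, env):
        # sample any action except index 0 (assumed to be 'start'/'NOOP' in this action list)
        action = env.action_space.sample()
        while action == 0:
            action = env.action_space.sample()
        return action

    def evaluate_action_sequence(self, env, steps, change_button_interval, action_sequence):
        # replay the sequence, holding each action for change_button_interval frames,
        # and score the run by the accumulated reward
        total_reward = 0.0
        for t in range(steps):
            _, reward, done, _ = env.step(action_sequence[t // change_button_interval])
            total_reward += reward
            if done:
                break
        return total_reward

    def get_modified_actions(self, env, action_sequence, mutation_rate):
        # copy the best sequence, resampling each entry with probability mutation_rate
        return [self.sample_no_start(env) if random.random() < mutation_rate else a
                for a in action_sequence]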
Example #2
def main():
    """
    Main entry point function for program.
    """

    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, RIGHT_ONLY)

    action_size = len(RIGHT_ONLY)
    cdqn = CDQN(action_size, memory_size=10000, image_shape=(45, 64, 1))

    batch_size = 1024
    games = 10000
    skip = 100
    beaten = False

    for game in range(games):

        print("Game: {}".format(game + 1), end=" ")
        done = True
        total_reward = 0
        for step in range(8000):

            # Preprocess first image
            if done:
                state = env.reset()
                state = preprocess_image(state)[..., tf.newaxis]

            # Play move
            action = cdqn.act(state)
            next_state, reward, done, info = env.step(action)
            total_reward += reward

            # Remember move
            next_state = preprocess_image(next_state)[..., tf.newaxis]
            cdqn.remember(state, action, total_reward, next_state, done)
            state = next_state

            # Render game
            env.render()

            if done:
                break

            # Train when there are enough examples in memory
            #if len(cdqn.memory) >= batch_size and step % skip == 0:
        print("Reward: {}".format(total_reward))

        for e in range(5):
            print('Epoch {}'.format(e + 1))
            cdqn.experience_replay(batch_size)

        if game % 10 == 0:
            cdqn.update_target_model()

        print("Reward: {}".format(total_reward))
        tf.saved_model.save(cdqn.network, "model.sav")

    env.close()
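preprocess_image is defined elsewhere in that project. Given image_shape=(45, 64, 1), a plausible sketch; the exact preprocessing is an assumption:

import cv2
import numpy as np

def preprocess_image(frame):
    # Hypothetical: grayscale, downscale to 64x45 (array shape (45, 64)), scale to [0, 1]
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    small = cv2.resize(gray, (64, 45), interpolation=cv2.INTER_AREA)
    return small.astype(np.float32) / 255.0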
Example #3
File: neat_mario.py  Project: nlopez99/AI
    def fitness_func(self, genome, config, o):
        # create the environment
        game = gym_super_mario_bros.make('SuperMarioBros-v2')
        env = JoypadSpace(game, SIMPLE_MOVEMENT)
        try:
            # reset environment and create network from config file
            state = env.reset()
            neural_net = neat.nn.recurrent.RecurrentNetwork.create(
                genome, config)
            # frame count
            i = 0
            # starting mario position
            start_mario_distance = 40
            done = False

            # get shape of pixels
            inx, iny, inc = env.observation_space.shape
            inx, iny = int(inx / 8), int(iny / 8)

            while not done:
                # env.render() uncomment this to see mario play
                # resize image array and convert to grayscale
                state = cv2.resize(state, (inx, iny))
                state = cv2.cvtColor(state, cv2.COLOR_BGR2GRAY)
                state = np.reshape(state, (inx, iny))
                # flatten array so the network likes it
                state = state.flatten()

                # feed the state through the network and get max output
                output = neural_net.activate(state)
                action = output.index(max(output))

                # do the action from the net
                observation, reward, done, info = env.step(action)
                state = observation
                # increase frame count
                i += 1

                # every 50 frames, check whether Mario has moved; break out of the loop to restart if he hasn't
                if i % 50 == 0:
                    if start_mario_distance == info['x_pos']:
                        break
                    else:
                        start_mario_distance = info['x_pos']

            # give a negative reward if Mario didn't move; otherwise reward the distance he moved
            fitness = -1 if info['x_pos'] <= 40 else info['x_pos']

            # if at the end of the level dump the current genome to file
            if fitness >= 4000:
                pickle.dump(genome, open("winning_genome.pkl", "wb"))

            # put current fitness into queue
            o.put(fitness)
            env.close()

        except KeyboardInterrupt:
            env.close()
            sys.exit()
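fitness_func receives a multiprocessing queue `o` and is meant to run in its own process per genome. A minimal sketch of a driver for it; the eval_genomes wrapper below is an assumption, not the original project's code:

    # Hypothetical driver, assumed to live on the same class as fitness_func;
    # assumes `from multiprocessing import Process, Queue` at module level.
    def eval_genomes(self, genomes, config):
        # run one process per genome and collect its fitness from the queue
        for genome_id, genome in genomes:
            queue = Queue()
            proc = Process(target=self.fitness_func, args=(genome, config, queue))
            proc.start()
            proc.join()
            genome.fitness = queue.get()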
Example #4
    def play(self):
        env = gym_tetris.make('TetrisA-v0')
        env = JoypadSpace(env, MOVEMENT)
        state = env.reset()
        model = self.global_model
        model_path = os.path.join(self.save_dir,
                                  'model_{}.h5'.format('Tetris'))
        print('Loading model from: {}'.format(model_path))
        model.load_weights(model_path)
        done = False
        step_counter = 0
        reward_sum = 0
        pieza_colocada = True
        informacion = env.get_info()
        antiguo_statistics = informacion['statistics']
        state = [0, 0, 0, 0]
        while not done:
            env.render()
            if pieza_colocada:
                pieza_colocada = False
                pos = 5
                giro = 0
                u = -1
                state = [state]
                policy, value = model(
                    tf.convert_to_tensor(state, dtype=tf.float32))
                policy = tf.nn.softmax(policy)
                action = np.argmax(policy)
                pos_objetivo = action % 10
                giro_objetivo = action // 10
            if giro_objetivo != 0 and (giro % giro_objetivo) != 0 and not done:
                state, reward, done, info = env.step(1)
                accion = 0
                giro = giro + 1
            elif pos > pos_objetivo and not done:
                state, reward, done, info = env.step(6)
                pos = pos - 1
                accion = 0
            elif pos < pos_objetivo and not done:
                state, reward, done, info = env.step(3)
                pos = pos + 1
                accion = 0
            elif not done and not pieza_colocada:
                state, reward, done, info = env.step(9)
                accion = 9
            else:
                accion = 0
            if not done:
                state, reward, done, info = env.step(accion)
            env.render()
            informacion = env.get_info()
            if antiguo_statistics != informacion['statistics']:
                antiguo_statistics = informacion['statistics']
                step_counter += 1

        env.close()
Example #5
def main():
    stats_gen = StatsGenerator(1, 'results/final_tats.txt')

    env = gym_super_mario_bros.make('SuperMarioBros-v0')

    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    alpha, gamma, epsilon = 0.1, 1, 0.3
    marioQLearner = MarioQLearner(env, alpha, gamma, epsilon, stats_gen)
    marioQLearner.action()
    env.close()
Example #6
    def run_player(self, member):
        env = gym_super_mario_bros.make(self.env)
        env = JoypadSpace(env, self.actions)
        env = WarpFrame(env)
        env = FrameStack(env, 4)
        player = MarioPlayer(self.num_of_actions, member.genes)

        if self.record:
            rec_output_path = os.path.join(
                self.current_gen_output_dir, "vid",
                "{name}.mp4".format(name=member.get_name()))
            rec = monitor.video_recorder.VideoRecorder(env,
                                                       path=rec_output_path)

        state = env.reset()
        done = False

        last_x_pos = 0
        same_x_pos_count = 0

        for step in range(self.steps_scale):
            if done:
                break
            action = player.act(state)
            state, reward, done, info = env.step(action)

            if self.record:
                rec.capture_frame()
            if self.render:
                env.render()

            player.update_info(info)
            player.update_reward(reward)
            if last_x_pos == info['x_pos']:
                same_x_pos_count += 1
            else:
                same_x_pos_count = 0
                last_x_pos = info['x_pos']
            if same_x_pos_count > self.standing_steps_limit:  # end the run if the player doesn't advance
                done = True
            if not self.allow_death and info[
                    'life'] < INITIAL_LIFE:  # a death would only repeat, so stop early
                done = True
            if info['flag_get']:  # reaching the flag ends the run
                done = True

        if self.record:
            rec.close()
        env.close()
        member.set_fitness_score(player.calculate_fitness())
        outcome = player.get_run_info()
        outcome['generation'] = self.generation
        outcome['index'] = member.get_name()
        return outcome
Example #7
def main():

    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    for e in range(100):
        state = env.reset()
        done = False

        while not done:
            env.render()
            state, reward, done, info = env.step(env.action_space.sample())

    env.close()
Example #8
def contra_game_render():
    env = gym.make('Contra-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    print("actions", env.action_space)
    print("observation_space ", env.observation_space.shape)
    done = False
    env.reset()
    for step in range(5000):
        if done:
            print("Over")
            break
        state, reward, done, info = env.step(env.action_space.sample())
        env.render()

    env.close()
Example #9
def main():
    env = gym.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    obs_shape = env.observation_space.shape
    obs_size = reduce(operator.mul, obs_shape, 1)
    action_size = env.action_space.n

    q = MLP(obs_size, action_size)
    q_target = MLP(obs_size, action_size)
    q_target.load_state_dict(q.state_dict())
    if torch.cuda.is_available():
        q = q.cuda()
        q_target = q_target.cuda()

    memory = ReplayBuffer()
    print_interval = 20
    score = 0.0
    optimizer = optim.Adam(q.parameters(), lr=learning_rate)

    for n_epi in range(10000):
        epsilon = max(0.01, 0.08 - 0.01 *
                      (n_epi / 200))  # Linear annealing from 8% to 1%
        s = env.reset()
        done = False

        while not done:
            a = q.sample_action(torch.from_numpy(np.array(s)).float(), epsilon)
            s_prime, r, done, info = env.step(a)
            done_mask = 0.0 if done else 1.0
            memory.put((s, a, r / 100.0, s_prime, done_mask))
            s = s_prime

            score += r
            if done:
                break

        if memory.size() > 2000:
            train(q, q_target, memory, optimizer)

        if n_epi % print_interval == 0 and n_epi != 0:
            q_target.load_state_dict(q.state_dict())
            print(
                "n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".
                format(n_epi, score / print_interval, memory.size(),
                       epsilon * 100))
            score = 0.0
    env.close()
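The train() function and ReplayBuffer are defined elsewhere in that script; the code follows the common minimalRL-style DQN pattern. A minimal sketch under that assumption; the hyperparameter values and the sample() return format below are assumptions:

import torch.nn.functional as F

gamma = 0.98       # assumed; the original script defines its own hyperparameters
batch_size = 32    # assumed

def train(q, q_target, memory, optimizer):
    # assumes memory.sample(batch_size) returns batched tensors (s, a, r, s_prime, done_mask)
    for _ in range(10):
        s, a, r, s_prime, done_mask = memory.sample(batch_size)
        q_a = q(s).gather(1, a)                                 # Q(s, a) for the actions taken
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)  # max_a' Q_target(s', a')
        target = r + gamma * max_q_prime * done_mask            # bootstrap only on non-terminal steps
        loss = F.smooth_l1_loss(q_a, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()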
Example #10
class agent:
    def __init__(self):
        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.size = self.env.observation_space.shape
        self.options = self.env.action_space.n
        self.baseline = 0

    def get_screen(self):
        self.env.render()

    def close(self):
        self.env.close()

    def doStep(self, a):
        sP, r, done, info = self.env.step(a)
        return r, done, sP
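A short illustrative rollout for the wrapper above; it is not from the original project and only uses the methods defined in the class plus the wrapped env's action_space:

mario = agent()
done = False
while not done:
    mario.get_screen()  # render one frame
    r, done, s = mario.doStep(mario.env.action_space.sample())
mario.close()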
Example #11
def play_random_custom(env, steps):
    _NOP = 0

    actions = [['start'], ['NOOP'], ['right', 'A'], ['left', 'A'],
               ['left', 'B'], ['right', 'B'], ['up'], ['down'], ['A'], ['B']]

    env = JoypadSpace(env, actions)

    env.reset()

    action = 0
    start = time.time()
    # play_human
    for t in range(0, steps):
        # get the mapping of keyboard keys to actions in the environment
        if hasattr(env, 'get_keys_to_action'):
            keys_to_action = env.get_keys_to_action()
        elif hasattr(env.unwrapped, 'get_keys_to_action'):
            keys_to_action = env.unwrapped.get_keys_to_action()
        else:
            raise ValueError('env has no get_keys_to_action method')

        # # change action every 6 frames
        if t % 6 == 0:
            action = env.action_space.sample()

            # after 500 timesteps, stop pressing start button
            if t > 500:
                while action == 0:
                    action = env.action_space.sample()

        observation, reward, done, info = env.step(action)
        # print("---------------------------t: ", t)
        # print("action space: ", action, env.action_space)
        # print("obs: ", observation)
        # print("reward: ", reward)
        # print("info: ", info)
        # runs game at about 60fps
        time.sleep(0.016667)
        env.render()

    end = time.time()
    env.close()
    print("time: ", (end - start), " seconds  for ", steps, "steps")
Example #12
def run_random_actions():
    """
    randomly take 1 of the 12 complex movement actions
    and print action, rewards
    """
    env = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'),
                      COMPLEX_MOVEMENT)

    done = True
    for step in range(50):
        if done:
            env.reset()
        # randomly take an action from action_space
        random_action = env.action_space.sample()
        # info returns meta-data incl. coins, life, score etc.
        # state is RGB image (240, 256, 3)
        state, reward, done, info = env.step(random_action)
        print('# {}: Action: {}, Reward: {}, Done: {}'.format(
            step, random_action, reward, done))
    env.close()
Example #13
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    env = JoypadSpace(env, USE_MOVEMENT)
    interval = 20
    q = QNetWork()
    q_target = QNetWork()
    input_shape = (batch_size, 240, 256, 3)
    q.build(input_shape=input_shape)
    q_target.build(input_shape=input_shape)
    for src, dest in zip(q.variables, q_target.variables):
        dest.assign(src)
    memory = ReplayBuffer()

    score = 0.
    optimizer = optimizers.Adam(lr=learning_rate)
    for n_epi in range(10000):
        epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200))
        s = env.reset()
        for t in range(10000):
            a = q.sample_action(s, epsilon)
            s_prime, r, done, _ = env.step(a)
            env.render()
            done_mask = 0. if done else 1.
            memory.put((s, a, r, s_prime, done_mask))
            s = s_prime
            score += r
            if done:
                break
        print("episode: {}".format(n_epi))
        if memory.size() > 100:
            train(q, q_target, memory, optimizer)
        if n_epi % interval == 0 and n_epi != 0:
            for src, dest in zip(q.variables, q_target.variables):
                dest.assign(src)  # copy Q-network weights into the target network
            print("# of episode {}, avg_score {}, buffer size {}".format(
                n_epi, score / interval, memory.size()))
            score = 0.
        if n_epi % 200 == 0 and n_epi != 0:
            q_target.network.save_weights('dqn_weights{}.ckpt'.format(int(n_epi / 200)))
    env.close()
Example #14
def eval_genome(genome):
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
    env = JoypadSpace(env, COMPLEX_MOVEMENT)

    done = False
    timeout = 100

    state = env.reset()

    rewards = 0

    while not done and timeout > 0:
        state_resized = resize(state,
                               (state.shape[0] // 8, state.shape[1] // 8),
                               anti_aliasing=False)
        state_resized = np.apply_along_axis(
            rgb2dec,
            1,
            (np.reshape(state_resized,
                        (state_resized.shape[0] * state_resized.shape[1], 3)) *
             255),
        )

        state, reward, done, info = env.step(
            np.argmax(genome.evaluate(state_resized)))

        rewards += reward

        if reward <= 0:
            timeout -= 1
        else:
            timeout += 1

        env.render()

    env.close()

    return rewards
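rgb2dec is not shown; judging by how it is applied to each (R, G, B) row, a plausible sketch (an assumption, not the original implementation):

def rgb2dec(pixel):
    # Hypothetical: pack an (R, G, B) triple into one integer so each downscaled
    # pixel becomes a single network input
    r, g, b = int(pixel[0]), int(pixel[1]), int(pixel[2])
    return (r << 16) + (g << 8) + b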
Example #15
def run(file):
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         'config-feedforward')

    genome = pickle.load(open(file, 'rb'))
    #print(genome)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v2')
    env = JoypadSpace(env, RIGHT_ONLY)

    env1 = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env1 = JoypadSpace(env1, RIGHT_ONLY)


    net = neat.nn.FeedForwardNetwork.create(genome, config)
    try:
        obs = env.reset()
        env1.reset()

        inx = int(obs.shape[0] / 8)
        iny = int(obs.shape[1] / 8)
        done = False
        while not done:
            #env.render()
            env1.render()
            obs = cv2.resize(obs, (inx, iny))
            obs = cv2.cvtColor(obs, cv2.COLOR_BGR2GRAY)
            obs = np.reshape(obs, (inx, iny))

            imgarray = np.ndarray.flatten(obs)

            actions = net.activate(imgarray)
            action = np.argmax(actions)
            
            _,_,_,info1 = env1.step(action)
            s, reward, done, info = env.step(action)
            xpos = info['x_pos']


            print(done, action, xpos)
            obs = s
        env1.close()
        env.close()
    except KeyboardInterrupt:
        env.close()
        env1.close()
        exit()
Example #16
def play_model(args):

    # if gpu is to be used
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.ngpu > 0 else "cpu")

    # Build env (first level, right only)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    # setup networks
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    args.n_actions = env.action_space.n

    target_net = DQN(screen_height, screen_width, args.n_actions).to(device)

    if args.targetNet:
        target_net.load_state_dict(
            torch.load(args.targetNet, map_location=device))

    with torch.no_grad():
        i = 0
        observation = env.reset()
        while i < 5000:
            env.render()
            state = get_screen(env, device)
            action = int(target_net(state).max(1)[1].view(1, 1))
            observation, reward, done, info = env.step(action)

            if done:
                break
            i += 1

    env.close()
Example #17
class Environment():
    def __init__(self, input_mode=RIGHT_ONLY, level_mode=0):
        # inputMode = RIGHT_ONLY, SIMPLE_MOVEMENT, or COMPLEX_MOVEMENT
        # levelMode = SuperMarioBros-vX
        self.env = gym_super_mario_bros.make(f"SuperMarioBros-v{level_mode}")
        self.env = JoypadSpace(self.env, input_mode)
        self.env.reset()

    
    # Performs action and returns result
    def input_action(self, action):
        return self.env.step(action)

    # Renders the environment
    def render(self):
        self.env.render()

    # Resets the environment
    def reset(self):
        self.env.reset()
   
    # Closes the environment
    def close(self):
        self.env.close()
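A short usage sketch for the Environment wrapper above; the random-action rollout is illustrative, not from the original project. The wrapper does not expose action_space, so the sample is drawn from the wrapped JoypadSpace env directly:

from gym_super_mario_bros.actions import RIGHT_ONLY

wrapper = Environment(input_mode=RIGHT_ONLY, level_mode=0)
for _ in range(100):
    state, reward, done, info = wrapper.input_action(wrapper.env.action_space.sample())
    wrapper.render()
    if done:
        wrapper.reset()
wrapper.close()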
Example #18
        def play_random_custom(env, steps):
            _NOP = 0

            env = JoypadSpace(env, actions)

            env.reset()

            action = 0
            start = time.time()

            if SHOULD_TRAIN:

                init_screen = get_screen()
                _, _, screen_height, screen_width = init_screen.shape

                # INIT Neural Network
                policy = Policy(screen_height, screen_width, len(actions))

                if SHOULD_LOAD_STATE:
                    print("Loading model from: ", DATA_PATH)
                    policy.load_state_dict(torch.load(DATA_PATH))

                optimizer = optim.Adam(policy.parameters(), lr=1e-2)
                eps = np.finfo(np.float32).eps.item()

                # Helper functions
                def select_action(state):
                    global steps_done
                    sample = random.random()
                    eps_threshold = reward_threshold
                    # eps_threshold = EPS_END + (EPS_START - EPS_END) * \
                    #     math.exp(-1. * steps_done / EPS_DECAY)
                    steps_done += 1
                    if sample > eps_threshold:
                        with torch.no_grad():
                            # t.max(1) will return largest column value of each row.
                            # second column on max result is index of where max element was
                            # found, so we pick action with the larger expected reward.
                            return policy(state).max(1)[1].view(1, 1)
                    else:
                        return torch.tensor([[random.randrange(len(actions))]],
                                            device=device,
                                            dtype=torch.long)

                def finish_episode():
                    R = 0
                    policy_loss = []
                    returns = []
                    for r in policy.rewards[::-1]:
                        R = r + GAMMA * R
                        returns.insert(0, R)
                    returns = torch.tensor(returns)
                    returns = (returns - returns.mean()) / \
                        (returns.std() + eps)
                    for log_prob, R in zip(policy.saved_log_probs, returns):
                        policy_loss.append(-log_prob * R)
                    optimizer.zero_grad()
                    print("POLICY LOSS: ", policy_loss)
                    # policy_loss = torch.cat(policy_loss).sum()
                    # policy_loss.backward()
                    optimizer.step()
                    torch.save(policy.state_dict(), DATA_PATH)
                    del policy.rewards[:]
                    del policy.saved_log_probs[:]

                running_reward = 10
                for i_episode in count(1):
                    print("Episode: ", i_episode)
                    state, ep_reward = env.reset(), 0
                    # Don't infinite loop while learning
                    for t in range(1, num_steps_per_episode):
                        action = select_action(state).data.cpu().numpy()[0][0]
                        # print("ACTION:", action)
                        state, reward, done, info = env.step(action)
                        if SHOULD_RENDER:
                            env.render()
                        policy.rewards.append(reward)
                        ep_reward += reward
                        if done:
                            break

                    running_reward = 0.05 * ep_reward + \
                        (1 - 0.05) * running_reward
                    finish_episode()
                    if i_episode % log_interval == 0:
                        print(
                            'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'
                            .format(i_episode, ep_reward, running_reward))
                    print("Running reward: ", running_reward)
                    if running_reward > reward_threshold:
                        print("Solved! Running reward is now {} and "
                              "the last episode runs to {} time steps!".format(
                                  running_reward, t))
                        break
            else:
                # PLAY RANDOMLY
                for t in range(0, steps):
                    # get the mapping of keyboard keys to actions in the environment
                    if hasattr(env, 'get_keys_to_action'):
                        keys_to_action = env.get_keys_to_action()
                    elif hasattr(env.unwrapped, 'get_keys_to_action'):
                        keys_to_action = env.unwrapped.get_keys_to_action()
                    else:
                        raise ValueError(
                            'env has no get_keys_to_action method')

                    # # change action every 6 frames
                    if t % 6 == 0:
                        action = env.action_space.sample()

                        # after 500 timesteps, stop pressing start button
                        if t > 500:
                            while action == 0:
                                action = env.action_space.sample()

                    observation, reward, done, info = env.step(action)
                    print("---------------------------t: ", t)
                    print("action space: ", action, env.action_space)
                    print("obs: ", observation.shape)
                    print("reward: ", reward)
                    print("info: ", info)
                    # runs game at about 60fps
                    time.sleep(0.016667)
                    env.render()

            end = time.time()
            env.close()
            print("time: ", (end - start), " seconds  for ", steps, "steps")
Example #19
from nes_py.wrappers import JoypadSpace
from my_gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from my_gym_super_mario_bros import make
env = make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)


# from nes_py.wrappers import JoypadSpace
# import gym_super_mario_bros
# from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
# env = gym_super_mario_bros.make('SuperMarioBros-v0')
# env = JoypadSpace(env, SIMPLE_MOVEMENT)


done = True
for step in range(100):
    if done:
        state = env.reset()
    cur_act = env.action_space.sample()
    state, reward, done, info = env.step(cur_act)
    print("Current Step State:",state.shape)
    print("Current Step Reward:",reward)
    # env.render()

env.close()
Example #20
            t.start()
            time.sleep(0.5)
        try:
            [t.join() for t in threads]  # wait for threads to finish
        except KeyboardInterrupt:
            print("Exiting threads!")

    def save_weights(self):
        print("Saving Weights")
        self.global_network.save_weights("A3CMarioWeights.h5")

    def restore_weights(self):
        print("Restoring Weights!")
        self.global_network.load_weights("A3CMarioWeights.h5")


test_env = gym_super_mario_bros.make(env_name)
test_env = JoypadSpace(test_env, SIMPLE_MOVEMENT)
test_env = atari_wrapper.wrap_dqn(test_env)

NUM_ACTIONS = test_env.action_space.n
OBS_SPACE = test_env.observation_space.shape[0]

state = test_env.reset()
state = np.expand_dims(state, axis=0)

stats = Stats()
agent = A3CAgent()
agent.start_threads()
test_env.close()
Example #21
class Agent:
    def __init__(self, level_name):  
        self.level_name = level_name  
        # setup environment
        self.env = gym_super_mario_bros.make(level_name)
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        # one hot encoded version of our actions
        self.possible_actions = np.array(np.identity(self.env.action_space.n, dtype=int).tolist())

        # reset graph
        tf.reset_default_graph()
        
        # instantiate the DQNetwork
        self.DQNetwork = DQNetwork(state_size, action_size, learning_rate)
        
        # instantiate memory
        self.memory = Memory(max_size=memory_size)
        
        # initialize deque with zero images
        self.stacked_frames = deque([np.zeros((100, 128), dtype=np.int) for i in range(stack_size)], maxlen=4)

        for i in range(pretrain_length):    
            # If it's the first step
            if i == 0:
                state = self.env.reset()        
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

            # Get next state, the rewards, done by taking a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)

            # stack the frames
            next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

            # if the episode is finished (we're dead)
            if done:
                # we finished the episode
                next_state = np.zeros(state.shape)

                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # start a new episode
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)
            else:
                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # our new state is now the next_state
                state = next_state
       
        # saver will help us save our model
        self.saver = tf.train.Saver()

        # setup tensorboard writer
        self.writer = tf.summary.FileWriter("logs/")

        # losses
        tf.summary.scalar("Loss", self.DQNetwork.loss)
        
        self.write_op = tf.summary.merge_all()
    
    def predict_action(self, sess, explore_start, explore_stop, decay_rate, decay_step, state, actions):
        # first we randomize a number
        exp_exp_tradeoff = np.random.rand()

        explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

        if explore_probability > exp_exp_tradeoff:
            # make a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
        else:
            # estimate the Qs values state
            Qs = sess.run(self.DQNetwork.output, feed_dict={self.DQNetwork.inputs_: state.reshape((1, *state.shape))})

            # take the biggest Q value (= best action)
            choice = np.argmax(Qs)
            action = self.possible_actions[choice]

        return action, choice, explore_probability
    
    def play_notebook(self):
        import matplotlib.pyplot as plt
        # imports to render env to gif
        from JSAnimation.IPython_display import display_animation
        from matplotlib import animation
        from IPython.display import display

        # http://mckinziebrandon.me/TensorflowNotebooks/2016/12/21/openai.html
        def display_frames_as_gif(frames):
            """
            Displays a list of frames as a gif, with controls
            """
            #plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
            patch = plt.imshow(frames[0])
            plt.axis('off')

            def animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50)
            display(display_animation(anim, default_mode='loop'))

        frames = []
        with tf.Session() as sess:
            total_test_rewards = []

            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))

            for episode in range(1):
                total_rewards = 0

                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from Q-network 
                    # Estimate the Qs values state
                    Qs = sess.run(self.DQNetwork.output, feed_dict = {self.DQNetwork.inputs_: state})

                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)

                    #Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    frames.append(self.env.render(mode = 'rgb_array'))

                    total_rewards += reward

                    if done:
                        print ("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break


                    next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)
                    state = next_state

            self.env.close()

        display_frames_as_gif(frames)
        
    def play(self):
        with tf.Session() as sess:
            total_test_rewards = []

            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))

            #self.env = wrap_env(self.env)

            for episode in range(1):
                total_rewards = 0

                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from Q-network 
                    # Estimate the Qs values state
                    Qs = sess.run(self.DQNetwork.output, feed_dict = {self.DQNetwork.inputs_: state})

                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)

                    #Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    self.env.render()

                    total_rewards += reward

                    if done:
                        print ("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break

                    next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()
    
    def train(self):        
        with tf.Session() as sess:
            # initialize the variables
            sess.run(tf.global_variables_initializer())

            # initialize decay rate (that will be used to reduce epsilon)
            decay_step = 0

            for episode in range(total_episodes):
                # set step to 0
                step = 0

                # initialize rewards of episode
                episode_rewards = []

                # make a new episode and observe the first state
                state = self.env.reset()

                # remember that stack frame function
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

                print("Episode:", episode)

                while step < max_steps:
                    step += 1
                    #print("step:", step)

                    # increase decay_step
                    decay_step += 1

                    # predict an action
                    action, choice, explore_probability = self.predict_action(sess,
                                                         explore_start, 
                                                         explore_stop, 
                                                         decay_rate, 
                                                         decay_step, 
                                                         state, 
                                                         self.possible_actions)

                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)

                    if episode_render:
                        self.env.render()

                    # add the reward to total reward
                    episode_rewards.append(reward)

                    # the game is finished
                    if done:
                        print("done")
                        # the episode ends so no next state
                        next_state = np.zeros((110, 84), dtype=np.int)

                        next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

                        # set step = max_steps to end episode
                        step = max_steps

                        # get total reward of the episode
                        total_reward = np.sum(episode_rewards)

                        print("Episode:", episode, 
                              "Total reward:", total_reward, 
                              "Explore P:", explore_probability, 
                              "Training Loss:", loss)

                        #rewards_list.append((episode, total_reward))

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))
                    else:
                        # stack frame of the next state
                        next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))

                        # s_{i} := s_{i+1}
                        state = next_state

                    ### Learning part
                    # obtain random mini-batch from memory
                    batch = self.memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])

                    target_Qs_batch = []

                    # get Q values for next_state
                    Qs_next_state = sess.run(self.DQNetwork.output, feed_dict={self.DQNetwork.inputs_: next_states_mb})

                    # set Q_target = r if the episode ends at s_{i+1}
                    for i in range(len(batch)):
                        terminal = dones_mb[i]

                        # if s_{i+1} is terminal, the target is just the reward
                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                            target_Qs_batch.append(target)

                    targets_mb = np.array([each for each in target_Qs_batch])

                    loss, _ = sess.run([self.DQNetwork.loss, self.DQNetwork.optimizer],
                                      feed_dict={self.DQNetwork.inputs_: states_mb, 
                                                 self.DQNetwork.target_Q: targets_mb, 
                                                 self.DQNetwork.actions_: actions_mb})

                    # write tf summaries
                    summary = sess.run(self.write_op, feed_dict={self.DQNetwork.inputs_: states_mb, 
                                                 self.DQNetwork.target_Q: targets_mb, 
                                                 self.DQNetwork.actions_: actions_mb})
                    self.writer.add_summary(summary, episode)
                    self.writer.flush()

                # save model every 5 episodes
                if episode % 5 == 0:
                    self.saver.save(sess, "models/{0}.cpkt".format(self.level_name))
                    print("Model Saved")
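Both this Agent and the rewritten one in Example #24 rely on a frame-stacking helper (called stack_frames here, stack_frame there) that is not shown. A minimal sketch of the usual pattern, sized to match the (100, 128) zero-initialised deque above; the preprocessing details are assumptions:

import numpy as np
from collections import deque
from skimage import transform
from skimage.color import rgb2gray

stack_size = 4  # matches the maxlen=4 deque in __init__

def preprocess_frame(frame):
    # grayscale and resize to (100, 128); the real project may crop or normalise differently
    return transform.resize(rgb2gray(frame), (100, 128))

def stack_frames(stacked_frames, state, is_new_episode):
    # keep the last `stack_size` preprocessed frames and stack them channel-wise,
    # returning (stacked_state, stacked_frames) as the callers above expect
    frame = preprocess_frame(state)
    if is_new_episode:
        stacked_frames = deque([frame for _ in range(stack_size)], maxlen=stack_size)
    else:
        stacked_frames.append(frame)
    stacked_state = np.stack(stacked_frames, axis=2)  # shape (100, 128, 4)
    return stacked_state, stacked_frames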
Example #22
def train_agent(args):
    # if gpu is to be used
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.ngpu > 0 else "cpu")

    # Build env (first level, right only)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    # setup networks
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    args.n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    target_net = DQN(screen_height, screen_width, args.n_actions).to(device)

    if args.targetNet:
        target_net.load_state_dict(
            torch.load(args.targetNet, map_location=device))

    if args.policyNet:
        policy_net.load_state_dict(
            torch.load(args.policyNet, map_location=device))

    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)

    args.steps_done = 0

    num_episodes = 1

    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, device)
        current_screen = get_screen(env, device)
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, policy_net, args, device)
            _, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env, device)
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the target network)
            optimize_model(optimizer, memory, policy_net, target_net, args,
                           device)
            if done:
                episode_durations.append(t + 1)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % args.target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())
            torch.save(policy_net.state_dict(), args.output_policyNet)
            torch.save(target_net.state_dict(), args.output_targetNet)

        if i_episode % 10 == 0:
            print(f'{i_episode+1}/{num_episodes}: Completed Episode.')

    print('Complete')
    env.close()

    torch.save(policy_net.state_dict(), args.output_policyNet)
    torch.save(target_net.state_dict(), args.output_targetNet)
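Both play_model (Example #16) and train_agent above call a get_screen(env, device) helper that is not shown. A minimal sketch in the style of the PyTorch DQN tutorial; the resize target and transform pipeline are assumptions:

import numpy as np
import torch
import torchvision.transforms as T

# assumed transform pipeline; the original may crop or scale differently
resize = T.Compose([T.ToPILImage(), T.Resize(40), T.ToTensor()])

def get_screen(env, device):
    # grab the rendered frame, convert HWC uint8 -> CHW float in [0, 1], downscale,
    # and add a batch dimension: returns a (1, C, H, W) tensor on `device`
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    return resize(screen).unsqueeze(0).to(device)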
Example #23
class NesEnv():
    def __init__(self, env, seed, max_episode_length, action_repeat,
                 bit_depth, args):
        from nes_py.wrappers import JoypadSpace
        import gym_tetris
        from gym_tetris.actions import SIMPLE_MOVEMENT

        self._env = gym_tetris.make(env,skip_level=True)
        self._env.seed(seed)
        self._env = JoypadSpace(self._env, SIMPLE_MOVEMENT)
        self.max_episode_length = max_episode_length
        self.action_repeat = action_repeat
        self.bit_depth = bit_depth
        self.small_image = args.small_image
        self.add_reward = args.add_reward
        self.typeb = "1" in env
        self.acc = 0.03 if self.typeb else 3
        self.living = 0.003 if self.typeb else 0.3
        self.dim = 1 if args.binary_image else 3
        if args.binary_image:
            self._process_obs = _images_to_observation_binary
        else:
            self._process_obs = _images_to_observation
        self.one_skip = False
        if not args.add_reward:
            self.acc = 0
            self.living = 0

    def reset(self):
        self.t = 0  # Reset internal timer
        state = self._env.reset()
        # hack the memory of the nes env, setting level to 29
        # self._env.ram[0x0064]=29
        # skip some frames
        for i in range(85):
            state,r,d,i=self._env.step(0)
        # print(self.observation_size)
        observation = self._process_obs(state, self.bit_depth,
                                             self.observation_size)  # NxCxHxW
        return observation

    def step(self, action):
        action = action.argmax().item()  # convert onehot action to int
        reward = 0
        state, done = None, None
        total = 3 if self._env.ram[0x0068] < 2 else 1
        for k in range(total):
            # print(f"Timer: {self._env.ram[0x0065]},State {self._env.ram[0x0068]}")
            state, reward_k, done, info = self._env.step(action if k==0 else 0)
            reward += reward_k
            self.t += 1  # Increment internal timer
            done = done or self.t == self.max_episode_length
            if done:
                break
        flag = False
        while self._env.ram[0x0065] > 0 and self._env.ram[0x0068] >= 2 and not done:
            flag = True
            o, r, d, info = self._env.step(0)
            reward += r
            done = d or done
        if flag and self.one_skip:
            o, r, d, info = self._env.step(0)
            reward += r
            done = d or done
            state = o
        if flag:
            reward += self.acc
            if info['board_height'] > 10:
                reward -= self.acc
        reward += self.living
        observation = self._process_obs(state, self.bit_depth,
                                             self.observation_size)
        return observation, reward, done

    def render(self):
        self._env.render()

    def close(self):
        self._env.close()

    @property
    def observation_size(self):
        # self._env.observation_space.shape: H x W x C (240x256x3)
        return (self.dim, 96, 96) if self.small_image else (self.dim, 128, 128)  # C x H x W
        # return (3, 120, 128) # C x H x W # TODO: Lixin

    @property
    def action_size(self):
        return self._env.action_space.n

    def sample_random_action(self):
        indices = torch.tensor(self._env.action_space.sample())
        return F.one_hot(indices, self.action_size).float()
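_images_to_observation and _images_to_observation_binary are defined elsewhere in that project. A rough sketch of what the non-binary version might do, following the PlaNet-style preprocessing this class appears to assume; every detail below is a guess from the call sites:

import cv2
import torch

def _images_to_observation(image, bit_depth, observation_size):
    # Hypothetical: resize the raw NES frame to the configured (H, W), quantise to
    # `bit_depth` bits per channel, centre around zero, return a (1, C, H, W) float tensor
    channels, height, width = observation_size
    frame = cv2.resize(image, (width, height), interpolation=cv2.INTER_LINEAR)
    obs = torch.tensor(frame, dtype=torch.float32).permute(2, 0, 1)
    obs = obs.div_(2 ** (8 - bit_depth)).floor_().div_(2 ** bit_depth).sub_(0.5)
    return obs.unsqueeze(0)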
Example #24
class Agent:
    def __init__(self, level_name):
        self.level_name = level_name
        self.env = gym_super_mario_bros.make(level_name)
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.possible_actions = np.array(
            np.identity(self.env.action_space.n, dtype=int).tolist())

        tf.compat.v1.reset_default_graph()

        self.DQNet = DQNet(state_size, action_size, learning_rate)
        self.memory = Memory(max_size=memory_size)
        self.stacked_frames = deque(
            [np.zeros((100, 128), dtype=np.int) for i in range(stack_size)],
            maxlen=4)

        for i in range(pretrain_length):
            if i == 0:
                state = self.env.reset()
                state, self.stacked_frames = stack_frame(
                    self.stacked_frames, state, True)

            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)

            next_state, self.stacked_frames = stack_frame(
                self.stacked_frames, next_state, False)

            if done:
                next_state = np.zeros(state.shape)
                self.memory.add((state, action, reward, next_state, done))
                state = self.env.reset()
                state, self.stacked_frames = stack_frame(
                    self.stacked_frames, state, True)
            else:
                self.memory.add((state, action, reward, next_state, done))
                state = next_state

        self.saver = tf.compat.v1.train.Saver()
        self.writer = tf.compat.v1.summary.FileWriter("logs/")
        tf.summary.scalar("Loss", self.DQNet.loss)
        self.write_op = tf.compat.v1.summary.merge_all()

    def predict_action(
        self,
        sess,
        explore_start,
        explore_stop,
        decay_rate,
        decay_step,
        state,
        actions,
    ):
        exp_exp_tradeoff = np.random.rand()

        explore_probs = explore_stop + (explore_start - explore_stop) * np.exp(
            -decay_rate * decay_step)

        if explore_probs > exp_exp_tradeoff:
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]

        else:
            QS = sess.run(self.DQNet.output,
                          feed_dict={
                              self.DQNet.inputs: state.reshape(
                                  (1, *state.shape))
                          })
            choice = np.argmax(QS)
            action = self.possible_actions[choice]

        return action, choice, explore_probs

    def play_note(self):
        import matplotlib.pyplot as plt
        from JSAnimation.IPython_display import display_animation
        from matplotlib import animation
        from IPython.display import display

        def display_frame_gif(frames):
            patch = plt.imshow(frames[0])
            plt.axis('off')

            def animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(),
                                           animate,
                                           frames=len(frames),
                                           interval=50)
            display(display_animation(anim, default_mode='loop'))

        frames = []
        with tf.compat.v1.Session() as sess:
            total_test_rewards = []

            self.saver.restore(sess, "model/{0}.cpkt".format(self.level_name))

            for episode in range(1):
                total_rewards = 0
                state = self.env.reset()
                state, self.stacked_frames = stack_frame(
                    self.stacked_frames, state, True)
                print("*************************************")
                print('EPISODE', episode)

                while True:
                    state = state.reshape((1, *state_size))
                    QS = sess.run(self.DQNet.output,
                                  feed_dict={self.DQNet.inputs: state})
                    choice = np.argmax(QS)
                    next_state, reward, done, _ = self.env.step(choice)
                    frames.append(self.env.render(mode='rgb_array'))
                    total_rewards += reward

                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break
                    next_state, self.stacked_frames = stack_frame(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()

    def play(self):
        with tf.compat.v1.Session() as sess:
            total_test_rewards = []
            self.saver.restore(sess, "model/{0}.cpkt".format(self.level_name))
            for episode in range(1):
                total_rewards = 0
                state = self.env.reset()
                state, self.stacked_frames = stack_frame(
                    self.stacked_frames, state, True)
                print("*************************************")
                print('EPISODE', episode)

                while True:
                    state = state.reshape((1, *state_size))
                    QS = sess.run(self.DQNet.output,
                                  feed_dict={self.DQNet.inputs: state})
                    choice = np.argmax(QS)
                    next_state, reward, done, _ = self.env.step(choice)
                    self.env.render()

                    total_rewards += reward

                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break

                    next_state, self.stacked_frames = stack_frame(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()

    def train(self):
        with tf.compat.v1.Session() as sess:
            sess.run(tf.compat.v1.global_variables_initializer())
            decay_step = 0

            for episode in range(total_episodes):
                step = 0
                episodes_rewards = []
                state = self.env.reset()
                state, self.stacked_frames = stack_frame(
                    self.stacked_frames, state, True)
                print("EPISODE", episode)

                while step < max_steps:
                    step += 1

                    decay_step += 1
                    action, choice, explore_probs = self.predict_action(
                        sess, explore_start, explore_stop, decay_rate,
                        decay_step, state, self.possible_actions)

                    next_state, reward, done, _ = self.env.step(choice)

                    if episode_render:
                        self.env.render()

                    episodes_rewards.append(reward)

                    if done:
                        print('done')

                        next_state = np.zeros((100, 128), dtype=np.int)

                        next_state, self.stacked_frames = stack_frame(
                            self.stacked_frames, next_state, False)

                        step = max_steps

                        total_rewards = np.sum(episodes_rewards)

                        print('Episode: {}'.format(episode),
                              'Total reward: {}'.format(total_rewards),
                              'Explore P: {:.4f}'.format(explore_probs),
                              'Training Loss {:.4f}'.format(loss))

                        self.memory.add(
                            (state, action, reward, next_state, done))

                    else:
                        next_state, self.stacked_frames = stack_frame(
                            self.stacked_frames, next_state, False)
                        self.memory.add(
                            (state, action, reward, next_state, done))
                        state = next_state

                    batch = self.memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_state_mb = np.array([each[3] for each in batch],
                                             ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])

                    target_Qs_batch = []

                    Qs_next_state = sess.run(
                        self.DQNet.output,
                        feed_dict={self.DQNet.inputs: next_state_mb})

                    for i in range(len(batch)):
                        terminal = dones_mb[i]

                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                            target_Qs_batch.append(target)

                    target_mb = np.array([each for each in target_Qs_batch])

                    loss, _ = sess.run(
                        [self.DQNet.loss, self.DQNet.optimizer],
                        feed_dict={
                            self.DQNet.inputs: states_mb,
                            self.DQNet.target_q: target_mb,
                            self.DQNet.action: actions_mb
                        })

                    summary = sess.run(self.write_op,
                                       feed_dict={
                                           self.DQNet.inputs: states_mb,
                                           self.DQNet.target_q: target_mb,
                                           self.DQNet.action: actions_mb
                                       })

                    self.writer.add_summary(summary, episode)
                    self.writer.flush()

                if episode % 5 == 0:
                    self.saver.save(sess,
                                    "models/{0}.cpkt".format(self.level_name))
                    print("model Saved")
示例#25
0
class SimpleMario():
    def __init__(self):
        self.BASE_DIR = os.getcwd()
        self.SAVE_DESTINALTION = os.path.join(self.BASE_DIR, "saved_model")

        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)

        self.valid_move_indx = set(range(7))

        self.fresh_start()

    def get_action_set(self) -> dict:
        """Dict of all possible actions.
        - key (int): input index for the action
        - value (str): human-readable description of the action

        Returns:
            dict: mapping of action index to description
        """
        # ['NOOP', 'right', 'right A', 'right B', 'right A B', 'A', 'left']
        return {
            0: "stay",
            1: "forward",
            2: "forward + A",
            3: "forward + B",
            4: "forward + A + B",
            5: "jump",
            6: "backward"
        }

    def close_env(self):
        """
        Releases the in-memory resources held by the environment.
        - Always clean up once you are done.
        """
        self.env.close()

    def fresh_start(self):
        """
        Reset the environment whenever you need a fresh start,
        e.g. after dying or to restart from the beginning.
        """
        self.env.reset()

    def get_env_state(self) -> dict:
        """Get the current environment details.

        {
            'state': env.state,
            'reward': env.reward,
            'isdead': env.isdead,
            'info': env.info,
        }

        Note: this advances the environment by one frame with a NOOP action.

        Returns:
            dict: environment details
        """
        respTuple = self.env.step(0)  # 0 == NOOP
        respData = dict()
        respData['state'] = respTuple[0]
        respData['reward'] = respTuple[1]
        respData['isdead'] = respTuple[2]
        respData['info'] = respTuple[3]
        return respData

    # TODO: make it happen
    def get_miv_env(self):
        """ Still in progress """
        envConfig = self.get_env_state()
        envState = envConfig['state']
        envState = [np.argmax(pixel) for pixel in envState]
        return envState

    def make_move(self, move):
        """Returns a null result if the input is invalid.
        Valid inputs are the keys of self.get_action_set().

        Args:
            move (int): move index

        Returns:
            tuple:
                - (numpy.ndarray) the state resulting from the action
                - (float) the reward achieved by taking the action
                - (bool) a flag denoting whether the episode has ended
                - (dict) a dictionary of extra information
        """
        if move in self.valid_move_indx:
            return self.env.step(move)
        return (None, None, True, None)

    def play_game(self, moveCount: int = 0):
        """Play the game via a simple console interface.

        - After playing, remember to reset the environment.

        Console inputs (int):
            "Enter your move": a value in range(0, 7)
            "Do you want to restart": {
                -1: do nothing,
                 0: shut down the preview,
                 any other value: reset and restart
            }

        Args:
            moveCount (int, optional):
                number of moves to play; each move is held for 30 frames.
                Defaults to 0.
        """
        self.fresh_start()
        self.env.render()
        for cou in range(moveCount):
            move_indx = int(input("Enter your move: "))
            for _ in range(30):
                state, reward, done, info = self.make_move(move_indx)
                self.env.render()

            if done:
                self.fresh_start()
                restart = int(input("Do you want to restart: "))
                if (restart == -1):
                    pass
                elif (restart == 0):
                    break

    def generate_random_file_name(self):
        if not os.path.isdir(self.SAVE_DESTINALTION):
            os.makedirs(self.SAVE_DESTINALTION)
        fileContentCount = len(os.listdir(self.SAVE_DESTINALTION))
        return f"MarioEnv{fileContentCount}.npy"

    def save_env(self, destination: str = ""):
        """ Dont Really need it """
        if destination == "":
            if not os.path.isdir(self.SAVE_DESTINALTION):
                os.makedirs(self.SAVE_DESTINALTION)
        else:
            self.SAVE_DESTINALTION = destination

        fileDestination = os.path.join(self.SAVE_DESTINALTION,
                                       self.generate_random_file_name())

        fileContent = self.get_env_state()

        with open(fileDestination, 'w') as savedModel:
            np.save(fileDestination,
                    np.array(list(fileContent.items()), dtype=object))

    def load_env(self, fileLocation: str = ""):
        """ Dont really need it """
        if (fileLocation == ""):
            raise Exception("File Location Not Provided")
        elif not os.path.isfile(fileLocation):
            raise Exception("Invalid File Location")

        fileContent = dict()

        with open(fileLocation, 'rb') as fileObj:
            fileListContent = np.load(fileObj, allow_pickle=True)

        for item in fileListContent:
            fileContent[item[0]] = item[1]

        return fileContent
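A minimal usage sketch for the SimpleMario wrapper above; the five-move session and the save call are purely illustrative:

if __name__ == "__main__":
    mario = SimpleMario()
    print(mario.get_action_set())    # list the available moves
    mario.play_game(moveCount=5)     # play five interactive moves from the console
    mario.save_env()                 # snapshot the current environment state to disk
    mario.close_env()                # always release the environment when done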
示例#26
0
def train_model(parameters):
    #Initialization of environment and agent
    env = gym_super_mario_bros.make(parameters['environment'])
    env = JoypadSpace(env, RIGHT_ONLY)
    env = wrapper(env)

    states = (84, 84, 4)
    actions = env.action_space.n
    
    agent = DDQNagent(parameters, states, actions)

    if parameters['train']:
        #TENSORBOARD
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")    
        log_dir = 'logs/mario/' + current_time + '/10k'    
        summary_writer = tf.summary.create_file_writer(log_dir)    
        summary_writer.set_as_default()
        
        
        maxXpos = 0    # Maximum X position of the Agent
        max_reward = 0    # Maximum reward
        start_time = time.time()    # Start time
        
        #Initialization of variables for plots
        graph_reward = np.zeros(parameters['episodes_to_play'])    # Reward
        graph_pos = np.zeros(parameters['episodes_to_play'])    # Position
        graph_mean_reward = np.zeros(parameters['episodes_to_play'])    # Mean reward
    
    
        episodes = parameters['episodes_to_play']    # Number of episodes to train
        rewards = []    # Rewards array
        
        
        start = time.time()    # Time for calculating processed frames per second
        step = 0    # Total steps
        
        #Learning cycle
        for e in range(episodes):
    
            #Default state of the environment
            state = env.reset()
        
            total_reward = 0    # Reward gained in the current episode
            iter = 0
        
            while True:
                #Select an action
                action = agent.run(state)
        
                #Apply action to environment
                next_state, reward, done, info = env.step(action)
    
                #Write new data to memory
                agent.update_memory(experience=(state, next_state, action, reward, done))
    
                #Learn
                agent.learn()
        
                #Sum of rewards for every action
                total_reward += reward
        
                #Change current state to next one
                state = next_state
        
                iter += 1
                
                #Render
                if parameters['render']:
                    env.render()
        
                #Check finish condition
                if done or info['flag_get']:
                    break
        
            #Append the average reward per step, used later for plotting
            rewards.append(total_reward / iter)
            
            #Update info
            if maxXpos < info['x_pos']:
                maxXpos = info['x_pos']
            if max_reward < total_reward:
                max_reward = total_reward
            
            if info['flag_get']:
                agent.flag_reached += 1
            
            #Epsilon decay (clamped so epsilon never drops below zero)
            if agent.eps > 0.0:
                agent.eps = max(agent.eps - agent.eps_decay, 0.0)
            
            #Update variables for plots
            graph_reward[e] = total_reward
            graph_pos[e] = info['x_pos']
            graph_mean_reward[e] = np.mean(graph_reward)
            
            #TENSORBOARD
            tf.summary.scalar("Rewards", total_reward, step=e)
            tf.summary.scalar("Position", info['x_pos'], step=e)
            tf.summary.scalar("Mean reward", np.mean(graph_reward), step=e)
            tf.summary.scalar("Flags", agent.flag_reached, step=e)
            tf.summary.scalar("Loss", agent.loss, step=e)
            
            
            #Console information
            print("Episode reward: " + str(total_reward) + ' - Pos: ' + str(info['x_pos']))
            # Print
            if e % 10 == 0:
                end = time.time()
                print('Flags reached: ' + str(agent.flag_reached) + ' - Max reward: ' +str(max_reward))
                print('Episode {e} - '
                      'Frame {f} - '
                      'Frames/sec {fs} - '
                      'Epsilon {eps} - '
                      'Mean Reward {r} - '
                      'Time {t} sec - '
                      'Max pos {pos}'.format(e=e,
                                               f=agent.step,
                                               fs=np.round((agent.step - step) / (time.time() - start)),
                                               eps=np.round(agent.eps, 4),
                                               r=np.mean(rewards[-100:]),
                                               t=round(end - start_time),
                                               pos=maxXpos))
    
    
                start = time.time()    
                step = agent.step    
        
        #After learning draw plots and save weights
        draw_graph(graph_reward,'Rewards')
        draw_graph(graph_pos, 'Position')
        draw_graph(graph_mean_reward, 'Mean reward')
        agent.save_weights()
        env.close() 
        
    else:
        #If train is False, load previously saved weights and observe the result
        print('Weights file path (hdf5): ')
        weights_name = input()
        try:
            agent.model_target.load_weights(weights_name)
            agent.model_test(env)
        except Exception:
            print("No weights file found with this name or at this path")
        env.close()
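train_model expects a plain dict of settings; the keys actually read above are 'environment', 'train', 'episodes_to_play' and 'render'. A call might look like the sketch below (DDQNagent very likely needs additional hyperparameter keys that are not shown in this excerpt):

parameters = {
    'environment': 'SuperMarioBros-1-1-v0',  # level to train on
    'train': True,                           # False loads saved weights instead
    'episodes_to_play': 10000,               # length of the training run
    'render': False,                         # rendering slows training down
}
train_model(parameters)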
示例#27
0
class MarioManager():
    '''
        Initialize the environment. This class wraps basic OpenAI Gym operations
        along with screen-processing operations done with PyTorch.
    '''
    def __init__(self, device):
        self.device = device
        self.env = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'), RIGHT_ONLY)
        self.env.reset()
        self.current_screen = None
        self.done = False
        self.current_score = 0
        self.current_coins = 0
        self.x = -9999999
        self.coins = 0
        self.score = 0
        self.count_same_posn = 0

    def reset(self):
        self.env.reset()
        self.current_screen = None

    def close(self):
        self.env.close()

    def render(self, mode = 'human'):
        return self.env.render(mode)

    def num_actions(self):
        return self.env.action_space.n

    def take_act(self, action):
        observation, reward, self.done, info = self.env.step(action.item()) #uses action.item
        #if new coins
        if self.coins != info['coins']:
            reward += int(info['coins']) - self.coins
            self.coins = int(info['coins'])
        #if the score increased (e.g. an enemy was killed or an item collected)
        if self.score != info['score']:
            reward += int(info['score']) - self.score
            self.score = int(info['score'])
        #check whether Mario is stuck at the same x position; if so, count how long
        if self.x == info['x_pos']:
            self.count_same_posn += 1
        #if he moved after being stuck, give a reward
        elif self.count_same_posn > 0 and self.x != info['x_pos']:
            self.count_same_posn = 0
            reward += 15
        #else reset count to 0
        else:
            self.count_same_posn = 0
        # if reward == 0:
        #     reward -= 1
        #make negative reward even more negative

        #kill him after the first life to speed up training
        # if info['life'] < 2:
        #     self.done = True
        #check that he actually moved to the right
        if self.x < info['x_pos']:
            reward += 0
        #he didn't move right by taking the action, so penalize
        else:
            reward -= 1
            
        if info['x_pos'] != 40:
            self.x = info['x_pos']
        return torch.tensor([reward], device = self.device)

    def return_count(self):
        return self.count_same_posn

    def return_posn(self):
        return self.x

    def is_starting(self):
        return self.current_screen is None

    def state(self):
        if self.is_starting() or self.done:
            self.current_screen = self.get_proccessed_screen()
            black_screen = torch.zeros_like(self.current_screen)
            return black_screen
        else:
            # screen = self.current_screen
            # next_screen = self.get_proccessed_screen()
            # self.current_screen = next_screen
            self.current_screen =  self.get_proccessed_screen()
            return self.current_screen

    def screen_height(self):
        return self.get_proccessed_screen().shape[2]

    def screen_width(self):
        return self.get_proccessed_screen().shape[3]

    def get_proccessed_screen(self):
        screen = self.render('rgb_array').transpose((2,0,1))
        screen = self.crop_screen(screen)
        return self.transform_screen_data(screen)

    def crop_screen(self, screen):
        screen_height = screen.shape[1]
        top = int(screen_height * 0.5)
        bottom = int(screen_height * 0.9)
        screen = screen[:,top:bottom, :]
        return screen

    def transform_screen_data(self, screen):
        screen = np.ascontiguousarray(screen, dtype = np.float32)/255
        screen = torch.from_numpy(screen)
        size = t.Compose(
        [t.ToPILImage(),
        t.Resize((15,40)),
        #t.Grayscale(num_output_channels=1),
        t.ToTensor()])

        return size(screen).unsqueeze(0).to(self.device)
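A minimal sketch of driving MarioManager with random actions, assuming PyTorch is installed; the 100-step loop and the random policy are illustrative only:

import random
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
manager = MarioManager(device)

for _ in range(100):
    action = torch.tensor(random.randrange(manager.num_actions()), device=device)
    reward = manager.take_act(action)   # take_act calls action.item() internally
    state = manager.state()             # processed screen tensor of shape (1, 3, 15, 40)
    if manager.done:
        manager.reset()
manager.close()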
示例#28
0
class MarioEnvironment(dm_env.Environment):
    def __init__(
        self,
        skip_frames: int = 3,
        img_rescale_pc: float = 0.4,
        stack_func: Optional[Callable[[List[np.ndarray]],
                                      np.ndarray]] = np.hstack,
        stack_mode: str = "all",
        grayscale: bool = True,
        black_background: bool = True,
        in_game_score_weight: float = 0.01,
        movement_type: str = "simple",
        world_and_level: Optional[Tuple[int, int]] = None,
        idle_frames_threshold: Optional[int] = 1250,
        colorful_rendering: bool = True,
    ) -> None:
        assert stack_mode in ("first_and_last", "all")
        self._stack_mode = stack_mode

        env_name = (f"SuperMarioBros" if world_and_level is None else
                    "SuperMarioBros-%d-%d" % world_and_level)
        env_name += f"-v{int(black_background)}"
        self._smb_env = gym_super_mario_bros.make(env_name)
        self._smb_env = JoypadSpace(self._smb_env,
                                    MOVEMENTS_TYPES[movement_type])

        self._actions_queue = []
        self._colorful_env = None
        if (grayscale or black_background) and colorful_rendering:
            # Mirror the env_name logic above so a random level (world_and_level=None) also works
            colorful_name = ("SuperMarioBros-v0" if world_and_level is None else
                             "SuperMarioBros-%d-%d-v0" % world_and_level)
            self._colorful_env = gym_super_mario_bros.make(colorful_name)
            self._colorful_env = JoypadSpace(self._colorful_env,
                                             MOVEMENTS_TYPES[movement_type])

        self._stack_func = stack_func
        self._grayscale = grayscale

        self._score_weight = in_game_score_weight
        self._idle_frames_threshold = idle_frames_threshold

        self._last_score = 0
        self._last_x = 40
        self._idle_counter = 0

        self._rescale_pc = img_rescale_pc
        self._skip_frames = skip_frames

        self._obs_shape = self.reset().observation.shape
        self._num_actions = self._smb_env.action_space.n

    def reset(self):
        """ Returns the first `TimeStep` of a new episode. """
        self._smb_env.reset()
        self._last_score = 0
        self._last_x = 40
        self._idle_counter = 0

        self._actions_queue = []
        if self._colorful_env is not None:
            self._colorful_env.reset()

        return dm_env.restart(self.step(0).observation)

    def _is_idle(self, info):
        if self._idle_frames_threshold is None:
            return False

        x = info["x_pos"]
        delta_x = x - self._last_x
        self._last_x = x

        if abs(delta_x) < 1:
            self._idle_counter += 1
            return self._idle_counter > self._idle_frames_threshold

        self._idle_counter = 0
        return False

    def step(self, action) -> TimeStep:
        """ Updates the environment's state. """
        # NOTE:
        # The gym_super_mario_bros environment reuses the numpy array it
        # returns as observation. When stacking observations, this might be
        # a source of bugs (all observations in the stack might be representing
        # the same, final frame!), so always copy the arrays when doing that.
        # The observation arrays are already being copied inside
        # `self._preprocess_img`, so no explicit copying is needed here.

        action = int(action)
        initial_img, total_reward, done, info = self._smb_env.step(action)
        self._actions_queue.append(action)
        done = done or self._is_idle(info)

        # Skipping frames:
        if self._skip_frames > 0:
            imgs = [self._process_img(initial_img)]
            skip_count = 0
            while skip_count < self._skip_frames:
                skip_count += 1
                if not done:
                    last_img, reward, done, info = self._smb_env.step(action)
                    self._actions_queue.append(action)
                    done = done or self._is_idle(info)
                    total_reward += reward
                else:
                    last_img = np.zeros_like(initial_img)

                if self._stack_mode == "all" or skip_count == self._skip_frames:
                    imgs.append(self._process_img(last_img))

            obs = self._stack_func(imgs)
        # Single frame:
        else:
            obs = self._process_img(initial_img)

        score_diff = info["score"] - self._last_score
        self._last_score = info["score"]
        total_reward = np.float64(total_reward +
                                  self._score_weight * score_diff)

        if done:
            return dm_env.termination(reward=total_reward, observation=obs)
        return dm_env.transition(reward=total_reward, observation=obs)

    def observation_spec(self):
        return dm_env.specs.BoundedArray(shape=self._obs_shape,
                                         dtype=np.float32,
                                         name="image",
                                         minimum=0,
                                         maximum=1)

    def action_spec(self):
        return dm_env.specs.DiscreteArray(dtype=np.int32,
                                          name="action",
                                          num_values=self._num_actions)

    def _process_img(self, img):
        img = np.divide(img, 255)
        img = img[50:, :, :]

        if abs(self._rescale_pc - 1) > 1e-2:
            img = rescale(img, scale=self._rescale_pc, multichannel=True)

        if self._grayscale:
            img = img @ RGB2GRAY_COEFFICIENTS

        return img.astype(np.float32, copy=True)

    def render(self, mode="human", return_all_imgs=False):
        if return_all_imgs:
            assert self._colorful_env is not None and mode == "rgb_array", (
                "The option 'return_all_imgs' is valid only when using "
                "colorful rendering and rgb array mode!")

        # Regular rendering:
        if self._colorful_env is None:
            return self._smb_env.render(mode)

        # Colorful rendering:
        img_list = []
        for action in self._actions_queue:
            self._colorful_env.step(action)
            if return_all_imgs:
                # NOTE: make sure a copy of the returned rgb array is made!
                img_list.append(self._colorful_env.render(mode).copy())

        self._actions_queue = []
        return img_list if return_all_imgs else self._colorful_env.render(mode)

    def plot_obs(self, obs):
        plt.imshow(obs, cmap="gray" if self._grayscale else None)
        plt.show()

    def close(self):
        self._smb_env.close()
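Since MarioEnvironment implements the dm_env interface, running a random-policy episode only takes a few lines; the sketch below is illustrative and assumes the module-level MOVEMENTS_TYPES and RGB2GRAY_COEFFICIENTS constants used by the class are in scope:

import numpy as np

env = MarioEnvironment(world_and_level=(1, 1), movement_type="simple")
timestep = env.reset()
episode_return = 0.0

while not timestep.last():
    action = np.random.randint(env.action_spec().num_values)
    timestep = env.step(action)
    episode_return += timestep.reward

print("episode return:", episode_return)
env.close()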