Example #1
class WaterWorld:
    def __init__(self, fps=30, display_screen=False):
        game = PyGameWaterWorld()
        self.game = PLE(game, fps=fps, display_screen=display_screen)
        self.game.init()  # PLE needs an explicit init() before the first reset/act
        action_set = self.game.getActionSet()
        self.action_map = {i: a for (i, a) in enumerate(action_set)}
        self.action_space = spaces.Discrete(len(self.action_map))
        self.metadata = {'render.modes': ['human', 'rgb_array']}

        box = np.ones((48, 48, 3), dtype='float32')
        self.observation_space = spaces.Box(low=box * 0, high=box * 255)

    def reset(self):
        self.game.reset_game()
        return self.game.getScreenRGB()

    def step(self, action):
        a = self.action_map[action]
        r = self.game.act(a)
        done = self.game.game_over()
        info = {}
        return self.game.getScreenRGB(), r, done, info

    def close(self):
        pass
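
A minimal rollout sketch for the wrapper above (my own illustration, not part of the original example). It assumes numpy as np, gym.spaces, ple.PLE and the PyGame WaterWorld game are importable as in the class definition, and uses a random policy purely to exercise the interface.

env = WaterWorld(fps=30, display_screen=False)
obs = env.reset()
total_reward = 0.0
for _ in range(1000):
    action = env.action_space.sample()           # random action index
    obs, reward, done, info = env.step(action)   # obs is the raw RGB screen array
    total_reward += reward
    if done:
        break
env.close()
print('episode reward:', total_reward)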
Example #2
class Env:
  def __init__(self):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=True)
    self.env.init()
    self.env.getGameState = self.game.getGameState # maybe not necessary

    # by convention we want to use (0,1)
    # but the game uses (None, 119)
    self.action_map = self.env.getActionSet() #[None, 119]

  def step(self, action):
    action = self.action_map[action]
    reward = self.env.act(action)
    done = self.env.game_over()
    obs = self.get_observation()
    # don't bother returning an info dictionary like gym
    return obs, reward, done

  def reset(self):
    self.env.reset_game()
    return self.get_observation()

  def get_observation(self):
    # game state returns a dictionary which describes
    # the meaning of each value
    # we only want the values
    obs = self.env.getGameState()
    return np.array(list(obs.values()))

  def set_display(self, boolean_value):
    self.env.display_screen = boolean_value
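
A driver sketch for this Env wrapper (my own illustration, not from the source project). It assumes numpy as np and the imports used by the class are in scope, and it flaps at random just to exercise the (obs, reward, done) interface.

env = Env()
env.set_display(False)                  # run headless
for episode in range(3):
    obs = env.reset()                   # numpy array of game-state values
    done = False
    episode_reward = 0.0
    while not done:
        action = np.random.randint(2)   # index into action_map (flap / do nothing)
        obs, reward, done = env.step(action)
        episode_reward += reward
    print('episode', episode, 'reward', episode_reward)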
Example #3
def test_play(agent, game, n, accelerated=False):
    p = PLE(game,
            fps=30,
            frame_skip=1,
            num_steps=1,
            force_fps=accelerated,
            display_screen=DISPLAY)
    cumulated = np.zeros(n, dtype=np.int32)
    for i in range(n):
        p.reset_game()
        while not p.game_over():
            state = game.getGameState()
            qvals = agent.get_qvals(state)
            act = agent.greedy_action(qvals, 0)
            reward = p.act(ACTIONS[act])
            if reward > 0:
                cumulated[i] += 1
        print('Game:', i, ', doors:', cumulated[i])
    average_score = np.mean(cumulated)
    max_score = np.max(cumulated)
    min_score = np.min(cumulated)
    print('\nTest over', n, 'tests:')
    print('average_score', 'max_score', 'min_score\n', average_score,
          max_score, min_score)
    return average_score, max_score, min_score
Example #4
def train(nb_episodes, agent):
    reward_values = agent.reward_values()

    env = PLE(FlappyBird(),
              fps=30,
              display_screen=False,
              force_fps=True,
              rng=None,
              reward_values=reward_values)
    env.init()

    score = 0
    biggest_score = -50000
    avg_score = 0
    episodes = 0
    to_break = False
    while nb_episodes > 0:
        # pick an action
        state = env.game.getGameState()

        state = agent.state_binner(state)
        action = agent.training_policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        #print("reward=%d" % reward)

        # let the agent observe the current state transition
        newState = env.game.getGameState()
        newState = agent.state_binner(newState)
        agent.observe(state, action, reward, newState, env.game_over())
        agent.frames += 1
        score += reward

        if ((agent.frames % 10000) == 0):
            to_break = True
        # reset the environment if the game is over
        if env.game_over():
            avg_score += score
            if score > biggest_score:
                biggest_score = score
                if biggest_score > 450:
                    break
                print(biggest_score)
                print(nb_episodes)
            if nb_episodes % 100 == 0:
                print(avg_score / 100)
                if avg_score / 100 >= 5:
                    break
                avg_score = 0
            if to_break:
                break

            #print("score for this episode: %d" % score)
            env.reset_game()

            nb_episodes -= 1
            score = 0

    return biggest_score
Example #5
class Env:
    def __init__(self):
        self.game = FlappyBird(pipe_gap=110)
        self.env = PLE(self.game, fps=30, display_screen=False)
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary

        # by convention we want to use (0,1)
        # but the game uses (None, 119)
        self.action_map = self.env.getActionSet()  # [None, 119]

    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value
        # we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
Example #6
def _test_ple():
    from ple.games.pong import Pong
    from ple.games.flappybird import FlappyBird
    from ple import PLE
    from time import sleep, time
    # os.environ['SDL_VIDEODRIVER'] = 'dummy'
    game = Pong()
    game = FlappyBird()
    ple_game = PLE(game, fps=30, display_screen=True)
    ple_game.init()
    ALLOWED_ACTIONS = ple_game.getActionSet()

    print(ALLOWED_ACTIONS)
    action = 0
    start = time()
    t = 0
    while True:
        ep_reward = 0
        ple_game.reset_game()
        while not ple_game.game_over():
            sleep(0.1)
            t += 1
            if t % 15 == 5:
                action = 0
            else:
                action = 1
            reward = ple_game.act(ALLOWED_ACTIONS[action])
            # print(reward)
            ep_reward += reward
        print(ep_reward, t, t / (time() - start))
Example #7
class Env:
    def __init__(self):
        # initializing the instance of FlappyBird class
        self.game = FlappyBird(pipe_gap=100)
        # then pass this object into PLE constructor and create an instance of that
        self.env = PLE(self.game, fps=30, display_screen=False)
        # init does some necessary things under the hood
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary
        self.action_map = self.env.getActionSet()

    # function which takes an action
    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value
        # we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
Example #8
class PLEEnv(Env):
    def __init__(self, game, _id, render=True, reset_done=True, num_steps=100):
        super().__init__(_id, render, reset_done)
        self.num_steps = num_steps
        self.game = game
        self.start()

    def start(self):
        if not self.env_instance:
            self.env_instance = PLE(self.game,
                                    fps=30,
                                    display_screen=self.render)
            self.env_instance.init()

    def step(self, action):
        reward = self.env_instance.act(action)
        obs = self.env_instance.getGameState()
        done = self.env_instance.game_over()
        return obs, reward, done

    def reset(self):
        self.env_instance.reset_game()
        obs = self.env_instance.getGameState()
        return obs

    def close(self):
        pass

    def restart(self):
        self.close()
        self.reset()
Example #9
def main(argv):
    try:
        opts, _ = getopt.getopt(argv, "hr")
    except getopt.GetoptError:
        print("birdML.py [-h | -r]")
        sys.exit(2)

    record = False
    for opt, arg in opts:
        if opt == '-h':
            print("-h to help")
            print("-r record")
        elif opt == '-r':
            record = True

    netb = netBrain()
    netb.summary()
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    p.init()
    actions = p.getActionSet()

    out = 1

    epochs = 50
    for i in range(epochs):
        lstates = []
        rewards = []
        if record:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter('Videos/test_' + str(i) + '.mov', fourcc,
                                  30.0, (288, 512))
        for d in range(10):
            while not p.game_over():
                if record:
                    obs = p.getScreenRGB()
                    obs = cv2.transpose(obs)
                    obs = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
                    out.write(obs)
                st = game.getGameState()
                gstate = list(st.values())
                gstate = np.array([np.array(gstate)])
                lstates.append(gstate[0])
                pred = netb.predict(gstate)[0]
                a = pred.argmax()
                p.act(actions[a])
                if st['next_pipe_bottom_y'] < st['player_y']:
                    pred[0] = 1.0
                    pred[1] = 0.0
                elif st['next_pipe_top_y'] > st['player_y']:
                    pred[0] = 0.0
                    pred[1] = 1.0
                rewards.append(pred)
            p.reset_game()
        netb.fit(np.array(lstates),
                 np.array(rewards),
                 batch_size=10,
                 epochs=10)
        if record:
            out.release()
Example #10
    def play(self, n=1, file_path=None):

        # use "Fancy" for full background, random bird color and random pipe color,
        # use "Fixed" (default) for black background and constant bird and pipe colors.
        game = FlappyBird(graphics="fixed")

        # Note: if you want to see your agent act in real time, set force_fps to False.
        # But don't use this setting for learning, just for display purposes.
        env = PLE(game,
                  fps=30,
                  frame_skip=1,
                  num_steps=1,
                  force_fps=False,
                  display_screen=True)

        # Init the environment (settings, display...)
        env.init()

        # Load the model
        model = load_model(file_path)

        # Let's play n games, and see if the model is correctly trained
        for _ in range(n):
            env.reset_game()
            while not env.game_over():
                S = self.get_game_data(game)
                Q = model.predict(S, batch_size=1)
                A = np.argmax(Q[0])
                env.act(self.ACTIONS[A])
Example #11
def evaluate(agent):
    env = PLE(game, fps=30, display_screen=True)
    actionset = env.getActionSet()
    eval_reward = []
    for i in range(5):
        env.init()
        env.reset_game()
        obs = list(env.getGameState().values())
        episode_reward = 0
        while True:
            action = agent.predict(obs)
            observation = env.getScreenRGB()
            score = env.score()
            #action = agent.pickAction(reward, observation)
            observation = cv2.transpose(observation)
            font = cv2.FONT_HERSHEY_SIMPLEX
            observation = cv2.putText(observation, str(int(score)), (0, 25),
                                      font, 1.2, (255, 255, 255), 2)
            cv2.imshow("ss", observation)
            cv2.waitKey(10)  # predict the action, always picking the best (greedy) one
            reward = env.act(actionset[action])
            obs = list(env.getGameState().values())
            done = env.game_over()
            episode_reward += reward
            if done:
                break
        eval_reward.append(episode_reward)
        cv2.destroyAllWindows()
    return np.mean(eval_reward)
Example #12
def run_game(nb_episodes, agent):
    """ Runs nb_episodes episodes of the game with agent picking the moves.
        An episode of FlappyBird ends with the bird crashing into a pipe or going off screen.
    """

    reward_values = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }
    # TODO: when training use the following instead:
    # reward_values = agent.reward_values

    env = PLE(FlappyBird(),
              fps=30,
              display_screen=True,
              force_fps=True,
              rng=None,
              reward_values=reward_values)
    # TODO: to speed up training change parameters of PLE as follows:
    # display_screen=False, force_fps=True
    env.init()

    score = 0
    tot_nb_episodes = nb_episodes
    average = 0
    highscore = 0
    over_50_count = 0
    while nb_episodes > 0:
        # pick an action
        # TODO: for training, use agent.training_policy instead
        state, ignore = agent.state_binner(env.game.getGameState())
        action = agent.policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        #print("reward=%d" % reward)

        # TODO: for training let the agent observe the current state transition

        score += reward

        # reset the environment if the game is over
        if env.game_over() or score >= 60:
            average += score
            if score > highscore:
                highscore = score
            if score >= 50:
                over_50_count += 1
            print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0
    print("Average for {} runs {:.2f}".format(tot_nb_episodes,
                                              average / tot_nb_episodes))
    over_50_p = (over_50_count / tot_nb_episodes) * 100
    print("The percentage of scores over 50 is: %d" % (over_50_p))
    return over_50_p
Example #13
def test_agent(policy, file_writer=None, test_games=10, step=0):
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env.init()

    test_rewards = []
    for _ in range(test_games):
        env.reset_game()
        no_op(env)

        game_rew = 0

        while not env.game_over():

            state = flappy_game_state(env)

            action = 119 if policy(state) == 1 else None

            for _ in range(2):
                game_rew += env.act(action)

        test_rewards.append(game_rew)

        if file_writer is not None:
            summary = tf.Summary()
            summary.value.add(tag='test_performance', simple_value=game_rew)
            file_writer.add_summary(summary, step)
            file_writer.flush()

    return test_rewards
Example #14
class Flappy():
    def __init__(self, display=False):
        self.game = flappybird.FlappyBird(width=288, height=512, pipe_gap=100)
        self._ple = PLE(self.game, fps=30, display_screen=display)
        self.game.rng.seed(np.random.randint(0, 999999))
        self.reset()

    def reset(self):
        self._ple.reset_game()
        self.game.init()
        self.game.backdrop.background_image.fill(0)
        ret = self.step(1)
        return ret[0]

    def render(self, close=False):
        if close:
            pygame.quit()

    def step(self, action):
        if action:
            reward = self._ple.act(119)
        else:
            reward = self._ple.act(0)

        done = self._ple.game_over()
        state = self.game.getScreenRGB()
        info = None
        return state, reward, done, info
Example #15
def main_test():
    final_score = 0
    previous_action = 1
    model = build_neural_network_model()
    game = FlappyBird(width=288, height=512, pipe_gap=100)
    env = PLE(game, fps=30, display_screen=True, state_preprocessor=process_state)
    model = load_model("model.h5")
    env.init()
    passed = 0
    old_y = 0
    for i in range(game_steps):
        if i == game_steps - 1:
            print("Score: {}".format(final_score))
        if env.game_over():
            print("Final Score: {}".format(final_score))
            time.sleep(1)
            final_score = 0
            env.reset_game()

        observation = env.getGameState()

        vector = model.predict(np.matrix(list(observation[0].values())))
        a_star = np.argmax(vector[0])
        print(vector[0][0], vector[0][1], a_star)
        time.sleep(0.05)
        env_reward = env.act(env.getActionSet()[a_star])
        if env_reward == 1:
            final_score += 1
Example #16
class PleEnvAdapter(EnvAdapter):
    """Pygame learning env adapter"""
    def __init__(self, *args, **kwargs):
        super(PleEnvAdapter, self).__init__(*args, **kwargs)

        if not self.render:
            os.putenv('SDL_VIDEODRIVER', 'fbcon')
            os.environ["SDL_VIDEODRIVER"] = "dummy"

        Game = envs_lookup_table[self.env_name]
        self.env = PLE(Game(),
                       display_screen=self.render,
                       force_fps=not self.render)
        self.env.init()

    def get_input_shape(self):
        return (len(self.env.getGameState()), )

    def reset(self):
        self.env.reset_game()

    def step(self, action) -> (object, float, bool):
        reward = self.env.act(self.env.getActionSet()[action])
        observation = self.env.getGameState()
        observation = [val for key, val in observation.items()]
        done = self.env.game_over()
        return observation, reward, done

    def get_n_actions(self) -> int:
        return len(self.env.getActionSet())

    def get_random_action(self):
        return random.randint(0, len(self.env.getActionSet()) - 1)
Example #17
def train(nb_frames, agent):
    reward_values = agent.reward_values()

    env = PLE(FlappyBird(),
              fps=30,
              display_screen=False,
              force_fps=True,
              rng=None,
              reward_values=reward_values)
    env.init()

    score = 0
    biggest_score = -5
    avg_score = 0
    avrage = []
    count = []
    nb_episodes = 0
    number_of_frames = 0
    while number_of_frames < nb_frames:
        # pick an action
        state = env.game.getGameState()
        state = agent.state_binner(state)
        action = agent.training_policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        #print("reward=%d" % reward)

        # let the agent observe the current state transition
        newState = env.game.getGameState()
        newState = agent.state_binner(newState)
        agent.observe(state, action, reward, newState, env.game_over())

        score += reward
        number_of_frames += 1
        # reset the environment if the game is over
        if env.game_over():
            nb_episodes += 1
            avg_score += score
            if score > biggest_score:
                biggest_score = score
                print(biggest_score)
                print(nb_episodes)
                print(number_of_frames)
            if nb_episodes % 100 == 0:
                print(avg_score / 100)
                avrage.append(avg_score / 100)
                count.append(number_of_frames)
                avg_score = 0

            #print("score for this episode: %d" % score)
            agent.calculate()
            env.reset_game()

            score = 0

    print(biggest_score)
    data = {"Count": count, "Avrage": avrage}
    df = pd.DataFrame(data)
    sns.relplot(x="Count", y="Avrage", ci=None, kind="line", data=df)
Example #18
def train(nb_frames, agent, a, g, results):
    print("alpha %f" % a)
    print("gamma %f" % g)
    reward_values = agent.reward_values()

    env = PLE(FlappyBird(),
              fps=30,
              display_screen=False,
              force_fps=True,
              rng=None,
              reward_values=reward_values)
    env.init()

    score = 0
    biggest_score = -5
    avg_score = 0

    number_of_frames = 0
    nb_episodes = 0
    while number_of_frames < nb_frames:
        # pick an action
        state = env.game.getGameState()
        state = agent.state_binner(state)
        action = agent.training_policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        #print("reward=%d" % reward)

        # let the agent observe the current state transition
        newState = env.game.getGameState()
        newState = agent.state_binner(newState)
        agent.observe(state, action, reward, newState, env.game_over())

        score += reward
        number_of_frames += 1
        # reset the environment if the game is over
        if env.game_over():
            nb_episodes += 1
            avg_score += score
            if score > biggest_score:
                biggest_score = score
                print(biggest_score)
                print(nb_episodes)
                print(number_of_frames)
            if nb_episodes % 100 == 0:
                print(avg_score / 100)
                results[0].append(avg_score / 100)
                results[1].append(number_of_frames)
                results[2].append(a)
                results[3].append(g)
                avg_score = 0

            #print("score for this episode: %d" % score)
            env.reset_game()

            score = 0
    print(biggest_score)
    return results
Example #19
    def score(self, training=True, nb_episodes=10):
        reward_values = {
            'positive': 1.0,
            'negative': 0.0,
            'tick': 0.0,
            'loss': 0.0,
            'win': 0.0
        }

        env = PLE(FlappyBird(),
                  fps=30,
                  display_screen=False,
                  force_fps=True,
                  rng=None,
                  reward_values=reward_values)
        env.init()

        total_episodes = nb_episodes
        score = 0
        scores = []
        while nb_episodes > 0:
            # pick an action
            state = env.game.getGameState()
            action = self.policy(state)

            # step the environment
            reward = env.act(env.getActionSet()[action])

            score += reward

            # reset the environment if the game is over
            if env.game_over() or score >= 100:
                scores.append(score)
                env.reset_game()
                nb_episodes -= 1
                score = 0

        avg_score = sum(scores) / float(len(scores))
        print('Games played: {}'.format(total_episodes))
        print('Average score: {}'.format(avg_score))

        if training:
            score_file = '{}/scores.csv'.format(self.name)
            # If file doesn't exist, add the header
            if not os.path.isfile(score_file):
                with open(score_file, 'a') as f:
                    f.write('avg_score,episode_count,num_of_frames,min,max\n')

            # Append scores to the file
            with open(score_file, 'a') as f:
                f.write('{},{},{},{},{}\n'.format(avg_score,
                                                  self.num_of_episodes,
                                                  self.num_of_frames,
                                                  min(scores), max(scores)))

        else:
            with open('scores.txt', 'a') as f:
                for score in scores:
                    f.write('{},{}\n'.format(self.name, score))
Example #20
class PLEEnv(gym.Env):
    def __init__(self, env_config):
        game = Catcher(width=screen_wh, height=screen_wh)

        fps = 30  # fps we want to run at
        frame_skip = 2
        num_steps = 2
        force_fps = False  # False for slower speed
        display_screen = True
        # make a PLE instance.
        self.env = PLE(game,
                       fps=fps,
                       frame_skip=frame_skip,
                       num_steps=num_steps,
                       force_fps=force_fps,
                       display_screen=display_screen)
        self.env.init()
        self.action_dict = {0: None, 1: 97, 2: 100}
        #PLE env starts with black screen
        self.env.act(self.env.NOOP)

        self.action_space = Discrete(3)
        self.k = 4
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=(screen_wh, screen_wh,
                                                   1 * self.k))
        self.frames = deque([], maxlen=self.k)

    def reset(self):
        self.env.reset_game()
        # PLE env starts with black screen, NOOP step to get initial screen
        self.env.act(self.env.NOOP)
        ob = np.reshape(self.env.getScreenGrayscale(),
                        (screen_wh, screen_wh, 1))
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        #traditional gym env step
        #_obs, _rew, done, _info = env.step(env.action_space.sample())
        action_value = self.action_dict[action]
        _rew = self.env.act(action_value)
        #_obs = self.env.getScreenGrayscale()
        _obs = np.reshape(self.env.getScreenGrayscale(),
                          (screen_wh, screen_wh, 1))
        self.frames.append(_obs)
        _done = self.env.game_over()
        _info = {}

        return self._get_ob(), _rew, _done, _info

    def _get_ob(self):
        assert len(self.frames) == self.k
        return np.concatenate(self.frames, axis=2)
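
A quick smoke-test sketch for the frame-stacked wrapper above (my addition, assuming screen_wh and the module-level imports such as gym spaces, Discrete, deque and numpy are in scope): the stacked observation returned by reset() and step() should have k = 4 channels.

env = PLEEnv(env_config={})                 # env_config is unused by this wrapper
ob = env.reset()
assert ob.shape == (screen_wh, screen_wh, 4)
ob, rew, done, info = env.step(env.action_space.sample())
print(ob.shape, rew, done)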
Example #21
def train(FRAME_TRAIN=1000005):
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    ob = game.getGameState()
    state = ob
    state = np.reshape(np.asarray(list(state.values())), [1, 8])
    total_reward = 0
    agent = DDQN_Agent.DeepQAgent()
    agent.load('model95000')
    batch_size = 32
    my_timer = time.time()
    prev_frame = 0
    data = []
    for i in range(FRAME_TRAIN):
        if p.game_over():
            data.append(total_reward)
            p.reset_game()
            print(
                "Total reward = {}, Frame = {}, epsilon = {}, frame/second = {}"
                .format(total_reward, i, agent.epsilon,
                        (i - prev_frame) / (time.time() - my_timer)))
            total_reward = 0
            prev_frame = i
            my_timer = time.time()

        # get action from agent
        action = agent.act(state)

        # take action
        reward = p.act(p.getActionSet()[action])

        # making the reward space less sparse
        if reward < 0:
            reward = -1

        total_reward += reward
        next_state = np.asarray(list(game.getGameState().values()))
        next_state = np.reshape(next_state, [1, 8])

        # remember and replay
        agent.remember(state, action, reward, next_state, p.game_over())
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        state = next_state

        # save Model
        if i % 5000 == 0:
            print("Updating weights")
            agent.save('newmodel' + str(i))
            agent.target_model.set_weights(agent.model.get_weights())

        # Plot score
        if i % 1000 == 0:
            plot(data)
Example #22
class GameEnv(object):
    def __init__(self, display_screen):
        self.width = IMAGE_WIDTH
        self.height = IMAGE_HEIGHT

        self.count = 0
        self.p = PLE(FlappyBird(), fps=30, display_screen=display_screen)
        self.p.init()
        self._update_state()
        self.score = 0

    def pre_process_image(self, image):
        self.count += 1
        image = color.rgb2gray(image)
        image = transform.resize(image, (self.width, self.height))
        image = exposure.rescale_intensity(image, out_range=(0, 255))
        image = image.astype('float')
        image = image / 255.0
        return image.reshape(1, self.width, self.height, 1)

    def _update_state(self):
        image = self.p.getScreenRGB()
        # TODO: convert to float
        image = self.pre_process_image(image)
        state = getattr(self, 'state', None)
        if state is None:
            self.state = np.concatenate([image] * 4, axis=3)
        else:
            # drop the oldest frame and append the newest one (keep a 4-frame stack)
            self.state = np.concatenate([self.state[:, :, :, 1:], image], axis=3)

    def get_state(self):
        return self.state

    def step(self, action):
        if action == 1:
            _ = self.p.act(119)
        else:
            _ = self.p.act(None)

        self._update_state()

        done = False
        if self.p.game_over():
            done = True
            self.p.reset_game()
            reward = -1
        else:
            reward = 0.1

        return_score = self.score + reward
        self.score = 0 if done else self.score + reward

        return self.state, reward, done, return_score

    def get_score(self):
        return self.score
Example #23
def run(number_of_episodes):
    game = FlappyBird(pipe_gap=150)

    rewards = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }

    env = PLE(game=game,
              fps=30,
              display_screen=True,
              reward_values=rewards,
              force_fps=False)

    # Reset environment at the beginning
    env.reset_game()

    score = 0
    max_score = 0
    episode_number = 1

    while number_of_episodes > 0:

        # Get current state
        state = BasicQLearningAgent.get_state(env.game.getGameState())

        # Select action in state "state"
        action = basic_q_agent.max_q(state)

        # After choosing action, get reward
        """
        After choosing action, get reward.
        PLE environment method act() returns the reward that the agent has accumulated while performing the action.
        """
        reward = env.act(env.getActionSet()[action])
        score += reward

        max_score = max(score, max_score)

        game_over = env.game_over()

        if game_over:
            print("===========================")
            print("Episode: " + str(episode_number))
            print("Score: " + str(score))
            print("Max. score: " + str(max_score))
            print("===========================\n")
            # f.write("Score: " + str(score) + "|Max. score: " + str(max_score) + "\n")
            episode_number += 1
            number_of_episodes -= 1
            score = 0
            env.reset_game()
Example #24
    def train(self):
        """ Runs nb_episodes episodes of the game with agent picking the moves.
            An episode of FlappyBird ends with the bird crashing into a pipe or going off screen.
        """

        if not os.path.exists(self.name):
            os.mkdir(self.name)

        t = threading.Thread(target=self.draw_plots)
        t.daemon = True
        t.start()

        reward_values = self.reward_values()
        env = PLE(FlappyBird(),
                  fps=30,
                  display_screen=False,
                  force_fps=True,
                  rng=None,
                  reward_values=reward_values)
        env.init()

        score = 0
        while self.frame_count <= 1000000:
            # pick an action
            state1 = env.game.getGameState()
            action = self.training_policy(state1)

            # step the environment
            reward = env.act(env.getActionSet()[action])
            # print("reward=%d" % reward)
            score += reward  # accumulate reward so the score >= 100 cutoff below can trigger

            state2 = env.game.getGameState()

            end = env.game_over() or score >= 100  # stop after reaching 100 pipes
            self.observe(state1, action, reward, state2, end)

            # reset the environment if the game is over
            if end:
                env.reset_game()
                score = 0

            if self.frame_count % 25000 == 0:
                print("==========================")

                print("episodes done: {}".format(self.episode_count))
                print("frames done: {}".format(self.frame_count))

                self.score()

                with open("{}/agent.pkl".format(self.name), "wb") as f:
                    pickle.dump((self), f, pickle.HIGHEST_PROTOCOL)

                print("==========================")
Example #25
def test():
    game = Snake(600, 600)
    p = PLE(game,
            fps=60,
            state_preprocessor=process_state,
            force_fps=True,
            display_screen=True,
            frame_skip=2,
            reward_values={
                "positive": 100.0,
                "negative": -50.0,
                "tick": -0.1,
                "loss": -70.0,
                "win": 5.0
            })
    agent = Agent(alpha=float(sys.argv[1]),
                  gamma=float(sys.argv[2]),
                  n_actions=3,
                  epsilon=0.01,
                  batch_size=100,
                  input_shape=6,
                  epsilon_dec=0.99999,
                  epsilon_end=0.001,
                  memory_size=500000,
                  file_name=sys.argv[3],
                  activations=[str(sys.argv[4]),
                               str(sys.argv[5])])
    p.init()
    agent.load_game()
    scores = []

    for _ in range(200):
        if p.game_over():
            p.reset_game()
        apples = 0
        initial_direction = "Right"
        while not p.game_over():
            old_state = np.array(
                vision(list(p.getGameState()[0]), initial_direction))

            action = agent.choose_action(old_state)
            possible_directions = prepare_corect_directions(initial_direction)
            possible_directions_tuples = list(
                zip(possible_directions.keys(), possible_directions.values()))
            direction = possible_directions_tuples[action]
            initial_direction = direction[1]

            reward = p.act(direction[0])
            if reward > 50.0:
                apples += reward

        scores.append(apples)
    return scores
Example #26
class Game(gym.Env):
    def __init__(self, display_screen=False, force_fps=True):
        os.environ["SDL_VIDEODRIVER"] = "dummy"
        game = FlappyBird()  # define and initiate the environment
        self.env = PLE(game,
                       fps=30,
                       display_screen=display_screen,
                       force_fps=force_fps)
        self.env.init()
        # list of actions in the environment
        self.actions = self.env.getActionSet()
        # length of actions
        self.action_space = spaces.Discrete(len(self.actions))

    def step(self, action):
        """Take the action chosen and update the reward"""
        reward = self.env.act(self.actions[action])
        state = self.getGameState()
        terminal = self.env.game_over()

        # If the game is over (the bird crashed), override the reward with -1000;
        # otherwise give +1 for surviving the frame
        if terminal:
            reward = -1000
        else:
            reward = 1

        return state, reward, terminal, {}

    def getGameState(self):
        '''
        PLEenv return gamestate as a dictionary. Returns a modified form
        of the gamestate only with the required information to define the state
        '''
        state = self.env.getGameState()
        h_dist = state['next_pipe_dist_to_player']
        v_dist = state['next_pipe_bottom_y'] - state['player_y']
        vel = state['player_vel']

        return ' '.join([str(vel), str(h_dist), str(v_dist)])

    def reset(self):
        """Resets the game to start a new game"""
        self.env.reset_game()
        state = self.getGameState()  # use the same discretized state string as step()
        return state

    def seed(self, seed):
        rng = np.random.RandomState(seed)
        self.env.rng = rng
        self.env.game.rng = self.env.rng

        self.env.init()
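
The space-separated string returned by getGameState() is presumably intended as a key for a tabular value function. A sketch of that usage follows (the defaultdict Q-table and the epsilon value are my own assumptions, not part of the example):

from collections import defaultdict
import random

env = Game(display_screen=False)
n_actions = env.action_space.n
q_table = defaultdict(lambda: [0.0] * n_actions)   # Q-values keyed by the state string

state = env.reset()
for _ in range(1000):
    # epsilon-greedy over the values stored under the string key
    if random.random() < 0.1:
        action = random.randrange(n_actions)
    else:
        action = max(range(n_actions), key=lambda a: q_table[state][a])
    next_state, reward, done, _ = env.step(action)
    # ... a Q-learning update on q_table[state][action] would go here ...
    state = env.reset() if done else next_state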
Example #27
def main(train=False):
    # Don't modify anything in this function.
    # See the constants defined at the top of this file if you'd like to
    # change the FPS, screen size, or round length
    game = Pong(width=WIDTH, height=HEIGHT, MAX_SCORE=MAX_SCORE)

    if train:
        p = PLE(game, fps=FPS, display_screen=False, force_fps=True)
    else:
        p = PLE(game, fps=FPS, display_screen=True, force_fps=False)

    p.init()

    agent_rounds = 0
    cpu_rounds = 0
    agent_score = 0
    cpu_score = 0
    num_frames = 0
    while True:
        if p.game_over():
            if game.score_counts['agent'] > game.score_counts['cpu']:
                agent_rounds += 1
                print('AGENT won round')
            else:
                cpu_rounds += 1
                print('CPU won round')

            if agent_rounds == NUM_ROUNDS or cpu_rounds == NUM_ROUNDS:
                break

            p.reset_game()

        obs = p.getGameState()
        action = agent(normalize(obs))
        reward = p.act(ACTION_MAP[action])

        if reward > 0:
            agent_score += 1
            print('AGENT scored')
        elif reward < 0:
            cpu_score += 1
            print('CPU scored')

        num_frames += 1

    winner = 'AGENT' if agent_rounds > cpu_rounds else 'CPU'
    print('Winner:', winner)
    print('Num frames      :', num_frames)
    print('AGENT rounds won:', agent_rounds)
    print('CPU   rounds won:', cpu_rounds)
    print('AGENT total score:', agent_score)
    print('CPU   total score:', cpu_score)
Example #28
    def play(self):
        print("Playing {} agent after training for {} episodes or {} frames".
              format(self.name, self.episode_count, self.frame_count))
        reward_values = {
            "positive": 1.0,
            "negative": 0.0,
            "tick": 0.0,
            "loss": 0.0,
            "win": 0.0
        }

        env = PLE(FlappyBird(),
                  fps=30,
                  display_screen=True,
                  force_fps=False,
                  rng=None,
                  reward_values=reward_values)
        env.init()

        score = 0
        last_print = 0

        nb_episodes = 50
        while nb_episodes > 0:
            # pick an action
            state = env.game.getGameState()
            action = self.policy(state)

            # step the environment
            reward = env.act(env.getActionSet()[action])

            score += reward

            # if reward == 1:
            #     print(state)

            if score % 100 == 0 and score != last_print:
                print(int(score))
                last_print = score

            # reset the environment if the game is over
            if env.game_over():
                # print("---------------")
                # for s1, s2 in self.last_10:
                #     print(s1)
                #     print(s2)
                #     print("-=-=-")
                print("Score: {}".format(score))
                env.reset_game()
                nb_episodes -= 1
                score = 0
Example #29
def run_game(nb_episodes, agent):
    """ Runs nb_episodes episodes of the game with agent picking the moves.
        An episode of FlappyBird ends with the bird crashing into a pipe or going off screen.
    """

    reward_values = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }
    # TODO: when training use the following instead:
    # reward_values = agent.reward_values

    env = PLE(FlappyBird(),
              fps=30,
              display_screen=False,
              force_fps=True,
              rng=None,
              reward_values=reward_values)
    # TODO: to speed up training change parameters of PLE as follows:
    # display_screen=False, force_fps=True
    env.init()
    totalscore = 0
    count = nb_episodes
    score = 0

    while nb_episodes > 0:
        # pick an action
        # TODO: for training, use agent.training_policy instead
        action = agent.policy(agent.state_binner(env.game.getGameState()))

        # step the environment
        reward = env.act(env.getActionSet()[action])
        #print("reward=%d" % reward)

        # TODO: for training let the agent observe the current state transition

        score += reward

        # reset the environment if the game is over
        if env.game_over():
            totalscore += score
            print(nb_episodes)
            print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0
    print("average for this run is :%d" % (totalscore / count))
    return (totalscore / count)
Example #30
    def train(self):
        """ Runs nb_episodes episodes of the game with agent picking the moves.
            An episode of FlappyBird ends with the bird crashing into a pipe or going off screen.
        """
        #Check if the agent folder exists
        #If not, create it.
        if not os.path.exists(self.name):
            print(self.name)
            os.mkdir(self.name)

        reward_values = self.reward_values()
        env = PLE(FlappyBird(),
                  fps=30,
                  display_screen=False,
                  force_fps=True,
                  rng=None,
                  reward_values=reward_values)
        env.init()

        score = 0
        while self.num_of_frames <= 1000000:
            # pick an action
            state1 = env.game.getGameState()
            action = self.training_policy(state1)

            reward = env.act(env.getActionSet()[action])
            score += reward  # accumulate reward so the score >= 100 cutoff below can trigger

            state2 = env.game.getGameState()

            end = env.game_over() or score >= 100  # stop after reaching 100 pipes
            self.observe(state1, action, reward, state2, end)

            # reset the environment if the game is over
            if end:
                env.reset_game()
                score = 0

            if self.num_of_frames % 25000 == 0:
                print('++++++++++++++++++++++++++')

                print('Episodes finished: {}'.format(self.num_of_episodes))
                print('Number of frames: {}'.format(self.num_of_frames))

                self.score()

                with open('{}/agent.pkl'.format(self.name), 'wb') as f:
                    pickle.dump((self), f, pickle.HIGHEST_PROTOCOL)

                print('++++++++++++++++++++++++++\n')
Example #31
def view_agent(agent):
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()

    for i in range(200000):
        if p.game_over():
            p.reset_game()

        time.sleep(0.03)
        action = agent.pick_action(p.getGameState())
        p.act(action)

        if p.game_over():
            break
Example #32
def main_naive():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    my_agent = naive.NaiveAgent(allowed_actions=env.getActionSet())

    env.init()
    reward = 0.0
    nb_frames = 10000

    for i in range(nb_frames):
        if env.game_over():
            env.reset_game()

        observation = env.getScreenRGB()
        action = my_agent.pickAction(reward, observation)
        reward = env.act(action)
	"""
	def __init__(self, actions):
		self.actions = actions

	def pickAction(self, reward, obs):
		return self.actions[np.random.randint(0, len(self.actions))]

###################################
game = Doom(scenario="take_cover")

env = PLE(game)
agent = NaiveAgent(env.getActionSet())
env.init()

reward = 0.0
for f in range(15000):
        # if the game is over
        if env.game_over():
            env.reset_game()
            
        action = agent.pickAction(reward, env.getScreenRGB())
        reward = env.act(action)

        if f > 2000:
            env.display_screen = True 
            env.force_fps = False
        
        if f > 2250:
            env.display_screen = True 
            env.force_fps = True
Example #34
File: PLE_env.py  Project: halofanx/deer
class MyEnv(Environment):
    VALIDATION_MODE = 0

    def __init__(self, rng, game=None, frame_skip=4, 
            ple_options={"display_screen": True, "force_fps":True, "fps":30}):

        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0

        self._frameSkip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng
       
        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((h, w), dtype=np.uint8)
        self._reducedScreen = np.empty((48, 48), dtype=np.uint8)
        self._actions = self._ple.getActionSet()

                
    def reset(self, mode):
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1: # and thus mode == -1
            self._mode = -1

        self._ple.reset_game()
        for _ in range(self._random_state.randint(15)):
            self._ple.act(self._ple.NOOP)
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST)
        
        return [4 * [48 * [48 * [0]]]]
        
        
    def act(self, action):
        action = self._actions[action]
        
        reward = 0
        for _ in range(self._frameSkip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break
            
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST)
  
        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        if self.inTerminalState() == False:
            self._mode_episode_count += 1
        print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count))


    def inputDimensions(self):
        return [(4, 48, 48)]

    def observationType(self, subject):
        return np.uint8

    def nActions(self):
        return len(self._actions)

    def observe(self):
        return [np.array(self._reducedScreen)]

    def inTerminalState(self):
        return self._ple.game_over()
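
A driving sketch for this deer-style environment (my own illustration; it assumes FlappyBird, numpy and cv2 are importable in the surrounding module, and uses random actions only to exercise the interface):

rng = np.random.RandomState(0)
env = MyEnv(rng, game=FlappyBird(), frame_skip=4,
            ple_options={"display_screen": False, "force_fps": True, "fps": 30})
env.reset(MyEnv.VALIDATION_MODE)
for _ in range(500):
    env.act(rng.randint(env.nActions()))   # act() returns the sign of the accumulated reward
    if env.inTerminalState():
        break
obs = env.observe()[0]                     # 48x48 grayscale frame
print(obs.shape)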
Example #35
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()
    
    while(not p.game_over()):
        state = game.getGameState()
        screen = p.getScreenRGB()
        action=FlappyPolicy(state, screen) ### Your job is to define this function.
        
        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
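
FlappyPolicy is left to the reader ("Your job is to define this function"). Purely as an illustration, here is a hand-coded heuristic sketch (not a learned policy): flap with key code 119 whenever the bird sits below the centre of the next pipe gap, otherwise return None. The state keys match those used in the other FlappyBird examples above.

# FlappyAgent.py (hypothetical baseline)
def FlappyPolicy(state, screen):
    # screen is ignored by this heuristic; y grows downwards on the screen
    gap_center = (state['next_pipe_top_y'] + state['next_pipe_bottom_y']) / 2.0
    if state['player_y'] > gap_center:
        return 119   # flap
    return None      # do nothing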
Example #36
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices = 1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    #setupGame()
    gameClass = FlappyBird(width=288, height=512, pipe_gap=100)
    
    fps = 30
    frame_skip = 2
    num_steps = 1
    force_fps = False
    display_screen = True
    reward = 0.0
    nb_frames = 15000

    game = PLE(gameClass, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
            force_fps=force_fps, display_screen=display_screen)

    game.init()

    # store the previous observations in replay memory
    D = deque()

    # printing
    logdir = "logs_" + GAME
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    a_file = open(logdir + "/readout.txt", 'w')
    h_file = open(logdir + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    r_0 = game.act(game.NOOP)
    x_t = game.getScreenGrayscale()
    terminal = game.game_over()
    if terminal:
        print "NOOOO"
        game.reset_game()
    
    x_t = cv2.resize(x_t, (80, 80))
    ret, x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis = 2)

    # saving and loading networks
    saver = tf.train.Saver()  # needed below when checkpoints are written every 10000 steps
    sess.run(tf.initialize_all_variables())
    '''
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print "Successfully loaded:", checkpoint.model_checkpoint_path
    else:
        print "Could not find old network weights"
    '''
    epsilon = INITIAL_EPSILON
    t = 0
    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict = {s : [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[random.randrange(ACTIONS)] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            # map the chosen index onto PLE's action set before acting
            r_t = game.act(game.getActionSet()[np.argmax(a_t)])
            x_t1 = game.getScreenGrayscale()
            terminal = game.game_over()
            if terminal:
                print "NOOO2"
                game.reset_game()

            x_t1 = cv2.resize(x_t1, (80, 80))
            ret, x_t1 = cv2.threshold(x_t1,1,255,cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal only equals reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict = {
                y : y_batch,
                a : a_batch,
                s : s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t)

        # write info to files
        '''