Python PLE.init примеры, ple.PLE.init Python примеры использования

Пример #1

0

Показать файл

class Env:
    """Minimal reset/step wrapper around a FlappyBird game run through PLE."""

    def __init__(self):
        # The game object is created first and then handed to PLE.
        flappy = FlappyBird(pipe_gap=100)
        ple_env = PLE(flappy, fps=30, display_screen=False)
        # init() performs PLE's own setup work before the first use.
        ple_env.init()
        # State queries go straight to the game object (maybe not necessary).
        ple_env.getGameState = flappy.getGameState

        self.game = flappy
        self.env = ple_env
        self.action_map = ple_env.getActionSet()

    def step(self, action):
        """Perform the action stored at index ``action``.

        Returns:
            Tuple ``(observation, reward, done)``.
        """
        reward = self.env.act(self.action_map[action])
        done = self.env.game_over()
        return self.get_observation(), reward, done

    def reset(self):
        """Restart the game and return the observation that follows."""
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        """Return the values of the game-state dictionary as a numpy array.

        The dictionary keys only describe the values, so they are dropped.
        """
        state = self.env.getGameState()
        return np.array([value for value in state.values()])

    def set_display(self, boolean_value):
        """Store ``boolean_value`` as the environment's ``display_screen`` flag."""
        self.env.display_screen = boolean_value

Пример #2

0

Показать файл

Файл: birdML.py Проект: Pablololo12/FlappyBirdML

def main(argv):
    """Train the network on FlappyBird, correcting it with a fixed rule.

    For 50 epochs the network plays 10 games. Every frame its prediction is
    stored as a training target, overwritten by a hard-coded target whenever
    the player's y position lies outside the next pipe's bottom/top values.
    After each epoch the network is fitted on the collected frames.

    Args:
        argv: Command-line arguments without the program name. ``-h`` prints
            the usage text and exits; ``-r`` records each epoch to a video
            file under ``Videos/``.
    """
    try:
        opts, _ = getopt.getopt(argv, "hr")
    except getopt.GetoptError:
        print("birdML.py [-h | -r]")
        sys.exit(2)

    record = False
    for opt, _ in opts:
        if opt == '-h':
            print("-h to help")
            print("-r record")
            # Asking for help must not start a training run.
            sys.exit(0)
        elif opt == '-r':
            record = True

    netb = netBrain()
    netb.summary()
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    p.init()
    actions = p.getActionSet()

    epochs = 50
    for i in range(epochs):
        lstates = []
        targets = []
        out = None
        if record:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter('Videos/test_' + str(i) + '.mov', fourcc,
                                  30.0, (288, 512))
        try:
            for _ in range(10):
                while not p.game_over():
                    if out is not None:
                        frame = cv2.transpose(p.getScreenRGB())
                        out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                    st = game.getGameState()
                    features = np.array(list(st.values()))
                    lstates.append(features)
                    # predict() takes a batch, so wrap the single sample.
                    pred = netb.predict(np.array([features]))[0]
                    p.act(actions[pred.argmax()])
                    # Replace the prediction with a fixed target when the
                    # player is beyond either pipe value.
                    if st['next_pipe_bottom_y'] < st['player_y']:
                        pred[0] = 1.0
                        pred[1] = 0.0
                    elif st['next_pipe_top_y'] > st['player_y']:
                        pred[0] = 0.0
                        pred[1] = 1.0
                    targets.append(pred)
                p.reset_game()
            netb.fit(np.array(lstates),
                     np.array(targets),
                     batch_size=10,
                     epochs=10)
        finally:
            # Close the video file even if playing or fitting failed.
            if out is not None:
                out.release()

Пример #3

0

Показать файл

Файл: new_main.py Проект: balangheorghe/flappybird_rn

def main_test():
    """Play FlappyBird with the model saved in ``model.h5`` and print scores.

    Runs for ``game_steps`` frames. Each frame the index of the model's
    largest output selects the action. Every reward equal to 1 adds one to
    the score, which is printed and cleared whenever a game ends, and
    printed once more on the last frame.
    """
    final_score = 0
    game = FlappyBird(width=288, height=512, pipe_gap=100)
    env = PLE(game, fps=30, display_screen=True, state_preprocessor=process_state)
    # Only the saved model is ever used, so no fresh network is built first.
    model = load_model("model.h5")
    env.init()
    # Fetch the action set once instead of on every frame.
    actions = env.getActionSet()
    for i in range(game_steps):
        if i == game_steps - 1:
            print("Score: {}".format(final_score))
        if env.game_over():
            print("Final Score: {}".format(final_score))
            time.sleep(1)
            final_score = 0
            env.reset_game()

        observation = env.getGameState()

        # NOTE(review): observation[0] is indexed like a dict here; its exact
        # shape comes from process_state, which is defined elsewhere.
        vector = model.predict(np.matrix(list(observation[0].values())))
        a_star = np.argmax(vector[0])
        print(vector[0][0], vector[0][1], a_star)
        # Short pause after every prediction.
        time.sleep(0.05)
        env_reward = env.act(actions[a_star])
        if env_reward == 1:
            final_score += 1

Пример #4

0

Показать файл

Файл: DeepQLearning.py Проект: mayeroa/RLchallenge

    def play(self, n=1, file_path=None):
        """Load a saved model and let it play ``n`` displayed FlappyBird games.

        Args:
            n: Number of games to play.
            file_path: Path passed to ``load_model``.
        """
        # graphics: "Fancy" gives the full background with random bird and
        # pipe colours; "Fixed" (default) gives a black background and
        # constant bird and pipe colours.
        flappy = FlappyBird(graphics="fixed")

        # force_fps=False lets the agent be watched in real time. That setting
        # is meant for display only, not for learning.
        ple_options = dict(fps=30,
                           frame_skip=1,
                           num_steps=1,
                           force_fps=False,
                           display_screen=True)
        environment = PLE(flappy, **ple_options)

        # Set up the environment (settings, display...).
        environment.init()

        trained_model = load_model(file_path)

        # Play the requested number of games to see how the model behaves.
        for _game in range(n):
            environment.reset_game()
            while True:
                if environment.game_over():
                    break
                features = self.get_game_data(flappy)
                q_values = trained_model.predict(features, batch_size=1)
                best = np.argmax(q_values[0])
                environment.act(self.ACTIONS[best])

Пример #5

0

Показать файл

Файл: FlappyBirdDummyAgent.py Проект: reinai/FlappyBirdAI

def run_a_game(game):
    """Run ``game`` through PLE for NUM_STEPS actions picked by a NaiveAgent."""
    from ple import PLE
    environment = PLE(game, display_screen=True)
    # The agent receives the action set before the environment is initialised.
    naive = NaiveAgent(environment.getActionSet())
    environment.init()
    for _step in range(NUM_STEPS):
        chosen = naive.pick_action()
        environment.act(chosen)

Пример #6

0

Показать файл

Файл: dqn3.py Проект: ninetailskim/FlappyPaddle

def evaluate(agent):
    """Play five displayed episodes with ``agent`` and return the mean reward.

    Every frame is shown in an OpenCV window with the current score drawn on
    it. The environment wraps the module-level ``game`` object.

    Args:
        agent: Object whose ``predict(obs)`` returns an index into the PLE
            action set; ``obs`` is the list of game-state values.

    Returns:
        Mean of the five summed episode rewards.
    """
    env = PLE(game, fps=30, display_screen=True)
    actionset = env.getActionSet()
    # Initialise once; reset_game() is what starts each new episode.
    env.init()
    font = cv2.FONT_HERSHEY_SIMPLEX
    eval_reward = []
    for _ in range(5):
        env.reset_game()
        obs = list(env.getGameState().values())
        episode_reward = 0
        while True:
            # Prediction only: per the original note, the best action is chosen.
            action = agent.predict(obs)
            frame = cv2.transpose(env.getScreenRGB())
            frame = cv2.putText(frame, str(int(env.score())), (0, 25),
                                font, 1.2, (255, 255, 255), 2)
            cv2.imshow("ss", frame)
            cv2.waitKey(10)
            reward = env.act(actionset[action])
            obs = list(env.getGameState().values())
            episode_reward += reward
            if env.game_over():
                break
        eval_reward.append(episode_reward)
        cv2.destroyAllWindows()
    return np.mean(eval_reward)

Пример #7

0

Показать файл

class Env:
    """FlappyBird (pipe gap 110) exposed through reset/step via PLE."""

    def __init__(self):
        bird_game = FlappyBird(pipe_gap=110)
        wrapped = PLE(bird_game, fps=30, display_screen=False)
        wrapped.init()
        # Read the state directly from the game object (maybe not necessary).
        wrapped.getGameState = bird_game.getGameState

        self.game = bird_game
        self.env = wrapped
        # Callers use indices (0, 1) by convention, while the game itself
        # uses (None, 119); this list translates one into the other.
        self.action_map = wrapped.getActionSet()  # [None, 119]

    def step(self, action):
        """Perform the action at index ``action``; return (obs, reward, done)."""
        game_action = self.action_map[action]
        reward = self.env.act(game_action)
        finished = self.env.game_over()
        observation = self.get_observation()
        return observation, reward, finished

    def reset(self):
        """Restart the game and return the observation that follows."""
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        """Return only the values of the descriptive game-state dictionary."""
        values = list(self.env.getGameState().values())
        return np.array(values)

    def set_display(self, boolean_value):
        """Store ``boolean_value`` as the environment's ``display_screen`` flag."""
        self.env.display_screen = boolean_value

Пример #8

0

Показать файл

Файл: touch_enviroment.py Проект: ariel415el/RLplayground

def _test_ple():
    """Manual smoke test: run FlappyBird through PLE with a fixed rhythm.

    Prints the action set, then after every episode the episode reward, the
    total number of steps and the average steps per second. The outer loop
    never ends, so this function does not return.
    """
    from ple.games.flappybird import FlappyBird
    from ple import PLE
    # To run without a display, set os.environ['SDL_VIDEODRIVER'] = 'dummy'
    # before the game is created.
    game = FlappyBird()
    ple_game = PLE(game, fps=30, display_screen=True)
    ple_game.init()
    ALLOWED_ACTIONS = ple_game.getActionSet()

    print(ALLOWED_ACTIONS)
    start = time()
    t = 0
    while True:
        ep_reward = 0
        ple_game.reset_game()
        while not ple_game.game_over():
            sleep(0.1)
            t += 1
            # Action index 0 on one step out of every 15, index 1 otherwise.
            action = 0 if t % 15 == 5 else 1
            ep_reward += ple_game.act(ALLOWED_ACTIONS[action])
        print(ep_reward, t, t / (time() - start))

Пример #9

0

Показать файл

class PLEEnv(Env):
    """``Env`` implementation backed by a PLE instance for the given game."""

    def __init__(self, game, _id, render=True, reset_done=True, num_steps=100):
        super().__init__(_id, render, reset_done)
        self.num_steps = num_steps
        self.game = game
        self.start()

    def start(self):
        """Create and initialise the PLE instance unless one already exists."""
        if self.env_instance:
            return
        self.env_instance = PLE(self.game, fps=30, display_screen=self.render)
        self.env_instance.init()

    def step(self, action):
        """Perform ``action``; return ``(game_state, reward, done)``."""
        ple = self.env_instance
        reward = ple.act(action)
        return ple.getGameState(), reward, ple.game_over()

    def reset(self):
        """Restart the game and return the resulting game state."""
        ple = self.env_instance
        ple.reset_game()
        return ple.getGameState()

    def close(self):
        """Do nothing."""

    def restart(self):
        """Call ``close()`` and then ``reset()``."""
        self.close()
        self.reset()

Пример #10

0

Показать файл

Файл: envs_snake.py Проект: xtwxfxk/left5

class SnakeEnv(object):
    """Snake game driven through PLE with RGB screens as observations."""

    def __init__(self):
        self.game = Snake()
        self.p = PLE(self.game, fps=30, display_screen=True)
        # One-time setup belongs here; episodes restart via reset_game().
        self.p.init()
        self.action_space = self.p.getActionSet()

    def reset(self):
        """Start a new episode and return its first RGB screen."""
        # reset_game() is PLE's per-episode reset. Re-running init() instead
        # repeats the one-time setup and skips PLE's own episode bookkeeping.
        self.p.reset_game()
        # Take one step with no key pressed before reading the screen.
        self.p.act(None)
        return self.p.getScreenRGB()

    def step(self, action):
        """Perform the action at index ``action``.

        Returns:
            Tuple ``(rgb_screen, reward, game_over)``.
        """
        reward = self.p.act(self.action_space[action])
        return self.p.getScreenRGB(), reward, self.p.game_over()

    @property
    def action_space(self):
        """Actions accepted by ``step``, addressed by index."""
        return self._action_space

    @action_space.setter
    def action_space(self, action_space):
        self._action_space = action_space

Пример #11

0

Показать файл

Файл: flappy_agent_huginn.py Проект: Flatkaka/itml-project_2

def run_game(nb_episodes, agent):
    """Run ``nb_episodes`` episodes of the game with ``agent`` picking the moves.

    An episode of FlappyBird ends with the bird crashing into a pipe or going
    off screen; here it is also ended once the score reaches 60.

    Args:
        nb_episodes: Number of episodes to play; must be at least 1.
        agent: Provides ``state_binner(game_state)``, returning a pair whose
            first item is the state, and ``policy(state)``, returning an
            index into the PLE action set.

    Returns:
        Percentage of episodes that scored 50 or more.

    Raises:
        ValueError: If ``nb_episodes`` is smaller than 1.
    """
    if nb_episodes < 1:
        # The summary divides by the episode count, so fail before the game
        # window is opened instead of after the loop.
        raise ValueError("nb_episodes must be at least 1")

    reward_values = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }
    # TODO: when training use the following instead:
    # reward_values = agent.reward_values

    env = PLE(FlappyBird(),
              fps=30,
              display_screen=True,
              force_fps=True,
              rng=None,
              reward_values=reward_values)
    # TODO: to speed up training change parameters of PLE as follows:
    # display_screen=False, force_fps=True
    env.init()
    # Fetch the action set once instead of on every frame.
    actions = env.getActionSet()

    score = 0
    tot_nb_episodes = nb_episodes
    average = 0
    over_50_count = 0
    while nb_episodes > 0:
        # pick an action
        # TODO: for training using agent.training_policy instead
        state, _ = agent.state_binner(env.game.getGameState())
        action = agent.policy(state)

        # step the environment
        reward = env.act(actions[action])

        # TODO: for training let the agent observe the current state transition

        score += reward

        # reset the environment if the game is over
        if env.game_over() or score >= 60:
            average += score
            if score >= 50:
                over_50_count += 1
            print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0
    print("Average for {} runs {:.2f}".format(tot_nb_episodes,
                                              average / tot_nb_episodes))
    over_50_p = (over_50_count / tot_nb_episodes) * 100
    print("The percentage of scores over 50 is: %d" % (over_50_p))
    return over_50_p

Пример #12

0

Показать файл

Файл: ple_env_adapter.py Проект: MatheusZickuhr/neuroevolution-sandbox

class PleEnvAdapter(EnvAdapter):
    """Pygame learning env adapter"""

    def __init__(self, *args, **kwargs):
        super(PleEnvAdapter, self).__init__(*args, **kwargs)

        if not self.render:
            # Select SDL's dummy video driver when nothing is rendered.
            # Assigning to os.environ also updates the process environment,
            # so no separate os.putenv() call is needed.
            os.environ["SDL_VIDEODRIVER"] = "dummy"

        Game = envs_lookup_table[self.env_name]
        self.env = PLE(Game(),
                       display_screen=self.render,
                       force_fps=not self.render)
        self.env.init()

    def get_input_shape(self):
        """Return a one-element tuple holding the number of game-state entries."""
        return (len(self.env.getGameState()), )

    def reset(self):
        """Restart the game."""
        self.env.reset_game()

    def step(self, action) -> (object, float, bool):
        """Perform the action at index ``action``.

        Returns:
            Tuple ``(observation, reward, done)`` where ``observation`` is
            the list of game-state values.
        """
        reward = self.env.act(self.env.getActionSet()[action])
        observation = list(self.env.getGameState().values())
        done = self.env.game_over()
        return observation, reward, done

    def get_n_actions(self) -> int:
        """Return the size of the PLE action set."""
        return len(self.env.getActionSet())

    def get_random_action(self):
        """Return a uniformly random index into the PLE action set."""
        return random.randint(0, len(self.env.getActionSet()) - 1)

Пример #13

0

Показать файл

Файл: flappy2envs.py Проект: lazyprogrammer/machine_learning_examples

class Env:
  """Displayed FlappyBird (pipe gap 125) behind a reset/step interface."""

  def __init__(self):
    flappy_game = FlappyBird(pipe_gap=125)
    ple = PLE(flappy_game, fps=30, display_screen=True)
    ple.init()
    # Take the state directly from the game object (maybe not necessary).
    ple.getGameState = flappy_game.getGameState

    self.game = flappy_game
    self.env = ple
    # Callers pass (0, 1) by convention while the game uses (None, 119);
    # the list below maps the former onto the latter.
    self.action_map = ple.getActionSet() #[None, 119]

  def step(self, action):
    """Perform the action at index ``action``.

    Returns (obs, reward, done); no gym-style info dictionary is returned.
    """
    reward = self.env.act(self.action_map[action])
    is_over = self.env.game_over()
    return self.get_observation(), reward, is_over

  def reset(self):
    """Restart the game and return the observation that follows."""
    self.env.reset_game()
    return self.get_observation()

  def get_observation(self):
    """Return the game-state values (without their describing keys)."""
    state_dict = self.env.getGameState()
    as_list = list(state_dict.values())
    return np.array(as_list)

  def set_display(self, boolean_value):
    """Store ``boolean_value`` as the environment's ``display_screen`` flag."""
    self.env.display_screen = boolean_value

Пример #14

0

Показать файл

Файл: dqn.py Проект: zzz2010/RL_monsterkong

def main():
    """Train a DQN agent on ``GameEnv`` through PLE.

    Restores ``./model_dqn.ckpt`` when present, pre-fills the replay memory,
    then alternates five training episodes with one evaluation, saving a
    checkpoint after each evaluation and the best model separately.
    """
    checkpoint = './model_dqn.ckpt'
    render_bool = True
    if not render_bool:
        os.environ["SDL_VIDEODRIVER"] = "dummy"

    # Create the environment.
    p = PLE(GameEnv(), display_screen=render_bool, fps=60, force_fps=False)
    p.init()

    # Build the agent with the PARL framework.
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    width, height = p.getScreenDims()
    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay pool of the DQN
    obs_dim = (1, width, height)
    model = Model(act_dim=act_dim)
    alg = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    # e_greed: probability of picking a random action (exploration).
    agent = Agent(alg,
                  obs_dim=obs_dim,
                  act_dim=act_dim,
                  e_greed=0.5,
                  e_greed_decrement=0.00001)

    # Load a previously saved model, if there is one.
    best_eval_reward = -1000
    if os.path.exists(checkpoint):
        print("loaded model:", checkpoint)
        agent.restore(checkpoint)
        best_eval_reward = evaluate(p, agent, render=render_bool)

    # Store some data in the replay pool first so that early training does
    # not suffer from too little sample variety.
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(p, agent, rpm)

    # Start training. Evaluation runs do not count towards the episode total.
    max_episode = 200000
    episode = 0
    while episode < max_episode:
        # Training part: five episodes.
        for _ in range(5):
            run_episode(p, agent, rpm)
            episode += 1

        # Test part (render=True shows the game while evaluating).
        eval_reward = evaluate(p, agent, render=render_bool)
        logger.info('episode:{}    e_greed:{}   test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))

        # Save the model to a file, and keep the best one separately.
        agent.save('./model_dqn_%d.ckpt' % rate_num)
        if eval_reward > best_eval_reward:
            best_eval_reward = eval_reward
            agent.save(checkpoint)

Пример #15

0

Показать файл

def train(nb_frames, agent):
    """Train ``agent`` on FlappyBird for ``nb_frames`` frames and plot progress.

    After every 100 finished episodes the summed score divided by 100 is
    printed and recorded together with the frame count; the records are
    plotted as a line chart at the end.

    Args:
        nb_frames: Number of frames to play.
        agent: Provides ``reward_values()``, ``state_binner()``,
            ``training_policy()``, ``observe()`` and ``calculate()``.
    """
    rewards_cfg = agent.reward_values()
    env = PLE(FlappyBird(),
              fps=30,
              display_screen=False,
              force_fps=True,
              rng=None,
              reward_values=rewards_cfg)
    env.init()

    episode_score = 0
    best_score = -5
    score_sum = 0
    averages = []
    frame_marks = []
    finished_episodes = 0
    frames_played = 0
    while frames_played < nb_frames:
        # Choose an action for the current (binned) state.
        current = agent.state_binner(env.game.getGameState())
        action = agent.training_policy(current)

        # Advance the environment by one step.
        reward = env.act(env.getActionSet()[action])

        # Show the agent the transition that just happened.
        following = agent.state_binner(env.game.getGameState())
        agent.observe(current, action, reward, following, env.game_over())

        episode_score += reward
        frames_played += 1
        if not env.game_over():
            continue

        # The game ended: update the statistics and start over.
        finished_episodes += 1
        score_sum += episode_score
        if episode_score > best_score:
            best_score = episode_score
            print(best_score)
            print(finished_episodes)
            print(frames_played)
        if finished_episodes % 100 == 0:
            print(score_sum / 100)
            averages.append(score_sum / 100)
            frame_marks.append(frames_played)
            score_sum = 0

        agent.calculate()
        env.reset_game()
        episode_score = 0

    print(best_score)
    df = pd.DataFrame({"Count": frame_marks, "Avrage": averages})
    sns.relplot(x="Count", y="Avrage", ci=None, kind="line", data=df)

Пример #16

0

Показать файл

Файл: flappy_agent.py Проект: Flatkaka/itml-project_2

def train(nb_episodes, agent):
    """Train ``agent`` on FlappyBird for up to ``nb_episodes`` episodes.

    Training stops early, at the end of an episode, when a new best score
    exceeds 450, when the 100-episode score sum divided by 100 reaches 5, or
    when ``agent.frames`` hit a multiple of 10000 since the last check.

    Args:
        nb_episodes: Maximum number of episodes to play.
        agent: Provides ``reward_values()``, ``state_binner()``,
            ``training_policy()``, ``observe()`` and a ``frames`` counter.

    Returns:
        The best episode score seen.
    """
    rewards_cfg = agent.reward_values()
    env = PLE(FlappyBird(),
              fps=30,
              display_screen=False,
              force_fps=True,
              rng=None,
              reward_values=rewards_cfg)
    env.init()

    episode_score = 0
    best_score = -50000
    score_sum = 0
    stop_requested = False
    while nb_episodes > 0:
        # Choose an action for the current (binned) state.
        current = agent.state_binner(env.game.getGameState())
        action = agent.training_policy(current)

        # Advance the environment by one step.
        reward = env.act(env.getActionSet()[action])

        # Show the agent the transition that just happened.
        following = agent.state_binner(env.game.getGameState())
        agent.observe(current, action, reward, following, env.game_over())
        agent.frames += 1
        episode_score += reward

        if agent.frames % 10000 == 0:
            # Remember the request; it takes effect when the episode ends.
            stop_requested = True
        if not env.game_over():
            continue

        score_sum += episode_score
        if episode_score > best_score:
            best_score = episode_score
            if best_score > 450:
                break
            print(best_score)
            print(nb_episodes)
        if nb_episodes % 100 == 0:
            print(score_sum / 100)
            if score_sum / 100 >= 5:
                break
            score_sum = 0
        if stop_requested:
            break

        env.reset_game()
        nb_episodes -= 1
        episode_score = 0

    return best_score

Пример #17

0

Показать файл

def test_agent(policy, file_writer=None, test_games=10, step=0):
    """Play ``test_games`` FlappyBird games with ``policy`` and collect rewards.

    Args:
        policy: Callable mapping a state to a value; the value 1 selects
            action 119, anything else selects ``None``.
        file_writer: Optional summary writer. When given, each game's reward
            is written under the tag ``test_performance``.
        test_games: Number of games to play.
        step: Step value attached to every written summary.

    Returns:
        List with the summed reward of every game.
    """
    env = PLE(FlappyBird(), fps=30, display_screen=False)
    env.init()

    rewards_per_game = []
    for _game in range(test_games):
        env.reset_game()
        no_op(env)

        total = 0
        while not env.game_over():
            decision = policy(flappy_game_state(env))
            key = 119 if decision == 1 else None
            # Every decision is applied for two consecutive act() calls.
            total += env.act(key)
            total += env.act(key)

        rewards_per_game.append(total)

        if file_writer is None:
            continue
        summary = tf.Summary()
        summary.value.add(tag='test_performance', simple_value=total)
        file_writer.add_summary(summary, step)
        file_writer.flush()

    return rewards_per_game

Пример #18

0

Показать файл

Файл: finding_alpha_gamma.py Проект: Flatkaka/itml-project_2

def train(nb_frames, agent, a, g, results):
    """Train ``agent`` for ``nb_frames`` frames and log averages to ``results``.

    Args:
        nb_frames: Number of frames to play.
        agent: Provides ``reward_values()``, ``state_binner()``,
            ``training_policy()`` and ``observe()``.
        a: Alpha value; printed and stored with every record.
        g: Gamma value; printed and stored with every record.
        results: Sequence of four lists. After every 100 finished episodes
            the score sum divided by 100, the frame count, ``a`` and ``g``
            are appended to lists 0 to 3 respectively.

    Returns:
        The same ``results`` object.
    """
    print("alpha %f" % a)
    print("gamma %f" % g)
    rewards_cfg = agent.reward_values()

    env = PLE(FlappyBird(),
              fps=30,
              display_screen=False,
              force_fps=True,
              rng=None,
              reward_values=rewards_cfg)
    env.init()

    episode_score = 0
    best_score = -5
    score_sum = 0
    frames_played = 0
    finished_episodes = 0
    while frames_played < nb_frames:
        # Choose an action for the current (binned) state.
        current = agent.state_binner(env.game.getGameState())
        action = agent.training_policy(current)

        # Advance the environment by one step.
        reward = env.act(env.getActionSet()[action])

        # Show the agent the transition that just happened.
        following = agent.state_binner(env.game.getGameState())
        agent.observe(current, action, reward, following, env.game_over())

        episode_score += reward
        frames_played += 1
        if not env.game_over():
            continue

        # The game ended: update the statistics and start over.
        finished_episodes += 1
        score_sum += episode_score
        if episode_score > best_score:
            best_score = episode_score
            print(best_score)
            print(finished_episodes)
            print(frames_played)
        if finished_episodes % 100 == 0:
            mean_score = score_sum / 100
            print(mean_score)
            for column, value in zip(results,
                                     (mean_score, frames_played, a, g)):
                column.append(value)
            score_sum = 0

        env.reset_game()
        episode_score = 0

    print(best_score)
    return results

Пример #19

0

Показать файл

Файл: flappy_agent.py Проект: emilnewel/flappy-itml

    def score(self, training=True, nb_episodes=10):
        """Play ``nb_episodes`` games with ``self.policy`` and record the scores.

        An episode ends when the game is over or its score reaches 100.

        Args:
            training: When true, append one summary row to
                ``<self.name>/scores.csv`` (writing the header first if the
                file does not exist). Otherwise append one
                ``name,score`` line per episode to ``scores.txt``.
            nb_episodes: Number of episodes to play.
        """
        env = PLE(FlappyBird(),
                  fps=30,
                  display_screen=False,
                  force_fps=True,
                  rng=None,
                  reward_values={
                      'positive': 1.0,
                      'negative': 0.0,
                      'tick': 0.0,
                      'loss': 0.0,
                      'win': 0.0
                  })
        env.init()

        total_episodes = nb_episodes
        running = 0
        scores = []
        while nb_episodes > 0:
            # Choose an action and advance the environment by one step.
            action = self.policy(env.game.getGameState())
            running += env.act(env.getActionSet()[action])

            # Not finished yet: keep playing this episode.
            if not (env.game_over() or running >= 100):
                continue

            scores.append(running)
            env.reset_game()
            nb_episodes -= 1
            running = 0

        avg_score = sum(scores) / float(len(scores))
        print('Games played: {}'.format(total_episodes))
        print('Average score: {}'.format(avg_score))

        if not training:
            with open('scores.txt', 'a') as out:
                for episode_score in scores:
                    out.write('{},{}\n'.format(self.name, episode_score))
            return

        score_file = '{}/scores.csv'.format(self.name)
        # A file that does not exist yet gets the header first.
        header_needed = not os.path.isfile(score_file)
        with open(score_file, 'a') as out:
            if header_needed:
                out.write('avg_score,episode_count,num_of_frames,min,max\n')
            out.write('{},{},{},{},{}\n'.format(avg_score,
                                                self.num_of_episodes,
                                                self.num_of_frames,
                                                min(scores), max(scores)))

Пример #20

0

Показать файл

Файл: games_wrapper.py Проект: iordachelivia/A3C

 def set_maze_game_setup(self, game):
     '''
     Wrap ``game`` in a displayed PLE environment, store its action set
     in ``self.actions``, initialise the environment and return it.

     @game : game instance
     '''
     environment = PLE(game, display_screen=True)
     # The action set is read before the environment is initialised.
     self.actions = environment.getActionSet()
     environment.init()
     return environment

Пример #21

0

Показать файл

Файл: test_ple.py Проект: ntasfi/PyGame-Learning-Environment

 def run_a_game(self, game):
     """Run ``game`` through PLE for NUM_STEPS actions picked by a NaiveAgent."""
     from ple import PLE
     environment = PLE(game, display_screen=True)
     naive = NaiveAgent(environment.getActionSet())
     environment.init()
     # A first no-op step supplies the reward passed to the agent's first pick.
     last_reward = environment.act(environment.NOOP)
     for _step in range(NUM_STEPS):
         screen = environment.getScreenRGB()
         chosen = naive.pickAction(last_reward, screen)
         last_reward = environment.act(chosen)

Пример #22

0

Показать файл

Файл: test_ple.py Проект: thimabru1010/Flappy-Bird-Double-DQN-Pytorch

 def run_a_game(self, game):
     """Play ``game`` for NUM_STEPS steps, letting a NaiveAgent choose actions."""
     from ple import PLE
     ple_env = PLE(game, display_screen=True)
     player = NaiveAgent(ple_env.getActionSet())
     ple_env.init()
     # Start with a no-op so the agent has a reward for its first decision.
     previous_reward = ple_env.act(ple_env.NOOP)
     for _ in range(NUM_STEPS):
         frame = ple_env.getScreenRGB()
         previous_reward = ple_env.act(player.pickAction(previous_reward, frame))

Пример #23

0

Показать файл

def test_movement_up():
    """Check that the "up" action in Pong lowers ``player_velocity``."""
    pong = Pong()
    env = PLE(pong, display_screen=True, fps=20, force_fps=1)
    env.init()
    # Pause for half a second before taking the first reading.
    time.sleep(.5)
    before = env.getGameState()
    env.act(pong.actions["up"])
    after = env.getGameState()
    assert before["player_velocity"] > after["player_velocity"]

Пример #24

0

Показать файл

Файл: Game.py Проект: nishantsinha15/Flappy-Bird-DDQN

def _state_vector(game):
    """Return ``game``'s current state values reshaped to a (1, 8) array."""
    return np.reshape(np.asarray(list(game.getGameState().values())), [1, 8])


def train(FRAME_TRAIN=1000005):
    """Continue training the DDQN agent on FlappyBird for ``FRAME_TRAIN`` frames.

    The agent starts from the weights stored as ``model95000``. Every frame
    one transition is remembered and, once the memory holds more than a
    batch, replayed. Every 5000 frames the model is saved and copied into
    the target model; every 1000 frames the episode rewards are plotted.

    Args:
        FRAME_TRAIN: Number of frames to play.
    """
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    state = _state_vector(game)
    total_reward = 0
    agent = DDQN_Agent.DeepQAgent()
    agent.load('model95000')
    batch_size = 32
    my_timer = time.time()
    prev_frame = 0
    data = []
    for i in range(FRAME_TRAIN):
        if p.game_over():
            data.append(total_reward)
            p.reset_game()
            # Re-read the state: the one carried over belongs to the game
            # that just ended and must not start the next transition.
            state = _state_vector(game)
            print(
                "Total reward = {}, Frame = {}, epsilon = {}, frame/second = {}"
                .format(total_reward, i, agent.epsilon,
                        (i - prev_frame) / (time.time() - my_timer)))
            total_reward = 0
            prev_frame = i
            my_timer = time.time()

        # get action from agent
        action = agent.act(state)

        # take action
        reward = p.act(p.getActionSet()[action])

        # making the reward space less sparse
        if reward < 0:
            reward = -1

        total_reward += reward
        next_state = _state_vector(game)

        # remember and replay
        agent.remember(state, action, reward, next_state, p.game_over())
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        state = next_state

        # save the model and refresh the target network
        if i % 5000 == 0:
            print("Updating weights")
            agent.save('newmodel' + str(i))
            agent.target_model.set_weights(agent.model.get_weights())

        # plot score
        if i % 1000 == 0:
            plot(data)

Пример #25

0

Показать файл

Файл: model2.py Проект: ashutosh-dwivedi-e3502/deepq_flappy

class GameEnv(object):
    """FlappyBird environment whose state is a stack of four processed frames."""

    def __init__(self, display_screen):
        self.width = IMAGE_WIDTH
        self.height = IMAGE_HEIGHT

        # Number of frames processed so far.
        self.count = 0
        self.p = PLE(FlappyBird(), fps=30, display_screen=display_screen)
        self.p.init()
        self._update_state()
        self.score = 0

    def pre_process_image(self, image):
        """Convert an RGB screen into one grayscale frame scaled to [0, 1].

        Returns:
            Float array of shape ``(1, self.width, self.height, 1)``.
        """
        self.count += 1
        image = color.rgb2gray(image)
        image = transform.resize(image, (self.width, self.height))
        image = exposure.rescale_intensity(image, out_range=(0, 255))
        image = image.astype('float')
        image = image / 255.0
        return image.reshape(1, self.width, self.height, 1)

    def _update_state(self):
        """Push the current screen onto the four-frame state stack."""
        image = self.pre_process_image(self.p.getScreenRGB())
        previous = getattr(self, 'state', None)
        if previous is None:
            # First frame: fill the whole stack with it.
            self.state = np.concatenate([image] * 4, axis=3)
        else:
            # Newest frame goes first and the oldest one is dropped. A new
            # array is built so states already handed out stay unchanged.
            self.state = np.concatenate([image, previous[:, :, :, :3]],
                                        axis=3)

    def get_state(self):
        """Return the current frame stack."""
        return self.state

    def step(self, action):
        """Apply ``action`` (1 sends key 119, anything else sends no key).

        Returns:
            Tuple ``(state, reward, done, score)``. ``reward`` is -1 when
            the game ended on this step and 0.1 otherwise; ``score`` is the
            running score including this reward. The game is restarted and
            the running score cleared when ``done`` is true.
        """
        if action == 1:
            self.p.act(119)
        else:
            self.p.act(None)

        self._update_state()

        done = False
        if self.p.game_over():
            done = True
            self.p.reset_game()
            reward = -1
        else:
            reward = 0.1

        return_score = self.score + reward
        self.score = 0 if done else self.score + reward

        return self.state, reward, done, return_score

    def get_score(self):
        """Return the running score of the current game."""
        return self.score

Пример #26

0

Показать файл

class PLEEnv(gym.Env):
    """Gym-style wrapper around a PLE ``Catcher`` game with frame stacking.

    Observations are the last ``k`` grayscale screens concatenated along the
    channel axis, i.e. arrays of shape ``(screen_wh, screen_wh, k)``.
    """

    def __init__(self, env_config):
        """Build the PLE instance, the spaces and the frame buffer.

        ``env_config`` is accepted for interface compatibility and is not
        read here.
        """
        # force_fps=False runs the game at the slower, real-time speed.
        self.env = PLE(Catcher(width=screen_wh, height=screen_wh),
                       fps=30,
                       frame_skip=2,
                       num_steps=2,
                       force_fps=False,
                       display_screen=True)
        self.env.init()
        # Discrete action index -> value handed to PLE.act
        # (97 and 100 are the character codes of 'a' and 'd').
        self.action_dict = {0: None, 1: 97, 2: 100}
        # PLE starts with a black screen, so take one no-op step first.
        self.env.act(self.env.NOOP)

        self.action_space = Discrete(3)
        self.k = 4
        stacked_shape = (screen_wh, screen_wh, 1 * self.k)
        self.observation_space = spaces.Box(low=0, high=255,
                                            shape=stacked_shape)
        self.frames = deque([], maxlen=self.k)

    def _grab_frame(self):
        """Return the current grayscale screen with a trailing channel axis."""
        return np.reshape(self.env.getScreenGrayscale(),
                          (screen_wh, screen_wh, 1))

    def reset(self):
        """Restart the game and return an observation of k identical frames."""
        self.env.reset_game()
        # The screen is black right after a reset; a no-op step renders it.
        self.env.act(self.env.NOOP)
        first_frame = self._grab_frame()
        self.frames.extend([first_frame] * self.k)
        return self._get_ob()

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``."""
        reward = self.env.act(self.action_dict[action])
        self.frames.append(self._grab_frame())
        done = self.env.game_over()
        return self._get_ob(), reward, done, {}

    def _get_ob(self):
        """Concatenate the buffered frames along the channel axis."""
        assert len(self.frames) == self.k
        return np.concatenate(self.frames, axis=2)

Пример #27

0

Показать файл

Файл: run_game.py Проект: Rokicki-203980/Study

def show_playing(episodes, agent):
    """Run ``agent`` for ``episodes`` episodes on a visible FlappyBird game.

    The screen is displayed and the frame rate is not forced. Returns
    whatever ``run_game`` returns.
    """
    ple_options = dict(fps=30,
                       display_screen=True,
                       force_fps=False,
                       rng=None,
                       reward_values=reward_values)
    env = PLE(FlappyBird(), **ple_options)
    env.init()

    outcome = run_game(episodes, agent, env, False)
    return outcome

Пример #28

0

Показать файл

Файл: games_wrapper.py Проект: iordachelivia/A3C

 def set_maze_game_setup(self, game):
     """Wrap ``game`` in a PLE instance without display and store its actions.

     @game : game instance

     Returns the initialized PLE instance.
     """
     env = PLE(game, display_screen=False)
     # Doing nothing is a valid action in some games but not in a maze, so
     # the last entry of the action set is dropped (presumably the no-op).
     available = env.getActionSet()
     self.actions = available[:-1]
     env.init()
     return env

Пример #29

0

Показать файл

Файл: flappy_agent.py Проект: Miscon/Flappy

    def train(self):
        """Train the agent on FlappyBird until ``frame_count`` exceeds 1,000,000.

        A daemon thread running ``self.draw_plots`` is started first. Every
        time ``frame_count`` is a multiple of 25000 the progress is printed,
        ``self.score()`` is called and the agent is pickled to
        ``<self.name>/agent.pkl``.
        """
        if not os.path.exists(self.name):
            os.mkdir(self.name)

        plotter = threading.Thread(target=self.draw_plots)
        plotter.daemon = True
        plotter.start()

        reward_values = self.reward_values()
        game = FlappyBird()
        env = PLE(game,
                  fps=30,
                  display_screen=False,
                  force_fps=True,
                  rng=None,
                  reward_values=reward_values)
        env.init()

        # NOTE(review): score is never incremented in this method, so the
        # "100 pipes" stop condition below cannot trigger -- confirm intent.
        score = 0
        while self.frame_count <= 1000000:
            # Let the training policy choose an action for the current state.
            before = env.game.getGameState()
            action = self.training_policy(before)

            # Apply the action and look at the resulting state.
            reward = env.act(env.getActionSet()[action])
            after = env.game.getGameState()

            # Stop after reaching 100 pipes
            episode_over = env.game_over() or score >= 100
            self.observe(before, action, reward, after, episode_over)

            if episode_over:
                env.reset_game()
                score = 0

            if self.frame_count % 25000 != 0:
                continue

            # Periodic report and checkpoint.
            separator = "=========================="
            print(separator)

            print("episodes done: {}".format(self.episode_count))
            print("frames done: {}".format(self.frame_count))

            self.score()

            checkpoint_path = "{}/agent.pkl".format(self.name)
            with open(checkpoint_path, "wb") as handle:
                pickle.dump(self, handle, pickle.HIGHEST_PROTOCOL)

            print(separator)

Пример #30

0

Показать файл

def test():
    """Play 200 Snake games with a loaded agent and return the per-game totals.

    The agent hyper-parameters come from the command line:
    ``argv[1]`` alpha, ``argv[2]`` gamma, ``argv[3]`` file name,
    ``argv[4]`` and ``argv[5]`` activations.

    Returns:
        A list with one entry per game: the sum of all rewards above 50.0
        received during that game.
    """
    reward_values = {
        "positive": 100.0,
        "negative": -50.0,
        "tick": -0.1,
        "loss": -70.0,
        "win": 5.0
    }
    env = PLE(Snake(600, 600),
              fps=60,
              state_preprocessor=process_state,
              force_fps=True,
              display_screen=True,
              frame_skip=2,
              reward_values=reward_values)
    agent = Agent(alpha=float(sys.argv[1]),
                  gamma=float(sys.argv[2]),
                  n_actions=3,
                  epsilon=0.01,
                  batch_size=100,
                  input_shape=6,
                  epsilon_dec=0.99999,
                  epsilon_end=0.001,
                  memory_size=500000,
                  file_name=sys.argv[3],
                  activations=[str(sys.argv[4]),
                               str(sys.argv[5])])
    env.init()
    agent.load_game()

    scores = []
    for _ in range(200):
        if env.game_over():
            env.reset_game()

        apples = 0
        heading = "Right"
        while not env.game_over():
            raw_state = list(env.getGameState()[0])
            observation = np.array(vision(raw_state, heading))

            choice = agent.choose_action(observation)
            # (key, new heading) pairs reachable from the current heading.
            candidates = list(prepare_corect_directions(heading).items())
            key, heading = candidates[choice]

            reward = env.act(key)
            # Only rewards above 50.0 are added to the game's total.
            if reward > 50.0:
                apples += reward

        scores.append(apples)
    return scores

Пример #31

0

Показать файл

class Game(gym.Env):
    """Gym-style FlappyBird environment with a compact string state."""

    def __init__(self, display_screen=False, force_fps=True):
        """Create the PLE FlappyBird game and expose its action set."""
        # Select SDL's "dummy" video driver before the game is created.
        os.environ["SDL_VIDEODRIVER"] = "dummy"
        self.env = PLE(FlappyBird(),
                       fps=30,
                       display_screen=display_screen,
                       force_fps=force_fps)
        self.env.init()
        # Actions offered by the game and the matching discrete space.
        self.actions = self.env.getActionSet()
        self.action_space = spaces.Discrete(len(self.actions))

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, terminal, info)``.

        The reward reported by PLE is discarded: the returned reward is
        -1000 when the game is over and 1 otherwise.
        """
        self.env.act(self.actions[action])
        observation = self.getGameState()
        terminal = self.env.game_over()
        reward = -1000 if terminal else 1
        return observation, reward, terminal, {}

    def getGameState(self):
        """Return the state as the string ``"<vel> <h_dist> <v_dist>"``.

        ``vel`` is the player velocity, ``h_dist`` the distance to the next
        pipe and ``v_dist`` the next pipe's bottom y minus the player's y.
        """
        raw = self.env.getGameState()
        vel = raw['player_vel']
        h_dist = raw['next_pipe_dist_to_player']
        v_dist = raw['next_pipe_bottom_y'] - raw['player_y']
        return ' '.join(map(str, (vel, h_dist, v_dist)))

    def reset(self):
        """Restart the game and return the state reported by PLE.

        NOTE(review): this returns the raw PLE state, whereas ``step``
        returns the string built by ``getGameState`` -- confirm intended.
        """
        self.env.reset_game()
        return self.env.getGameState()

    def seed(self, seed):
        """Install a seeded ``RandomState`` on PLE and the game, then re-init."""
        self.env.rng = np.random.RandomState(seed)
        self.env.game.rng = self.env.rng

        self.env.init()

Пример #32

0

Показать файл

def main(train=False):
    """Play Pong rounds between the agent and the CPU and print a summary.

    Args:
        train: when true the screen is hidden and the frame rate is forced;
            otherwise the game is displayed without forcing the frame rate.

    The loop ends once either side has won ``NUM_ROUNDS`` rounds.
    """
    # Don't modify anything in this function.
    # See the constants defined at the top of this file if you'd like to
    # change the FPS, screen size, or round length
    game = Pong(width=WIDTH, height=HEIGHT, MAX_SCORE=MAX_SCORE)

    if train:
        p = PLE(game, fps=FPS, display_screen=False, force_fps=True)
    else:
        p = PLE(game, fps=FPS, display_screen=True, force_fps=False)

    p.init()

    # Rounds won, points scored and frames played so far.
    agent_rounds = 0
    cpu_rounds = 0
    agent_score = 0
    cpu_score = 0
    num_frames = 0
    while True:
        if p.game_over():
            # A round just ended: the side with the higher score count wins
            # it (a tie counts for the CPU).
            if game.score_counts['agent'] > game.score_counts['cpu']:
                agent_rounds += 1
                print('AGENT won round')
            else:
                cpu_rounds += 1
                print('CPU won round')

            if agent_rounds == NUM_ROUNDS or cpu_rounds == NUM_ROUNDS:
                break

            p.reset_game()

        # One frame: observe, let the agent choose, apply the action.
        obs = p.getGameState()
        action = agent(normalize(obs))
        reward = p.act(ACTION_MAP[action])

        # A positive reward is counted as an agent point, a negative one as
        # a CPU point.
        if reward > 0:
            agent_score += 1
            print('AGENT scored')
        elif reward < 0:
            cpu_score += 1
            print('CPU scored')

        num_frames += 1

    winner = 'AGENT' if agent_rounds > cpu_rounds else 'CPU'
    print('Winner:', winner)
    print('Num frames      :', num_frames)
    print('AGENT rounds won:',agent_rounds)
    print('CPU   rounds won:',cpu_rounds)
    print('AGENT total score:',agent_score)
    print('CPU   total score:',cpu_score)

Пример #33

0

Показать файл

Файл: minion_learning.py Проект: SiyuanQi/RunningMinion

def main_naive():
    """Run a ``naive.NaiveAgent`` on a displayed FlappyBird for 10000 frames.

    The game is reset whenever it is over; each frame the agent receives the
    previous reward and the current RGB screen.
    """
    env = PLE(FlappyBird(), fps=30, display_screen=True)
    agent = naive.NaiveAgent(allowed_actions=env.getActionSet())

    env.init()
    last_reward = 0.0
    frame_budget = 10000

    for _ in range(frame_budget):
        if env.game_over():
            env.reset_game()

        screen = env.getScreenRGB()
        chosen = agent.pickAction(last_reward, screen)
        last_reward = env.act(chosen)

Пример #34

0

Показать файл

Файл: minion_learning.py Проект: SiyuanQi/RunningMinion

def play_with_saved_agent(agent_file_path, agent_file_name, test_rounds=20):
    """Evaluate a saved agent on RunningMinion and return its mean reward.

    Args:
        agent_file_path: location handed to ``load_agent``.
        agent_file_name: file name handed to ``load_agent``.
        test_rounds: number of episodes to play.

    Returns:
        The total reward over all episodes divided by ``test_rounds``.
    """
    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True, state_preprocessor=process_state)
    my_agent = load_agent(env, agent_file_path, agent_file_name)
    env.init()

    # Single-argument print calls produce the same text on Python 2 and 3.
    print("Testing model: {}".format(agent_file_name))

    total_reward = 0.0
    for _ in range(test_rounds):
        my_agent.start_episode()
        episode_reward = 0.0
        # The environment is never stepped directly in this loop;
        # presumably my_agent.act does it -- verify against the agent class.
        while not env.game_over():
            state = env.getGameState()
            # epsilon=0.00 is passed so that no random actions are requested.
            reward, action = my_agent.act(state, epsilon=0.00)
            episode_reward += reward

        print("Agent score {:0.1f} reward for episode.".format(episode_reward))
        total_reward += episode_reward
        my_agent.end_episode()

    return total_reward / test_rounds

Пример #35

0

Показать файл

Файл: PLE_env.py Проект: halofanx/deer

class MyEnv(Environment):
    """Environment adapter that drives a PLE game.

    The latest grayscale screen is resized to 48x48 after every reset and
    every action; ``observe`` returns that reduced screen.
    """

    VALIDATION_MODE = 0

    def __init__(self, rng, game=None, frame_skip=4, ple_options=None):
        """Wrap ``game`` in a PLE instance and allocate the screen buffers.

        Args:
            rng: random state; its ``randint`` picks the number of no-op
                steps performed after each reset.
            game: the PLE game to play (required).
            frame_skip: how many times each action is repeated; values
                below 1 are treated as 1.
            ple_options: keyword arguments for ``PLE``. ``None`` selects
                ``{"display_screen": True, "force_fps": True, "fps": 30}``.

        Raises:
            ValueError: if ``game`` is None.
        """
        # A None default avoids sharing one mutable dict between all calls.
        if ple_options is None:
            ple_options = {"display_screen": True, "force_fps": True, "fps": 30}

        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0

        self._frameSkip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng

        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((h, w), dtype=np.uint8)
        self._reducedScreen = np.empty((48, 48), dtype=np.uint8)
        self._actions = self._ple.getActionSet()

    def reset(self, mode):
        """Start a new episode in ``mode`` and return an all-zero state.

        Entering validation mode clears the validation score and episode
        counter; each further validation reset increments the counter.
        """
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1:  # and thus mode == -1
            self._mode = -1

        self._ple.reset_game()
        # Perform a random number (0..14) of no-op steps after the reset.
        for _ in range(self._random_state.randint(15)):
            self._ple.act(self._ple.NOOP)
        self._screen = self._ple.getScreenGrayscale()
        # The reduced screen buffer is passed as cv2.resize's dst argument.
        cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST)

        return [4 * [48 * [48 * [0]]]]

    def act(self, action):
        """Repeat ``action`` up to ``frame_skip`` times; return the reward sign.

        Repetition stops early when the game is over. The unclipped summed
        reward is added to the mode score.
        """
        action = self._actions[action]

        reward = 0
        for _ in range(self._frameSkip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break

        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST)

        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        """Print the mean score per episode accumulated in the current mode."""
        if not self.inTerminalState():
            # The running episode has not been counted by reset() yet.
            self._mode_episode_count += 1
        episodes = self._mode_episode_count
        # With zero counted episodes the mean is undefined; report NaN
        # instead of raising ZeroDivisionError.
        mean_score = self._mode_score / episodes if episodes else float("nan")
        print("== Mean score per episode is {} over {} episodes ==".format(mean_score, episodes))

    def inputDimensions(self):
        """Return the input dimensions: one input of shape (4, 48, 48)."""
        return [(4, 48, 48)]

    def observationType(self, subject):
        """Return the observation dtype (``np.uint8``)."""
        return np.uint8

    def nActions(self):
        """Return the number of actions in the PLE action set."""
        return len(self._actions)

    def observe(self):
        """Return a one-element list holding a copy of the reduced screen."""
        return [np.array(self._reducedScreen)]

    def inTerminalState(self):
        """Return whether the PLE game is over."""
        return self._ple.game_over()

Пример #36

0

Показать файл

Файл: launcher.py Проект: ntasfi/deep_q_rl

def launch(args, defaults, description):
    """
    Execute a complete training run.

    Parses the parameters, instantiates the requested PLE game, builds (or
    unpickles) the Q-network, wires up the agent and the experiment, and
    runs the experiment.

    Raises:
        ValueError: if the game module or class cannot be loaded or
            instantiated.
    """

    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)

    rewards = {}

    try:
        module = importlib.import_module("ple.games.%s" % parameters.game.lower())
        game = getattr(module, parameters.game)
        if parameters.game == "FlappyBird":
            game = game()
        elif parameters.game == "WaterWorld":
            game = game(width=84, height=84, num_creeps=6)
        else:
            game = game(width=84, height=84)
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit are not turned into a misleading ValueError.
        raise ValueError("The game %s could not be found. Try using the classname, it is case sensitive." % parameters.game)

    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    env = PLE(
            game,
            fps=60,
            force_fps=parameters.force_fps,
            display_screen=parameters.display_screen,
            reward_values=rewards,
            rng=rng
    )

    num_actions = len(env.getActionSet())

    if parameters.nn_file is None:
        network = q_network.DeepQLearner(defaults.RESIZED_WIDTH,
                                         defaults.RESIZED_HEIGHT,
                                         num_actions,
                                         parameters.phi_length,
                                         parameters.discount,
                                         parameters.learning_rate,
                                         parameters.rms_decay,
                                         parameters.rms_epsilon,
                                         parameters.momentum,
                                         parameters.clip_delta,
                                         parameters.freeze_interval,
                                         parameters.batch_size,
                                         parameters.network_type,
                                         parameters.update_rule,
                                         parameters.batch_accumulator,
                                         rng)
    else:
        # Pickles are binary data: open in 'rb' and close the file promptly.
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(parameters.nn_file, 'rb') as handle:
            network = cPickle.load(handle)

    agent = ple_agent.NeuralAgent(network,
                                  parameters.epsilon_start,
                                  parameters.epsilon_min,
                                  parameters.epsilon_decay,
                                  parameters.replay_memory_size,
                                  parameters.experiment_prefix,
                                  parameters.replay_start_size,
                                  parameters.update_frequency,
                                  rng)

    experiment = ple_experiment.PLEExperiment(env, agent,
                                              defaults.RESIZED_WIDTH,
                                              defaults.RESIZED_HEIGHT,
                                              parameters.resize_method,
                                              parameters.epochs,
                                              parameters.steps_per_epoch,
                                              parameters.steps_per_test,
                                              parameters.frame_skip,
                                              parameters.death_ends_episode,
                                              parameters.max_start_nullops,
                                              rng)

    env.init()
    experiment.run()

Пример #37

0

Показать файл

Файл: example_doom.py Проект: ntasfi/PyGame-Learning-Environment

class NaiveAgent():
    """Baseline agent that picks one of its actions uniformly at random."""

    def __init__(self, actions):
        # Actions the agent may choose from.
        self.actions = actions

    def pickAction(self, reward, obs):
        """Return a random action; ``reward`` and ``obs`` are not used."""
        choice = np.random.randint(0, len(self.actions))
        return self.actions[choice]

###################################
game = Doom(scenario="take_cover")

env = PLE(game)
agent = NaiveAgent(env.getActionSet())
env.init()

reward = 0.0
for f in range(15000):
	#if the game is over
        if env.game_over():
            env.reset_game()
            
        action = agent.pickAction(reward, env.getScreenRGB())
        reward = env.act(action)

        if f > 2000:
            env.display_screen = True 
            env.force_fps = False
        
        if f > 2250:

Пример #38

0

Показать файл

Файл: minion_learning.py Проект: SiyuanQi/RunningMinion

def agent_training(agent_file_path, agent_file_name, fig_path, num_steps_train_total=5000):
    """Train an agent on RunningMinion, test it after every epoch, and save it.

    Training is split into 5 epochs. Each epoch trains for
    ``num_steps_train_total / 5`` steps and is followed by a 100-step test
    phase. Progress is logged to ``../learning.log``; two reward curves are
    plotted into ``fig_path`` and the agent is saved at the end.

    Args:
        agent_file_path: location handed to ``save_agent``.
        agent_file_name: file name handed to ``save_agent``.
        fig_path: location handed to ``plot_figure``.
        num_steps_train_total: total number of training steps over all epochs.
    """
    # training parameters
    num_epochs = 5
    num_steps_train_epoch = num_steps_train_total/num_epochs  # steps per epoch of training
    num_steps_test = 100
    update_frequency = 10  # step frequency of model training/updates

    epsilon = 0.15  # percentage of time we perform a random action, help exploration.
    epsilon_steps = 1000  # decay steps
    epsilon_min = 0.1
    epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

    # memory settings
    max_memory_size = 10000
    min_memory_size = 60  # number needed before model training starts

    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True, state_preprocessor=process_state)
    my_agent = init_agent(env)

    memory = utils.ReplayMemory(max_memory_size, min_memory_size)
    env.init()

    # Logging configuration and figure plotting
    logging.basicConfig(filename='../learning.log', filemode='w',
                        level=logging.DEBUG, format='%(levelname)s:%(message)s')
    logging.info('========================================================')
    logging.info('Training started for total training steps: '+str(num_steps_train_total)+'.\n')
    learning_rewards = [0]
    testing_rewards = [0]

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False

        # training loop
        while steps < num_steps_train_epoch:
            episode_reward = 0.0
            my_agent.start_episode()

            # The environment is not stepped directly here; presumably
            # my_agent.act does it -- verify against the agent class.
            while not env.game_over() and steps < num_steps_train_epoch:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=epsilon)
                memory.add([state, action, reward, env.game_over()])

                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(my_agent)

                    if loss is not None:
                        losses.append(loss)
                        # Builtin max of the two scalars. np.max would take
                        # the second argument as `axis` and raise.
                        epsilon = max(epsilon_min, epsilon - epsilon_rate)

                episode_reward += reward
                steps += 1

            # Episodes cut short by the step budget are not recorded here.
            if steps < num_steps_train_epoch:
                learning_rewards.append(episode_reward)

            if num_episodes % 5 == 0:
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Train Epoch {:02d}: Epsilon {:0.4f} | Avg. Loss {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, epsilon, np.mean(losses), np.sum(rewards) / num_episodes))

        steps, num_episodes = 0, 0
        losses, rewards = [], []

        # testing loop
        while steps < num_steps_test:
            episode_reward = 0.0
            my_agent.start_episode()

            while not env.game_over() and steps < num_steps_test:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=0.05)

                episode_reward += reward
                # Running cumulative reward over all test steps.
                testing_rewards.append(testing_rewards[-1]+reward)
                steps += 1

                # done watching after 500 steps.
                if steps > 500:
                    env.display_screen = False

            if num_episodes % 5 == 0:
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            if steps < num_steps_test:
                testing_rewards.append(episode_reward)

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Test Epoch {:02d}: Best Reward {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, np.max(rewards), np.sum(rewards) / num_episodes))

    logging.info("Training complete.\n\n")
    plot_figure(fig_path, learning_rewards, 'reward', 'reward_in_training', num_steps_train_total)
    plot_figure(fig_path, testing_rewards, 'reward', 'reward_in_testing', num_steps_train_total)

    save_agent(my_agent, agent_file_path, agent_file_name)

Пример #39

0

Показать файл

Файл: run.py Проект: williamjussiau/RLchallenge

# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

# Evaluation script: play nb_games FlappyBird games with FlappyPolicy and
# compute the average and the best cumulated reward.
game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 100
# cumulated[i] collects the sum of the rewards received during game i.
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()

    # Play one game until it is over.
    while(not p.game_over()):
        # The policy receives both the game state and the RGB screen.
        state = game.getGameState()
        screen = p.getScreenRGB()
        action=FlappyPolicy(state, screen) ### Your job is to define this function.

        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

# Summary statistics over all games.
average_score = np.mean(cumulated)
max_score = np.max(cumulated)

Пример #40

0

Показать файл

Файл: deep_q_network.py Проект: nishithbsk/MultiTaskRL

def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.mul(readout, a), reduction_indices = 1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    #setupGame()
    gameClass = FlappyBird(width=288, height=512, pipe_gap=100)
    
    fps = 30
    frame_skip = 2
    num_steps = 1
    force_fps = False
    display_screen = True
    reward = 0.0
    nb_frames = 15000

    game = PLE(gameClass, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
            force_fps=force_fps, display_screen=display_screen)

    game.init()

    # store the previous observations in replay memory
    D = deque()

    # printing
    logdir = "logs_" + GAME
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    a_file = open(logdir + "/readout.txt", 'w')
    h_file = open(logdir + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    r_0 = game.act(game.NOOP)
    x_t = game.getScreenGrayscale()
    terminal = game.game_over()
    if terminal:
        print "NOOOO"
        game.reset_game()
    
    x_t = cv2.resize(x_t, (80, 80))
    ret, x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis = 2)

    # saving and loading networks
    #saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    '''
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print "Successfully loaded:", checkpoint.model_checkpoint_path
    else:
        print "Could not find old network weights"
    '''
    epsilon = INITIAL_EPSILON
    t = 0
    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict = {s : [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[random.randrange(ACTIONS)] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            r_t = game.act(np.argmax(a_t))
            x_t1 = game.getScreenGrayscale()
            terminal = game.game_over()
            if terminal:
                print "NOOO2"
                game.reset_game()

            x_t1 = cv2.resize(x_t1, (80, 80))
            ret, x_t1 = cv2.threshold(x_t1,1,255,cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal only equals reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict = {
                y : y_batch,
                a : a_batch,
                s : s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t)

        # write info to files
        '''

Python PLE.init примеры использования