Example #1
def _test_ple():
    from time import sleep, time

    from ple.games.pong import Pong
    from ple.games.flappybird import FlappyBird
    from ple import PLE

    # os.environ['SDL_VIDEODRIVER'] = 'dummy'
    # game = Pong()  # either game works with the loop below
    game = FlappyBird()
    ple_game = PLE(game, fps=30, display_screen=True)
    ple_game.init()
    ALLOWED_ACTIONS = ple_game.getActionSet()

    print(ALLOWED_ACTIONS)
    action = 0
    start = time()
    t = 0
    while True:
        ep_reward = 0
        ple_game.reset_game()
        while not ple_game.game_over():
            sleep(0.1)
            t += 1
            if t % 15 == 5:
                action = 0
            else:
                action = 1
            reward = ple_game.act(ALLOWED_ACTIONS[action])
            # print(reward)
            ep_reward += reward
        print(ep_reward, t, t / (time() - start))
Example #2
import time

from ple import PLE
from ple.games.pong import Pong


def test_movement_up():
    game = Pong()
    p = PLE(game, display_screen=True, fps=20, force_fps=1)
    p.init()
    time.sleep(.5)
    oldState = p.getGameState()
    p.act(game.actions["up"])
    newState = p.getGameState()
    # Moving up decreases the paddle's velocity value in Pong's coordinates.
    assert oldState["player_velocity"] > newState["player_velocity"]
Example #3
def main(train=False):
    # Don't modify anything in this function.
    # See the constants defined at the top of this file if you'd like to
    # change the FPS, screen size, or round length
    game = Pong(width=WIDTH, height=HEIGHT, MAX_SCORE=MAX_SCORE)

    if train:
        p = PLE(game, fps=FPS, display_screen=False, force_fps=True)
    else:
        p = PLE(game, fps=FPS, display_screen=True, force_fps=False)

    p.init()

    agent_rounds = 0
    cpu_rounds = 0
    agent_score = 0
    cpu_score = 0
    num_frames = 0
    while True:
        if p.game_over():
            if game.score_counts['agent'] > game.score_counts['cpu']:
                agent_rounds += 1
                print('AGENT won round')
            else:
                cpu_rounds += 1
                print('CPU won round')

            if agent_rounds == NUM_ROUNDS or cpu_rounds == NUM_ROUNDS:
                break

            p.reset_game()

        obs = p.getGameState()
        action = agent(normalize(obs))
        reward = p.act(ACTION_MAP[action])

        if reward > 0:
            agent_score += 1
            print('AGENT scored')
        elif reward < 0:
            cpu_score += 1
            print('CPU scored')

        num_frames += 1

    winner = 'AGENT' if agent_rounds > cpu_rounds else 'CPU'
    print('Winner:', winner)
    print('Num frames      :', num_frames)
    print('AGENT rounds won:', agent_rounds)
    print('CPU   rounds won:', cpu_rounds)
    print('AGENT total score:', agent_score)
    print('CPU   total score:', cpu_score)
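main() above relies on an agent, a normalize helper, and an ACTION_MAP that are defined elsewhere in the file. The sketch below shows one plausible shape for the latter two, assuming ACTION_MAP maps discrete indices onto Pong's up/down keys and that normalize scales the getGameState dictionary by the screen size; the key names follow PLE's Pong state, and the real project may differ.

import numpy as np
from pygame.constants import K_DOWN, K_UP

# Hypothetical stand-ins for helpers assumed by main(); not from the original file.
ACTION_MAP = {0: K_UP, 1: K_DOWN, 2: None}  # None is PLE's no-op


def normalize(obs):
    # Scale Pong's state dictionary into roughly unit-range features.
    return np.array([
        obs['player_y'] / HEIGHT,
        obs['player_velocity'] / HEIGHT,
        obs['cpu_y'] / HEIGHT,
        obs['ball_x'] / WIDTH,
        obs['ball_y'] / HEIGHT,
        obs['ball_velocity_x'] / WIDTH,
        obs['ball_velocity_y'] / HEIGHT,
    ])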
Example #4
    def __init__(self):
        self.resize_factor = 0.5
        self.width = 64
        self.height = 48
        self.ple = PLE(game=Pong(), fps=30, frame_skip=8)
        self.action_set = self.ple.getActionSet()
        self.action_space = spaces.Discrete(len(self.action_set))
        self.observation_space = spaces.Box(
            low=0.0,
            high=255.0,
            shape=(
                int(self.width * self.resize_factor),
                int(self.height * self.resize_factor),
                1,
            ),
            dtype=np.uint8,  # screen pixels fit in uint8 (0-255)
        )
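The constructor above sets up PLE and the gym spaces but does not show how observations are produced. A minimal sketch of such a helper, assuming OpenCV is available for the downscaling (get_observation is not part of the original class):

import cv2
import numpy as np


def get_observation(ple_env, width, height, resize_factor=0.5):
    # Grab PLE's grayscale screen and downscale it to match the declared
    # observation_space; axis order may need adjusting to PLE's screen layout.
    frame = ple_env.getScreenGrayscale()
    small = cv2.resize(frame, (int(height * resize_factor), int(width * resize_factor)))
    return small[..., np.newaxis].astype(np.uint8)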
Example #5
def main():
    # Create the environment
    game = Pong(width=200, height=200, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=False, force_fps=False)
    p.reset_game()
    # Build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())

    obs = get_obs(p)
    obs_dim = 200*200

    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN

    model = Model(act_dim=act_dim)
    alg = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim, e_greed_decrement=1e-6, e_greed=0.1)  # e_greed: probability of taking a random action, for exploration

    # # Load a saved model
    # if os.path.exists('./water_world_dqn.ckpt'):
    #     agent.restore('./water_world_dqn.ckpt')

    # Pre-fill the replay buffer so early training has enough sample diversity
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(p, agent, rpm)

    max_episode = 200000
    # Start training
    episode = 0
    best_reward = -float('inf')
    while episode < max_episode:  # train for max_episode episodes; evaluation episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(p, agent, rpm)
            episode += 1
        # test part
        eval_reward = evaluate(p, agent, render=False)  # render=True to watch the game
        if eval_reward > best_reward:
            best_reward = eval_reward
            agent.save('model_dir/dqn_pong_{}_reward_{}.ckpt'.format(episode, best_reward))
        logger.info('episode:{}    e_greed:{}   test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))
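run_episode and get_obs are defined elsewhere in this project (a get_obs appears in Example #9 below). A minimal sketch of a compatible run_episode, assuming a PARL-style agent interface (sample/learn), replay-memory methods append/sample, and an assumed BATCH_SIZE constant:

def run_episode(p, agent, rpm):
    # Sketch only: the agent/rpm method names follow common PARL examples.
    total_reward = 0
    p.reset_game()
    obs = get_obs(p)
    while not p.game_over():
        action_index = agent.sample(obs)  # epsilon-greedy action index
        reward = p.act(p.getActionSet()[action_index])
        next_obs = get_obs(p)
        rpm.append((obs, action_index, reward, next_obs, p.game_over()))
        if len(rpm) > MEMORY_WARMUP_SIZE:  # learn once the buffer is warm
            agent.learn(*rpm.sample(BATCH_SIZE))
        obs = next_obs
        total_reward += reward
    return total_reward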
Example #6
def main():
    # Create the environment
    game = Pong(width=200, height=200, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=False, force_fps=True)
    # Build the agent with the PARL framework
    p.reset_game()
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    obs_dim = 200 * 200

    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # # Load a saved model
    # if os.path.exists('model_dir/pg_pong_episode_19.ckpt'):
    #     agent.restore('model_dir/pg_pong_episode_19.ckpt')

    best_total_reward = -float('inf')
    for i in range(500000):
        obs_list, action_list, reward_list = run_episode(p, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)
        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 50 == 0:
            total_reward = evaluate(p, agent, render=False)
            if total_reward > best_total_reward:
                best_total_reward = total_reward
                agent.save(
                    'model_dir/pg_pong_episode_{}_reward_{}.ckpt'.format(
                        i, total_reward))
            logger.info('Test reward: {}'.format(total_reward))
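calc_reward_to_go is not shown in this excerpt. In PARL's policy-gradient tutorials it computes discounted, normalized returns, so the project's version likely resembles this sketch:

def calc_reward_to_go(reward_list, gamma=0.99):
    # Discounted return from each step to the end of the episode.
    returns = np.zeros(len(reward_list), dtype=np.float32)
    running = 0.0
    for i in reversed(range(len(reward_list))):
        running = reward_list[i] + gamma * running
        returns[i] = running
    # Normalizing keeps the policy-gradient step size stable.
    return (returns - returns.mean()) / (returns.std() + 1e-8)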
Example #7
    def __init__(self, game, display_screen=False):
        from ple import PLE
        assert game in [
            'catcher', 'monsterkong', 'flappybird', 'pixelcopter', 'pong',
            'puckworld', 'raycastmaze', 'snake', 'waterworld'
        ]
        if game == 'catcher':
            from ple.games.catcher import Catcher
            env = Catcher()
        elif game == 'monsterkong':
            from ple.games.monsterkong import MonsterKong
            env = MonsterKong()
        elif game == 'flappybird':
            from ple.games.flappybird import FlappyBird
            env = FlappyBird()
        elif game == 'pixelcopter':
            from ple.games.pixelcopter import Pixelcopter
            env = Pixelcopter()
        elif game == 'pong':
            from ple.games.pong import Pong
            env = Pong()
        elif game == 'puckworld':
            from ple.games.puckworld import PuckWorld
            env = PuckWorld()
        elif game == 'raycastmaze':
            from ple.games.raycastmaze import RaycastMaze
            env = RaycastMaze()
        elif game == 'snake':
            from ple.games.snake import Snake
            env = Snake()
        elif game == 'waterworld':
            from ple.games.waterworld import WaterWorld
            env = WaterWorld()

        self.p = PLE(env, fps=30, display_screen=display_screen)
        self.action_set = self.p.getActionSet()
        self.action_size = len(self.action_set)
        self.screen_dims = self.p.getScreenDims()
        self.p.init()
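The constructor above leaves the interaction methods to the rest of the class. Hypothetical reset/step companions, built only on the PLE calls already used in this excerpt, might look like:

    def reset(self):
        # Restart the episode and return the first RGB frame.
        self.p.reset_game()
        return self.p.getScreenRGB()

    def step(self, action_index):
        # Map a discrete index onto PLE's action set, advance one frame,
        # and report (observation, reward, done).
        reward = self.p.act(self.action_set[action_index])
        return self.p.getScreenRGB(), reward, self.p.game_over()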
Example #8
def main():
    # Create the environment
    game = Pong(width=200, height=200, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=False)
    p.reset_game()
    # Build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    print("act_dim:", act_dim)

    obs_dim = 200 * 200
    # Build the agent with the PARL framework: PongModel, DDPG, and PongAgent nested together
    model = PongModel(act_dim)
    algorithm = DDPG(model,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR)
    agent = PongAgent(algorithm, obs_dim, act_dim)
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

    max_episode = 20000
    # Start training
    episode = 0
    best_reward = -float('inf')
    while episode < max_episode:  # train for max_episode episodes; evaluation episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(p, agent, rpm)
            episode += 1
        # test part
        eval_reward = evaluate(p, agent, render=True)  # render=True to watch the game
        if eval_reward > best_reward:
            best_reward = eval_reward
            agent.save('model_dir/ddpg_pong_{}.ckpt'.format(episode))
        logger.info('episode:{}   test_reward:{}'.format(episode, eval_reward))
Example #9
#coding:utf-8
from ple.games.pong import Pong
from ple import PLE
import numpy as np
def get_obs(env):
    # game_state = env.getGameState()
    # obs = list(game_state.values())
    obs = env.getScreenGrayscale()/255.0
    return obs.astype(float).ravel()  # note: the np.float alias was removed in newer NumPy


if __name__ == '__main__':
    game = Pong(width=128, height=96, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    # Build the agent with the PARL framework
    print(p.getActionSet())

    act_dim = len(p.getActionSet())
    p.getScreenGrayscale()
    game_state = p.getGameState()
    print(game_state)
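Flattening the grayscale screen this way yields a vector of width * height entries per frame, which is why the training scripts above set obs_dim = 200 * 200 for a 200x200 game (here it would be 128 * 96).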
Example #10
def init_main(save_path, model, train=True, display=False):
    """The application's entry point.

    If someone executes this module (instead of importing it, for
    example), this function is called.
    """
    push_to_memory, select_action, perform_action, optimize, save_model = model

    fps = 30  # fps we want to run at
    frame_skip = 2
    num_steps = 1
    force_fps = False  # slower speed

    game = Pong(width=256, height=256)

    p = PLE(game,
            fps=fps,
            frame_skip=frame_skip,
            num_steps=num_steps,
            force_fps=force_fps,
            display_screen=display)

    p.init()

    def p_action(action):
        # reward, action
        return p.act(action)

    def main(steps):

        x_t = extract_image(p.getScreenRGB(), (80, 80))

        stack_x = np.stack((x_t, x_t, x_t, x_t), axis=0)
        try:
            while not p.game_over() and steps > 0:
                steps -= 1

                x_t = extract_image(p.getScreenRGB(), (80, 80))

                x_t = np.reshape(x_t, (1, 80, 80))

                st = np.append(stack_x[1:4, :, :], x_t, axis=0)

                if train:
                    reward, action, _, _, _ = train_and_play(
                        p_action, st, select_action, perform_action,
                        possible_actions, optimize, None, {})
                    push_to_memory(stack_x, action, st, reward)

                else:
                    play(p_action, st, select_action, perform_action,
                         possible_actions, None, {})

                stack_x = st

        except KeyboardInterrupt as e:
            print("KeyboardInterrupt >>", e)
            print("Saving model")
            if train:
                save_model(save_path)
                print("Model saved")
            sys.exit()

        score = p.score()
        p.reset_game()
        if train: save_model(save_path)
        return score

    return main
Example #11
import numpy as np

from ple import PLE
from ple.games.pong import Pong


class NaiveAgent(object):
    """A naive agent that picks a random action each frame.

    The class header, constructor, and imports are reconstructed here to
    match how NaiveAgent is used below; only pickAction appeared in the
    excerpt.
    """

    def __init__(self, actions):
        self.actions = actions

    def pickAction(self, reward, obs):
        return self.actions[np.random.randint(0, len(self.actions))]


fps = 30  # fps we want to run at
frame_skip = 2
num_steps = 1
force_fps = False  # slower speed
display_screen = True

reward = 0.0
max_noops = 20
nb_frames = 15000

# make a PLE instance.
env = PLE(Pong(),
          fps=fps,
          frame_skip=frame_skip,
          num_steps=num_steps,
          force_fps=force_fps,
          display_screen=display_screen)

# our Naive agent!
agent = NaiveAgent(env.getActionSet())

# init agent and game.
env.init()

# let's do a random number of NOOPs
for i in range(np.random.randint(0, max_noops)):
    reward = env.act(env.NOOP)
Example #12
    def test_pong(self):
        from ple.games.pong import Pong
        game = Pong()
        self.run_a_game(game)
Example #13
def a3c_main(save_path, shared_model,
             model,
             select_action,
             perform_action,
             save_model,
             optimizer=None,
             train=True,
             display=False,
             gamma=.99,
             tau=1.):

    fps = 30  # fps we want to run at
    frame_skip = 2
    num_steps = 1
    force_fps = False  # slower speed

    game = Pong(width=256, height=256)

    p = PLE(game,
            fps=fps,
            frame_skip=frame_skip,
            num_steps=num_steps,
            force_fps=force_fps,
            display_screen=display)

    p.init()

    def p_action(action):
        # reward, action
        return p.act(action)

    def main(lstm_shape, steps):

        reward_alive = 0
        values = []
        log_probs = []
        rewards = []
        entropies = []

        x_t = extract_image(p.getScreenRGB(), (80, 80))

        stack_x = np.stack((x_t, x_t, x_t, x_t), axis=0)
        model.load_state_dict(shared_model.state_dict())

        cx = Variable(torch.zeros(1, lstm_shape[-1]))
        hx = Variable(torch.zeros(1, lstm_shape[-1]))

        try:
            while not p.game_over() and steps > 0:
                steps -= 1

                x_t = extract_image(p.getScreenRGB(), (80, 80))

                x_t = np.reshape(x_t, (1, 80, 80))

                st = np.append(stack_x[1:4, :, :], x_t, axis=0)

                if train:
                    # print()
                    reward, action, hx, cx, info_dict = train_and_play(p_action, st,\
                                                        select_action, perform_action,\
                                                        possible_actions, opt_nothing, \
                                                        model, {"isTrain":True, "hx":hx,"cx":cx})
                    reward_alive += 0.1
                    reward += reward_alive
                    rewards.append(reward)
                    # reward += r

                    entropies.append(info_dict["entropies"])
                    values.append(info_dict["values"])
                    log_probs.append(info_dict["log_probs"])

                else:
                    _, _, hx, cx, _ = play(p_action, st, select_action,\
                        perform_action, possible_actions, model, {"hx":hx,"cx":cx, "isTrain":False})

                stack_x = st

            if train:
                state = torch.from_numpy(stack_x)
                R = torch.zeros(1, 1)
                if steps > 0:
                    value, _, _ = model(
                        (Variable(state.unsqueeze(0).float()), (hx, cx)))
                    R = value.data  # bootstrap the return if the rollout was cut short

                values.append(Variable(R))
                policy_loss = 0
                value_loss = 0
                R = Variable(R)
                gae = torch.zeros(1, 1)

                for i in reversed(range(len(rewards))):
                    R = gamma * R + rewards[i]
                    advantage = R - values[i]
                    value_loss = value_loss + 0.5 * advantage.pow(2)

                    # Generalized Advantage Estimation
                    delta_t = rewards[i] + gamma * \
                        values[i + 1].data - values[i].data
                    gae = gae * gamma * tau + delta_t

                    policy_loss = policy_loss - \
                        log_probs[i] * Variable(gae) - 0.01 * entropies[i]

                optimizer.zero_grad()

                (policy_loss + 0.5 * value_loss).backward()
                torch.nn.utils.clip_grad_norm(model.parameters(), 40)

                ensure_shared_grads(model, shared_model)
                optimizer.step()

        except KeyboardInterrupt as e:
            print("KeyboardInterrupt >>", e)
            print("Saving model")
            if train:
                save_model(shared_model, save_path)
                print("Model saved")
            sys.exit()

        score = p.score()
        p.reset_game()
        if train: save_model(shared_model, save_path)
        return score

    return main
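ensure_shared_grads is not included in this excerpt. The widely used pytorch-a3c helper of that name copies the worker's gradients into the shared model once per update, so this project's version probably looks similar:

def ensure_shared_grads(model, shared_model):
    # Copy local gradients to the shared model unless it already has some.
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad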
Example #14
def test_invalid_max_score():
    with pytest.raises(Exception):
        game = Pong(MAX_SCORE=-1)
Example #15
def test_invalid_game_size():
    with pytest.raises(Exception):
        game = Pong(width=-200, height=-200)
Example #16
def test_negative_ball_speed():
    with pytest.raises(Exception):
        game = Pong(ball_speed_ratio=-1)
Example #17
def test_negative_player_speed():
    with pytest.raises(Exception):
        game = Pong(players_speed_ratio=-1)
Example #18
File: train.py  Project: 1415984987/RL
        while True:
            action_index = agent.predict(obs)  # pick the greedy action
            action = ple_env.getActionSet()[action_index]
            reward = ple_env.act(action)
            obs = list(ple_env.getGameState().values())
            episode_reward += reward
            # if render:
            #     ple_env.getScreenRGB()
            if ple_env.game_over():
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)


# Create the environment
game = Pong(cpu_speed_ratio=0.3)
# game = Pong()
pong = PLE(game, display_screen=True, force_fps=True)
# Build the agent with the PARL framework
print(pong.getActionSet())
action_dim = len(pong.getActionSet())
obs_shape = len(pong.getGameState())
print(pong.getGameState())
# Create the replay buffer
rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN

model = Model(act_dim=action_dim)
algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
agent = Agent(
    algorithm,
    obs_dim=obs_shape,
Example #19
import numpy as np
import time

import gym

from ple.games.pong import Pong
from ple import PLE

NET_SIZE = [7]
ACTIVATION = relu

MAX_STEPS = 800
ITERATIONS = 1
# ENV_NAME = 'BipedalWalker-v2'
env = Pong(250, 250)

## Environment initialization
# env = gym.make(ENV_NAME)

## Initialize UMDAc
umdac = UMDAc(1,
              NET_SIZE,
              ACTIVATION,
              env,
              max_steps=MAX_STEPS,
              action_mode='raw',
              iterations=ITERATIONS)

new = umdac.load_specimen('resultPLE.txt')
Example #20
from ple import PLE
from ple.games.pong import Pong
import pygame
import time
import sys

game = Pong(width=300, height=200)

p = PLE(game, fps=30, display_screen=True, force_fps=True)
p.init()

print(p.getActionSet())

nb_frames = 1000
action = None

for f in range(nb_frames):
    if p.game_over():
        p.reset_game()
    obs = p.getScreenRGB()
    events = pygame.event.get()
    for event in events:
        if event.type == pygame.QUIT:
            sys.exit()
        elif event.type == pygame.KEYDOWN:
            if event.key:
                action = event.key
                print(action)
        elif event.type == pygame.KEYUP:
            action = None
    p.act(action)
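Note that Pong only reacts to its mapped up/down keys, so any other captured key code (or action = None) effectively behaves like the no-op when passed to p.act().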
Example #21
GENERATIONS = 900
GEN_SIZE = 100
N_SURV = 35
N_RAND_SURV = 15 

ITERATIONS = 1
MAX_STEPS = None

LOG_NOTES = 'gensize: {}, nsurv: {}, nrandsurv: {}'.format(
    GEN_SIZE, N_SURV, N_RAND_SURV)

## Environment initialization
#env = FlappyBird()
#env = Snake()
# env = WaterWorld(250, 250)
env = Pong(150, 150)
## Initialize UMDAc
umdac = UMDAc(GEN_SIZE, NET_SIZE, ACTIVATION, 
              env, 
              max_steps=MAX_STEPS,
              iterations=ITERATIONS, 
              display_info=True)

## Reset training data loggers    
avg_reward_log = []
max_rewards = []
min_rewards = []
last_avg_reward = 0

for i in range(GENERATIONS):    
    ## Reset reward logger
Example #22
    def __init__(self, game_name, rewards, state_as_image = True, fps = 30, force_fps=True, frame_skip=2,
                 hold_action=2, visualize=False, width=84, height=84, lives=1):
        """
        Initialize Pygame Learning Environment
        https://github.com/ntasfi/PyGame-Learning-Environment

        Args:
            env_name: PLE environment

            fps: frames per second
            force_fps: False for slower speeds
            frame_skip: number of env frames to skip
            hold_action: number of env frames to hold each action for
            isRGB: color or greyscale version of the statespace (currently the wrapper always returns color)
            game_height, game_width: height and width of the environment
            visualize: if set True, the program will visualize the training, which slows it down
            lives: number of lives in the game; the game resets on game over (i.e. lives = 0). Only used in Catcher and Pong (as the score limit)

        """

        self.env_name = game_name
        self.rewards = rewards
        self.lives = lives
        self.state_as_image = state_as_image
        self.fps = fps  # frames per second
        self.force_fps = force_fps  # False for slower speeds
        self.frame_skip = frame_skip  # frames to skip
        self.ple_num_steps = hold_action  # frames to continue each action for
        # self.isRGB = isRGB  # always returns color; let TensorForce do the processing
        self.visualize = visualize
        self.width = width
        self.height = height
        #testing
        self.reached_terminal = 0
        self.episode_time_steps = 0
        self.episode_reward = 0
        self.total_time_steps = 0

        if self.env_name == 'catcher':
            self.game = Catcher(width=self.width, height=self.height,init_lives=self.lives)
        elif self.env_name == 'pixelcopter':
            self.game = Pixelcopter(width=self.width, height=self.height)
        elif self.env_name == 'pong':
            self.game = Pong(width=self.width, height=self.height,MAX_SCORE=self.lives)
        elif self.env_name == 'puckworld':
            self.game = PuckWorld(width=self.width, height=self.height)
        elif self.env_name == 'raycastmaze':
            self.game = RaycastMaze(width=self.width, height=self.height)
        elif self.env_name == 'snake':
            self.game = Snake(width=self.width, height=self.height)
        elif self.env_name == 'waterworld':
            self.game = WaterWorld(width=self.width, height=self.height)
        elif self.env_name == 'monsterkong':
            self.game = MonsterKong()
        elif self.env_name == 'flappybird':
            self.game = FlappyBird(width=144, height=256)  # limitations on height and width for flappy bird
        else:
            raise TensorForceError('Unknown Game Environment.')

        if self.state_as_image:
            process_state = None
        else:
            #create a preprocessor to read the state dictionary as a numpy array
            def process_state(state):
                # ret_value = np.fromiter(state.values(),dtype=float,count=len(state))
                ret_value = np.array(list(state.values()), dtype=np.float32)
                return ret_value

        # make a PLE instance
        self.env = PLE(self.game, reward_values=self.rewards, fps=self.fps, frame_skip=self.frame_skip,
                       num_steps=self.ple_num_steps, force_fps=self.force_fps, display_screen=self.visualize,
                       state_preprocessor=process_state)
        #self.env.init()
        #self.env.act(self.env.NOOP) #game starts on black screen
        #self.env.reset_game()
        #self.env.act(self.env.NOOP)
        #self.env.act(self.env.NOOP)
        #self.env.act(self.env.NOOP)
        #self.env.act(self.env.NOOP)
        #self.env.reset_game()


        # setup gamescreen object
        if state_as_image:
            w, h = self.env.getScreenDims()
            self.gamescreen = np.empty((h, w, 3), dtype=np.uint8)
        else:
            self.gamescreen = np.empty(self.env.getGameStateDims(), dtype=np.float32)
        # if isRGB:
        #     self.gamescreen = np.empty((h, w, 3), dtype=np.uint8)
        # else:
        #     self.gamescreen = np.empty((h, w), dtype=np.uint8)

        # setup action converter
        # PLE returns legal action indexes, convert these to just numbers
        self.action_list = self.env.getActionSet()
        self.action_list = sorted(self.action_list, key=lambda x: (x is None, x))
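This __init__ is excerpted from a larger TensorForce-style wrapper class. Assuming a class name of PLEEnvironment (hypothetical; the real name is not shown), constructing the Pong variant might look like the following, with reward_values keys taken from PLE's defaults:

# Hypothetical instantiation of the wrapper whose __init__ is shown above.
env = PLEEnvironment(
    game_name='pong',
    rewards={'positive': 1.0, 'negative': -1.0, 'tick': 0.0, 'loss': -5.0, 'win': 5.0},
    state_as_image=True,
    visualize=False,
    width=84,
    height=84,
    lives=3,
)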