Example #1
def main(argv):
    try:
        opts, _ = getopt.getopt(argv, "hr")
    except getopt.GetoptError:
        print("birdML.py [-h | -r]")
        sys.exit(2)

    record = False
    for opt, arg in opts:
        if opt == '-h':
            print("-h to help")
            print("-r record")
        elif opt == '-r':
            record = True

    netb = netBrain()
    netb.summary()
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    p.init()
    actions = p.getActionSet()

    out = None  # will hold the cv2.VideoWriter when recording is enabled

    epochs = 50
    for i in range(epochs):
        lstates = []
        rewards = []
        if record:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter('Videos/test_' + str(i) + '.mov', fourcc,
                                  30.0, (288, 512))
        for d in range(10):
            while not p.game_over():
                if record:
                    obs = p.getScreenRGB()
                    obs = cv2.transpose(obs)
                    obs = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
                    out.write(obs)
                st = game.getGameState()
                gstate = list(st.values())
                gstate = np.array([np.array(gstate)])
                lstates.append(gstate[0])
                pred = netb.predict(gstate)[0]
                a = pred.argmax()
                p.act(actions[a])
                if st['next_pipe_bottom_y'] < st['player_y']:
                    pred[0] = 1.0
                    pred[1] = 0.0
                elif st['next_pipe_top_y'] > st['player_y']:
                    pred[0] = 0.0
                    pred[1] = 1.0
                rewards.append(pred)
            p.reset_game()
        netb.fit(np.array(lstates),
                 np.array(rewards),
                 batch_size=10,
                 epochs=10)
        if record:
            out.release()
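The netBrain() factory is not shown in this example; a minimal sketch of a compatible Keras model, assuming the 8-value FlappyBird state vector and the 2-action output used above, could look like this:

# Hypothetical netBrain definition (assumption, not part of the original example):
# a small dense network mapping the 8 state features to 2 action scores.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def netBrain():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(8,)),
        Dense(32, activation='relu'),
        Dense(2, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='mse')
    return model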
Example #2
 def __init__(self):
     self.model = Model()
     self.game = FlappyBird(pipe_gap=125)
     self.env = PLE(self.game, fps=30, display_screen=False)
     self.env.init()
     self.env.getGameState = self.game.getGameState
     self.es = EvolutionStrategy(self.model.get_weights(), self.get_reward, self.POPULATION_SIZE, self.SIGMA, self.LEARNING_RATE)
     self.exploration = self.INITIAL_EXPLORATION
Example #3
def train(FRAME_TRAIN=1000005):
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    ob = game.getGameState()
    state = ob
    state = np.reshape(np.asarray(list(state.values())), [1, 8])
    total_reward = 0
    agent = DDQN_Agent.DeepQAgent()
    agent.load('model95000')
    batch_size = 32
    my_timer = time.time()
    prev_frame = 0
    data = []
    for i in range(FRAME_TRAIN):
        if p.game_over():
            data.append(total_reward)
            p.reset_game()
            print(
                "Total reward = {}, Frame = {}, epsilon = {}, frame/second = {}"
                .format(total_reward, i, agent.epsilon,
                        (i - prev_frame) / (time.time() - my_timer)))
            total_reward = 0
            prev_frame = i
            my_timer = time.time()

        # get action from agent
        action = agent.act(state)

        # take action
        reward = p.act(p.getActionSet()[action])

        # making the reward space less sparse
        if reward < 0:
            reward = -1

        total_reward += reward
        next_state = np.asarray(list(game.getGameState().values()))
        next_state = np.reshape(next_state, [1, 8])

        # remember and replay
        agent.remember(state, action, reward, next_state, p.game_over())
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        state = next_state

        # save Model
        if i % 5000 == 0:
            print("Updating weights")
            agent.save('newmodel' + str(i))
            agent.target_model.set_weights(agent.model.get_weights())

        # Plot score
        if i % 1000 == 0:
            plot(data)
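The plot() helper called above is not defined in the snippet; a minimal matplotlib version (an assumption, the original may differ) could be:

# Hypothetical plot() helper: draws the per-episode reward curve collected in data.
import matplotlib.pyplot as plt

def plot(data):
    plt.figure()
    plt.plot(data)
    plt.xlabel("episode")
    plt.ylabel("total reward")
    plt.savefig("rewards.png")
    plt.close()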
Example #4
def prepare_game():
    asset_dir = "../assets"

    game = FlappyBird()
    for c in game.images["player"]:
        image_assets = [
            os.path.join(asset_dir, "bird-upflap.png"),
            os.path.join(asset_dir, "bird-midflap.png"),
            os.path.join(asset_dir, "bird-downflap.png"),
        ]

        game.images["player"][c] = [pygame.image.load(im).convert_alpha() for im in image_assets]

    for b in game.images["background"]:
        game.images["background"][b] = pygame.image.load(os.path.join(asset_dir, "background.png")).convert()

    for c in ["red", "green"]:
        path = os.path.join(asset_dir, "pipe.png")

        game.images["pipes"][c] = {}
        game.images["pipes"][c]["lower"] = pygame.image.load(path).convert_alpha()
        game.images["pipes"][c]["upper"] = pygame.transform.rotate(game.images["pipes"][c]["lower"], 180)

    game.images["base"] = pygame.image.load(os.path.join(asset_dir, "base.png")).convert()

    return game
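A possible way to use the customized game is sketched below (an assumption, not part of the original snippet; convert()/convert_alpha() require an initialized pygame display, so a display mode must already be set when prepare_game() runs):

# Hypothetical usage of prepare_game().
pygame.init()
pygame.display.set_mode((288, 512))  # needed before convert()/convert_alpha()
game = prepare_game()
p = PLE(game, fps=30, display_screen=True)
p.init()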
Example #5
def run_game(nb_episodes, agent):
    """ Runs nb_episodes episodes of the game with agent picking the moves.
        An episode of FlappyBird ends with the bird crashing into a pipe or going off screen.
    """

    reward_values = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }
    # TODO: when training use the following instead:
    # reward_values = agent.reward_values

    env = PLE(FlappyBird(),
              fps=30,
              display_screen=True,
              force_fps=True,
              rng=None,
              reward_values=reward_values)
    # TODO: to speed up training change parameters of PLE as follows:
    # display_screen=False, force_fps=True
    env.init()

    score = 0
    tot_nb_episodes = nb_episodes
    average = 0
    highscore = 0
    over_50_count = 0
    while nb_episodes > 0:
        # pick an action
        # TODO: for training use agent.training_policy instead
        state, ignore = agent.state_binner(env.game.getGameState())
        action = agent.policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        #print("reward=%d" % reward)

        # TODO: for training let the agent observe the current state transition

        score += reward

        # reset the environment if the game is over
        if env.game_over() or score >= 60:
            average += score
            if score > highscore:
                highscore = score
            if score >= 50:
                over_50_count += 1
            print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0
    print("Average for {} runs {:.2f}".format(tot_nb_episodes,
                                              average / tot_nb_episodes))
    over_50_p = (over_50_count / tot_nb_episodes) * 100
    print("The percentage of scores over 50 is: %d" % (over_50_p))
    return over_50_p
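The agent's state_binner and policy are not shown; the kind of discretization state_binner might perform is sketched below (purely illustrative, bin sizes and the returned tuple are assumptions):

# Illustrative state discretization, not the original agent code: coarsens the
# continuous FlappyBird state so it can index a tabular Q-function.
def state_binner(game_state):
    binned = (
        int(game_state["player_y"] // 16),
        int(game_state["next_pipe_top_y"] // 16),
        int(game_state["next_pipe_dist_to_player"] // 16),
        int(game_state["player_vel"]),
    )
    return binned, game_state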
Example #6
 def __init__(self, screen=False, forcefps=True):
     self.game = FlappyBird(pipe_gap=125)
     self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
     self.env.init()
     self.env.getGameState = self.game.getGameState
     def conv_layer(x, conv, stride = 1):
         return tf.nn.conv2d(x, conv, [1, stride, stride, 1], padding = 'SAME')
     def pooling(x, k = 2, stride = 2):
         return tf.nn.max_pool(x, ksize = [1, k, k, 1], strides = [1, stride, stride, 1], padding = 'SAME')
     self.X = tf.placeholder(tf.float32, [None, 80, 80, 4])
     self.Y = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE])
     w_conv1 = tf.Variable(tf.truncated_normal([8, 8, 4, 32], stddev = 0.1))
     conv1 = tf.nn.relu(conv_layer(self.X, w_conv1, stride = 4))
     pooling1 = pooling(conv1)
     w_conv2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev = 0.1))
     conv2 = tf.nn.relu(conv_layer(pooling1, w_conv2, stride = 2))
     w_conv3 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev = 0.1))
     conv3 = tf.nn.relu(conv_layer(conv2, w_conv3))
     pulling_size = int(conv3.shape[1]) * int(conv3.shape[2]) * int(conv3.shape[3])
     conv3 = tf.reshape(conv3, [-1, pulling_size])
     tensor_action, tensor_validation = tf.split(conv3,2,1)
     w_action = tf.Variable(tf.truncated_normal([pulling_size // 2, self.OUTPUT_SIZE], stddev = 0.1))
     w_validation = tf.Variable(tf.truncated_normal([pulling_size // 2, 1], stddev = 0.1))
     fc_action = tf.matmul(tensor_action, w_action)
     fc_validation = tf.matmul(tensor_validation, w_validation)
     self.logits = fc_validation + tf.subtract(fc_action,tf.reduce_mean(fc_action,axis=1,keep_dims=True))
     self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
     self.optimizer = tf.train.AdamOptimizer(learning_rate = self.LEARNING_RATE).minimize(self.cost)
     self.sess = tf.InteractiveSession()
     self.sess.run(tf.global_variables_initializer())
     self.saver = tf.train.Saver(tf.global_variables())
     self.rewards = []
Example #7
    def __init__(self,
                 env_name,
                 args,
                 atari_wrapper=False,
                 test=False,
                 seed=595):
        game = FlappyBird(width=144, height=256, pipe_gap=80)
        self.test = test
        #define reward
        reward_func = {
            "positive": 1,
            "negative": -1.0,
            "tick": 1,
            "loss": -5.0,
            "win": 1.0
        }

        self.p = PLE(game,
                     fps=30,
                     display_screen=False,
                     force_fps=True,
                     reward_values=reward_func,
                     rng=seed)
        self.observation = np.zeros((144, 256, 4, 3))
        # if atari_wrapper:
        #     clip_rewards = not test
        #     self.env = make_wrap_atari(env_name, clip_rewards)
        # else:
        #     self.env = gym.make(env_name)

        self.action_space = self.p.getActionSet()
Example #8
 def __init__(self, model, screen=False, forcefps=True):
     self.model = model
     self.game = FlappyBird(pipe_gap=125)
     self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
     self.env.init()
     self.env.getGameState = self.game.getGameState
     self.es = Deep_Evolution_Strategy(self.model.get_weights(), self.get_reward, self.POPULATION_SIZE, self.SIGMA, self.LEARNING_RATE)
Example #9
def main():
    env = FlappyBird()
    penv = PLE(env, fps=30, display_screen=True, force_fps=True)
    #penv.init()
    np.random.seed(0)

    obs_shape = len(penv.getGameState())
    IMG_shape = penv.getScreenGrayscale().shape
    action_dim = len(penv.getActionSet())

    print(obs_shape, action_dim)

    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model,
                                    act_dim=action_dim,
                                    gamma=GAMMA,
                                    lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.15,  # explore  0.1
        e_greed_decrement=1e-6  #1e-6
    )  # probability of exploring is decreasing during training

    # Load the model if a checkpoint exists
    if os.path.exists('./dqn_model.ckpt'):
        save_path = './dqn_model.ckpt'
        agent.restore(save_path)
        print("模型加载成功")
    eval_reward = evaluate(agent, penv)
Example #10
def train(nb_episodes, agent):
    reward_values = agent.reward_values()

    env = PLE(FlappyBird(),
              fps=30,
              display_screen=False,
              force_fps=True,
              rng=None,
              reward_values=reward_values)
    env.init()

    score = 0
    biggest_score = -50000
    avg_score = 0
    episodes = 0
    to_break = False
    while nb_episodes > 0:
        # pick an action
        state = env.game.getGameState()

        state = agent.state_binner(state)
        action = agent.training_policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        #print("reward=%d" % reward)

        # let the agent observe the current state transition
        newState = env.game.getGameState()
        newState = agent.state_binner(newState)
        agent.observe(state, action, reward, newState, env.game_over())
        agent.frames += 1
        score += reward

        if ((agent.frames % 10000) == 0):
            to_break = True
        # reset the environment if the game is over
        if env.game_over():
            avg_score += score
            if score > biggest_score:
                biggest_score = score
                if biggest_score > 450:
                    break
                print(biggest_score)
                print(nb_episodes)
            if nb_episodes % 100 == 0:
                print(avg_score / 100)
                if avg_score / 100 >= 5:
                    break
                avg_score = 0
            if to_break:
                break

            #print("score for this episode: %d" % score)
            env.reset_game()

            nb_episodes -= 1
            score = 0

    return biggest_score
Example #11
 def __init__(self, game="pixelcopter", fps=30):
     os.environ['SDL_VIDEODRIVER'] = 'dummy'
     self.game_name = game
     if game == "flappy":
         engine = FlappyBird()
     elif game == "pixelcopter":
         engine = Pixelcopter()
     else:
         assert False, "This game is not available"
     engine.rewards["loss"] = -5  # reward at terminal state
     self.reward_terminal = -5
     self.game = PLE(engine, fps=fps, display_screen=False)
     self.game.init()
     self.game.act(0)  # Start the game by providing arbitrary key as input
     self.key_input = self.game.getActionSet()
     self.reward = 0
Example #12
    def __init__(self, display=False):
        """
        Initializes a new environment for FlappyBird game.
        """
        game = FlappyBird()
        self._game = PLE(game, fps=30, display_screen=display)
        
        # _display_game flag controls whether or not to render the state that is being provided by the 
        # environment.
        self._display_game = display
        
        if self._display_game:
            self._display = self.show_img() # display sets up a cv2 window where the current state is displayed.
            self._display.__next__() # iterate over the display generator.
        
        self.NUM_ACTIONS = len(self._game.getActionSet()) # defines the number of action agent can take in the environment.

        self._ACTION_MAP = {}
        for i, action in enumerate(self._game.getActionSet()):
            self._ACTION_MAP[i] = action
        
        # Number of consecutive images the environment provides as state: at any time,
        # the environment returns a stack of the last 4 images (including the current one).
        self._IMAGE_STACK_SIZE = 4

        # Dimension of the (greyscale) image provided as state.
        self._PROCESSED_IMAGE_SIZE = 84

        # Determines the number of times the provided action is executed before returning the next
        # state.
        self._SKIP_FRAMES = 4 

        # Used by the RL agent to set up its CNN model.
        self.STATE_SPACE = (self._PROCESSED_IMAGE_SIZE, self._PROCESSED_IMAGE_SIZE, self._IMAGE_STACK_SIZE)
        self._init_states()
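The image preprocessing implied by the constants above is not shown; a minimal sketch (an assumption, using OpenCV) is:

# Illustrative preprocessing helper, not part of the original class: grabs the
# greyscale screen from PLE, resizes it to 84x84 and scales it to [0, 1].
import cv2
import numpy as np

def preprocess_frame(ple_env, size=84):
    frame = ple_env.getScreenGrayscale()
    frame = cv2.resize(frame, (size, size), interpolation=cv2.INTER_AREA)
    return frame.astype(np.float32) / 255.0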
Example #13
    def play(self, n=1, file_path=None):

        # use "Fancy" for full background, random bird color and random pipe color,
        # use "Fixed" (default) for black background and constant bird and pipe colors.
        game = FlappyBird(graphics="fixed")

        # Note: if you want to see your agent act in real time, set force_fps to False.
        # But don't use this setting for learning, just for display purposes.
        env = PLE(game,
                  fps=30,
                  frame_skip=1,
                  num_steps=1,
                  force_fps=False,
                  display_screen=True)

        # Init the environment (settings, display...)
        env.init()

        # Load the model
        model = load_model(file_path)

        # Let's play n games, and see if the model is correctly trained
        for _ in range(n):
            env.reset_game()
            while not env.game_over():
                S = self.get_game_data(game)
                Q = model.predict(S, batch_size=1)
                A = np.argmax(Q[0])
                env.act(self.ACTIONS[A])
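get_game_data() and self.ACTIONS are defined elsewhere in the class; an illustrative version of get_game_data (an assumption, with numpy imported as np) would flatten the state dict for model.predict():

    # Illustrative get_game_data, not the original method: turns the FlappyBird
    # state dict into a (1, 8) array suitable for model.predict(..., batch_size=1).
    def get_game_data(self, game):
        state = game.getGameState()
        return np.array([list(state.values())], dtype=np.float32)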
Example #14
    def __init__(self, playback_mode, mod=None):
        self._playback_mode = playback_mode

        env = FlappyBird(pipe_gap=200)
        self._ple = PLE(env, fps=30, display_screen=DISPLAY)
        self._ple.init()

        self._sess = tf.Session()
        self._agent = DDQNAgent(self._sess, DIM_STATE, DIM_ACTION, LR, TAU, net_name='cnn_bird')
        self._sess.run(tf.global_variables_initializer())
        self._agent.update_target_paras()

        self._saver = tf.train.Saver()
        self._replay_buffer = ReplayBuffer(BUFFER_SIZE)
        self._explorer = Explorer(EPS_BEGIN, EPS_END, EPS_STEPS, playback_mode)
        self.summary = Summary(self._sess, DIR_SUM)

        self.summary.add_variable(tf.Variable(0.), 'reward')
        self.summary.add_variable(tf.Variable(0.), 'loss')
        self.summary.add_variable(tf.Variable(0.), 'maxq')
        self.summary.build()
        self.summary.write_variables(FLAGS)

        self._steps = 0

        if mod and os.path.exists(FLAGS.dir_mod.format(mod)):
            checkpoint = tf.train.get_checkpoint_state(FLAGS.dir_mod.format(mod))
            self._saver.restore(self._sess, save_path=checkpoint.model_checkpoint_path)
            print("Loaded checkpoints {0}".format(checkpoint.model_checkpoint_path))
Example #15
def _test_ple():
    from ple.games.pong import Pong
    from ple.games.flappybird import FlappyBird
    from ple import PLE
    # os.environ['SDL_VIDEODRIVER'] = 'dummy'
    game = Pong()
    game = FlappyBird()
    ple_game = PLE(game, fps=30, display_screen=True)
    ple_game.init()
    ALLOWED_ACTIONS = ple_game.getActionSet()

    print(ALLOWED_ACTIONS)
    action = 0
    start = time()
    t = 0
    while True:
        ep_reward = 0
        ple_game.reset_game()
        while not ple_game.game_over():
            sleep(0.1)
            t += 1
            if t % 15 == 5:
                action = 0
            else:
                action = 1
            reward = ple_game.act(ALLOWED_ACTIONS[action])
            # print(reward)
            ep_reward += reward
        print(ep_reward, t, t / (time() - start))
Example #16
 def __init__(self, screen=False, forcefps=True):
     self.game = FlappyBird(pipe_gap=125)
     self.env = PLE(self.game,
                    fps=30,
                    display_screen=screen,
                    force_fps=forcefps)
     self.env.init()
     self.env.getGameState = self.game.getGameState
     self.X = tf.placeholder(tf.float32, (None, self.INPUT_SIZE))
     self.REWARDS = tf.placeholder(tf.float32, (None))
     self.ACTIONS = tf.placeholder(tf.int32, (None))
     input_layer = tf.Variable(
         tf.random_normal([self.INPUT_SIZE, self.LAYER_SIZE]))
     bias = tf.Variable(tf.random_normal([self.LAYER_SIZE]))
     output_layer = tf.Variable(
         tf.random_normal([self.LAYER_SIZE, self.OUTPUT_SIZE]))
     feed_forward = tf.nn.relu(tf.matmul(self.X, input_layer) + bias)
     self.logits = tf.nn.softmax(tf.matmul(feed_forward, output_layer))
     indexes = tf.range(0,
                        tf.shape(self.logits)[0]) * tf.shape(
                            self.logits)[1] + self.ACTIONS
     responsible_outputs = tf.gather(tf.reshape(self.logits, [-1]), indexes)
     self.cost = -tf.reduce_mean(tf.log(responsible_outputs) * self.REWARDS)
     self.optimizer = tf.train.AdamOptimizer(
         learning_rate=self.LEARNING_RATE).minimize(self.cost)
     self.sess = tf.InteractiveSession()
     self.sess.run(tf.global_variables_initializer())
     self.saver = tf.train.Saver(tf.global_variables())
     self.rewards = []
Example #17
def train(nb_frames, agent):
    reward_values = agent.reward_values()

    env = PLE(FlappyBird(),
              fps=30,
              display_screen=False,
              force_fps=True,
              rng=None,
              reward_values=reward_values)
    env.init()

    score = 0
    biggest_score = -5
    avg_score = 0
    avrage = []
    count = []
    nb_episodes = 0
    number_of_frames = 0
    while number_of_frames < nb_frames:
        # pick an action
        state = env.game.getGameState()
        state = agent.state_binner(state)
        action = agent.training_policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        #print("reward=%d" % reward)

        # let the agent observe the current state transition
        newState = env.game.getGameState()
        newState = agent.state_binner(newState)
        agent.observe(state, action, reward, newState, env.game_over())

        score += reward
        number_of_frames += 1
        # reset the environment if the game is over
        if env.game_over():
            nb_episodes += 1
            avg_score += score
            if score > biggest_score:
                biggest_score = score
                print(biggest_score)
                print(nb_episodes)
                print(number_of_frames)
            if nb_episodes % 100 == 0:
                print(avg_score / 100)
                avrage.append(avg_score / 100)
                count.append(number_of_frames)
                avg_score = 0

            #print("score for this episode: %d" % score)
            agent.calculate()
            env.reset_game()

            score = 0

    print(biggest_score)
    data = {"Count": count, "Avrage": avrage}
    df = pd.DataFrame(data)
    sns.relplot(x="Count", y="Avrage", ci=None, kind="line", data=df)
Example #18
 def __init__(self, screen=False, forcefps=True):
     self.game = FlappyBird(pipe_gap=125)
     self.env = PLE(self.game,
                    fps=30,
                    display_screen=screen,
                    force_fps=forcefps)
     self.env.init()
     self.env.getGameState = self.game.getGameState
     self.X = tf.placeholder(tf.float32, (None, None, self.INPUT_SIZE))
     self.Y = tf.placeholder(tf.float32, (None, self.OUTPUT_SIZE))
     cell = tf.nn.rnn_cell.LSTMCell(512, state_is_tuple=False)
     self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * 512))
     self.rnn, self.last_state = tf.nn.dynamic_rnn(
         inputs=self.X,
         cell=cell,
         dtype=tf.float32,
         initial_state=self.hidden_layer)
     action_layer = tf.Variable(
         tf.random_normal([512 // 2, self.OUTPUT_SIZE]))
     validation_layer = tf.Variable(tf.random_normal([512 // 2, 1]))
     tensor_action, tensor_validation = tf.split(self.rnn[:, -1, :], 2, 1)
     feed_action = tf.matmul(tensor_action, action_layer)
     feed_validation = tf.matmul(tensor_validation, validation_layer)
     self.logits = feed_validation + tf.subtract(
         feed_action, tf.reduce_mean(feed_action, axis=1, keep_dims=True))
     self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
     self.optimizer = tf.train.AdamOptimizer(
         learning_rate=self.LEARNING_RATE).minimize(self.cost)
     self.sess = tf.InteractiveSession()
     self.sess.run(tf.global_variables_initializer())
     self.saver = tf.train.Saver(tf.global_variables())
     self.rewards = []
Example #19
 def __init__(self, screen=False, forcefps=True):
     self.game = FlappyBird(pipe_gap=125)
     self.env = PLE(self.game,
                    fps=30,
                    display_screen=screen,
                    force_fps=forcefps)
     self.env.init()
     self.env.getGameState = self.game.getGameState
     self.X = tf.placeholder(tf.float32, (None, None, self.INPUT_SIZE))
     self.Y = tf.placeholder(tf.float32, (None, self.OUTPUT_SIZE))
     cell = tf.nn.rnn_cell.LSTMCell(512, state_is_tuple=False)
     self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * 512))
     self.rnn, self.last_state = tf.nn.dynamic_rnn(
         inputs=self.X,
         cell=cell,
         dtype=tf.float32,
         initial_state=self.hidden_layer)
     w = tf.Variable(tf.random_normal([512, self.OUTPUT_SIZE]))
     self.logits = tf.matmul(self.rnn[:, -1], w)
     self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
     self.optimizer = tf.train.AdamOptimizer(
         learning_rate=self.LEARNING_RATE).minimize(self.cost)
     self.sess = tf.InteractiveSession()
     self.sess.run(tf.global_variables_initializer())
     self.saver = tf.train.Saver(tf.global_variables())
     self.rewards = []
Example #20
def test_agent(policy, file_writer=None, test_games=10, step=0):
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env.init()

    test_rewards = []
    for _ in range(test_games):
        env.reset_game()
        no_op(env)

        game_rew = 0

        while not env.game_over():

            state = flappy_game_state(env)

            action = 119 if policy(state) == 1 else None

            for _ in range(2):
                game_rew += env.act(action)

        test_rewards.append(game_rew)

        if file_writer is not None:
            summary = tf.Summary()
            summary.value.add(tag='test_performance', simple_value=game_rew)
            file_writer.add_summary(summary, step)
            file_writer.flush()

    return test_rewards
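no_op() and flappy_game_state() are helpers defined elsewhere; illustrative versions (assumptions, not the original definitions) could be:

# no_op: let a few frames pass without flapping after a reset.
# flappy_game_state: flatten the PLE state dict into a numpy vector.
import numpy as np

def no_op(env, steps=4):
    for _ in range(steps):
        env.act(None)

def flappy_game_state(env):
    return np.array(list(env.getGameState().values()), dtype=np.float32)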
Example #21
 def __init__(self, screen=False, forcefps=True):
     self.game = FlappyBird(pipe_gap=125)
     self.env = PLE(self.game,
                    fps=30,
                    display_screen=screen,
                    force_fps=forcefps)
     self.env.init()
     self.env.getGameState = self.game.getGameState
     self.X = tf.placeholder(tf.float32, (None, self.INPUT_SIZE))
     self.Y = tf.placeholder(tf.float32, (None, self.OUTPUT_SIZE))
     input_layer = tf.Variable(
         tf.random_normal([self.INPUT_SIZE, self.LAYER_SIZE]))
     bias = tf.Variable(tf.random_normal([self.LAYER_SIZE]))
     action_layer = tf.Variable(
         tf.random_normal([self.LAYER_SIZE // 2, self.OUTPUT_SIZE]))
     validation_layer = tf.Variable(
         tf.random_normal([self.LAYER_SIZE // 2, 1]))
     feed_forward = tf.nn.relu(tf.matmul(self.X, input_layer) + bias)
     self.tensor_action, self.tensor_validation = tf.split(
         feed_forward, 2, 1)
     self.feed_action = tf.matmul(self.tensor_action, action_layer)
     self.feed_validation = tf.matmul(self.tensor_validation,
                                      validation_layer)
     self.logits = self.feed_validation + tf.subtract(
         self.feed_action,
         tf.reduce_mean(self.feed_action, axis=1, keep_dims=True))
     self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
     self.optimizer = tf.train.AdamOptimizer(
         learning_rate=self.LEARNING_RATE).minimize(self.cost)
     self.sess = tf.InteractiveSession()
     self.sess.run(tf.global_variables_initializer())
     self.saver = tf.train.Saver(tf.global_variables())
     self.rewards = []
Example #22
def main_test():
    final_score = 0
    previous_action = 1
    model = build_neural_network_model()
    game = FlappyBird(width=288, height=512, pipe_gap=100)
    env = PLE(game, fps=30, display_screen=True, state_preprocessor=process_state)
    model = load_model("model.h5")
    env.init()
    passed = 0
    old_y = 0
    for i in range(game_steps):
        if i == game_steps - 1:
            print("Score: {}".format(final_score))
        if env.game_over():
            print("Final Score: {}".format(final_score))
            time.sleep(1)
            final_score = 0
            env.reset_game()

        observation = env.getGameState()

        vector = model.predict(np.matrix(list(observation[0].values())))
        a_star = np.argmax(vector[0])
        print(vector[0][0], vector[0][1], a_star)
        time.sleep(0.05)
        env_reward = env.act(env.getActionSet()[a_star])
        if env_reward == 1:
            final_score += 1
Example #23
    def score(self, training=True, nb_episodes=10):
        reward_values = {
            'positive': 1.0,
            'negative': 0.0,
            'tick': 0.0,
            'loss': 0.0,
            'win': 0.0
        }

        env = PLE(FlappyBird(),
                  fps=30,
                  display_screen=False,
                  force_fps=True,
                  rng=None,
                  reward_values=reward_values)
        env.init()

        total_episodes = nb_episodes
        score = 0
        scores = []
        while nb_episodes > 0:
            # pick an action
            state = env.game.getGameState()
            action = self.policy(state)

            # step the environment
            reward = env.act(env.getActionSet()[action])

            score += reward

            # reset the environment if the game is over
            if env.game_over() or score >= 100:
                scores.append(score)
                env.reset_game()
                nb_episodes -= 1
                score = 0

        avg_score = sum(scores) / float(len(scores))
        print('Games played: {}'.format(total_episodes))
        print('Average score: {}'.format(avg_score))

        if training:
            score_file = '{}/scores.csv'.format(self.name)
            # If file doesn't exist, add the header
            if not os.path.isfile(score_file):
                with open(score_file, 'a') as f:
                    f.write('avg_score,episode_count,num_of_frames,min,max\n')

            # Append scores to the file
            with open(score_file, 'a') as f:
                f.write('{},{},{},{},{}\n'.format(avg_score,
                                                  self.num_of_episodes,
                                                  self.num_of_frames,
                                                  min(scores), max(scores)))

        else:
            with open('scores.txt', 'a') as f:
                for score in scores:
                    f.write('{},{}\n'.format(self.name, score))
Example #24
def main():
    env = FlappyBird()
    penv = PLE(env, fps=30, display_screen=True,force_fps=True)
    #penv.init()
    np.random.seed(0)

    obs_shape = len(penv.getGameState())
    IMG_shape = penv.getScreenGrayscale().shape
    action_dim = len(penv.getActionSet())


    print(obs_shape,action_dim)

    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(
        model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.15,  # explore  0.1
        e_greed_decrement=1e-6   #1e-6
    )  # probability of exploring is decreasing during training




    # Load the model if a checkpoint exists
    if os.path.exists('./dqn_model.ckpt'):
        save_path = './dqn_model.ckpt'
        agent.restore(save_path)
        print("模型加载成功")

    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, penv, rpm)

    max_episode = 1000

    # start train
    episode = 0
    while episode < max_episode:

        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, penv, rpm)
            episode += 1

        eval_reward = evaluate(agent, penv)
        logger.info('episode:{}    test_reward:{}'.format(
            episode, eval_reward))
        # save a checkpoint after this round of training
        save_path = './model/dqn_model_{}_{}.ckpt'.format(episode, eval_reward)
        agent.save(save_path)

    # Training finished, save the final model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)
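run_episode() and evaluate() are not shown; PARL-style sketches of what they might do against the PLE environment (assumptions: BATCH_SIZE is a module constant like MEMORY_WARMUP_SIZE, and the agent exposes sample/predict/learn) are:

# Illustrative helpers, not the original code.
def run_episode(agent, penv, rpm):
    total_reward = 0
    penv.reset_game()
    obs = np.array(list(penv.getGameState().values()), dtype=np.float32)
    while not penv.game_over():
        action = agent.sample(obs)  # epsilon-greedy action
        reward = penv.act(penv.getActionSet()[action])
        next_obs = np.array(list(penv.getGameState().values()), dtype=np.float32)
        rpm.append((obs, action, reward, next_obs, penv.game_over()))
        if len(rpm) > MEMORY_WARMUP_SIZE:
            batch = rpm.sample(BATCH_SIZE)
            agent.learn(*batch)
        total_reward += reward
        obs = next_obs
    return total_reward

def evaluate(agent, penv, episodes=5):
    rewards = []
    for _ in range(episodes):
        penv.reset_game()
        episode_reward = 0
        while not penv.game_over():
            obs = np.array(list(penv.getGameState().values()), dtype=np.float32)
            action = agent.predict(obs)  # greedy action
            episode_reward += penv.act(penv.getActionSet()[action])
        rewards.append(episode_reward)
    return np.mean(rewards)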
Example #25
def train(nb_frames, agent, a, g, results):
    print("alpha %f" % a)
    print("gamma %f" % g)
    reward_values = agent.reward_values()

    env = PLE(FlappyBird(),
              fps=30,
              display_screen=False,
              force_fps=True,
              rng=None,
              reward_values=reward_values)
    env.init()

    score = 0
    biggest_score = -5
    avg_score = 0

    number_of_frames = 0
    nb_episodes = 0
    while number_of_frames < nb_frames:
        # pick an action
        state = env.game.getGameState()
        state = agent.state_binner(state)
        action = agent.training_policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        #print("reward=%d" % reward)

        # let the agent observe the current state transition
        newState = env.game.getGameState()
        newState = agent.state_binner(newState)
        agent.observe(state, action, reward, newState, env.game_over())

        score += reward
        number_of_frames += 1
        # reset the environment if the game is over
        if env.game_over():
            nb_episodes += 1
            avg_score += score
            if score > biggest_score:
                biggest_score = score
                print(biggest_score)
                print(nb_episodes)
                print(number_of_frames)
            if nb_episodes % 100 == 0:
                print(avg_score / 100)
                results[0].append(avg_score / 100)
                results[1].append(number_of_frames)
                results[2].append(a)
                results[3].append(g)
                avg_score = 0

            #print("score for this episode: %d" % score)
            env.reset_game()

            score = 0
    print(biggest_score)
    return results
Example #26
    def __init__(self):
        self.game = FlappyBird()
        self.p = PLE(self.game, fps=30, display_screen=True)

        # self.actions = self.p.getActionSet()
        # self._action_space = list(range(self.actions[0]))
        # self._action_space.append(self.actions[-1])
        self.action_space = self.p.getActionSet()
Example #27
 def __init__(self, render=False, seed=0, pipe_gap=100):
     self.seed = seed
     print('SEED: {}'.format(self.seed))
     game = FlappyBird(pipe_gap=pipe_gap)
     self.env = PLE(game, fps=30, display_screen=render, rng=seed)
     self.env.init()
     self.full_state = np.zeros((1, 4, 80, 80), dtype=np.uint8)
     self.frame_sleep = 0.02
Example #28
def train(FRAME_TRAIN=1000005):
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    ob = game.getGameState()
    state = ob
    state = np.reshape(np.asarray(list(state.values())), [1, 8])
    total_reward = 0
    agent = DDQN_Agent.DeepQAgent()
    agent.load("model95000")
    batch_size = 32
    my_timer = time.time()
    prev_frame = 0
    data = []
    for i in range(FRAME_TRAIN):
        if p.game_over():
            data.append(total_reward)
            p.reset_game()
            print(
                "Total reward = {}, Frame = {}, epsilon = {}, frame/second = {}"
                .format(total_reward, i, agent.epsilon,
                        (i - prev_frame) / (time.time() - my_timer)))
            total_reward = 0
            prev_frame = i
            my_timer = time.time()

        # get action from agent
        action = agent.act(state)

        # take action
        reward = p.act(p.getActionSet()[action])

        # making the reward space less sparse
        if reward < 0:
            reward = -1

        total_reward += reward
        next_state = np.asarray(list(game.getGameState().values()))
        next_state = np.reshape(next_state, [1, 8])

        state = next_state
        # time.sleep(0.3)
        # Plot score
        if i % 1000 == 0:
            plot(data)
Example #29
    def __init__(self, display_screen):
        self.width = IMAGE_WIDTH
        self.height = IMAGE_HEIGHT

        self.count = 0
        self.p = PLE(FlappyBird(), fps=30, display_screen=display_screen)
        self.p.init()
        self._update_state()
        self.score = 0
Example #30
    def __init__(self):
        self.game = FlappyBird(pipe_gap=110)
        self.env = PLE(self.game, fps=30, display_screen=False)
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary

        # by convention we want to use (0,1)
        # but the game uses (None, 119)
        self.action_map = self.env.getActionSet()  # [None, 119]
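With this mapping in place, a step method for the wrapper (an illustrative sketch, not part of the original class; assumes numpy imported as np) could translate the conventional action index into the key PLE expects:

    # Illustrative step method: action is 0 or 1, self.action_map holds [None, 119].
    def step(self, action):
        reward = self.env.act(self.action_map[action])
        done = self.env.game_over()
        obs = np.array(list(self.env.getGameState().values()), dtype=np.float32)
        return obs, reward, done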
Example #31
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()
    
    while(not p.game_over()):
        state = game.getGameState()
        screen = p.getScreenRGB()
        action=FlappyPolicy(state, screen) ### Your job is to define this function.
        
        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
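FlappyPolicy is imported from FlappyAgent and left for you to define; a naive hand-coded baseline (purely illustrative, not a trained agent) that this script could run is:

# Illustrative baseline for FlappyPolicy (would live in FlappyAgent.py):
# flap (key 119) whenever the bird is below the centre of the upcoming pipe gap.
def FlappyPolicy(state, screen):
    gap_center = (state["next_pipe_top_y"] + state["next_pipe_bottom_y"]) / 2.0
    return 119 if state["player_y"] > gap_center else None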