def run_a_game(self, game):
    from ple import PLE
    p = PLE(game, display_screen=True)
    agent = NaiveAgent(p.getActionSet())
    p.init()
    reward = p.act(p.NOOP)
    for i in range(NUM_STEPS):
        obs = p.getScreenRGB()
        reward = p.act(agent.pickAction(reward, obs))
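# NaiveAgent is referenced above but not defined in this snippet; a minimal
# sketch, assuming pickAction(reward, obs) just returns a random allowed action:
import numpy as np

class NaiveAgent:
    def __init__(self, actions):
        self.actions = actions

    def pickAction(self, reward, obs):
        # ignore the reward and observation, act randomly
        return self.actions[np.random.randint(len(self.actions))]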
class Env:
  def __init__(self):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=True)
    self.env.init()
    self.env.getGameState = self.game.getGameState # maybe not necessary

    # by convention we want to use (0,1)
    # but the game uses (None, 119)
    self.action_map = self.env.getActionSet() #[None, 119]

  def step(self, action):
    action = self.action_map[action]
    reward = self.env.act(action)
    done = self.env.game_over()
    obs = self.get_observation()
    # don't bother returning an info dictionary like gym
    return obs, reward, done

  def reset(self):
    self.env.reset_game()
    return self.get_observation()

  def get_observation(self):
    # game state returns a dictionary which describes
    # the meaning of each value
    # we only want the values
    obs = self.env.getGameState()
    return np.array(list(obs.values()))

  def set_display(self, boolean_value):
    self.env.display_screen = boolean_value
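# A hypothetical episode loop for the Env wrapper above, taking random
# actions (assumes numpy is imported as np):
env = Env()
obs = env.reset()
done = False
while not done:
    action = np.random.randint(2)  # index into action_map -> (None, 119)
    obs, reward, done = env.step(action)
env.set_display(False)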
Example #3
def main_naive():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    my_agent = naive.NaiveAgent(allowed_actions=env.getActionSet())

    env.init()
    reward = 0.0
    nb_frames = 10000

    for i in range(nb_frames):
        if env.game_over():
            env.reset_game()

        observation = env.getScreenRGB()
        action = my_agent.pickAction(reward, observation)
        reward = env.act(action)
Example #4
    def train(self, scratch, game, display):
        p = PLE(game,
                fps=30,
                frame_skip=1,
                num_steps=1,
                force_fps=True,
                display_screen=display)
        fname = None
        if not scratch:
            fname = self.load()
        else:
            delete_files(self.DATA_DIREC)
        f0, step, nb_save, nb_games = init_train(fname, self.DATA_DIREC)

        eps_tau = (self.NB_FRAMES - f0) // self.EPS_RATE
        scores = []
        while step < self.NB_FRAMES:
            if len(scores) == self.SCORE_FREQ:
                print_scores(scores, self.SCORE_FREQ)
                scores = []

            p.reset_game()
            state = game.getGameState()
            state_arr = self.state_to_arr(state)
            # state_arr = self.scaler.transform(state_arr.reshape(1, -1))
            gscore = 0
            nb_games += 1
            while not p.game_over():
                step += 1
                if step != 0 and (step % self.SAVE_FREQ) == 0:
                    self.save(
                        chr(97 + nb_save) + '_' + str(step) + '_' +
                        str(nb_games))
                    nb_save += 1
                if step != 0 and (step % self.EPS_UPDATE_FREQ) == 0:
                    self.epsilon = update_epsilon(step, f0, self.EPS0, eps_tau,
                                                  self.NB_FRAMES)
                    print('WEIGHTS ABS MEAN')
                    print(abs(np.mean(self.model.get_weights()[0], axis=1)))

                # 1) In s, choose a (GLIE actor)
                qvals = self.get_qvals(state)
                act = self.greedy_action(qvals, self.epsilon)

                # 2) Observe r, s′
                bare_reward = p.act(ACTIONS[act])
                reward = self.reward_engineering(bare_reward)
                new_state = game.getGameState()
                new_state_arr = self.state_to_arr(new_state)

                self.replay_memory.append(
                    (state_arr, act, reward, new_state_arr))
                if (len(self.replay_memory) == self.BUFFER_SIZE
                        and step % self.TRAIN_FREQ == 0):

                    X_train = []
                    y_train = []

                    # TEST: TRAIN ONLY WITH A SMALL BUFFER BATCH
                    replay_memory_copy = list(self.replay_memory)[:]
                    random.shuffle(replay_memory_copy)
                    for frame in replay_memory_copy[:self.BATCH_SIZE]:
                        s_arr_1, act_x, bare_reward_x, s_arr_2 = frame
                        reward_x = self.reward_engineering(bare_reward_x)
                        old_qval = self.model.predict(s_arr_1, batch_size=1)
                        qval_new = self.model.predict(s_arr_2, batch_size=1)
                        max_qval = np.max(qval_new)
                        # terminal state
                        if bare_reward_x < 0:
                            delta = reward_x
                        else:
                            delta = reward_x + self.GAMMA * max_qval
                        y = np.zeros((1, len(ACTIONS)))
                        y[0][:] = old_qval[0][:]
                        y[0][act_x] = old_qval[0][act_x] + self.ALPHA * delta
                        X_train.append(s_arr_1.reshape(len(STATES), ))
                        y_train.append(y.reshape(len(ACTIONS), ))

                    X_train = np.array(X_train)
                    y_train = np.array(y_train)
                    self.model.fit(X_train,
                                   y_train,
                                   batch_size=self.BATCH_SIZE,
                                   epochs=2,
                                   verbose=False)

                # 5) s <- s'
                state = new_state
                state_arr = new_state_arr

                if bare_reward > 0:
                    gscore += 1
            scores.append(gscore)

        self.save(chr(97 + nb_save) + '_' + str(step) + '_' + str(nb_games))
class Agent:

    LEARNING_RATE = 0.003
    BATCH_SIZE = 32
    INPUT_SIZE = 8
    LAYER_SIZE = 500
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    MEMORIES = deque()
    COPY = 1000
    T_COPY = 0
    MEMORY_SIZE = 300
    # based on documentation, features got 8 dimensions
    # output is 2 dimensions, 0 = do nothing, 1 = jump

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.model = Model(self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE)
        self.model_negative = Model(self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.trainable = tf.trainable_variables()
        self.rewards = []

    def _assign(self):
        for i in range(len(self.trainable)//2):
            assign_op = self.trainable[i+len(self.trainable)//2].assign(self.trainable[i])
            self.sess.run(assign_op)

    def _memorize(self, state, action, reward, new_state, done):
        self.MEMORIES.append((state, action, reward, new_state, done))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _select_action(self, state):
        if np.random.rand() < self.EPSILON:
            action = np.random.randint(self.OUTPUT_SIZE)
        else:
            action = self.get_predicted_action([state])
        return action

    def _construct_memories(self, replay):
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.predict(states)
        Q_new = self.predict(new_states)
        Q_new_negative = self.sess.run(self.model_negative.logits, feed_dict={self.model_negative.X: new_states})
        replay_size = len(replay)
        X = np.empty((replay_size, self.INPUT_SIZE))
        Y = np.empty((replay_size, self.OUTPUT_SIZE))
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, done_r = replay[i]
            target = Q[i]
            target[action_r] = reward_r
            if not done_r:
                target[action_r] += self.GAMMA * Q_new_negative[i, np.argmax(Q_new[i])]
            X[i] = state_r
            Y[i] = target
        return X, Y

    def predict(self, inputs):
        return self.sess.run(self.model.logits, feed_dict={self.model.X:inputs})

    def save(self, checkpoint_name):
        self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" %(checkpoint_name))
        with open('%s-acc.p'%(checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" %(checkpoint_name))
        with open('%s-acc.p'%(checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_predicted_action(self, sequence):
        prediction = self.predict(np.array(sequence))[0]
        return np.argmax(prediction)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            done = False
            while not done:
                if (self.T_COPY + 1) % self.COPY == 0:
                    self._assign()
                state = self.get_state()
                action  = self._select_action(state)
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                new_state = self.get_state()
                done = self.env.game_over()
                self._memorize(state, action, reward, new_state, done)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                X, Y = self._construct_memories(replay)
                cost, _ = self.sess.run([self.model.cost, self.model.optimizer], feed_dict={self.model.X: X, self.model.Y:Y})
                self.T_COPY += 1
            self.rewards.append(total_reward)
            self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
            if (i+1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
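# Hypothetical usage of the double-DQN agent above (the Model class with
# X/Y/logits/cost/optimizer attributes is assumed to be defined elsewhere):
agent = Agent(screen=False, forcefps=True)
agent.fit(iterations=500, checkpoint=10)  # prints progress every 10 episodes
agent.save('flappybird-double-dqn')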
Example #6
            qval = Q_function[RS[0]][RS[1]][RS[2]]
            # Since some states may never have been visited during learning,
            # Q_function may still hold its initial values with qval[0] == qval[1];
            # in that case we choose to do nothing, because flapping is riskier
            # than staying idle.
            if qval[0] == qval[1]:
                Action = None

            else:  #choose best action from Q(s,a) values
                Action = bool_to_act(qval.argmax())
        else:  #epsilon-greedy

            Action = bool_to_act(np.random.randint(0, 2))

        # the reward is 1 if the game is not over and 6 if a pipe was passed
        r = 5 * p.act(Action) + 1
        cumulated[k] = cumulated[k] + (r - 1) / 5

        new_state = game.getGameState()
        ns = reduce_state(new_state)
        # record the state, action, reward and next state
        partie.append([RS, act_to_bool(Action), r, ns])

        RS = ns  # update the state

    # save the model every 500 epochs
    if k % 500 == 0:
        print("saving model")
        f_myfile = open('Q_function_avecrandom.pickle', 'wb')
Example #7
def play(size_image):
    sess = tf.InteractiveSession()

    img_size = 80
    net = NetworkOld(img_size)

    # open up a game state to communicate with emulator
    game = flappybird.prepare_game()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    reward = 0.0

    # get the first state by doing nothing and preprocess the image to 80x80x4

    actions = p.getActionSet()
    p.act(actions[1])

    s_t = preprocessing.transform_image(p.getScreenRGB(), img_size)

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("../saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start the play loop
    t = 0
    while t < MAX_ITE:
        if p.game_over():
            p.reset_game()
            terminal = True
        else:
            terminal = False

        # choose an action epsilon greedily
        readout_t = net.readout.eval(feed_dict={net.s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])

        action_index = np.argmax(readout_t)
        a_t[action_index] = 1

        # run the selected action and observe next state and reward
        action = int(np.argmax(a_t))
        if action == 0:
            action = 1
        else:
            action = 0
        r_t = p.act(actions[action])

        s_t1 = preprocessing.transform_image_stacked(p.getScreenRGB(), s_t,
                                                     img_size)

        # update the old values
        s_t = s_t1
        t += 1

        print("TIMESTEP", t, "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t), " / SCORE", p.score())
Example #8
class PLEEnv(gym.Env):
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, game_name, display_screen=True):
        # set headless mode
        os.environ['SDL_VIDEODRIVER'] = 'dummy'
        # open up a game state to communicate with emulator
        import importlib
        game_module_name = ('ple.games.%s' % game_name).lower()
        game_module = importlib.import_module(game_module_name)
        game = getattr(game_module, game_name)()
        self.game_state = PLE(game,
                              fps=30,
                              frame_skip=2,
                              display_screen=display_screen)
        self.game_state.init()
        self._action_set = self.game_state.getActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))
        self.screen_width, self.screen_height = self.game_state.getScreenDims()
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=(self.screen_width,
                                                   self.screen_height, 3))
        self.viewer = None
        self.count = 0

    def step(self, a):
        reward = self.game_state.act(self._action_set[a])
        state = self._get_image()
        #import scipy.misc
        #scipy.misc.imsave('outfile'+str(self.count)+'.jpg', state)
        #self.count = self.count+1
        terminal = self.game_state.game_over()
        #print(randomAction)
        #print(a,self._action_set[a])
        return state, reward, terminal, {}

    def _get_image(self):
        #image_rotated = self.game_state.getScreenRGB()
        image_rotated = np.fliplr(
            np.rot90(self.game_state.getScreenRGB(),
                     3))  # Hack to fix the rotated image returned by ple
        return image_rotated

    @property
    def n_actions(self):
        return len(self._action_set)

    # return: (states, observations)
    def reset(self):
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=(self.screen_width,
                                                   self.screen_height, 3))
        self.game_state.reset_game()
        state = self._get_image()
        return state

    def render(self, mode='human', close=False):
        #print('HERE')
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        img = self._get_image()
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)

    def seed(self, seed):
        rng = np.random.RandomState(seed)
        self.game_state.rng = rng
        self.game_state.game.rng = self.game_state.rng

        self.game_state.init()
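# A hypothetical gym-style rollout for the wrapper above (game_name must match
# a module under ple.games, e.g. 'FlappyBird'):
env = PLEEnv('FlappyBird')
state = env.reset()
done = False
while not done:
    state, reward, done, info = env.step(env.action_space.sample())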
Example #9
class Agent:
    GENERATION = 0
    AGENT_HISTORY_LENGTH = 1
    NUM_OF_ACTIONS = 2
    POPULATION_SIZE = 15
    EPS_AVG = 1
    SIGMA = 0.1
    LEARNING_RATE = 0.03
    INITIAL_EXPLORATION = 0.0
    FINAL_EXPLORATION = 0.0
    EXPLORATION_DEC_STEPS = 100000

    def __init__(self):
        self.model = Model()
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=False)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.es = EvolutionStrategy(self.model.get_weights(), self.get_reward, self.POPULATION_SIZE, self.SIGMA,
                                    self.LEARNING_RATE)
        self.exploration = self.INITIAL_EXPLORATION

    def get_predicted_action(self, sequence):
        prediction = self.model.predict(np.array(sequence))
        x = np.argmax(prediction)
        return 119 if x == 1 else None

    def load(self, filename='weights.pkl'):
        with open(filename, 'rb') as fp:
            self.model.set_weights(pickle.load(fp))
        self.es.weights = self.model.get_weights()

    def get_observation(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def save(self, filename='weights.pkl'):
        with open(filename, 'wb') as fp:
            pickle.dump(self.es.get_weights(), fp)

    def play(self, episodes):
        self.env.display_screen = True
        self.model.set_weights(self.es.weights)
        for episode in range(episodes):
            self.env.reset_game()
            observation = self.get_observation()
            sequence = [observation] * self.AGENT_HISTORY_LENGTH
            done = False
            score = 0

            while not done:
                action = self.get_predicted_action(sequence)
                reward = self.env.act(action)
                observation = self.get_observation()
                sequence = sequence[1:]
                sequence.append(observation)
                done = self.env.game_over()
                if self.game.getScore() > score:
                    score = self.game.getScore()
            self.GENERATION = self.GENERATION + 1

            print(self.GENERATION)
            print("score: %d" % score)
        self.env.display_screen = False

    def train(self, iterations):
        self.es.run(iterations, print_step=10)

    def get_reward(self, weights):
        total_reward = 0.0
        self.model.set_weights(weights)

        for episode in range(self.EPS_AVG):
            self.env.reset_game()
            observation = self.get_observation()
            sequence = [observation] * self.AGENT_HISTORY_LENGTH
            done = False
            while not done:
                self.exploration = max(self.FINAL_EXPLORATION,
                                       self.exploration - self.INITIAL_EXPLORATION / self.EXPLORATION_DEC_STEPS)
                if random.random() < self.exploration:
                    action = random.choice([119, None])
                else:
                    action = self.get_predicted_action(sequence)

                reward = self.env.act(action)
                reward += random.choice([0.0001, -0.0001])
                total_reward += reward
                observation = self.get_observation()
                sequence = sequence[1:]
                sequence.append(observation)
                done = self.env.game_over()

        return total_reward / self.EPS_AVG
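# Hypothetical usage of the evolution-strategy agent above (Model and
# EvolutionStrategy are assumed to be defined elsewhere in the project):
agent = Agent()
agent.train(iterations=100)  # optimise the weights with ES
agent.save()                 # writes weights.pkl
agent.play(episodes=3)       # watch the trained bird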
Example #10
from ple.games.SpaceInvadersGame import SpaceInvadersGame
from ple import PLE

game = SpaceInvadersGame()
p = PLE(game, fps=30, display_screen=True)
#agent = myAgentHere()

p.init()
reward = 0.0

for i in range(100):
    if p.game_over():
        p.reset_game()

    observation = p.getScreenRGB()
    #action = agent.pickAction(reward, observation)
    allowed_actions = p.getActionSet()
    reward = p.act(allowed_actions[4])
Example #11
class MonsterKongEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, map_config):
        self.map_config = map_config
        self.game = MonsterKong(self.map_config)

        self.fps = 30
        self.frame_skip = 1
        self.num_steps = 1
        self.force_fps = True
        self.display_screen = True
        self.nb_frames = 500
        self.reward = 0.0
        self.episode_end_sleep = 0.2

        if 'fps' in map_config:
            self.fps = map_config['fps']
        if 'frame_skip' in map_config:
            self.frame_skip = map_config['frame_skip']
        if 'force_fps' in map_config:
            self.force_fps = map_config['force_fps']
        if 'display_screen' in map_config:
            self.display_screen = map_config['display_screen']
        if 'episode_length' in map_config:
            self.nb_frames = map_config['episode_length']
        if 'episode_end_sleep' in map_config:
            self.episode_end_sleep = map_config['episode_end_sleep']
        self.current_step = 0

        self._seed()

        self.p = PLE(self.game,
                     fps=self.fps,
                     frame_skip=self.frame_skip,
                     num_steps=self.num_steps,
                     force_fps=self.force_fps,
                     display_screen=self.display_screen,
                     rng=self.rng)

        self.p.init()

        self._action_set = self.p.getActionSet()[1:]
        self.action_space = spaces.Discrete(len(self._action_set))
        (screen_width, screen_height) = self.p.getScreenDims()
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=(screen_height, screen_width,
                                                   3))

    def _seed(self, seed=24):
        self.rng = seed

    def _step(self, action_taken):
        reward = 0.0
        action = self._action_set[action_taken]
        reward += self.p.act(action)
        obs = self.p.getScreenRGB()
        done = self.p.game_over()
        info = {'PLE': self.p}
        self.current_step += 1
        if self.current_step >= self.nb_frames:
            done = True
        return obs, reward, done, info

    def _reset(self):
        self.current_step = 0
        # Noop and reset if done
        start_done = True
        while start_done:
            self.p.reset_game()
            _, _, start_done, _ = self._step(4)
            #self.p.init()
        if self.p.display_screen:
            self._render()
            if self.episode_end_sleep > 0:
                time.sleep(self.episode_end_sleep)
        return self.p.getScreenRGB()

    def _render(self, mode='human', close=False):
        if close:
            return  # TODO: implement close
        original = self.p.display_screen
        self.p.display_screen = True
        self.p._draw_frame()
        self.p.display_screen = original
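# A hypothetical rollout for MonsterKongEnv (the keys below mirror the ones
# read in __init__; the MonsterKong map/level configuration itself is
# project-specific and assumed to be part of map_config):
env = MonsterKongEnv({'fps': 30, 'display_screen': False, 'episode_length': 500})
obs = env._reset()
done = False
while not done:
    obs, reward, done, info = env._step(env.action_space.sample())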
Example #12
# preprocess obs
obs = preprocess(obs)
episode_reward = 0
while True:
    # predict the action, always choosing the best one
    action = agent.predict(obs)
    # sleep if the rendering runs too fast
    # time.sleep(0.02)
    # # open a window to display the score
    # observation = env.getScreenRGB()
    # score  = env.score()
    # # convert the colour format
    # observation = cv2.cvtColor(observation,cv2.COLOR_RGB2BGR)
    # # rotate by 90 degrees
    # observation = cv2.transpose(observation)
    # font = cv2.FONT_HERSHEY_SIMPLEX
    # observation = cv2.putText(observation, "score:"+str(int(score)), (0, 30), font, 0.6, (0, 0, 255), 2)
    # cv2.imshow("flappybird", observation)
    # cv2.waitKey(10)

    reward = env.act(actionset[action])
    obs = list(env.getGameState().values())
    # preprocess obs
    obs = preprocess(obs)
    done = env.game_over()
    episode_reward += reward
    if done:
        break
print("episode_reward:",episode_reward)
cv2.destroyAllWindows()
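# The preprocess helper called above is not shown in this snippet; a minimal
# sketch, assuming it only casts the 8-value FlappyBird state to float32
# (any scaling/normalisation would be project-specific):
def preprocess(obs):
    return np.array(obs, dtype=np.float32)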
Example #13
class Catcher_Env:
    def __init__(self,
                 random_seed=0,
                 init_lives=3,
                 normalise=True,
                 display=False):

        self._random_seed = random_seed
        self._game = Catcher(init_lives=init_lives)
        self._normalise = normalise
        self._display = display

        if not self._display:
            os.putenv('SDL_VIDEODRIVER', 'fbcon')
            os.environ["SDL_VIDEODRIVER"] = "dummy"

        if self._normalise:
            self._env = PLE(self._game,
                            fps=30,
                            state_preprocessor=self._normalise_ob,
                            display_screen=display)
        else:
            self._env = PLE(self._game,
                            fps=30,
                            state_preprocessor=self._ob,
                            display_screen=display)

        self._env.init()
        self._actions = self._env.getActionSet()
        self._env.rng.seed(random_seed)

        # Tracker
        self._cum_reward = 0

    def _ob(self, state):
        return np.array([
            state['player_x'], state['player_vel'], state['fruit_x'],
            state['fruit_y']
        ])

    def _normalise_ob(self, state):
        state = np.array([
            state['player_x'], state['player_vel'], state['fruit_x'],
            state['fruit_y']
        ])
        state[0] = (state[0] - 26) / 26  # makes range -1 1
        state[1] = (state[1]) / 8  # makes range -1 1
        state[2] = (state[2] - 26) / 26  # makes range -1 1
        state[3] = (state[3] - 20) / 45  # makes range -1 1

        return state

    def reset(self):
        self._cum_reward = 0
        self._env.reset_game()

        return self._env.getGameState()

    def action_set(self):
        return self._actions

    def num_actions(self):
        return len(self._actions)

    def episode_return(self):
        return self._cum_reward

    def act(self, a):
        reward = self._env.act(self._actions[a])
        if reward == -6:
            reward = -1

        self._cum_reward += reward

        next_obs = self._env.getGameState()
        terminal = self._env.game_over()
        if self._cum_reward >= 200:
            self._cum_reward = 200
            terminal = True
        return reward, next_obs, terminal
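# Hypothetical random-policy rollout for the Catcher wrapper above
# (assumes numpy is imported as np):
env = Catcher_Env(random_seed=0, normalise=True, display=False)
obs = env.reset()
terminal = False
while not terminal:
    a = np.random.randint(env.num_actions())
    reward, obs, terminal = env.act(a)
print('episode return:', env.episode_return())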
Example #14
                             (1, ))  # don't quite like this one but...

# Ready log
logFile = open(log_output, 'w')
logFile.write('Step,Episode,Loss,Mean_Reward,Time \n')
game, episode_reward, mean_reward = 0, 0, 0
start_time = time.time()

# Passes
epoch, loss = 0, float('Inf')
while epoch < NB_EPOCHS:

    # select action
    a = epsilon_greedy_action(model, features, epoch)
    # get reward
    r = p.act(actions[a])
    episode_reward += r
    screen_y = process_screen(p.getScreenRGB())
    d = p.game_over()
    replay_memory.append(screen_x, a, r, screen_y, d)
    # train
    if epoch > BATCH and epoch % ACCELERATE_TRAINING == 0 and epoch > OBSERVE:
        X, A, R, Y, D = replay_memory.minibatch(BATCH)
        QY = model_target.predict(Y)
        QYmax = QY.max(1).reshape((BATCH, 1))
        update = R + GAMMA * (1 - D) * QYmax
        QX = model.predict(X)
        QX[np.arange(BATCH), A.ravel()] = update.ravel()
        loss = float(model.train_on_batch(x=X, y=QX))

    # transfer weights between the networks
class Agent:

    LEARNING_RATE = 0.003
    EPISODE = 500
    LAYER_SIZE = 500
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    INPUT_SIZE = 8
    # based on documentation, features got 8 dimensions
    OUTPUT_SIZE = 2

    # output is 2 dimensions, 0 = do nothing, 1 = jump

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game,
                       fps=30,
                       display_screen=screen,
                       force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.X = tf.placeholder(tf.float32, (None, self.INPUT_SIZE))
        self.REWARDS = tf.placeholder(tf.float32, (None))
        self.ACTIONS = tf.placeholder(tf.int32, (None))
        input_layer = tf.Variable(
            tf.random_normal([self.INPUT_SIZE, self.LAYER_SIZE]))
        bias = tf.Variable(tf.random_normal([self.LAYER_SIZE]))
        output_layer = tf.Variable(
            tf.random_normal([self.LAYER_SIZE, self.OUTPUT_SIZE]))
        feed_forward = tf.nn.relu(tf.matmul(self.X, input_layer) + bias)
        self.logits = tf.nn.softmax(tf.matmul(feed_forward, output_layer))
        indexes = tf.range(0,
                           tf.shape(self.logits)[0]) * tf.shape(
                               self.logits)[1] + self.ACTIONS
        responsible_outputs = tf.gather(tf.reshape(self.logits, [-1]), indexes)
        self.cost = -tf.reduce_mean(tf.log(responsible_outputs) * self.REWARDS)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.LEARNING_RATE).minimize(self.cost)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []

    def predict(self, inputs):
        return self.sess.run(self.logits, feed_dict={self.X: inputs})

    def save(self, checkpoint_name):
        self.saver.save(self.sess,
                        os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess,
                           os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_predicted_action(self, sequence):
        prediction = self.predict(np.array(sequence))[0]
        return np.argmax(prediction)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            ep_history = []
            for k in range(self.EPISODE):
                total_reward = 0
                self.env.reset_game()
                done = False
                state = self.get_state()
                sequence = [state]
                while not done:
                    action = self._select_action(state)
                    real_action = 119 if action == 1 else None
                    reward = self.env.act(real_action)
                    reward += random.choice([0.0001, -0.0001])
                    total_reward += reward
                    next_state = self.get_state()
                    ep_history.append(
                        [state, action, total_reward, next_state])
                    state = next_state
                    sequence = [state]
                    done = self.env.game_over()
                ep_history = np.array(ep_history)
                ep_history[:, 2] = discount_rewards(ep_history[:, 2])
                _, cost = self.sess.run(
                    [self.optimizer, self.cost],
                    feed_dict={
                        self.X: np.vstack(ep_history[:, 0]),
                        self.REWARDS: ep_history[:, 2],
                        self.ACTIONS: ep_history[:, 1]
                    })
            self.rewards.append(total_reward)
            if (i + 1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)

    def play(self, debug=False, not_realtime=False):
        total_reward = 0.0
        current_reward = 0
        self.env.force_fps = not_realtime
        self.env.reset_game()
        done = False
        while not done:
            state = self.get_state()
            action = self._select_action(state)
            real_action = 119 if action == 1 else None
            action_string = 'eh, jump!' if action == 1 else 'erm, do nothing..'
            if debug and total_reward > current_reward:
                print(action_string, 'total rewards:', total_reward)
            current_reward = total_reward
            total_reward += self.env.act(real_action)
            done = self.env.game_over()
        print('game over!')
class Agent:

    LEARNING_RATE = 0.003
    BATCH_SIZE = 32
    INPUT_SIZE = 8
    LAYER_SIZE = 500
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    MEMORIES = deque()
    MEMORY_SIZE = 300
    INITIAL_FEATURES = np.zeros((4, INPUT_SIZE))

    # based on documentation, features got 8 dimensions
    # output is 2 dimensions, 0 = do nothing, 1 = jump

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game,
                       fps=30,
                       display_screen=screen,
                       force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.X = tf.placeholder(tf.float32, (None, None, self.INPUT_SIZE))
        self.Y = tf.placeholder(tf.float32, (None, self.OUTPUT_SIZE))
        cell = tf.nn.rnn_cell.LSTMCell(512, state_is_tuple=False)
        self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * 512))
        self.rnn, self.last_state = tf.nn.dynamic_rnn(
            inputs=self.X,
            cell=cell,
            dtype=tf.float32,
            initial_state=self.hidden_layer)
        # dueling streams: split the last RNN output into an advantage half and
        # a value half; the two projection matrices below are missing from the
        # original snippet and are assumed here
        action_layer = tf.Variable(tf.random_normal([256, self.OUTPUT_SIZE]))
        validation_layer = tf.Variable(tf.random_normal([256, 1]))
        self.tensor_action, self.tensor_validation = tf.split(
            self.rnn[:, -1, :], 2, 1)
        self.feed_action = tf.matmul(self.tensor_action, action_layer)
        self.feed_validation = tf.matmul(self.tensor_validation,
                                         validation_layer)
        self.logits = self.feed_validation + tf.subtract(
            self.feed_action,
            tf.reduce_mean(self.feed_action, axis=1, keep_dims=True))
        self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.LEARNING_RATE).minimize(self.cost)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []

    def _memorize(self, state, action, reward, new_state, dead):
        self.MEMORIES.append((state, action, reward, new_state, dead))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _select_action(self, state):
        if np.random.rand() < self.EPSILON:
            action = np.random.randint(self.OUTPUT_SIZE)
        else:
            action = self.get_predicted_action([state])
        return action

    def _construct_memories(self, replay):
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.predict(states)
        Q_new = self.predict(new_states)
        replay_size = len(replay)
        X = np.empty((replay_size, self.INPUT_SIZE))
        Y = np.empty((replay_size, self.OUTPUT_SIZE))
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, dead_r = replay[i]
            target = Q[i]
            target[action_r] = reward_r
            if not dead_r:
                target[action_r] += self.GAMMA * np.amax(Q_new[i])
            X[i] = state_r
            Y[i] = target
        return X, Y

    def predict(self, inputs):
        return self.sess.run(self.logits, feed_dict={self.X: inputs})

    def save(self, checkpoint_name):
        self.saver.save(self.sess,
                        os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess,
                           os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_predicted_action(self, sequence):
        prediction = self.predict(np.array(sequence))[0]
        return np.argmax(prediction)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            dead = False
            init_value = np.zeros((1, 2 * 512))
            state = self.get_state()
            for j in range(self.INITIAL_FEATURES.shape[0]):
                self.INITIAL_FEATURES[j, :] = state
            while not dead:
                if (self.T_COPY + 1) % self.COPY == 0:
                    self._assign()
                if np.random.rand() < self.EPSILON:
                    action = np.random.randint(self.OUTPUT_SIZE)
                else:
                    action, last_state = self.sess.run(
                        [self.logits, self.last_state],
                        feed_dict={
                            self.X: [self.INITIAL_FEATURES],
                            self.hidden_layer: init_value
                        })
                    action, init_value = np.argmax(action[0]), last_state
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                new_state = self.get_state()
                dead = self.env.game_over()
                self._memorize(state, action, reward, new_state, dead,
                               init_value)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                X, Y, init_values = self._construct_memories(replay)
                cost, _ = self.sess.run([self.cost, self.optimizer],
                                        feed_dict={
                                            self.X: X,
                                            self.Y: Y,
                                            self.hidden_layer: init_values
                                        })
            if (i + 1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
Example #17
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    #setupGame()
    gameClass = FlappyBird(width=288, height=512, pipe_gap=100)
    
    fps = 30
    frame_skip = 2
    num_steps = 1
    force_fps = False
    display_screen = True
    reward = 0.0
    nb_frames = 15000

    game = PLE(gameClass, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
            force_fps=force_fps, display_screen=display_screen)

    game.init()
    actions = game.getActionSet()

    # store the previous observations in replay memory
    D = deque()

    # printing
    logdir = "logs_" + GAME
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    a_file = open(logdir + "/readout.txt", 'w')
    h_file = open(logdir + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    r_0 = game.act(game.NOOP)
    x_t = game.getScreenGrayscale()
    terminal = game.game_over()
    if terminal:
        print "NOOOO"
        game.reset_game()
    
    x_t = cv2.resize(x_t, (80, 80))
    ret, x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis = 2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    '''
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print "Successfully loaded:", checkpoint.model_checkpoint_path
    else:
        print "Could not find old network weights"
    '''
    epsilon = INITIAL_EPSILON
    t = 0
    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict = {s : [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[random.randrange(ACTIONS)] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            r_t = game.act(actions[np.argmax(a_t)])
            x_t1 = game.getScreenGrayscale()
            terminal = game.game_over()
            if terminal:
                print "NOOO2"
                game.reset_game()

            x_t1 = cv2.resize(x_t1, (80, 80))
            ret, x_t1 = cv2.threshold(x_t1,1,255,cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal only equals reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict = {
                y : y_batch,
                a : a_batch,
                s : s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t)

        # write info to files
        '''
Example #18
  if action == 1:
      action_value = 119
  else:
      action_value = None
  if (i>1):
      for j in range(37-1, 0, -1):
          x_wall[j] = int(x_wall[j-1])
          y_wall[j] = int(y_wall[j-1])
          v_wall[j] = int(v_wall[j-1])
          a_wall[j] = int(a_wall[j-1])
      x_wall[0] = int(x)
      y_wall[0] = int(y)
      v_wall[0] = int(v)
      a_wall[0] = int(action)
 
  # reward is +1 if the bird flies past a pipe
  reward = p.act(action_value)
  my_reward = 0
  if reward == 1:
      my_reward = r_1
      cumulated[i] += 1
      for j in range(1, 40):
          Q[int(y_wall[j]),int(x_wall[j]),int(v_wall[j]),int(a_wall[j])] += alpha * (my_reward + np.max(Q[int(y_wall[j-1]),int(x_wall[j-1]),int(v_wall[j-1]),int(a_wall[j-1])]))
  
  # bad result: -100
  if reward < 0:
      my_reward = r_2
      if (x==20):
          for j in range(0, 27):
              Q[int(y_wall[j]),int(x_wall[j]),int(v_wall[j]),int(a_wall[j])] += alpha * (my_reward + np.max(Q[int(y_wall[j-1]),int(x_wall[j-1]),int(v_wall[j-1]),int(a_wall[j-1])]))
      else:
         for j in range(0, 6):
reward = 0.0
max_noops = 20
nb_frames = 15000

# make a PLE instance
p = PLE(game, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
        force_fps=force_fps, display_screen=display_screen)

#our Naive agent!
agent = NaiveAgent(p.getActionSet())

#init agent and game.
p.init()

# let's do a random number of NOOPs
for i in range(np.random.randint(0, max_noops)):
    reward = p.act(p.NOOP)

# start our training loop
for f in range(nb_frames):
    # if the game is over
    if p.game_over():
        p.reset_game()

    obs = p.getScreenRGB()
    action = agent.pickAction(reward, obs)
    reward = p.act(action)

    if f % 50 == 0:
        p.saveScreen("screen_capture.png")
class Agent:

    POPULATION_SIZE = 15
    SIGMA = 0.1
    LEARNING_RATE = 0.03
    INITIAL_IMAGES = np.zeros((80, 80, 4))
    EPSILON = 0.4
    INITIAL_EPSILON = 0.01
    WATCHING = 10000

    # based on documentation, features got 8 dimensions
    # output is 5 dimensions, 0 = left, 1 = right, 2 = up, 3 = down, 4 = space

    def __init__(self, model, screen=False, forcefps=True):
        self.model = model
        self.game = MonsterKong()
        self.env = PLE(self.game,
                       fps=30,
                       display_screen=screen,
                       force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.es = Deep_Evolution_Strategy(self.model.get_weights(),
                                          self.get_reward,
                                          self.POPULATION_SIZE, self.SIGMA,
                                          self.LEARNING_RATE)
        self.rewards = []

    def _get_image(self, image):
        r, g, b = image[:, :, 0], image[:, :, 1], image[:, :, 2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return imresize(gray, size=(80, 80))

    def _map_action(self, action):
        if action == 0:
            return 97
        if action == 1:
            return 100
        if action == 2:
            return 119
        if action == 3:
            return 115
        if action == 4:
            return 32

    def get_predicted_action(self, sequence):
        if random.random() > self.EPSILON:
            prediction = np.argmax(self.model.predict(np.array(sequence))[0])
        else:
            prediction = np.random.randint(5)
        self.EPSILON -= (self.EPSILON / self.WATCHING)
        return prediction

    def save(self, checkpoint_name):
        with open('%s-weight.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.model.get_weights(), fopen)
        with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        with open('%s-weight.p' % (checkpoint_name), 'rb') as fopen:
            self.model.set_weights(pickle.load(fopen))
        with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, weights):
        self.model.weights = weights
        total_reward = 0.0
        self.env.reset_game()
        state = self._get_image(self.env.getScreenRGB())
        for i in range(self.INITIAL_IMAGES.shape[2]):
            self.INITIAL_IMAGES[:, :, i] = state
        done = False
        while not done:
            action = self.get_predicted_action([self.INITIAL_IMAGES])
            real_action = self._map_action(action)
            reward = self.env.act(real_action)
            reward += random.choice([0.0001, -0.0001])
            total_reward += reward
            state = self._get_image(self.env.getScreenRGB())
            self.INITIAL_IMAGES = np.append(state.reshape([80, 80, 1]),
                                            self.INITIAL_IMAGES[:, :, :3],
                                            axis=2)
            done = self.env.game_over()
        self.rewards.append(total_reward)
        return total_reward

    def fit(self, iterations, checkpoint):
        self.es.train(iterations, print_every=checkpoint)

    def play(self, debug=False, not_realtime=False):
        total_reward = 0.0
        current_reward = 0
        self.env.force_fps = not_realtime
        self.env.reset_game()
        state = self._get_image(self.env.getScreenRGB())
        for i in range(self.INITIAL_IMAGES.shape[2]):
            self.INITIAL_IMAGES[:, :, i] = state
        done = False
        while not done:
            action = self.get_predicted_action(self.INITIAL_IMAGES)
            real_action = self._map_action(action)
            if debug and total_reward > current_reward:
                print('action:', action, 'total rewards:', total_reward)
            current_reward = total_reward
            total_reward += self.env.act(real_action)
            state = self._get_image(self.env.getScreenRGB())
            self.INITIAL_IMAGES = np.append(state.reshape([80, 80, 1]),
                                            self.INITIAL_IMAGES[:, :, :3],
                                            axis=2)
            done = self.env.game_over()
        print('game over!')
class Agent:

    LEARNING_RATE = 0.003
    BATCH_SIZE = 32
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    MEMORIES = deque()
    COPY = 1000
    T_COPY = 0
    MEMORY_SIZE = 300
    INPUT_SIZE = 8
    LAYER_SIZE = 500
    OUTPUT_SIZE = 2
    INITIAL_FEATURES = np.zeros((4, INPUT_SIZE))

    # based on documentation, features got 8 dimensions
    # output is 2 dimensions, 0 = do nothing, 1 = jump

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game,
                       fps=30,
                       display_screen=screen,
                       force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        # input_size, output_size, layer_size, learning_rate, name
        self.model = Model(self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE,
                           self.LEARNING_RATE, 'real_model')
        self.model_negative = Model(self.INPUT_SIZE, self.OUTPUT_SIZE,
                                    self.LAYER_SIZE, self.LEARNING_RATE,
                                    'negative_model')
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []

    def _assign(self, from_name, to_name):
        from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope=from_name)
        to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=to_name)
        for i in range(len(from_w)):
            assign_op = to_w[i].assign(from_w[i])
            self.sess.run(assign_op)

    def _memorize(self, state, action, reward, new_state, dead, rnn_state):
        self.MEMORIES.append(
            (state, action, reward, new_state, dead, rnn_state))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _construct_memories(self, replay):
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        init_values = np.array([a[-1][0] for a in replay])
        Q = self.sess.run(self.model.logits,
                          feed_dict={
                              self.model.X: states,
                              self.model.hidden_layer: init_values
                          })
        Q_new = self.sess.run(self.model.logits,
                              feed_dict={
                                  self.model.X: new_states,
                                  self.model.hidden_layer: init_values
                              })
        Q_new_negative = self.sess.run(self.model_negative.logits,
                                       feed_dict={
                                           self.model_negative.X: new_states,
                                           self.model_negative.hidden_layer:
                                           init_values
                                       })
        replay_size = len(replay)
        X = np.empty((replay_size, self.INPUT_SIZE))
        Y = np.empty((replay_size, self.OUTPUT_SIZE))
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, dead_r = replay[i][:5]
            target = Q[i]
            target[action_r] = reward_r
            if not dead_r:
                target[action_r] += self.GAMMA * Q_new_negative[
                    i, np.argmax(Q_new[i])]
            X[i] = state_r
            Y[i] = target
        return X, Y, init_values

    def save(self, checkpoint_name):
        self.saver.save(self.sess,
                        os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess,
                           os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            dead = False
            init_value = np.zeros((1, 2 * 512))
            state = self.get_state()
            for j in range(self.INITIAL_FEATURES.shape[0]):
                self.INITIAL_FEATURES[j, :] = state
            while not dead:
                if (self.T_COPY + 1) % self.COPY == 0:
                    self._assign('real_model', 'negative_model')
                if np.random.rand() < self.EPSILON:
                    action = np.random.randint(self.OUTPUT_SIZE)
                else:
                    action, last_state = self.sess.run(
                        [self.model.logits, self.model.last_state],
                        feed_dict={
                            self.model.X: [self.INITIAL_FEATURES],
                            self.model.hidden_layer: init_value
                        })
                    action, init_value = np.argmax(action[0]), last_state
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                new_state = np.append(self.get_state().reshape((1, -1)),
                                      self.INITIAL_FEATURES[:3, :],
                                      axis=0)
                dead = self.env.game_over()
                self._memorize(state, action, reward, new_state, dead,
                               init_value)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                X, Y, init_values = self._construct_memories(replay)
                cost, _ = self.sess.run(
                    [self.model.cost, self.model.optimizer],
                    feed_dict={
                        self.model.X: X,
                        self.model.Y: Y,
                        self.model.hidden_layer: init_values
                    })
                self.T_COPY += 1
            self.rewards.append(total_reward)
            self.EPSILON = self.MIN_EPSILON + (
                1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
            if (i + 1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
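# A minimal numpy sketch (not part of the example above) of the double-DQN
# target built in _construct_memories: the online network chooses the next
# action, the target ("negative") network evaluates it. All names and shapes
# below are illustrative assumptions.
import numpy as np

def double_dqn_targets(Q, Q_next_online, Q_next_target, actions, rewards,
                       dones, gamma=0.99):
    # Q:              (batch, n_actions) online-net values for the sampled states
    # Q_next_online:  (batch, n_actions) online-net values for the next states
    # Q_next_target:  (batch, n_actions) target-net values for the next states
    Y = Q.copy()
    rows = np.arange(Q.shape[0])
    best_next = np.argmax(Q_next_online, axis=1)       # argmax by the online net
    bootstrap = Q_next_target[rows, best_next]          # evaluated by the target net
    Y[rows, actions] = rewards + gamma * bootstrap * (1.0 - dones)
    return Y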
Пример #22
0
class PLEEnv(gym.Env):
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, game_name='FlappyBird', display_screen=True):
        # open up a game state to communicate with emulator
        import importlib
        game_module_name = ('ple.games.%s' % game_name).lower()
        game_module = importlib.import_module(game_module_name)
        game = getattr(game_module, game_name)()
        self.game = game
        self.game_state = PLE(game, fps=30, display_screen=display_screen)
        self.game_state.init()

        # increase gap for checking
        #self.game.pipe_gap = 115
        #self.game.player.height = 14

        self._action_set = self.game_state.getActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))
        self.screen_width, self.screen_height = self.game_state.getScreenDims()
        #print(self.screen_width, self.screen_height)
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=(self.screen_width,
                                                   self.screen_height, 3))
        self.viewer = None

    def _step(self, a):
        reward = self.game_state.act(self._action_set[a])
        state = self._get_image()
        terminal = self.game_state.game_over()
        return state, reward, terminal, {}

    def _get_image(self):
        image_rotated = np.fliplr(
            np.rot90(self.game_state.getScreenRGB(),
                     3))  # Hack to fix the rotated image returned by ple
        return image_rotated

    @property
    def _n_actions(self):
        return len(self._action_set)

    # return: (states, observations)
    def _reset(self):
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=(self.screen_width,
                                                   self.screen_height, 3))
        self.game_state.reset_game()
        state = self._get_image()
        return state

    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        img = self._get_image()
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)

    def _seed(self, seed):
        rng = np.random.RandomState(seed)
        self.game_state.rng = rng
        self.game_state.game.rng = self.game_state.rng

        self.game_state.init()
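# A hedged usage sketch for the wrapper above: a short random rollout. It calls
# the underscore methods directly because the class targets the older gym API
# (where Env.step/reset dispatch to _step/_reset); nothing here comes from the
# original snippet.
if __name__ == '__main__':
    env = PLEEnv(game_name='FlappyBird', display_screen=False)
    obs = env._reset()
    done, episode_return = False, 0.0
    while not done:
        obs, reward, done, info = env._step(env.action_space.sample())
        episode_return += reward
    print('random rollout return:', episode_return)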
Пример #23
0
class Agent:
    def __init__(self, hyper: dict, game: PyGameWrapper):
        self.hyper = hyper
        self.game = game
        self.p = PLE(game, fps=30, display_screen=True)
        self.p.init()

        self.memory = ReplayBuffer(hyper['obs_dim'], hyper['capacity'],
                                   hyper['batch_size'])
        self.epsilon_decay = hyper['epsilon_decay']
        self.epsilon = hyper['max_epsilon']
        self.max_epsilon = hyper['max_epsilon']
        self.min_epsilon = hyper['min_epsilon']
        self.gamma = torch.tensor(hyper['gamma']).to(Pytorch.device())
        self.target_update = hyper['target_update']

        self.dqn = Network(hyper['obs_dim'],
                           hyper['action_dim']).to(Pytorch.device())
        self.dqn_target = Network(hyper['obs_dim'],
                                  hyper['action_dim']).to(Pytorch.device())
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        self.optimizer = optim.Adam(self.dqn.parameters())
        self.transition = list()
        self.is_test = hyper['test']
        self.epochs = hyper['epochs']
        self.batch_size = hyper['batch_size']
        self.epoch_log = hyper['epoch_log']

    def select_action(self, state: np.ndarray) -> int:
        def random_action(scale=1):
            # with the default scale=1 this always returns 1 (flap);
            # lower scale to mix in action 0
            action_max = int(scale * 100)
            r = random.randint(0, 100)
            if r <= action_max:
                return 1
            return 0

        """
        使用贪心( ε—greedy )搜索方法来对环境进行探索

        以 ε—greedy搜索以概率 ε 从所有可能的动作中随机选取一个动作
        以 1- ε 的概率选择已知的最好的动作(即当前状态下,Q值最大的那个动作)

        在初期, ε 的值应更大一些(即注重对环境的探索),随后逐渐减小 ε 的值(即注重对于Q值表的使用)
        self.epsilon会随着回合数减小,实现 ε 的值随着回合数的增加而递减。
        """
        if self.epsilon > np.random.random():
            selected_action = random_action()
        else:
            # get the action from the network
            selected_action = self.dqn(
                torch.FloatTensor(state).to(Pytorch.device())).argmax()
            selected_action = selected_action.detach().cpu().item()

        if not self.is_test:
            self.transition = [state, selected_action]

        return selected_action

    def step(self, action: int):
        reward = self.p.act(action)
        # store the current state, action, reward, next state and terminal flag
        if not self.is_test:
            self.transition += [reward, self.state(), self.p.game_over()]
            self.memory.store(*self.transition)
        return reward

    def update_model(self):
        samples = self.memory.sample_batch()
        loss = self._compute_dqn_loss(samples)
        loss = loss / self.batch_size  # len(samples) counts dict keys, not transitions
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def _compute_dqn_loss(self, samples: Dict[str,
                                              np.ndarray]) -> torch.Tensor:
        """Return dqn loss."""
        device = Pytorch.device()
        state = torch.FloatTensor(samples["obs"]).to(device)
        next_state = torch.FloatTensor(samples["next_obs"]).to(device)
        action = torch.LongTensor(samples["acts"].reshape(-1, 1)).to(device)
        reward = torch.FloatTensor(samples["rews"].reshape(-1, 1)).to(device)
        done = torch.FloatTensor(samples["done"].reshape(-1, 1)).to(device)

        # G_t   = r + gamma * v(s_{t+1})  if state != Terminal
        #       = r                       otherwise
        curr_q_value = self.dqn(state).gather(1, action)
        next_q_value = self.dqn_target(next_state).max(
            dim=1, keepdim=True)[0].detach()
        mask = 1 - done
        target = (reward + self.gamma * next_q_value * mask).to(device)

        loss = func.smooth_l1_loss(curr_q_value, target)
        return loss

    def _target_hard_update(self):
        """Hard update: target <- local."""
        self.dqn_target.load_state_dict(self.dqn.state_dict())

    def state(self):
        obs = self.game.getGameState()
        return np.array([
            obs['player_y'], obs['player_vel'],
            obs['next_pipe_dist_to_player'], obs['next_pipe_top_y'],
            obs['next_pipe_bottom_y'], obs['next_next_pipe_dist_to_player'],
            obs['next_next_pipe_top_y'], obs['next_next_pipe_bottom_y']
        ])

    def train(self):
        self.is_test = False
        epsilons, losses, reward_records, update_cnt = [], [], [], 0
        for frame_idx in range(1, self.epochs + 1):
            self.p.reset_game()
            reward = 0
            while not self.p.game_over():
                # choose an action
                state = self.state()
                action = self.select_action(state)
                # execute the action; step() collects the reward and stores the transition
                step_reward = self.step(action)
                reward = reward + step_reward

            # compute the loss and take a gradient-descent step
            loss = self.update_model()
            losses.append(loss)
            update_cnt += 1

            # decay epsilon
            self.epsilon = max(
                self.min_epsilon, self.epsilon -
                (self.max_epsilon - self.min_epsilon) * self.epsilon_decay)
            epsilons.append(self.epsilon)

            # update the target network
            if update_cnt % self.target_update == 0:
                self._target_hard_update()

            reward_records.append(reward)
            if frame_idx % self.epoch_log == 0:
                avg_score = '%.2f' % np.mean(reward_records)
                logger.info("Epoch: %s, Score: %s, Avg-Score: %s, Loss: %s" %
                            (frame_idx, reward, avg_score, loss))

    def test(self) -> None:
        self.is_test = True

        self.p.reset_game()
        total_reward = 0

        while not self.p.game_over():
            action = self.select_action(self.state())
            total_reward += self.step(action)
        logger.info("Total-Reward: %s" % total_reward)
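# A hedged usage sketch for the PyTorch agent above. Only the dictionary keys
# come from __init__; every value (and the FlappyBird import) is an
# illustrative assumption.
if __name__ == '__main__':
    from ple.games.flappybird import FlappyBird
    hyper = {
        'obs_dim': 8, 'action_dim': 2,
        'capacity': 10000, 'batch_size': 32,
        'max_epsilon': 1.0, 'min_epsilon': 0.01, 'epsilon_decay': 0.001,
        'gamma': 0.99, 'target_update': 100,
        'test': False, 'epochs': 2000, 'epoch_log': 50,
    }
    agent = Agent(hyper, FlappyBird(pipe_gap=125))
    agent.train()
    agent.test()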
Пример #24
0
def train_agent(number_of_episodes):
    game = FlappyBird()

    rewards = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": -5.0,
        "win": 0.0
    }

    env = PLE(game=game, fps=30, display_screen=False, reward_values=rewards)

    # Reset environment at the beginning
    env.reset_game()

    training_score = 0
    max_training_score = 0
    episode_number = 1

    state_action_reward = ()

    results = []
    state_transition = 0

    every_100th = 1

    while number_of_episodes > 0:

        # Get current state
        state = BasicQLearningAgent.get_state(env.game.getGameState())

        # Select action in state "state"
        action = basic_q_agent.compute_action_from_q_values(state)

        if action is None:
            raise IllegalActionException("Illegal action occurred.")
        """
        After choosing action, get reward.
        PLE environment method act() returns the reward that the agent has accumulated while performing the action.
        """
        reward = env.act(env.getActionSet()[action])
        training_score += reward

        max_training_score = max(training_score, max_training_score)

        game_over = env.game_over()

        # observe the result
        if state_action_reward:
            basic_q_agent.update(state_action_reward[0],
                                 state_action_reward[1], state,
                                 state_action_reward[2])
            state_transition += 1

        state_action_reward = (state, action, reward)

        if game_over:
            print("===========================")
            print("Episode: " + str(episode_number))
            print("Training score: " + str(training_score))
            print("Max. training score: " + str(max_training_score))
            print("===========================\n")
            if every_100th == 100:
                results.append((episode_number, training_score))
                every_100th = 0
            episode_number += 1
            every_100th += 1
            number_of_episodes -= 1
            training_score = 0
            state_transition = 0
            env.reset_game()

    f = open("basicq_150000.txt", "w")
    f.write(str(basic_q_agent.Q_matrix))
    f.close()

    f = open("results_150000.txt", "w")
    f.write(str(results))
    f.close()
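# A hedged sketch of the one-step Q-learning backup that basic_q_agent.update
# presumably performs on its Q_matrix; the dict-of-pairs layout, alpha and
# gamma are assumptions, only the update rule itself is standard:
#   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
def q_learning_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.99, n_actions=2):
    q_sa = Q.get((s, a), 0.0)
    best_next = max(Q.get((s_next, b), 0.0) for b in range(n_actions))
    Q[(s, a)] = q_sa + alpha * (r + gamma * best_next - q_sa)
    return Q[(s, a)]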
Пример #25
0
p = PLE(game, fps=fps, force_fps=False)
agent = NaiveAgent(p.getActionSet())
reward = 0.0
h = HashState(game.getGameState())  # sets up hash value
# reads in arguments/file names
f = sys.argv[1]
o = open(f, 'r')
array = agent.file_to_array(o, h.seed)
# if no third argument was given, use '2'
arg = 2
if len(sys.argv) == 3:
    arg = int(sys.argv[2])

if arg == 1:
    # if just using table contents, not learning
    while True:
        if p.game_over():
            p.reset_game()

        obs = game.getGameState()
        mid = obs['frog_y'] > 261.0
        obs_value = h.add_table(obs, mid)
        action = p.act(agent.actions[np.argmax(array[obs_value])])
else:
    # if 0, starts table from scratch, otherwise resumes from file
    if arg == 0:
        array = None
    # runs learning
    obs = game.getGameState()
    action = agent.pickAction(array, obs)
Пример #26
0
class Agent:

    LEARNING_RATE = 1e-6
    BATCH_SIZE = 32
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    MEMORIES = deque()
    MEMORY_SIZE = 300
    INITIAL_IMAGES = np.zeros((80, 80, 4))
    # based on the documentation, the game state has 8 features;
    # the output has 2 dimensions: 0 = do nothing, 1 = jump
    # the four constants below are referenced later in the class but were not
    # defined in the snippet; their values here are assumptions
    INPUT_SIZE = 8
    LAYER_SIZE = 500
    COPY = 1000
    T_COPY = 0

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.actor = Actor('actor', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE)
        self.actor_target = Actor('actor-target', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE)
        self.critic = Critic('critic', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE)
        self.critic_target = Critic('critic-target', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE)
        self.grad_critic = tf.gradients(self.critic.logits, self.critic.Y)
        self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE])
        weights_actor = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
        self.grad_actor = tf.gradients(self.actor.logits, weights_actor, -self.actor_critic_grad)
        grads = zip(self.grad_actor, weights_actor)
        self.optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE).apply_gradients(grads)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []

    def _assign(self, from_name, to_name):
        from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_name)
        to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_name)
        for i in range(len(from_w)):
            assign_op = to_w[i].assign(from_w[i])
            self.sess.run(assign_op)

    def _memorize(self, state, action, reward, new_state, dead):
        self.MEMORIES.append((state, action, reward, new_state, dead))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _get_image(self, image):
        r, g, b = image[:,:,0], image[:,:,1], image[:,:,2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return imresize(gray, size = (80, 80))

    def _select_action(self, state):
        if np.random.rand() < self.EPSILON:
            action = np.random.randint(self.OUTPUT_SIZE)
        else:
            prediction = self.sess.run(self.actor.logits, feed_dict={self.actor.X: [state]})[0]
            action = np.argmax(prediction)
        return action

    def _construct_memories_and_train(self, replay):
        # state_r, action_r, reward_r, new_state_r, dead_r = replay
        # train actor
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.sess.run(self.actor.logits, feed_dict={self.actor.X: states})
        Q_target = self.sess.run(self.actor_target.logits, feed_dict={self.actor_target.X: states})
        grads = self.sess.run(self.grad_critic, feed_dict={self.critic.X:states, self.critic.Y:Q})
        self.sess.run(self.optimizer, feed_dict={self.actor.X:states, self.actor_critic_grad:grads})

        # train critic
        rewards = np.array([a[2] for a in replay]).reshape((-1, 1))
        rewards_target = self.sess.run(self.critic_target.logits, feed_dict={self.critic_target.X:new_states,self.critic_target.Y:Q_target})
        for i in range(len(replay)):
            if not replay[i][-1]:  # bootstrap only for non-terminal transitions
                rewards[i, 0] += self.GAMMA * rewards_target[i, 0]
        cost, _ = self.sess.run([self.critic.cost, self.critic.optimizer],
                                feed_dict={self.critic.X: states, self.critic.Y: Q, self.critic.REWARD: rewards})
        return cost

    def predict(self, inputs):
        return self.sess.run(self.actor.logits, feed_dict={self.actor.X: inputs})

    def save(self, checkpoint_name):
        self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" %(checkpoint_name))
        with open('%s-acc.p'%(checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" %(checkpoint_name))
        with open('%s-acc.p'%(checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_predicted_action(self, sequence):
        prediction = self.predict(np.array(sequence))[0]
        return np.argmax(prediction)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            state = self._get_image(self.env.getScreenRGB())
            for k in range(self.INITIAL_IMAGES.shape[2]):
                self.INITIAL_IMAGES[:,:,k] = state
            dead = False
            while not dead:
                if (self.T_COPY + 1) % self.COPY == 0:
                    self._assign('actor', 'actor-target')
                    self._assign('critic', 'critic-target')
                action  = self._select_action(self.INITIAL_IMAGES)
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                state = self._get_image(self.env.getScreenRGB())
                new_state = np.append(state.reshape([80, 80, 1]), self.INITIAL_IMAGES[:, :, :3], axis=2)
                dead = self.env.game_over()
                self._memorize(self.INITIAL_IMAGES, action, reward, new_state, dead)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                cost = self._construct_memories_and_train(replay)
                self.INITIAL_IMAGES = new_state
                self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
                self.T_COPY += 1
            self.rewards.append(total_reward)
            if (i+1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
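# A hedged sketch of the frame preprocessing used above (grayscale + 80x80
# resize + a rolling stack of the last four frames), written with OpenCV since
# scipy.misc.imresize is no longer available in recent SciPy releases; cv2 is
# an assumption, not an import from the original snippet.
import cv2
import numpy as np

def preprocess_frame(rgb_frame):
    gray = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2GRAY)
    return cv2.resize(gray, (80, 80))

def push_frame(stack, frame):
    # newest frame goes into channel 0, the oldest channel is dropped,
    # mirroring the np.append call in get_reward above
    return np.append(frame.reshape(80, 80, 1), stack[:, :, :3], axis=2)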
Пример #27
0
def main_train(learning=True):
    final_score = 0
    previous_action = 1
    model = build_neural_network_model()
    game = FlappyBird(width=288, height=512, pipe_gap=100)
    env = PLE(game, fps=30, display_screen=True, state_preprocessor=process_state)
    env.init()
    passed = 0
    old_y=0
    for i in range(game_steps):
        if i % 10000 == 0:
            print("STEP {} / {}".format(i, game_steps))
        if i == game_steps - 1:
            print("Score: {}".format(final_score))
        if env.game_over():
            print("Final Score: {}".format(final_score))
            # time.sleep(5)
            final_score = 0
            env.reset_game()

        observation = env.getGameState()
        # print(
        #     "player y position {}\n"
        #     "players velocity {}\n"
        #     "next pipe distance to player {}\n"
        #     "next pipe top y position {}\n"
        #     "next pipe bottom y position {}\n"
        #     "next next pipe distance to player {}\n"
        #     "next next pipe top y position {}\n"
        #     "next next pipe bottom y position {}\n".format(observation[0]["player_y"], observation[0]['player_vel'],
        #                                                    observation[0]["next_pipe_dist_to_player"], observation[0]['next_pipe_top_y'],
        #                                                    observation[0]["next_pipe_bottom_y"], observation[0]['next_next_pipe_dist_to_player'],
        #                                                    observation[0]["next_next_pipe_top_y"], observation[0]["next_next_pipe_bottom_y"])
        # )

        current_state = observation[0]

        if str(current_state) not in q_dictionary:
            q_dictionary[str(current_state)] = dict()
        if 0 not in q_dictionary[str(current_state)]:
            q_dictionary[str(current_state)][0] = 0
        if 1 not in q_dictionary[str(current_state)]:
            q_dictionary[str(current_state)][1] = 0

        for action in [0, 1]:
            returned_object = generate_next_state(previous_action, current_state, action, passed, old_y)
            if returned_object[0] == 0:
                raise NameError("Error. {}".format(returned_object[1]))
            else:
                next_state = returned_object[1]
                reward = returned_object[2]
                if str(next_state) not in q_dictionary:
                    q_dictionary[str(next_state)] = dict()
                if 0 not in q_dictionary[str(next_state)]:
                    q_dictionary[str(next_state)][0] = 0
                if 1 not in q_dictionary[str(next_state)]:
                    q_dictionary[str(next_state)][1] = 0

                q_dictionary[str(current_state)][action] += LEARNING_RATE * (reward + DISCOUNT_FACTOR *
                                                                        max(q_dictionary[str(next_state)][0],
                                                                            q_dictionary[str(next_state)][1]) -
                                                                        q_dictionary[str(current_state)][action])

        action_to_take = 0
        if (q_dictionary[str(current_state)][1] > q_dictionary[str(current_state)][0]):
            action_to_take = 1


        # vector = model.predict([np.matrix(list(current_state.values()))])
        # action_to_take = np.argmax(vector[0])
        # print(vector[0][0], vector[0][1], action_to_take)
        # q_dictionary[str(current_state)][0] = vector[0][0]
        # q_dictionary[str(current_state)][1] = vector[0][1]


        
        returned_object = generate_next_state(previous_action, current_state, 0, passed, old_y)
        if returned_object[0] == 0:
            raise NameError("Error. {}".format(returned_object[1]))
        else:
            reward_to_take = returned_object[2]
            next_state = returned_object[1]

        vector = model.predict(np.matrix(list(next_state.values())))
        target_to_learn = list()
        target_to_learn.append(reward_to_take + DISCOUNT_FACTOR * vector[0][0])

        returned_object = generate_next_state(previous_action, current_state, 1, passed, old_y)
        if returned_object[0] == 0:
            raise NameError("Error. {}".format(returned_object[1]))
        else:
            reward_to_take = returned_object[2]
            next_state = returned_object[1]
        vector = model.predict(np.matrix(list(next_state.values())))
        target_to_learn.append(reward_to_take + DISCOUNT_FACTOR * vector[0][1])



        # model.fit(np.matrix(list(current_state.values())), np.matrix(target_to_learn))
        
        """
        """

        #
        # returned_object = generate_next_state(previous_action, current_state, action_to_take, passed, old_y)
        # if returned_object[0] == 0:
        #     raise NameError("Error. {}".format(returned_object[1]))
        # else:
        #     reward_to_take = returned_object[2]
        #     next_state = returned_object[1]
        #
        # target_to_learn = [0, 0]
        # vector = model.predict(np.matrix(list(next_state.values())))
        # value_to_learn = (reward_to_take + DISCOUNT_FACTOR * vector[0][action_to_take])
        # if action_to_take == 0:
        #     target_to_learn[action_to_take] = value_to_learn
        #     target_to_learn[1] = q_dictionary[str(current_state)][1]
        # else:
        #     target_to_learn[action_to_take] = value_to_learn
        #     target_to_learn[0] = q_dictionary[str(current_state)][0]

        # target_to_learn = [q_dictionary[str(current_state)][0], q_dictionary[str(current_state)][1]]
        # time.sleep(0.04)
        model.fit(np.matrix(list(current_state.values())), np.matrix(target_to_learn))

        if observation[0]['next_pipe_dist_to_player'] - 4 < 0:
            passed = 4
            old_y = observation[0]['next_pipe_top_y']

        # action = agent.pickAction(reward, observation)
        #nn = random.randint(0, 1)
        # compute_reward(observation[0])
        # nn = int(input("Insert action 0 sau 1"))
        # reward = env.act(env.getActionSet()[nn])
        env_reward = env.act(env.getActionSet()[action_to_take])
        if env_reward == 1:
            final_score += 1
        # if env_reward == 1:
        #     action_to_take = 1
        #     env.act(env.getActionSet()[action_to_take])
        #     env.act(env.getActionSet()[action_to_take])
        previous_action = action_to_take
        if passed != 0:
            passed -= 1
    print("Saving the model")
    model.save("model.h5", overwrite=True)
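# A hedged, condensed sketch of the Q-target construction used with the Keras
# model above: for each candidate action the simulated next state is evaluated
# and the target is reward + DISCOUNT_FACTOR * Q(next_state)[action]; the model
# is then fit on the two-element target vector. The helper name and the 0.95
# default are assumptions, only the overall pattern comes from main_train.
import numpy as np

def q_targets(model, next_states, rewards, discount=0.95):
    # next_states[a], rewards[a]: simulated outcome of taking action a (0 or 1)
    targets = []
    for action in (0, 1):
        q_next = model.predict(np.matrix(list(next_states[action].values())))
        targets.append(rewards[action] + discount * q_next[0][action])
    return np.matrix(targets)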
	"""
	def __init__(self, actions):
		self.actions = actions

	def pickAction(self, reward, obs):
		return self.actions[np.random.randint(0, len(self.actions))]

###################################
game = Doom(scenario="take_cover")

env = PLE(game)
agent = NaiveAgent(env.getActionSet())
env.init()

reward = 0.0
for f in range(15000):
    # if the game is over
    if env.game_over():
        env.reset_game()

    action = agent.pickAction(reward, env.getScreenRGB())
    reward = env.act(action)

    if f > 2000:
        env.display_screen = True
        env.force_fps = False

    if f > 2250:
        env.display_screen = True
        env.force_fps = True
Пример #29
0
    def train(self, scratch, game, display):
        p = PLE(game,
                fps=30,
                frame_skip=1,
                num_steps=1,
                force_fps=True,
                display_screen=display)
        fname = None
        if not scratch:
            fname = self.load()
        else:
            delete_files(self.DATA_DIREC)
        f0, step, nb_save, nb_games = init_train(fname, self.DATA_DIREC)

        eps_tau = (self.NB_FRAMES - f0) // self.EPS_RATE
        scores = []
        while step < self.NB_FRAMES:
            if len(scores) == self.SCORE_FREQ:
                print_scores(scores, self.SCORE_FREQ)
                scores = []

            p.reset_game()
            self.game.getGameState()
            screen = self.process_screen(p.getScreenRGB())
            last_screens_buff = deque([screen] * 4, maxlen=NB_LAST_SCREENS)
            last_screens = np.stack(last_screens_buff, axis=-1)

            # gscore = 0
            nb_games += 1
            score = 0
            while not p.game_over():
                step += 1
                if step != 0 and (step % self.SAVE_FREQ) == 0:
                    self.save(
                        chr(97 + nb_save) + '_' + str(step) + '_' +
                        str(nb_games))
                    nb_save += 1
                if step != 0 and (step % self.EPS_UPDATE_FREQ) == 0:
                    self.epsilon = update_epsilon(step, f0, self.EPS0, eps_tau,
                                                  self.NB_FRAMES)
                    # print('WEIGHTS ABS MEAN')
                    # print(abs(np.mean(self.model.get_weights()[0], axis=1)))

                # 1) In s, choose a (GLIE actor)
                qvals = self.get_qvals(last_screens)
                act = self.greedy_action(qvals, self.epsilon)

                # 2) Observe r, s′
                bare_reward = p.act(ACTIONS[act])
                if bare_reward > 0:
                    score += 1
                reward = self.reward_engineering(bare_reward)
                screen_new = self.process_screen(p.getScreenRGB())

                # update replay_memory
                self.replay_memory.append(screen, act, screen_new, reward)
                if len(self.replay_memory.buff) > self.MIN_REPLAY_MEMORY_SIZE:
                    # build minibatch
                    ls, actions, ls_new, r, terms = self.replay_memory.minibatch(
                    )
                    qvals_new = self.model_target.predict(ls_new)
                    qvals_new_max = qvals_new.max(1).reshape(
                        (self.BATCH_SIZE, 1))
                    delta = r + (1 - terms) * self.GAMMA * qvals_new_max
                    qvals = self.model.predict(ls)
                    qvals[np.arange(self.BATCH_SIZE),
                          actions.ravel()] = delta.ravel()
                    self.model.train_on_batch(x=ls, y=qvals)

                    if step % self.TARGET_FREQ == 0:
                        self.model.save(filepath=self.DATA_DIREC + 'target.h5')
                        self.model_target = load_model(
                            filepath=self.DATA_DIREC + 'target.h5')

                last_screens_buff.append(screen_new)
                last_screens = np.stack(last_screens_buff, axis=-1)
                screen = screen_new
            scores.append(score)
Пример #30
0
                for i in state["creep_pos"]["BAD"]:
                    if math.sqrt((i[0] - x)**2 +
                                 (i[1] - y)**2) <= game.AGENT_RADIUS:
                        color = bad_hit_color
                pygame.draw.circle(screen, color, (int(x), int(y)),
                                   4)  # Here <<<

        pygame.display.update()
        next_state = p.getGameState()
        state = next_state
    p.display_screen = False
    p.reset_game()


# running the game
p.act(-1)
# workaround: without this call the first reward comes back as 1.99 (cause unknown)
state = p.getGameState()
reward = 0.0
oldest_state = 0
# play_game(1000)
for episode in range(nr_episodes):
    p.reset_game()
    if episode % 5 == 0 and episode != 0:
        network.save_model()
        play_game(100)
    if (episode == nr_episodes / 2):
        play_game(10)
    for step in range(nr_steps_per_episode):
        epsilon = get_next_epsilon(epsilon)
        if p.game_over():
Пример #31
0
    def train(self, scratch, game, display):
        p = PLE(game,
                fps=30,
                frame_skip=1,
                num_steps=1,
                force_fps=True,
                display_screen=display)
        t1 = time.time()
        fname = None
        if not scratch:
            fname = self.load()
        else:
            delete_files(self.DATA_DIREC)
        f0, step, nb_save, nb_games = init_train(fname, self.DATA_DIREC)

        eps_tau = (self.NB_FRAMES - f0) // 8

        scores = []
        while step < self.NB_FRAMES:
            if len(scores) == self.SCORE_FREQ:
                print('States visited:', len(self.Q))
                print_scores(scores, self.SCORE_FREQ)
                scores = []
            p.reset_game()
            state = game.getGameState()
            state_tp = self.discretize(state)
            if state_tp not in self.Q:
                self.Q[state_tp] = [0, 0]

            act = 1
            episode = deque([], self.SIZE_FIFO)
            elig = {}
            gscore = 0
            nb_games += 1
            while not p.game_over():
                step += 1
                if step != 0 and (step % self.SAVE_FREQ) == 0:
                    self.save('Q_' + chr(97 + nb_save) + '_' + str(step) +
                              '_' + str(nb_games) + '.p')
                    nb_save += 1
                if step != 0 and (step % self.EPS_UPDATE_FREQ) == 0:
                    self.epsilon = update_epsilon(step, f0, self.EPS0, eps_tau,
                                                  self.NB_FRAMES)
                # 1) Observe r, s′
                bare_reward = p.act(ACTIONS[act])
                reward = self.reward_engineering(bare_reward)
                new_state = game.getGameState()
                new_state_tp = self.discretize(new_state)

                # 2) Choose a′ (GLIE actor) using Q
                if new_state_tp not in self.Q:
                    self.Q[new_state_tp] = [0, 0]
                qvals = self.get_qvals(new_state)
                new_act = self.greedy_action(qvals, self.epsilon)

                # 3) Temporal difference:  δ=r+γQ(s′,a′)−Q(s,a)
                delta = reward + self.GAMMA * self.Q[new_state_tp][
                    new_act] - self.Q[state_tp][act]

                # 4) Update Q
                episode.append((state_tp, act))
                elig[(state_tp, act)] = 1
                for (state_tp_ep, act_ep) in episode:
                    self.Q[state_tp_ep][act_ep] += (
                        self.ALPHA * delta * elig[(state_tp_ep, act_ep)])
                    elig[(state_tp_ep, act_ep)] *= self.LAMBDA

                # 5) s<-s', a<-a'
                state = new_state
                state_tp = new_state_tp
                act = new_act

                if bare_reward > 0:
                    gscore += 1

            scores.append(gscore)

        t2 = time.time()
        # Unicode code point of a: 97
        self.save('Q_' + chr(97 + nb_save) + '_' + str(step) + '_' +
                  str(nb_games) + '.p')
        print()
        print('Number of played games:', nb_games)
        print('Training completed in', (t2 - t1) / 60, 'minutes')
        print()
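# A hedged, self-contained sketch of the SARSA(lambda) backup performed in the
# loop above, using plain dicts; the alpha, gamma and lambda values are
# illustrative assumptions (the snippet decays traces by LAMBDA alone, which is
# mirrored here).
def sarsa_lambda_backup(Q, elig, episode, s, a, r, s_next, a_next,
                        alpha=0.1, gamma=0.99, lam=0.9):
    # temporal-difference error: delta = r + gamma * Q(s', a') - Q(s, a)
    delta = r + gamma * Q[s_next][a_next] - Q[s][a]
    episode.append((s, a))
    elig[(s, a)] = 1.0                       # replacing trace for the visited pair
    for (s_ep, a_ep) in episode:
        Q[s_ep][a_ep] += alpha * delta * elig[(s_ep, a_ep)]
        elig[(s_ep, a_ep)] *= lam            # decay every stored trace
    return delta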
Пример #32
0
import random

from pygame.constants import K_a, K_b, K_s, K_w
from ple import PLE

NO_OP = 296


def pickAction(actions):
    return random.choice(actions)


game = Pong_2Player()
p = PLE(game, fps=30, display_screen=True, force_fps=False)
p.init()

actions_1 = [K_w, K_s, NO_OP]
actions_2 = [K_a, K_b, NO_OP]

nb_frames = 1000
reward_1 = 0.0
reward_2 = 0.0

for f in range(nb_frames):
    if p.game_over():  #check if the game is over
        p.reset_game()

    obs = p.getScreenRGB()
    action_1 = pickAction(actions_1)
    reward_1 = p.act(action_1)

    action_2 = pickAction(actions_2)
    reward_2 = p.act(action_2)
Пример #33
0
p = PLE(game,
        fps=30,
        frame_skip=1,
        num_steps=1,
        force_fps=False,
        display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()

    while (not p.game_over()):
        state = game.getGameState()
        screen = p.getScreenRGB()
        action = FlappyPolicy(state,
                              screen)  ### Your job is to define this function.

        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
print("Average : ", average_score)
print("Max score : ", max_score)
Пример #34
0
def train_bird(load_weights=False, train=True):
    game = PLE(FlappyBird(), fps=30, display_screen=True)
    FLAP = 119

    agent = Agent(load_weights, train)
    #     weights_filepath = 'weights/trained_weights.hdf5'
    #     if agent.load_weights:
    #         agent.network.load_weights(weights_filepath)

    games_played = 0
    total_score = 0

    # Training
    while games_played < agent.runs:
        total_score = 0
        # Play a total of 100 games before updating weights
        for i in range(100):
            score = 0
            game.init()
            while not game.game_over():
                # Greedy exploration
                old_state = agent.get_state(game)
                # print(old_state)
                if random.uniform(0, 1) < agent.epsilon:
                    final_action = to_categorical(randint(
                        0, 1), num_classes=2)  # [1,0] or [0,1]
                else:
                    prediction = agent.network.predict(
                        old_state.reshape((1, 5)))
                    # print(prediction)
                    final_action = to_categorical(np.argmax(prediction[0]),
                                                  num_classes=2)

                game.act(game.getActionSet()[np.argmax(final_action)])
                reward = 0
                if game.getActionSet()[np.argmax(final_action)] == FLAP:
                    reward = agent.get_reward_after_flap(game)
                else:
                    reward = agent.get_reward(game)
                score += reward
                new_state = agent.get_state(game)

                if agent.train:
                    agent.remember(old_state, final_action, reward, new_state,
                                   game.game_over())

            #print()
            print(
                f'Score: {score}    Epsilon: {agent.epsilon}    Gamma: {agent.gamma}'
            )
            total_score += score

        if agent.train:
            agent.replay_new(agent.memory, agent.batch_size)

        games_played += 1
        print(f'GAME {games_played}    Score: {total_score}')

        # Adjust epsilon for greedy exploration
        if not agent.train:
            agent.epsilon = 0.0
            agent.gamma = 0.9
        else:
            if agent.epsilon > 0.05:
                agent.epsilon = 1 - (games_played * agent.epsilon_decay)
            if agent.gamma <= 0.9:
                agent.gamma = games_played * agent.gamma_decay
Пример #35
0
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()
    
    while(not p.game_over()):
        state = game.getGameState()
        screen = p.getScreenRGB()
        action=FlappyPolicy(state, screen) ### Your job is to define this function.
        
        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
Пример #36
0
steps = []
step = 0
plt.ion()
for epoch in range(epochs):
    p.reset_game()

    for it in range(1000):
        if p.game_over():
            p.reset_game()
            print("Score: " + str(p.score()))

        current_state = game.getGameState()
        processed_current_state = process_state(current_state)

        action = agent.act(processed_current_state)
        reward = p.act(actions[action])
        rewards.append(reward)

        next_state = game.getGameState()
        game_over = p.game_over()

        processed_next_state = process_state(next_state)

        agent.remember(processed_current_state, action, reward,
                       processed_next_state, game_over)
        if len(agent.memory) > 25:
            agent.replay(25)
    steps.append(epoch)
    epsilons.append(agent.epsilon)
    avg_rewards.append(np.average(rewards))
    plt.plot(steps, avg_rewards, 'r')
Пример #37
0
class MyEnv(Environment):
    VALIDATION_MODE = 0

    def __init__(self, rng, game=None, frame_skip=4, 
            ple_options={"display_screen": True, "force_fps":True, "fps":30}):

        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0

        self._frameSkip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng
       
        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((h, w), dtype=np.uint8)
        self._reducedScreen = np.empty((48, 48), dtype=np.uint8)
        self._actions = self._ple.getActionSet()

                
    def reset(self, mode):
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1: # and thus mode == -1
            self._mode = -1

        self._ple.reset_game()
        for _ in range(self._random_state.randint(15)):
            self._ple.act(self._ple.NOOP)
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST)
        
        return [4 * [48 * [48 * [0]]]]
        
        
    def act(self, action):
        action = self._actions[action]
        
        reward = 0
        for _ in range(self._frameSkip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break
            
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST)
  
        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        if self.inTerminalState() == False:
            self._mode_episode_count += 1
        print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count))


    def inputDimensions(self):
        return [(4, 48, 48)]

    def observationType(self, subject):
        return np.uint8

    def nActions(self):
        return len(self._actions)

    def observe(self):
        return [np.array(self._reducedScreen)]

    def inTerminalState(self):
        return self._ple.game_over()
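# A hedged usage sketch for MyEnv above; the FlappyBird import, the rng seed
# and the option values are assumptions. Note that act() returns the sign of
# the summed reward, i.e. the usual DQN reward-clipping trick.
if __name__ == '__main__':
    from ple.games.flappybird import FlappyBird
    rng = np.random.RandomState(123456)
    env = MyEnv(rng, game=FlappyBird(),
                ple_options={"display_screen": False, "force_fps": True, "fps": 30})
    env.reset(MyEnv.VALIDATION_MODE)
    clipped_reward = 0.0
    while not env.inTerminalState():
        clipped_reward = env.act(0)   # index 0 of the PLE action set
    print('episode finished, last clipped reward:', clipped_reward)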
Пример #38
0
            "player_vel": self.playerVelY,
            "next_pipe_dist_to_player": pipes[0][1],
            "next_pipe_top_y": pipes[0][0][0]["y"] + pipeHeight,
            "next_pipe_bottom_y": pipes[0][0][1]["y"],
            "next_next_pipe_dist_to_player": pipes[1][1],
            "next_next_pipe_top_y": pipes[1][0][0]["y"] + pipeHeight,
            "next_next_pipe_bottom_y": pipes[1][0][1]["y"]
        }

        return state

    def getScore(self):
        return self.score


if __name__ == '__main__':
    from ple import PLE
    pygame.init()
    game = FlappyClone(black=False)
    env = PLE(game, display_screen=True, force_fps=False, fps=30)
    env.init()

    while True:

        if env.game_over():
            print("Dead")
            env.reset_game()
        # print(game.getGameState())

        reward = env.act(None)