Example #1
File: DQN.py  Project: c00208743/Y4-FYP
    def get_feedback(self):
        # Get the difference in scores between this and the last
        # frame.
        score_change = cartpole.get_score() - self.last_score
        self.last_score = cartpole.get_score()
        #print(cartpole.get_score())

        return float(score_change), score_change == -1
    def get_reward(self):

        # Get the difference in scores between this and the last
        # frame.
        score_change = cartpole.get_score() - self.last_score
        self.last_score = cartpole.get_score()

        return float(score_change)
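Both methods define the per-frame reward as the change in cartpole.get_score() between frames, and get_feedback() additionally returns a terminal flag that is true when the score drops by one. Below is a minimal usage sketch of how a caller might consume that (reward, terminal) pair; the agent and step_game names are illustrative assumptions, not part of DQN.py.

# Hypothetical usage sketch; `agent` and `step_game` are illustrative names,
# not part of DQN.py.
def run_episode(agent, step_game, max_frames=500):
    total_reward = 0.0
    for _ in range(max_frames):
        step_game()                               # advance the game by one frame
        reward, terminal = agent.get_feedback()   # reward = score delta
        total_reward += reward
        if terminal:                              # score dropped by 1: episode over
            break
    return total_reward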
Example #3
File: DQN.py  Project: c00208743/Y4-FYP
    def get_new_state(self):
        ##should only take state of game after an action

        ##check score
        self.reward = cartpole.get_score()
        ##check if game ended
        terminal = cartpole.get_end()

        ##check new state
        self.new_state = cartpole.get_state()

        self.reward = self.reward if not terminal else -self.reward
        self.new_state = np.reshape(self.new_state, [1, self.observation_space])

        self.remember(self.state, self.action_index, self.reward, self.new_state, terminal)

        self.state = self.new_state

        self.experience_replay()

        ##if game end record scores
        if terminal:
            self.scores.append(self.reward)

        return self.state
    def train_model(self):

        # This function trains the NN.

        # Tell us when we train for the first time.
        if not self.started_training:
            print('Begin training')
            print('The score is', cartpole.get_score())
            self.started_training = True

        # Sample a mini-batch of observations on which to train.
        mini_batch = random.sample(self.observations, self.mini_batch_size)

        # Take the mini-batch apart.
        previous_states = np.array([d[0] for d in mini_batch])
        actions = np.array([d[1] for d in mini_batch])
        rewards = np.array([d[2] for d in mini_batch])
        current_states = np.array([d[3] for d in mini_batch])

        # The variable which will hold the data against which we will train.
        agents_expected_reward = []

        # Run the forward pass on the current states, to get
        # Q(a_{t+1}, s_{t+1}).
        agents_reward_per_action = self.q_model.predict(current_states)

        # Now build the training data.
        for i in range(self.mini_batch_size):
            agents_expected_reward.append(rewards[i] +
                                          self.future_reward_discount *
                                          np.max(agents_reward_per_action[i]))

        # Train the NN on the mini-batch.
        loss = self.applied_action_model.train_on_batch(
            [previous_states, actions], np.array(agents_expected_reward))
    def get_keys_pressed(self, reward):

        # This is the real workhorse of the code: the actual per-frame
        # work happens here.

        # Get the current state of the game.
        current_state = cartpole.get_state()

        # Append the latest observation to the collection of
        # observations.
        self.observations.append(
            [self.last_state, self.last_action, reward, current_state])

        # We can't keep every observation; if there are too many, drop one.
        if len(self.observations) > self.max_obs_length:

            # If rewarded observations are under-represented, drop a
            # non-rewarded point; otherwise drop the oldest observation.
            if self.rewards_frac() < 0.4:
                self.remove_bad_point()
            else:
                self.observations = self.observations[1:]

        # If we have collected enough observations, train.
        if (len(self.observations) > self.min_obs_steps):

            if cartpole.get_score() < 50:
                print "Initialization score is too low.  Initializing again."

                # remove 50 bad points
                for i in range(50):
                    self.remove_bad_point()
            else:
                self.train_model()

        # Reset the last state, and get the next action.
        self.last_state = current_state
        self.last_action, action_index = self.choose_next_action()

        # If we are out of the randomness-only regime, reduce the
        # current probability for a random move.
        if ((self.random_action_prob > self.final_random_prob)
                and (len(self.observations) > self.min_obs_steps)):
            self.random_action_prob -= (
                (self.initial_random_prob - self.final_random_prob) /
                self.explore_steps)

        # Set the move to take, based on the action.
        if action_index == 0:
            action = [K_LEFT]
        elif action_index == 1:
            action = []
        else:
            action = [K_RIGHT]

        return action
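get_new_state() above calls remember() and experience_replay(), which do not appear in this listing. The sketch below shows one common Keras-style shape for those helpers, written as methods of the same class and reusing the q_model, mini_batch_size and future_reward_discount attributes seen in train_model(); the self.memory buffer and the exact update details are assumptions, not the project's actual code.

import random

import numpy as np

# Hedged sketch of the replay helpers referenced above; the buffer name and
# training details are assumptions.
def remember(self, state, action_index, reward, new_state, terminal):
    # Store one transition; self.memory is assumed to be a bounded buffer
    # (e.g. a deque with maxlen) created in __init__.
    self.memory.append((state, action_index, reward, new_state, terminal))

def experience_replay(self):
    if len(self.memory) < self.mini_batch_size:
        return
    batch = random.sample(self.memory, self.mini_batch_size)
    for state, action_index, reward, new_state, terminal in batch:
        # Q-learning target: r + gamma * max_a Q(s', a), or just r at the end.
        target = reward
        if not terminal:
            target += self.future_reward_discount * np.max(
                self.q_model.predict(new_state)[0])
        q_values = self.q_model.predict(state)
        q_values[0][action_index] = target
        self.q_model.fit(state, q_values, verbose=0)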
Example #6
File: DQN.py  Project: c00208743/Y4-FYP
    def get_state(self):
        ## should only capture the state when the game has just started
        self.score = cartpole.get_score()
        if self.score in (0, 1):
            # Beginning of game: take the initial state and reshape it to
            # the network's expected input shape.
            self.state = cartpole.get_state()
            self.state = np.reshape(self.state, [1, self.observation_space])

        return 0
Example #7
    def q_learn(self):
        #get reward
        self.reward = cartpole.get_score()
        #if game ends and reward < 200 then reward = -300
        #if (cartpole.get_end() == True and self.reward < 100):
        #self.reward = -300
        #print(self.reward)

        a1, max_q_s1a1 = self.max_dict(self.Q[self.new_state])

        self.Q[self.state][self.action_index] += self.alpha * (
            self.reward + self.gamma * max_q_s1a1 -
            self.Q[self.state][self.action_index])
        return self.new_state
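q_learn() implements the tabular update Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)), and it depends on a max_dict() helper that is not shown in this listing. A plausible sketch of that helper, assuming Q[state] is a dict mapping action indices to values (the body below is an assumption, not the project's code):

def max_dict(self, d):
    # Return (argmax key, max value) of a dict of action values,
    # i.e. the greedy action and max_a Q(s', a) used in the update above.
    best_key = max(d, key=d.get)
    return best_key, d[best_key]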
Example #8
    def get_observation(self):
        self.observation = cartpole.get_state()

        ## remember the environment and the action chosen
        if self.training:
            if len(self.prev_obseration) > 0:
                self.game_memory.append(
                    [self.prev_obseration, self.action_index])
            #print(self.game_memory)
            self.prev_obseration = self.observation
        else:
            self.prev_obseration = self.observation
            self.game_memory.append([self.observation, self.action_index])

        self.reward = cartpole.get_score()
        self.score += self.reward

        return self.score
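In training mode, get_observation() accumulates [previous observation, action_index] pairs in self.game_memory. One way such a memory is commonly turned into supervised training arrays is sketched below; the function name, the one-hot encoding and the three-action assumption (matching K_LEFT / no-op / K_RIGHT in Example #3) are illustrative, not taken from the project.

import numpy as np

def memory_to_training_data(game_memory, n_actions=3):
    # Convert [observation, action_index] pairs into feature and label
    # arrays, one-hot encoding the chosen action. Purely illustrative.
    X = np.array([obs for obs, _ in game_memory])
    y = np.zeros((len(game_memory), n_actions), dtype=np.float32)
    for row, (_, action_index) in enumerate(game_memory):
        y[row, action_index] = 1.0
    return X, y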
Example #9
    def get_new_state(self):
        ##observe the current state of the game
        self.new_observation = cartpole.get_state()
        ##check score
        self.reward = cartpole.get_score() - self.prev_score
        self.prev_score = cartpole.get_score()

        ##check if game ended
        self.done = cartpole.get_end()
        self.reward_sum += self.reward

        if self.training:
            # record reward (has to be done after we call step() to get the
            # reward for the previous action)
            self.drs.append(self.reward)

            if self.done:  # an episode finished
                # stack together all inputs, hidden states, action gradients, and rewards for this episode
                self.epx = np.vstack(self.xs)
                eph = np.vstack(self.hs)
                epdlogp = np.vstack(self.dlogps)
                epr = np.vstack(self.drs)

                # reset array memory
                self.xs, self.hs, self.dlogps, self.drs = [], [], [], []

                # compute the discounted reward backwards through time
                discounted_epr = self.discount_rewards(epr)
                # standardize the rewards to be unit normal (helps control the gradient estimator variance)
                discounted_epr = discounted_epr - np.mean(discounted_epr)
                discounted_epr /= np.std(discounted_epr)

                epdlogp *= discounted_epr  # modulate the gradient with advantage (PG magic happens right here.)

                grad = self.policy_backward(eph, epdlogp)
                for k in self.model:
                    self.grad_buffer[k] += grad[k]  # accumulate grad over batch

                # perform rmsprop parameter update every batch_size episodes
                if self.episode_number % self.batch_size == 0:
                    for k, v in self.model.items():
                        g = self.grad_buffer[k]  # gradient
                        self.rmsprop_cache[k] = (
                            self.decay_rate * self.rmsprop_cache[k] +
                            (1 - self.decay_rate) * g ** 2)
                        # gradient-ascent step scaled by the RMSProp cache;
                        # update the weights in place rather than overwriting them
                        self.model[k] += self.alpha * g / (
                            np.sqrt(self.rmsprop_cache[k]) + 1e-5)
                        self.grad_buffer[k] = np.zeros_like(v)  # reset batch gradient buffer

                self.reward_sum = 0
                self.episode_number += 1
                self.prev_score = 0
        else:
            if self.done or self.reward_sum >= 200:
                self.play_scores.append(self.reward_sum)
                self.reward_sum = 0
                self.prev_score = 0

        return self.observation
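The episode-end branch above relies on discount_rewards() and policy_backward(), neither of which appears in this listing. A minimal sketch of the discounted-return computation described by the "compute the discounted reward backwards through time" comment, written as a method of the same class and assuming a self.gamma discount factor (a reconstruction, not the project's exact code):

import numpy as np

def discount_rewards(self, r):
    # Walk backwards through the reward column vector, accumulating
    # G_t = r_t + gamma * G_{t+1}, so later rewards are discounted.
    discounted = np.zeros_like(r, dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(r))):
        running_add = running_add * self.gamma + r[t]
        discounted[t] = running_add
    return discounted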