Example #1
File: DQN.py Project: c00208743/Y4-FYP
    def get_state(self):
        ## should only take the state if the game has just started
        self.score = cartpole.get_score()
        if self.score in (0, 1):
            # print("Beginning of game")
            self.state = cartpole.get_state()
            self.state = np.reshape(self.state, [1, self.observation_space])

        return 0
Example #2
File: DQN.py Project: c00208743/Y4-FYP
    def get_new_state(self):
        ## should only take the state of the game after an action

        ##check score
        self.reward = cartpole.get_score()
        ##check if game ended
        terminal = cartpole.get_end()

        ##check new state
        self.new_state = cartpole.get_state()

        self.reward = self.reward if not terminal else -self.reward
        self.new_state = np.reshape(self.new_state, [1, self.observation_space])

        self.remember(self.state, self.action_index, self.reward, self.new_state, terminal)

        self.state = self.new_state

        self.experience_replay()

        ## if the game ended, record the score
        if terminal:
            self.scores.append(self.reward)

        return self.state
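Example #2 relies on self.remember(...) and self.experience_replay(), which are defined elsewhere in DQN.py and not shown above. A minimal sketch of what they might look like, assuming a standard DQN replay buffer: self.memory (a collections.deque), self.batch_size, self.gamma, and a Keras-style self.model with predict/fit are assumed attribute names, and random / numpy as np are assumed module-level imports.

    def remember(self, state, action, reward, next_state, done):
        # store one transition in the replay buffer (self.memory is
        # assumed to be a collections.deque created in __init__)
        self.memory.append((state, action, reward, next_state, done))

    def experience_replay(self):
        # wait until enough transitions have been collected
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in batch:
            # bootstrap the Q-target from the next state unless terminal
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(
                    self.model.predict(next_state)[0])
            q_values = self.model.predict(state)
            q_values[0][action] = target
            self.model.fit(state, q_values, verbose=0)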
Example #3
    def get_new_state(self):
        # observe the env after a decision has been made
        self.new_observation = cartpole.get_state()
        self.new_state = self.get_state_as_string(
            self.assign_bins(self.new_observation, bins))
        #print(self.new_state)

        return self.new_state
Example #4
    def get_state(self):

        self.observation = cartpole.get_state()
        self.state = self.get_state_as_string(
            self.assign_bins(
                self.observation,
                bins))  # set state to string to use as key in dict

        return self.state
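Examples #3 and #4 (and Example #9 below) discretise the raw observation with self.assign_bins and self.get_state_as_string, neither of which appears above. A plausible sketch in the style of the common tabular CartPole tutorials, assuming bins is a module-level 2D array of bin edges (for instance one row of np.linspace values per observation dimension):

    def assign_bins(self, observation, bins):
        # map each of the four CartPole observation values to a bin index
        state = np.zeros(4, dtype=int)
        for i in range(4):
            state[i] = np.digitize(observation[i], bins[i])
        return state

    def get_state_as_string(self, state):
        # join the bin indices into a fixed-width string usable as a dict key
        return ''.join(str(int(digit)).zfill(2) for digit in state)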
Example #5
    def get_keys_pressed(self, reward):

        # This is the real work horse of the code.  Here is where the
        # actual work gets done.

        # Get the current state of the game.
        current_state = cartpole.get_state()

        # Append the latest observation to the collection of
        # observations.
        self.observations.append(
            [self.last_state, self.last_action, reward, current_state])

        # We can't keep all observations.  If there are too many then
        # pop off the oldest.
        if (len(self.observations) > self.max_obs_length):

            # if rewarded observations are scarce, drop a non-rewarded
            # point; otherwise drop the oldest observation
            if self.rewards_frac() < 0.4:
                self.remove_bad_point()
            else:
                self.observations = self.observations[1:]

        # If we have collected enough observations, train.
        if (len(self.observations) > self.min_obs_steps):

            if cartpole.get_score() < 50:
                print "Initialization score is too low.  Initializing again."

                # remove 50 bad points
                for i in range(50):
                    self.remove_bad_point()
            else:
                self.train_model()

        # Reset the last state, and get the next action.
        self.last_state = current_state
        self.last_action, action_index = self.choose_next_action()

        # If we are out of the randomness-only regime, reduce the
        # current probability for a random move.
        if ((self.random_action_prob > self.final_random_prob)
                and (len(self.observations) > self.min_obs_steps)):
            self.random_action_prob -= (
                (self.initial_random_prob - self.final_random_prob) /
                self.explore_steps)

        # Set the move to take, based on the action.
        if action_index == 0:
            action = [K_LEFT]
        elif action_index == 1:
            action = []
        else:
            action = [K_RIGHT]

        return action
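Examples #5 and #6 call self.choose_next_action(), which is not reproduced here. A minimal epsilon-greedy sketch, assuming the num_actions and random_action_prob attributes from Example #7 and a Keras-style self.model; the one-hot return value mirrors self.last_action in Example #7, and random is an assumed module-level import.

    def choose_next_action(self):
        # one-hot action vector, in the same format as self.last_action
        new_action = np.zeros(self.num_actions)
        if random.random() <= self.random_action_prob:
            # explore: pick a uniformly random action
            action_index = random.randrange(self.num_actions)
        else:
            # exploit: pick the action with the highest predicted Q-value
            q_values = self.model.predict(
                np.reshape(self.last_state, [1, len(self.last_state)]))
            action_index = int(np.argmax(q_values[0]))
        new_action[action_index] = 1.0
        return new_action, action_index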
Example #6
    def get_keys_pressed(self, reward):

        # Here is where the actual work gets done.

        # Get the current state of the game.
        current_state = cartpole.get_state()

        # Append the latest observation (previous state, previous action,
        # reward, current state) to the collection of observations.
        self.observations.append(
            [self.last_state, self.last_action, reward, current_state])

        # Reset the last state, and get the next action.
        self.last_state = current_state
        self.last_action, action_index = self.choose_next_action()

        # Set the move to take, based on the action.
        if action_index == 0:
            action = [K_LEFT]
        elif action_index == 1:
            action = [K_RIGHT]
        else:
            # fall back to no key press so `action` is always defined
            action = []

        return action
Example #7
    def __init__(self):
        """
        Plays CartPole by implementing a NN Q-learning strategy.
        """

        # The future discount rate.
        self.future_reward_discount = 1.0

        # The number of possible actions (left, right, no move)
        self.num_actions = 3

        # The probabilities of using a random move, instead of one
        # from the NN.
        self.initial_random_prob = 1.0
        self.final_random_prob = 0.05
        self.random_action_prob = self.initial_random_prob

        # Variables for holding information about the previous
        # timestep.
        self.last_score = 0
        self.last_state = cartpole.get_state()
        self.last_action = np.array([1.0, 0.0, 0.0])

        # Variables for dealing with pressed keys.
        self.keys_pressed = []
        self.last_keys_pressed = []

        # Build the neural network.
        self.build_model()

        # Size of the observations collection.
        self.max_obs_length = 6000
        self.observations = []

        # Number of observations to gather before starting training of
        # the NN.
        self.min_obs_steps = 3000

        # Number of observations over which to decrease the
        # probability of using a random move, rather than a move from
        # the NN.
        self.explore_steps = 5000

        # The mini-batch size.
        self.mini_batch_size = self.max_obs_length // 8

        # Have we started training the NN yet?
        self.started_training = False
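The constructor above calls self.build_model(), which is not shown. One way such a network might look, written here with Keras for brevity (the original project may well use raw TensorFlow); the layer sizes are illustrative and the Sequential/Dense imports are assumed.

    def build_model(self):
        # small fully connected network mapping the 4-value CartPole
        # observation to one Q-value per action
        # (assumes: from tensorflow.keras.models import Sequential
        #           from tensorflow.keras.layers import Dense)
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(4,), activation='relu'))
        self.model.add(Dense(24, activation='relu'))
        self.model.add(Dense(self.num_actions, activation='linear'))
        self.model.compile(loss='mse', optimizer='adam')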
Example #8
    def get_observation(self):
        self.observation = cartpole.get_state()

        ## remember the environment and the action chosen
        if self.training:
            if len(self.prev_obseration) > 0:
                self.game_memory.append(
                    [self.prev_obseration, self.action_index])
            #print(self.game_memory)
            self.prev_obseration = self.observation
        else:
            self.prev_obseration = self.observation
            self.game_memory.append([self.observation, self.action_index])

        self.reward = cartpole.get_score()
        self.score += self.reward

        return self.score
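Example #8 only gathers (previous observation, action index) pairs into self.game_memory. In tutorials that use this pattern, the memory from a sufficiently high-scoring game is later converted into supervised training data for the policy network; a hedged sketch of that conversion, assuming two possible actions, a Keras-style self.model, and the hypothetical helper name train_from_memory:

    def train_from_memory(self):
        # hypothetical helper: convert recorded (observation, action_index)
        # pairs into one-hot targets and fit the policy network on them
        states = np.array([obs for obs, _ in self.game_memory])
        targets = np.zeros((len(self.game_memory), 2))
        for row, (_, action_index) in enumerate(self.game_memory):
            targets[row][action_index] = 1.0
        self.model.fit(states, targets, epochs=1, verbose=0)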
Example #9
    def get_keys_pressed(self, reward):

        # Here is where the actual work gets done.

        self.current_state = cartpole.get_state()
        self.state = self.get_state_as_string(self.assign_bins(self.current_state, bins))

        # Get the next action.
        action_index = self.choose_next_action()

        self.last_state = self.current_state

        # Set the move to take, based on the action.
        if action_index == 0:
            action = [K_LEFT]
        elif action_index == 1:
            action = [K_RIGHT]
        else:
            # fall back to no key press so `action` is always defined
            action = []
        print(action)
        return action
Example #10
    def get_new_state(self):
        ## observe the new state of the game after the last action
        self.new_observation = cartpole.get_state()
        ##check score
        self.reward = cartpole.get_score() - self.prev_score
        self.prev_score = cartpole.get_score()

        ##check if game ended
        self.done = cartpole.get_end()
        self.reward_sum += self.reward

        if self.training:
            self.drs.append(
                self.reward
            )  # record reward (has to be done after we call step() to get reward for previous action)

            if self.done:  # an episode finished
                # stack together all inputs, hidden states, action gradients, and rewards for this episode
                self.epx = np.vstack(self.xs)
                eph = np.vstack(self.hs)
                epdlogp = np.vstack(self.dlogps)
                epr = np.vstack(self.drs)

                # reset array memory
                self.xs, self.hs, self.dlogps, self.drs = [], [], [], []

                # compute the discounted reward backwards through time
                discounted_epr = self.discount_rewards(epr)
                # standardize the rewards to be unit normal (helps control the gradient estimator variance)
                discounted_epr = discounted_epr - np.mean(discounted_epr)
                discounted_epr /= np.std(discounted_epr)

                epdlogp *= discounted_epr  # modulate the gradient with advantage (PG magic happens right here.)

                grad = self.policy_backward(eph, epdlogp)
                for k in self.model:
                    self.grad_buffer[k] += grad[
                        k]  # accumulate grad over batch

                # perform rmsprop parameter update every batch_size episodes
                if self.episode_number % self.batch_size == 0:
                    for k, v in self.model.items():
                        g = self.grad_buffer[k]  # gradient
                        self.rmsprop_cache[k] = (
                            self.decay_rate * self.rmsprop_cache[k]
                            + (1 - self.decay_rate) * g**2)
                        # gradient-ascent step on the policy parameters
                        self.model[k] += self.alpha * g / (
                            np.sqrt(self.rmsprop_cache[k]) + 1e-5)
                        self.grad_buffer[k] = np.zeros_like(v)  # reset batch gradient buffer

                self.reward_sum = 0
                self.episode_number += 1
                self.prev_score = 0
        else:
            if self.done or self.reward_sum >= 200:
                self.play_scores.append(self.reward_sum)
                self.reward_sum = 0
                self.prev_score = 0

        return self.observation
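Example #10 follows the classic episodic policy-gradient recipe and calls self.discount_rewards() and self.policy_backward(), which are not included above. The reward-discounting step is standard; a sketch assuming the discount factor is stored as self.gamma:

    def discount_rewards(self, r):
        # walk backwards through the episode, accumulating the
        # exponentially discounted return at each timestep
        discounted_r = np.zeros_like(r, dtype=float)
        running_add = 0.0
        for t in reversed(range(len(r))):
            running_add = running_add * self.gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r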
Example #11
    def get_state(self):
        ##observe the current state of the game
        self.observation = cartpole.get_state()

        return self.observation