def get_feedback(self):
    # Get the difference in scores between this and the last frame.
    score_change = cartpole.get_score() - self.last_score
    self.last_score = cartpole.get_score()
    # Return the score change and whether the score dropped by one.
    return float(score_change), score_change == -1
def get_reward(self):
    # Get the difference in scores between this and the last frame.
    score_change = cartpole.get_score() - self.last_score
    self.last_score = cartpole.get_score()
    return float(score_change)
def get_new_state(self):
    # Should only take the state of the game after an action.
    # Check the score.
    self.reward = cartpole.get_score()
    # Check whether the game ended.
    terminal = cartpole.get_end()
    # Check the new state.
    self.new_state = cartpole.get_state()
    # Penalize the terminal transition by negating the reward.
    self.reward = self.reward if not terminal else -self.reward
    self.new_state = np.reshape(self.new_state, [1, self.observation_space])
    self.remember(self.state, self.action_index, self.reward,
                  self.new_state, terminal)
    self.state = self.new_state
    self.experience_replay()
    # If the game ended, record the score.
    if terminal:
        self.scores.append(self.reward)
    return self.state
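# remember() and experience_replay(), called above, are not part of this
# listing. The sketch below shows one conventional way such helpers look for
# a DQN-style agent; the deque memory, GAMMA, BATCH_SIZE, and the Keras-style
# model with predict()/fit() are assumptions, not the original implementation.

import random
from collections import deque

import numpy as np

GAMMA = 0.95       # assumed discount factor
BATCH_SIZE = 20    # assumed replay batch size
memory = deque(maxlen=100000)


def remember(state, action, reward, next_state, done):
    # Store one transition for later replay.
    memory.append((state, action, reward, next_state, done))


def experience_replay(model):
    # Train on a random mini-batch of stored transitions.
    if len(memory) < BATCH_SIZE:
        return
    for state, action, reward, next_state, done in random.sample(memory,
                                                                  BATCH_SIZE):
        # Bellman target: the reward, plus the discounted best future value
        # when the transition was not terminal.
        target = reward
        if not done:
            target += GAMMA * np.amax(model.predict(next_state)[0])
        q_values = model.predict(state)
        q_values[0][action] = target
        model.fit(state, q_values, verbose=0)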
def train_model(self):
    # This function trains the NN.
    # Tell us when we train for the first time.
    if not self.started_training:
        print('Begin training')
        print('The score is', cartpole.get_score())
        self.started_training = True

    # Sample a mini-batch of observations on which to train.
    mini_batch = random.sample(self.observations, self.mini_batch_size)

    # Take the mini-batch apart.
    previous_states = np.array([d[0] for d in mini_batch])
    actions = np.array([d[1] for d in mini_batch])
    rewards = np.array([d[2] for d in mini_batch])
    current_states = np.array([d[3] for d in mini_batch])

    # The variable which will hold the data against which we will train.
    agents_expected_reward = []

    # Run the forward pass on the current states, to get
    # Q(s_{t+1}, a) for every action a.
    agents_reward_per_action = self.q_model.predict(current_states)

    # Now build the training targets:
    # r_t + future_reward_discount * max_a Q(s_{t+1}, a).
    for i in range(self.mini_batch_size):
        agents_expected_reward.append(
            rewards[i] + self.future_reward_discount *
            np.max(agents_reward_per_action[i]))

    # Train the NN on the mini-batch.
    loss = self.applied_action_model.train_on_batch(
        [previous_states, actions], np.array(agents_expected_reward))
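# q_model and applied_action_model are built outside this listing. The sketch
# below shows one way the pair can be wired so that the train_on_batch() call
# above works: q_model maps a state to one Q-value per action, and
# applied_action_model masks that output with a one-hot action vector so only
# the Q-value of the action actually taken is fitted to the Bellman target.
# The layer sizes, optimizer, loss, and the OBSERVATION_SPACE / NUM_ACTIONS
# values are assumptions, not the original network.

from keras.layers import Dense, Dot, Input
from keras.models import Model

OBSERVATION_SPACE = 4   # assumed cartpole state size
NUM_ACTIONS = 3         # left, do nothing, right (as used below)

state_input = Input(shape=(OBSERVATION_SPACE,))
hidden = Dense(24, activation='relu')(state_input)
q_values = Dense(NUM_ACTIONS, activation='linear')(hidden)
q_model = Model(state_input, q_values)

# One-hot mask of the action that was actually applied.
action_input = Input(shape=(NUM_ACTIONS,))
applied_q = Dot(axes=1)([q_values, action_input])
applied_action_model = Model([state_input, action_input], applied_q)
applied_action_model.compile(optimizer='adam', loss='mse')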
def get_keys_pressed(self, reward):
    # This is the real workhorse of the code. Here is where the
    # actual work gets done.

    # Get the current state of the game.
    current_state = cartpole.get_state()

    # Append the latest observation to the collection of observations.
    self.observations.append(
        [self.last_state, self.last_action, reward, current_state])

    # We can't keep all observations. If there are too many, pop off
    # the oldest.
    if len(self.observations) > self.max_obs_length:
        # If the fraction of rewarded observations is too low, drop a
        # non-rewarded point instead of the oldest one.
        if self.rewards_frac() < 0.4:
            self.remove_bad_point()
        else:
            self.observations = self.observations[1:]

    # If we have collected enough observations, train.
    if len(self.observations) > self.min_obs_steps:
        if cartpole.get_score() < 50:
            print("Initialization score is too low. Initializing again.")
            # Remove 50 bad points.
            for i in range(50):
                self.remove_bad_point()
        else:
            self.train_model()

    # Reset the last state, and get the next action.
    self.last_state = current_state
    self.last_action, action_index = self.choose_next_action()

    # If we are out of the randomness-only regime, reduce the
    # current probability of a random move.
    if (self.random_action_prob > self.final_random_prob and
            len(self.observations) > self.min_obs_steps):
        self.random_action_prob -= (
            (self.initial_random_prob - self.final_random_prob) /
            self.explore_steps)

    # Set the move to take, based on the action.
    if action_index == 0:
        action = [K_LEFT]
    elif action_index == 1:
        action = []
    else:
        action = [K_RIGHT]

    return action
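# choose_next_action() is referenced above but not listed. Below is a
# plausible epsilon-greedy sketch under the same conventions: it returns a
# one-hot action vector (what the observations store) together with the
# action index. It is shown as a standalone function taking the Q-network and
# the current exploration probability as arguments; the real method and its
# signature may differ.

import random

import numpy as np


def choose_next_action(q_model, state, num_actions, random_action_prob):
    # Either explore with a random action, or exploit the current Q-network.
    if random.random() <= random_action_prob:
        action_index = random.randrange(num_actions)
    else:
        q_values = q_model.predict(np.asarray(state).reshape(1, -1))[0]
        action_index = int(np.argmax(q_values))
    action_vector = np.zeros(num_actions)
    action_vector[action_index] = 1
    return action_vector, action_index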
def get_state(self):
    # Should only take the state if the game has just started.
    self.score = cartpole.get_score()
    if self.score in (0, 1):
        # Beginning of a game.
        self.state = cartpole.get_state()
        self.state = np.reshape(self.state, [1, self.observation_space])
    return 0
def q_learn(self):
    # Get the reward.
    self.reward = cartpole.get_score()
    # (An earlier, commented-out experiment set the reward to -300 when the
    # game ended before reaching the target score.)
    # Tabular Q-learning update:
    #   Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    a1, max_q_s1a1 = self.max_dict(self.Q[self.new_state])
    self.Q[self.state][self.action_index] += self.alpha * (
        self.reward + self.gamma * max_q_s1a1 -
        self.Q[self.state][self.action_index])
    return self.new_state
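# max_dict() picks the best action in the next state above but is not part of
# this listing. A minimal version, returning the key with the largest value
# together with that value, could look like this (shown as a standalone
# function; in the agent it is a method).

def max_dict(d):
    # Return (argmax key, max value) of a dict mapping action -> Q-value.
    best_key, best_value = None, float('-inf')
    for k, v in d.items():
        if v > best_value:
            best_key, best_value = k, v
    return best_key, best_value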
def get_observation(self):
    self.observation = cartpole.get_state()
    # Remember the environment state and the action chosen.
    if self.training:
        if len(self.prev_obseration) > 0:
            self.game_memory.append(
                [self.prev_obseration, self.action_index])
        self.prev_obseration = self.observation
    else:
        self.prev_obseration = self.observation
        self.game_memory.append([self.observation, self.action_index])
    self.reward = cartpole.get_score()
    self.score += self.reward
    return self.score
def get_new_state(self):
    # Observe the current state of the game.
    self.new_observation = cartpole.get_state()
    # Check the score.
    self.reward = cartpole.get_score() - self.prev_score
    self.prev_score = cartpole.get_score()
    # Check whether the game ended.
    self.done = cartpole.get_end()
    self.reward_sum += self.reward

    if self.training:
        # Record the reward (has to be done after stepping the game, so
        # that we get the reward for the previous action).
        self.drs.append(self.reward)

        if self.done:  # An episode finished.
            # Stack together all inputs, hidden states, action gradients,
            # and rewards for this episode.
            self.epx = np.vstack(self.xs)
            eph = np.vstack(self.hs)
            epdlogp = np.vstack(self.dlogps)
            epr = np.vstack(self.drs)
            # Reset array memory.
            self.xs, self.hs, self.dlogps, self.drs = [], [], [], []

            # Compute the discounted reward backwards through time.
            discounted_epr = self.discount_rewards(epr)
            # Standardize the rewards to be unit normal (helps control
            # the gradient estimator variance).
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)

            # Modulate the gradient with the advantage (the PG magic
            # happens right here).
            epdlogp *= discounted_epr
            grad = self.policy_backward(eph, epdlogp)
            for k in self.model:
                # Accumulate the gradient over the batch.
                self.grad_buffer[k] += grad[k]

            # Perform an RMSProp parameter update every batch_size
            # episodes.
            if self.episode_number % self.batch_size == 0:
                for k, v in self.model.items():
                    g = self.grad_buffer[k]  # gradient
                    self.rmsprop_cache[k] = (
                        self.decay_rate * self.rmsprop_cache[k] +
                        (1 - self.decay_rate) * g**2)
                    # Gradient-ascent step on the policy parameters.
                    self.model[k] += self.alpha * g / (
                        np.sqrt(self.rmsprop_cache[k]) + 1e-5)
                    # Reset the batch gradient buffer.
                    self.grad_buffer[k] = np.zeros_like(v)

            self.reward_sum = 0
            self.episode_number += 1
            self.prev_score = 0
    else:
        if self.done or self.reward_sum >= 200:
            self.play_scores.append(self.reward_sum)
            self.reward_sum = 0
            self.prev_score = 0

    return self.observation
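# discount_rewards(), called above, computes the discounted return for every
# step of the finished episode by working backwards through time. The version
# below is the standard policy-gradient formulation; the gamma default is an
# assumption, and in the agent it is a method rather than a free function.

import numpy as np


def discount_rewards(r, gamma=0.99):
    # r is a column vector of per-step rewards for one episode.
    discounted_r = np.zeros_like(r, dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(r))):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r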