def get_action_by_state(state, verbose=1):
    global k, q, act, rew, r, c, old_r, old_c, last_score, only_kohonen, start
    if k is None:
        k = Kohonen(100, 100, n_features, 10, 0.5)
        # k = Kohonen.from_file('kohonen1', 10, 0.5)
        # q = QLearning(50, 50, n_actions, 0.3, 0.3, 1)
        q = QLearning(50, 50, n_actions, 0, 0, 1)
        start = time.time()
        r, c = k.find_winner(state)
        act = q.get_action(r, c)
    else:
        if only_kohonen:
            parallel_kohonen_learning(state)
            # k.find_winner(state)
            act = random.randint(0, 3)
        else:
            rew = bbox.get_score() - rew
            old_r = r
            old_c = c
            r, c = k.find_winner(state)
            q.update_qvalue(old_r, old_c, act, rew, r, c)
            act = q.get_action(r, c)
    if verbose:
        # print(bbox.get_score(), bbox.get_score() - last_score, act)
        # last_score = bbox.get_score()
        if bbox.get_time() % 1000 == 0:
            print(bbox.get_time(), bbox.get_score(), time.time() - start)
            # k.save_to_file("k_iteration_normal_" + str(bbox.get_time()))
    return act
def test_bot(bot, level, make_features):
    env = BBox(level)
    while env.has_next:
        if env.get_time() % 10000 == 0:
            print str(env.get_time()) + "\t" + str(env.get_score())
        action = bot.get_action(make_features(env))
        env.do_action(action)
    env.finish()
    print env.get_score()
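# A minimal sketch of the BBox wrapper assumed by test_bot above; the class is
# not defined anywhere in this code, so the constructor and level handling are
# assumptions. The methods simply forward to the bbox interface module used
# throughout the rest of these snippets.
class BBox(object):
    def __init__(self, level):
        bbox.load_level(level, verbose=True)  # assumed: level is a file path
        self.has_next = True

    def get_time(self):
        return bbox.get_time()

    def get_score(self):
        return bbox.get_score()

    def get_state(self):
        return bbox.get_state()

    def do_action(self, action):
        self.has_next = bool(bbox.do_action(action))

    def finish(self):
        bbox.finish(verbose=1)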
def get_all_score_diffs(state=None, verbose=0):
    initial = bbox.get_score()
    checkpoint_id = bbox.create_checkpoint()
    all_scores = np.zeros(shape=n_a)
    for a in range(n_a):
        # roll each action forward 100 steps, then rewind to the checkpoint
        for _ in range(100):
            bbox.do_action(a)
        all_scores[a] = bbox.get_score() - initial
        bbox.load_from_checkpoint(checkpoint_id)
    return all_scores
def get_score(self):
    #fruit_row, fruit_col, basket = self.state[0]
    #if fruit_row == self.grid_size-1:
    #    if abs(fruit_col - basket) <= 1:
    #        self.won = True
    #        return 1
    #    else:
    #        return -1
    #else:
    #    return 0
    self.action_score = bbox.get_score() - self.last_score
    self.last_score = bbox.get_score()
    return self.action_score
    #-1 if self.action_score < 0 else (1 if self.action_score > 0 else 0)
    # min(1, max(0,self.action_score))
def run_bbox(rnet_model, train_data, train_level=True, verbose=True):
    """
    Run a single session of the black box training or test environments

    :param rnet_model: model with a get_action(state) method
    :param train_data: a DataSet object used to buffer each state
    :param train_level: boolean, run the training level if True
    :param verbose: boolean, display additional information if True
    :return: float, the final session score
    """
    has_next = 1
    prepare_bbox(train_level)
    train_data.clear_buffer()
    while has_next:
        step_count = bbox.get_time()
        train_data.update_buffer(bbox.get_state())
        state = train_data.get_buffer()
        action = rnet_model.get_action(state)
        has_next = bbox.do_action(action)
        if step_count % 5000 == 0 and verbose:
            print("time = %d, score = %f" % (step_count, bbox.get_score()))
    final_score = bbox.finish(verbose=1)
    return final_score
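# A minimal sketch of the prepare_bbox variant assumed by run_bbox above, which
# takes a train_level flag. The body follows the parameterless prepare_bbox
# defined later in this code; the test-level filename is an assumption, since
# only the train_level.data path appears in these snippets.
def prepare_bbox(train_level=True):
    global n_features, n_actions
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        level = ("../levels/train_level.data" if train_level
                 else "../levels/test_level.data")
        bbox.load_level(level, verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()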
def run_bbox(verbose=False):
    has_next = 1
    prepare_bbox()

    # vector of the current state features
    input_var = T.dvector('in_state')
    input_var = T.reshape(input_var, (1, n_features))
    # Load net into the agent object
    agent = prepare_agent(input_var)
    attempt = lasagne.layers.get_output(agent)
    # function to do all of the stuff above
    eval_fn = theano.function([input_var], attempt, on_unused_input='ignore')

    # time to check how long it takes to run
    start = time.time()
    error = 0  # this eval-only loop never trains; kept for the printout
    steps = 0
    while has_next:
        state = bbox.get_state()
        r_state = np.reshape(state, (1, n_features))
        attempt = eval_fn(r_state)
        action = np.argmax(attempt)
        steps += 1
        if steps % 10000 == 0:
            score = bbox.get_score()
            print("Steps: {}".format(steps))
            print("  training loss: {}".format(error / steps))
            print("  current score: {}".format(score))
        has_next = bbox.do_action(action)

    print("Time to run: {} seconds".format(time.time() - start))
    print("{} steps total".format(steps))
    np.savez('model.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
def run_bbox(verbose=False):
    ''' Runs the Blackbox challenge. '''
    has_next = True
    prepare_bbox()
    while has_next:
        ## Observe the current state variables
        state = bbox.get_state()
        state_tuple = get_state_tuple(state)

        ## Select the current action
        action = get_action(state_tuple, verbose=verbose, is_current=True)

        ## Get the current reward
        reward = bbox.get_score()
        print 'Reward = ' + str(reward)

        ## Retrieve the current Q-value
        current_q = q_function[state_tuple][action]
        print 'Current Q = ' + str(current_q)

        ## Observe the next state (assuming there always is one)
        has_next = bbox.do_action(action)
        next_state = bbox.get_state()
        next_state_tuple = get_state_tuple(next_state)

        ## Get the best action in the new state
        next_action = get_action(next_state_tuple, verbose=verbose, is_current=False)

        ## Get the new Q-value
        next_q = q_function[next_state_tuple][next_action]

        ## Update the Q-function
        q_function[state_tuple][action] = (1 - alpha) * current_q + alpha * (reward + gamma * next_q)
        print 'Updated Q = ' + str(q_function[state_tuple][action])

    bbox.finish(verbose=True)
def is_won(self):
    #fruit_row, fruit_col, basket = self.state[0]
    final_score = bbox.get_score()
    bbox.reset_level()
    # bbox.finish(verbose=1)
    self.last_score = 0
    self.action_count = 0
    return final_score > 0
    #fruit_row == self.grid_size-1 and abs(fruit_col - basket) <= 1
def calc_best_action_using_checkpoint():
    checkpoint_id = bbox.create_checkpoint()
    best_action = -1
    best_score = -1e9
    for action in range(n_actions):
        # roll each action forward 100 steps, then rewind to the checkpoint
        for _ in range(100):
            bbox.do_action(action)
        if bbox.get_score() > best_score:
            best_score = bbox.get_score()
            best_action = action
        bbox.load_from_checkpoint(checkpoint_id)
    return best_action
def update(self, action):
    self._actions_log.append(action[0])
    self._steps += 1
    self._prev_score = bbox.get_score()
    self._is_over = not bbox.do_action(action[0])
    self._state = bbox.get_state().reshape(self._state_shape)
    #print "\nupdate", self._prev_score, action, bbox.get_score(), self._is_over
    return self.state, self.reward(), self.is_over
def get_all_scores(state, verbose=0):
    checkpoint_id = bbox.create_checkpoint()
    all_scores = np.zeros(n_actions)
    for a in range(n_actions):
        bbox.do_action(a)
        all_scores[a] = bbox.get_score()
        bbox.load_from_checkpoint(checkpoint_id)
    return all_scores
def act(self, action):
    self._actions_log.append(action)
    self._steps += 1
    self._prev_score = bbox.get_score()
    self._is_over = not bbox.do_action(action)
    self._state = bbox.get_state().reshape((1, self.n_features))
    #print "\nupdate", self._prev_score, action, bbox.get_score(), self._is_over
    return self.state, self.reward(), self.is_over
def get_action_by_state(state, verbose=0):
    if verbose:
        for i in range(n_features):
            print("state[%d] = %f" % (i, state[i]))
        print("score = {}, time = {}".format(bbox.get_score(), bbox.get_time()))
    action_to_do = 0
    return action_to_do
def action_lookup(model, train_data, step_inc):
    """
    At any given point, use action_lookup to determine the ideal action from
    the current state. Use the behavior of the model following each possible
    action to determine which one brings the greatest reward.

    :param model: object with a get_action method for action inference
    :param train_data: DataSet object used for bbox state buffering
    :param step_inc: int, the number of state steps to increment for each
        possible action of action_n total actions
    :return: (int, float), the tuple representing the highest scoring action
    """
    # Create a checkpoint to revert to after each action lookup
    start_checkpoint = bbox.create_checkpoint()
    # Similarly, create a backup of the DataSet object state buffer
    train_data.backup_buffer()

    best_score = -1e9
    best_action = -1

    # Perform the forward lookup for all valid actions
    for action_idx in xrange(action_n):
        start_score = bbox.get_score()
        bbox.do_action(action_idx)
        train_data.update_buffer(bbox.get_state())

        # After the initial action selection, use the model inference to
        # continue step_inc states into the future
        for _ in xrange(step_inc):
            action = model.get_action(train_data.get_buffer())
            bbox.do_action(action)
            train_data.update_buffer(bbox.get_state())

        # Check the score delta step_inc steps after the initial action
        end_score = bbox.get_score()
        score_delta = end_score - start_score
        if score_delta > best_score:
            best_score = score_delta
            best_action = action_idx

        bbox.load_from_checkpoint(start_checkpoint)
        train_data.restore_buffer()

    return best_action, best_score
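# A minimal sketch of the DataSet state buffer assumed by action_lookup,
# run_bbox, and learn_bbox in this code. Only the method names come from those
# call sites; the fixed-depth deque layout and the commit storage format are
# assumptions for illustration.
import collections
import numpy

class DataSet(object):
    def __init__(self, n_features, depth=4):
        self.depth = depth
        self.n_features = n_features
        self.instances = []  # committed (state buffer, action, weight) rows
        self.clear_buffer()

    def clear_buffer(self):
        zero = numpy.zeros(self.n_features, dtype=numpy.float32)
        self._buffer = collections.deque([zero] * self.depth, maxlen=self.depth)

    def update_buffer(self, state):
        # newest state sits at index 0, oldest falls off the end
        self._buffer.appendleft(numpy.asarray(state, dtype=numpy.float32))

    def get_buffer(self):
        return numpy.concatenate(self._buffer)

    def backup_buffer(self):
        self._backup = collections.deque(self._buffer, maxlen=self.depth)

    def restore_buffer(self):
        self._buffer = collections.deque(self._backup, maxlen=self.depth)

    def commit_buffer(self, action, score_delta):
        self.instances.append((self.get_buffer(), action, score_delta))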
def calc_best_action_using_checkpoint(action_range=50):
    # Pretty straightforward: we create a checkpoint and get its ID
    checkpoint_id = bbox.create_checkpoint()
    best_action = -1
    best_score = -1e9
    for action in range(n_actions):
        for _ in range(action_range):  # random.randint(1,100)
            bbox.do_action(action)
        if bbox.get_score() > best_score:
            best_score = bbox.get_score()
            best_action = action
        bbox.load_from_checkpoint(checkpoint_id)
    return best_action
def get_action_by_state(state, verbose=1):
    global action_to_do
    if verbose:
        #for i in range(n_features):
        #    print ("state[%d] = %f" % (i, state[i]))
        if bbox.get_time() % 1000 == 0:
            print("score = {}, time = {}".format(bbox.get_score(), bbox.get_time()))
    # cycle through actions 0..3
    action_to_do = action_to_do + 1
    if action_to_do == 4:
        action_to_do = 0
    return action_to_do
def run_bbox(verbose=False):
    has_next = 1
    prepare_bbox()

    # vector of the current state features
    input_var = T.dvector('in_state')
    input_var = T.reshape(input_var, (memtime, n_f + 2))
    # Load net into the agent object
    agent = prepare_agent(input_var)
    # What the agent thinks the best choice will be
    attempt = lasagne.layers.get_output(agent)[0]
    # function to do all of the stuff above
    test_fn = theano.function([input_var], attempt)

    # time to check how long it takes to run
    memory = np.zeros(shape=(memtime, n_f + 2))
    start = time.time()
    consequence = 0
    steps = 0
    while has_next:
        memory = forget(memory)
        state = bbox.get_state()
        memory[0][:-2] = state
        choices = test_fn(memory)
        action = np.argmax(choices)
        has_next = bbox.do_action(action)
        score = bbox.get_score()
        consequence = score - consequence
        memory[0][-2:] = [action, consequence]
        steps += 1
        if steps % 10000 == 0:
            score = bbox.get_score()
            print("Steps: {}".format(steps))
            print("  current score: {}".format(score))

    print("Final Score: {}".format(score))
    print("Time to run: {} seconds".format(time.time() - start))
    bbox.finish(verbose=1)
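# A minimal sketch of the forget helper used by the memory-based run_bbox
# variants in this code. It is never defined here, so this rolling-shift
# implementation (drop the oldest row, open a blank row at index 0) is an
# assumption based on how memory[0] is written right after each call. It works
# for both the 2-D (memtime, n_f+2) and 3-D (memtime, 1, n_f+2) layouts above.
import numpy as np

def forget(memory):
    memory = np.roll(memory, 1, axis=0)  # oldest row wraps around to index 0
    memory[0] = 0                        # blank it for the incoming state
    return memory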
def run_bbox():
    global ensamble
    has_next = 1
    prepare_bbox()

    ensamble = Ensemble.NN_Ensemble(
        n_features, 4, [[36, 64, 4], [16, 4], [16, 4], [36, 64, 4]], n_actions)
    ensamble.read_weights("weights")

    while has_next:
        state = bbox.get_state()
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)
        if bbox.get_time() % 10000 == 0:
            print(str(bbox.get_time()) + " " + str(bbox.get_score()))

    bbox.finish(verbose=1)
def run_bbox():
    has_next = 1
    prepare_bbox()
    while has_next:
        best_act = calc_best_action_using_checkpoint()
        # commit to the winning action for the same 100 steps it was scored on
        for _ in range(100):
            has_next = bbox.do_action(best_act)
            if bbox.get_time() % 10000 == 0:
                print("time = %d, score = %f" % (bbox.get_time(), bbox.get_score()))
    bbox.finish(verbose=1)
def play(self, action, report_action=False):
    #state = self.state
    printing = False  # SET PRINTING HERE
    self.action_count = self.action_count + 1
    if report_action and printing:
        print
        print
        print("PRE ACTN#%d: time=%fs total score=%f"
              % (self.action_count, (dt.datetime.now() - self.time).seconds,
                 bbox.get_score()))
    self.time = dt.datetime.now()
    self.has_next = bbox.do_action(action)
def get_action_by_state(state, verbose=0):
    # If verbose = True, enable detailed logging to console
    if verbose:
        # Print environment state vector
        for i in range(n_features):
            print("state[%d] = %f" % (i, state[i]))
        # Print current score and time (number of current game step)
        print("score = {}, time = {}".format(bbox.get_score(), bbox.get_time()))

    # This simple bot used to always perform action number 0. Not so smart :)
    # action_to_do = 0
    # Instead, choose a random integer with value between 0 and 3
    action_to_do = random.randint(0, 3)
    return action_to_do
def run_bbox(verbose=False):
    has_next = 1
    prepare_bbox()

    # matrix of the current state features
    input_var = T.matrix('in_state')
    input_var = T.reshape(input_var, (1000, n_features))
    # matrix of the scores for 100 repeats of the same action
    target_var = T.matrix('scores')
    target_var = T.reshape(target_var, (1000, n_actions))

    # Load net into the agent object
    agent = prepare_agent(input_var)
    # what the agent thinks will happen if it does each action 100 times
    attempt = lasagne.layers.get_output(agent)
    # how much the agent was wrong, and should be punished
    punish = lasagne.objectives.squared_error(attempt, target_var)
    punish = punish.mean()
    # get the parameters for updating
    params = lasagne.layers.get_all_params(agent, trainable=True)
    # update the net with the error
    teach = lasagne.updates.nesterov_momentum(punish, params,
                                              learning_rate=0.001, momentum=0.9)
    # function to do all of the stuff above
    train_fn = theano.function([input_var, target_var], punish,
                               updates=teach, on_unused_input='ignore')

    # time to check how long it takes to run
    start = time.time()
    states, scores, loops = load_dataset('Full.txt')
    for n in range(loops):
        error = 0
        steps = 0
        ins = states[n:n + 15]
        out = scores[n:n + 15]
        action = np.argmax(out[0])
        error = train_fn(ins, out)
        if n % 10000 == 0:
            score = bbox.get_score()
            print("Steps: {}".format(steps))
            print("  training loss: {}".format(error))
            print("  current score: {}".format(score))
        has_next = bbox.do_action(action)

    print("Time to run: {} seconds".format(time.time() - start))
    np.savez('model.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
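# A minimal sketch of the load_dataset helper used above; it is not defined in
# this code, so the file layout is an assumption: each line of Full.txt is
# taken to hold n_features state values followed by n_actions score deltas.
import numpy as np

def load_dataset(path):
    data = np.loadtxt(path)
    states = data[:, :n_features].astype(np.float32)
    scores = data[:, n_features:n_features + n_actions].astype(np.float32)
    loops = len(data)
    return states, scores, loops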
def is_over(self): score = bbox.get_score() if score > self._epoch_max: self._epoch_max = score # remember max score if self._steps >= self.train_steps: print "\nover (steps: {}/{}, score: {:.5}/{:.5})".format(self._steps, self.train_steps, score, self._epoch_max) print self._actions_log #if score == self._epoch_prev or score == self._epoch_max: self.train_steps += 0.1 # increase steps after a while self._epoch_prev = score return True if score < -1. and score < -self._epoch_max / 2: print "\ndead (steps: {}/{}, score: {:.5}/{:.5})".format(self._steps, self.train_steps, score, self._epoch_max) print self._actions_log return True return self._is_over
def is_over(self): score = bbox.get_score() if score > self._epoch_max: self._epoch_max = score # remember max score if self._steps >= self.train_steps: print "\nover (steps: {}/{}, score: {:.5}/{:.5})".format(self._steps, self.train_steps, score, self._epoch_max) print self._actions_log if score == self._epoch_prev or score == self._epoch_max: self.train_steps += 1 #self.train_steps += 0.1 # slowly increase steps self._epoch_prev = score return True if score < -1. and score < -self._epoch_max / 2: print "\ndead (steps: {}/{}, score: {:.5}/{:.5})".format(self._steps, self.train_steps, score, self._epoch_max) print self._actions_log return True return self._is_over
def run_bbox(verbose=False): bbox.load_level("../levels/train_level.data", verbose=True) states, actions, scores, rewards = [], [], [], [] with open('utility_models.pkl', 'rb') as f: utility_models = pickle.load(f) step = 0 has_next = 1 while has_next: step += 1 state = bbox.get_state() action = np.random.choice(n_actions) utilities = [m.predict([state]) for m in utility_models] action = np.argmax(utilities) # Do action and bookkeeping has_next = bbox.do_action(action) states.append(np.array(state)) actions.append(action) score = bbox.get_score() rewards.append(score if not scores else (score - scores[-1])) scores.append(score) if verbose and step % 10000 == 0: print(step, score) i = 1 get_outdir = 'run_{}'.format outdir = get_outdir(i) while os.path.exists(outdir): i += 1 outdir = get_outdir(i) os.mkdir(outdir) print('saving to {}'.format(outdir)) scores = np.array(scores, dtype=np.float32) scores.tofile(os.path.join(outdir, 'scores')) actions = np.array(actions, dtype=np.int8) actions.tofile(os.path.join(outdir, 'actions')) states = np.array(states, dtype=np.float32) states.tofile(os.path.join(outdir, 'states')) bbox.finish(verbose=True)
def get_action_by_state(state, verbose=False):
    '''
    This is the policy function. It takes the environment state vector and
    returns an action for the agent to perform. Modifying this function alone
    is enough to create a proper learning agent.
    '''
    an_interaction = []
    if verbose:  # enables detailed logging
        for i in range(n_features):
            ## Print the environment state vector
            # print ("state[%d] = %f" % (i, state[i]))
            an_interaction.append(state[i])
        ## Print the current score and time (number of current game steps)
        reward = bbox.get_score()
        an_interaction.append(reward)
        # print ("score = {}, time = {}".format(reward, bbox.get_time()))

    ## TODO: Change this action
    action_to_do = random.randint(0, n_actions - 1)
    an_interaction.append(action_to_do)
    interaction_list.append(an_interaction)
    return action_to_do
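# A minimal sketch (an assumed helper, not part of the original code) for
# flushing the interaction_list gathered above to a CSV file after a run.
import csv

def dump_interactions(path='interactions.csv'):
    with open(path, 'w') as f:
        csv.writer(f).writerows(interaction_list)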
def run_bbox(verbose=False):
    prepare_bbox()

    # tensor of the current state features
    input_var = T.tensor3('memory')
    input_var = T.reshape(input_var, (memtime, 1, n_f + 2))
    # Score after the agent makes its choice
    reality = T.vector('score_diffs')

    # Load net into the agent object
    agent = prepare_agent(input_var)
    # What the agent thinks its best choice is this event
    evaluation = lasagne.layers.get_output(agent)[0]
    # how much the agent should be rewarded/punished
    reward = lasagne.objectives.squared_error(evaluation, reality)
    reward = reward.mean()
    # get the parameters for updating
    params = lasagne.layers.get_all_params(agent, trainable=True)
    # update the net with the error
    teach = lasagne.updates.nesterov_momentum(reward, params,
                                              learning_rate=0.01, momentum=0.9)
    # A function to get the agent's choice of what to try this time
    decide_fn = theano.function([input_var], evaluation)
    # function to do all of the stuff above
    train_fn = theano.function([input_var, reality], reward,
                               updates=teach, on_unused_input='ignore')

    # time to check how long it takes to run
    start = time.time()
    for epoch in range(epochs):
        memory = np.zeros(shape=(memtime, 1, n_f + 2))
        e_time = time.time()  # time for this epoch
        has_next = 1  # looping variable, state of bbox

        # initialize tracking variables
        consequence = error = 0
        steps = 0
        trust = 0.00 + .02 * epoch
        good = 0
        while has_next:
            # Update the memory matrix, forgetting a state, making room
            memory = forget(memory)
            state = bbox.get_state()
            # get best action based on the 100-step checkpoint method
            actuals = get_all_score_diffs(state)
            # upload new state, with no score or action chosen
            memory[0][0][:-2] = state
            if rand.random() > trust:
                action = rand.randint(0, n_a - 1)  # if trust is still too low, random action
            else:
                choices = decide_fn(memory)  # Otherwise, let the agent decide.
                action = np.argmax(choices)  # pick action agent thinks is best
                if action == np.argmax(actuals):
                    good = good + 1

            # do it, and find out the consequence (if the score improved or went down)
            has_next = bbox.do_action(action)
            score = bbox.get_score()
            consequence = score - consequence

            # train on choices just made and memory
            memory[0][0][-2:] = [action, consequence]
            error += train_fn(memory, actuals)  # train based on the score change

            # updating for next loop
            steps += 1

            # occasionally check in on progress
            if steps % 10000 == 0:
                score = bbox.get_score()
                print("Epoch: {}".format(epoch))
                print("Steps: {}".format(steps))
                print("  current trust: {}".format(trust))
                print("  avg error: {}".format(error / steps))
                print("  bad choices: {}%".format(100 - float(good) / 100))
                print("  current score: {}".format(score))
                if trust < .95:
                    trust = trust + .02
                bbox.clear_all_checkpoints()
                ch = ra = good = 0

        # report on model quality for this epoch
        score = bbox.get_score()
        with open("epoch_data.txt", "a") as f:
            f.write("Epoch: {} Final Score: {} Average Error: {} Time to Run: {} min\n"
                    .format(epoch, score, error / steps, (time.time() - e_time) / 60))
        # save model parameters
        np.savez('model_LSTM_cost.npz', *lasagne.layers.get_all_param_values(agent))
        # reset box for next epoch
        if epoch < epochs - 1:
            bbox.reset_level()

    print("Time to run: {} hours".format((time.time() - start) / 3600))
    bbox.finish(verbose=1)
def run_bbox(verbose=False, epsilon=0.1, gamma=0.99, action_repeat=4,
             update_frequency=4, batchSize=32, buffer=100000,
             load_weights=False, save_weights=False):
    has_next = 1
    # Prepare environment - load the game level
    prepare_bbox()

    update_frequency_cntr = 0
    replay = []  # stores tuples of (S, A, R, S')
    h = 0
    if load_weights:
        model.load_weights('my_model_weights.h5')
        model_prim.load_weights('my_model_weights.h5')

    while has_next:
        # Get current environment state
        state = copy.copy(bbox.get_state())
        prev_reward = copy.copy(bbox.get_score())
        # Run the Q function on S to get predicted reward values for all possible actions
        qval = model.predict(state.reshape(1, n_features), batch_size=1)

        # Choose an action to perform at current step
        if random.random() < epsilon:  # choose random action or best action
            if random.random() < 0.5:
                action = np.random.randint(0, n_actions)  # assumes 4 different actions
            else:
                # Use checkpoints to prime network with good actions
                action_range = 50  # random.randint(1,200)
                action = calc_best_action_using_checkpoint(action_range=action_range)
                #for _ in range(action_range):
                #    has_next = bbox.do_action(action)
        else:
            # choose best action from Q(s,a) values
            action = np.argmax(qval)

        # Perform chosen action, observe new state S'
        # do_action(action) returns False if the level is finished, otherwise True.
        for a in range(action_repeat):
            has_next = bbox.do_action(action)

        new_state = copy.copy(bbox.get_state())
        reward = copy.copy(bbox.get_score()) - prev_reward
        #reward = 1.0 if reward > 0.0 else -1.0  # better than random with a small network

        # Experience replay storage
        if len(replay) < buffer:  # if buffer not filled, add to it
            replay.append((state, action, reward, new_state))
        else:  # if buffer full, overwrite old values
            if h < (buffer - 1):
                h += 1
            else:
                h = 0
            replay[h] = (state, action, reward, new_state)

            # randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)
            X_train = []
            y_train = []
            for memory in minibatch:
                # Get max_Q(S',a)
                old_state, action, reward, new_state = memory
                old_qval = model.predict(old_state.reshape(1, n_features), batch_size=1)
                newQ = model.predict(new_state.reshape(1, n_features), batch_size=1)
                maxQ = np.max(newQ)
                y = np.zeros((1, n_actions))
                y[:] = old_qval[:]
                if has_next == 1:  # non-terminal state
                    update = reward + (gamma * maxQ)
                else:  # terminal state
                    update = reward
                y[0][action] = update
                X_train.append(old_state)
                y_train.append(y.reshape(n_actions,))

            X_train = np.array(X_train)
            y_train = np.array(y_train)
            # update the weights of a copy of the network
            model_prim.fit(X_train, y_train, batch_size=batchSize, nb_epoch=1, verbose=0)

            if update_frequency_cntr >= update_frequency:
                prim_weights = model_prim.get_weights()
                print('model update')
                model.set_weights(prim_weights)
                update_frequency_cntr = 0
            update_frequency_cntr += 1

        if bbox.get_time() % 500000 == 0:
            print("time = %d, score = %f" % (bbox.get_time(), bbox.get_score()))

    # Finish the game simulation, print earned reward and save weights
    if save_weights:
        model_prim.save_weights('my_model_weights.h5', overwrite=True)
    bbox.finish(verbose=1)
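# A minimal sketch of the twin networks assumed by the DQN run_bbox above:
# `model` and `model_prim` are referenced but never built in this code, so the
# layer sizes and optimizer here are illustrative. The Keras 1 API matches the
# nb_epoch keyword used in the fit call above.
from keras.models import Sequential
from keras.layers import Dense

def build_q_network(n_features, n_actions, hidden=64):
    net = Sequential()
    net.add(Dense(hidden, input_dim=n_features, activation='relu'))
    net.add(Dense(hidden, activation='relu'))
    net.add(Dense(n_actions, activation='linear'))  # one Q-value per action
    net.compile(optimizer='rmsprop', loss='mse')
    return net

model = build_q_network(n_features, n_actions)
model_prim = build_q_network(n_features, n_actions)  # periodically synced copy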
def run_bbox(verbose=False):
    prepare_bbox()

    # matrix of the current state features
    input_var = T.matrix('memory')
    input_var = T.reshape(input_var, (memtime, n_f + 2))
    # Score after the agent makes its choice
    reality = T.scalar('consequence')

    # Load net into the agent object
    agent = prepare_agent(input_var)
    # What the agent thinks the best choice will be
    attempt = T.max(lasagne.layers.get_output(agent))
    # how much the agent should be rewarded/punished
    reward = lasagne.objectives.squared_error(attempt, reality)
    # get the parameters for updating
    params = lasagne.layers.get_all_params(agent, trainable=True)
    # update the net with the error
    teach = lasagne.updates.nesterov_momentum(reward, params,
                                              learning_rate=0.1, momentum=0.9)
    # compiled functions: one to pick actions, one to do all of the stuff
    # above (I DON'T HAVE A TARGET??)
    decide_fn = theano.function([input_var], lasagne.layers.get_output(agent))
    train_fn = theano.function([input_var, reality], reward,
                               updates=teach, on_unused_input='ignore')

    # time to check how long it takes to run
    memory = np.zeros(shape=(memtime, n_f + 2))
    start = time.time()
    scores_per_epoch = np.zeros(epochs)
    for epoch in range(epochs):
        e_time = time.time()  # time for this epoch
        has_next = 1  # looping variable, state of bbox

        # initialize tracking variables
        consequence = 0
        self_assessment = 0
        steps = 0
        trust = 0.00
        while has_next:
            # Update the memory matrix, forgetting a state, making room
            memory = forget(memory)
            state = bbox.get_state()
            # upload new state, with no score or action chosen
            memory[0][:-2] = state
            if rand.random() > trust:
                action = rand.randint(0, n_a - 1)  # if trust is still too low, random action
            else:
                choices = decide_fn(memory)  # Otherwise, let the agent decide.
                action = np.argmax(choices)  # pick action agent thinks is best

            # do it, and find out the consequence (if the score improved or went down)
            has_next = bbox.do_action(action)
            consequence = bbox.get_score() - consequence

            # train on choices just made and memory
            memory[0][-2:] = [action, consequence]
            train_fn(memory, consequence)  # train based on the score change

            # updating for next loop
            self_assessment += consequence
            steps += 1

            # occasionally check in on progress
            if steps % 10000 == 0:
                trust = trust + .01
                score = bbox.get_score()
                print("Epoch: {}".format(epoch))
                print("Steps: {}".format(steps))
                print("  self assessment: {}".format(self_assessment))
                print("  trust: {}".format(trust))
                print("  current score: {}".format(score))

        # report on model quality for this epoch
        score = bbox.get_score()
        print("Epoch: {}".format(epoch))
        print("Final Score: {}".format(score))
        print("Time to Run: {} minutes".format((time.time() - e_time) / 60))
        scores_per_epoch[epoch] = score
        # reset box for next epoch
        bbox.reset_level()

    print("All scores per epoch: ")
    print(scores_per_epoch)
    print("Time to run: {} hours".format((time.time() - start) / 3600))
    np.savez('model_mem.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
def prepare_bbox():
    global n_features, n_actions
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()


def get_action_by_state(state):
    # return np.random.randint(0, 4)
    return 0


if __name__ == "__main__":
    has_next = 1
    prepare_bbox()
    prev_score = bbox.get_score()
    steps = 0
    states = []
    while has_next and steps < 100:
        state = bbox.get_state()
        states.append(state)
        v = map(lambda f: "%.2f" % abs(f), state)
        print " ".join(v)
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)
        score = bbox.get_score()
        prev_score = score
        steps += 1
def get_score(self):
    return bbox.get_score()
def run_bbox(verbose=False): bbox.load_level("../levels/train_level.data", verbose=True) states, actions, scores, rewards = [], [], [], [] utility_models = [ SGDRegressor(learning_rate='constant', #penalty='elasticnet', ) for _ in range(n_actions) ] zero_utilities = np.zeros([n_actions]) n_past_act = 1 n_past_st = 0 # in addition to current discount = 0.9 random_steps = 10000 step = 0 has_next = 1 while has_next: step += 1 state = bbox.get_state() utilities = zero_utilities # Choose action using current utility_models if step > random_steps: clf_state = np.concatenate(states[-n_past_st:] + [state]) \ if n_past_st else state try: utilities = np.array( [m.predict([clf_state])[0] for m in utility_models]) except NotFittedError: pass #utilities -= utilities.min() #p = None if np.isclose(utilities, 0).all() else \ # utilities / utilities.sum() if np.random.rand() < 0.1 or step <= random_steps: action = np.random.choice(n_actions) else: action = np.argmax(utilities) # Do action and bookkeeping has_next = bbox.do_action(action) states.append(np.array(state)) actions.append(action) score = bbox.get_score() rewards.append(score if not scores else (score - scores[-1])) scores.append(score) # Train classifiers if len(rewards) >= n_past_act + n_past_st: total_reward = sum(r * np.power(discount, i) for i, r in enumerate(rewards[-n_past_act:])) if n_past_act == 1: clf_state = np.concatenate(states[-(n_past_act + n_past_st):]) else: clf_state = np.concatenate( states[-(n_past_act + n_past_st):-n_past_act + 1]) utility_models[actions[-n_past_act]].partial_fit([clf_state], [total_reward]) if verbose and step % 1000 == 0: print(step, score) i = 1 get_outdir = 'run_{}'.format outdir = get_outdir(i) while os.path.exists(outdir): i += 1 outdir = get_outdir(i) os.mkdir(outdir) print('saving to {}'.format(outdir)) scores = np.array(scores, dtype=np.float32) scores.tofile(os.path.join(outdir, 'scores')) actions = np.array(actions, dtype=np.int8) actions.tofile(os.path.join(outdir, 'actions')) states = np.array(states, dtype=np.float32) states.tofile(os.path.join(outdir, 'states')) bbox.finish(verbose=True)
def getReward(self):
    cur_reward = self.lastreward
    self.lastreward = bbox.get_score()
    print 'lastreward', self.lastreward
    return cur_reward
def reward(self):
    reward = bbox.get_score() - self._prev_score
    return reward
def learn_bbox(rnet_model, train_data, update_inc=5000, lookup_inc=250,
               seed_data=False):
    """
    Add training instances to train_data from a single run-through of a bbox
    session.

    :param rnet_model: model object with get_lreg_action and get_action methods
    :param train_data: DataSet object used to buffer states and append new
        training instances
    :param update_inc: int, number of steps between each nnet model update
    :param lookup_inc: int, number of forward action lookup steps
    :param seed_data: boolean, sets best_action to the action returned by the
        lreg model
    :return: int, the number of action errors, or differences between actions
        produced by the rnet_model and the ideal or seed model
    """
    has_next = 1
    error_count = 0
    rand_count = 0
    rand_idx = rand_n
    prepare_bbox()

    # For each new state in the session, add it to the data set's state
    # buffer so that historical states are included in a commit event
    train_data.clear_buffer()
    current_state = bbox.get_state()
    train_data.update_buffer(current_state)

    while has_next:
        # If all random values have been used, generate a new batch
        if rand_idx >= (rand_n - 1):
            rand_vals = numpy.random.random_sample(size=(rand_n))
            rand_idx = 0
        step_count = bbox.get_time()

        # Get the next action from the model based on the current set of
        # buffered states
        action = rnet_model.get_action(train_data.get_buffer())

        # Every update_inc steps train the model's network with newly
        # acquired training data
        if step_count % update_inc == 0:
            rnet_model.run_training(train_data, max_steps=update_nnet,
                                    restore=True)
            error_count = 0
            rand_count = 0
        # If the random value is less than or equal to the sample
        # probability, sample the current session state and determine the
        # best action, adding it to the training set if necessary
        elif rand_vals[rand_idx] <= sample_prob:
            if seed_data:
                best_action = rnet_model.get_lreg_action(current_state)
                score_delta = 0.1
            else:
                best_action, score_delta = action_lookup(
                    rnet_model, train_data, lookup_inc)
            if action != best_action:
                train_data.commit_buffer(best_action, score_delta)
                error_count += 1
            rand_count += 1

        # Add random variation to the session by performing a random action
        # if less than or equal to perturb probability
        if rand_vals[rand_idx + 1] <= perturb_prob:
            action = numpy.random.randint(0, 4)
            step_inc = numpy.random.randint(rand_min, rand_max)
            for _ in xrange(step_inc):
                has_next = bbox.do_action(action)
                current_state = bbox.get_state()
                train_data.update_buffer(current_state)
        else:
            has_next = bbox.do_action(action)
            current_state = bbox.get_state()
            train_data.update_buffer(current_state)
        rand_idx += 2

        if step_count % 5000 == 0:
            print("time = %d, score = %f" % (step_count, bbox.get_score()))
            print("errors = %d, samples = %d" % (error_count, rand_count))
            #rnet_model.print_stats()

    bbox.finish(verbose=1)
    return error_count
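# Illustrative module-level knobs assumed by learn_bbox above; these names are
# referenced but never defined in this code, and the values here are guesses.
rand_n = 10000              # size of each pre-generated batch of random draws
sample_prob = 0.05          # chance of running an action lookup at a step
perturb_prob = 0.01         # chance of injecting a random perturbation action
rand_min, rand_max = 5, 50  # perturbation length range, in steps
update_nnet = 2000          # training steps per network update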
def get_action_by_state(state):
    # cycle through the fixed sequence instead of random.randint(0, n_actions-1)
    action = seq[bbox.get_time() % len(seq)]
    if bbox.get_time() % 1000 == 0:
        print bbox.get_time(), bbox.get_score()
        print state
    return action
def main():
    epsilon = .1  # exploration
    num_actions = 4
    input_size = 36
    hidden_size = 24
    activation = 'relu'
    max_memory = 2000
    batch_size = 50
    mini_epoch = 5
    epoch = 10

    model = Sequential()
    model.add(Dense(hidden_size, input_shape=[input_size], activation=activation))
    model.add(Dense(hidden_size, activation=activation))
    model.add(Dense(num_actions))
    model.compile('adam', 'mse')
    # model.load_weights('model.h5')

    # Define environment/game
    bbox.load_level('../levels/train_level.data', verbose=True)

    # Initialize experience replay object
    exp_replay = ExperienceReplay(max_memory=max_memory)

    # FIXME
    #states = np.fromfile('run_random/states', dtype=np.float32)\
    #    .reshape([1214494, 36])
    #scaler = preprocessing.StandardScaler()
    #scaler.fit(states)
    #with open('scaler.pkl', 'wb') as f:
    #    scaler = pickle.dump(scaler, f, protocol=-1)
    with open('scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    # Train
    for e in range(epoch):
        loss = 0.
        bbox.reset_level()
        game_over = False
        # get initial input
        get_state = lambda: scaler.transform(np.array([bbox.get_state()]))[0]
        input_t = get_state()
        score = 0
        step = 0
        report_steps = 100
        while not game_over:
            step += 1
            input_tm1 = input_t
            # get next action
            if np.random.rand() <= epsilon:
                action = np.random.randint(0, num_actions)
            else:
                q = model.predict(np.array([input_tm1]))[0]
                action = np.argmax(q)

            # apply action, get rewards and new state
            game_over = not bbox.do_action(action)
            input_t = get_state()
            new_score = bbox.get_score()
            reward = new_score - score
            score = new_score

            # store experience
            exp_replay.remember([input_tm1, action, reward, input_t], game_over)

            # adapt model
            for _ in range(mini_epoch):
                inputs, targets = exp_replay.get_batch(model, batch_size=batch_size)
                loss += model.train_on_batch(inputs, targets)[0]

            if step % report_steps == 0:
                print('Step {:07d} | Loss {:.4f} | Score {}'.format(
                    step, loss / (report_steps * mini_epoch), score))
                loss = 0.
        print('Epoch {:03d}/{} | Score {}'.format(e, epoch - 1, score))

    # Save trained model weights
    model.save_weights('q_model.h5', overwrite=True)
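# A minimal sketch of the ExperienceReplay class assumed by main above; it is
# not defined in this code. The remember/get_batch signatures match the call
# sites; the Q-learning target construction and the discount value are
# assumptions.
import numpy as np

class ExperienceReplay(object):
    def __init__(self, max_memory=2000, discount=0.9):
        self.max_memory = max_memory
        self.discount = discount
        self.memory = []  # each item: [[s_tm1, action, reward, s_t], game_over]

    def remember(self, experience, game_over):
        self.memory.append([experience, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_batch(self, model, batch_size=50):
        len_memory = len(self.memory)
        num_actions = model.output_shape[-1]
        state_dim = self.memory[0][0][0].shape[0]
        n = min(len_memory, batch_size)
        inputs = np.zeros((n, state_dim))
        targets = np.zeros((n, num_actions))
        for i, idx in enumerate(np.random.randint(0, len_memory, size=n)):
            state_tm1, action, reward, state_t = self.memory[idx][0]
            game_over = self.memory[idx][1]
            inputs[i] = state_tm1
            # current Q-values, with the taken action's entry replaced by the
            # bootstrapped target
            targets[i] = model.predict(state_tm1[np.newaxis])[0]
            q_next = np.max(model.predict(state_t[np.newaxis])[0])
            targets[i, action] = reward if game_over \
                else reward + self.discount * q_next
        return inputs, targets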