def run_sim(self, p):
    print("Parameters: ")
    for k, v in p.items():
        if k[0:2] == "__":
            continue
        print(str(k) + " : " + str(v))
    del k
    del v

    # init random number generator from seed
    np.random.seed(p["random_seed"])

    # initialize hyperparameters fresh, unless we are resuming a saved simulation,
    # in which case we load the saved parameters
    if not p.has_key("load_name"):
        self.init_sim(p)
    else:
        self.load_sim(p)

    # initialize environment
    self.sim = cartpole_environment()
    self.vel_bound = p["vel_bound"]
    self.pos_bound = p["pos_bound"]
    self.angle_vel_bound = p["angle_vel_bound"]
    self.sim.init(
        self.vel_bound,
        self.angle_vel_bound,
        self.pos_bound,
        p["g"],
        p["l"],
        p["mp"],
        p["mc"],
        p["dt"],
        p["negative_reward"],
        p["positive_reward"],
        p["no_reward"],
        p.get("reward_type", 0),
    )

    self.do_vis = p["do_vis"]
    self.save_images = p.get("save_images", False)
    self.image_save_dir = p.get("image_save_dir", None)
    save_interval = p["save_interval"]
    self.do_running_printout = p.get("do_running_printout", False)
    self.showevery = p["showevery"]
    self.fastforwardskip = 5
    push_force = p["push_force"]
    self.reward_type = p.get("reward_type", 0)
    self.use_full_output = p.get("use_full_output", False)

    self.earlyendepisode = np.zeros(10)
    self.earlyendreward = np.zeros(10)
    for i in range(9):
        self.earlyendepisode[i] = p.get("earlyendepisode" + str(i), 0)
        self.earlyendreward[i] = p.get("earlyendreward" + str(i), 0)

    self.do_recurrence = p.get("do_recurrence", False)

    if self.do_vis:
        # only import if we need it, since we don't want to require installation of pygame
        from cartpole.vis.visualize_sdl import visualize_sdl

        v = visualize_sdl()
        v.init_vis(
            p["display_width"],
            p["display_height"],
            p["axis_x_min"],
            p["axis_x_max"],
            p["axis_y_min"],
            p["axis_y_max"],
            p["fps"],
        )

    print_update_timer = time.time()
    self.start_time = time.time()
    elapsed_time = time.time()
    step_duration_timer = time.time()
    save_time = time.time()
    self.avg_step_duration = 1.0

    ## repeat for each episode
    self.r_sum_avg = -0.95
    self.r_sum_avg_list = []
    self.steps_balancing_pole_list = []
    self.steps_balancing_pole_avg = 0.00
    self.steps_balancing_pole_avg_list = []
    while 1:
        # reset eligibility traces at the beginning of each episode
        # TODO: this should be abstracted into a function call
        if hasattr(self.qsa, "_lambda"):
            for l in self.qsa.net.layer:
                l.eligibility = np.zeros(l.eligibility.shape, dtype=np.float32)

        self.step = 0

        ## initialize s
        self.sim.reset_state()
        self.s = self.state_transformer.transform(self.sim.get_state())

        if self.do_recurrence:
            # choose a from s using the policy derived from Q; the recurrent variant
            # appends the hidden state h to the observation
            self.h = np.zeros(p["num_hidden"], dtype=np.float32)
            (self.a, self.qsa_tmp, self.h_prime) = self.choose_action_recurrence(
                np.append(self.s, self.h), p
            )
        else:
            # choose a from s using the policy derived from Q
            (self.a, self.qsa_tmp) = self.choose_action(self.s, p)

        balance_list = []
        self.r_sum = 0.0

        # repeat steps
        quit = False
        save_and_exit = False
        while 1:
            ## take action a, observe r, s'
            a_vel = [0.0, -push_force, push_force]
            self.sim.set_action(a_vel[self.a])
            self.sim.step()
            # print("Terminal: " + str(self.sim.is_terminal))
            self.r = self.sim.get_reward(self.reward_type)
            self.s_prime = self.state_transformer.transform(self.sim.get_state())
            self.r_sum += self.r
            # for consistency, we always label balancing steps with the same reward function
            balance_list.append(self.sim.get_reward(0))

            if self.do_recurrence:
                (self.a_prime, self.qsa_prime, self.h_primeprime) = self.choose_action_recurrence(
                    np.append(self.s_prime, self.h_prime), p
                )
                current_s = np.append(self.s, self.h)
                next_s = np.append(self.s_prime, self.h_prime)
                self.qsa.store(
                    current_s,
                    self.a,
                    self.qsa_tmp
                    + self.alpha * (self.r + self.gamma * self.qsa.load(next_s, self.a_prime) - self.qsa_tmp),
                )
            else:
                # choose a' from s' using policy derived from Q
                (self.a_prime, self.qsa_prime) = self.choose_action(self.s_prime, p)
                # SARSA update: Q(s,a) <- Q(s,a) + alpha*[r + gamma*Q(s_prime,a_prime) - Q(s,a)]
                # todo: qsa_prime can be saved and reused for qsa_tmp
                # qsa_tmp = self.qsa.load(self.s,self.a)
                # self.qsa.update(self.s,self.a,self.r,self.s_prime,self.a_prime,self.qsa_tmp)
                self.qsa.store(
                    self.s,
                    self.a,
                    self.qsa_tmp
                    + self.alpha * (self.r + self.gamma * self.qsa.load(self.s_prime, self.a_prime) - self.qsa_tmp),
                )

            if self.do_vis:
                if not (self.episode % self.showevery):
                    self.fast_forward = False
                    v.delay_vis()
                    v.draw_cartpole(self.sim.get_state(), self.a, self.sim.get_reward(self.reward_type), self)
                    exit = v.update_vis()
                    if exit:
                        quit = True
                elif self.step == 0 and not (self.episode % self.fastforwardskip):
                    self.fast_forward = True
                    v.delay_vis()
                    v.draw_cartpole(self.sim.get_state(), self.a, self.sim.get_reward(self.reward_type), self)
                    exit = v.update_vis()
                    if exit:
                        quit = True

            # if p.has_key("print_state_debug") and p["print_state_debug"] == True:
            #     print("action: " + str(a) + " r: " + str(r) +
            #           " Qsa: " + str(self.qsa.load(s, a)) + " state: " + str(s))
            #     print("Qs0: " + str(self.qsa.load(s, 0)))
            #     print("Qs1: " + str(self.qsa.load(s, 1)))
            #     print("Qs2: " + str(self.qsa.load(s, 2)))

            # TODO: put this printout stuff in a function
            # the self.episode > 0 check prevents a bug where some of the printouts are
            # empty arrays before the first episode completes
            if self.do_running_printout and print_update_timer < time.time() - 1.0 and self.episode > 0:
                self.do_running_printout()

            if self.episode >= p["train_episodes"]:
                save_and_exit = True
                quit = True
            if quit:
                break
            if self.sim.is_terminal:
                break
            if self.step > p["max_steps"]:
                break

            ## s <- s'; a <- a'
            self.s = self.s_prime
            self.a = self.a_prime
            self.qsa_tmp = self.qsa_prime
            if self.do_recurrence:
                self.h = self.h_prime
                self.h_prime = self.h_primeprime
            # print("Next Step \n")
            self.step += 1
            self.avg_step_duration = 0.995 * self.avg_step_duration + (1.0 - 0.995) * (
                time.time() - step_duration_timer
            )
            step_duration_timer = time.time()
        # end step loop

        # count the steps that earned a positive reward as the number of steps the pole was balanced
        self.steps_balancing_pole = np.sum(np.array(balance_list) > 0.0000001)
        self.steps_balancing_pole_list.append(self.steps_balancing_pole)
        self.steps_balancing_pole_avg = (
            0.995 * self.steps_balancing_pole_avg + (1.0 - 0.995) * self.steps_balancing_pole
        )
        self.steps_balancing_pole_avg_list.append(self.steps_balancing_pole_avg)
        self.r_sum_avg = 0.995 * self.r_sum_avg + (1.0 - 0.995) * self.r_sum

        # decay the exploration rate
        if p["decay_type"] == "geometric":
            self.epsilon = self.epsilon * p["epsilon_decay"]
            self.epsilon = max(p["epsilon_min"], self.epsilon)
        elif p["decay_type"] == "linear":
            self.epsilon = self.epsilon - p["epsilon_decay"]
            self.epsilon = max(p["epsilon_min"], self.epsilon)

        # decay the learning rate
        if p.has_key("learning_rate_decay_type") and p["learning_rate_decay_type"] == "geometric":
            self.alpha = self.alpha * p["learning_rate_decay"]
            self.alpha = max(p["learning_rate_min"] / p["learning_rate"], self.alpha)
        elif p.has_key("learning_rate_decay_type") and p["learning_rate_decay_type"] == "linear":
            self.alpha = self.alpha - p["learning_rate_decay"]
            self.alpha = max(p["learning_rate_min"] / p["learning_rate"], self.alpha)

        # print debug info for the episode
        m, s = divmod(time.time() - self.start_time, 60)
        h, m = divmod(m, 60)
        sys.stdout.write(
            ("ep: %d" % self.episode)
            + (" epsilon: %2.4f" % self.epsilon)
            + (" avg steps balanced: %2.4f" % self.steps_balancing_pole_avg)
            + (" max steps balanced: %2.4f" % np.max(np.array(self.steps_balancing_pole_avg_list)))
            + (" total_steps: %d" % self.step)
            + (" steps/sec: %2.4f" % (1.0 / self.avg_step_duration))
        )
        if hasattr(self.qsa, "net"):
            if hasattr(self.qsa.net.layer[0], "zeta"):
                sys.stdout.write(" L0 zeta: %2.4f" % self.qsa.net.layer[0].zeta)
            if hasattr(self.qsa.net.layer[1], "zeta"):
                sys.stdout.write(" L1 zeta: %2.4f" % self.qsa.net.layer[1].zeta)
        sys.stdout.write(" l_rate: %2.4f" % (self.alpha * p["learning_rate"]))
        print(" Time %d:%02d:%02d" % (h, m, s))
        sys.stdout.flush()

        # optionally end the run early if progress is below a threshold at a checkpoint episode
        for i in range(9):
            if (
                self.earlyendepisode[i] > 0
                and self.episode == self.earlyendepisode[i]
                and np.max(np.array(self.steps_balancing_pole_avg_list)) < self.earlyendreward[i]
            ):
                print("ending early")
                save_and_exit = True

        # save stuff (TODO: put this in a save function)
        if time.time() - save_time > save_interval or save_and_exit == True:
            print("saving results...")
            self.save_results(p["results_dir"] + p["simname"] + p["version"] + ".h5py", p)
            save_time = time.time()

        if quit == True or save_and_exit == True:
            break
        self.episode += 1
    # end episode loop

    self.update_results(p)
    obj = np.max(self.results["steps_balancing_pole_avg_list"])
    argmax = np.argmax(self.results["steps_balancing_pole_avg_list"])
    print("obj: " + str(obj) + " argmax: " + str(argmax))
    return self.results
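# --- Illustrative sketch (not part of the original class) ---
# The qsa.store(...) calls above apply the on-policy SARSA update
#     Q(s,a) <- Q(s,a) + alpha*(r + gamma*Q(s',a') - Q(s,a))
# through a function-approximator interface (load/store). Below is a minimal,
# self-contained tabular version of the same update, assuming a dict-backed
# Q-table keyed by hashable states and an epsilon-greedy policy; the names
# sarsa_update and epsilon_greedy are hypothetical and do not exist in this codebase.

import numpy as np


def epsilon_greedy(q_table, s, num_actions, epsilon):
    # with probability epsilon explore; otherwise act greedily with respect to Q
    if np.random.rand() < epsilon:
        return np.random.randint(num_actions)
    return int(np.argmax([q_table.get((s, a), 0.0) for a in range(num_actions)]))


def sarsa_update(q_table, s, a, r, s_prime, a_prime, alpha, gamma):
    # Q(s,a) <- Q(s,a) + alpha*(r + gamma*Q(s',a') - Q(s,a))
    q_sa = q_table.get((s, a), 0.0)
    td_target = r + gamma * q_table.get((s_prime, a_prime), 0.0)
    q_table[(s, a)] = q_sa + alpha * (td_target - q_sa)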
def run_sim(self, p):
    sim = cartpole_environment()
    reward_type = p.get('reward_type', 0)
    negative_reward = p['negative_reward']
    positive_reward = p['positive_reward']
    sim.init(p['vel_bound'], p['angle_vel_bound'], p['pos_bound'],
             p['g'], p['l'], p['mp'], p['mc'], p['dt'],
             p['negative_reward'], p['positive_reward'], p['no_reward'], reward_type)

    v = visualize_sdl()
    v.init_vis(p['display_width'], p['display_height'],
               p['axis_x_min'], p['axis_x_max'], p['axis_y_min'], p['axis_y_max'], p['fps'])

    push_force = p['push_force']
    self.vel_bound = p['vel_bound']
    self.pos_bound = p['pos_bound']
    self.angle_vel_bound = p['angle_vel_bound']
    self.mins = np.array([0.0, -self.vel_bound, -self.pos_bound, -self.angle_vel_bound])
    self.maxs = np.array([2*math.pi, self.vel_bound, self.pos_bound, self.angle_vel_bound])
    # the raw angle bound is replaced by bounds for its (sin, cos) encoding
    self.mins = np.append(np.array([-1.0, -1.0]), self.mins[1:])
    self.maxs = np.append(np.array([1.0, 1.0]), self.maxs[1:])
    self.incorrect_target = p['incorrect_target']
    self.correct_target = p['correct_target']
    self.num_actions = 3

    action = 0
    while 1:
        if(p.has_key('print_state_debug')):
            clear()
            action_list = np.ones((1, self.num_actions))*self.incorrect_target
            action_list[0, action] = self.correct_target
            state = sim.sim.state
            # encode the angle as (sin, cos), then rescale each dimension to roughly [-1.125, 1.125]
            s = np.append(np.array([math.sin(state[0]), math.cos(state[0])]), state[1:])
            s = (np.array(s) - self.mins)/(self.maxs - self.mins)
            s = s - 0.5
            s = s*2.25
            s = np.append(s, action_list)
            np.set_printoptions(precision=4)
            print(str(s[:, np.newaxis]))
            print(str(np.array(sim.sim.state)[:, np.newaxis]))

        # map keyboard input to a push force on the cart
        v.delay_vis()
        k = v.get_keys()
        u = 0.0
        action = 0
        if(k[0]):
            action = 2
            u = -push_force
        if(k[1]):
            action = 1
            u = push_force
        sim.set_action(u)
        sim.step()
        # if(sim.state[2] < -4.0):
        #     sim.state[2] = -4.0
        # if(sim.state[2] > 4.0):
        #     sim.state[2] = 4.0
        if(sim.is_terminal):
            sim.reset_state()
        v.draw_cartpole(sim.get_state(), action, sim.get_reward(reward_type)/positive_reward)
        exit = v.update_vis()
        if(exit):
            break
    return
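# --- Illustrative sketch (not part of the original class) ---
# The debug branch above encodes the pole angle as (sin, cos) and rescales each
# state dimension from its [min, max] bound to roughly [-1.125, 1.125]
# (shift to [-0.5, 0.5], then multiply by 2.25). A standalone version of that
# transform is sketched below; the name transform_state is hypothetical and the
# example bounds are illustrative values only, not taken from the codebase.

import math

import numpy as np


def transform_state(state, mins, maxs):
    # state[0] is the pole angle; the remaining entries follow the same
    # ordering as the mins/maxs arrays built in the harness above
    s = np.append(np.array([math.sin(state[0]), math.cos(state[0])]), state[1:])
    s = (s - mins) / (maxs - mins)   # normalize each dimension to [0, 1]
    s = (s - 0.5) * 2.25             # center and rescale to [-1.125, 1.125]
    return s


# example usage with illustrative bounds; the first two entries bound the (sin, cos) pair
example_mins = np.array([-1.0, -1.0, -10.0, -4.0, -10.0])
example_maxs = np.array([1.0, 1.0, 10.0, 4.0, 10.0])
print(transform_state([0.1, 0.0, 0.0, 0.0], example_mins, example_maxs))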