def do_running_printout(self):
    # Clear the terminal and print a running summary of the experiment's progress.
    clear()
    print("Simname: " + str(p['simname']))
    print("Episodes Elapsed: " + str(self.episode))
    print("Average Reward Per Episode: " + str(self.r_sum_avg))
    print("Average Number of Steps Spent Balancing Pole: " + str(self.steps_balancing_pole_avg))
    print("Max Number of Steps Spent Balancing Pole: " + str(np.max(np.array(self.steps_balancing_pole_avg_list))))
    print("Epsilon: " + str(self.epsilon))
    print("Epsilon Min: " + str(p['epsilon_min']))
    print("Alpha (learning rate): " + str(self.alpha * p['learning_rate']))
    if 'learning_rate_decay' in p:
        print("Alpha (learning rate) decay: " + str(p['learning_rate_decay']))
    if p['qsa_type'] == 'cluster_nnet':
        print("num_hidden: " + str(p['num_hidden']))
        print("num_selected: " + str(self.qsa.net.layer[0].num_selected))
    if p['qsa_type'] == 'nnet':
        print("Activation function: " + str(p['activation_function']))
        print("num_hidden: " + str(p['num_hidden']))
    if p['action_type'] == 'noisy_qsa':
        print("Average QSA Standard Deviation: " + str(self.qsa_std_avg))
        print("Probability of taking different action: " + str(self.prob_of_different_action))
    if p['qsa_type'] == 'cartpole_nnet':
        print("state given to nnet:\n" + str(np.array(self.qsa.net.input).transpose()))
    print("Average Steps Per Second: " + str(1.0 / self.avg_step_duration))
    print("Action Type: " + str(p['action_type']))
    print("a_list: " + str(self.tmp_a_list))
    # Format elapsed wall-clock time as H:MM:SS.
    m, s = divmod(time.time() - self.start_time, 60)
    h, m = divmod(m, 60)
    print("Elapsed Time %d:%02d:%02d" % (h, m, s))
    sys.stdout.flush()
    print_update_timer = time.time()
def run_sim(self, p):
    # Build the cart-pole simulator and SDL visualization from the parameter dict p,
    # then run an interactive loop where the keyboard pushes the cart left or right.
    sim = cartpole_environment()
    reward_type = p.get('reward_type', 0)
    negative_reward = p['negative_reward']
    positive_reward = p['positive_reward']
    sim.init(p['vel_bound'], p['angle_vel_bound'], p['pos_bound'], p['g'], p['l'], p['mp'], p['mc'],
             p['dt'], p['negative_reward'], p['positive_reward'], p['no_reward'], reward_type)
    v = visualize_sdl()
    v.init_vis(p['display_width'], p['display_height'], p['axis_x_min'], p['axis_x_max'],
               p['axis_y_min'], p['axis_y_max'], p['fps'])
    push_force = p['push_force']
    self.vel_bound = p['vel_bound']
    self.pos_bound = p['pos_bound']
    self.angle_vel_bound = p['angle_vel_bound']
    # Per-dimension bounds used to min-max normalize the state.
    self.mins = np.array([0.0, -self.vel_bound, -self.pos_bound, -self.angle_vel_bound])
    self.maxs = np.array([2 * math.pi, self.vel_bound, self.pos_bound, self.angle_vel_bound])
    # The angle is replaced by (sin, cos), so the first bound becomes [-1, 1] for each component.
    self.mins = np.append(np.array([-1.0, -1.0]), self.mins[1:])
    self.maxs = np.append(np.array([1.0, 1.0]), self.maxs[1:])
    self.incorrect_target = p['incorrect_target']
    self.correct_target = p['correct_target']
    self.num_actions = 3
    action = 0
    while True:
        if 'print_state_debug' in p:
            # Show the normalized network input: (sin, cos) of the pole angle plus the rest of
            # the state, min-max scaled, centered, stretched to roughly [-1.125, 1.125], with the
            # action target vector appended.
            clear()
            action_list = np.ones((1, self.num_actions)) * self.incorrect_target
            action_list[0, action] = self.correct_target
            state = sim.sim.state
            s = np.append(np.array([math.sin(state[0]), math.cos(state[0])]), state[1:])
            s = (np.array(s) - self.mins) / (self.maxs - self.mins)
            s = s - 0.5
            s = s * 2.25
            s = np.append(s, action_list)
            np.set_printoptions(precision=4)
            print(str(s[:, np.newaxis]))
            print(str(np.array(sim.sim.state)[:, np.newaxis]))
        # Poll the keyboard: the two tracked keys push the cart with -/+ push_force.
        v.delay_vis()
        k = v.get_keys()
        u = 0.0
        action = 0
        if k[0]:
            action = 2
            u = -push_force
        if k[1]:
            action = 1
            u = push_force
        sim.set_action(u)
        sim.step()
        #if(sim.state[2] < -4.0):
        #    sim.state[2] = -4.0
        #if(sim.state[2] > 4.0):
        #    sim.state[2] = 4.0
        if sim.is_terminal:
            sim.reset_state()
        v.draw_cartpole(sim.get_state(), action, sim.get_reward(reward_type) / positive_reward)
        exit = v.update_vis()
        if exit:
            break
    return
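# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). run_sim expects a parameter dict p
# containing the keys referenced above; the values below are placeholder
# assumptions, not tuned settings from this project, and `experiment` stands in
# for whatever class these methods actually belong to. 'reward_type' and
# 'print_state_debug' are optional.
#
# p = {'vel_bound': 10.0, 'angle_vel_bound': 10.0, 'pos_bound': 4.0,
#      'g': 9.8, 'l': 0.5, 'mp': 0.1, 'mc': 1.0, 'dt': 0.02,
#      'negative_reward': -1.0, 'positive_reward': 1.0, 'no_reward': 0.0,
#      'display_width': 800, 'display_height': 600,
#      'axis_x_min': -4.0, 'axis_x_max': 4.0, 'axis_y_min': -3.0, 'axis_y_max': 3.0,
#      'fps': 60, 'push_force': 10.0,
#      'incorrect_target': 0.0, 'correct_target': 1.0}
# experiment().run_sim(p)
# ---------------------------------------------------------------------------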