def get_start_state(self):
    angle = np.random.randint(-3, 4)
    angle = angle * np.pi / 180
    req = RobotActionRequest()
    req.reset_robot = True
    req.reset_pole_angle = angle
    resp = self.robot_state(req)
    return resp.robot_state

def get_state_reward(self, action_idx):
    action = self.action_list[action_idx]
    req = RobotActionRequest()
    req.reset_robot = False
    req.action = [action]  # type is list
    resp = self.robot_state(req)
    state_new = resp.robot_state
    reward = 1
    if abs(state_new[0]) > self.x_max or abs(state_new[1]) > self.angle_max:
        # out of bounds: terminal state, zero reward
        state_new = [np.nan, np.nan, np.nan, np.nan]
        reward = 0
    return [state_new, reward]

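# A minimal sketch of how the two helpers above could drive one training episode.
# It assumes the surrounding class also has an epsilon value in self.epsilon and a
# Q-network in self.q_net; those names are assumptions, not taken from the code above.
def run_episode_sketch(self, max_steps=200):
    state = self.get_start_state()
    total_reward = 0
    for _ in range(max_steps):
        if np.random.uniform(0, 1) < self.epsilon:
            action_idx = np.random.randint(len(self.action_list))  # explore
        else:
            q_values = self.q_net(torch.tensor(state, dtype=torch.float32))
            action_idx = int(q_values.argmax())  # exploit
        state, reward = self.get_state_reward(action_idx)
        total_reward += reward
        if reward == 0:  # get_state_reward signals termination with reward 0
            break
    return total_reward
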
def test(self, test_interval_min, test_interval_max):
    steps_score = np.zeros(self.num_random_tests)
    for k in range(0, self.num_random_tests):
        # Pick an initial state to start from
        req = RobotActionRequest()
        req.reset_robot = True
        req.reset_pole_angle = np.random.uniform(np.deg2rad(test_interval_min),
                                                 np.deg2rad(test_interval_max)) * self.get_random_sign()
        # print "Initial pole angle is", np.rad2deg(req.reset_pole_angle), "degrees"
        response = self.cartpole_action_service(req)
        for n in range(0, self.steps_max):
            pol = RobotPolicyRequest()
            pol.robot_state = response.robot_state
            # print(pol)
            dqn_response = self.cartpole_policy_service(pol)
            req = RobotActionRequest()
            req.action = dqn_response.action
            req.reset_robot = False
            response = self.cartpole_action_service(req)
            if self.state_out_of_bounds(response.robot_state):
                # print "Cartpole was balanced for", n, "number of steps"
                steps_score[k] = n
                time.sleep(1)
                break
            if n == self.steps_max - 1:
                # print "Episode ended successfully!"
                steps_score[k] = self.steps_max
                time.sleep(1)
            time.sleep(0.01)
    return steps_score

def learn(self):
    episode_reward = 0
    total_episodes = 512
    for episode in range(1, total_episodes + 1):
        episode_reward = 0
        s_1 = np.zeros((4, 1))
        # initial pole angle in radians, drawn from +/- 3 degrees
        s_1[1, 0] = np.random.uniform(np.deg2rad(-3.0), np.deg2rad(3.0))
        response = self.reset_robot(s_1)
        print(response.robot_state)
        # epsilon decays linearly over the episodes, floored at 0.01
        epsilon = max(1 - float(episode) / total_episodes, 0.01)
        for t in range(1, 1024):
            s_t = response.robot_state
            req = RobotActionRequest()
            req.reset_robot = False
            dice = np.random.uniform(0, 1)
            if dice < epsilon:
                a_t = np.random.choice([10, -10])
            else:
                index = self.act_network(torch.tensor(s_t)).max(0)[1]
                a_t = self.action_space[index]
            req.action = [a_t]
            response = self.cartpole_action_service(req)
            s_t_1 = response.robot_state
            x = s_t_1[0]
            theta = s_t_1[1]
            if abs(x) > 1.2:
                break
            elif abs(theta) > np.deg2rad(3):
                # pole fell over: terminal transition with zero reward
                r_t_1 = 0
                s_t_1 = None
                transition = (s_t, a_t, r_t_1, s_t_1)
                self.replaymemory.push(transition)
                break
            else:
                r_t_1 = 1
                episode_reward += r_t_1
                transition = (s_t, a_t, r_t_1, s_t_1)
                self.replaymemory.push(transition)
            if t % 2 == 0:
                self.train()
            # print('state:', response.robot_state, 'action:', req.action)
        print('episode =', episode, 'episode reward =', episode_reward)
        print('loss =', self.loss)  # report episode reward
        if episode > 5 and episode % 2 == 0:
            # print('=========update target network========')
            self.target_network.load_state_dict(self.act_network.state_dict())

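# learn() above pushes transitions into self.replaymemory, which is not shown.
# A minimal sketch of such a buffer with uniform random sampling; the class name,
# capacity, and method signatures are assumptions, not taken from the code above.
import random
from collections import deque

class ReplayMemory(object):
    def __init__(self, capacity=10000):
        self.memory = deque(maxlen=capacity)  # old transitions are dropped automatically

    def push(self, transition):
        # transition is a tuple (s_t, a_t, r_t_1, s_t_1), with s_t_1 = None when terminal
        self.memory.append(transition)

    def sample(self, batch_size):
        # uniform random minibatch for the DQN update
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
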
def data_acquisition(self, num_of_data_set, rep_data):
    print('Training Started')
    features = []
    labels = []
    for i in range(num_of_data_set):
        # sample a random constant action, with a different range for each joint
        action = np.random.rand(1, 3)
        action[0, 0] = (2 * action[0, 0] - 1.0) * 1.0
        action[0, 1] = (2 * action[0, 1] - 1.0) * 0.5
        action[0, 2] = (2 * action[0, 2] - 1.0) * 0.25
        print('Training for dataset', (i + 1))
        real_robot_action = rospy.ServiceProxy('real_robot', RobotAction)
        req = RobotActionRequest()
        req.reset = True
        resp = real_robot_action(req)
        for j in range(rep_data):
            # print('Rep set', (j + 1))
            # feature: current state concatenated with the applied action
            f = np.append(np.array(resp.robot_state), action)
            features.append(f)
            req = RobotActionRequest()
            req.reset = False
            req.action = action.reshape((3))
            resp = real_robot_action(req)
            # label: the state that results from applying the action
            labels.append(resp.robot_state)
    features = np.asarray(features)
    labels = np.asarray(labels)
    print(features)
    print("Acquisition Completed")
    return features, labels

def get_data(n):
    # collect a data set of (state, action) -> next state pairs from the real robot
    real_robot_action = rospy.ServiceProxy('real_robot', RobotAction)
    feature = []
    label = []
    for k in range(n):
        req = RobotActionRequest()
        req.reset = True
        resp_real = real_robot_action(req)
        # sample a random constant action, with a different range for each joint
        action = numpy.random.rand(1, 3)
        action[0, 0] = (2 * action[0, 0] - 1.0) * 1.0
        action[0, 1] = (2 * action[0, 1] - 1.0) * 0.5
        action[0, 2] = (2 * action[0, 2] - 1.0) * 0.25
        for i in range(200):
            # feature: the current state concatenated with the applied action
            robsta = numpy.array(resp_real.robot_state)
            sfeature = [
                robsta[0], robsta[1], robsta[2], robsta[3], robsta[4], robsta[5],
                action[0, 0], action[0, 1], action[0, 2]
            ]
            feature.append(sfeature)
            req = RobotActionRequest()
            req.reset = False
            req.action = action.reshape((3))
            resp_real = real_robot_action(req)
            # label: the state that results from applying the action
            label.append(numpy.array(resp_real.robot_state))
    return feature, label

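# The (feature, label) pairs collected above are meant to fit a dynamics model that
# predicts the next state from the current state and action. A minimal supervised
# training sketch with a small PyTorch MLP; the architecture, loss, and
# hyperparameters here are assumptions, not part of the original code.
import torch
import torch.nn as nn

def fit_dynamics_model_sketch(feature, label, epochs=200, lr=1e-3):
    x = torch.tensor(numpy.asarray(feature), dtype=torch.float32)  # (N, 9): state + action
    y = torch.tensor(numpy.asarray(label), dtype=torch.float32)    # (N, 6): next state
    model = nn.Sequential(nn.Linear(9, 64), nn.ReLU(),
                          nn.Linear(64, 64), nn.ReLU(),
                          nn.Linear(64, 6))
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    for _ in range(epochs):
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()
    return model
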
def get_new_state(self, action, k):
    req = RobotActionRequest()
    req.reset = True
    resp = self.robot_new_state(req)
    for i in range(self.num_steps):
        req = RobotActionRequest()
        req.reset = False
        req.action = action.reshape((3))
        resp = self.robot_new_state(req)
        self.train_data_matrix[k, i, 6:9] = action.reshape((3))
        self.train_data_matrix[k, i, 9:15] = resp.robot_state
        time.sleep(0.01)
    # the state reached at step i becomes the input state (columns 0:6) of step i + 1
    self.train_data_matrix[k, 1:self.num_steps, 0:6] = \
        self.train_data_matrix[k, 0:self.num_steps - 1, 9:15]

def train(self):
    for i in range(0, self.num_random_trains):
        req = RobotActionRequest()
        req.reset_robot = True
        episode_reward = 0
        req.reset_pole_angle = np.random.uniform(
            0.0, np.deg2rad(3.0)) * self.get_random_sign()
        response = self.cartpole_action_service(req)
        state = list(response.robot_state)
        global eps
        eps = max(eps * 0.9, 0.2)  # max(1 - episode_num/self.num_random_trains, 0.01)
        for k in range(self.num_steps):
            req = RobotActionRequest()
            req.reset_robot = False
            action = self.selection(state)
            req.action = action
            response = self.cartpole_action_service(req)
            state_next = list(response.robot_state)
            if abs(state_next[0]) > 1.2 or abs(state_next[1]) > (6 * np.pi / 180):
                reward = [0]
                state_next = [0, 0, 0, 0]
                self.memory.push([state, action, reward, state_next])
                break
            else:
                episode_reward += 1
                reward = [1]
                self.memory.push([state, action, reward, state_next])
            self.optimize_model()
            state = state_next
        print(episode_reward)
        if i % self.episodes_step == 0:
            self.target_net.load_state_dict(self.action_net.state_dict())
    return self.target_net

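# train() above relies on self.selection(state) to pick an action, but that method is
# not shown. A minimal epsilon-greedy sketch, assuming a discrete action set such as
# [10, -10] stored in self.action_space and a Q-network in self.action_net; both
# attribute names are assumptions rather than confirmed by the code above.
def selection(self, state):
    if np.random.uniform(0, 1) < eps:
        # explore: pick a random action from the discrete set
        return [np.random.choice(self.action_space)]
    with torch.no_grad():
        # exploit: pick the action with the highest predicted Q-value
        q_values = self.action_net(torch.tensor(state, dtype=torch.float32))
        return [self.action_space[int(q_values.argmax())]]
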
def single_test(self, action):
    req = RobotActionRequest()
    req.reset = True
    # send request to reset
    resp = self.get_action(req)
    for i in range(self.num_steps):
        req = RobotActionRequest()
        req.reset = False
        req.action = action
        # get training data
        self.features.append((np.append(resp.robot_state, req.action)).tolist())
        # send request to move real_robot
        resp = self.get_action(req)
        self.labels.append(resp.robot_state)

def test(self, test_interval_min, test_interval_max):
    # same as executive.py
    for episode in range(300):
        # Pick an initial state to start from
        req = RobotActionRequest()
        req.reset_robot = True
        req.reset_pole_angle = np.random.uniform(np.deg2rad(test_interval_min),
                                                 np.deg2rad(test_interval_max)) * self.get_random_sign()
        # print "Initial pole angle is", np.rad2deg(req.reset_pole_angle), "degrees"
        s_t = self.cartpole_action_service(req)
        final_reward = 0
        for n in range(0, self.steps_max):
            a_t = self.DQN.execute(s_t.robot_state)
            req = RobotActionRequest()
            req.action = [a_t]
            req.reset_robot = False
            s_t1 = self.cartpole_action_service(req)
            r_t1 = 1
            if self.state_out_of_bounds(s_t1.robot_state):
                r_t1 = -1
            self.DQN.store(s_t.robot_state, a_t, r_t1, s_t1.robot_state)
            if self.state_out_of_bounds(s_t1.robot_state):
                break
            final_reward += 1
            s_t = s_t1
        # store training data then learn from it
        self.DQN.learn()
        self.score.append(final_reward)
        if np.median(self.score[-21:]) > 195:
            # if the score is good, stop early
            break

def single_test(self, action):
    req = RobotActionRequest()
    req.reset = True
    resp = self.real_robot_action(req)
    resp = self.fake_robot_action(req)
    for i in range(self.num_steps):
        req = RobotActionRequest()
        req.reset = False
        req.action = action.reshape((3))
        resp_real = self.real_robot_action(req)
        message = RobotState()
        message.robot_name = str('real_robot')
        message.robot_state = resp_real.robot_state
        self.pub.publish(message)
        resp_fake = self.fake_robot_action(req)
        message = RobotState()
        message.robot_name = str('fake_robot')
        message.robot_state = resp_fake.robot_state
        self.pub.publish(message)
        # time.sleep(0.01)
    return self.score_state(resp_real.robot_state, resp_fake.robot_state)

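# single_test() above scores the fake robot against the real one via self.score_state,
# which is not shown. A minimal sketch that uses the Euclidean distance between the
# two final states; the metric used by the original score_state is an assumption.
def score_state(self, real_state, fake_state):
    real_state = np.asarray(real_state)
    fake_state = np.asarray(fake_state)
    # smaller distance -> the fake robot imitates the real robot more closely
    return np.linalg.norm(real_state - fake_state)
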
def perturb(self, action):
    req_real = RobotActionRequest()
    req_real.reset = True
    # send request to reset real_robot config
    resp_real = self.real_robot_action(req_real)
    collect_interval = 1  # how many steps we skip
    # apply a constant action
    for j in range(self.perturb_steps):
        # create a new request
        req_real = RobotActionRequest()
        req_real.reset = False
        req_real.action = action
        is_collected = j % collect_interval == 0
        if is_collected:
            # collect feature
            self.features.append(
                np.append(resp_real.robot_state, req_real.action).tolist())
        # send request to move real_robot
        resp_real = self.real_robot_action(req_real)
        if is_collected:
            # collect label
            self.labels.append(resp_real.robot_state)

def training_data(self):
    # get the data to train the fake robot
    n = 0
    for k in range(0, self.num_random_tests):
        # sample a random constant action, with a different range for each joint
        action = np.random.rand(1, 3)
        action[0, 0] = (2 * action[0, 0] - 1.0) * 1.0
        action[0, 1] = (2 * action[0, 1] - 1.0) * 0.5
        action[0, 2] = (2 * action[0, 2] - 1.0) * 0.25
        req_real = RobotActionRequest()
        req_real.reset = True
        resp = self.real_robot_action(req_real)
        self.data_fea[k * self.num_steps, 0:6] = np.array(resp.robot_state)
        print(k)
        for i in range(self.num_steps):
            req_real = RobotActionRequest()
            req_real.reset = False
            req_real.action = action.reshape((3))
            resp_real = self.real_robot_action(req_real)
            self.data_fea[n, 6:9] = req_real.action
            self.label[n] = resp_real.robot_state
            n += 1
            # the resulting state becomes the input state of the next row
            self.data_fea[n, 0:6] = resp_real.robot_state
    print('training finished')
    # drop the extra row written after the final step
    self.data_fea = np.delete(self.data_fea, -1, axis=0)

def __init__(self):
    self.cartpole_action_service = rospy.ServiceProxy('cartpole_robot', RobotAction)
    self.req = RobotActionRequest()

def reset_robot(self, state):
    req = RobotActionRequest()
    req.reset_robot = True
    req.reset_pole_angle = state[1, 0]
    response = self.cartpole_action_service(req)
    return response

def train(self):
    print('start training')
    start = time.time()
    done = False
    n_episodes = 800
    scores = deque(maxlen=20)
    batch_size = 64
    for e in range(n_episodes):
        # initial state
        # self.memory.clear()
        reward = 0
        # reset the robot for each episode
        req = RobotActionRequest()
        req.reset_robot = True
        # np.random.seed(0)
        req.reset_pole_angle = np.random.uniform(
            np.deg2rad(0.0), np.deg2rad(3.0)) * self.get_random_sign()
        # req.reset_pole_angle = np.deg2rad(2.0) * self.get_random_sign()
        state = self.probe_service(req).robot_state
        # print(state)
        i = 0
        done = False
        for j in range(1000):
            action, act_inx = self.act(state)
            # print('action', action, act_inx)
            req = RobotActionRequest()
            req.reset_robot = False
            req.action = [action]
            next_state = self.probe_service(req).robot_state
            done = self.state_out_of_bounds(next_state)
            reward = 10 if not done else -10
            # print(state, action, act_inx, reward, next_state, done)
            self.remember(state, action, act_inx, reward, next_state, done)
            # print(state, action, reward, next_state, done)
            state = next_state
            i = i + 1
            # scores.append(i)
            # mean = np.mean(scores)
            if done:
                episode_durations.append(i)
                # self.plot_durations()
                # print(('episode: {}/{}, score:{}, e:{:.2}').format(e, n_episodes, i, self.epsilon))
                break
        if e > 100:
            if np.mean(episode_durations[-30:]) > 250 or (time.time() - start) >= 290:
                self.plot_durations()
                # torch.save(self.model.state_dict(), './x.pth')
                return
        self.replay(batch_size)
        if e % 10 == 0:
            self.target_net.load_state_dict(self.model.state_dict())
    self.plot_durations()

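# train() above calls self.replay(batch_size) to run the DQN update, which is not shown.
# A minimal sketch of that update, assuming self.memory holds the
# (state, action, act_inx, reward, next_state, done) tuples pushed by self.remember,
# plus a discount factor in self.gamma and an optimizer in self.optimizer; those two
# names, and module-level imports of torch and random, are assumptions.
def replay(self, batch_size):
    if len(self.memory) < batch_size:
        return
    batch = random.sample(self.memory, batch_size)
    for state, action, act_inx, reward, next_state, done in batch:
        state_t = torch.tensor(state, dtype=torch.float32)
        q_values = self.model(state_t)
        # build the TD target only for the action that was taken
        target = q_values.clone().detach()
        if done:
            target[act_inx] = reward
        else:
            next_q = self.target_net(torch.tensor(next_state, dtype=torch.float32))
            target[act_inx] = reward + self.gamma * float(next_q.max())
        loss = torch.nn.functional.mse_loss(q_values, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
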