def __init__(self):
    """ Initializing DDPG """
    self.ddpg = DDPG(a_dim=A_DIM, s_dim=S_DIM, batch_size=10, memory_capacity=500)
    self.ep_reward = 0.0
    self.current_ep = 0
    self.current_step = 0
    self.current_action = np.array([.0, .0, .0])
    self.done = True  # if the episode is done
    self.var = 5.0  # exploration noise scale
    print("Initialized DDPG")

    """ Setting communication """
    self.sub1 = rospy.Subscriber('robot_pose', PA, self.callback1, queue_size=1)
    self.sub2 = rospy.Subscriber('robot_pose', PA, self.callback2, queue_size=1)
    self.pub1 = rospy.Publisher('Robot_5/pose', PS, queue_size=10)
    self.pub2 = rospy.Publisher('normalized_state', PC, queue_size=10)
    rospy.wait_for_service('airpress_control', timeout=5)
    self.target_PS = PS()
    self.action_V3 = Vector3()
    self.pc = PC()
    self.pc.header.frame_id = 'world'
    self.state = PA()
    self.updated = True  # if s_ is updated
    self.got_callback1 = False
    self.got_callback2 = False
    print("Initialized communication")

    """ Reading targets """
    """ The data should be w.r.t. the origin at the base position """
    ends = pickle.load(open('ends.p', 'rb'))
    self.r_ends = ends['r_ends']
    self.rs_ends = ends['rs_ends']
    self.x_offset = X_OFFSET
    self.y_offset = Y_OFFSET
    self.z_offset = Z_OFFSET
    self.scaler = 1 / 0.3
    self.sample_target()
    print("Read target data")
    #self.ddpg.restore_momery()
    #self.ddpg.restore_model()
    """
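# A minimal sketch of the sample_target() helper called above; its definition is
# not in this file. It assumes self.r_ends is an (N, 3) array of recorded
# reachable end positions loaded from ends.p, and that a training target is
# simply a uniformly sampled row of it -- treat the details as assumptions.
def sample_target(self):
    """Draw a random recorded end position as the next goal (assumed)."""
    i = np.random.randint(self.r_ends.shape[0])
    self.target = self.r_ends[i, :].copy()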
def talker():
    pub1 = rospy.Publisher('state', PC, queue_size=10)
    rospy.init_node('state_pub', anonymous=True)
    rate = rospy.Rate(1)  # 1 Hz
    pc = PC()
    pc.header.frame_id = 'world'
    # Build a fixed vertical line of 10 points spaced 0.1 m apart along z
    pts = list()
    for i in range(10):
        point = Point()
        point.x = 0
        point.y = 0
        point.z = i * 0.1
        pts.append(point)
    print([x.z for x in pts])
    pc.points = pts
    while not rospy.is_shutdown():
        rospy.loginfo('Publishing state...')
        pub1.publish(pc)
        rate.sleep()
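# Standard rospy entry point for the talker node above; ROSInterruptException
# is raised if the node is shut down while rate.sleep() is blocking.
if __name__ == '__main__':
    try:
        talker()
    except rospy.ROSInterruptException:
        pass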
def __init__(self):
    self.pub = rospy.Publisher('state_sim', PC, queue_size=10)
    self.pc = PC()
    self.pc.header.frame_id = 'world'
    # Load recorded (action, state) pairs: 4 tracked points per sample plus one action
    a_s = pickle.load(open('./data/action_state_data', 'rb'))
    as_array = np.zeros((len(a_s), 5, 3))
    for i, ans in enumerate(a_s):
        as_array[i, 4, :] = ans['action']
        as_array[i, :4, :] = ans['state'][:, :3]
    # Same data with states shifted so the base point sits at the origin
    self.as_array_0 = np.zeros((len(a_s), 5, 3))
    for i, asn in enumerate(a_s):
        self.as_array_0[i, 4, :] = asn['action']
        self.as_array_0[i, :4, :] = asn['state'][:, :3] - asn['state'][0, :3]
        self.pub_state(self.as_array_0[i, :4, :])
        #print(self.as_array_0[i, :4, :].shape)
        #rospy.sleep(0.02)
    self.states = self.as_array_0[:, :4, :]
    self.actions = self.as_array_0[:, 4, :]
    self.current_pose = self.states[0, :, :]
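# A minimal sketch of the pub_state() helper used above; its definition is not
# in this file. It assumes the argument is an (N, 3) array of xyz positions to
# publish as the PointCloud held in self.pc, mirroring how talker() builds its
# point list.
def pub_state(self, points):
    """Publish an (N, 3) position array as a PointCloud message (assumed)."""
    pts = list()
    for row in points:
        p = Point()
        p.x, p.y, p.z = row[0], row[1], row[2]
        pts.append(p)
    self.pc.points = pts
    self.pc.header.stamp = rospy.Time.now()
    self.pub.publish(self.pc)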
def __init__(self):
    """ Initializing DDPG """
    self.sim = Sim()
    self.ddpg = DDPG(a_dim=A_DIM, s_dim=S_DIM, batch_size=64,
                     memory_capacity=100000, gamma=GAMMA, lr_a=0.001)  # gamma=0.98
    self.ep_reward = 0.0
    self.current_ep = 0
    self.current_step = 0
    self.current_action = np.array([.0, .0, .0])
    self.done = True  # if the episode is done
    self.var = VAR_INIT  # exploration noise scale
    self.reward_record = list()
    self.ep_record = list()
    self.fig = plt.gcf()
    self.fig.show()
    self.fig.canvas.draw()
    print("Initialized DDPG")

    """ Setting communication """
    self.pc = PC()
    self.pc.header.frame_id = 'world'
    self.pub = rospy.Publisher('normalized_state', PC, queue_size=10)
    self.pub1 = rospy.Publisher('state', PC, queue_size=10)
    """
    self.sub = rospy.Subscriber('robot_pose', PA, self.callback, queue_size=1)
    self.pub = rospy.Publisher('normalized_state', PC, queue_size=10)
    rospy.wait_for_service('airpress_control', timeout=5)
    self.target_PS = PS()
    self.action_V3 = Vector3()
    self.state = PA()
    self.updated = False  # if s is updated
    self.got_callback1 = False
    self.got_callback2 = False
    print("Initialized communication")
    """

    """ Reading targets """
    """ The data should be w.r.t. the origin at the base position """
    self.ends = pickle.load(open('./data/targets.p', 'rb'))
    self.x_offset = X_OFFSET
    self.y_offset = Y_OFFSET
    self.z_offset = Z_OFFSET
    self.scaler = 1 / 0.3
    self.sample_target()
    print("Read target data")
    #self.ddpg.restore_momery()
    #self.ddpg.restore_model()

    while not rospy.is_shutdown():
        if self.current_ep < MAX_EPISODES:
            if self.current_step < MAX_EP_STEPS:
                #rospy.sleep(0.5)
                # Build the state from the simulated pose and the current target
                p = self.sim.current_pose
                s = np.vstack((p, self.target))
                s = s[3:, :]
                s = self.normalize_state(s)
                # Choose an action, add Gaussian exploration noise, and rescale
                # from [-1, 1] to [0, 2 * A_BOUND]
                norm_a = self.ddpg.choose_action(s.reshape(6)[-S_DIM:])
                noise_a = np.random.normal(norm_a, self.var)
                action = np.clip(noise_a, -1.0, 1.0)
                self.current_action = action * A_BOUND + A_BOUND
                try:
                    p_ = self.sim.update_pose(self.current_action)
                except:
                    # Log the offending action and state if the simulator step fails
                    print(self.ddpg.choose_action(s.reshape(6)[-S_DIM:]))
                    print(s.reshape(6)[-S_DIM:])
                s_ = np.vstack((p_, self.target))
                s_ = s_[3:, :]
                s_ = self.normalize_state(s_)
                self.compute_reward(s[0, :], s_[1, :])
                self.current_step += 1
                if self.reward > GOT_GOAL:
                    #self.reward += 1.0
                    self.ep_reward += self.reward
                    if self.current_ep % 10 == 0:
                        self.reward_record.append(self.ep_reward / self.current_step)
                        self.ep_record.append(self.current_ep)
                        plt.plot(self.ep_record, self.reward_record)
                        plt.ylim([-1.5, 0.0])
                        self.fig.canvas.draw()
                        self.fig.savefig('learning.png')
                    print('\n')
                    print("Target Reached")
                    print("Normalized action:")
                    print(norm_a)
                    print("Noise action:")
                    print(action)
                    print("Output action:")
                    print(self.current_action)
                    print("Reward: %f" % self.reward)
                    print('Episode:', self.current_ep,
                          ' Reward: %f' % self.ep_reward,
                          'Explore: %.3f' % self.var)
                    print('*' * 40)
                    self.ddpg.save_model()
                    self.ddpg.save_memory()
                    self.done = True
                    self.current_step = 0
                    self.current_ep += 1
                    self.sample_target()
                    self.ep_reward = 0
                    #self.ddpg.save_memory()
                    #self.ddpg.save_model()
                    """
                    self.current_action = np.array([.0, .0, .0])
                    self.action_V3.x, self.action_V3.y, self.action_V3.z \
                        = self.current_action[0], self.current_action[1], self.current_action[2]
                    self.run_action(self.action_V3)
                    """
                else:
                    self.ep_reward += self.reward
                    if self.current_step == MAX_EP_STEPS:
                        if self.current_ep % 10 == 0:
                            self.reward_record.append(self.ep_reward / self.current_step)
                            self.ep_record.append(self.current_ep)
                            plt.plot(self.ep_record, self.reward_record)
                            plt.ylim([-1.5, 0.0])
                            self.fig.canvas.draw()
                            self.fig.savefig('learning.png')
                        print('\n')
                        print("Target Failed")
                        print("Normalized action:")
                        print(norm_a)
                        print("Noise action:")
                        print(action)
                        print("Output action:")
                        print(self.current_action)
                        print("Reward: %f" % self.reward)
                        print('Episode:', self.current_ep,
                              ' Reward: %f' % self.ep_reward,
                              'Explore: %.3f' % self.var)
                        print('*' * 40)
                        self.ddpg.save_model()
                        self.ddpg.save_memory()
                        self.done = False
                        self.current_step = 0
                        self.current_ep += 1
                        self.sample_target()
                        self.ep_reward = 0
                        #self.ddpg.save_memory()
                        #self.ddpg.save_model()
                        """
                        self.current_action = np.array([.0, .0, .0])
                        self.action_V3.x, self.action_V3.y, self.action_V3.z \
                            = self.current_action[0], self.current_action[1], self.current_action[2]
                        self.run_action(self.action_V3)
                        """
                self.ddpg.store_transition(s.reshape(6)[-S_DIM:], action,
                                           self.reward, s_.reshape(6)[-S_DIM:])
                if self.ddpg.pointer > TRAIN_POINT:
                    #if (self.current_step % 10 == 0):
                    #self.var *= VAR_DECAY
                    # Linearly anneal the exploration noise toward zero
                    self.var = max(0.0, self.var - 1.02 / (MAX_EP_STEPS * MAX_EPISODES))
                    self.ddpg.learn()
                self.pub_state(s_)
        else:
            # Training finished: evaluate the greedy policy without noise
            p = self.sim.current_pose
            s = np.vstack((p, self.target))
            s = s[3:, :]
            s = self.normalize_state(s)
            norm_a = self.ddpg.choose_action(s.reshape(6)[-S_DIM:])
            self.current_action = norm_a * A_BOUND + A_BOUND
            print("Normalized action:")
            print(norm_a)
            print("Current action:")
            print(self.current_action)
            p_ = self.sim.update_pose(self.current_action)
            s_ = np.vstack((p_, self.target))
            s_ = s_[3:, :]
            print("Distance: %f" % np.linalg.norm(s_[0, :] - s_[1, :]))
            s_ = self.normalize_state(s_)
            self.compute_reward(s_[0, :], s_[1, :])
            print('Explore: %.2f' % self.var)
            print("Reward: %f" % self.reward)
            rospy.sleep(1)
            print(self.ddpg.get_value(s.reshape(6)[-S_DIM:], norm_a,
                                      self.reward.reshape((-1, 1)),
                                      s_.reshape(6)[-S_DIM:]))
            print('\n')
            self.pub_state(s_)
            if self.reward > GOT_GOAL:
                self.done = True
                self.current_step = 0
                self.current_ep += 1
                self.sample_target()
                print("Target Reached")
                print("Episode %i Ended" % self.current_ep)
                print("Reward: %f" % self.reward)
                print('Episode:', self.current_ep,
                      ' Reward: %d' % self.ep_reward,
                      'Explore: %.2f' % self.var)
                print('*' * 40)
                self.ep_reward = 0
                #self.ddpg.save_memory()
                #self.ddpg.save_model()
                """
                self.current_action = np.array([.0, .0, .0])
                self.action_V3.x, self.action_V3.y, self.action_V3.z \
                    = self.current_action[0], self.current_action[1], self.current_action[2]
                self.run_action(self.action_V3)
                """
            else:
                self.done = False
                self.current_step += 1
                if self.current_step == 2:  # two attempts per evaluation target
                    print("Target Failed")
                    print("Reward: %f" % self.reward)
                    print("Episode %i Ends" % self.current_ep)
                    print('Episode:', self.current_ep,
                          ' Reward: %d' % self.ep_reward,
                          'Explore: %.2f' % self.var)
                    print('*' * 40)
                    self.current_step = 0
                    self.current_ep += 1
                    self.sample_target()
                    self.ep_reward = 0
                    #self.ddpg.save_memory()
                    #self.ddpg.save_model()
                    """
def __init__(self):
    """ Initializing PFN """
    self.sim = Sim()
    self.pfn = PFN(a_dim=A_DIM, s_dim=S_DIM, batch_size=8,
                   memory_capacity=MEMORY_NUM, lr=0.001, bound=A_BOUND)
    self.ep_reward = 0.0
    self.current_action = np.array([.0, .0, .0])
    self.done = True  # if the episode is done
    self.reward_record = list()
    self.ep_record = list()
    self.fig = plt.gcf()
    self.fig.show()
    self.fig.canvas.draw()
    print("Initialized PFN")

    """ Setting communication """
    self.pc = PC()
    self.pc.header.frame_id = 'world'
    self.pub = rospy.Publisher('normalized_state', PC, queue_size=10)
    self.pub1 = rospy.Publisher('state', PC, queue_size=10)
    self.sub = rospy.Subscriber('Robot_1/pose', PS, self.callback, queue_size=1)
    rospy.wait_for_service('airpress_control', timeout=5)
    self.target_PS = PS()
    self.action_V3 = Vector3()
    self.updated = False  # if s is updated
    self.got_target = False
    print("Initialized communication")

    """ Reading targets """
    """ The data should be w.r.t. the origin at the base position """
    self.ends = pickle.load(open('./data/targets.p', 'rb'))
    self.x_offset = X_OFFSET
    self.y_offset = Y_OFFSET
    self.z_offset = Z_OFFSET
    self.sample_target()
    print("Read target data")
    #self.pfn.restore_momery()
    self.pfn.restore_model('model_pfn')

    # Episode memory: one row of [target(3), action(3), reward(1), prob(1)] per step
    memory_ep = np.ones((MAX_EP_STEPS, 3 + 3 + 1 + 1)) * -100
    self.current_ep = 0
    self.current_step = 0
    while not rospy.is_shutdown():
        # Wait until the pose callback provides a fresh target
        self.updated = False
        while (not self.updated) and (not rospy.is_shutdown()):
            rospy.sleep(0.1)
        real_target = self.real_target.copy()
        s = self.normalize_state(real_target)
        action, act_var = self.pfn.choose_action2(s)
        self.action_V3.x, self.action_V3.y, self.action_V3.z \
            = action[0], action[1], action[2]
        self.run_action(self.action_V3)
        print('\n')
        #rospy.sleep(1.0)
    '''
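# A minimal sketch of normalize_state(), inferred from the X/Y/Z offsets and
# the scaler = 1/0.3 defined in the constructors above; the real definition is
# not shown, so both the shift and the scale are assumptions.
def normalize_state(self, state):
    """Shift by the base offsets, then scale to a unit-ish range (assumed)."""
    offset = np.array([self.x_offset, self.y_offset, self.z_offset])
    return (state - offset) * self.scaler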
def __init__(self):
    """ Initializing PFN """
    self.sim = Sim()
    self.pfn = PFN(a_dim=A_DIM, s_dim=S_DIM, batch_size=8,
                   memory_capacity=MEMORY_NUM, lr=0.001, bound=A_BOUND)
    self.ep_reward = 0.0
    self.current_action = np.array([.0, .0, .0])
    self.done = True  # if the episode is done
    self.reward_record = list()
    self.ep_record = list()
    self.fig = plt.gcf()
    self.fig.show()
    self.fig.canvas.draw()
    print("Initialized PFN")

    """ Setting communication """
    self.pc = PC()
    self.pc.header.frame_id = 'world'
    self.pub = rospy.Publisher('normalized_state', PC, queue_size=10)
    self.pub1 = rospy.Publisher('state', PC, queue_size=10)

    """ Reading targets """
    """ The data should be w.r.t. the origin at the base position """
    self.ends = pickle.load(open('./data/targets.p', 'rb'))
    self.x_offset = X_OFFSET
    self.y_offset = Y_OFFSET
    self.z_offset = Z_OFFSET
    self.sample_target()
    print("Read target data")
    self.pfn.restore_momery()
    self.pfn.restore_model()

    # Episode memory: one row of [target(3), action(3), reward(1), prob(1)] per step
    memory_ep = np.ones((MAX_EP_STEPS, 3 + 3 + 1 + 1)) * -100
    self.current_ep = 0
    self.current_step = 0
    while not rospy.is_shutdown():
        if self.current_ep < MAX_EPISODES:
            if self.current_step < MAX_EP_STEPS:
                #rospy.sleep(0.5)
                s = self.normalize_state(self.target)
                #print('x')
                #print(s)
                # Sample an action from the policy and roll it out in simulation
                action, prob = self.pfn.choose_action(s)
                s_ = self.sim.update_pose(action)[-1, :]
                self.compute_reward(self.target, s_)
                #print(action)
                #print(self.pfn.raw_action(s))
                #print(self.target)
                #print(s_)
                #print(np.linalg.norm(self.target - s_))
                #print(self.reward)
                transition = np.hstack((self.target, action, self.reward, prob))
                memory_ep[self.current_step] = transition
                self.current_step += 1
                state = np.array([s_, self.target])
                self.pub_state(state)
                if self.pfn.pointer > TRAIN_POINT:
                    self.pfn.learn()
            else:
                # Episode finished: replay the episode's best action and a
                # reward-weighted average action, then store both as training data
                best_i = np.argsort(memory_ep[:, -2])[-1]
                best_action = memory_ep[best_i, 3:6]
                best_s_ = self.sim.update_pose(best_action)[-1, :]
                best_distance = np.linalg.norm(self.target - best_s_)
                """
                # best action
                print('*')
                j = np.argsort(memory_ep[:, -2])[0]
                print(memory_ep[best_i, :])
                print(memory_ep[j, :])
                print(best_s_)
                print(self.target)
                print(best_action)
                print(best_distance)
                """
                self.current_step = 0
                mean_action, act_var = self.pfn.choose_action2(s)
                mean_s_ = self.sim.update_pose(mean_action)[-1, :]
                mean_distance = np.linalg.norm(mean_s_ - self.target)
                target_action, w = self.compute_weighted_action(memory_ep)
                s_ = self.sim.update_pose(target_action)[-1, :]
                target_distance = np.linalg.norm(s_ - self.target)
                self.compute_reward(self.target, s_)
                self.pfn.store_transition(s, target_action, mean_distance, np.var(w))
                self.pfn.store_transition(s, best_action, best_distance, w[best_i])
                self.current_ep += 1
                self.sample_target()
                memory_ep = np.ones((MAX_EP_STEPS, 3 + 3 + 1 + 1)) * -100
                if self.current_ep % 10 == 0:
                    self.reward_record.append(mean_distance)
                    self.ep_record.append(self.current_ep)
                    plt.plot(self.ep_record, self.reward_record)
                    plt.ylim([0.0, 0.1])
                    self.fig.canvas.draw()
                    self.fig.savefig('learning.png')
                print('\n')
                #print(s)
                print('Episode:', self.current_ep)
                print("Mean Action:")
                print(mean_action)
                print("Mean Distance:")
                print(mean_distance)
                print("Action Variance:")
                print(act_var)
                print("Target Action:")
                print(target_action)
                print("Target Distance:")
                print(target_distance)
                print("Weights Variance:")
                print(np.var(w))
                print('*' * 40)
                self.pfn.save_model()
                self.pfn.save_memory()
        else:
            # Training finished: evaluate the mean policy on random targets
            s = self.normalize_state(self.target)
            action, act_var = self.pfn.choose_action2(s)
            s_ = self.sim.update_pose(action)[-1, :]
            self.compute_reward(self.target, s_)
            print("Mean Action:")
            print(action)
            print("Action Variance:")
            print(act_var)
            print("Distance: %f" % np.linalg.norm(self.target - s_))
            print("Reward: %f" % self.reward)
            print('\n')
            state = np.array([s_, self.target])
            self.pub_state(state)
            rospy.sleep(1)
            self.sample_target()
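# A minimal sketch of compute_weighted_action() used in the episode-end branch
# above; its definition is not in this file. Each memory_ep row is
# [target(3), action(3), reward(1), prob(1)], so this assumes the per-step
# rewards in column -2 become normalized weights over the stored actions; the
# softmax form here is an assumption.
def compute_weighted_action(self, memory_ep):
    """Reward-weighted average of the episode's actions (assumed)."""
    rewards = memory_ep[:, -2]
    w = np.exp(rewards - np.max(rewards))  # softmax weights over steps
    w = w / np.sum(w)
    target_action = np.dot(w, memory_ep[:, 3:6])
    return target_action, w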
def __init__(self):
    """ Initializing DDPG """
    self.ddpg = DDPG(a_dim=A_DIM, s_dim=S_DIM, batch_size=10, memory_capacity=500)
    self.ep_reward = 0.0
    self.current_ep = 0
    self.current_step = 0
    self.current_action = np.array([.0, .0, .0])
    self.done = True  # if the episode is done
    self.var = 5.0  # exploration noise scale
    print("Initialized DDPG")

    """ Setting communication """
    self.sub = rospy.Subscriber('robot_pose', PA, self.callback, queue_size=1)
    self.pub = rospy.Publisher('normalized_state', PC, queue_size=10)
    rospy.wait_for_service('airpress_control', timeout=5)
    self.target_PS = PS()
    self.action_V3 = Vector3()
    self.pc = PC()
    self.pc.header.frame_id = 'world'
    self.state = PA()
    self.updated = False  # if s is updated
    self.got_callback1 = False
    self.got_callback2 = False
    print("Initialized communication")

    """ Reading targets """
    """ The data should be w.r.t. the origin at the base position """
    self.ends = pickle.load(open('./data/ends.p', 'rb'))
    self.x_offset = X_OFFSET
    self.y_offset = Y_OFFSET
    self.z_offset = Z_OFFSET
    self.scaler = 1 / 0.3
    self.sample_target()
    print("Read target data")
    #self.ddpg.restore_momery()
    #self.ddpg.restore_model()

    while not rospy.is_shutdown():
        if self.current_ep < MAX_EPISODES:
            if self.current_step < MAX_EP_STEPS:
                # Wait until the pose callback provides a fresh state
                while (not self.updated) and (not rospy.is_shutdown()):
                    rospy.sleep(0.1)
                s = self.s.copy()
                # The policy outputs a pressure increment, not an absolute command
                delta_a = self.ddpg.choose_action(s) * A_BOUND
                print(delta_a / A_BOUND)
                print("Delta action:")
                print(delta_a)
                delta_a = np.random.normal(delta_a, self.var)  # exploration noise
                print("Noise delta action: ")
                print(delta_a)
                self.current_action += delta_a
                self.current_action = np.clip(self.current_action, 0, 25)
                self.action_V3.x, self.action_V3.y, self.action_V3.z \
                    = self.current_action[0], self.current_action[1], self.current_action[2]
                self.run_action(self.action_V3)
                rospy.sleep(2.5)  # give the pressure time to settle
                print("Current action:")
                print(self.current_action)
                self.updated = False
                while (not self.updated) and (not rospy.is_shutdown()):
                    rospy.sleep(0.1)
                s_ = self.s.copy()
                self.compute_reward(self.end, self.n_t)
                action = delta_a / A_BOUND
                print(action)
                self.ddpg.store_transition(s, action, self.reward, s_)
                #print("Experience stored")
                if self.ddpg.pointer > TRAIN_POINT:
                    if self.current_step % 2 == 0:
                        self.var *= 0.992  # decay exploration noise
                    self.ddpg.learn()
                self.current_step += 1
                self.ep_reward += self.reward
                if self.reward > GOT_GOAL:
                    self.done = True
                    self.current_step = 0
                    self.current_ep += 1
                    self.sample_target()
                    print("Target Reached")
                    print("Episode %i Ended" % self.current_ep)
                    print('Episode:', self.current_ep,
                          ' Reward: %d' % self.ep_reward,
                          'Explore: %.2f' % self.var)
                    print('*' * 40)
                    self.ep_reward = 0
                    self.ddpg.save_memory()
                    self.ddpg.save_model()
                    """
                    self.current_action = np.array([.0, .0, .0])
                    self.action_V3.x, self.action_V3.y, self.action_V3.z \
                        = self.current_action[0], self.current_action[1], self.current_action[2]
                    self.run_action(self.action_V3)
                    """
                else:
                    self.done = False
                    if self.current_step == MAX_EP_STEPS:
                        print("Target Failed")
                        print("Episode %i Ends" % self.current_ep)
                        print('Episode:', self.current_ep,
                              ' Reward: %d' % self.ep_reward,
                              'Explore: %.2f' % self.var)
                        print('*' * 40)
                        self.current_step = 0
                        self.current_ep += 1
                        self.sample_target()
                        self.ep_reward = 0
                        self.ddpg.save_memory()
                        self.ddpg.save_model()
                        """
                        self.current_action = np.array([.0, .0, .0])
                        self.action_V3.x, self.action_V3.y, self.action_V3.z \
                            = self.current_action[0], self.current_action[1], self.current_action[2]
                        self.run_action(self.action_V3)
                        """
                self.updated = False
                print('\n')
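# A minimal sketch of run_action(), which forwards the commanded pressures to
# the 'airpress_control' service the constructors wait for. Only the service
# name appears in these files; the service class (AirpressControl here) and its
# request signature are assumptions.
def run_action(self, action_v3):
    """Send a Vector3 of chamber pressures to the pressure controller (assumed)."""
    try:
        control = rospy.ServiceProxy('airpress_control', AirpressControl)
        control(action_v3)
    except rospy.ServiceException as e:
        rospy.logwarn('airpress_control call failed: %s' % e)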