#!/usr/bin/env python
"""
Generate data by simulation
Zhiang Chen, Oct 30, 2017
MIT License
"""

import pickle

import numpy as np
import rospy

from simulator import Sim

rospy.init_node('simulator', anonymous=True)
sim = Sim()

actions = list()
poses = list()
# Sweep a 21 x 21 x 21 grid of actions and record the resulting poses.
for x in range(21):
    for y in range(21):
        for z in range(21):
            a = np.array([x, y, z]).astype(np.float32)
            p = sim.update_pose(a)
            actions.append(a)
            poses.append(p)

sim_data = {'actions': actions, 'poses': poses}
with open('./data/sim_data.p', 'wb') as f:
    pickle.dump(sim_data, f)
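
# A minimal loading sketch (assumption: the sweep above has already written
# ./data/sim_data.p). The pickle holds two parallel lists, one action per pose:
#
#   with open('./data/sim_data.p', 'rb') as f:
#       sim_data = pickle.load(f)
#   actions = np.array(sim_data['actions'])    # shape (21**3, 3) = (9261, 3)
#   poses = sim_data['poses']                  # outputs of Sim.update_pose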
# ---------------------------------------------------------------------------
# DDPG trainer (the training loop runs inside Trainer.__init__).
# Imports are reconstructed; the hyperparameters A_DIM, S_DIM, A_BOUND,
# GAMMA, VAR_INIT, GOT_GOAL, TRAIN_POINT, MAX_EPISODES, MAX_EP_STEPS, and
# X_OFFSET/Y_OFFSET/Z_OFFSET are referenced here but defined elsewhere.
# ---------------------------------------------------------------------------
from __future__ import print_function

import pickle

import matplotlib.pyplot as plt
import numpy as np
import rospy
from geometry_msgs.msg import Point
from sensor_msgs.msg import PointCloud as PC

from ddpg import DDPG  # assumed local module providing the DDPG agent
from simulator import Sim


class Trainer(object):
    def __init__(self):
        """ Initializing DDPG """
        self.sim = Sim()
        self.ddpg = DDPG(a_dim=A_DIM, s_dim=S_DIM, batch_size=64,
                         memory_capacity=100000, gamma=GAMMA, lr_a=0.001)  # gamma=0.98
        self.ep_reward = 0.0
        self.current_ep = 0
        self.current_step = 0
        self.current_action = np.array([.0, .0, .0])
        self.done = True  # if the episode is done
        self.var = VAR_INIT
        self.reward_record = list()
        self.ep_record = list()
        self.fig = plt.gcf()
        self.fig.show()
        self.fig.canvas.draw()
        print("Initialized DDPG")

        """ Setting communication """
        self.pc = PC()
        self.pc.header.frame_id = 'world'
        self.pub = rospy.Publisher('normalized_state', PC, queue_size=10)
        self.pub1 = rospy.Publisher('state', PC, queue_size=10)
        """
        self.sub = rospy.Subscriber('robot_pose', PA, self.callback, queue_size=1)
        self.pub = rospy.Publisher('normalized_state', PC, queue_size=10)
        rospy.wait_for_service('airpress_control', timeout=5)
        self.target_PS = PS()
        self.action_V3 = Vector3()
        self.state = PA()
        self.updated = False  # if s is updated
        self.got_callback1 = False
        self.got_callback2 = False
        print("Initialized communication")
        """

        """ Reading targets """
        """ The data should be w.r.t. the origin at the base position """
        self.ends = pickle.load(open('./data/targets.p', 'rb'))
        self.x_offset = X_OFFSET
        self.y_offset = Y_OFFSET
        self.z_offset = Z_OFFSET
        self.scaler = 1 / 0.3
        self.sample_target()
        print("Read target data")

        # self.ddpg.restore_momery()
        # self.ddpg.restore_model()

        while not rospy.is_shutdown():
            if self.current_ep < MAX_EPISODES:
                if self.current_step < MAX_EP_STEPS:
                    # rospy.sleep(0.5)
                    # State: end-effector position stacked on the target,
                    # normalized; the first three pose rows are dropped.
                    p = self.sim.current_pose
                    s = np.vstack((p, self.target))
                    s = s[3:, :]
                    s = self.normalize_state(s)
                    norm_a = self.ddpg.choose_action(s.reshape(6)[-S_DIM:])
                    # Gaussian exploration noise, clipped to the action range.
                    noise_a = np.random.normal(norm_a, self.var)
                    action = np.clip(noise_a, -1.0, 1.0)
                    self.current_action = action * A_BOUND + A_BOUND
                    try:
                        p_ = self.sim.update_pose(self.current_action)
                    except Exception:
                        # Dump the inputs that caused the failure, then re-raise.
                        print(self.ddpg.choose_action(s.reshape(6)[-S_DIM:]))
                        print(s.reshape(6)[-S_DIM:])
                        raise
                    s_ = np.vstack((p_, self.target))
                    s_ = s_[3:, :]
                    s_ = self.normalize_state(s_)
                    # Reward is based on the post-action end-effector position,
                    # matching the evaluation branch below.
                    self.compute_reward(s_[0, :], s_[1, :])
                    self.current_step += 1
                    if self.reward > GOT_GOAL:
                        # self.reward += 1.0
                        self.ep_reward += self.reward
                        if self.current_ep % 10 == 0:
                            self.reward_record.append(self.ep_reward / self.current_step)
                            self.ep_record.append(self.current_ep)
                            plt.plot(self.ep_record, self.reward_record)
                            plt.ylim([-1.5, 0.0])
                            self.fig.canvas.draw()
                            self.fig.savefig('learning.png')
                        print('\n')
                        print("Target Reached")
                        print("Normalized action:")
                        print(norm_a)
                        print("Noise action:")
                        print(action)
                        print("Output action:")
                        print(self.current_action)
                        print("Reward: %f" % self.reward)
                        print('Episode:', self.current_ep,
                              ' Reward: %f' % self.ep_reward,
                              'Explore: %.3f' % self.var)
                        print('*' * 40)
                        self.ddpg.save_model()
                        self.ddpg.save_memory()
                        self.done = True
                        self.current_step = 0
                        self.current_ep += 1
                        self.sample_target()
                        self.ep_reward = 0
                        # self.ddpg.save_memory()
                        # self.ddpg.save_model()
                        """
                        self.current_action = np.array([.0, .0, .0])
                        self.action_V3.x, self.action_V3.y, self.action_V3.z \
                            = self.current_action[0], self.current_action[1], self.current_action[2]
                        self.run_action(self.action_V3)
                        """
                    else:
                        self.ep_reward += self.reward
                        if self.current_step == MAX_EP_STEPS:
                            if self.current_ep % 10 == 0:
                                self.reward_record.append(self.ep_reward / self.current_step)
                                self.ep_record.append(self.current_ep)
                                plt.plot(self.ep_record, self.reward_record)
                                plt.ylim([-1.5, 0.0])
                                self.fig.canvas.draw()
                                self.fig.savefig('learning.png')
                            print('\n')
                            print("Target Failed")
                            print("Normalized action:")
                            print(norm_a)
                            print("Noise action:")
                            print(action)
                            print("Output action:")
                            print(self.current_action)
                            print("Reward: %f" % self.reward)
                            print('Episode:', self.current_ep,
                                  ' Reward: %f' % self.ep_reward,
                                  'Explore: %.3f' % self.var)
                            print('*' * 40)
                            self.ddpg.save_model()
                            self.ddpg.save_memory()
                            self.done = False
                            self.current_step = 0
                            self.current_ep += 1
                            self.sample_target()
                            self.ep_reward = 0
                            # self.ddpg.save_memory()
                            # self.ddpg.save_model()
                            """
                            self.current_action = np.array([.0, .0, .0])
                            self.action_V3.x, self.action_V3.y, self.action_V3.z \
                                = self.current_action[0], self.current_action[1], self.current_action[2]
                            self.run_action(self.action_V3)
                            """
                    self.ddpg.store_transition(s.reshape(6)[-S_DIM:], action,
                                               self.reward, s_.reshape(6)[-S_DIM:])
                    if self.ddpg.pointer > TRAIN_POINT:
                        # if (self.current_step % 10 == 0):
                        # self.var *= VAR_DECAY
                        # Linear decay of the exploration noise.
                        self.var = max(0.0, self.var - 1.02 / (MAX_EP_STEPS * MAX_EPISODES))
                        self.ddpg.learn()
                    self.pub_state(s_)
            else:
                # Evaluation after training: act greedily, no exploration noise.
                p = self.sim.current_pose
                s = np.vstack((p, self.target))
                s = s[3:, :]
                s = self.normalize_state(s)
                norm_a = self.ddpg.choose_action(s.reshape(6)[-S_DIM:])
                self.current_action = norm_a * A_BOUND + A_BOUND
                print("Normalized action:")
                print(norm_a)
                print("Current action:")
                print(self.current_action)
                p_ = self.sim.update_pose(self.current_action)
                s_ = np.vstack((p_, self.target))
                s_ = s_[3:, :]
                print("Distance: %f" % np.linalg.norm(s_[0, :] - s_[1, :]))
                s_ = self.normalize_state(s_)
                self.compute_reward(s_[0, :], s_[1, :])
                print('Explore: %.2f' % self.var)
                print("Reward: %f" % self.reward)
                rospy.sleep(1)
                print(self.ddpg.get_value(s.reshape(6)[-S_DIM:], norm_a,
                                          self.reward.reshape((-1, 1)),
                                          s_.reshape(6)[-S_DIM:]))
                print('\n')
                self.pub_state(s_)
                if self.reward > GOT_GOAL:
                    self.done = True
                    self.current_step = 0
                    self.current_ep += 1
                    self.sample_target()
                    print("Target Reached")
                    print("Episode %i Ended" % self.current_ep)
                    print("Reward: %f" % self.reward)
                    print('Episode:', self.current_ep,
                          ' Reward: %d' % self.ep_reward,
                          'Explore: %.2f' % self.var)
                    print('*' * 40)
                    self.ep_reward = 0
                    # self.ddpg.save_memory()
                    # self.ddpg.save_model()
                    """
                    self.current_action = np.array([.0, .0, .0])
                    self.action_V3.x, self.action_V3.y, self.action_V3.z \
                        = self.current_action[0], self.current_action[1], self.current_action[2]
                    self.run_action(self.action_V3)
                    """
                else:
                    self.done = False
                    self.current_step += 1
                    if self.current_step == 2:
                        print("Target Failed")
                        print("Reward: %f" % self.reward)
                        print("Episode %i Ends" % self.current_ep)
                        print('Episode:', self.current_ep,
                              ' Reward: %d' % self.ep_reward,
                              'Explore: %.2f' % self.var)
                        print('*' * 40)
                        self.current_step = 0
                        self.current_ep += 1
                        self.sample_target()
                        self.ep_reward = 0
                        # self.ddpg.save_memory()
                        # self.ddpg.save_model()
                        """
                        self.current_action = np.array([.0, .0, .0])
                        self.action_V3.x, self.action_V3.y, self.action_V3.z \
                            = self.current_action[0], self.current_action[1], self.current_action[2]
                        self.run_action(self.action_V3)
                        """

    """
    def callback(self, pa):
        n_px = (np.array([pa.poses[i].position.x for i in range(4)]) - self.x_offset) * self.scaler
        n_py = (np.array([pa.poses[i].position.y for i in range(4)]) - self.y_offset) * self.scaler
        n_pz = (np.array([pa.poses[i].position.z for i in range(4)]) - self.z_offset) * self.scaler
        self.end = np.array((n_px[3], n_py[3], n_pz[3]))
        self.n_t = (self.target - np.array([0, 0, Z_OFFSET])) * self.scaler
        self.s = np.concatenate((n_px, n_py, n_pz, self.n_t))
        self.pub_state(n_px, n_py, n_pz, self.n_t)
        self.updated = True
    """

    def sample_target(self):
        # Draw a random reachable end point from the target data set.
        self.target = self.ends[np.random.randint(self.ends.shape[0])]

    """
    def run_action(self, control):
        try:
            client = rospy.ServiceProxy('airpress_control', OneSeg)
            resp = client(control)
            return resp.status
        except rospy.ServiceException as e:
            print("Service call failed: %s" % e)
    """

    def compute_reward(self, end, target):
        # Dense reward: negative Euclidean distance between end and target.
        error = target - end
        self.reward = -np.linalg.norm(error)
        # print(np.linalg.norm(error))

    """
    def pub_state(self, n_px, n_py, n_pz, n_t):
        pts = list()
        for i in range(4):
            pt = Point()
            pt.x = n_px[i]
            pt.y = n_py[i]
            pt.z = n_pz[i]
            pts.append(pt)
        pt = Point()
        pt.x, pt.y, pt.z = n_t[0], n_t[1], n_t[2]
        pts.append(pt)
        self.pc.points = pts
        self.pub.publish(self.pc)
    """

    def pub_state(self, state):
        # Publish the normalized state as-is ...
        pts = list()
        for i in range(state.shape[0]):
            pt = Point()
            pt.x = state[i, 0]
            pt.y = state[i, 1]
            pt.z = state[i, 2]
            pts.append(pt)
        self.pc.points = pts
        self.pub.publish(self.pc)
        # ... and a denormalized copy in world coordinates.
        pts = list()
        for i in range(state.shape[0]):
            pt = Point()
            pt.x = state[i, 0] / 10.0
            pt.y = state[i, 1] / 10.0
            pt.z = state[i, 2] / 35.0 + 0.42
            pts.append(pt)
        self.pc.points = pts
        self.pub1.publish(self.pc)

    def normalize_state(self, state):
        offset = np.array([0, 0, 0.42])
        scaler = np.array([10, 10, 35])
        s = state - offset
        s = np.multiply(s, scaler)
        return s

    def calculate_dist(self, state):
        # Inverse of normalize_state, followed by the Euclidean norm.
        offset = np.array([0, 0, 0.42])
        scaler = np.array([10, 10, 35])
        s = np.multiply(state, 1.0 / scaler)
        s += offset
        return np.linalg.norm(s)
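
# Sanity-check sketch for the normalize_state / calculate_dist pair above
# (the sample point is made up; offsets and scalers are the ones in the code):
#
#   raw = np.array([0.05, -0.02, 0.45])                            # raw end point
#   norm = (raw - np.array([0, 0, 0.42])) * np.array([10, 10, 35])
#   # calculate_dist inverts the scaling and offset before taking the norm,
#   # so calculate_dist(norm) recovers np.linalg.norm(raw).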
# ---------------------------------------------------------------------------
# PFN trainer (the training loop runs inside Trainer.__init__).
# Imports are reconstructed; A_DIM, S_DIM, A_BOUND, MEMORY_NUM, TRAIN_POINT,
# MAX_EPISODES, MAX_EP_STEPS, and X/Y/Z_OFFSET are defined elsewhere.
# ---------------------------------------------------------------------------
from __future__ import print_function

import pickle

import matplotlib.pyplot as plt
import numpy as np
import rospy
from geometry_msgs.msg import Point
from sensor_msgs.msg import PointCloud as PC

from pfn import PFN  # assumed local module providing the PFN agent
from simulator import Sim


class Trainer(object):
    def __init__(self):
        """ Initializing PFN """
        self.sim = Sim()
        self.pfn = PFN(a_dim=A_DIM, s_dim=S_DIM, batch_size=8,
                       memory_capacity=MEMORY_NUM, lr=0.001, bound=A_BOUND)
        self.ep_reward = 0.0
        self.current_action = np.array([.0, .0, .0])
        self.done = True  # if the episode is done
        self.reward_record = list()
        self.ep_record = list()
        self.fig = plt.gcf()
        self.fig.show()
        self.fig.canvas.draw()
        print("Initialized PFN")

        """ Setting communication """
        self.pc = PC()
        self.pc.header.frame_id = 'world'
        self.pub = rospy.Publisher('normalized_state', PC, queue_size=10)
        self.pub1 = rospy.Publisher('state', PC, queue_size=10)

        """ Reading targets """
        """ The data should be w.r.t. the origin at the base position """
        self.ends = pickle.load(open('./data/targets.p', 'rb'))
        self.x_offset = X_OFFSET
        self.y_offset = Y_OFFSET
        self.z_offset = Z_OFFSET
        self.sample_target()
        print("Read target data")

        self.pfn.restore_momery()
        self.pfn.restore_model()

        # Episode memory rows: [target (3), action (3), reward (1), prob (1)];
        # -100 marks unused rows.
        memory_ep = np.ones((MAX_EP_STEPS, 3 + 3 + 1 + 1)) * -100
        self.current_ep = 0
        self.current_step = 0

        while not rospy.is_shutdown():
            if self.current_ep < MAX_EPISODES:
                if self.current_step < MAX_EP_STEPS:
                    # rospy.sleep(0.5)
                    s = self.normalize_state(self.target)
                    action, prob = self.pfn.choose_action(s)
                    s_ = self.sim.update_pose(action)[-1, :]
                    self.compute_reward(self.target, s_)
                    transition = np.hstack((self.target, action, self.reward, prob))
                    memory_ep[self.current_step] = transition
                    self.current_step += 1
                    state = np.array([s_, self.target])
                    self.pub_state(state)
                    if self.pfn.pointer > TRAIN_POINT:
                        self.pfn.learn()
                else:
                    # Episode finished: replay the best-rewarded action and a
                    # reward-weighted average action, then store both.
                    best_i = np.argsort(memory_ep[:, -2])[-1]
                    best_action = memory_ep[best_i, 3:6]
                    best_s_ = self.sim.update_pose(best_action)[-1, :]
                    best_distance = np.linalg.norm(self.target - best_s_)
                    """
                    # best action
                    print('*')
                    j = np.argsort(memory_ep[:, -2])[0]
                    print(memory_ep[best_i, :])
                    print(memory_ep[j, :])
                    print(best_s_)
                    print(self.target)
                    print(best_action)
                    print(best_distance)
                    """
                    self.current_step = 0
                    mean_action, act_var = self.pfn.choose_action2(s)
                    mean_s_ = self.sim.update_pose(mean_action)[-1, :]
                    mean_distance = np.linalg.norm(mean_s_ - self.target)
                    target_action, w = self.compute_weighted_action(memory_ep)
                    s_ = self.sim.update_pose(target_action)[-1, :]
                    target_distance = np.linalg.norm(s_ - self.target)
                    self.compute_reward(self.target, s_)
                    self.pfn.store_transition(s, target_action, mean_distance, np.var(w))
                    self.pfn.store_transition(s, best_action, best_distance, w[best_i])
                    self.current_ep += 1
                    self.sample_target()
                    memory_ep = np.ones((MAX_EP_STEPS, 3 + 3 + 1 + 1)) * -100
                    if self.current_ep % 10 == 0:
                        self.reward_record.append(mean_distance)
                        self.ep_record.append(self.current_ep)
                        plt.plot(self.ep_record, self.reward_record)
                        plt.ylim([0.0, 0.1])
                        self.fig.canvas.draw()
                        self.fig.savefig('learning.png')
                    print('\n')
                    print('Episode:', self.current_ep)
                    print("Mean Action:")
                    print(mean_action)
                    print("Mean Distance:")
                    print(mean_distance)
                    print("Action Variance:")
                    print(act_var)
                    print("Target Action:")
                    print(target_action)
                    print("Target Distance:")
                    print(target_distance)
                    print("Weights Variance:")
                    print(np.var(w))
                    print('*' * 40)
                    self.pfn.save_model()
                    self.pfn.save_memory()
            else:
                # Evaluation after training: act with the mean policy output.
                s = self.normalize_state(self.target)
                action, act_var = self.pfn.choose_action2(s)
                s_ = self.sim.update_pose(action)[-1, :]
                self.compute_reward(self.target, s_)
                print("Mean Action:")
                print(action)
                print("Action Variance:")
                print(act_var)
                print("Distance: %f" % np.linalg.norm(self.target - s_))
                print("Reward: %f" % self.reward)
                print('\n')
                state = np.array([s_, self.target])
                self.pub_state(state)
                rospy.sleep(1)
                self.sample_target()

    def compute_weighted_action(self, memory_ep):
        # Normalize the episode rewards into weights, then average the actions.
        total = np.sum(memory_ep[:, -2]) + 1e-6  # avoid division by zero
        w = memory_ep[:, -2] / total
        target_action = np.average(memory_ep[:, S_DIM:S_DIM + A_DIM], axis=0, weights=w)
        return target_action, w

    def sample_target(self):
        # Draw a random reachable end point from the target data set.
        self.target = self.ends[np.random.randint(self.ends.shape[0])]

    def compute_reward(self, end, target):
        error = target - end
        self.reward = 20 ** (-np.log2(2.5 * np.linalg.norm(error)))
        # print(np.linalg.norm(error))

    def pub_state(self, state):
        # Publish the normalized state as-is ...
        pts = list()
        for i in range(state.shape[0]):
            pt = Point()
            pt.x = state[i, 0]
            pt.y = state[i, 1]
            pt.z = state[i, 2]
            pts.append(pt)
        self.pc.points = pts
        self.pub.publish(self.pc)
        # ... and a denormalized copy in world coordinates.
        pts = list()
        for i in range(state.shape[0]):
            pt = Point()
            pt.x = state[i, 0] / 10.0
            pt.y = state[i, 1] / 10.0
            pt.z = state[i, 2] / 35.0 + 0.42
            pts.append(pt)
        self.pc.points = pts
        self.pub1.publish(self.pc)

    def normalize_state(self, state):
        offset = np.array([0, 0, 0.42])
        scaler = np.array([10, 10, 35])
        s = state - offset
        s = np.multiply(s, scaler)
        return s

    def calculate_dist(self, state):
        # Inverse of normalize_state, followed by the Euclidean norm.
        offset = np.array([0, 0, 0.42])
        scaler = np.array([10, 10, 35])
        s = np.multiply(state, 1.0 / scaler)
        s += offset
        return np.linalg.norm(s)
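
# Worked numbers for the shaped reward in compute_reward above:
# 20 ** (-log2(2.5 * d)) is algebraically (2.5 * d) ** (-log2(20)), so the
# reward rises steeply as the end-to-target distance d (meters) shrinks:
#
#   d = 0.40  ->  2.5 * d = 1.00  ->  reward = 20 ** 0 = 1
#   d = 0.20  ->  2.5 * d = 0.50  ->  reward = 20 ** 1 = 20
#   d = 0.10  ->  2.5 * d = 0.25  ->  reward = 20 ** 2 = 400
#
# It falls below 1 for d > 0.4 and diverges as d -> 0.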