Example #1
#!/usr/bin/env python
"""
generate data by simulation
Zhiang Chen, Oct 30, 2017

MIT License
"""
import pickle
import numpy as np
import rospy
from simulator import Sim

rospy.init_node('simulator', anonymous=True)
sim = Sim()
actions = list()
poses = list()
# Sweep a 21x21x21 grid of integer actions and record the resulting poses.
for x in range(21):
    for y in range(21):
        for z in range(21):
            a = np.array([x,y,z]).astype(np.float32)
            p = sim.update_pose(a)
            actions.append(a)
            poses.append(p)

sim_data = {'actions': actions, 'poses': poses}
with open("./data/sim_data.p", "wb") as f:
    pickle.dump(sim_data, f)
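
The pickled file can be read back with the same library. Below is a minimal sketch of loading ./data/sim_data.p and turning the recorded lists into arrays; it assumes only the 'actions'/'poses' keys written above.

import pickle

import numpy as np

# Load the dictionary written by the script above.
with open("./data/sim_data.p", "rb") as f:
    sim_data = pickle.load(f)

actions = np.asarray(sim_data['actions'])   # 21*21*21 actions, each a 3-vector
poses = sim_data['poses']                   # one pose record per action (shape depends on Sim)

print(actions.shape, len(poses))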
Example #2
class Trainer(object):
    def __init__(self):
        """ Initializing DDPG """
        self.sim = Sim()
        self.ddpg = DDPG(a_dim=A_DIM, s_dim=S_DIM, batch_size=64, memory_capacity=100000, gamma=GAMMA, lr_a=0.001) #gamma=0.98
        self.ep_reward = 0.0
        self.current_ep = 0
        self.current_step = 0
        self.current_action = np.array([.0, .0, .0])
        self.done = True # if the episode is done
        self.var = VAR_INIT
        self.reward_record = list()
        self.ep_record = list()
        self.fig = plt.gcf()
        self.fig.show()
        self.fig.canvas.draw()
        print("Initialized DDPG")

        """ Setting communication"""
        self.pc = PC()
        self.pc.header.frame_id = 'world'
        self.pub = rospy.Publisher('normalized_state', PC, queue_size=10)
        self.pub1 = rospy.Publisher('state', PC, queue_size=10)
        """
        self.sub = rospy.Subscriber('robot_pose', PA, self.callback, queue_size=1)
        self.pub = rospy.Publisher('normalized_state', PC, queue_size=10)
        rospy.wait_for_service('airpress_control', timeout=5)
        self.target_PS = PS()
        self.action_V3 = Vector3()
        self.state = PA()
        self.updated = False # if s is updated
        self.got_callback1 = False
        self.got_callback2 = False
        print("Initialized communication")
        """

        """ Reading targets """
        """ The data should be w.r.t origin by base position """
        self.ends = pickle.load(open('./data/targets.p', 'rb'))
        self.x_offset = X_OFFSET
        self.y_offset = Y_OFFSET
        self.z_offset = Z_OFFSET
        self.scaler = 1/0.3
        self.sample_target()
        print("Read target data")

        #self.ddpg.restore_momery()
        #self.ddpg.restore_model()

        while not (rospy.is_shutdown()):
            if self.current_ep < MAX_EPISODES:
                if self.current_step < MAX_EP_STEPS:
                    #rospy.sleep(0.5)
                    p = self.sim.current_pose
                    s = np.vstack((p, self.target))
                    s = s[3:,:]
                    s = self.normalize_state(s)
                    norm_a = self.ddpg.choose_action(s.reshape(6)[-S_DIM:])
                    noise_a = np.random.normal(norm_a, self.var)
                    action = np.clip(noise_a, -1.0, 1.0)
                    self.current_action = action*A_BOUND + A_BOUND
                    try:
                        p_ = self.sim.update_pose(self.current_action)
                    except Exception:
                        print(self.ddpg.choose_action(s.reshape(6)[-S_DIM:]))
                        print(s.reshape(6)[-S_DIM:])
                        raise  # p_ is required below, so re-raise after logging
                    s_ = np.vstack((p_, self.target))

                    s_ = s_[3:,:]
                    s_ = self.normalize_state(s_)
                    self.compute_reward(s_[0, :], s_[1, :])

                    self.current_step += 1

                    if self.reward > GOT_GOAL:
                        #self.reward += 1.0
                        self.ep_reward += self.reward
                        if self.current_ep % 10 == 0:
                            self.reward_record.append(self.ep_reward / self.current_step)
                            self.ep_record.append(self.current_ep)
                            plt.plot(self.ep_record, self.reward_record)
                            plt.ylim([-1.5,0.0])
                            self.fig.canvas.draw()
                            self.fig.savefig('learning.png')
                            print('\n')
                            print "Target Reached"
                            print("Normalized action:")
                            print norm_a
                            print("Noise action:")
                            print action
                            print("Output action:")
                            print self.current_action
                            print("Reward: %f" % self.reward)
                            print('Episode:', self.current_ep, ' Reward: %f' % self.ep_reward, 'Explore: %.3f' % self.var,)
                            print('*' * 40)
                            self.ddpg.save_model()
                            self.ddpg.save_memory()
                        self.done = True
                        self.current_step = 0
                        self.current_ep += 1
                        self.sample_target()
                        self.ep_reward = 0
                        #self.ddpg.save_memory()
                        #self.ddpg.save_model()
                        """
                        self.current_action = np.array([.0, .0, .0])
                        self.action_V3.x, self.action_V3.y, self.action_V3.z \
                            = self.current_action[0], self.current_action[1], self.current_action[2]
                        self.run_action(self.action_V3)
                        """

                    else:
                        self.ep_reward += self.reward
                        if self.current_step == MAX_EP_STEPS:
                            if self.current_ep % 10 == 0:
                                self.reward_record.append(self.ep_reward / self.current_step)
                                self.ep_record.append(self.current_ep)
                                plt.plot(self.ep_record, self.reward_record)
                                plt.ylim([-1.5, 0.0])
                                self.fig.canvas.draw()
                                self.fig.savefig('learning.png')
                                print('\n')
                                print "Target Failed"
                                print("Normalized action:")
                                print norm_a
                                print("Noise action:")
                                print action
                                print("Output action:")
                                print self.current_action
                                print("Reward: %f" % self.reward)
                                print('Episode:', self.current_ep, ' Reward: %f' % self.ep_reward, 'Explore: %.3f' % self.var,)
                                print('*' * 40)
                                self.ddpg.save_model()
                                self.ddpg.save_memory()
                            self.done = False
                            self.current_step = 0
                            self.current_ep += 1
                            self.sample_target()
                            self.ep_reward = 0
                            #self.ddpg.save_memory()
                            #self.ddpg.save_model()
                            """
                            self.current_action = np.array([.0, .0, .0])
                            self.action_V3.x, self.action_V3.y, self.action_V3.z \
                                = self.current_action[0], self.current_action[1], self.current_action[2]
                            self.run_action(self.action_V3)
                            """
                    self.ddpg.store_transition(s.reshape(6)[-S_DIM:], action, self.reward, s_.reshape(6)[-S_DIM:])
                    if self.ddpg.pointer > TRAIN_POINT:
                        #if (self.current_step % 10 == 0):
                        #self.var *= VAR_DECAY
                        self.var = max(0.0, self.var - 1.02 / (MAX_EP_STEPS * MAX_EPISODES))
                        self.ddpg.learn()
                    self.pub_state(s_)

            else:
                p = self.sim.current_pose
                s = np.vstack((p, self.target))
                s = s[3:,:]
                s = self.normalize_state(s)
                norm_a = self.ddpg.choose_action(s.reshape(6)[-S_DIM:])
                self.current_action = norm_a * A_BOUND + A_BOUND
                print("Normalized action:")
                print norm_a
                print("Current action:")
                print self.current_action

                p_ = self.sim.update_pose(self.current_action)
                s_ = np.vstack((p_, self.target))
                s_ = s_[3:,:]
                print("Distance: %f" % np.linalg.norm(s_[0,:]-s_[1,:]))
                s_ = self.normalize_state(s_)

                self.compute_reward(s_[0, :], s_[1, :])
                print('Explore: %.2f' % self.var)
                print("Reward: %f" % self.reward)
                rospy.sleep(1)
                #print
                print(self.ddpg.get_value(s.reshape(6)[-S_DIM:], norm_a, self.reward.reshape((-1, 1)), s_.reshape(6)[-S_DIM:]))
                print('\n')
                self.pub_state(s_)

                if self.reward > GOT_GOAL:
                    self.done = True
                    self.current_step = 0
                    self.current_ep += 1
                    self.sample_target()
                    print "Target Reached"
                    print("Episode %i Ended" % self.current_ep)
                    print("Reward: %f" % self.reward)
                    print('Episode:', self.current_ep, ' Reward: %d' % self.ep_reward, 'Explore: %.2f' % self.var,)
                    print('*' * 40)
                    self.ep_reward = 0
                    # self.ddpg.save_memory()
                    # self.ddpg.save_model()
                    """
                    self.current_action = np.array([.0, .0, .0])
                    self.action_V3.x, self.action_V3.y, self.action_V3.z \
                        = self.current_action[0], self.current_action[1], self.current_action[2]
                    self.run_action(self.action_V3)
                    """

                else:
                    self.done = False
                    self.current_step += 1
                    if self.current_step == 2:
                        print "Target Failed"
                        print("Reward: %f" % self.reward)
                        print("Episode %i Ends" % self.current_ep)
                        print(
                            'Episode:', self.current_ep, ' Reward: %d' % self.ep_reward, 'Explore: %.2f' % self.var,)
                        print('*' * 40)
                        self.current_step = 0
                        self.current_ep += 1
                        self.sample_target()
                        self.ep_reward = 0
                        # self.ddpg.save_memory()
                        # self.ddpg.save_model()
                        """
                        self.current_action = np.array([.0, .0, .0])
                        self.action_V3.x, self.action_V3.y, self.action_V3.z \
                            = self.current_action[0], self.current_action[1], self.current_action[2]
                        self.run_action(self.action_V3)
                        """


    """
    def callback(self, pa):
        n_px = (np.array([pa.poses[i].position.x for i in range(4)]) - self.x_offset)*self.scaler
        n_py = (np.array([pa.poses[i].position.y for i in range(4)]) - self.y_offset)*self.scaler
        n_pz = (np.array([pa.poses[i].position.z for i in range(4)]) - self.z_offset)*self.scaler
        self.end = np.array((n_px[3], n_py[3], n_pz[3]))
        self.n_t = (self.target - np.array([0,0,Z_OFFSET]))*self.scaler
        self.s = np.concatenate((n_px, n_py, n_pz, self.n_t))
        self.pub_state(n_px, n_py, n_pz, self.n_t)
        self.updated = True
    """


    def sample_target(self):
        self.target = self.ends[np.random.randint(self.ends.shape[0])]
    """
    def run_action(self,control):
        try:
            client = rospy.ServiceProxy('airpress_control', OneSeg)
            resp = client(control)
            return resp.status
        except rospy.ServiceException, e:
            print "Service call failed: %s"%e
    """

    def compute_reward(self, end, target):
        error = target - end
        # Reward is the negative Euclidean distance between the end pose and the target.
        self.reward = -np.linalg.norm(error)
        #print np.linalg.norm(error)

    """
    def pub_state(self, n_px, n_py, n_pz, n_t):
        pts = list()
        for i in range(4):
            pt = Point()
            pt.x = n_px[i]
            pt.y = n_py[i]
            pt.z = n_pz[i]
            pts.append(pt)
        pt = Point()
        pt.x, pt.y, pt.z = n_t[0], n_t[1], n_t[2]
        pts.append(pt)
        self.pc.points = pts
        self.pub.publish(self.pc)
    """
    def pub_state(self, state):
        # Publish the normalized state as a point cloud, then republish the
        # de-normalized (world-frame) points on the second topic.
        pts = list()
        for i in range(state.shape[0]):
            pt = Point()
            pt.x = state[i,0]
            pt.y = state[i,1]
            pt.z = state[i,2]
            pts.append(pt)
        self.pc.points = pts
        self.pub.publish(self.pc)
        pts = list()
        for i in range(state.shape[0]):
            pt = Point()
            pt.x = state[i, 0] / 10.0
            pt.y = state[i, 1] / 10.0
            pt.z = state[i, 2] / 35.0 + 0.42
            pts.append(pt)
        self.pc.points = pts
        self.pub1.publish(self.pc)


    def normalize_state(self,state):
        offset = np.array([0,0,0.42])
        scaler = np.array([10,10,35])
        s = state - offset
        s = np.multiply(s, scaler)
        return s

    def calculate_dist(self, state):
        offset = np.array([0, 0, 0.42])
        scaler = np.array([10, 10, 35])
        s = np.multiply(state,1.0/scaler)
        s += offset
        return np.linalg.norm(s)
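
Outside of ROS, the mapping applied by normalize_state and the de-normalization used for the second publisher in pub_state can be checked in isolation. The sketch below uses a made-up pose purely for illustration.

import numpy as np

OFFSET = np.array([0.0, 0.0, 0.42])
SCALER = np.array([10.0, 10.0, 35.0])

def normalize(p):
    # Element-wise mapping used by Trainer.normalize_state.
    return (p - OFFSET) * SCALER

def denormalize(s):
    # Inverse mapping, matching the second loop in Trainer.pub_state.
    return s / SCALER + OFFSET

p = np.array([0.05, -0.03, 0.45])   # hypothetical end position (metres)
s = normalize(p)
print(s)                # normalized state fed to the network
print(denormalize(s))   # recovers the original pose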
Example #3
class Trainer(object):
    def __init__(self):
        """ Initializing DDPG """
        self.sim = Sim()
        self.pfn = PFN(a_dim=A_DIM,
                       s_dim=S_DIM,
                       batch_size=8,
                       memory_capacity=MEMORY_NUM,
                       lr=0.001,
                       bound=A_BOUND)
        self.ep_reward = 0.0
        self.current_action = np.array([.0, .0, .0])
        self.done = True  # if the episode is done
        self.reward_record = list()
        self.ep_record = list()
        self.fig = plt.gcf()
        self.fig.show()
        self.fig.canvas.draw()
        print("Initialized PFN")
        """ Setting communication"""
        self.pc = PC()
        self.pc.header.frame_id = 'world'
        self.pub = rospy.Publisher('normalized_state', PC, queue_size=10)
        self.pub1 = rospy.Publisher('state', PC, queue_size=10)
        """ Reading targets """
        """ The data should be w.r.t origin by base position """
        self.ends = pickle.load(open('./data/targets.p', 'rb'))
        self.x_offset = X_OFFSET
        self.y_offset = Y_OFFSET
        self.z_offset = Z_OFFSET
        self.sample_target()
        print("Read target data")

        self.pfn.restore_momery()
        self.pfn.restore_model()

        # Episode buffer; columns: target (3), action (3), reward (1), action probability (1).
        memory_ep = np.ones((MAX_EP_STEPS, 3 + 3 + 1 + 1)) * -100
        self.current_ep = 0
        self.current_step = 0
        while not (rospy.is_shutdown()):
            if self.current_ep < MAX_EPISODES:
                if self.current_step < MAX_EP_STEPS:
                    #rospy.sleep(0.5)
                    s = self.normalize_state(self.target)
                    #print 'x'
                    #print s
                    action, prob = self.pfn.choose_action(s)
                    s_ = self.sim.update_pose(action)[-1, :]
                    self.compute_reward(self.target, s_)

                    #print action
                    #print self.pfn.raw_action(s)
                    #print self.target
                    #print s_
                    #print np.linalg.norm(self.target - s_)
                    #print self.reward
                    transition = np.hstack(
                        (self.target, action, self.reward, prob))
                    memory_ep[self.current_step] = transition
                    self.current_step += 1

                    state = np.array([s_, self.target])
                    self.pub_state(state)

                    if self.pfn.pointer > TRAIN_POINT:
                        self.pfn.learn()

                else:

                    best_i = np.argsort(memory_ep[:, -2])[-1]
                    best_action = memory_ep[best_i, 3:6]
                    best_s_ = self.sim.update_pose(best_action)[-1, :]
                    best_distance = np.linalg.norm(self.target - best_s_)
                    """ #best action
                    print '*'
                    j = np.argsort(memory_ep[:,-2])[0]
                    print memory_ep[best_i, :]
                    print memory_ep[j, :]
                    print best_s_
                    print self.target
                    print best_action
                    print best_distance
                    """

                    self.current_step = 0
                    mean_action, act_var = self.pfn.choose_action2(s)
                    mean_s_ = self.sim.update_pose(mean_action)[-1, :]
                    mean_distance = np.linalg.norm(mean_s_ - self.target)

                    target_action, w = self.compute_weighted_action(memory_ep)
                    s_ = self.sim.update_pose(target_action)[-1, :]
                    target_distance = np.linalg.norm(s_ - self.target)
                    self.compute_reward(self.target, s_)

                    self.pfn.store_transition(s, target_action, mean_distance,
                                              np.var(w))
                    self.pfn.store_transition(s, best_action, best_distance,
                                              w[best_i])
                    self.current_ep += 1
                    self.sample_target()
                    memory_ep = np.ones((MAX_EP_STEPS, 3 + 3 + 1 + 1)) * -100

                    if self.current_ep % 10 == 0:
                        self.reward_record.append(mean_distance)
                        self.ep_record.append(self.current_ep)
                        plt.plot(self.ep_record, self.reward_record)
                        plt.ylim([0.0, 0.1])
                        self.fig.canvas.draw()
                        self.fig.savefig('learning.png')
                        print('\n')
                        #print s
                        print('Episode:', self.current_ep)
                        print("Mean Action:")
                        print mean_action
                        print("Mean Distance:")
                        print mean_distance
                        print("Action Variance:")
                        print act_var
                        print("Target Action:")
                        print target_action
                        print("Target Distance:")
                        print target_distance
                        print("Weights Variance:")
                        print np.var(w)
                        print('*' * 40)
                        self.pfn.save_model()
                        self.pfn.save_memory()

            else:
                s = self.normalize_state(self.target)
                action, act_var = self.pfn.choose_action2(s)
                s_ = self.sim.update_pose(action)[-1, :]
                self.compute_reward(self.target, s_)

                print("Mean Action:")
                print action
                print("Action Variance:")
                print act_var
                print("Distance: %f" % np.linalg.norm(self.target - s_))
                print("Reward: %f" % self.reward)
                print '\n'
                state = np.array([s_, self.target])
                self.pub_state(state)
                rospy.sleep(1)
                self.sample_target()

    def compute_weighted_action(self, memory_ep):
        # Average the episode's actions, weighted by their rewards (column -2 of memory_ep).
        reward_sum = np.sum(memory_ep[:, -2]) + 1e-6  # small epsilon avoids division by zero
        w = memory_ep[:, -2] / reward_sum
        target_action = np.average(memory_ep[:, S_DIM:S_DIM + A_DIM],
                                   axis=0,
                                   weights=w)
        return target_action, w

    def sample_target(self):
        self.target = self.ends[np.random.randint(self.ends.shape[0])]

    def compute_reward(self, end, target):
        error = target - end
        # Shaped reward: equals 1.0 at ||error|| = 0.4 and grows rapidly as the error shrinks.
        self.reward = 20**(-np.log2(2.5 * np.linalg.norm(error)))
        #print np.linalg.norm(error)

    def pub_state(self, state):
        pts = list()
        for i in range(state.shape[0]):
            pt = Point()
            pt.x = state[i, 0]
            pt.y = state[i, 1]
            pt.z = state[i, 2]
            pts.append(pt)
        self.pc.points = pts
        self.pub.publish(self.pc)
        pts = list()
        for i in range(state.shape[0]):
            pt = Point()
            pt.x = state[i, 0] / 10.0
            pt.y = state[i, 1] / 10.0
            pt.z = state[i, 2] / 35.0 + 0.42
            pts.append(pt)
        self.pc.points = pts
        self.pub1.publish(self.pc)

    def normalize_state(self, state):
        offset = np.array([0, 0, 0.42])
        scaler = np.array([10, 10, 35])
        s = state - offset
        s = np.multiply(s, scaler)
        return s

    def calculate_dist(self, state):
        offset = np.array([0, 0, 0.42])
        scaler = np.array([10, 10, 35])
        s = np.multiply(state, 1.0 / scaler)
        s += offset
        return np.linalg.norm(s)
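
For comparison with the plain negative-distance reward of Example #2, the shaped reward used by compute_reward in Example #3 can be evaluated on its own: it equals 1.0 when the end-to-target distance is 0.4 and grows quickly as the distance shrinks. A minimal sketch:

import numpy as np

def shaped_reward(distance):
    # Same expression as Trainer.compute_reward in Example #3.
    return 20 ** (-np.log2(2.5 * distance))

for d in [0.8, 0.4, 0.2, 0.1]:
    print("distance %.2f -> reward %.3f" % (d, shaped_reward(d)))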