Example #1
    def prologue(self):
        # self.env = gym.envs.make('CartPoleAltRandom-v0')     # Used over multiple trials
        self.env = gym.envs.make('CartPoleAlt-v0')  # Used for just one trial
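        # Wrap the pretrained policy as the supervisor and an LRC estimator
        # (presumably a regularized linear classifier) as the learner.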
        self.sup = Supervisor(self.act)
        self.lnr = Learner(LRC(self.alpha, self.eta, intercept=False))
        print(self.env.env.force_mag)

        self.env.env.force_mag = self.force_mag
Example #2
    def prologue(self):
        """
            Preprocess hyperparameters and initialize learner and supervisor
        """
        self.params['filename'] = './experts/' + self.params['envname'] + '.pkl'
        self.env = gym.envs.make(self.params['envname'])

        self.params['d'] = self.env.action_space.shape[0]

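        # TF1-style session in which the loaded expert policy will be evaluated.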
        sess = tf.Session()
        policy = load_policy.load_policy(self.params['filename'])
        net_sup = Supervisor(policy, sess)
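        # Zero initial covariance, so the Gaussian supervisor starts out deterministic.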
        init_cov = np.zeros((self.params['d'], self.params['d']))
        sup = GaussianSupervisor(net_sup, init_cov)
        est, lnr = self.reset_learner(self.params)

        self.lnr, self.sup, self.net_sup = lnr, sup, net_sup
        return self.params
Example #3
class CartpoleDagger():
    def __init__(self, force_mag, reg):
        self.reg = reg
        self.iters = 100
        self.T = 200
        self.trials = 1

        self.alpha = 0.1
        self.lambda_prior = list(np.ones(10))

        self.eta = 1.0
        self.inner_eta = self.eta
        self.params = {}
        self.params['T'] = self.T
        self.params['iters'] = self.iters
        self.act = deepq.load("cartpole_model_alt2.pkl")

        if self.reg:
            self.base_dir = 'data/reg_cartpole_force_mag' + str(force_mag)
        else:
            self.base_dir = 'data/cartpole_force_mag' + str(force_mag)
        self.dir = os.path.join(self.base_dir, 'dagger')
        self.prefix = 'dagger'
        self.path = os.path.join(self.dir, self.prefix)
        self.force_mag = force_mag

        self.t = 0.01

    def prologue(self):
        # self.env = gym.envs.make('CartPoleAltRandom-v0')     # Used over multiple trials
        self.env = gym.envs.make('CartPoleAlt-v0')  # Used for just one trial
        self.sup = Supervisor(self.act)
        self.lnr = Learner(LRC(self.alpha, self.eta, intercept=False))
        print(self.env.env.force_mag)

        self.env.env.force_mag = self.force_mag

    def run_trials(self):
        all_results = []
        # Used for multiple trials with random initial states
        # init_states = np.load("data/init_states.npy")
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)

        for trial in range(self.trials):
            self.prologue()
            # print("Init state: " + str(self.env.env.init_state))
            # self.env.env.init_state = init_states[trial, :]
            # print("Setting init state: " + str(self.env.env.init_state))
            self.path = os.path.join(self.dir,
                                     self.prefix) + '_trial' + str(trial)
            results = self.run_iters()
            all_results.append(results)

        self.aggregate(all_results)

        filepath = os.path.join(self.dir, self.prefix) + '.p'
        with open(filepath, 'wb') as f:
            pickle.dump(all_results, f)

        return all_results

    def aggregate(self, all_results):
        n = len(all_results)
        d = self.iters

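        # One row per trial, one column per iteration.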
        lnr_costs = np.zeros((n, d))
        opt_costs = np.zeros((n, d))
        diff_costs = np.zeros((n, d))

        lnr_batch_costs = np.zeros((n, d))
        opt_batch_costs = np.zeros((n, d))
        static_regret = np.zeros((n, d))

        for t, result in enumerate(all_results):
            lnr_costs[t, :] = result['lnr_costs']
            opt_costs[t, :] = result['opt_costs']
            diff_costs[t, :] = result['lnr_costs'] - result['opt_costs']
            lnr_batch_costs[t, :] = result['lnr_batch_costs']
            opt_batch_costs[t, :] = result['opt_batch_costs']
            static_regret[t, :] = result['static_regret']

        lnr_mean, lnr_std = statistics.mean_sem(lnr_costs)
        opt_mean, opt_std = statistics.mean_sem(opt_costs)
        diff_mean, diff_std = statistics.mean_sem(diff_costs)

        lnr_batch_mean, lnr_batch_std = statistics.mean_sem(lnr_batch_costs)
        opt_batch_mean, opt_batch_std = statistics.mean_sem(opt_batch_costs)
        static_regret_mean, static_regret_sem = statistics.mean_sem(
            static_regret)

        x_axis = np.arange(len(lnr_mean))

        # Dynamic Regret
        plt.subplot(211)
        plt.title("Actual loss")
        plt.errorbar(x_axis, lnr_mean, yerr=lnr_std, label='lnr costs')
        plt.errorbar(x_axis, opt_mean, yerr=opt_std, label='opt costs')
        plt.legend()

        plt.subplot(212)
        plt.title("Difference")
        plt.errorbar(x_axis, diff_mean, yerr=diff_std)
        plt.tight_layout()

        filepath = os.path.join(self.dir, self.prefix) + '.pdf'
        plt.savefig(filepath)
        plt.close()
        plt.cla()
        plt.clf()

        # Static Regret
        plt.subplot(211)
        plt.title("Batch loss")
        plt.errorbar(x_axis, lnr_batch_mean, yerr=lnr_batch_std, label='lnr costs')
        plt.errorbar(x_axis, opt_batch_mean, yerr=opt_batch_std, label='opt costs')
        plt.legend()

        plt.subplot(212)
        plt.title("Static Regret")
        plt.errorbar(x_axis, static_regret_mean, yerr=static_regret_sem)
        plt.tight_layout()

        filepath = os.path.join(self.dir, self.prefix) + '_batch.pdf'
        plt.savefig(filepath)
        plt.close()
        plt.cla()
        plt.clf()

    def compute_statistics(self, iteration, results):

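        # Roll out the current learner, then relabel the visited states with the
        # supervisor's intended actions (standard DAgger-style relabeling).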
        states, tmp_actions, _, reward = statistics.collect_traj(
            self.env, self.lnr, self.params['T'], False)
        actions = [self.sup.intended_action(s) for s in states]
        d = self.env.observation_space.shape[0]
        # states += [np.zeros(d), np.zeros(d)]
        # actions += [1, 0]

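        # Fit a fresh estimator on this round's data alone to get the per-round
        # optimal ("opt") cost used in the dynamic-regret comparison.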
        est = LRC(self.lnr.est.alpha, self.inner_eta, intercept=False)
        lh, ph = est.fit(states, actions)

        lnr_cost = self.lnr.est.loss(states, actions)
        opt_cost = est.loss(states, actions)

        print("\tlnr_cost: " + str(lnr_cost))
        print("\topt_cost: " + str(opt_cost))

        results['lnr_costs'].append(lnr_cost)
        results['opt_costs'].append(opt_cost)
        results['rewards'].append(reward)
        results['alphas'].append(self.lnr.est.alpha)

        curr_coef_ = self.lnr.est.coef_.copy()
        curr_opt_coef_ = est.coef_.copy()

        results['param_norms'].append(np.linalg.norm(curr_coef_))
        results['opt_param_norms'].append(np.linalg.norm(curr_opt_coef_))

        if iteration != 0:

            variation = np.linalg.norm(self.last_coef_ - curr_coef_)
            opt_variation = np.linalg.norm(self.last_opt_coef_ -
                                           curr_opt_coef_)

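            # beta: change in the loss gradient across consecutive rounds,
            # evaluated at the current parameters, per unit of parameter variation.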
            last_gradient = est.gradient(self.last_states, self.last_actions,
                                         curr_coef_)
            curr_gradient = est.gradient(states, actions, curr_coef_)
            beta = np.linalg.norm(last_gradient - curr_gradient) / variation

            results['variations'].append(variation)
            results['opt_variations'].append(opt_variation)
            results['lambdas'].append(opt_variation / variation)
            results['betas'].append(beta)

        self.last_coef_ = curr_coef_.copy()
        self.last_opt_coef_ = curr_opt_coef_.copy()
        self.last_states = states
        self.last_actions = actions

        static_est = LRC(self.lnr.est.alpha, self.inner_eta, intercept=False)
        batch_states = self.data_states + states
        batch_actions = self.data_actions + actions

        lh_batch, ph_batch = static_est.fit(batch_states, batch_actions)
        opt_batch_cost = static_est.loss(batch_states, batch_actions)
        lnr_batch_cost = np.mean(results['lnr_costs'])
        static_regret = lnr_batch_cost - opt_batch_cost

        print("\tlnr_batch_cost: " + str(lnr_batch_cost))
        print("\topt_batch_cost: " + str(opt_batch_cost))
        print()

        results['lnr_batch_costs'].append(lnr_batch_cost)
        results['opt_batch_costs'].append(opt_batch_cost)
        results['static_regret'].append(static_regret)

        return results

    def compute_results(self, results):

        _, _, _, sup_reward = statistics.collect_traj(self.env, self.sup,
                                                      self.params['T'], False)
        results['sup_rewards'] = [sup_reward] * len(results['rewards'])

        # DYNAMIC REGRET
        plt.subplot(211)
        plt.title("Actual loss")
        plt.plot(results['lnr_costs'], label='lnr costs')
        plt.plot(results['opt_costs'], label='opt costs')
        plt.legend()

        difference = results['lnr_costs'] - results['opt_costs']
        plt.subplot(212)
        plt.title("Difference")
        plt.plot(difference)
        plt.tight_layout()

        filepath = self.path + '.pdf'
        plt.savefig(filepath)
        plt.close()
        plt.cla()
        plt.clf()

        # STATIC REGRET
        plt.subplot(211)
        plt.title("Batch costs")
        plt.plot(results['lnr_batch_costs'], label='lnr costs')
        plt.plot(results['opt_batch_costs'], label='opt costs')
        plt.legend()

        plt.subplot(212)
        plt.title("Static regret (lnr batch - opt batch)")
        plt.plot(results['static_regret'])
        plt.tight_layout()

        filepath = self.path + '_batch.pdf'
        plt.savefig(filepath)
        plt.close()
        plt.cla()
        plt.clf()

        plt.subplot(111)
        plt.title("Rewards")
        plt.plot(results['rewards'], label='Learner rewards')
        plt.plot(results['sup_rewards'], label='Supervisor Rewards')
        plt.legend()
        plt.ylim(0, 20)
        filepath = self.path + '_reward.pdf'
        plt.savefig(filepath)
        plt.close()
        plt.cla()
        plt.clf()

        filepath = self.path + '.p'
        with open(filepath, 'wb') as f:
            pickle.dump(results, f)

    def run_iters(self):

        results = {
            'lnr_costs': [],
            'opt_costs': [],
            'variations': [],
            'opt_variations': [],
            'param_norms': [],
            'opt_param_norms': [],
            'lambdas': [],
            'lnr_batch_costs': [],
            'opt_batch_costs': [],
            'static_regret': [],
            'rewards': [],
            'betas': [],
            'alphas': [],
        }

        d = self.env.observation_space.shape[0]
        # self.data_states = [np.zeros(d), np.zeros(d)]
        # self.data_actions = [1, 0]
        self.data_states = []
        self.data_actions = []

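        # Main DAgger loop: record statistics, roll out the learner, aggregate
        # the supervisor's labels on the visited states, and retrain.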
        for iteration in range(self.iters):
            print("\tIteration: " + str(iteration))
            print("\tData states: " + str(len(self.data_states)))

            self.compute_statistics(iteration, results)

            states, tmp_actions, _, _ = statistics.collect_traj(
                self.env, self.lnr, self.params['T'], False)
            i_actions = [self.sup.intended_action(s) for s in states]

            self.data_states += states
            self.data_actions += i_actions

            self.lnr.set_data(self.data_states, self.data_actions)
            self.lnr.train()

            # Adaptive regularization:
            if self.reg and (iteration + 1) % 10 == 0:
                # mean_lambda = np.mean(results['lambdas'][-10:] + self.lambda_prior)
                mean_lambda = np.mean(results['lambdas'][-10:])
                next_alpha = mean_lambda * self.lnr.est.alpha
                self.lnr.est.alpha = self.t * next_alpha + (
                    1 - self.t) * self.lnr.est.alpha
                print("\n\n\t\t Updated alpha: " + str(self.lnr.est.alpha))
                print("\t\t Lambda was: " + str(mean_lambda))

        for key in results.keys():
            results[key] = np.array(results[key])

        self.compute_results(results)

        return results
Example #4
def main():
    global state, action
    # Clear all files from the data directory
    clear()
    ####### Initialize Parameters
    dataNumber = 1
    Max_trajectory = 10
    init_noise = 0.0
    noise = [init_noise, init_noise]
    old_sigma = []
    N_K = []
    sampling_flag = False
    save_flag = False
    fail_flag = False
    robot = Robot(Num_goal)

    result = {'model_Num': [],
              'Noise': [],
              'Number_of_Mixture': []}
    row = 0
    col = 0
    initialize()
    rospy.init_node('Demo', anonymous=True, disable_signals=True)
    rospy.on_shutdown(shutdown)
    rate = rospy.Rate(10)
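    # Collect up to Max_trajectory demonstration episodes; the inner loop runs at 10 Hz.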
    for t in range(Max_trajectory):
        
        Sup_x = Supervisor(noise[0])
        Sup_y = Supervisor(noise[1])
        [a_x, E_x, IE_x] = [0.0, 0.0, 0.0]
        [a_y, E_y, IE_y] = [0.0, 0.0, 0.0]

        button = True
        k = 0

        while True:
            # s = [Sub.goal_1,Sub.goal_2,Sub.goal_3,Sub.endeffector_pose]
            s = [Sub.goal_1,Sub.goal_2,Sub.endeffector_pose]
        
            fail = Fail(s)
            a_x, a_y = robot.policy(s, k)
            
            axes = [a_y, a_x]
            a = axes
            temp_state, temp_action = save.tempDataframe(s, a, Num_goal)
            
            fail_flag = fail.fail_check(Sub.simulationTime)
            if button:
                robot = Robot(Num_goal)
                Pub.reset(t)
                initialize()
                
                sampling_flag = True
                button = False
            elif fail_flag or (Sub.simulationState == 0):
                Pub.sim_stop()
                initialize()
                sampling_flag = False
                button = True
                fail_flag = False
            
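            # While sampling, perturb the policy's action with supervisor noise,
            # log the state/action pair, and publish the noisy action.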
            if sampling_flag:
                action1 = Sup_y.sample_action(axes[0])
                action2 = Sup_x.sample_action(axes[1])
                sample_action = [action1, action2]
                # temp_action['v_y1'], temp_action['v_x1'] = action1, action2
                state = save.dataAppend(state, temp_state)
                action = save.dataAppend(action, temp_action)
                Pub.actionInput(sample_action)
                if Sub.simulationTime > 1.0:
                    fail.simple_success()

                if (Sub.success == True) or (fail.success == True):
                    save_flag = True
            if save_flag:
                # k = 0
                k += 1
                k %= Num_goal
                Pub.sim_stop()
                save.dataSave(state,action,dataNumber)
                save_flag = False
                sampling_flag = False
                Sub.success = False
                fail.success = False
                button = True
                if dataNumber % Num_goal == 0:
                    dataNumber += 1
                    break
                dataNumber += 1
            
            rate.sleep()
        
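        # Every two saved demonstrations, reload the full dataset and refit the
        # HIMGP model, carrying over old_sigma and N_K and updating the noise level.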
        if (dataNumber - 1) % 2 == 0:
            initialize()
            Num_data = int(subprocess.check_output(command + " action | wc -l", shell=True))
            for i in range(Num_data):
                _state, _action = load.dataLoad(i + 1)
                state = save.dataAppend(state, _state)
                action = save.dataAppend(action, _action)
            N = state.shape[0]
            
            X = state
            Y = action
            Y1 = Y['v_x1']
            Y2 = Y['v_y1']
            
            model = Learning('HIMGP', 30, X, Y, old_sigma=old_sigma, K=N_K)
            model.learning(int((dataNumber - 1) / Num_goal))

            old_sigma = model.model.old_sigma
            N_K = model.model.N_K
            K = len(model.model.N_K)-1
            noise = [model.model.Noise[K], model.model.Noise[K]]
            
            result['Noise'].append(noise[0])
            result['Number_of_Mixture'].append(model.model.M)
            result['model_Num'].append(i + 1)
            df = pd.DataFrame(result)
            df.to_excel('data/Learning_state/LS.xlsx')
            
            print("="*40)
            print("Optimized Noise x: %f, Noise y: %f" %(noise[0],noise[1]))
            print(" \t model saved" )
            print(" \t Number of step %i " %(N))
            print("="*40)

    
    rospy.spin()
Example #5
def main():
    global state, action
    # Clear all files from the data directory
    clear()
    ####### Initialize Parameters
    dataNumber = 1
    Max_trajectory = 10
    sampling_flag = False
    save_flag = False
    fail_flag = False
    robot = Robot(Num_goal)

    row = 0
    col = 0
    initialize()
    rospy.init_node('Demo', anonymous=True, disable_signals=True)
    rospy.on_shutdown(shutdown)
    rate = rospy.Rate(10)
    for t in range(Max_trajectory):
        Sup_x = Supervisor(0.0)
        Sup_y = Supervisor(0.0)
        [a_x, E_x, IE_x] = [0.0, 0.0, 0.0]
        [a_y, E_y, IE_y] = [0.0, 0.0, 0.0]

        button = True
        k = 1

        while True:
            # s = [Sub.goal_1,Sub.goal_2,Sub.goal_3,Sub.endeffector_pose]
            s = [Sub.goal_1, Sub.goal_2, Sub.endeffector_pose]

            fail = Fail(s)
            a_x, a_y = robot.policy(s, k)

            axes = [a_y, a_x]
            a = axes
            temp_state, temp_action = save.tempDataframe(s, a, Num_goal)

            fail_flag = fail.fail_check(Sub.simulationTime)
            if button:
                robot = Robot(Num_goal)
                Pub.reset(t)
                initialize()

                sampling_flag = True
                button = False
            elif fail_flag or (Sub.simulationState == 0):
                Pub.sim_stop()
                initialize()
                sampling_flag = False
                button = True
                fail_flag = False

            if sampling_flag:
                # temp_action['v_y1'], temp_action['v_x1']= action1, action2
                state = save.dataAppend(state, temp_state)
                action = save.dataAppend(action, temp_action)
                action1 = Sup_y.sample_action(axes[0])
                action2 = Sup_x.sample_action(axes[1])
                sample_action = [action1, action2]
                Pub.actionInput(sample_action)
                if Sub.simulationTime > 1.0:
                    fail.simple_success()

                if (Sub.success == True) or (fail.success == True):
                    save_flag = True
            if save_flag:
                k += 1
                k %= Num_goal
                Pub.sim_stop()
                save.dataSave(state, action, dataNumber)
                save_flag = False
                sampling_flag = False
                Sub.success = False
                fail.success = False
                button = True
                if dataNumber % Num_goal == 0:
                    dataNumber += 1
                    break
                dataNumber += 1

            rate.sleep()

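        # Periodic retraining as in the noisy variant, here with the IMGP model
        # and the supervisors fixed at zero noise.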
        if (dataNumber - 1) % 2 == 0:
            initialize()
            Num_data = int(
                subprocess.check_output(command + " action | wc -l",
                                        shell=True))
            for i in range(Num_data):
                _state, _action = load.dataLoad(i + 1)
                state = save.dataAppend(state, _state)
                action = save.dataAppend(action, _action)
            N = state.shape[0]

            X = state
            Y = action
            Y1 = Y['v_x1']
            Y2 = Y['v_y1']

            model = Learning('IMGP', 30, X, Y)
            # model = Learning('HIMGP',30,X,Y)
            model.learning(int((dataNumber - 1) / Num_goal))

            print("=" * 40)
            print(" \t model saved")
            print(" \t Number of step %i " % (N))
            print("=" * 40)

    rospy.spin()
Example #6
from tools.Data.Subscriber import Subscriber
from tools.Data.Publisher import Publisher
from tools.Data.Save import Save
from tools.Data.Clear import clear
from tools.Fail_condition import Fail
from tools.supervisor import Supervisor
import rospy

save = Save('data/')
Sub = Subscriber()
Pub = Publisher()
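# One supervisor per action axis, both using the same hand-set noise level.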
noise = 0.1980
Sup_x = Supervisor(noise)
Sup_y = Supervisor(noise)
Num_goal = 2

def initialize():
    global state, action
    state, action = save.initDataframe(Num_goal)


def shutdown():
    print('ros shutdown')
    

def main():
    global state, action
    init_data_num = 3
    dataNumber = init_data_num
    sampling_flag = False
    save_flag = False