Example #1
    def linearize(self, env, sim, x_array, u_array):
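        # Linearize the dynamics around the trajectory (x_array, u_array) by
        # finite-differencing the simulator at each time step; the cost terms
        # Cs/cs are left as identity/zero placeholders.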
        print "Linearizing"
        start_time = timer.time()
        aug_n = u_array[0].shape[0] + x_array[0].shape[0]
        T = len(x_array)
        Cs = [np.identity(aug_n)] * (T - 1)
        Fs = []
        cs = [np.zeros(aug_n)] * (T - 1)
        fs = [np.zeros(x_array[0].shape)] * (T - 1)
        simulate_fn = utils.gen_simulate_step(sim)
        for t in range(T - 1):
            print "linear t: " + str(t)
            x, u = x_array[t], u_array[t]
            xu = np.hstack((x, u))
            F = utils.finite_diff1(xu, simulate_fn)
            Fs.append(F)

        x = x_array[-1]
        u = np.zeros(u_array[0].shape)  # avoid relying on the loop variable u
        xu = np.hstack((x, u))
        Cs.append(np.identity(aug_n))
        cs.append(np.zeros(aug_n))
        Fs.append(np.zeros(Fs[0].shape))
        fs.append(np.zeros(x.shape))

        print "Done linearizing"
        end_time = timer.time()
        print "Total time: " + str(end_time - start_time)

        self.Cs = Cs
        self.Fs = Fs
        self.cs = cs
        self.fs = fs

        return Cs, Fs, cs, fs
Example #2
def collect_robust_traj(env, agent, oc, T, visualize=False, early_stop=True):
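    # Roll out the agent for up to T steps; whenever the one-class model's
    # decision score drops below 0.1, nudge the intended action by gradient
    # ascent on the score (via finite differences) before stepping the env.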
    states = []
    intended_actions = []
    taken_actions = []
    scores = []

    s = env.reset()

    reward = 0.0
    count = 0

    def dec(u):
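        # Decision score of the state reached by applying u from the current
        # state; the environment state is restored after the probe.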
        x = env.get_x()
        s, _, _, _ = env.step(u)
        env.set_x(x)
        return oc.decision_function([s])[0, 0]

    for t in range(T):
        score = oc.decision_function([s])[0, 0]
        scores.append(score)

        a_intended = agent.intended_action(s)

        if score < .1:
            alpha = .1
            count += 1
            a = a_intended
            for _ in range(20):
                a = a + alpha * utils.finite_diff1(a, dec)
            next_s, r, done, _ = env.step(a)
        else:
            a = agent.sample_action(s)
            next_s, r, done, _ = env.step(a)

        reward += r

        states.append(s)
        intended_actions.append(a_intended)
        taken_actions.append(a)

        s = next_s

        if visualize:
            env.render()

        if early_stop and done:
            print("Breaking")
            break

    freq = count / float(t + 1)
    return states, intended_actions, taken_actions, reward, freq, scores
Example #3
def collect_rec_finite_diff(env,
                            sim,
                            agent,
                            ocs,
                            T,
                            opt,
                            KLs,
                            visualize=False,
                            early_stop=True,
                            init_state=None,
                            max_rec=max_rec):
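    # Roll out the agent with a recovery routine: whenever the one-class score
    # is positive but below the cutoff KLs[t] * ||intended action||, take small
    # finite-difference steps on the decision function to climb back toward the
    # estimated support; diagnostics are recorded in `info`.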
    states = []
    intended_actions = []
    taken_actions = []
    scores = []
    mags = []

    s = env.reset()
    if init_state:
        env.set_pos_vel(*init_state)
        s = env._get_obs()

    reward = 0.0
    count = 0
    freq = 0
    d = env.action_space.shape[0]

    reject = False
    failed = False

    info = {}
    info['first_out'] = -1
    info['first_violation'] = -1
    info['rec_failed'] = -1
    info['first_complete'] = -1
    info['triggered'] = False

    info['initial_state'] = env.get_pos_vel()

    info['completed'] = False
    info['failed'] = False
    info['failed_in_support'] = False

    if ocs[0].predict([s])[0] == -1:
        print "Initial state predicted out of distribution"
        reject = True

    else:
        for t in range(T):

            def dec(u):
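                # Probe the one-class score of the state reached by applying u
                # in the simulator, started from the environment's current
                # state; the simulator is reset afterward.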
                x = env.get_pos_vel()
                sim.set_pos_vel(*x)
                delta_s, _, _, _ = sim.step(u)
                sim.set_pos_vel(*x)
                return ocs[t].decision_function([delta_s])[0, 0]

            triggered = False
            score = ocs[t].decision_function([s])[0, 0]
            score_last = score

            if score < 0.0 and info['first_out'] == -1:
                info['first_out'] = t

            failed = env.violation()
            if failed and info['first_violation'] == -1:
                info['failed'] = True
                if info['first_out'] == -1:
                    info['failed_in_support'] = True
                info['first_violation'] = t

            if failed or score < 0.0:
                break

            completed = env.completed()
            if completed and info['first_complete'] == -1:
                info['completed'] = True
                info['first_complete'] = t
                break

            a_intended = agent.intended_action(s)

            j = 0
            if not info['triggered']:
                rec_scores = np.zeros(max_rec + 1)
                rec_cutoffs = np.zeros(max_rec + 1)
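            # Recovery loop: while the score is positive but below the cutoff,
            # step the env along a finite-difference estimate of the score
            # gradient (w.r.t. a zero action), up to max_rec iterations.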
            while j < max_rec and score_last < (
                    KLs[t] * np.linalg.norm(a_intended)) and score_last > 0:
                if not info['triggered']:
                    rec_scores[j] = score_last
                    rec_cutoffs[j] = (KLs[t] * np.linalg.norm(a_intended))
                triggered = True

                delta = max(score_last, 0.0) / KLs[t]
                delta_u = 10 * utils.finite_diff1(np.zeros(d), dec)
                if np.linalg.norm(delta_u) * KLs[t] > score_last:
                    delta_u = delta * delta_u / np.linalg.norm(delta_u) / 5.0
                u_r = delta * delta_u

                s, _, _, _ = env.step(u_r)
                if visualize:
                    env.render()

                score_last = ocs[t].decision_function([s])[0, 0]

                a_intended = agent.intended_action(s)
                # print "time step: " + str(t) + ", j: " + str(j)
                # print "learner norm: " + str(np.linalg.norm(a_intended))
                # print "cutoff: " + str(KLs[t] * np.linalg.norm(a_intended))
                # print "score last: " + str(score_last)
                # print "Completed: " + str(env.completed())
                # print "Failure: " + str(env.violation())
                # print "\n"
                j += 1

            if not info['triggered']:
                rec_scores[j] = score_last
                rec_scores[j + 1:] = score_last
                rec_cutoffs[j:] = KLs[t] * np.linalg.norm(a_intended)

            if triggered:
                print "\t\tRecovery was activated, stopped at t: " + str(
                    t) + ", j: " + str(j)
                info['triggered'] = True

            if j == max_rec:
                print "\t\tNot able to recover, stopping"
                break

            score_updated = score_last

            a = agent.sample_action(s)
            mags.append(0.0)
            scores.append(score_updated)
            if score_updated > 0 or True:  # 'or True' forces this branch; the zero-action fallback below is never used
                next_s, r, done, _ = env.step(a)
            else:
                next_s, r, done, _ = env.step(np.zeros(a.shape))
            reward += r

            states.append(s)
            intended_actions.append(a_intended)
            taken_actions.append(a)

            s = next_s

            if visualize:
                env.render()

            if early_stop and done:
                print("Breaking")
                break

        freq = count / float(t + 1)
        info['rec_scores'] = rec_scores
        info['rec_cutoffs'] = rec_cutoffs
    return states, intended_actions, taken_actions, reward, freq, scores, mags, info, failed, reject
Example #4
def collect_robust_traj_multiple(env,
                                 agent,
                                 ocs,
                                 T,
                                 opt,
                                 visualize=False,
                                 early_stop=True,
                                 clipped=False):
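    # Like collect_robust_traj, but with a per-time-step one-class model
    # (ocs[t]) and opt.grads gradient steps; actions can optionally be clipped
    # to [-1, 1] before stepping the environment.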
    states = []
    intended_actions = []
    taken_actions = []
    scores = []
    mags = []

    s = env.reset()

    reward = 0.0
    count = 0

    for t in range(T):

        def dec(u):
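            # Decision score, under the next step's one-class model, of the
            # state reached by applying u; the environment is restored afterward.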
            x = env.get_x()
            if clipped:
                u = np.clip(u, -1, 1)
            s, _, _, _ = env.step(u)
            env.set_x(x)
            return ocs[t + 1].decision_function([s])[0, 0]

        score = ocs[t].decision_function([s])[0, 0]
        scores.append(score)

        a_intended = agent.intended_action(s)

        if score < .1 and t < (T - 1):
            alpha = .01
            count += 1
            a = a_intended
            for _ in range(opt.grads):
                update_a = alpha * utils.finite_diff1(a, dec)
                a = a + update_a
            mags.append(np.linalg.norm(a - a_intended))
            if clipped:
                a = np.clip(a, -1, 1)
            next_s, r, done, _ = env.step(a)
        else:
            a = agent.sample_action(s)
            if clipped:
                a = np.clip(a, -1, 1)
            next_s, r, done, _ = env.step(a)
            mags.append(0.0)

        reward += r

        states.append(s)
        intended_actions.append(a_intended)
        taken_actions.append(a)

        s = next_s

        if visualize:
            env.render()

        if early_stop and done:
            print("Breaking")
            break

    freq = count / float(t + 1)
    return states, intended_actions, taken_actions, reward, freq, scores, mags
Example #5
def finite_diff_loop(s,
                     t,
                     score,
                     env,
                     sim,
                     agent,
                     ocs,
                     T,
                     opt,
                     KLs,
                     visualize=False,
                     max_rec=500):
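    # Recovery helper: while the one-class score is positive but below
    # KLs[t] * ||a_intended||, step the environment along a finite-difference
    # estimate of the score gradient, up to max_rec iterations; returns the
    # resulting state, the last score, and diagnostics in rec_info.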
    rec_info = {'triggered': False, 'reached_max': False}
    j = 0
    d = env.action_space.shape[0]
    a_intended = agent.intended_action(s)
    score_last = score

    rec_scores = np.zeros(max_rec + 1)
    rec_cutoffs = np.zeros(max_rec + 1)

    def dec(u):
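        # Probe the one-class score of the state reached by applying u in the
        # simulator from the environment's current state; the simulator is
        # reset afterward.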
        x = env.get_pos_vel()
        sim.set_pos_vel(*x)
        delta_s, _, _, _ = sim.step(u)
        sim.set_pos_vel(*x)
        return ocs[t].decision_function([delta_s])[0, 0]

    while j < max_rec and score_last < (
            KLs[t] * np.linalg.norm(a_intended)) and score_last > 0:
        rec_info['triggered'] = True
        rec_scores[j] = score_last
        rec_cutoffs[j] = np.linalg.norm(a_intended) * KLs[t]

        delta = max(score_last, 0.0) / KLs[t]
        delta_u = 10 * utils.finite_diff1(np.zeros(d), dec)
        if np.linalg.norm(delta_u) * KLs[t] > score_last:
            delta_u = delta * delta_u / np.linalg.norm(delta_u) / 5.0
        u_r = delta * delta_u

        s, _, _, _ = env.step(u_r)
        if visualize:
            env.render()

        score_last = ocs[t].decision_function([s])[0, 0]

        a_intended = agent.intended_action(s)
        j += 1

    rec_scores[j] = score_last
    rec_scores[j + 1:] = score_last
    rec_cutoffs[j:] = KLs[t] * np.linalg.norm(a_intended)

    if rec_info['triggered']:
        print "\t\tRecovery was activated, stopped at t: " + str(
            t) + ", j: " + str(j)

    if j == max_rec:
        rec_info['reached_max'] = True

    rec_info['score_last'] = score_last
    rec_info['rec_scores'] = rec_scores
    rec_info['rec_cutoffs'] = rec_cutoffs
    rec_info['a_intended'] = a_intended
    return s, score_last, rec_info