def linearize(self, env, sim, x_array, u_array):
    """Linearize the dynamics around the trajectory (x_array, u_array) by
    finite differences, producing per-step cost terms (Cs, cs) and dynamics
    terms (Fs, fs) for all T time steps."""
    print "Linearizing"
    start_time = timer.time()  # timer is assumed to be the time module
    aug_n = u_array[0].shape[0] + x_array[0].shape[0]
    T = len(x_array)
    # Identity costs and zero offsets for the first T - 1 steps. The lists
    # share one array object each, which is safe since they are never mutated.
    Cs = [np.identity(aug_n)] * (T - 1)
    Fs = []
    cs = [np.zeros(aug_n)] * (T - 1)
    fs = [np.zeros(x_array[0].shape)] * (T - 1)
    simulate_fn = utils.gen_simulate_step(sim)
    for t in range(T - 1):
        print "linear t: " + str(t)
        x, u = x_array[t], u_array[t]
        xu = np.hstack((x, u))
        # Jacobian of the simulated step w.r.t. the augmented state-action.
        F = utils.finite_diff1(xu, simulate_fn)
        Fs.append(F)
    # Terminal step: zero action, identity cost, zero dynamics.
    x = x_array[-1]
    u = np.zeros(u.shape)
    xu = np.hstack((x, u))
    Cs.append(np.identity(aug_n))
    cs.append(np.zeros(aug_n))
    Fs.append(np.zeros(Fs[0].shape))
    fs.append(np.zeros(x.shape))
    print "Done linearizing"
    end_time = timer.time()
    print "Total time: " + str(end_time - start_time)
    self.Cs = Cs
    self.Fs = Fs
    self.cs = cs
    self.fs = fs
    return Cs, Fs, cs, fs
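# NOTE (illustrative sketch, not part of the original source): every function
# in this file leans on `utils.finite_diff1`, used as a first-order
# finite-difference derivative of a function around a point -- a gradient
# vector when the function is scalar-valued (the `dec` closures below) and a
# Jacobian when it is vector-valued (`simulate_fn` above). The real helper in
# utils.py may use a different epsilon or differencing scheme; the version
# below is only one implementation consistent with how it is called here.
def _finite_diff1_sketch(x, fn, eps=1e-5):
    # Central differences: perturb each input coordinate by +/- eps.
    x = np.asarray(x, dtype=float)
    base = np.asarray(fn(x))
    out = np.zeros(x.shape if base.ndim == 0 else (base.shape[0], x.shape[0]))
    for i in range(x.shape[0]):
        dx = np.zeros_like(x)
        dx[i] = eps
        diff = (np.asarray(fn(x + dx)) - np.asarray(fn(x - dx))) / (2 * eps)
        if base.ndim == 0:
            out[i] = diff       # gradient entry for scalar fn
        else:
            out[:, i] = diff    # Jacobian column for vector fn
    return out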
def collect_robust_traj(env, agent, oc, T, visualize=False, early_stop=True):
    """Roll out the agent for up to T steps. Whenever the one-class model's
    decision score drops below .1, replace the intended action with one
    pushed uphill on the score by finite-difference gradient ascent."""
    states = []
    intended_actions = []
    taken_actions = []
    scores = []
    s = env.reset()
    reward = 0.0
    count = 0

    def dec(u):
        # Score the state reached by taking u, then restore the env state.
        x = env.get_x()
        s, _, _, _ = env.step(u)
        env.set_x(x)
        return oc.decision_function([s])[0, 0]

    for t in range(T):
        score = oc.decision_function([s])[0, 0]
        scores.append(score)
        a_intended = agent.intended_action(s)
        if score < .1:
            # Low score: take 20 ascent steps on the decision function.
            alpha = .1
            count += 1
            a = a_intended
            for _ in range(20):
                a = a + alpha * utils.finite_diff1(a, dec)
            next_s, r, done, _ = env.step(a)
        else:
            a = agent.sample_action(s)
            next_s, r, done, _ = env.step(a)
        reward += r
        states.append(s)
        intended_actions.append(a_intended)
        taken_actions.append(a)
        s = next_s
        if visualize:
            env.render()
        if early_stop and done:
            print "Breaking"
            break
    # Fraction of visited steps on which the robust correction fired.
    freq = count / float(t + 1)
    return states, intended_actions, taken_actions, reward, freq, scores
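# NOTE (illustrative sketch, not part of the original source): `oc` and the
# per-timestep `ocs[t]` models are assumed to be one-class support estimators
# whose decision_function returns an (n_samples, 1) array, which is what the
# `[0, 0]` indexing above expects. Older scikit-learn OneClassSVM releases
# returned exactly that shape (newer ones return a flat (n_samples,) array).
# A minimal sketch of fitting such a model on supervisor states:
def _fit_one_class_sketch(states, nu=0.1, gamma=1e-2):
    # `states` is an (n_samples, state_dim) array of states the supervisor
    # visited; nu and gamma are hypothetical values, not from the original.
    from sklearn import svm
    oc = svm.OneClassSVM(kernel='rbf', nu=nu, gamma=gamma)
    oc.fit(np.asarray(states))
    return oc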
def collect_rec_finite_diff(env, sim, agent, ocs, T, opt, KLs, visualize=False,
                            early_stop=True, init_state=None, max_rec=500):
    # Original default was `max_rec=max_rec`, which requires a module-level
    # max_rec to exist; 500 matches the default in finite_diff_loop below.
    """Roll out the agent with a recovery controller: whenever the
    per-timestep one-class score falls below KLs[t] * ||a_intended||, take
    finite-difference gradient steps on the score (computed through the sim)
    until the state is pushed back into the estimated support or max_rec
    recovery steps are exhausted."""
    states = []
    intended_actions = []
    taken_actions = []
    scores = []
    mags = []
    s = env.reset()
    if init_state is not None:
        env.set_pos_vel(*init_state)
        s = env._get_obs()
    reward = 0.0
    count = 0
    freq = 0
    d = env.action_space.shape[0]
    reject = False
    failed = False
    info = {}
    info['first_out'] = -1
    info['first_violation'] = -1
    info['rec_failed'] = -1
    info['first_complete'] = -1
    info['triggered'] = False
    info['initial_state'] = env.get_pos_vel()
    info['completed'] = False
    info['failed'] = False
    info['failed_in_support'] = False
    # Defined before the loop so the early-break paths below cannot reach
    # info['rec_scores'] = rec_scores with the arrays still unset.
    rec_scores = np.zeros(max_rec + 1)
    rec_cutoffs = np.zeros(max_rec + 1)
    if ocs[0].predict([s])[0] == -1:
        print "Initial state predicted out of distribution"
        reject = True
    else:
        for t in range(T):

            def dec(u):
                # Score the state the *simulator* reaches from the current
                # env state under action u; the sim is restored afterwards.
                x = env.get_pos_vel()
                sim.set_pos_vel(*x)
                delta_s, _, _, _ = sim.step(u)
                sim.set_pos_vel(*x)
                return ocs[t].decision_function([delta_s])[0, 0]

            triggered = False
            score = ocs[t].decision_function([s])[0, 0]
            score_last = score
            if score < 0.0 and info['first_out'] == -1:
                info['first_out'] = t
            failed = env.violation()
            if failed and info['first_violation'] == -1:
                info['failed'] = True
                if info['first_out'] == -1:
                    info['failed_in_support'] = True
                info['first_violation'] = t
            if failed or score < 0.0:
                break
            completed = env.completed()
            if completed and info['first_complete'] == -1:
                info['completed'] = True
                info['first_complete'] = t
                break
            a_intended = agent.intended_action(s)
            j = 0
            if not info['triggered']:
                # Only the first recovery episode is recorded.
                rec_scores = np.zeros(max_rec + 1)
                rec_cutoffs = np.zeros(max_rec + 1)
            while j < max_rec and score_last < (
                    KLs[t] * np.linalg.norm(a_intended)) and score_last > 0:
                if not info['triggered']:
                    rec_scores[j] = score_last
                    rec_cutoffs[j] = (KLs[t] * np.linalg.norm(a_intended))
                triggered = True
                # Step size proportional to the current score margin.
                delta = max(score_last, 0.0) / KLs[t]
                delta_u = 10 * utils.finite_diff1(np.zeros(d), dec)
                if np.linalg.norm(delta_u) * KLs[t] > score_last:
                    delta_u = delta * delta_u / np.linalg.norm(delta_u) / 5.0
                u_r = delta * delta_u
                s, _, _, _ = env.step(u_r)
                if visualize:
                    env.render()
                score_last = ocs[t].decision_function([s])[0, 0]
                a_intended = agent.intended_action(s)
                # Debug output:
                # print "time step: " + str(t) + ", j: " + str(j)
                # print "learner norm: " + str(np.linalg.norm(a_intended))
                # print "cutoff: " + str(KLs[t] * np.linalg.norm(a_intended))
                # print "score last: " + str(score_last)
                # print "Completed: " + str(env.completed())
                # print "Failure: " + str(env.violation())
                # print "\n"
                j += 1
            if not info['triggered']:
                rec_scores[j] = score_last
                rec_scores[j + 1:] = score_last
                rec_cutoffs[j:] = KLs[t] * np.linalg.norm(a_intended)
            if triggered:
                print "\t\tRecovery was activated, stopped at t: " + str(
                    t) + ", j: " + str(j)
                info['triggered'] = True
                if j == max_rec:
                    print "\t\tNot able to recover, stopping"
                    break
            score_updated = score_last
            a = agent.sample_action(s)
            mags.append(0.0)
            scores.append(score_updated)
            # The original guard was `if score_updated > 0 or True:`, so the
            # sampled action is always executed and the zero-action fallback
            # in the else branch was dead code.
            next_s, r, done, _ = env.step(a)
            reward += r
            states.append(s)
            intended_actions.append(a_intended)
            taken_actions.append(a)
            s = next_s
            if visualize:
                env.render()
            if early_stop and done:
                print "Breaking"
                break
        # count is never incremented in this function, so freq is always 0.
        freq = count / float(t + 1)
        info['rec_scores'] = rec_scores
        info['rec_cutoffs'] = rec_cutoffs
    return (states, intended_actions, taken_actions, reward, freq, scores,
            mags, info, failed, reject)
def collect_robust_traj_multiple(env, agent, ocs, T, opt, visualize=False,
                                 early_stop=True, clipped=False):
    """Like collect_robust_traj, but with one one-class model per time step;
    the gradient ascent uses the *next* step's decision score, and actions
    can optionally be clipped to [-1, 1]."""
    states = []
    intended_actions = []
    taken_actions = []
    scores = []
    mags = []
    s = env.reset()
    reward = 0.0
    count = 0
    for t in range(T):

        def dec(u):
            # Score under ocs[t + 1] of the state reached by taking u,
            # then restore the env state.
            x = env.get_x()
            if clipped:
                u = np.clip(u, -1, 1)
            s, _, _, _ = env.step(u)
            env.set_x(x)
            return ocs[t + 1].decision_function([s])[0, 0]

        score = ocs[t].decision_function([s])[0, 0]
        scores.append(score)
        a_intended = agent.intended_action(s)
        if score < .1 and t < (T - 1):
            alpha = .01
            count += 1
            a = a_intended
            for _ in range(opt.grads):
                update_a = alpha * utils.finite_diff1(a, dec)
                a = a + update_a
            mags.append(np.linalg.norm(a - a_intended))
            if clipped:
                a = np.clip(a, -1, 1)
            next_s, r, done, _ = env.step(a)
        else:
            a = agent.sample_action(s)
            if clipped:
                a = np.clip(a, -1, 1)
            next_s, r, done, _ = env.step(a)
            mags.append(0.0)
        reward += r
        states.append(s)
        intended_actions.append(a_intended)
        taken_actions.append(a)
        s = next_s
        if visualize:
            env.render()
        if early_stop and done:
            print "Breaking"
            break
    freq = count / float(t + 1)
    return states, intended_actions, taken_actions, reward, freq, scores, mags
def finite_diff_loop(s, t, score, env, sim, agent, ocs, T, opt, KLs,
                     visualize=False, max_rec=500):
    """Run the finite-difference recovery loop from state s at time t until
    the decision score clears KLs[t] * ||a_intended|| or max_rec recovery
    steps are taken. Returns the new state, the final score, and a dict of
    diagnostics."""
    rec_info = {'triggered': False, 'reached_max': False}
    j = 0
    d = env.action_space.shape[0]
    a_intended = agent.intended_action(s)
    score_last = score
    rec_scores = np.zeros(max_rec + 1)
    rec_cutoffs = np.zeros(max_rec + 1)

    def dec(u):
        # Score the state the sim reaches from the env's current state
        # under action u; the sim is restored afterwards.
        x = env.get_pos_vel()
        sim.set_pos_vel(*x)
        delta_s, _, _, _ = sim.step(u)
        sim.set_pos_vel(*x)
        return ocs[t].decision_function([delta_s])[0, 0]

    while j < max_rec and score_last < (
            KLs[t] * np.linalg.norm(a_intended)) and score_last > 0:
        rec_info['triggered'] = True
        rec_scores[j] = score_last
        rec_cutoffs[j] = np.linalg.norm(a_intended) * KLs[t]
        # Step size proportional to the current score margin.
        delta = max(score_last, 0.0) / KLs[t]
        delta_u = 10 * utils.finite_diff1(np.zeros(d), dec)
        if np.linalg.norm(delta_u) * KLs[t] > score_last:
            delta_u = delta * delta_u / np.linalg.norm(delta_u) / 5.0
        u_r = delta * delta_u
        s, _, _, _ = env.step(u_r)
        if visualize:
            env.render()
        score_last = ocs[t].decision_function([s])[0, 0]
        a_intended = agent.intended_action(s)
        j += 1
    rec_scores[j] = score_last
    rec_scores[j + 1:] = score_last
    rec_cutoffs[j:] = KLs[t] * np.linalg.norm(a_intended)
    if rec_info['triggered']:
        print "\t\tRecovery was activated, stopped at t: " + str(
            t) + ", j: " + str(j)
    if j == max_rec:
        rec_info['reached_max'] = True
    rec_info['score_last'] = score_last
    rec_info['rec_scores'] = rec_scores
    rec_info['rec_cutoffs'] = rec_cutoffs
    rec_info['a_intended'] = a_intended
    return s, score_last, rec_info
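# NOTE (illustrative sketch, not part of the original source): one way
# finite_diff_loop could slot into a rollout, assuming env/sim/agent/ocs/KLs
# are set up as in collect_rec_finite_diff above. This mirrors that
# function's trigger condition but is not the original caller.
def _rollout_with_recovery_sketch(env, sim, agent, ocs, T, opt, KLs):
    s = env.reset()
    for t in range(T):
        score = ocs[t].decision_function([s])[0, 0]
        # Trigger recovery when the score is positive but below the
        # per-step cutoff KLs[t] * ||a_intended||.
        if 0 < score < KLs[t] * np.linalg.norm(agent.intended_action(s)):
            s, score, rec_info = finite_diff_loop(
                s, t, score, env, sim, agent, ocs, T, opt, KLs)
            if rec_info['reached_max']:
                break  # could not recover within max_rec steps
        s, _, done, _ = env.step(agent.sample_action(s))
        if done:
            break
    return s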