def _test_mux_rnn_policies(policy_dict, env_list, n_envs):
    env = hex_env.Hexapod(env_list, max_n_envs=n_envs)
    env.env_change_prob = 1
    env.max_steps = env.max_steps
    classifier = T.load(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                     "data/classifier.p"), map_location='cpu')

    # Test visually
    while True:
        current_idx = 0
        s = env.reset()
        h_p = None
        h_c = None
        episode_reward = 0
        with T.no_grad():
            for i in range(env.max_steps * 2):
                env_idx, h_c = classifier((my_utils.to_tensor(s, True).unsqueeze(0), h_c))
                env_idx = T.argmax(env_idx[0][0]).numpy()

                # Reset the policy hidden state whenever the classifier switches environment
                if env_idx != current_idx:
                    current_idx = env_idx
                    h_p = None
                    print("Changing policy to: {}".format(env_list[env_idx]))

                act, h_p = policy_dict[env_list[env_idx]]((my_utils.to_tensor(s, True).unsqueeze(0), h_p))
                s, r, done, _ = env.step(act[0][0].numpy())
                episode_reward += r
                env.render()
                print("Env classification: {}".format(env_list[env_idx]))
        print("Episode reward: {}".format(episode_reward))
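# Hedged usage sketch (assumption, not part of the original code): the mux testers in this
# file expect `env_list` to name the terrain types and `policy_dict` to map each name to a
# loaded policy. The terrain names and checkpoint file layout below are illustrative only;
# `T` and `os` are assumed to be imported at module level as elsewhere in this file.
def _load_policy_dict_sketch(env_list=("flat", "tiles", "stairs")):
    policy_dict = {}
    for name in env_list:
        # Hypothetical checkpoint path; adjust to the actual naming scheme used for training
        path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/{}_policy.p".format(name))
        policy_dict[name] = T.load(path, map_location='cpu')
    return list(env_list), policy_dict

# Example call (assumed entry point):
# env_list, policy_dict = _load_policy_dict_sketch()
# _test_mux_rnn_policies(policy_dict, env_list, n_envs=3)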
def test_classifier_reactive_policies(policy_dict, env_list):
    env = hex_env.Hexapod(env_list)
    env.env_change_prob = 1
    env.max_steps = 600
    classifier = T.load("classifier_A.p", map_location='cpu')

    # Test visually
    while True:
        current_env = "flat"
        env_idx = np.random.randint(0, 3)
        rnd_idx = np.random.randint(0, 3)
        s = env.reset()
        h_c = None
        episode_reward = 0
        with T.no_grad():
            for i in range(env.max_steps * 2):
                env_idx, h_c = classifier((my_utils.to_tensor(s, True).unsqueeze(0), h_c))
                #print(env_idx)
                env_idx = T.argmax(env_idx[0][0]).numpy()

                if np.random.rand() < 0.01:
                    rnd_idx = np.random.randint(0, 3)

                act = policy_dict[env_list[env_idx]](my_utils.to_tensor(s, True))
                s, r, done, _ = env.step(act[0].numpy())
                episode_reward += r
                env.render()
                print("Env classification: {}".format(env_list[env_idx]))
        print("Episode reward: {}".format(episode_reward))
def test(env_list):
    env = hex_env.Hexapod(env_list)
    master = T.load("master_A.p", map_location='cpu')
    classifier = T.load("classifier_A.p", map_location='cpu')
    env.env_change_prob = 1.

    # Test visually
    while True:
        s = env.reset()
        h_m = None
        h_c = None
        episode_reward = 0
        with T.no_grad():
            for i in range(env.max_steps * 2):
                act, h_m = master((my_utils.to_tensor(s, True).unsqueeze(0), h_m))
                c, h_c = classifier((my_utils.to_tensor(s, True).unsqueeze(0), h_c))
                s, r, done, _ = env.step(act[0][0].numpy())
                episode_reward += r
                env.render()
                print("Env classification: {}".format(env_list[T.argmax(c[0][0]).numpy()]))
        print("Episode reward: {}".format(episode_reward))
def _test_mux_reactive_policies(policy_dict, env_list, n_envs, ID='def'):
    import cv2

    def printval(values):
        # Overlay the classifier's per-environment probabilities in a small cv2 window
        img = np.zeros((90, 200, 3), dtype=np.uint8)
        a_idx = np.argmax(values)
        cv2.putText(img, 'p_{}'.format(env_list[0]) + '{0:.2f}'.format(values[0]), (10, 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255 * int(a_idx != 0), 255, 0), 1, cv2.LINE_AA)
        cv2.putText(img, 'p_{}'.format(env_list[1]) + '{0:.2f}'.format(values[1]), (10, 45),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255 * int(a_idx != 1), 255, 0), 1, cv2.LINE_AA)
        cv2.putText(img, 'p_{}'.format(env_list[2]) + '{0:.2f}'.format(values[2]), (10, 70),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255 * int(a_idx != 2), 255, 0), 1, cv2.LINE_AA)
        cv2.imshow('classification', img)
        cv2.waitKey(1)

    env = hex_env.Hexapod(env_list, max_n_envs=3, specific_env_len=25, s_len=200, walls=False)
    env.env_change_prob = 1
    classifier = T.load(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                     "data/classifier_{}.p".format(ID)), map_location='cpu')

    # Test visually
    while True:
        s = env.reset()
        h_c = None
        episode_reward = 0
        with T.no_grad():
            for i in range(env.max_steps + 400):
                env_dist, h_c = classifier((my_utils.to_tensor(s, True).unsqueeze(0), h_c))
                env_softmax = T.softmax(env_dist, 2)[0][0].numpy()
                env_idx = T.argmax(env_dist[0][0]).numpy()
                printval(env_softmax)

                act = policy_dict[env_list[env_idx]](my_utils.to_tensor(s, True))
                s, r, done, _ = env.step(act[0].numpy())
                episode_reward += r
                env.render()
                #print("Env classification: {}".format(env_list[env_idx]))
        print("Episode reward: {}".format(episode_reward))
def test(self, policy, render=True, N=30, seed=None):
    # obs = np.array([1, 0, 0] * 3 + [0, 1, 0] * 3 + [0, 0, 0, 1])
    # action = policy(my_utils.to_tensor(obs, True)).detach()
    # exit()
    if seed is not None:
        self.setseed(seed)
    self.env_change_prob = 1
    rew = 0
    vel_rew = 0
    dist_rew = 0
    for i in range(N):
        obs = self.reset()
        cr = 0
        # Note: vr and dr are not updated in this variant, so vel_rew and dist_rew stay zero
        vr = 0
        dr = 0
        for j in range(int(self.max_steps)):
            #obs[0:18] = obs[0:18] + np.random.randn(18) * 0.3
            action = policy(my_utils.to_tensor(obs, True)).detach()
            obs, r, done, _ = self.step(action[0].numpy(), render=True)
            cr += r
        rew += cr
        vel_rew += vr
        dist_rew += dr
        if render:
            print("Total episode reward: {}".format(cr))
    if render:
        print("Total average reward = {}".format(rew / N))
    return rew / N, vel_rew / N, dist_rew / N
def f(w):
    rewards = []
    done = False
    obs, _ = env.reset()

    # Load the candidate parameter vector into the policy network
    vector_to_parameters(torch.from_numpy(w).float(), policy.parameters())

    while not done:
        # Get action from policy
        with torch.no_grad():
            act = policy(my_utils.to_tensor(obs, True))

        # Step environment
        obs, rew, done, od = env.step(act.squeeze(0).numpy())

        if animate:
            env.render()

        rewards.append(od['rV'])

    # Fitness: sum of reward components, penalized by their mean absolute deviation
    r = 0
    for rew in rewards:
        rew_arr = np.array(rew)
        r += rew_arr.sum() - np.abs(rew_arr - rew_arr.mean()).mean()
    return -r
def f(w):
    reward_total = 0
    reps = 1

    # Load the candidate parameter vector into the policy network
    vector_to_parameters(torch.from_numpy(w).float(), policy.parameters())

    for i in range(reps):
        reward = 0
        done = False
        obs = env.reset()
        h_0 = policy.init_hidden()
        while not done:
            # Get action from policy
            with torch.no_grad():
                act, h_1 = policy((my_utils.to_tensor(obs, True), h_0))

            # Step environment
            act = act.squeeze(0).numpy()
            #act = np.array([-1,0])
            obs, rew, done, _ = env.step(act)

            if animate:
                env.render()

            reward += rew
            h_0 = h_1
        reward_total += reward

    return -reward_total / reps
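# Hedged usage sketch (assumption, not part of the original code): both f(w) fitness
# functions above map a flat parameter vector to a negated episode return, which is the
# interface expected by black-box optimizers such as CMA-ES. Assuming the `cma` (pycma)
# package is installed and `policy` / `f` are defined as above, an ask/tell loop could
# look like this; sigma0 and max_iters are illustrative values.
def _es_optimize_sketch(policy, f, sigma0=0.5, max_iters=100):
    import cma
    import numpy as np
    from torch.nn.utils import parameters_to_vector

    # Start the search from the policy's current parameters
    w0 = parameters_to_vector(policy.parameters()).detach().numpy()
    es = cma.CMAEvolutionStrategy(w0, sigma0, {'maxiter': max_iters})
    while not es.stop():
        candidates = es.ask()
        # f already returns a cost (negative reward), so no sign flip is needed here
        es.tell(candidates, [f(np.array(w)) for w in candidates])
        es.disp()
    return es.result.xbest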
def test_record_hidden(self, policy):
    self.reset()
    h_episodes = []
    for i in range(10):
        h_list = []
        obs = self.reset()
        h = None
        cr = 0
        for j in range(self.max_steps * 2):
            action, h = policy((my_utils.to_tensor(obs, True), h))
            obs, r, done, od = self.step(action[0].detach().numpy())
            cr += r
            time.sleep(0.001)
            self.render()
            h_list.append(h[0].detach().numpy())
        print("Total episode reward: {}".format(cr))
        h_arr = np.concatenate(h_list)
        h_episodes.append(h_arr)

    h_episodes_arr = np.stack(h_episodes)

    # Save hidden states
    filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/{}_states.npy".format(self.env_name))
    np.save(filename, h_episodes_arr)
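# Hedged sketch (assumption, not part of the original code): one way to inspect the hidden
# states saved by test_record_hidden is a 2-D PCA projection. The file name mirrors the save
# path above; scikit-learn and matplotlib are assumed available, and the reshape assumes the
# saved array's trailing axis is the hidden dimension (the exact layout depends on the
# policy's hidden-state format).
def _plot_hidden_states_sketch(env_name):
    import os
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA

    path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                        "data/{}_states.npy".format(env_name))
    h = np.load(path)
    flat = h.reshape(-1, h.shape[-1])  # collapse episodes and steps into one sample axis
    proj = PCA(n_components=2).fit_transform(flat)
    plt.scatter(proj[:, 0], proj[:, 1], s=2)
    plt.title("RNN hidden states ({})".format(env_name))
    plt.show()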
def test(self, policy, render=True, N=30, seed=None):
    if seed is not None:
        self.setseed(seed)
    self.env_change_prob = 1
    rew = 0
    vel_rew = 0
    dist_rew = 0
    for i in range(N):
        obs = self.reset()
        cr = 0
        vr = 0
        dr = 0
        for j in range(int(self.max_steps)):
            action = policy(my_utils.to_tensor(obs, True)).detach()
            obs, r, done, (r_v, r_d) = self.step(action[0].numpy())
            cr += r
            vr += r_v
            dr = max(dr, r_d)
            time.sleep(0.000)
            if render:
                self.render()
        rew += cr
        vel_rew += vr
        dist_rew += dr
        if render:
            print("Total episode reward: {}".format(cr))
    if render:
        print("Total average reward = {}".format(rew / N))
    return rew / N, vel_rew / N, dist_rew / N
def make_rollout(env, policy):
    obs = env.reset()
    observations = []
    clean_actions = []
    noisy_actions = []
    rewards = []
    step_ctr_list = []
    episode_rew = 0
    step_ctr = 0
    while True:
        step_ctr_list.append(step_ctr)
        observations.append(obs)

        clean_act, noisy_act = policy.sample_action(my_utils.to_tensor(obs, True))
        clean_act = clean_act.squeeze(0).detach().numpy()
        noisy_act = noisy_act.squeeze(0).detach().numpy()

        obs, r, done, _ = env.step(noisy_act)

        if abs(r) > 5:
            logging.warning("Warning! high reward ({})".format(r))

        step_ctr += 1
        episode_rew += r

        if config["animate"]:
            env.render()

        clean_actions.append(clean_act)
        noisy_actions.append(noisy_act)
        rewards.append(r)

        if done:
            break

    terminals = [False] * len(observations)
    terminals[-1] = True
    return observations, clean_actions, noisy_actions, rewards, terminals, step_ctr_list
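# Hedged sketch (assumption, not part of the original code): make_rollout returns per-step
# lists, so a training loop would typically collect several rollouts and stack them into
# tensors before an update. `batchsize` and the tensor layout below are illustrative.
def _collect_batch_sketch(env, policy, batchsize=8):
    import numpy as np
    import torch

    batch_obs, batch_acts, batch_rews, batch_terminals = [], [], [], []
    for _ in range(batchsize):
        observations, clean_actions, noisy_actions, rewards, terminals, _ = make_rollout(env, policy)
        batch_obs.extend(observations)
        batch_acts.extend(noisy_actions)
        batch_rews.extend(rewards)
        batch_terminals.extend(terminals)

    # Stack into (total_steps, dim) float tensors; terminals become a boolean column vector
    obs_t = torch.tensor(np.array(batch_obs), dtype=torch.float32)
    act_t = torch.tensor(np.array(batch_acts), dtype=torch.float32)
    rew_t = torch.tensor(np.array(batch_rews), dtype=torch.float32).unsqueeze(1)
    term_t = torch.tensor(batch_terminals, dtype=torch.bool).unsqueeze(1)
    return obs_t, act_t, rew_t, term_t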
def test_record(self, policy, ID):
    episode_states = []
    episode_acts = []
    for i in range(10):
        s = self.reset()
        cr = 0
        states = []
        acts = []
        for j in range(self.max_steps):
            states.append(s)
            action = policy(my_utils.to_tensor(s, True)).detach()[0].numpy()
            acts.append(action)
            s, r, done, od = self.step(action)
            cr += r
        episode_states.append(np.concatenate(states))
        episode_acts.append(np.concatenate(acts))
        print("Total episode reward: {}".format(cr))

    np_states = np.concatenate(episode_states)
    np_acts = np.concatenate(episode_acts)

    np.save(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "data/{}_states.npy".format(ID)), np_states)
    np.save(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "data/{}_acts.npy".format(ID)), np_acts)
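# Hedged sketch (assumption, not part of the original code): because test_record saves the
# recordings as flattened 1-D arrays via np.concatenate, a consumer needs the observation
# and action dimensions to recover per-step rows, e.g. before fitting a policy by
# regression. `obs_dim` and `act_dim` are assumed to be known; `np` and `os` are assumed
# imported at module level as elsewhere in this file.
def _load_recording_sketch(ID, obs_dim, act_dim):
    states = np.load(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "data/{}_states.npy".format(ID))).reshape(-1, obs_dim)
    acts = np.load(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                "data/{}_acts.npy".format(ID))).reshape(-1, act_dim)
    return states, acts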
def test_recurrent(self, policy):
    self.env_change_prob = 1
    self.reset()
    h_episodes = []
    for i in range(10):
        self.difficulty = 1.5
        h_list = []
        obs = self.reset()
        h = None
        cr = 0
        for j in range(self.max_steps * 3):
            action, h = policy((my_utils.to_tensor(obs, True).unsqueeze(0), h))
            obs, r, done, od = self.step(action[0, 0].detach().numpy() + np.random.randn(self.act_dim) * 0.1)
            cr += r
            time.sleep(0.001)
            self.render()
            h_list.append(h[0][:, 0, :].detach().numpy())
        print("Total episode reward: {}".format(cr))
        h_arr = np.stack(h_list)
        h_episodes.append(h_arr)

    h_episodes_arr = np.stack(h_episodes)

    # Save hidden states
    filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/{}_states.npy".format(self.env_name))
def test_recurrent(self, policy):
    self.env_change_prob = 1
    self.reset()
    h_episodes = []
    N = 20
    rew = 0
    for i in range(N):
        h_list = []
        obs = self.reset()
        h = None
        cr = 0
        for j in range(self.max_steps):
            action, h = policy((my_utils.to_tensor(obs, True).unsqueeze(0), h))
            obs, r, done, od = self.step(action[0].detach().numpy())
            cr += r
            rew += r
            time.sleep(0.001)
            self.render()
            #h_list.append(h[0][:,0,:].detach().numpy())
        print("Total episode reward: {}".format(cr))
        #h_arr = np.stack(h_list)
        #h_episodes.append(h_arr)

    print("Total average reward = {}".format(rew / N))
    exit()

    # Leftover hidden-state saving code; unreachable after the exit() above
    h_episodes_arr = np.stack(h_episodes)

    # Save hidden states
    filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/{}_states.npy".format(self.env_name))
def test_recurrent(self, policy, render=True, N=30, seed=None):
    if seed is not None:
        np.random.seed(seed)
    self.env_change_prob = 1
    rew = 0
    vel_rew = 0
    dist_rew = 0
    for i in range(N):
        obs = self.reset()
        h = None
        cr = 0
        vr = 0
        dr = 0
        for j in range(self.max_steps):
            action, h = policy((my_utils.to_tensor(obs, True).unsqueeze(0), h))
            obs, r, done, (r_v, r_d) = self.step(action[0].detach().numpy())
            cr += r
            vr += r_v
            dr = max(dr, r_d)
            time.sleep(0.000)
            if render:
                self.render()
        rew += cr
        vel_rew += vr
        dist_rew += dr
        if render:
            print("Total episode reward: {}".format(cr))
    return rew / N, vel_rew / N, dist_rew / N
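# Hedged usage sketch (assumption, not part of the original code): because test_recurrent
# above accepts a seed, two recurrent policies can be evaluated on the same sequence of
# generated terrains. The helper name and policy variables below are illustrative.
def _compare_recurrent_policies_sketch(env, policy_a, policy_b, seed=42, N=30):
    rew_a, vel_a, dist_a = env.test_recurrent(policy_a, render=False, N=N, seed=seed)
    rew_b, vel_b, dist_b = env.test_recurrent(policy_b, render=False, N=N, seed=seed)
    print("A: rew={:.2f} vel={:.2f} dist={:.2f}".format(rew_a, vel_a, dist_a))
    print("B: rew={:.2f} vel={:.2f} dist={:.2f}".format(rew_b, vel_b, dist_b))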
def test_adapt(self, p1, p2, ID):
    self.env_list = ["flatpipe"]

    episode_states = []
    episode_acts = []
    ctr = 0
    while ctr < 1000:
        print("Iter: {}".format(ctr))
        current_policy_name = "p1"
        rnd_x = -0.1 + np.random.rand() * 0.3 + np.random.randint(0, 2) * 1.2
        s = self.reset(init_pos=np.array([rnd_x, 0, 0]))
        cr = 0
        states = []
        acts = []

        policy = p1
        for j in range(self.max_steps):
            x = self.sim.get_state().qpos.tolist()[0]

            # Switch to p2 while the torso x-position is inside the pipe section, back to p1 otherwise
            if 2.2 > x > 0.8 and current_policy_name == "p1":
                policy = p2
                current_policy_name = "p2"
                print("Policy switched to p2")

            if not (2.2 > x > 0.8) and current_policy_name == "p2":
                policy = p1
                current_policy_name = "p1"
                print("Policy switched to p1")

            states.append(s)
            action = policy(my_utils.to_tensor(s, True)).detach()[0].numpy()
            acts.append(action)
            s, r, done, od = self.step(action)
            cr += r
            #self.render()

        # Keep only successful episodes
        if cr < 50:
            continue
        ctr += 1

        episode_states.append(np.stack(states))
        episode_acts.append(np.stack(acts))

        print("Total episode reward: {}".format(cr))

    np_states = np.stack(episode_states)
    np_acts = np.stack(episode_acts)

    np.save(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "data/states_{}.npy".format(ID)), np_states)
    np.save(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "data/acts_{}.npy".format(ID)), np_acts)
def test(self, policy):
    for i in range(100):
        obs = self.reset()
        cr = 0
        for j in range(self.max_steps):
            action = policy(my_utils.to_tensor(obs, True)).detach()
            obs, r, done, od = self.step(action[0].numpy())
            cr += r
            time.sleep(0.001)
            self.render()
        print("Total episode reward: {}".format(cr))
def test(self, policy):
    self.reset()
    for i in range(100):
        done = False
        obs, _ = self.reset()
        cr = 0
        while not done:
            action = policy(my_utils.to_tensor(obs, True)).detach()
            obs, r, done, od = self.step(action[0])
            cr += r
            time.sleep(0.001)
        print("Total episode reward: {}".format(cr))
def test(self, policy):
    #self.envgen.load()
    for i in range(100):
        obs = self.reset(test=True)
        cr = 0
        for j in range(self.max_steps):
            action = policy(my_utils.to_tensor(obs, True)).detach()
            #print(action[0, :-self.mem_dim])
            obs, r, done, od = self.step(action[0])
            cr += r
            time.sleep(0.001)
            self.render()
        print("Total episode reward: {}".format(cr))
def test_recurrent(self, policy):
    self.reset()
    for i in range(100):
        obs = self.reset()
        h = None
        cr = 0
        for j in range(self.max_steps):
            action, h = policy((my_utils.to_tensor(obs, True).unsqueeze(0), h))
            obs, r, done, od = self.step(action[0, 0].detach().numpy())
            cr += r
            time.sleep(0.001)
            self.render()
        print("Total episode reward: {}".format(cr))
def test(self, policy):
    #self.envgen.load()
    self.env_change_prob = 1
    for i in range(100):
        obs = self.reset()
        done = False
        cr = 0
        while not done:
            action = policy(my_utils.to_tensor(obs, True)).detach()
            obs, r, done, od = self.step(action[0].numpy())
            cr += r
            time.sleep(0.001)
            self.render()
        print("Total episode reward: {}".format(cr))
def test_agent(env, policy):
    for _ in range(100):
        obs = env.reset()
        cum_rew = 0
        while True:
            action, noisy_action = policy.sample_action(my_utils.to_tensor(obs, True))
            obs, reward, done, info = env.step(action.detach().squeeze(0).numpy())
            cum_rew += reward
            env.render()
            if done:
                print(cum_rew)
                break
    env.close()
def test_recurrent(self, policy):
    self.reset()
    for i in range(100):
        done = False
        obs, _ = self.reset()
        h = policy.init_hidden()
        cr = 0
        while not done:
            action, h_ = policy((my_utils.to_tensor(obs, True), h))
            h = h_
            obs, r, done, od = self.step(action[0].detach())
            cr += r
            time.sleep(0.001)
            self.render()
        print("Total episode reward: {}".format(cr))
def test(self, policy, render=True):
    N = 30
    rew = 0
    for i in range(N):
        obs = self.reset()
        cr = 0
        for j in range(int(self.max_steps)):
            action = policy(my_utils.to_tensor(obs, True)).detach()
            obs, r, done, od = self.step(action[0].numpy())
            cr += r
            rew += r
            time.sleep(0.000)
            if render:
                self.render()
        print("Total episode reward: {}".format(cr))
    print("Total average reward = {}".format(rew / N))
def test(self, policy):
    #self.envgen.load()
    self.env_change_prob = 1
    for i in range(100):
        obs = self.reset()
        cr = 0
        for j in range(int(self.max_steps * 1.5)):
            action = policy(my_utils.to_tensor(obs, True)).detach()
            obs, r, done, od = self.step(action[0].numpy())
            cr += r
            time.sleep(0.001)
            self.render()
            # End the episode early once the agent is within 0.15 of the goal position
            if np.sqrt((self.prev_xy[0] - self.goal_xy[0])**2 + (self.prev_xy[1] - self.goal_xy[1])**2) < 0.15:
                break
        print("Total episode reward: {}".format(cr))
def test_recurrent(self, policy):
    total_rew = 0
    self.render_prob = 1.0
    for i in range(100):
        obs = self.reset()
        h = None
        cr = 0
        for j in range(self.max_steps):
            action, h_ = policy((my_utils.to_tensor(obs, True), h))
            h = h_
            obs, r, done, od = self.step(action[0].detach().numpy())
            cr += r
            total_rew += r
            time.sleep(0.001)
            self.render()
        print("Total episode reward: {}".format(cr))
    print("Total reward: {}".format(total_rew))
def test_recurrent(self, policy, slow=True, seed=None):
    if seed is not None:
        np.random.seed(seed)
    total_rew = 0
    for i in range(100):
        obs = self.reset()
        h = None
        cr = 0
        for j in range(self.max_steps):
            action, h = policy((my_utils.to_tensor(obs, True).unsqueeze(0), h))
            obs, r, done, od = self.step(action[0][0].detach().numpy())
            cr += r
            total_rew += r
            if slow:
                time.sleep(0.01)
        print("Total episode reward: {}".format(cr))
    print("Total reward: {}".format(total_rew))
def test(self, policy, slow=True, seed=None):
    if seed is not None:
        np.random.seed(seed)
    self.render_prob = 1.0
    total_rew = 0
    for i in range(100):
        obs = self.reset()
        cr = 0
        for j in range(self.max_steps):
            action = policy(my_utils.to_tensor(obs, True)).detach()
            obs, r, done, od = self.step(action[0].numpy())
            cr += r
            total_rew += r
            if slow:
                time.sleep(0.01)
        print("Total episode reward: {}".format(cr))
    print("Total reward: {}".format(total_rew))
def test_agent(self, policy):
    import src.my_utils as my_utils
    for _ in range(100):
        obs = self.reset()
        cum_rew = 0
        ctr = 0
        while True:
            torso_pos_prev, torso_quat_prev, _, _, joint_angles_prev, _, _, _, _, _ = self.get_obs()
            action, _ = policy.sample_action(my_utils.to_tensor(obs, True))
            obs, reward, done, info = self.step(action.detach().squeeze(0).numpy())
            cum_rew += reward
            self.render()

            # Every 10 steps, disable the position controller and replay the previous vs.
            # desired joint angles by resetting the joint states directly
            if ctr % 10 == 0 and ctr > 0 and True:
                p.setJointMotorControlArray(bodyUniqueId=self.robot,
                                            jointIndices=range(18),
                                            controlMode=p.POSITION_CONTROL,
                                            targetPositions=[0] * 18,
                                            forces=[0] * 18,
                                            physicsClientId=self.client_ID)

                joint_angles_desired = self.norm_to_rads(np.tanh(action.detach().squeeze(0).numpy() * 0.5))
                for _ in range(3):
                    [p.resetJointState(self.robot, k, joint_angles_prev[k], 0, physicsClientId=self.client_ID) for k in range(18)]
                    p.stepSimulation(physicsClientId=self.client_ID)
                    time.sleep(0.6)

                    [p.resetJointState(self.robot, k, joint_angles_desired[k], 0, physicsClientId=self.client_ID) for k in range(18)]
                    p.stepSimulation(physicsClientId=self.client_ID)
                    time.sleep(0.6)

                # Restore the previous joint configuration before continuing the rollout
                [p.resetJointState(self.robot, k, joint_angles_prev[k], 0, physicsClientId=self.client_ID) for k in range(18)]
                p.stepSimulation(physicsClientId=self.client_ID)

            ctr += 1
            if done:
                print(cum_rew)
                break
    self.close()
def test_recurrent(self, policy):
    self.reset()
    for i in range(100):
        done = False
        obs = self.reset()
        h = policy.init_hidden()
        cr = 0
        self.max_steps = 600
        import matplotlib.pyplot as plt
        #fig = plt.figure()
        acts = []
        while not done:
            action, h_ = policy((my_utils.to_tensor(obs, True), h))
            acts.append(action[0].detach())
            h = h_
            obs, r, done, od = self.step(action[0].detach())
            cr += r
            time.sleep(0.001)
            self.render()
        print("Total episode reward: {}".format(cr))
def make_rollout(self, policy):
    self.env.set_randomize_env(False)
    obs = self.env.reset()
    observations = []
    clean_actions = []
    noisy_actions = []
    rewards = []
    while True:
        observations.append(obs)

        clean_act, noisy_act = policy.sample_action(my_utils.to_tensor(obs, True))
        clean_act = clean_act.squeeze(0).detach().numpy()
        noisy_act = noisy_act.squeeze(0).detach().numpy()

        obs, r, done, _ = self.env.step(noisy_act)

        clean_actions.append(clean_act)
        noisy_actions.append(noisy_act)
        rewards.append(r)

        if done:
            break

    terminals = [False] * len(observations)
    terminals[-1] = True
    return observations, clean_actions, noisy_actions, rewards, terminals