def step(self, ctrl):
    ctrl = self.scale_action(ctrl)
    self.sim.data.ctrl[:] = ctrl
    self.sim.forward()
    self.sim.step()
    self.step_ctr += 1

    obs = self.get_obs()

    # Torso pose and velocities
    x, y, z, qw, qx, qy, qz = obs[:7]
    xd, yd, zd, thd, phid, psid = self.sim.get_state().qvel.tolist()[:6]

    # Reward conditions: track the commanded heading at the target speed
    target_vel = 0.25
    t_x = target_vel * np.cos(self.target_yaw)
    t_y = target_vel * np.sin(self.target_yaw)

    velocity_rew_x = 1. / (abs(xd - t_x) + 1.) - 1. / (t_x + 1.)
    velocity_rew_y = 1. / (abs(yd - t_y) + 1.) - 1. / (t_y + 1.)

    roll, pitch, yaw = my_utils.quat_to_rpy([qw, qx, qy, qz])

    # Yaw deviation from the target heading, accounting for 2*pi wrap-around
    yaw_pen = np.min((abs((yaw % (2 * np.pi)) - (self.target_yaw % (2 * np.pi))),
                      abs(yaw - self.target_yaw)))

    r_pos = velocity_rew_x * 6 + velocity_rew_y * 6
    r_neg = np.square(roll) * 0.5 + \
            np.square(pitch) * 0.5 + \
            np.square(zd) * 0.5 + \
            np.square(yaw_pen) * 0.2

    r_neg = np.clip(r_neg, 0, 1)
    r_pos = np.clip(r_pos, -2, 2)
    r = r_pos - r_neg
    self.episode_reward += r

    # Reevaluate termination condition
    done = self.step_ctr > self.max_steps

    # Binary foot contacts from the external contact forces on the six tibia bodies
    contacts = (np.abs(np.array(self.sim.data.cfrc_ext[[4, 7, 10, 13, 16, 19]])).sum(axis=1) > 0.05).astype(np.float32)

    obs = np.concatenate([self.scale_joints(self.sim.get_state().qpos.tolist()[7:]),
                          self.sim.get_state().qvel.tolist()[6:],
                          self.sim.get_state().qvel.tolist()[:6],
                          [roll, pitch, yaw, y],
                          [self.target_yaw],
                          contacts])

    # Move the heading-indicator body to the robot and point it along the target yaw
    self.model.body_quat[20] = my_utils.rpy_to_quat(0, 0, self.target_yaw)
    self.model.body_pos[20] = [x, y, 1]

    return obs, r, done, {}

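# --- Aside: a minimal sketch (not part of any environment class) of the
# wrap-around handling used for yaw_pen above. The two-term min approximates
# the shortest angular distance; a single closed form maps any difference
# straight into [-pi, pi]:
import numpy as np

def angle_diff(a, b):
    # Shortest signed difference between two angles, in radians
    return (a - b + np.pi) % (2 * np.pi) - np.pi

# abs(angle_diff(3.1, -3.1)) ~ 0.08, whereas the naive abs(3.1 - (-3.1)) = 6.2
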
def step(self, ctrl):
    ctrl = self.scale_action(ctrl)
    self.sim.data.ctrl[:] = ctrl
    self.sim.forward()
    self.sim.step()
    self.step_ctr += 1

    obs = self.get_obs()
    obs_dict = self.get_obs_dict()

    # Torso pose and velocities
    x, y, z, qw, qx, qy, qz = obs[:7]
    xd, yd, zd, thd, phid, psid = self.sim.get_state().qvel.tolist()[:6]

    contacts = (np.abs(np.array(self.sim.data.cfrc_ext[[4, 7, 10, 13, 16, 19]])).sum(axis=1) > 0.05).astype(np.float32)

    # Reward conditions: velocity term normalized by the commanded speed
    target_vel = self.target_criteria_dict["vel"]
    velocity_rew = (1. / (abs(xd - target_vel) + 1.) - 1. / (target_vel + 1.)) * (1. / target_vel)

    roll, pitch, yaw = my_utils.quat_to_rpy([qw, qx, qy, qz])

    r_pos = velocity_rew
    r_neg = np.square(self.target_criteria_dict["height"] - z) * 100 + \
            np.square(roll) * 1. + \
            np.square(pitch) * 1. + \
            np.square(zd) * 1. + \
            np.square(yaw) * 5. + \
            np.square(y) * 1.

    r_neg = np.clip(r_neg, 0, 2.)
    r_pos = np.clip(r_pos, -2, 2)
    r = r_pos - r_neg

    # Reevaluate termination condition
    done = self.step_ctr > self.max_steps  # or abs(roll) > 2 or abs(pitch) > 2

    obs = np.concatenate([self.scale_joints(self.sim.get_state().qpos.tolist()[7:]),
                          self.sim.get_state().qvel.tolist()[6:],
                          self.sim.get_state().qvel.tolist()[:6],
                          self.target_criteria_norm,
                          [roll, pitch, yaw, y],
                          contacts])

    return obs, r, done, obs_dict

def step(self, ctrl):
    ctrl = self.scale_action(ctrl)
    self.sim.data.ctrl[:] = ctrl
    self.sim.forward()
    self.sim.step()
    self.step_ctr += 1

    obs = self.get_obs()

    # Torso pose and velocities
    x, y, z, qw, qx, qy, qz = obs[:7]
    xd, yd, zd, thd, phid, psid = self.sim.get_state().qvel.tolist()[:6]

    # Reward conditions
    target_vel = 0.30
    velocity_rew = 1. / (abs(xd - target_vel) + 1.) - 1. / (target_vel + 1.)

    roll, pitch, yaw = my_utils.quat_to_rpy([qw, qx, qy, qz])

    r_pos = velocity_rew * 5
    r_neg = np.square(self.sim.data.actuator_force).mean() * 0.00001 + \
            np.square(ctrl).mean() * 0.01 + \
            np.square(roll) * 0.1 + \
            np.square(pitch) * 0.1 + \
            np.square(yaw) * 0.3 + \
            np.square(y) * 0.1

    r_neg = np.clip(r_neg, 0, 1)
    r_pos = np.clip(r_pos, -2, 2)
    r = r_pos - r_neg
    self.episode_reward += r

    # Reevaluate termination condition
    done = self.step_ctr > self.max_steps

    contacts = (np.abs(np.array(self.sim.data.cfrc_ext[[4, 7, 10, 13, 16, 19]])).sum(axis=1) > 0.05).astype(np.float32)

    obs = np.concatenate([self.sim.get_state().qpos.tolist()[7:],
                          self.sim.get_state().qvel.tolist()[6:],
                          self.sim.get_state().qvel.tolist()[:6],
                          [roll, pitch, yaw, y],
                          contacts])

    return obs, r, done, {}

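# --- Aside: the velocity term shared by these step() variants, isolated as a
# sketch. It peaks at v/(v + 1) when the measured speed matches the target, is
# exactly zero at standstill, and saturates towards -1/(v + 1) as the error
# grows, so both crawling and overshooting are discouraged:
def velocity_reward(xd, target_vel):
    return 1. / (abs(xd - target_vel) + 1.) - 1. / (target_vel + 1.)

# velocity_reward(0.30, 0.30) ~ 0.23   (maximum)
# velocity_reward(0.00, 0.30) == 0.0   (standstill)
# velocity_reward(2.00, 0.30) ~ -0.40  (large overshoot is also penalized)
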
def step(self, ctrl):
    # L1 distance to the goal before stepping the simulation
    goal_dist_pre = np.abs(self.goal_pos - self.sim.get_state().qpos.tolist()[:2]).sum()

    # Split the policy output into a recurrent memory part and motor commands
    mem = ctrl[-self.mem_dim:]
    act = ctrl[:-self.mem_dim]
    sact = self.scale_action(act)

    self.sim.data.ctrl[:] = sact
    self.sim.step()
    self.step_ctr += 1

    goal_dist_post = np.abs(self.goal_pos - self.sim.get_state().qpos.tolist()[:2]).sum()

    obs_c = self.get_obs()
    obs_dict = self.get_obs_dict()

    x, y, z, qw, qx, qy, qz = obs_c[:7]
    _, _, yaw = my_utils.quat_to_rpy((qw, qx, qy, qz))

    # Heading error towards the goal (unwrapped difference)
    target_angle = np.arctan2(self.goal_pos[1] - y, self.goal_pos[0] - x)
    target_err = abs(yaw - target_angle)

    # Reevaluate termination condition.
    done = self.step_ctr > self.max_steps  # or obs_dict['torso_contact'] > 0.1

    xd, yd, _, _, _, _ = obs_dict["root_vel"]
    ctrl_effort = np.square(ctrl[0:8]).mean() * 0.00  # effort penalty currently disabled
    target_progress = (goal_dist_pre - goal_dist_post) * 50

    obs = np.concatenate((self.goal_pos - obs_c[:2],
                          obs_c[2:],
                          obs_dict["contacts"],
                          [obs_dict['torso_contact']],
                          mem))

    r = np.clip(target_progress, -1, 1) - ctrl_effort - min(target_err, 3) * 0.1

    return obs, r, done, obs_dict

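# --- Aside: a sketch of why the distance-difference reward above is well
# behaved. Rewarding the per-step *decrease* in goal distance is a
# potential-based shaping, so over an episode the progress terms telescope to
# (initial distance - final distance) regardless of the path taken:
import numpy as np

def progress_reward(goal_pos, xy_pre, xy_post, scale=50.):
    # Same L1 (Manhattan) distance as goal_dist_pre/goal_dist_post above
    d_pre = np.abs(np.asarray(goal_pos) - np.asarray(xy_pre)).sum()
    d_post = np.abs(np.asarray(goal_pos) - np.asarray(xy_post)).sum()
    return (d_pre - d_post) * scale
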
def step(self, ctrl):
    ctrl = self.scale_action(ctrl)
    self.sim.data.ctrl[:] = ctrl
    self.sim.forward()
    self.sim.step()
    self.step_ctr += 1

    obs = self.get_obs()
    obs_dict = self.get_obs_dict()

    # Torso pose and velocities
    x, y, z, qw, qx, qy, qz = obs[:7]
    xd, yd, zd, thd, phid, psid = self.sim.get_state().qvel.tolist()[:6]

    # Reward conditions
    ctrl_effort = np.square(ctrl).sum()  # computed but not used in the reward below
    target_vel = 0.25
    velocity_rew = 1. / (abs(xd - target_vel) + 1.) - 1. / (target_vel + 1.)

    roll, pitch, yaw = my_utils.quat_to_rpy([qw, qx, qy, qz])

    r = velocity_rew * 5 - np.square(self.sim.data.actuator_force).mean() * 0.0001
    r = np.clip(r, -2, 2)
    self.episode_reward += r

    # Reevaluate termination condition
    done = self.step_ctr > self.max_steps  # or abs(roll) > 2 or abs(pitch) > 2

    obs = np.concatenate([np.array(self.sim.get_state().qpos.tolist()[7:]),
                          [roll, pitch, yaw, xd, yd, thd, phid],
                          obs_dict["contacts"]])

    return obs, r, done, obs_dict

def step(self, ctrl):
    ctrl = np.clip(ctrl, -1, 1)
    ctrl_pen = np.square(ctrl).mean()
    ctrl = self.scale_action(ctrl)

    self.sim.data.ctrl[:] = ctrl
    self.sim.forward()
    self.sim.step()
    self.step_ctr += 1

    # Occasional ground-friction randomization (disabled):
    # if np.random.rand() < 0.002:
    #     self.model.geom_friction[0] = np.array([0.0001, 0.5, 0.5])

    obs = self.get_obs()

    # Torso pose and velocities
    x, y, z, qw, qx, qy, qz = obs[:7]
    xd, yd, zd, thd, phid, psid = self.sim.get_state().qvel.tolist()[:6]

    self.vel_sum += xd

    # Reward conditions: forward velocity, attenuated by lateral drift
    target_vel = 0.4
    velocity_rew = 1. / (abs(xd - target_vel) + 1.) - 1. / (target_vel + 1.)
    velocity_rew = velocity_rew * (1 / (1 + 30 * np.square(yd)))

    roll, pitch, _ = my_utils.quat_to_rpy([qw, qx, qy, qz])
    q_yaw = 2 * acos(qw)  # total rotation angle; a yaw proxy while the torso stays level

    r_neg = np.square(y) * 0.2 + \
            np.square(q_yaw) * 0.5 + \
            np.square(pitch) * 0.5 + \
            np.square(roll) * 0.5 + \
            ctrl_pen * 0.01 + \
            np.square(zd) * 0.7

    r_pos = velocity_rew * 6
    r = r_pos - r_neg

    # Reevaluate termination condition
    done = self.step_ctr > self.max_steps  # or abs(y) > 0.3 or abs(yaw) > 0.6 or abs(roll) > 0.8 or abs(pitch) > 0.8

    contacts = (np.abs(np.array(self.sim.data.cfrc_ext[[4, 7, 10, 13, 16, 19]])).sum(axis=1) > 0.05).astype(np.float32)

    if self.use_HF:
        obs = np.concatenate([self.scale_joints(self.sim.get_state().qpos.tolist()[7:]),
                              self.sim.get_state().qvel.tolist()[6:],
                              self.sim.get_state().qvel.tolist()[:6],
                              [qw, qx, qy, qz, y],
                              contacts,
                              self.get_local_hf(x, y).flatten()])
    else:
        obs = np.concatenate([self.scale_joints(self.sim.get_state().qpos.tolist()[7:]),
                              self.sim.get_state().qvel.tolist()[6:],
                              self.sim.get_state().qvel.tolist()[:6],
                              [qw, qx, qy, qz, y],
                              contacts])

    return obs, r, done, {}

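# --- Aside: q_yaw = 2 * acos(qw) above is the quaternion's total rotation
# angle, which matches the yaw magnitude only while roll and pitch stay near
# zero, i.e. while the walker is upright. The exact ZYX yaw (what
# my_utils.quat_to_rpy is assumed to return as its third element) would be:
import numpy as np

def yaw_from_quat(qw, qx, qy, qz):
    # Standard ZYX (roll-pitch-yaw) extraction of the yaw component
    return np.arctan2(2.0 * (qw * qz + qx * qy), 1.0 - 2.0 * (qy * qy + qz * qz))
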
def step(self, ctrl):
    ctrl = self.scale_action(ctrl)
    self.sim.data.ctrl[:] = ctrl
    self.sim.forward()
    self.sim.step()
    self.step_ctr += 1

    obs = self.get_obs()

    # Torso pose and velocities
    x, y, z, qw, qx, qy, qz = obs[:7]
    xd, yd, zd, thd, phid, psid = self.sim.get_state().qvel.tolist()[:6]

    # Reward conditions: speed made good towards the goal
    target_vel = 0.25
    dprev = np.sqrt((self.prev_xy[0] - self.goal_xy[0]) ** 2 + (self.prev_xy[1] - self.goal_xy[1]) ** 2)
    dcur = np.sqrt((x - self.goal_xy[0]) ** 2 + (y - self.goal_xy[1]) ** 2)
    to_goal_vel = (dprev - dcur) * 50.
    velocity_rew = 1. / (abs(to_goal_vel - target_vel) + 1.) - 1. / (target_vel + 1.)

    roll, pitch, yaw = my_utils.quat_to_rpy([qw, qx, qy, qz])

    # Heading deviation towards the goal, accounting for 2*pi wrap-around
    tar_angle = np.arctan2(self.goal_xy[1] - y, self.goal_xy[0] - x)
    yaw_deviation = np.min((abs((yaw % (2 * np.pi)) - (tar_angle % (2 * np.pi))),
                            abs(yaw - tar_angle)))

    r_pos = velocity_rew * 3 + (self.prev_deviation - yaw_deviation) * 9
    r_neg = np.square(roll) * 1.2 + \
            np.square(pitch) * 1.2 + \
            np.square(zd) * 1.5

    self.prev_deviation = yaw_deviation

    r_neg = np.clip(r_neg, 0, 2)
    r_pos = np.clip(r_pos, -2, 2)
    r = r_pos - r_neg
    self.episode_reward += r

    # Terminate on timeout or once the goal is reached
    done = self.step_ctr > self.max_steps or dcur < 0.15

    self.prev_xy = [x, y]

    contacts = (np.abs(np.array(self.sim.data.cfrc_ext[[4, 7, 10, 13, 16, 19]])).sum(axis=1) > 0.05).astype(np.float32)

    obs = np.concatenate([self.scale_joints(self.sim.get_state().qpos.tolist()[7:]),
                          self.sim.get_state().qvel.tolist()[6:],
                          self.sim.get_state().qvel.tolist()[:6],
                          [roll, pitch, yaw, y],
                          [x - self.goal_xy[0], y - self.goal_xy[1]],
                          contacts])

    return obs, r, done, {}

def step(self, ctrl):
    # Split the policy output into an env-classification head and motor commands
    # (the head must be sliced off before ctrl is truncated)
    classif = ctrl[-self.n_envs:]
    ctrl = ctrl[:-self.n_envs]

    ctrl = np.clip(ctrl, -1, 1)
    ctrl = self.scale_action(ctrl)

    self.sim.data.ctrl[:] = ctrl
    self.sim.forward()
    self.sim.step()
    self.step_ctr += 1

    obs = self.get_obs()

    # Torso pose, velocities and accelerations
    x, y, z, qw, qx, qy, qz = obs[:7]
    xd, yd, zd, thd, phid, psid = self.sim.get_state().qvel.tolist()[:6]
    xa, ya, za, tha, phia, psia = self.sim.data.qacc.tolist()[:6]

    # Advance to the next terrain segment once its boundary is crossed
    if x > self.scaled_indeces_list[self.current_env_idx]:
        self.current_env_idx += 1
        self.current_env = self.envs[self.current_env_idx]

    # Auxiliary loss: how well does the policy recognize the current terrain?
    with T.no_grad():
        env_label = T.tensor(self.env_list.index(self.current_env)).unsqueeze(0)
        env_pred = T.tensor(classif).unsqueeze(0)
        ce_loss = self.CE_loss(env_pred, env_label)

    # Reward conditions
    target_vel = 0.3
    velocity_rew = 1. / (abs(xd - target_vel) + 1.) - 1. / (target_vel + 1.)

    roll, pitch, yaw = my_utils.quat_to_rpy([qw, qx, qy, qz])

    r_pos = velocity_rew * 6
    r_neg = np.square(roll) * 2. + \
            np.square(pitch) * 2. + \
            np.square(zd) * 2. + \
            np.square(yd) * 2. + \
            np.square(y) * 2. + \
            np.square(yaw) * 4.0 + \
            np.square(self.sim.data.actuator_force).mean() * 0.000 + \
            np.clip(np.square(np.array(self.sim.data.cfrc_ext[1])).sum(axis=0), 0, 1) * 0.4 + \
            ce_loss.numpy() * .1

    r_neg = np.clip(r_neg, 0, 1)
    r_pos = np.clip(r_pos, -2, 2)
    r = r_pos - r_neg
    self.episode_reward += r

    # Reevaluate termination condition
    done = self.step_ctr > self.max_steps

    contacts = (np.abs(np.array(self.sim.data.cfrc_ext[[4, 7, 10, 13, 16, 19]])).sum(axis=1) > 0.05).astype(np.float32)

    if self.use_HF:
        obs = np.concatenate([self.scale_joints(self.sim.get_state().qpos.tolist()[7:]),
                              self.sim.get_state().qvel.tolist()[6:],
                              self.sim.get_state().qvel.tolist()[:6],
                              [roll, pitch, yaw, y],
                              contacts,
                              self.get_local_hf(x, y).flatten()])
    else:
        obs = np.concatenate([self.scale_joints(self.sim.get_state().qpos.tolist()[7:]),
                              self.sim.get_state().qvel.tolist()[6:],
                              self.sim.get_state().qvel.tolist()[:6],
                              [roll, pitch, yaw, y],
                              contacts])

    return obs, r, done, {}

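# --- Aside: a sketch of the auxiliary classification term above, assuming
# self.CE_loss is torch.nn.CrossEntropyLoss(). The policy's extra head emits
# one logit per terrain type; penalizing the cross-entropy against the terrain
# the robot is actually standing on pushes the policy to encode terrain
# identity in its features (the values below are illustrative only):
import torch as T

ce = T.nn.CrossEntropyLoss()
logits = T.tensor([[2.0, 0.1, -1.0]])  # hypothetical 3-terrain prediction head
label = T.tensor([0])                  # index of the true terrain
loss = ce(logits, label)               # ~0.18 here; grows as the prediction worsens
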
def step(self, ctrl):
    ctrl = np.clip(ctrl, -1, 1)
    ctrl = self.scale_action(ctrl)
    self.sim.data.ctrl[:] = ctrl
    self.sim.forward()
    self.sim.step()
    self.step_ctr += 1

    obs = self.get_obs()

    # Torso pose and velocities
    x, y, z, qw, qx, qy, qz = obs[:7]
    xd, yd, zd, thd, phid, psid = self.sim.get_state().qvel.tolist()[:6]

    # Advance to the next terrain segment once its boundary is crossed
    if x > self.scaled_indeces_list[self.current_env_idx]:
        self.current_env_idx += 1
        self.current_env = self.envs[self.current_env_idx]

    # Query the pretrained expert for the current terrain on the previous observation
    expert = self.expert_dict[self.current_env]
    expert_act = expert(my_utils.to_tensor(self.obs, True)).detach()
    expert_act = expert_act[0].numpy()

    # Reward conditions
    target_vel = 0.3
    velocity_rew = 1. / (abs(xd - target_vel) + 1.) - 1. / (target_vel + 1.)

    roll, pitch, yaw = my_utils.quat_to_rpy([qw, qx, qy, qz])

    r_pos = velocity_rew * 6
    r_neg = np.square(roll) * 1. + \
            np.square(pitch) * 1. + \
            np.square(zd) * 1. + \
            np.square(y) * 1. + \
            np.square(yaw) * 1.0 + \
            np.square(ctrl - expert_act).mean() * 0.01  # imitation penalty towards the expert

    r_neg = np.clip(r_neg, 0, 1)
    r_pos = np.clip(r_pos, -2, 2)
    r = r_pos - r_neg
    self.episode_reward += r

    # Reevaluate termination condition
    done = self.step_ctr > self.max_steps

    contacts = (np.abs(np.array(self.sim.data.cfrc_ext[[4, 7, 10, 13, 16, 19]])).sum(axis=1) > 0.05).astype(np.float32)

    if self.use_HF:
        obs = np.concatenate([self.scale_joints(self.sim.get_state().qpos.tolist()[7:]),
                              self.sim.get_state().qvel.tolist()[6:],
                              self.sim.get_state().qvel.tolist()[:6],
                              [roll, pitch, yaw, y],
                              contacts,
                              self.get_local_hf(x, y).flatten()])
    else:
        obs = np.concatenate([self.scale_joints(self.sim.get_state().qpos.tolist()[7:]),
                              self.sim.get_state().qvel.tolist()[6:],
                              self.sim.get_state().qvel.tolist()[:6],
                              [roll, pitch, yaw, y],
                              contacts])

    self.obs = obs

    # Note: the info slot carries the index of the active terrain rather than a dict
    return obs, r, done, self.env_list.index(self.current_env)

def step(self, ctrl):
    # Split the policy output into a raw action and a gating head over the
    # pretrained experts plus one extra "use the raw action" slot
    classif = ctrl[-(self.n_envs + 1):]
    ctrl = ctrl[:-(self.n_envs + 1)]

    if np.argmax(classif) < self.n_envs:
        # The gate picked an expert: substitute its action for the raw one
        policy = self.expert_list[np.argmax(classif[:self.n_envs])]
        act = policy(my_utils.to_tensor(self.obs, True)).detach()
        ctrl = act[0].numpy()

    ctrl = np.clip(ctrl, -1, 1)
    ctrl = self.scale_action(ctrl)

    self.sim.data.ctrl[:] = ctrl
    self.sim.forward()
    self.sim.step()
    self.step_ctr += 1

    obs = self.get_obs()

    # Torso pose, velocities and accelerations
    x, y, z, qw, qx, qy, qz = obs[:7]
    xd, yd, zd, thd, phid, psid = self.sim.get_state().qvel.tolist()[:6]
    xa, ya, za, tha, phia, psia = self.sim.data.qacc.tolist()[:6]

    # Reward conditions
    target_vel = 0.3
    velocity_rew = 1. / (abs(xd - target_vel) + 1.) - 1. / (target_vel + 1.)

    roll, pitch, yaw = my_utils.quat_to_rpy([qw, qx, qy, qz])

    r_pos = velocity_rew * 6
    r_neg = np.square(roll) * 2. + \
            np.square(pitch) * 2. + \
            np.square(zd) * 2. + \
            np.square(yd) * 2. + \
            np.square(y) * 2. + \
            np.square(yaw) * 4.0 + \
            np.square(self.sim.data.actuator_force).mean() * 0.000 + \
            np.clip(np.square(np.array(self.sim.data.cfrc_ext[1])).sum(axis=0), 0, 1) * 0.4

    r_neg = np.clip(r_neg, 0, 1)
    r_pos = np.clip(r_pos, -2, 2)
    r = r_pos - r_neg
    self.episode_reward += r

    # Reevaluate termination condition
    done = self.step_ctr > self.max_steps

    contacts = (np.abs(np.array(self.sim.data.cfrc_ext[[4, 7, 10, 13, 16, 19]])).sum(axis=1) > 0.05).astype(np.float32)

    if self.use_HF:
        self.obs = np.concatenate([self.scale_joints(self.sim.get_state().qpos.tolist()[7:]),
                                   self.sim.get_state().qvel.tolist()[6:],
                                   self.sim.get_state().qvel.tolist()[:6],
                                   [roll, pitch, yaw, y],
                                   contacts,
                                   self.get_local_hf(x, y).flatten()])
    else:
        self.obs = np.concatenate([self.scale_joints(self.sim.get_state().qpos.tolist()[7:]),
                                   self.sim.get_state().qvel.tolist()[6:],
                                   self.sim.get_state().qvel.tolist()[:6],
                                   [roll, pitch, yaw, y],
                                   contacts])

    return self.obs, r, done, {}

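# --- Aside: the gating convention assumed above, isolated as a sketch. The
# last (n_envs + 1) policy outputs act as a discrete switch: indices
# 0..n_envs-1 delegate to a pretrained expert, while the extra final slot
# keeps the policy's own raw action:
import numpy as np

def select_action(raw_act, gate_logits, experts, obs):
    idx = int(np.argmax(gate_logits))
    if idx < len(experts):
        return experts[idx](obs)  # delegate to the chosen expert policy
    return raw_act                # fall through to the raw action
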
def step(self, ctrl):
    # Mute the joints of legs flagged as dead
    for i in range(6):
        if self.dead_leg_vector[i] == 1:
            ctrl[i * 3:i * 3 + 3] = np.zeros(3)

    # Split off the recurrent memory part of the action, if any
    if self.mem_dim == 0:
        mem = np.zeros(0)
        act = ctrl
    else:
        mem = ctrl[-self.mem_dim:]
        act = ctrl[:-self.mem_dim]
    ctrl = self.scale_action(act)

    self.prev_act = np.array(act)

    self.sim.data.ctrl[:] = ctrl
    self.sim.forward()
    self.sim.step()
    self.step_ctr += 1

    obs = self.get_obs()
    obs_dict = self.get_obs_dict()

    # Torso pose and velocities
    x, y, z, qw, qx, qy, qz = obs[:7]
    xd, yd, zd, _, _, _ = self.sim.get_state().qvel.tolist()[:6]
    angle = 2 * acos(qw)  # total rotation angle; currently unused
    roll, pitch, yaw = my_utils.quat_to_rpy((qw, qx, qy, qz))

    # Reward conditions
    target_vel = 0.25
    velocity_rew = 1. / (abs(xd - target_vel) + 1.) - 1. / (target_vel + 1.)

    r = velocity_rew * 10 - \
        np.square(self.sim.data.actuator_force).mean() * 0.0001 - \
        np.abs(roll) * 0.1 - \
        np.square(pitch) * 0.1 - \
        np.square(yaw) * .1 - \
        np.square(y) * 0.1 - \
        np.square(zd) * 0.01

    r = np.clip(r, -2, 2)
    self.cumulative_environment_reward += r

    # Reevaluate termination condition
    done = self.step_ctr > self.max_steps  # or abs(y) > 0.3 or x < -0.2 or abs(yaw) > 0.8

    obs = np.concatenate([np.array(self.sim.get_state().qpos.tolist()[3:]),
                          [xd, yd],
                          obs_dict["contacts"],
                          mem])

    # Randomly kill a leg mid-episode (disabled):
    # if np.random.rand() < self.dead_leg_prob:
    #     idx = np.random.randint(0, 6)
    #     self.dead_leg_vector[idx] = 1
    #     self.dead_leg_sums[idx] += 1
    #     self.model.geom_rgba[self.model._geom_name2id[self.leg_list[idx]]] = [1, 0, 0, 1]
    #     self.dead_leg_prob = 0.

    return obs, r, done, obs_dict

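# --- Aside: a vectorized sketch of the leg-muting loop above. Each leg owns
# three consecutive actuators (coxa, femur, tibia), so zeroing
# ctrl[3*i : 3*i + 3] disables leg i; dead_leg_vector marks which legs are dead:
import numpy as np

def mute_legs(ctrl, dead_leg_vector):
    mask = np.repeat(1 - np.asarray(dead_leg_vector), 3)  # 6 legs x 3 joints
    return np.asarray(ctrl) * mask
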
def step(self, ctrl):
    ctrl_penalty = np.mean(ctrl)
    ctrl = self.scale_action(ctrl)
    self.sim.data.ctrl[:] = ctrl
    self.sim.forward()
    self.sim.step()
    self.step_ctr += 1

    obs = self.get_obs()
    self.used_energy += self.sim.data.actuator_force

    # Torso pose and velocities
    x, y, z, qw, qx, qy, qz = obs[:7]
    xd, yd, zd, thd, phid, psid = self.sim.get_state().qvel.tolist()[:6]

    # Reward conditions
    target_vel = 0.30
    velocity_rew_x = 1. / (abs(xd - target_vel) + 1.) - 1. / (target_vel + 1.)

    roll, pitch, yaw = my_utils.quat_to_rpy([qw, qx, qy, qz])

    contacts = (np.abs(np.array(self.sim.data.cfrc_ext[[4, 7, 10, 13, 16, 19]])).sum(axis=1) > 0.05).astype(np.float32)

    # Per-joint-group energy-balance penalties (disabled):
    # mean_used_energy = self.used_energy / self.step_ctr
    # mean_used_energy_coxa = mean_used_energy[0::3]
    # mean_used_energy_femur = mean_used_energy[1::3]
    # mean_used_energy_tibia = mean_used_energy[2::3]
    # coxa_penalty = np.mean(np.square(mean_used_energy_coxa - np.mean(mean_used_energy_coxa)))
    # femur_penalty = np.mean(np.square(mean_used_energy_femur - np.mean(mean_used_energy_femur)))
    # tibia_penalty = np.mean(np.square(mean_used_energy_tibia - np.mean(mean_used_energy_tibia)))

    r_pos = velocity_rew_x * 5.  # + np.mean(contacts) * 0.2
    r_neg = np.square(roll) * 1. + \
            np.square(pitch) * 1. + \
            np.square(zd) * 1. + \
            np.square(yd) * 4. + \
            np.square(y) * 7. + \
            np.square(yaw) * 4. + \
            np.square(self.sim.data.actuator_force).mean() * 0.000 + \
            ctrl_penalty * 0.0

    r_neg = np.clip(r_neg, 0, 1)
    r_pos = np.clip(r_pos, -2, 2)
    r = r_pos - r_neg
    self.episode_reward += r

    # Reevaluate termination condition
    done = self.step_ctr > self.max_steps

    obs = np.concatenate([self.scale_joints(self.sim.get_state().qpos.tolist()[7:]),
                          self.sim.get_state().qvel.tolist()[6:],
                          self.sim.get_state().qvel.tolist()[:6],
                          [roll, pitch, yaw, y],
                          contacts])

    return obs, r, done, {}

def step(self, ctrl, render=False):
    ctrl = np.clip(ctrl, -1, 1)
    ctrl = self.scale_action(ctrl)
    self.sim.data.ctrl[:] = ctrl

    # Hold the action while the physics runs at a coarse timestep
    self.model.opt.timestep = 0.003
    for i in range(27):
        self.sim.forward()
        self.sim.step()
        if render:
            self.render()

    # Simulate the serial read delay of the servos: step at a fine timestep
    # and sample one joint angle per step
    self.model.opt.timestep = 0.0008
    joints = []
    for i in range(18):
        joints.append(self.sim.get_state().qpos.tolist()[7 + i])
        self.sim.forward()
        self.sim.step()

    self.step_ctr += 1
    self.current_difficulty = np.minimum(self.current_difficulty + self.difficulty_per_step_increment, 1)

    torques = self.sim.data.actuator_force
    ctrl_pen = np.square(torques).mean()

    obs = self.get_obs()

    # Torso pose and velocities
    x, y, z, qw, qx, qy, qz = obs[:7]
    xd, yd, zd, thd, phid, psid = self.sim.get_state().qvel.tolist()[:6]

    # Smooth the forward velocity over the last 15 control steps
    self.xd_queue.append(xd)
    if len(self.xd_queue) > 15:
        self.xd_queue.pop(0)
    xd_av = sum(self.xd_queue) / len(self.xd_queue)

    velocity_rew = 1. / (abs(xd_av - self.target_vel) + 1.) - 1. / (self.target_vel + 1.)
    velocity_rew *= (0.3 / self.target_vel)

    roll, pitch, _ = my_utils.quat_to_rpy([qw, qx, qy, qz])
    q_yaw = 2 * acos(qw)  # total rotation angle as a yaw proxy

    # Deviation from the zero target heading, accounting for 2*pi wrap-around
    yaw_deviation = np.min((abs(q_yaw % (2 * np.pi)), abs(q_yaw)))

    r_neg = np.square(q_yaw) * 0.5 + \
            np.square(pitch) * 1.5 * self.current_difficulty + \
            np.square(roll) * 1.5 * self.current_difficulty + \
            ctrl_pen * 0.00000 + \
            np.square(zd) * 1.5 * self.current_difficulty

    r_correction = np.clip(abs(self.prev_deviation) - abs(yaw_deviation), -1, 1)
    r_pos = velocity_rew * 7 + r_correction * 15
    r = np.clip(r_pos - r_neg, -3, 3)

    self.prev_deviation = yaw_deviation

    # Reevaluate termination condition
    done = self.step_ctr > self.max_steps

    # Binary foot contacts in {-1, 1} from the contact sensors
    contacts = (np.abs(np.array(self.sim.data.sensordata[0:6], dtype=np.float32)) > 0.05).astype(np.float32) * 2 - 1.
    # See if changing contacts affects the policy:
    # contacts = np.random.randint(0, 2, size=6) * 2 - 1
    # contacts = -np.ones(6)

    c_obs = self.scale_joints(joints)
    quat = obs[3:7]
    if self.use_contacts:
        c_obs = np.concatenate([c_obs, contacts])
    c_obs = np.concatenate([c_obs, quat])

    return c_obs, r, done, {}

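# --- Aside: the timing scheme above, spelled out. One policy action is held
# for 27 coarse physics steps (27 * 0.003 s = 81 ms per control step); the
# simulator then runs 18 fine steps of 0.8 ms, reading one joint angle per
# step, so each joint is sampled at a slightly later simulation time. This
# mimics the serial read-out delay of real servos. A generic hold-loop sketch:
def hold_action(sim, model, n_steps, timestep):
    # Step the simulation n_steps times at the given physics timestep
    model.opt.timestep = timestep
    for _ in range(n_steps):
        sim.forward()
        sim.step()
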
def step(self, ctrl, render=False):
    ctrl = np.clip(ctrl, -1, 1)
    ctrl = self.scale_action(ctrl)
    self.sim.data.ctrl[:] = ctrl
    self.sim.forward()
    self.sim.step()
    if render:
        self.render()
        time.sleep(0.001)
    self.step_ctr += 1

    torques = self.sim.data.actuator_force
    ctrl_pen = np.square(torques).mean()

    obs = self.get_obs()

    # Torso pose and velocities
    x, y, z, qw, qx, qy, qz = obs[:7]
    xd, yd, zd, thd, phid, psid = self.sim.get_state().qvel.tolist()[:6]

    # Smooth the forward velocity over the last 15 control steps
    self.xd_queue.append(xd)
    if len(self.xd_queue) > 15:
        self.xd_queue.pop(0)
    xd_av = sum(self.xd_queue) / len(self.xd_queue)

    velocity_rew = 1. / (abs(xd_av - self.target_vel) + 1.) - 1. / (self.target_vel + 1.)
    velocity_rew *= (0.3 / self.target_vel)

    roll, pitch, _ = my_utils.quat_to_rpy([qw, qx, qy, qz])
    q_yaw = 2 * acos(qw)  # total rotation angle as a yaw proxy

    # Deviation from the zero target heading, accounting for 2*pi wrap-around
    yaw_deviation = np.min((abs(q_yaw % (2 * np.pi)), abs(q_yaw)))

    r_neg = np.square(q_yaw) * 0.2 + \
            np.square(pitch) * 0.5 + \
            np.square(roll) * 0.5 + \
            ctrl_pen * 0.00001 + \
            np.square(zd) * 0.7

    r_correction = np.clip(abs(self.prev_deviation) - abs(yaw_deviation), -1, 1)
    r_pos = velocity_rew * 4 + r_correction * 10
    r = np.clip(r_pos - r_neg, -3, 3)

    self.prev_deviation = yaw_deviation

    # Reevaluate termination condition
    done = self.step_ctr > self.max_steps

    # Binary foot contacts in {-1, 1} from the contact sensors
    contacts = (np.abs(np.array(self.sim.data.sensordata[0:6], dtype=np.float32)) > 0.05).astype(np.float32) * 2 - 1.

    c_obs = self.scale_joints(self.sim.get_state().qpos.tolist()[7:])
    quat = obs[3:7]
    if self.use_contacts:
        c_obs = np.concatenate([c_obs, contacts])
    c_obs = np.concatenate([c_obs, quat])

    return c_obs, r, done, {}
