def update_extended_observation(self): self.obs = self.robot.get_robot_observation(with_vel=True) # actually should /2 (downsample) self.obs.extend( list(np.mean(self.normal_forces[-self.control_skip:], axis=0))) if self.obs_noise: self.obs = utils.perturb(self.obs, 0.1, self.np_random) if self.train_dyn: obs_behavior = self.feat_select_func(self.obs) obs_nn = utils.wrap(obs_behavior, is_cuda=self.cuda_env) with torch.no_grad(): _, action_nn, _, self.recurrent_hidden_states = self.go_actor_critic.act( obs_nn, self.recurrent_hidden_states, self.masks, deterministic=False) action = utils.unwrap(action_nn, is_cuda=self.cuda_env) # 25% noise if a clipped to -1, 1 action = utils.perturb(action, self.enlarge_act_range, self.np_random) action = np.clip(action, -1.0, 1.0) self.behavior_act_len = len(action) self.obs = list(self.obs) + list(action)
def update_extended_observation(self): self.obs = self.robot.get_robot_observation() # self.obs = np.concatenate((self.obs, [np.minimum(self.timer / 500, self.max_tar_vel)])) # TODO if self.obs_noise: self.obs = utils.perturb(self.obs, 0.1, self.np_random) if self.train_dyn: self.behavior_obs_len = len(self.obs) obs_nn = utils.wrap(self.obs, is_cuda=self.cuda_env) with torch.no_grad(): _, action_nn, _, self.recurrent_hidden_states = self.go_actor_critic.act( obs_nn, self.recurrent_hidden_states, self.masks, deterministic=False # TODO, det pi ) action = utils.unwrap(action_nn, is_cuda=self.cuda_env) if self.enlarge_act_range: # 15% noise if a clipped to -1, 1 action = utils.perturb(action, 0.15, self.np_random) self.behavior_act_len = len(action) self.obs = np.concatenate((self.obs, action))
def reset(self, bullet_client): self._p = bullet_client base_init_pos = utils.perturb(self.base_init_pos, 0.03) base_init_euler = utils.perturb(self.base_init_euler, 0.1) self.go_id = self._p.loadURDF( os.path.join(currentdir, "assets/laikago/laikago_toes_limits.urdf"), list(base_init_pos - [0.043794, 0.0, 0.03]), list(self._p.getQuaternionFromEuler(list(base_init_euler))), flags=self._p.URDF_USE_SELF_COLLISION, useFixedBase=0) # self.go_id = self._p.loadURDF(os.path.join(currentdir, # "assets/laikago/laikago_limits.urdf"), # list(self.base_init_pos), # list(self._p.getQuaternionFromEuler([1.5708, 0, 1.5708])), # flags=self._p.URDF_USE_SELF_COLLISION, # useFixedBase=0) # self.go_id = self._p.loadURDF(os.path.join(currentdir, # "assets/laikago/laikago_toes_limits.urdf"), # list(self.base_init_pos), # list(self.base_init_quat), # useFixedBase=0) # self.print_all_joints_info() for j in range(self._p.getNumJoints(self.go_id)): # self._p.changeDynamics(self.go_id, j, linearDamping=0, angularDamping=0) # TODO self._p.changeDynamics(self.go_id, j, jointDamping=0.5) # TODO if len(self.ctrl_dofs) == 0: for j in range(self._p.getNumJoints(self.go_id)): info = self._p.getJointInfo(self.go_id, j) joint_type = info[2] if joint_type == self._p.JOINT_PRISMATIC or joint_type == self._p.JOINT_REVOLUTE: self.ctrl_dofs.append(j) # print("ctrl dofs:", self.ctrl_dofs) self.reset_joints(self.init_q, np.array([0.0] * len(self.ctrl_dofs))) # turn off root default control: # use torque control self._p.setJointMotorControlArray(bodyIndex=self.go_id, jointIndices=self.ctrl_dofs, controlMode=self._p.VELOCITY_CONTROL, forces=[0.0] * len(self.ctrl_dofs)) self.torque = [0.0] * len(self.ctrl_dofs) self.ll = np.array( [self._p.getJointInfo(self.go_id, i)[8] for i in self.ctrl_dofs]) self.ul = np.array( [self._p.getJointInfo(self.go_id, i)[9] for i in self.ctrl_dofs]) assert len(self.ctrl_dofs) == len(self.init_q) assert len(self.max_forces) == len(self.ctrl_dofs) assert len(self.max_forces) == len(self.ll)
def reset_joints(self, q, dq): if self.init_noise: init_q = utils.perturb(q, 0.01, self.np_random) init_dq = utils.perturb(dq, 0.1, self.np_random) else: init_q = utils.perturb(q, 0.0, self.np_random) init_dq = utils.perturb(dq, 0.0, self.np_random) for pt, ind in enumerate(self.ctrl_dofs): self._p.resetJointState(self.go_id, ind, init_q[pt], init_dq[pt])
def soft_reset(self, bullet_client): self._p = bullet_client base_init_pos = utils.perturb(self.base_init_pos, 0.03) base_init_euler = utils.perturb(self.base_init_euler, 0.1) self._p.resetBaseVelocity(self.go_id, [0., 0, 0], [0., 0, 0]) self._p.resetBasePositionAndOrientation( self.go_id, list(base_init_pos), list(self._p.getQuaternionFromEuler(list(base_init_euler)))) self.reset_joints(self.init_q, np.array([0.0] * len(self.ctrl_dofs)))
def get_extended_observation(self): # with vel false cur_state = self.robot.get_robot_observation(with_vel=False) if self.obs_noise: cur_state = utils.perturb(cur_state, 0.1, self.np_random) # then update past obs utils.push_recent_value(self.past_obs_array, cur_state) # then construct behavior obs b_obs_all = utils.select_and_merge_from_s_a( s_mt=list(self.past_obs_array), a_mt=list(self.past_bact_array), s_idx=self.behavior_past_obs_t_idx, a_idx=np.array([])) # if train motor, return behavior obs and we are done if not self.train_dyn: return b_obs_all # else, train dyn # rollout b_pi obs_nn = utils.wrap(b_obs_all, is_cuda=self.cuda_env) with torch.no_grad(): _, action_nn, _, self.recurrent_hidden_states = self.go_actor_critic.act( obs_nn, self.recurrent_hidden_states, self.masks, deterministic=False) b_cur_act = list(utils.unwrap(action_nn, is_cuda=self.cuda_env)) b_cur_act = utils.perturb(b_cur_act, self.enlarge_act_range, self.np_random) b_cur_act = np.tanh(b_cur_act) # Store action after tanh (-1,1) utils.push_recent_value(self.past_bact_array, b_cur_act) # construct G obs from updated past obs&b_act # g_obs_all = utils.select_and_merge_from_s_a( # s_mt=list(self.past_obs_array), # a_mt=list(self.past_bact_array), # s_idx=self.generator_past_obs_t_idx, # a_idx=self.generator_past_act_t_idx # ) # calculate 4x split obs here g_obs_all = self.get_all_feet_local_obs(b_cur_act) return g_obs_all
def update_extended_observation(self): self.obs = self.robot.get_robot_observation() if self.correct_obs_dx: dx = self.get_ave_dx() * self.robot.obs_scaling[5] self.obs[5] = dx if self.obs_noise: self.obs = utils.perturb(self.obs, 0.1, self.np_random)
def get_extended_observation(self): obs = self.robot.get_robot_observation(with_vel=False) if self.obs_noise: obs = utils.perturb(obs, 0.1, self.np_random) # print(obs[-4:]) return obs
def reset( self, bullet_client ): self._p = bullet_client self.hopper_id = self._p.loadURDF(os.path.join(currentdir, "assets/hopper_my_box.urdf"), list(self.base_init_pos), self._p.getQuaternionFromEuler(list(self.base_init_euler)), flags=self._p.URDF_USE_SELF_COLLISION, useFixedBase=1) # self.print_all_joints_info() if self.init_noise: init_q = utils.perturb([0.0] * self.n_total_dofs, 0.01, self.np_random) init_dq = utils.perturb([0.0] * self.n_total_dofs, 0.1, self.np_random) else: init_q = utils.perturb([0.0] * self.n_total_dofs, 0.0, self.np_random) init_dq = utils.perturb([0.0] * self.n_total_dofs, 0.0, self.np_random) for ind in range(self.n_total_dofs): self._p.resetJointState(self.hopper_id, ind, init_q[ind], init_dq[ind]) # turn off root default control: self._p.setJointMotorControlArray( bodyIndex=self.hopper_id, jointIndices=self.root_dofs, controlMode=self._p.VELOCITY_CONTROL, forces=[0.0] * len(self.root_dofs)) # use torque control self._p.setJointMotorControlArray( bodyIndex=self.hopper_id, jointIndices=self.ctrl_dofs, controlMode=self._p.VELOCITY_CONTROL, forces=[0.0] * len(self.ctrl_dofs)) self.torque = [0.0] * len(self.ctrl_dofs) self.ll = np.array([self._p.getJointInfo(self.hopper_id, i)[8] for i in self.ctrl_dofs]) self.ul = np.array([self._p.getJointInfo(self.hopper_id, i)[9] for i in self.ctrl_dofs])
def get_extended_observation(self): obs = self.robot.get_robot_observation(with_vel=False) # actually should /2 (downsample) obs.extend( list(np.mean(self.normal_forces[-self.control_skip:], axis=0))) if self.obs_noise: obs = utils.perturb(obs, 0.1, self.np_random) # print(obs[-4:]) return obs
def get_extended_observation(self): obs = self.robot.get_robot_observation() # for foot in self.robot.feet: # cps = self._p.getContactPoints(self.robot.go_id, self.floor_id, foot, -1) # if len(cps) > 0: # obs.extend([1.0]) # else: # obs.extend([-1.0]) # obs.extend([np.minimum(self.timer / 500, self.max_tar_vel)]) # TODO if self.obs_noise: obs = utils.perturb(obs, 0.1, self.np_random) # print(obs) return obs
def get_extended_observation(self): obs = self.robot.get_robot_observation(with_vel=False) if self.obs_noise: obs = utils.perturb(obs, 0.1, self.np_random) utils.push_recent_value(self.past_obs_array, obs) # print(obs[-4:]) # obs_all = [] # list_past_obs = list(self.past_obs_array) # for t_idx in self.behavior_past_obs_t_idx: # obs_all.extend(list_past_obs[t_idx]) obs_all = utils.select_and_merge_from_s_a( s_mt=list(self.past_obs_array), a_mt=list(self.past_act_array), s_idx=self.behavior_past_obs_t_idx, a_idx=np.array([])) return list(obs_all)
def step(self, a): root_pos, _ = self.robot.get_link_com_xyz_orn(-1) x_0 = root_pos[0] a = np.clip(a, -1.0, 1.0) if self.act_noise: a = utils.perturb(a, 0.05, self.np_random) if self.low_power_env: # # for pi44 # _, dq = self.robot.get_q_dq(self.robot.ctrl_dofs) # max_force_ratio = np.clip(2 - dq/2.5, 0, 1) # a *= max_force_ratio # for pi43 _, dq = self.robot.get_q_dq(self.robot.ctrl_dofs) max_force_ratio = np.clip(2 - dq/2., 0, 1) a *= max_force_ratio for _ in range(self.control_skip): # action is in not -1,1 if a is not None: self.act = a self.robot.apply_action(a) if self.randomforce_train: for foot_ind, link in enumerate(self.robot.feet): # first dim represents fz fz = np.random.uniform(-80, 80) # second dim represents fx fx = np.random.uniform(-80, 80) # third dim represents fy fy = np.random.uniform(-80, 80) utils.apply_external_world_force_on_local_point(self.robot.go_id, link, [fx, fy, fz], [0, 0, 0], self._p) self._p.stepSimulation() if self.render: time.sleep(self._ts * 0.5) self.timer += 1 root_pos, _ = self.robot.get_link_com_xyz_orn(-1) x_1 = root_pos[0] self.velx = (x_1 - x_0) / (self.control_skip * self._ts) q, dq = self.robot.get_q_dq(self.robot.ctrl_dofs) reward = self.ab # alive bonus tar = np.minimum(self.timer / 500, self.max_tar_vel) reward += np.minimum(self.velx, tar) * self.vel_r_weight # print("v", self.velx, "tar", tar) reward += -self.energy_weight * np.square(a).sum() # print("act norm", -self.energy_weight * np.square(a).sum()) pos_mid = 0.5 * (self.robot.ll + self.robot.ul) q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll) joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97) reward += -self.jl_weight * joints_at_limit # print("jl", -self.jl_weight * joints_at_limit) reward += -np.minimum(np.sum(np.square(dq)) * self.dq_pen_weight, 5.0) weight = np.array([2.0, 1.0, 1.0] * 4) reward += -np.minimum(np.sum(np.square(q - self.robot.init_q) * weight) * self.q_pen_weight, 5.0) # reward = self.ab # tar = np.minimum(self.timer / 500, self.max_tar_vel) # reward += np.minimum(self.velx, tar) * self.vel_r_weight # # print("v", self.velx, "tar", tar) # # # reward += np.maximum((self.max_tar_vel - tar) * self.vel_r_weight - 3.0, 0.0) # alive bonus # # reward += -self.energy_weight * np.linalg.norm(a) # # print("act norm", -self.energy_weight * np.square(a).sum()) # # q, dq = self.robot.get_q_dq(self.robot.ctrl_dofs) # # print(np.max(np.abs(dq))) # pos_mid = 0.5 * (self.robot.ll + self.robot.ul) # q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll) # joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97) # reward += -self.jl_weight * joints_at_limit # # print("jl", -self.jl_weight * joints_at_limit) # # reward += -np.minimum(np.linalg.norm(dq) * self.dq_pen_weight, 5.0) # weight = np.array([2.0, 0.2, 1.0] * 4) # reward += -np.minimum(np.linalg.norm((q - self.robot.init_q) * weight) * self.q_pen_weight, 5.0) # # print("vel pen", -np.minimum(np.sum(np.abs(dq)) * self.dq_pen_weight, 5.0)) # # print("pos pen", -np.minimum(np.sum(np.square(q - self.robot.init_q)) * self.q_pen_weight, 5.0)) y_1 = root_pos[1] reward += -y_1 * 0.5 # print("dev pen", -y_1*0.5) height = root_pos[2] in_support = self.robot.is_root_com_in_support() # print("______") # print(in_support) # print("h", height) # print("dq.", np.abs(dq)) # print((np.abs(dq) < 50).all()) # cps = self._p.getContactPoints(bodyA=self.robot.go_id, linkIndexA=0) # body_in_contact = (len(cps) > 0) # print("------") obs = self.get_extended_observation() # rpy = self._p.getEulerFromQuaternion(obs[8:12]) # for data collection not_done = (np.abs(dq) < 90).all() and (height > 0.3) and (height < 1.0) # TODO # not_done = (np.abs(dq) < 90).all() and (height > 0.3) and (height < 1.0) and in_support # not_done = True return obs, reward, not not_done, {}
def step(self, a): # TODO: currently for laika, env_action is 12D, 4 feet 3D without wrench if self.train_dyn: env_action = a robo_action = self.obs[-self.behavior_act_len:] else: robo_action = a robo_action = np.clip(robo_action, -1.0, 1.0) env_pi_obs = np.concatenate((self.obs, robo_action)) env_pi_obs_nn = utils.wrap(env_pi_obs, is_cuda=self.cuda_env) with torch.no_grad(): _, env_action_nn, _, self.recurrent_hidden_states = self.dyn_actor_critic.act( env_pi_obs_nn, self.recurrent_hidden_states, self.masks, deterministic=False) env_action = utils.unwrap(env_action_nn, is_cuda=self.cuda_env) # self.ratios = np.append(self.ratios, [env_action / robo_action], axis=0) # # env_pi_obs_feat = self.feat_select_func(self.obs) # dis_state = np.concatenate((env_pi_obs_feat, robo_action)) # dis_state = utils.wrap(dis_state, is_cuda=self.cuda_env) root_pos, _ = self.robot.get_link_com_xyz_orn(-1) x_0 = root_pos[0] # TODO # # this is post noise (unseen), different from seen diversify of self.enlarge_act_scale if self.act_noise: robo_action = utils.perturb(robo_action, 0.05, self.np_random) # # TODO # if self.pretrain_dyn: # # self.state_id = self._p.saveState() # self.img_obs, pre_s_i = self.rollout_one_step_imaginary() # takes the old self.obs # # img_obs = self.rollout_one_step_imaginary_same_session() # # self._p.restoreState(self.state_id) # pre_s = self.robot.get_robot_raw_state_vec() # # print(pre_s_i) # # print(pre_s) # assert np.allclose(pre_s, pre_s_i, atol=1e-5) for _ in range(self.control_skip): self.robot.apply_action(robo_action) self.apply_scale_clip_conf_from_pi_new(env_action) self._p.stepSimulation() if self.render: time.sleep(self._ts * 0.5) self.timer += 1 self.update_extended_observation() root_pos, _ = self.robot.get_link_com_xyz_orn(-1) x_1 = root_pos[0] self.velx = (x_1 - x_0) / (self.control_skip * self._ts) y_1 = root_pos[1] height = root_pos[2] q, dq = self.robot.get_q_dq(self.robot.ctrl_dofs) # print(np.max(np.abs(dq))) in_support = self.robot.is_root_com_in_support() if not self.pretrain_dyn: reward = self.ab # alive bonus tar = np.minimum(self.timer / 500, self.max_tar_vel) reward += np.minimum(self.velx, tar) * self.vel_r_weight # print("v", self.velx, "tar", tar) reward += -self.energy_weight * np.square(robo_action).sum() # print("act norm", -self.energy_weight * np.square(a).sum()) pos_mid = 0.5 * (self.robot.ll + self.robot.ul) q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll) joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97) reward += -self.jl_weight * joints_at_limit # print("jl", -self.jl_weight * joints_at_limit) reward += -np.minimum( np.sum(np.square(dq)) * self.dq_pen_weight, 5.0) weight = np.array([2.0, 1.0, 1.0] * 4) reward += -np.minimum( np.sum(np.square(q - self.robot.init_q) * weight) * self.q_pen_weight, 5.0) # print("vel pen", -np.minimum(np.sum(np.abs(dq)) * self.dq_pen_weight, 5.0)) # print("pos pen", -np.minimum(np.sum(np.square(q - self.robot.init_q)) * self.q_pen_weight, 5.0)) y_1 = root_pos[1] reward += -y_1 * 0.5 # print("dev pen", -y_1*0.5) else: # reward = self.calc_obs_dist_pretrain(self.img_obs[:-4], self.obs[:len(self.img_obs[:-4])]) reward = 0 # TODO # print("______") # print(in_support) # print("h", height) # print("dq.", np.abs(dq)) # print((np.abs(dq) < 50).all()) # print("------") # conf policy will not have body-in-contact flag not_done = (np.abs(dq) < 90).all() and (height > 0.3) and (height < 1.0) # not_done = (np.abs(dq) < 90).all() and (height > 0.3) and (height < 1.0) and in_support # not_done = (abs(y_1) < 5.0) and (height > 0.1) and (height < 1.0) and (rpy[2] > 0.1) # not_done = True # # if not not_done: # print(self.ratios.shape) # print(np.mean(self.ratios, axis=0)) # if not self.train_dyn: # dis_action = self.feat_select_func(self.obs) # dis_action = utils.wrap(dis_action, is_cuda=self.cuda_env) # d_score = self.discri.predict_prob_single_step(dis_state, dis_action) # self.d_scores.append(utils.unwrap(d_score, is_cuda=self.cuda_env)) # # if len(self.d_scores) > 20 and np.mean(self.d_scores[-20:]) < 0.4: # # not_done = False # # if not not_done or self.timer==1000: # # print(np.mean(self.d_scores)) if not self.train_dyn: obs_behavior = self.feat_select_func(self.obs) return obs_behavior, reward, not not_done, {} return self.obs, reward, not not_done, {}
def step(self, a): assert self.train_dyn # TODO: currently for laika, env_action is 12D, 4 feet 3D without wrench env_action = a robo_action = self.obs[self.behavior_obs_len:] # else: # robo_action = a # if not self.soft_floor_env: # env_pi_obs = np.concatenate((self.obs, robo_action)) # env_pi_obs_nn = utils.wrap(env_pi_obs, is_cuda=self.cuda_env) # with torch.no_grad(): # _, env_action_nn, _, self.recurrent_hidden_states = self.dyn_actor_critic.act( # env_pi_obs_nn, self.recurrent_hidden_states, self.masks, deterministic=False # ) # env_action = utils.unwrap(env_action_nn, is_cuda=self.cuda_env) root_pos, _ = self.robot.get_link_com_xyz_orn(-1) x_0 = root_pos[0] # this is post noise (unseen), different from seen diversify of self.enlarge_act_scale robo_action = np.clip(robo_action, -1.0, 1.0) if self.act_noise: robo_action = utils.perturb(robo_action, 0.05, self.np_random) if self.pretrain_dyn: # self.state_id = self._p.saveState() self.img_obs, pre_s_i = self.rollout_one_step_imaginary( env_action) # takes the old self.obs # img_obs = self.rollout_one_step_imaginary_same_session() # self._p.restoreState(self.state_id) pre_s = self.robot.get_robot_raw_state_vec() # print(pre_s_i) # print(pre_s) assert np.allclose(pre_s, pre_s_i, atol=1e-5) for _ in range(self.control_skip): self.robot.apply_action(robo_action) self._p.stepSimulation() if self.render: time.sleep(self._ts * 0.5) self.timer += 1 self.update_extended_observation() # print(self.obs[:self.behavior_obs_len - 5], "out real") root_pos, _ = self.robot.get_link_com_xyz_orn(-1) x_1 = root_pos[0] self.velx = (x_1 - x_0) / (self.control_skip * self._ts) y_1 = root_pos[1] height = root_pos[2] q, dq = self.robot.get_q_dq(self.robot.ctrl_dofs) # print(np.max(np.abs(dq))) in_support = self.robot.is_root_com_in_support() rpy = self._p.getEulerFromQuaternion(self.obs[9:13]) if not self.pretrain_dyn: reward = self.ab # alive bonus tar = np.minimum(self.timer / 500, self.max_tar_vel) reward += np.minimum(self.velx, tar) * self.vel_r_weight # print("v", self.velx, "tar", tar) reward += -self.energy_weight * np.square(robo_action).sum() # print("act norm", -self.energy_weight * np.square(a).sum()) pos_mid = 0.5 * (self.robot.ll + self.robot.ul) q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll) joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97) reward += -self.jl_weight * joints_at_limit # print("jl", -self.jl_weight * joints_at_limit) reward += -np.minimum(np.sum(np.abs(dq)) * self.dq_pen_weight, 5.0) reward += -np.minimum( np.sum(np.square(q - self.robot.init_q)) * self.q_pen_weight, 5.0) # print("vel pen", -np.minimum(np.sum(np.abs(dq)) * self.dq_pen_weight, 5.0)) # print("pos pen", -np.minimum(np.sum(np.square(q - self.robot.init_q)) * self.q_pen_weight, 5.0)) y_1 = root_pos[1] reward += -y_1 * 0.5 # print("dev pen", -y_1*0.5) else: reward = self.calc_obs_dist_pretrain( self.img_obs[:-4], self.obs[:len(self.img_obs[:-4])]) # print("______") # print(in_support) # print("h", height) # print("dq.", np.abs(dq)) # print((np.abs(dq) < 50).all()) # body = [0, 4, 8, 12, -1, 1, 5, 9, 13] # body_floor = False # for link in body: # cps = self._p.getContactPoints(self.robot.go_id, self.floor_id, link, -1) # if len(cps) > 0: # print("body hit floor") # body_floor = True # break # print("------") not_done = (np.abs(dq) < 90).all() and (height > 0.3) and (height < 1.0) # not_done = (abs(y_1) < 5.0) and (height > 0.1) and (height < 1.0) and (rpy[2] > 0.1) # not_done = True return self.obs, reward, not not_done, {}
def step(self, a): # TODO: currently for laika, env_action is 12D, 4 feet 3D without wrench if self.train_dyn: env_action = a robo_action = self.past_bact_array[0] # after tanh else: robo_action = a robo_action = np.tanh(robo_action) # update past_bact after tanh utils.push_recent_value(self.past_bact_array, robo_action) env_pi_obs = utils.select_and_merge_from_s_a( s_mt=list(self.past_obs_array), a_mt=list(self.past_bact_array), s_idx=self.generator_past_obs_t_idx, a_idx=self.generator_past_act_t_idx) env_pi_obs_nn = utils.wrap(env_pi_obs, is_cuda=self.cuda_env) with torch.no_grad(): _, env_action_nn, _, self.recurrent_hidden_states = self.dyn_actor_critic.act( env_pi_obs_nn, self.recurrent_hidden_states, self.masks, deterministic=False) env_action = utils.unwrap(env_action_nn, is_cuda=self.cuda_env) # if self.ratio is None: # self.ratio = np.array([env_action / robo_action]) # else: # self.ratio = np.append(self.ratio, [env_action / robo_action], axis=0) # self.ratios = np.append(self.ratios, [env_action / robo_action], axis=0) # # env_pi_obs_feat = self.feat_select_func(self.obs) # dis_state = np.concatenate((env_pi_obs_feat, robo_action)) # dis_state = utils.wrap(dis_state, is_cuda=self.cuda_env) root_pos, _ = self.robot.get_link_com_xyz_orn(-1) x_0 = root_pos[0] # this is post noise (unseen), different from seen diversify of self.enlarge_act_scale if self.act_noise: robo_action = utils.perturb(robo_action, 0.05, self.np_random) # when call info, should call before sim_step() as in v4 (append s_t+1 later) # info will be used to construct D input outside. past_info = self.construct_past_traj_window() # # TODO # if self.pretrain_dyn: # # self.state_id = self._p.saveState() # self.img_obs, pre_s_i = self.rollout_one_step_imaginary() # takes the old self.obs # # img_obs = self.rollout_one_step_imaginary_same_session() # # self._p.restoreState(self.state_id) # pre_s = self.robot.get_robot_raw_state_vec() # # print(pre_s_i) # # print(pre_s) # assert np.allclose(pre_s, pre_s_i, atol=1e-5) for _ in range(self.control_skip): self.robot.apply_action(robo_action) self.apply_scale_clip_conf_from_pi_new(env_action) self._p.stepSimulation() if self.render: time.sleep(self._ts * 1.0) self.timer += 1 obs_new = self.get_extended_observation() # and update past_obs_array past_info += [self.past_obs_array[0]] # s_t+1 root_pos, _ = self.robot.get_link_com_xyz_orn(-1) x_1 = root_pos[0] self.velx = (x_1 - x_0) / (self.control_skip * self._ts) y_1 = root_pos[1] height = root_pos[2] q, dq = self.robot.get_q_dq(self.robot.ctrl_dofs) # print(np.max(np.abs(dq))) # in_support = self.robot.is_root_com_in_support() if not self.pretrain_dyn: reward = self.ab # alive bonus tar = np.minimum(self.timer / 500, self.max_tar_vel) reward += np.minimum(self.velx, tar) * self.vel_r_weight # print("v", self.velx, "tar", tar) reward += -self.energy_weight * np.square(robo_action).sum() # print("act norm", -self.energy_weight * np.square(a).sum()) pos_mid = 0.5 * (self.robot.ll + self.robot.ul) q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll) joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97) reward += -self.jl_weight * joints_at_limit # print("jl", -self.jl_weight * joints_at_limit) reward += -np.minimum( np.sum(np.square(dq)) * self.dq_pen_weight, 5.0) weight = np.array([2.0, 1.0, 1.0] * 4) reward += -np.minimum( np.sum(np.square(q - self.robot.init_q) * weight) * self.q_pen_weight, 5.0) # print("vel pen", -np.minimum(np.sum(np.abs(dq)) * self.dq_pen_weight, 5.0)) # print("pos pen", -np.minimum(np.sum(np.square(q - self.robot.init_q)) * self.q_pen_weight, 5.0)) y_1 = root_pos[1] reward += -y_1 * 0.5 # print("dev pen", -y_1*0.5) else: # reward = self.calc_obs_dist_pretrain(self.img_obs[:-4], self.obs[:len(self.img_obs[:-4])]) reward = 0 # TODO # print("______") # print(in_support) # print("h", height) # print("dq.", np.abs(dq)) # print((np.abs(dq) < 50).all()) # print("------") # conf policy will not have body-in-contact flag not_done = (np.abs(dq) < 90).all() and (height > 0.2) and (height < 1.0) # not_done = (abs(y_1) < 5.0) and (height > 0.1) and (height < 1.0) and (rpy[2] > 0.1) # not_done = True # # if not not_done: # print(self.ratio.shape) # labels = list("123456789ABC") # data = self.ratio # from matplotlib import pyplot as plt # width = 0.4 # fig, ax = plt.subplots() # for i, l in enumerate(labels): # x = np.ones(data.shape[0]) * i + (np.random.rand(data.shape[0]) * width - width / 2.) # ax.scatter(x, data[:, i], s=25) # median = np.median(data[:, i]) # ax.plot([i - width / 2., i + width / 2.], [median, median], color="k") # # plt.ylim(-5, 5) # ax.set_xticks(range(len(labels))) # ax.set_xticklabels(labels) # plt.show() # self.ratio = None # if not self.train_dyn: # dis_action = self.feat_select_func(self.obs) # dis_action = utils.wrap(dis_action, is_cuda=self.cuda_env) # d_score = self.discri.predict_prob_single_step(dis_state, dis_action) # self.d_scores.append(utils.unwrap(d_score, is_cuda=self.cuda_env)) # # if len(self.d_scores) > 20 and np.mean(self.d_scores[-20:]) < 0.4: # # not_done = False # # if not not_done or self.timer==1000: # # print(np.mean(self.d_scores)) return obs_new, reward, not not_done, {"sas_window": past_info}
def step(self, a): if self.act_noise and a is not None: a = utils.perturb(a, 0.05, self.np_random) if not self.use_gen_dyn: for _ in range(self.control_skip): # action is in not -1,1 if a is not None: self.act = np.clip(a, -1.0, 1.0) self.robot.apply_action(self.act) self._p.stepSimulation() if self.render: time.sleep(self._ts * 0.5) self.timer += 1 self.robot.update_x() self.update_extended_observation() obs_unnorm = np.array(self.obs) / self.robot.obs_scaling else: # gen_input = list(self.obs) + list(a) + [0.0] * (11+3) gen_input = list(self.obs) + list(a) + list( np.random.normal(0, 1, 14)) # TODO gen_input = utils.wrap(gen_input, is_cuda=False) # TODO gen_output = self.gen_dyn(gen_input) gen_output = utils.unwrap(gen_output, is_cuda=False) self.obs = gen_output obs_unnorm = np.array(self.obs) / self.robot.obs_scaling self.robot.last_x = self.robot.x self.robot.x += obs_unnorm[5] * (self.control_skip * self._ts) if self.render: all_qs = [self.robot.x] + list(obs_unnorm[:5]) all_qs[1] -= 1.5 for ind in range(self.robot.n_total_dofs): self._p.resetJointState(self.robot.hopper_id, ind, all_qs[ind], 0.0) time.sleep(self._ts * 5.0) if self.obs_noise: self.obs = utils.perturb(self.obs, 0.1, self.np_random) self.timer += self.control_skip reward = 2.0 # alive bonus reward += self.get_ave_dx() # print("v", self.get_ave_dx()) reward += -0.1 * np.square(a).sum() # print("act norm", -0.1 * np.square(a).sum()) q = np.array(obs_unnorm[2:5]) pos_mid = 0.5 * (self.robot.ll + self.robot.ul) q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll) joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97) reward += -2.0 * joints_at_limit # print("jl", -2.0 * joints_at_limit) dq = np.array(obs_unnorm[8:11]) reward -= np.minimum(np.sum(np.abs(dq)) * 0.02, 5.0) # almost like /23 # print("vel pen", np.minimum(np.sum(np.abs(dq)) * 0.02, 5.0)) height = obs_unnorm[0] # ang = self._p.getJointState(self.robot.hopper_id, 2)[0] # print(joints_dq) # print(height) # print("ang", ang) not_done = (np.abs(dq) < 50).all() and (height > .3) and (height < 1.8) return self.obs, reward, not not_done, {}