def update_extended_observation(self):
    self.obs = self.robot.get_robot_observation()
    # self.obs = np.concatenate((self.obs, [np.minimum(self.timer / 500, self.max_tar_vel)]))  # TODO

    if self.obs_noise:
        self.obs = utils.perturb(self.obs, 0.1, self.np_random)

    if self.train_dyn:
        self.behavior_obs_len = len(self.obs)

        obs_nn = utils.wrap(self.obs, is_cuda=self.cuda_env)
        with torch.no_grad():
            _, action_nn, _, self.recurrent_hidden_states = self.go_actor_critic.act(
                obs_nn, self.recurrent_hidden_states, self.masks,
                deterministic=False  # TODO: deterministic pi?
            )
        action = utils.unwrap(action_nn, is_cuda=self.cuda_env)

        if self.enlarge_act_range:
            # 15% noise, with the action clipped to [-1, 1]
            action = utils.perturb(action, 0.15, self.np_random)

        self.behavior_act_len = len(action)
        self.obs = np.concatenate((self.obs, action))
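# The utils.perturb helper is used throughout for both observation and action
# noise but is defined elsewhere in the repo. A minimal sketch of a compatible
# implementation, ASSUMING additive uniform noise whose half-width is the given
# magnitude (consistent with call sites like "15% noise" on [-1, 1] actions);
# the actual helper may differ:
def _perturb_sketch(vec, magnitude, np_random):
    vec = np.asarray(vec, dtype=np.float64)
    # elementwise uniform noise in [-magnitude, magnitude]
    noise = np_random.uniform(low=-magnitude, high=magnitude, size=vec.shape)
    return vec + noise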
def update_extended_observation(self):
    self.obs = self.robot.get_robot_observation(with_vel=True)
    # forces are recorded every other sim step, so this window should
    # actually be self.control_skip // 2 (downsampled)
    self.obs.extend(list(np.mean(self.normal_forces[-self.control_skip:], axis=0)))

    if self.obs_noise:
        self.obs = utils.perturb(self.obs, 0.1, self.np_random)

    if self.train_dyn:
        obs_behavior = self.feat_select_func(self.obs)
        obs_nn = utils.wrap(obs_behavior, is_cuda=self.cuda_env)
        with torch.no_grad():
            _, action_nn, _, self.recurrent_hidden_states = self.go_actor_critic.act(
                obs_nn, self.recurrent_hidden_states, self.masks, deterministic=False)
        action = utils.unwrap(action_nn, is_cuda=self.cuda_env)

        # noise of magnitude self.enlarge_act_range, applied before clipping to [-1, 1]
        action = utils.perturb(action, self.enlarge_act_range, self.np_random)
        action = np.clip(action, -1.0, 1.0)

        self.behavior_act_len = len(action)
        self.obs = list(self.obs) + list(action)
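# utils.wrap / utils.unwrap are assumed numpy <-> torch converters: they take
# an is_cuda flag and feed actor_critic.act, which expects a batched tensor.
# A sketch under that assumption; the names and batching are hypothetical:
def _wrap_sketch(arr, is_cuda):
    t = torch.from_numpy(np.asarray(arr, dtype=np.float32)).unsqueeze(0)
    return t.cuda() if is_cuda else t

def _unwrap_sketch(t, is_cuda):
    t = t.detach().cpu() if is_cuda else t.detach()
    return t.squeeze(0).numpy()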
def get_extended_observation(self):
    # with vel false
    cur_state = self.robot.get_robot_observation(with_vel=False)

    if self.obs_noise:
        cur_state = utils.perturb(cur_state, 0.1, self.np_random)

    # then update past obs
    utils.push_recent_value(self.past_obs_array, cur_state)

    # then construct behavior obs
    b_obs_all = utils.select_and_merge_from_s_a(
        s_mt=list(self.past_obs_array),
        a_mt=list(self.past_bact_array),
        s_idx=self.behavior_past_obs_t_idx,
        a_idx=np.array([]))

    # if train motor, return behavior obs and we are done
    if not self.train_dyn:
        return b_obs_all

    # else, train dyn: rollout b_pi
    obs_nn = utils.wrap(b_obs_all, is_cuda=self.cuda_env)
    with torch.no_grad():
        _, action_nn, _, self.recurrent_hidden_states = self.go_actor_critic.act(
            obs_nn, self.recurrent_hidden_states, self.masks, deterministic=False)
    b_cur_act = list(utils.unwrap(action_nn, is_cuda=self.cuda_env))

    b_cur_act = utils.perturb(b_cur_act, self.enlarge_act_range, self.np_random)
    b_cur_act = np.tanh(b_cur_act)

    # store action after tanh, i.e. in (-1, 1)
    utils.push_recent_value(self.past_bact_array, b_cur_act)

    # construct G obs from updated past obs & b_act
    # g_obs_all = utils.select_and_merge_from_s_a(
    #     s_mt=list(self.past_obs_array),
    #     a_mt=list(self.past_bact_array),
    #     s_idx=self.generator_past_obs_t_idx,
    #     a_idx=self.generator_past_act_t_idx
    # )

    # calculate 4x split (per-foot) obs here
    g_obs_all = self.get_all_feet_local_obs(b_cur_act)
    return g_obs_all
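# utils.push_recent_value and utils.select_and_merge_from_s_a are history
# helpers defined elsewhere. The code above reads past_obs_array[0] as the
# newest entry, so index 0 must be most recent. A sketch consistent with that
# usage (fixed-length, newest-first history; the real helpers may differ):
def _push_recent_value_sketch(history, value):
    history.pop()                   # drop the oldest entry
    history.insert(0, list(value))  # newest goes to index 0

def _select_and_merge_sketch(s_mt, a_mt, s_idx, a_idx):
    # gather the selected time lags of states and actions into one flat vector
    parts = [s_mt[i] for i in s_idx] + [a_mt[i] for i in a_idx]
    return np.concatenate(parts) if parts else np.array([])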
def step(self, a):
    # TODO: currently for laika, env_action is 12D, 4 feet 3D without wrench

    if self.train_dyn:
        env_action = a
        robo_action = self.obs[-self.behavior_act_len:]
    else:
        robo_action = a
        robo_action = np.clip(robo_action, -1.0, 1.0)
        env_pi_obs = np.concatenate((self.obs, robo_action))
        env_pi_obs_nn = utils.wrap(env_pi_obs, is_cuda=self.cuda_env)
        with torch.no_grad():
            _, env_action_nn, _, self.recurrent_hidden_states = self.dyn_actor_critic.act(
                env_pi_obs_nn, self.recurrent_hidden_states, self.masks, deterministic=False)
        env_action = utils.unwrap(env_action_nn, is_cuda=self.cuda_env)

    # self.ratios = np.append(self.ratios, [env_action / robo_action], axis=0)

    # env_pi_obs_feat = self.feat_select_func(self.obs)
    # dis_state = np.concatenate((env_pi_obs_feat, robo_action))
    # dis_state = utils.wrap(dis_state, is_cuda=self.cuda_env)

    root_pos, _ = self.robot.get_link_com_xyz_orn(-1)
    x_0 = root_pos[0]

    # TODO
    # this is post noise (unseen), different from the seen diversification of self.enlarge_act_scale
    if self.act_noise:
        robo_action = utils.perturb(robo_action, 0.05, self.np_random)

    # # TODO
    # if self.pretrain_dyn:
    #     # self.state_id = self._p.saveState()
    #     self.img_obs, pre_s_i = self.rollout_one_step_imaginary()  # takes the old self.obs
    #     # img_obs = self.rollout_one_step_imaginary_same_session()
    #     # self._p.restoreState(self.state_id)
    #     pre_s = self.robot.get_robot_raw_state_vec()
    #     # print(pre_s_i)
    #     # print(pre_s)
    #     assert np.allclose(pre_s, pre_s_i, atol=1e-5)

    for _ in range(self.control_skip):
        self.robot.apply_action(robo_action)
        self.apply_scale_clip_conf_from_pi_new(env_action)
        self._p.stepSimulation()
        if self.render:
            time.sleep(self._ts * 0.5)
        self.timer += 1
    self.update_extended_observation()

    root_pos, _ = self.robot.get_link_com_xyz_orn(-1)
    x_1 = root_pos[0]
    self.velx = (x_1 - x_0) / (self.control_skip * self._ts)

    y_1 = root_pos[1]
    height = root_pos[2]
    q, dq = self.robot.get_q_dq(self.robot.ctrl_dofs)
    # print(np.max(np.abs(dq)))
    in_support = self.robot.is_root_com_in_support()

    if not self.pretrain_dyn:
        reward = self.ab  # alive bonus
        tar = np.minimum(self.timer / 500, self.max_tar_vel)
        reward += np.minimum(self.velx, tar) * self.vel_r_weight
        # print("v", self.velx, "tar", tar)

        reward += -self.energy_weight * np.square(robo_action).sum()
        # print("act norm", -self.energy_weight * np.square(a).sum())

        pos_mid = 0.5 * (self.robot.ll + self.robot.ul)
        q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll)
        joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97)
        reward += -self.jl_weight * joints_at_limit
        # print("jl", -self.jl_weight * joints_at_limit)

        reward += -np.minimum(np.sum(np.square(dq)) * self.dq_pen_weight, 5.0)
        weight = np.array([2.0, 1.0, 1.0] * 4)
        reward += -np.minimum(
            np.sum(np.square(q - self.robot.init_q) * weight) * self.q_pen_weight, 5.0)
        # print("vel pen", -np.minimum(np.sum(np.abs(dq)) * self.dq_pen_weight, 5.0))
        # print("pos pen", -np.minimum(np.sum(np.square(q - self.robot.init_q)) * self.q_pen_weight, 5.0))

        y_1 = root_pos[1]
        reward += -y_1 * 0.5
        # print("dev pen", -y_1 * 0.5)
    else:
        # reward = self.calc_obs_dist_pretrain(self.img_obs[:-4], self.obs[:len(self.img_obs[:-4])])
        reward = 0  # TODO

    # print("______")
    # print(in_support)
    # print("h", height)
    # print("dq.", np.abs(dq))
    # print((np.abs(dq) < 50).all())
    # print("------")

    # the conf policy will not have a body-in-contact flag
    not_done = (np.abs(dq) < 90).all() and (height > 0.3) and (height < 1.0)
    # not_done = (np.abs(dq) < 90).all() and (height > 0.3) and (height < 1.0) and in_support
    # not_done = (abs(y_1) < 5.0) and (height > 0.1) and (height < 1.0) and (rpy[2] > 0.1)
    # not_done = True

    # if not not_done:
    #     print(self.ratios.shape)
    #     print(np.mean(self.ratios, axis=0))

    # if not self.train_dyn:
    #     dis_action = self.feat_select_func(self.obs)
    #     dis_action = utils.wrap(dis_action, is_cuda=self.cuda_env)
    #     d_score = self.discri.predict_prob_single_step(dis_state, dis_action)
    #     self.d_scores.append(utils.unwrap(d_score, is_cuda=self.cuda_env))
    #     # if len(self.d_scores) > 20 and np.mean(self.d_scores[-20:]) < 0.4:
    #     #     not_done = False
    #     # if not not_done or self.timer == 1000:
    #     #     print(np.mean(self.d_scores))

    if not self.train_dyn:
        obs_behavior = self.feat_select_func(self.obs)
        return obs_behavior, reward, not not_done, {}
    return self.obs, reward, not not_done, {}
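# Illustration of the ramped velocity reward used in step() above: the target
# speed grows linearly over the first 500 control steps, saturates at
# max_tar_vel, and running faster than the target earns no extra reward.
# The default values below are illustrative, not constants from this repo:
def _ramped_vel_reward(timer, velx, max_tar_vel=2.0, vel_r_weight=4.0):
    tar = np.minimum(timer / 500, max_tar_vel)   # warm-up, then saturate
    return np.minimum(velx, tar) * vel_r_weight  # no bonus for overspeed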
def step(self, a):
    # TODO: currently for laika, env_action is 12D, 4 feet 3D without wrench

    if self.train_dyn:
        env_action = a
        robo_action = self.past_bact_array[0]  # after tanh
    else:
        robo_action = a
        robo_action = np.tanh(robo_action)
        # update past_bact with the post-tanh action
        utils.push_recent_value(self.past_bact_array, robo_action)

        env_pi_obs = utils.select_and_merge_from_s_a(
            s_mt=list(self.past_obs_array),
            a_mt=list(self.past_bact_array),
            s_idx=self.generator_past_obs_t_idx,
            a_idx=self.generator_past_act_t_idx)
        env_pi_obs_nn = utils.wrap(env_pi_obs, is_cuda=self.cuda_env)
        with torch.no_grad():
            _, env_action_nn, _, self.recurrent_hidden_states = self.dyn_actor_critic.act(
                env_pi_obs_nn, self.recurrent_hidden_states, self.masks, deterministic=False)
        env_action = utils.unwrap(env_action_nn, is_cuda=self.cuda_env)

    # if self.ratio is None:
    #     self.ratio = np.array([env_action / robo_action])
    # else:
    #     self.ratio = np.append(self.ratio, [env_action / robo_action], axis=0)
    # self.ratios = np.append(self.ratios, [env_action / robo_action], axis=0)

    # env_pi_obs_feat = self.feat_select_func(self.obs)
    # dis_state = np.concatenate((env_pi_obs_feat, robo_action))
    # dis_state = utils.wrap(dis_state, is_cuda=self.cuda_env)

    root_pos, _ = self.robot.get_link_com_xyz_orn(-1)
    x_0 = root_pos[0]

    # this is post noise (unseen), different from the seen diversification of self.enlarge_act_scale
    if self.act_noise:
        robo_action = utils.perturb(robo_action, 0.05, self.np_random)

    # collect info before sim_step() as in v4 (s_t+1 is appended later);
    # info will be used to construct the discriminator input outside.
    past_info = self.construct_past_traj_window()

    # # TODO
    # if self.pretrain_dyn:
    #     # self.state_id = self._p.saveState()
    #     self.img_obs, pre_s_i = self.rollout_one_step_imaginary()  # takes the old self.obs
    #     # img_obs = self.rollout_one_step_imaginary_same_session()
    #     # self._p.restoreState(self.state_id)
    #     pre_s = self.robot.get_robot_raw_state_vec()
    #     # print(pre_s_i)
    #     # print(pre_s)
    #     assert np.allclose(pre_s, pre_s_i, atol=1e-5)

    for _ in range(self.control_skip):
        self.robot.apply_action(robo_action)
        self.apply_scale_clip_conf_from_pi_new(env_action)
        self._p.stepSimulation()
        if self.render:
            time.sleep(self._ts * 1.0)
        self.timer += 1
    obs_new = self.get_extended_observation()  # also updates past_obs_array
    past_info += [self.past_obs_array[0]]  # s_t+1

    root_pos, _ = self.robot.get_link_com_xyz_orn(-1)
    x_1 = root_pos[0]
    self.velx = (x_1 - x_0) / (self.control_skip * self._ts)

    y_1 = root_pos[1]
    height = root_pos[2]
    q, dq = self.robot.get_q_dq(self.robot.ctrl_dofs)
    # print(np.max(np.abs(dq)))
    # in_support = self.robot.is_root_com_in_support()

    if not self.pretrain_dyn:
        reward = self.ab  # alive bonus
        tar = np.minimum(self.timer / 500, self.max_tar_vel)
        reward += np.minimum(self.velx, tar) * self.vel_r_weight
        # print("v", self.velx, "tar", tar)

        reward += -self.energy_weight * np.square(robo_action).sum()
        # print("act norm", -self.energy_weight * np.square(a).sum())

        pos_mid = 0.5 * (self.robot.ll + self.robot.ul)
        q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll)
        joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97)
        reward += -self.jl_weight * joints_at_limit
        # print("jl", -self.jl_weight * joints_at_limit)

        reward += -np.minimum(np.sum(np.square(dq)) * self.dq_pen_weight, 5.0)
        weight = np.array([2.0, 1.0, 1.0] * 4)
        reward += -np.minimum(
            np.sum(np.square(q - self.robot.init_q) * weight) * self.q_pen_weight, 5.0)
        # print("vel pen", -np.minimum(np.sum(np.abs(dq)) * self.dq_pen_weight, 5.0))
        # print("pos pen", -np.minimum(np.sum(np.square(q - self.robot.init_q)) * self.q_pen_weight, 5.0))

        y_1 = root_pos[1]
        reward += -y_1 * 0.5
        # print("dev pen", -y_1 * 0.5)
    else:
        # reward = self.calc_obs_dist_pretrain(self.img_obs[:-4], self.obs[:len(self.img_obs[:-4])])
        reward = 0  # TODO

    # print("______")
    # print(in_support)
    # print("h", height)
    # print("dq.", np.abs(dq))
    # print((np.abs(dq) < 50).all())
    # print("------")

    # the conf policy will not have a body-in-contact flag
    not_done = (np.abs(dq) < 90).all() and (height > 0.2) and (height < 1.0)
    # not_done = (abs(y_1) < 5.0) and (height > 0.1) and (height < 1.0) and (rpy[2] > 0.1)
    # not_done = True

    # if not not_done:
    #     print(self.ratio.shape)
    #     labels = list("123456789ABC")
    #     data = self.ratio
    #     from matplotlib import pyplot as plt
    #     width = 0.4
    #     fig, ax = plt.subplots()
    #     for i, l in enumerate(labels):
    #         x = np.ones(data.shape[0]) * i + (np.random.rand(data.shape[0]) * width - width / 2.)
    #         ax.scatter(x, data[:, i], s=25)
    #         median = np.median(data[:, i])
    #         ax.plot([i - width / 2., i + width / 2.], [median, median], color="k")
    #     # plt.ylim(-5, 5)
    #     ax.set_xticks(range(len(labels)))
    #     ax.set_xticklabels(labels)
    #     plt.show()
    #     self.ratio = None

    # if not self.train_dyn:
    #     dis_action = self.feat_select_func(self.obs)
    #     dis_action = utils.wrap(dis_action, is_cuda=self.cuda_env)
    #     d_score = self.discri.predict_prob_single_step(dis_state, dis_action)
    #     self.d_scores.append(utils.unwrap(d_score, is_cuda=self.cuda_env))
    #     # if len(self.d_scores) > 20 and np.mean(self.d_scores[-20:]) < 0.4:
    #     #     not_done = False
    #     # if not not_done or self.timer == 1000:
    #     #     print(np.mean(self.d_scores))

    return obs_new, reward, not not_done, {"sas_window": past_info}
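# construct_past_traj_window is defined elsewhere; judging from its use above
# (called before stepping, with s_t+1 appended afterwards to form the
# "sas_window" handed to the discriminator), it plausibly returns the recent
# (state, action) history as a list of vectors. A sketch under that
# assumption; the window length and ordering are hypothetical:
def _construct_past_traj_window_sketch(self, window=3):
    # newest-first histories; take the last `window` states and behavior actions
    return ([list(s) for s in self.past_obs_array[:window]] +
            [list(a) for a in self.past_bact_array[:window]])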
def step(self, a):
    # TODO: in this env, env_action is 12D, being the scaling factors for the torque command

    if self.train_dyn:
        env_action = a
        robo_action = self.obs[-self.behavior_act_len:]
    else:
        robo_action = a
        robo_action = np.clip(robo_action, -1.0, 1.0)
        env_pi_obs = np.concatenate((self.obs, robo_action))
        env_pi_obs_nn = utils.wrap(env_pi_obs, is_cuda=self.cuda_env)
        with torch.no_grad():
            _, env_action_nn, _, self.recurrent_hidden_states = self.dyn_actor_critic.act(
                env_pi_obs_nn, self.recurrent_hidden_states, self.masks, deterministic=False)
        env_action = utils.unwrap(env_action_nn, is_cuda=self.cuda_env)

    # if self.ratio is None:
    #     self.ratio = np.array([env_action / robo_action])
    # else:
    #     self.ratio = np.append(self.ratio, [env_action / robo_action], axis=0)
    # self.ratios = np.append(self.ratios, [env_action / robo_action], axis=0)

    # env_pi_obs_feat = self.feat_select_func(self.obs)
    # dis_state = np.concatenate((env_pi_obs_feat, robo_action))
    # dis_state = utils.wrap(dis_state, is_cuda=self.cuda_env)

    root_pos, _ = self.robot.get_link_com_xyz_orn(-1)
    x_0 = root_pos[0]

    # TODO
    # # this is post noise (unseen), different from the seen diversification of self.enlarge_act_scale
    # if self.act_noise:
    #     robo_action = utils.perturb(robo_action, 0.05, self.np_random)

    # # TODO
    # if self.pretrain_dyn:
    #     # self.state_id = self._p.saveState()
    #     self.img_obs, pre_s_i = self.rollout_one_step_imaginary()  # takes the old self.obs
    #     # img_obs = self.rollout_one_step_imaginary_same_session()
    #     # self._p.restoreState(self.state_id)
    #     pre_s = self.robot.get_robot_raw_state_vec()
    #     # print(pre_s_i)
    #     # print(pre_s)
    #     assert np.allclose(pre_s, pre_s_i, atol=1e-5)

    for _ in range(self.control_skip):
        self.robot.apply_action(env_action)
        self._p.stepSimulation()
        if self.render:
            time.sleep(self._ts * 0.5)
        self.timer += 1

        if self.timer % 2 == 0:
            # record the latest contact normal forces every other sim step
            cur_forces = []
            for foot in self.robot.feet:
                cps = self._p.getContactPoints(self.robot.go_id, self.floor_id, foot, -1)
                normal_f_sum = 0
                # print(len(cps))  # 0 or 1 or 2??
                for cp in cps:
                    normal_f_sum += cp[9]  # normal force
                cur_forces.extend([normal_f_sum / 500.0])
            self.normal_forces = np.append(self.normal_forces, [cur_forces], axis=0)

    self.update_extended_observation()

    root_pos, _ = self.robot.get_link_com_xyz_orn(-1)
    x_1 = root_pos[0]
    self.velx = (x_1 - x_0) / (self.control_skip * self._ts)

    y_1 = root_pos[1]
    height = root_pos[2]
    q, dq = self.robot.get_q_dq(self.robot.ctrl_dofs)
    # print(np.max(np.abs(dq)))
    in_support = self.robot.is_root_com_in_support()

    if not self.pretrain_dyn:
        reward = self.ab  # alive bonus
        tar = np.minimum(self.timer / 500, self.max_tar_vel)
        reward += np.minimum(self.velx, tar) * self.vel_r_weight
        # print("v", self.velx, "tar", tar)

        reward += -self.energy_weight * np.square(robo_action).sum()
        # print("act norm", -self.energy_weight * np.square(a).sum())

        pos_mid = 0.5 * (self.robot.ll + self.robot.ul)
        q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll)
        joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97)
        reward += -self.jl_weight * joints_at_limit
        # print("jl", -self.jl_weight * joints_at_limit)

        reward += -np.minimum(np.sum(np.square(dq)) * self.dq_pen_weight, 5.0)
        weight = np.array([2.0, 1.0, 1.0] * 4)
        reward += -np.minimum(
            np.sum(np.square(q - self.robot.init_q) * weight) * self.q_pen_weight, 5.0)
        # print("vel pen", -np.minimum(np.sum(np.abs(dq)) * self.dq_pen_weight, 5.0))
        # print("pos pen", -np.minimum(np.sum(np.square(q - self.robot.init_q)) * self.q_pen_weight, 5.0))

        y_1 = root_pos[1]
        reward += -y_1 * 0.5
        # print("dev pen", -y_1 * 0.5)
    else:
        # reward = self.calc_obs_dist_pretrain(self.img_obs[:-4], self.obs[:len(self.img_obs[:-4])])
        reward = 0  # TODO

    # print("______")
    # print(in_support)
    # print("h", height)
    # print("dq.", np.abs(dq))
    # print((np.abs(dq) < 50).all())
    # print("------")

    # the conf policy will not have a body-in-contact flag
    not_done = (np.abs(dq) < 90).all() and (height > 0.3) and (height < 1.0) and in_support
    # not_done = (abs(y_1) < 5.0) and (height > 0.1) and (height < 1.0) and (rpy[2] > 0.1)
    # not_done = True

    # if not not_done:
    #     print(self.ratio.shape)
    #     labels = list("123456789ABC")
    #     data = self.ratio
    #     from matplotlib import pyplot as plt
    #     width = 0.4
    #     fig, ax = plt.subplots()
    #     for i, l in enumerate(labels):
    #         x = np.ones(data.shape[0]) * i + (np.random.rand(data.shape[0]) * width - width / 2.)
    #         ax.scatter(x, data[:, i], s=25)
    #         median = np.median(data[:, i])
    #         ax.plot([i - width / 2., i + width / 2.], [median, median], color="k")
    #     # plt.ylim(-5, 5)
    #     ax.set_xticks(range(len(labels)))
    #     ax.set_xticklabels(labels)
    #     plt.show()
    #     self.ratio = None

    # if not self.train_dyn:
    #     dis_action = self.feat_select_func(self.obs)
    #     dis_action = utils.wrap(dis_action, is_cuda=self.cuda_env)
    #     d_score = self.discri.predict_prob_single_step(dis_state, dis_action)
    #     self.d_scores.append(utils.unwrap(d_score, is_cuda=self.cuda_env))
    #     # if len(self.d_scores) > 20 and np.mean(self.d_scores[-20:]) < 0.4:
    #     #     not_done = False
    #     # if not not_done or self.timer == 1000:
    #     #     print(np.mean(self.d_scores))

    if not self.train_dyn:
        obs_behavior = self.feat_select_func(self.obs)
        return obs_behavior, reward, not not_done, {}
    return self.obs, reward, not not_done, {}
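# The contact reading above uses PyBullet's getContactPoints(bodyA, bodyB,
# linkIndexA, linkIndexB); each returned contact tuple stores the normal force
# at index 9. A standalone version of the same pattern (the /500.0 is this
# repo's normalization choice, not a PyBullet constant):
def _feet_normal_forces(p, body_id, floor_id, feet_link_ids):
    forces = []
    for foot in feet_link_ids:
        cps = p.getContactPoints(body_id, floor_id, foot, -1)
        # sum normal forces over all contact points of this foot
        forces.append(sum(cp[9] for cp in cps) / 500.0)
    return forces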
def step(self, a):
    if self.act_noise and a is not None:
        a = utils.perturb(a, 0.05, self.np_random)

    if not self.use_gen_dyn:
        # step the real (PyBullet) dynamics
        for _ in range(self.control_skip):
            # the action may fall outside [-1, 1]; clip before applying
            if a is not None:
                self.act = np.clip(a, -1.0, 1.0)
                self.robot.apply_action(self.act)
            self._p.stepSimulation()
            if self.render:
                time.sleep(self._ts * 0.5)
            self.timer += 1
        self.robot.update_x()
        self.update_extended_observation()
        obs_unnorm = np.array(self.obs) / self.robot.obs_scaling
    else:
        # step the learned generator dynamics instead of the simulator
        # gen_input = list(self.obs) + list(a) + [0.0] * (11 + 3)
        gen_input = list(self.obs) + list(a) + list(np.random.normal(0, 1, 14))  # TODO
        gen_input = utils.wrap(gen_input, is_cuda=False)  # TODO
        gen_output = self.gen_dyn(gen_input)
        gen_output = utils.unwrap(gen_output, is_cuda=False)
        self.obs = gen_output
        obs_unnorm = np.array(self.obs) / self.robot.obs_scaling

        self.robot.last_x = self.robot.x
        self.robot.x += obs_unnorm[5] * (self.control_skip * self._ts)

        if self.render:
            all_qs = [self.robot.x] + list(obs_unnorm[:5])
            all_qs[1] -= 1.5
            for ind in range(self.robot.n_total_dofs):
                self._p.resetJointState(self.robot.hopper_id, ind, all_qs[ind], 0.0)
            time.sleep(self._ts * 5.0)

        if self.obs_noise:
            self.obs = utils.perturb(self.obs, 0.1, self.np_random)

        self.timer += self.control_skip

    reward = 2.0  # alive bonus
    reward += self.get_ave_dx()
    # print("v", self.get_ave_dx())
    reward += -0.1 * np.square(a).sum()
    # print("act norm", -0.1 * np.square(a).sum())

    q = np.array(obs_unnorm[2:5])
    pos_mid = 0.5 * (self.robot.ll + self.robot.ul)
    q_scaled = 2 * (q - pos_mid) / (self.robot.ul - self.robot.ll)
    joints_at_limit = np.count_nonzero(np.abs(q_scaled) > 0.97)
    reward += -2.0 * joints_at_limit
    # print("jl", -2.0 * joints_at_limit)

    dq = np.array(obs_unnorm[8:11])
    reward -= np.minimum(np.sum(np.abs(dq)) * 0.02, 5.0)  # almost like /23
    # print("vel pen", np.minimum(np.sum(np.abs(dq)) * 0.02, 5.0))

    height = obs_unnorm[0]
    # ang = self._p.getJointState(self.robot.hopper_id, 2)[0]
    # print(joints_dq)
    # print(height)
    # print("ang", ang)

    not_done = (np.abs(dq) < 50).all() and (height > .3) and (height < 1.8)

    return self.obs, reward, not not_done, {}
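# When use_gen_dyn is set, the simulator step above is replaced by a learned
# generator: next_obs = G([obs, action, z]) with z ~ N(0, I). A minimal sketch
# of a compatible generator module; the hidden size is arbitrary, and the
# dimensions (11-D hopper obs, 3-D action, 14-D noise) are inferred from the
# indexing and the 14-dim noise draw above, not confirmed by the repo:
class _GenDynSketch(torch.nn.Module):
    def __init__(self, obs_dim=11, act_dim=3, noise_dim=14, hidden=64):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(obs_dim + act_dim + noise_dim, hidden),
            torch.nn.Tanh(),
            torch.nn.Linear(hidden, obs_dim),
        )

    def forward(self, x):
        # x is the concatenated [obs, action, noise] vector
        return self.net(x)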