def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    # print(next_obs)
    distance_to_goal = self.get_distance_to_goal()
    # reward = -distance_to_goal
    # modified reward with more emphasis on the xy-position
    vec = self.get_vec_to_goal()
    reward = -np.linalg.norm(vec, ord=1)
    # print('reward = ', reward)
    done = False
    if reward > -0.03:
        done = True
        reward = reward + 100
        print("******** Reached Target ********")
    if self.kill_outside and (distance_to_goal > self.kill_radius):
        done = True
        reward = reward - 100
        print("******** OUT of region ********")
    return Step(next_obs, reward, done)
def step(self, action): self.forward_dynamics(action) com = self.get_body_com("torso") # ref_x = x + self._init_torso_x #print(self._goal_idx) goal_reward = -np.sum(np.abs(com[:2] - self.goals[self._goal_idx]) ) + 4.0 # make it happy, not suicidal lb, ub = self.action_bounds scaling = (ub - lb) * 0.5 ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) contact_cost = 0.5 * 1e-3 * np.sum( np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), survive_reward = 0.05 reward = goal_reward - ctrl_cost - contact_cost + survive_reward state = self._state notdone = np.isfinite(state).all() \ and state[2] >= 0.2 and state[2] <= 1.0 done = not notdone ob = self.get_current_obs() return Step(ob, float(reward), done)
def step(self, action):
    self.model.data.ctrl = action
    reward = 0
    timesInHand = 0
    for _ in range(self.frame_skip):
        self.model.step()
        step_reward = self.reward()
        timesInHand += step_reward > 0
        reward += step_reward
    done = reward == 0 and self.numClose > 0  # stop if the block has been flung away
    ob = self.get_current_obs()
    new_com = self.model.data.com_subtree[0]
    self.dcom = new_com - self.current_com
    self.current_com = new_com
    return Step(ob, float(reward), done, timeInHand=timesInHand)
def step(self, act):
    self.state, r = self.qmdp.transition(self.state, act)
    self.state_img = self.process_goals(self.state)
    obs = np.zeros((self.params['grid_n'], self.params['grid_m'], 3))
    obs[:, :, 0] = self.env_img
    obs[:, :, 1] = self.goal_img
    obs[:, :, 2] = self.state_img
    done = False
    self.act = act
    self.step_count = self.step_count + 1
    # if np.isclose(r, self.params['R_obst']) or self.step_count > self.params['traj_limit']:
    current_coord = self.state_lin_to_bin(self.state)
    goal_coord = self.state_lin_to_bin(self.goal_state)
    dist = np.sqrt((current_coord[0] - goal_coord[0]) ** 2 +
                   (current_coord[1] - goal_coord[1]) ** 2)
    r += self.params['kdist'] * dist
    if self.step_count > self.params['traj_limit'] or \
            self.goal_img[current_coord[0]][current_coord[1]] == 1:
        done = True
    return Step(observation=obs, reward=r, done=done)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum(
        np.square(action / scaling))
    forward_reward = np.linalg.norm(
        self.get_body_comvel("torso"))  # the swimmer cannot exploit this reward by jumping
    reward = forward_reward - ctrl_cost
    done = False
    if self.sparse_rew:
        if abs(self.get_body_com("torso")[0]) > 100.0:
            reward = 1.0
            done = True
        else:
            reward = 0.
    com = np.concatenate([self.get_body_com("torso").flat]).reshape(-1)
    ori = self.get_ori()
    return Step(next_obs, reward, done, com=com, ori=ori)
def step(self, action):
    qpos = np.copy(self.model.data.qpos)
    qpos[2, 0] += action[1]
    ori = qpos[2, 0]
    # compute increment in each direction
    dx = math.cos(ori) * action[0]
    dy = math.sin(ori) * action[0]
    # ensure that the robot is within reasonable range
    qpos[0, 0] = np.clip(qpos[0, 0] + dx, -self.size, self.size)
    qpos[1, 0] = np.clip(qpos[1, 0] + dy, -self.size, self.size)
    self.model.data.qpos = qpos
    self.model.forward()
    next_obs = self.get_current_obs()
    if self.align_mode:
        reward = max(self.reward_dir[0] * dx + self.reward_dir[1] * dy, 0)
    else:
        x, y = qpos[0, 0], qpos[1, 0]
        reward = -y * dx + x * dy
        reward /= (1 + np.abs(np.sqrt(x ** 2 + y ** 2) - self.target_dist))
    return Step(next_obs, reward, False)
def step(self, action):
    if isinstance(self._wrapped_env.action_space, Box) or isinstance(
            self._wrapped_env.action_space, gymBox):
        # rescale the action
        if isinstance(self._wrapped_env.action_space, Box):
            lb, ub = self._wrapped_env.action_space.bounds
        else:
            lb = self._wrapped_env.action_space.low
            ub = self._wrapped_env.action_space.high
        scaled_action = lb + (action + 1.) * 0.5 * (ub - lb)
        scaled_action = np.clip(scaled_action, lb, ub)
    else:
        scaled_action = action
    wrapped_step = self._wrapped_env.step(scaled_action)
    next_obs, reward, done, info = wrapped_step
    if self._normalize_obs:
        next_obs = self._apply_normalize_obs(next_obs)
    if self._normalize_reward:
        reward = self._apply_normalize_reward(reward)
    return Step(next_obs, reward * self._scale_reward, done, **info)
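# Illustrative sketch (not part of the wrapper above): the affine rescaling it
# applies, mapping a policy action in [-1, 1] elementwise onto a Box's [lb, ub]
# bounds and clipping the result. The bounds below are made-up example values.
import numpy as np

def rescale_action(action, lb, ub):
    scaled = lb + (action + 1.0) * 0.5 * (ub - lb)
    return np.clip(scaled, lb, ub)

lb, ub = np.array([-2.0, 0.0]), np.array([2.0, 1.0])
print(rescale_action(np.array([0.0, 1.0]), lb, ub))  # -> [0. 1.]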
def step(self, action):
    _, reward, done, info = self.wrapped_env.step(action)
    next_obs = self.get_current_obs()
    if self._circle_mode:
        pos = self.wrapped_env.get_body_com("torso")
        vel = self.wrapped_env.get_body_comvel("torso")
        dt = self.wrapped_env.model.opt.timestep
        x, y = pos[0], pos[1]
        dx, dy = vel[0], vel[1]
        reward = -y * dx + x * dy
        reward /= (1 + np.abs(np.sqrt(x ** 2 + y ** 2) - self._target_dist))
        if self._nonlinear_reward:
            reward *= np.abs(reward)
    self._step += 1
    if self._max_path_length_range is not None:
        if self._step > self._last_step:
            done = True
    return Step(next_obs, reward, done, **info)
def step(self, action):
    if self.MANUAL_COLLISION:
        old_pos = self.wrapped_env.get_xy()
        inner_next_obs, inner_rew, done, info = self.wrapped_env.step(action)
        new_pos = self.wrapped_env.get_xy()
        if self._is_in_collision(new_pos):
            self.wrapped_env.set_xy(old_pos)
            done = False
    else:
        inner_next_obs, inner_rew, done, info = self.wrapped_env.step(action)
    next_obs = self.get_current_obs()
    reward = self.coef_inner_rew * inner_rew
    info['inner_rew'] = inner_rew
    if self.include_maze_obs:
        info['xy_pos'] = self.goal_observation
    return Step(next_obs, reward, done, **info)
def step(self, Action):
    action = Action['action']
    action = np.tanh(action)
    data = self._conn.sendreq({"cmd": "step", "action": action.tolist()})
    prev_action = self.prev_action
    self.prev_action = action
    assert 'obs' in data
    assert 'rew' in data
    assert 'done' in data
    assert 'info' in data
    reward = data['rew']
    if Action['policy_num'] == 2:
        if data["info"]["fail"]:
            reward = 200.0
        else:
            dacc = action[2]
            dsteer = action[3]
            reward = -0.05 * (np.abs(dacc) + np.abs(dsteer))
    return Step(observation=data['obs'], reward=reward, done=data['done'])
def step(self, action):
    action = action * 0.12
    qpos = np.copy(self.model.data.qpos)
    # qpos[2, 0] += action[1]
    # ori = qpos[2, 0]
    # # compute increment in each direction
    # dx = math.cos(ori) * action[0]
    # dy = math.sin(ori) * action[0]
    # # ensure that the robot is within reasonable range
    # qpos[0, 0] = np.clip(qpos[0, 0] + dx, -7, 7)
    # qpos[1, 0] = np.clip(qpos[1, 0] + dy, -7, 7)
    # self.model.data.qpos = qpos
    # self.model.forward()
    # next_obs = self.get_current_obs()
    xy = [qpos[0, 0] + action[0], qpos[1, 0] + action[1]]
    self.set_xy(xy)
    next_obs = self.get_xy()
    return Step(next_obs, 0, False)
def step(self, action): """ Note: override this method with great care, as it post-processes the observations, etc. """ reward_computer = self.compute_reward(action) # forward the state action = self._inject_action_noise(action) for _ in range(self.frame_skip): self.forward_dynamics(action) # notifies that we have stepped the world next(reward_computer) # actually get the reward reward = next(reward_computer) self._invalidate_state_caches() self.steps += 1 done = self.is_current_done() or self.steps >= 400 next_obs = self.get_current_obs() return Step(observation=next_obs, reward=reward, done=done)
def step(self, action): #next state = what happens after taking "action" temp_state = np.copy(self._state) dt = 0.1 temp_state[0] = self._state[ 0] + self._state[2] * dt + 0.5 * action[0] * dt * dt temp_state[1] = self._state[ 1] + self._state[3] * dt + 0.5 * action[1] * dt * dt temp_state[2] = self._state[2] + action[0] * dt temp_state[3] = self._state[3] + action[1] * dt self._state = np.copy(temp_state) #make the reward what you care about x, y, vx, vy = self._state reward = vx - np.sqrt( abs(y - 0) ) #we care about moving in the forward x direction... and keeping our y value close to 0... (aka "going straight") done = 0 #x>500 #when do you consider this to be "done" (rollout stops... "terminal") next_observation = np.copy(self._state) return Step(observation=next_observation, reward=reward, done=done)
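# Worked example (hypothetical numbers) for the Euler-integrated double
# integrator above: state = [x, y, vx, vy], one step with dt = 0.1.
import numpy as np

state = np.array([0.0, 0.0, 1.0, 0.0])   # at the origin, moving forward at 1 unit/s
action = np.array([0.5, 0.0])            # accelerate in x only
dt = 0.1
x_next = state[0] + state[2] * dt + 0.5 * action[0] * dt * dt   # 0 + 0.1 + 0.0025 = 0.1025
vx_next = state[2] + action[0] * dt                             # 1 + 0.05 = 1.05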
def step(self, action):
    self.wrapped_env.forward_dynamics(action)
    next_obs = self.get_current_obs()
    alive_bonus = self.wrapped_env.alive_bonus
    data = self.wrapped_env.model.data
    comvel = self.wrapped_env.get_body_comvel("torso")
    lin_vel_reward = comvel[0]
    vel_deviation_cost = 0.5 * self.wrapped_env.vel_deviation_cost_coeff * np.sum(
        np.square(comvel[1:]))
    vel_reward = lin_vel_reward - vel_deviation_cost
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = .5 * self.wrapped_env.ctrl_cost_coeff * np.sum(
        np.square(action / scaling))
    impact_cost = .5 * self.wrapped_env.impact_cost_coeff * np.sum(
        np.square(np.clip(data.cfrc_ext, -1, 1)))
    done = data.qpos[2] < 0.8 or data.qpos[2] > 2.0
    if self._circle_mode:
        pos = self.wrapped_env.get_body_com("torso")
        vel = self.wrapped_env.get_body_comvel("torso")
        dt = self.wrapped_env.model.opt.timestep
        x, y = pos[0], pos[1]
        dx, dy = vel[0], vel[1]
        vel_reward = -y * dx + x * dy
        vel_reward /= (1 + np.abs(np.sqrt(x ** 2 + y ** 2) - self._target_dist))
    reward = vel_reward + alive_bonus - ctrl_cost - impact_cost
    self._step += 1
    if self._max_path_length_range is not None:
        if self._step > self._last_step:
            done = True
    return Step(next_obs, reward, done)
def step(self, action): if not hasattr(self, "iteration"): self.iteration = 0 import pdb; pdb.set_trace() self.init_pos = self.get_body_com("distal_4") self.frame_skip = 5 pobj = self.get_body_com("object") pdistr = self.get_body_com("distractor") pgoal = self.get_body_com("goal") ptip = self.get_body_com("distal_4") reward_ctrl = - np.square(action).sum() # if self.iteration >= 100 and self.iteration < 130:# and np.mean(self.dist[-10:]) <= 0.017: # # print('going back!') # reward_dist = - np.linalg.norm(self.init_pos[:-1]-ptip[:-1]) # reward_distr = np.linalg.norm(pdistr[:-1]-ptip[:-1]) # # reward = reward_dist + 0.01 * reward_ctrl# + 0.1 * reward_distr # reward = 1.1 * reward_dist + 0.1 * reward_ctrl + 0.1 * reward_distr# + 0.1 * reward_distr # # reward = reward_dist + 0.1 * reward_ctrl + 0.1 * reward_distr# + 0.1 * reward_distr # elif self.iteration >= 130: # reward_dist = - np.linalg.norm(pgoal[:-1]-pobj[:-1]) # reward_dist_distr = - np.linalg.norm(pgoal[:-1]-pdistr[:-1]) # reward_near = - np.linalg.norm(pdistr[:-1]-ptip[:-1]) # # reward_return = - np.linalg.norm(self.init_pos-ptip) # reward = reward_dist + reward_dist_distr + 0.1 * reward_ctrl + 0.5 * reward_near # else: # reward_dist = - np.linalg.norm(pgoal[:-1]-pobj[:-1]) # reward_near = - np.linalg.norm(pobj[:-1] - ptip[:-1]) # # reward_return = - np.linalg.norm(self.init_pos-ptip) # self.dist.append(-reward_dist) # reward = reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near reward_dist = - np.linalg.norm(pgoal-pobj) reward_dist_distr = - np.linalg.norm(pgoal-pdistr) reward_near = - np.linalg.norm(pdistr-ptip) # reward_return = - np.linalg.norm(self.init_pos-ptip) reward = reward_dist + reward_dist_distr + 0.1 * reward_ctrl + 0.5 * reward_near self.forward_dynamics(action) # TODO - frame skip next_obs = self.get_current_obs() done = False self.iteration += 1 return Step(next_obs, reward, done)
def step(self, actid):
    action = self.actions[actid]
    load = self.state[0]
    Pij, Qij, Vi2, Lij, total_loss, iter_total, convergence_flag = utils.load_flow(
        action, load)
    if convergence_flag != 1:
        print('power flow did not converge')
    Vi = np.sqrt(Vi2)
    # penalty for voltage constraint violation (0.01); total loss; tap change
    # reward = -0.01*np.sum(Vi<0.95) - 0.01*np.sum(Vi>1.05) - total_loss - 0.001*np.sum(np.abs(action - self.state[24:27]))
    # reward = -total_loss*10*100
    info = {}
    info['gain'] = -(total_loss * 5 * 40 +
                     0.1 * np.sum(np.abs(action - self.state[1:6])))  # bus4: 10; bus34: 5
    info['cost'] = np.sum(Vi < 0.95) + np.sum(Vi > 1.05)
    reward = info['gain'] - info['cost']
    if (self.local_time + 1) % 168 == 0:
        done = True
    else:
        done = False
    # update to next state
    self.global_time += 1
    if self.mode == 'Train' and self.global_time >= self.load_len_train:
        self.global_time = 0
    if self.mode == 'Test' and self.global_time >= self.load_len_test:
        self.global_time = 0
    self.local_time += 1
    self.state = np.zeros(1 + 5 + 1)
    if self.mode == 'Train':
        self.state[:1] = self.load_train[self.global_time]
    else:
        self.state[:1] = self.load_test[self.global_time]
    self.state[1:6] = action  # current tap
    self.state[6] = self.local_time
    return Step(self.state, reward, done, **info)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    action = np.clip(action, *self.action_bounds)
    reward_ctrl = -1e-1 * 0.5 * np.sum(np.square(action))
    norm_next_obs = (next_obs - self._task_config.s_mean) / self._task_config.s_std
    reward_run = np.sum(norm_next_obs * self._task_config.goal_velocity *
                        self._task_config.coef)
    reward_run = reward_run.astype(np.float32)
    reward = reward_ctrl + reward_run
    done = False
    reward_state = norm_next_obs
    return Step(next_obs, reward, done,
                reward_ctrl=reward_ctrl, reward_state=reward_state)
def step(self, action, **kwargs):
    """
    Run one timestep of the environment's dynamics. When the end of an
    episode is reached, reset() should be called to reset the environment's
    internal state.

    Input
    -----
    action : an action provided by the agent

    Outputs
    -------
    (observation, reward, done, info)
    observation : agent's observation of the current environment
    reward [Float] : amount of reward due to the previous action
    done : a boolean, indicating whether the episode has ended
    info : a dictionary containing other diagnostic information from the
        previous action
    """
    prev_state = self._state
    self._state = prev_state + np.clip(action, -0.1, 0.1)
    reward = self.reward(prev_state, action, self._state)
    done = self.done(self._state)
    next_observation = np.copy(self._state)
    return Step(next_observation, reward, done)
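# Hypothetical usage sketch of the (observation, reward, done, info) contract
# documented above, using a throwaway dummy environment; none of the names
# below come from the original code base.
import numpy as np

class DummyEnv:
    def reset(self):
        self.t = 0
        return np.zeros(2)

    def step(self, action):
        self.t += 1
        obs = np.full(2, float(self.t))
        reward = -float(np.abs(action).sum())
        done = self.t >= 5
        return obs, reward, done, {}

env = DummyEnv()
obs, done, total_reward = env.reset(), False, 0.0
while not done:
    obs, reward, done, info = env.step(np.array([0.1, -0.1]))
    total_reward += reward
print("episode return:", total_reward)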
def step(self, action, collectingInitialData=False): xposbefore = self.get_body_com("torso")[0] self.forward_dynamics(action) comvel = self.get_body_comvel("torso") forward_reward = comvel[0] xposafter = self.get_body_com("torso")[0] lb, ub = self.action_bounds scaling = (ub - lb) * 0.5 ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) contact_cost = 0 survive_reward = 0.05 reward = forward_reward - ctrl_cost - contact_cost + survive_reward state = self._state notdone = np.isfinite(state).all() \ and self.get_body_com("torso")[2] >= 0.3 and self.get_body_com("torso")[2] <= 1.0 #used to be 0.2, state[2] done = not notdone ob = self.get_current_obs() return Step(ob, float(reward), done)
def step(self, action): self.forward_dynamics(action) comvel = self.get_body_comvel("torso") forward_reward = comvel[0] lb, ub = self.action_bounds scaling = (ub - lb) * 0.5 ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) contact_cost = 0.5 * 1e-3 * np.sum( np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), survive_reward = 0.05 reward = forward_reward - ctrl_cost - contact_cost + survive_reward state = self._state notdone = np.isfinite(state).all() \ and state[2] >= 0.2 and state[2] <= 1.0 done = not notdone ob = self.get_current_obs() if forward_reward <= 3.0: reward = 0. else: reward = 1.0 return Step(ob, float(reward), done)
def step(self, action, collectingInitialData=False):
    xposbefore = self.model.data.qpos[0, 0]
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    action = np.clip(action, *self.action_bounds)
    ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action))
    run_cost = -1 * self.get_body_comvel("torso")[0]
    cost = ctrl_cost + run_cost
    # rllab reward
    reward = -cost
    # OpenAI Gym reward
    # xposafter = self.model.data.qpos[0, 0]
    # reward_ctrl = -0.1 * np.square(action).sum()
    # reward_run = (xposafter - xposbefore) / self.model.opt.timestep
    # reward = reward_ctrl + reward_run
    done = False
    return Step(next_obs, reward, done)
def step(self, action):
    self.forward_dynamics(action)
    alive_bonus = self.alive_bonus
    data = self.model.data
    comvel = self.get_body_comvel("torso")
    lin_vel_reward = comvel[0]
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = .5 * self.ctrl_cost_coeff * np.sum(
        np.square(action / scaling))
    impact_cost = 0.
    vel_deviation_cost = 0.5 * self.vel_deviation_cost_coeff * np.sum(
        np.square(comvel[1:]))
    reward = lin_vel_reward + alive_bonus - ctrl_cost - \
        impact_cost - vel_deviation_cost
    pos = data.qpos.flat[2]
    done = pos < 0.8 or pos > 2.0
    next_obs = self.get_current_obs()
    return Step(next_obs, reward, done)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    blockchoice = self.goal[0]
    curr_block_xidx = int(3 + 2 * blockchoice)
    curr_block_yidx = int(4 + 2 * blockchoice)
    # TODO: maybe need to change angle here
    curr_gripper_pos = self.model.data.site_xpos[0, :2]
    curr_block_pos = np.array(
        [next_obs[curr_block_xidx], next_obs[curr_block_yidx]])
    goal_pos = self.goal[1:3]
    dist_to_block = np.linalg.norm(curr_gripper_pos - curr_block_pos)
    block_dist = np.linalg.norm(goal_pos - curr_block_pos)
    goal_dist = np.linalg.norm(goal_pos)
    if self.sparse and block_dist > 0.2:
        reward = -5 * goal_dist
    else:
        reward = -5 * block_dist
    done = False
    return Step(next_obs, reward, done)
def step(self, action): # self.forward_dynamics(action) # comvel = self.get_body_comvel("torso") # forward_reward = self.goal_direction*comvel[0] # lb, ub = self.action_bounds # scaling = (ub - lb) * 0.5 # ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) # contact_cost = 0.5 * 1e-3 * np.sum( # np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), # survive_reward = 0.05 # reward = forward_reward - ctrl_cost - contact_cost + survive_reward # state = self._state # notdone = np.isfinite(state).all() \ # and state[2] >= 0.2 and state[2] <= 1.0 # done = not notdone # ob = self.get_current_obs() # return Step(ob, float(reward), done) # obs = self.get_current_obs() # selected_arm_mean = self.arm_means[action] # reward = float(np.random.random() < selected_arm_mean) # self.ts += 1 # done = self.ts >= self.max_path_length # state = np.zeros((2)) # state[0] = reward # state[1] = 1 # return Step(state, reward, done) ps = self.Ps[self.state, action] next_state = special.weighted_sample(ps, np.arange(self.n_states)) reward_mean = self.Rs[self.state, action] reward = reward_mean + np.random.normal() * 1 / np.sqrt(self.tau) self.ts += 1 self.state = next_state done = self.ts >= self.max_path_length state = np.zeros((2 + self.n_states)) state[self.state] = 1 state[self.n_states] = reward state[self.n_states + 1] = done return Step(state, reward, done)
def step(self, action):
    assert self.action_space.contains(
        action), "%r (%s) invalid" % (action, type(action))
    # Apply the action
    c = np.asarray(action)
    buff = struct.pack('%sf' % len(c), *c)
    self.s.send(buff)
    # Receive the state
    stateSize = self.s.recv(4)
    stateSize = struct.unpack('i', stateSize)[0]
    state = self.s.recv(1000)
    while len(state) < stateSize:
        state += self.s.recv(1000)
    state = np.asarray([
        struct.unpack('f', state[i:i + 4])[0]
        for i in range(0, len(state), 4)
    ])
    # Get the y position of the root joint
    y = state[3]
    done = y < self.y_threshold
    if not done:
        reward = 1.0
    elif self.steps_beyond_done is None:
        # skeleton just fell!
        self.steps_beyond_done = 0
        reward = -100.0
    else:
        if self.steps_beyond_done == 0:
            logger.warn(
                "You are calling 'step()' even though this environment has "
                "already returned done = True. You should always call 'reset()' "
                "once you receive 'done = True' -- any further steps are "
                "undefined behavior.")
        self.steps_beyond_done += 1
        reward = 0.0
    next_observation = np.array(self.state)
    return Step(observation=next_observation, reward=reward, done=done)
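# Illustrative sketch (separate from the socket env above) of the struct-based
# wire format it assumes: float32 values packed back-to-back in native byte
# order and decoded four bytes at a time. The values here are made up.
import struct
import numpy as np

values = np.array([0.5, -1.25, 3.0], dtype=np.float32)
buff = struct.pack('%sf' % len(values), *values)      # pack consecutive floats
decoded = [struct.unpack('f', buff[i:i + 4])[0]       # one float per 4-byte slice
           for i in range(0, len(buff), 4)]
assert np.allclose(decoded, values)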
def step(self, action):
    A, B = self._dynamics[self._get_dynamics_id()]
    if self.force_act is not None:
        action = self.force_act
    chosen_controller = self._controllers[action]
    process_noise = np.random.standard_normal(size=A.shape) * self.NOISE
    control_gain = (A - B.dot(chosen_controller)) + process_noise
    if self.normalize_action:
        assert False
        gradient = control_gain.dot(self.state)
        next_state = 0.2 * gradient / np.linalg.norm(gradient) + self.state
    else:
        next_state = 0.2 * control_gain.dot(
            self.state / np.linalg.norm(self.state)) + self.state
    reward = -np.linalg.norm(next_state) ** 2  # quad reward
    if self.observation_space.contains(next_state):
        done = abs(reward) < 0.4  # will change
    else:
        done = True
    # print reward
    self.state = next_state.copy()
    return Step(observation=self.state, reward=reward, done=done)
def step(self, action): #action = np.clip(action, -0.1, 0.1) tmp_finger = self.get_body_com("fingertip") vec = self.get_body_com("fingertip") - self._goal reward_dist = -np.linalg.norm(vec) #print(vec,reward_dist) reward_ctrl = -np.square(action).sum() sparse_reward = self.sparsify_rewards(reward_dist) reward = reward_dist + reward_ctrl sparse_reward = sparse_reward + reward_ctrl reward = sparse_reward self.do_simulation(action, self.frame_skip) ob = self._get_obs() done = False env_infos = dict(finger=tmp_finger.tolist(), reward_dist=reward_dist, reward_ctrl=reward_ctrl, sparse_reward=sparse_reward, goal=self._goal) #print(env_infos['finger']) return Step(ob, reward, done)
def step(self, action):
    if self.MANUAL_COLLISION:
        old_pos = self.wrapped_env.get_xy()
        _, _, done, info = self.wrapped_env.step(action)
        new_pos = self.wrapped_env.get_xy()
        if self._is_in_collision(new_pos):
            self.wrapped_env.set_xy(old_pos)
            done = False
    else:
        _, _, done, info = self.wrapped_env.step(action)
    next_obs = self.get_current_obs()
    x, y = self.wrapped_env.get_body_com("torso")[:2]
    # ref_x = x + self._init_torso_x
    # ref_y = y + self._init_torso_y
    reward = 0
    minx, maxx, miny, maxy = self._goal_range
    # print("goal range: x [%s,%s], y [%s,%s], now [%s,%s]" % (
    #     str(minx), str(maxx), str(miny), str(maxy), str(x), str(y)))
    if minx <= x <= maxx and miny <= y <= maxy:
        done = True
        reward = 1
    return Step(next_obs, reward, done, **info)
def step(self, action): self.forward_dynamics(action) comvel = self.get_body_comvel("torso") if self._task_config.goal_velocity == -math.inf: forward_reward = -1 * comvel[0] elif self._task_config.goal_velocity == math.inf: forward_reward = comvel[0] else: forward_reward = -np.abs(comvel[0] - self._task_config.goal_velocity) + 1.0 lb, ub = self.action_bounds scaling = (ub - lb) * 0.5 ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) contact_cost = 0. # contact_cost = 0.5 * 1e-3 * np.sum( # np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), survive_reward = 0.05 reward = forward_reward - ctrl_cost - contact_cost + survive_reward state = self._state notdone = np.isfinite(state).all() and state[2] >= 0.2 and state[2] <= 1.0 done = not notdone ob = self.get_current_obs() return Step(ob, float(reward), done)
def step(self, action):
    self.model.data.ctrl = action
    # Taking steps in the environment
    reward = 0
    for _ in range(self.frame_skip):
        self.model.step()
        step_reward = self.timestep_reward()
        reward += step_reward
    # Reached the end of the trajectory
    done = False
    onGround = self.touching_group(
        "geom_object",
        ["ground", "goal_wall1", "goal_wall2", "goal_wall3", "goal_wall4"])
    if onGround and self.numClose > 10:
        reward += self.final_reward()
        done = True
    ob = self.get_current_obs()
    new_com = self.model.data.com_subtree[0]
    self.dcom = new_com - self.current_com
    self.current_com = new_com
    # Recording metrics
    obj_position = self.get_body_com("object")
    goal_position = self.get_body_com("goal")
    distance = np.linalg.norm((goal_position - obj_position)[:2])
    normalizedDistance = distance / self.init_block_goal_dist
    return Step(ob, float(reward), done,
                distance=distance, norm_distance=normalizedDistance)