def __del__(self):
    print('Arm random goal destructor called')
    if self.is_validation:
        # If fail_points is not defined yet, do nothing
        if not hasattr(self, 'fail_points'):
            return

        def print_list(list_):
            for item in list_:
                print(item)

        print('Saving failed goals to failed_points.pkl')
        with open('failed_points.pkl', 'wb') as f:
            pickle.dump(self.fail_points, f, pickle.HIGHEST_PROTOCOL)
        print('Failed goals, target coords: ')
        print_list(enumerate([x for x, y in self.fail_points]))

        # We spawn gazebo markers here to exploit the fact that spinningup test_policy
        # somehow doesn't close gazebo after it's done
        k = 100
        ut_gazebo.create_marker(self.node, 0.0, 0.0, 0.0, diameter=0.004)
        for x, y in self.fail_points:
            ut_gazebo.create_marker(self.node, x[0], x[1], x[2], diameter=0.001, id=k)
            k += 1
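As a hedged offline-inspection sketch (not part of the original source), the file written by __del__ above can be unpickled to review the failed goals; the assumption that each entry of fail_points is a (target_coords, joint_values) pair is inferred from the `for x, y in self.fail_points` loop above.

import pickle

with open('failed_points.pkl', 'rb') as f:
    fail_points = pickle.load(f)
for idx, (target_coords, joint_values) in enumerate(fail_points):
    # Second element of each pair is assumed to be the goal's joint values
    print(idx, target_coords, joint_values)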
def __init__(self, node: rclpy.node.Node, robot, **kwargs):
    super().__init__(node, robot, **kwargs)
    # The target coords are currently arbitrarily set to some achievable point
    # This is the target for grip_end_point when the target joint values are: [1.00, -1.00, 1.00]
    target_x = 0.10175
    target_y = -0.05533
    target_z = 0.1223
    self.target_coords = numpy.array([target_x, target_y, target_z])
    if isinstance(robot, LobotArmSim):  # Check if is gazebo or not
        # Spawn the target marker if it is gazebo
        print(f'Fixed goal: spawning to: {self.target_coords}')
        ut_gazebo.create_marker(node, target_x, target_y, target_z, diameter=0.004)
    self.previous_coords = numpy.array([0.0, 0.0, 0.0])
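As a hedged check (not in the original code), the fixed target above should be reproducible by running the same forward-kinematics helper on the joint values named in the comment; this sketch assumes the fk module and numpy are imported as in the surrounding code.

import os
import numpy
from ament_index_python.packages import get_package_share_directory

# Resolve the same URDF used by the tasks below (assumed path)
arm_urdf_path = os.path.join(get_package_share_directory('lobot_description'),
                             'robots/arm_standalone.urdf')
solver = fk.ForwardKinematics(arm_urdf_path)  # fk module as used elsewhere in this code
res = solver.calculate('world', 'grip_end_point', numpy.array([1.00, -1.00, 1.00]))
# Expected to be close to (0.10175, -0.05533, 0.1223) if the comment above is accurate
print(res.translation.x, res.translation.y, res.translation.z)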
def reset(self):
    self.previous_coords = numpy.array([0.0, 0.0, 0.0])
    if isinstance(self.robot, LobotArmSim):  # Check if is simulator or real
        # Move the target marker if it is gazebo
        print(f'Moving to {(self.target_coords[0], self.target_coords[1], self.target_coords[2])}')
        spawn_success = ut_gazebo.create_marker(self.node, self.target_coords[0],
                                                self.target_coords[1], self.target_coords[2],
                                                diameter=0.004)
def reset(self):
    self.reward_noise_sigma = self.original_reward_noise_sigma
    # Set initial coordinates
    initial_joint_values = numpy.array([0.0, 0.0, 0.0])
    res = self._fk.calculate('world', 'grip_end_point', initial_joint_values)
    self.previous_coords = numpy.array(
        [res.translation.x, res.translation.y, res.translation.z])
    self.initial_coords = numpy.array(
        [res.translation.x, res.translation.y, res.translation.z])
    self.__reset_count += 1
    if self.__reset_count % self.episodes_per_goal == 0:
        self.target_coords_ik, self.target_coords = self.__get_target_coords()
        print(f'Moving to [{self.target_coords[0]:.6f}, {self.target_coords[1]:.6f}, {self.target_coords[2]:.6f}], '
              f'Given by joint values [{self.target_coords_ik[0]:.6f}, {self.target_coords_ik[1]:.6f}, {self.target_coords_ik[2]:.6f}]')
    if isinstance(self.robot, LobotArmSim):  # Check if is simulator or real
        # Move the target marker if it is gazebo
        spawn_success = ut_gazebo.create_marker(self.node, self.target_coords[0],
                                                self.target_coords[1], self.target_coords[2],
                                                diameter=0.004)
def __init__(self, node: rclpy.node.Node, robot, task_kwargs: Dict = None, max_time_step: int = 500):
    if task_kwargs is None:
        task_kwargs = {}
    self.node = node
    self.robot = robot
    self.accepted_dist_to_bounds = task_kwargs.get('accepted_dist_to_bounds', 0.001)
    self.accepted_error = task_kwargs.get('accepted_error', 0.001)
    self.reach_target_bonus_reward = task_kwargs.get('reach_target_bonus_reward', 0.0)
    self.reach_bounds_penalty = task_kwargs.get('reach_bounds_penalty', 0.0)
    self.contact_penalty = task_kwargs.get('contact_penalty', 0.0)
    self.episodes_per_goal = task_kwargs.get('episodes_per_goal', 1)
    self.goal_buffer_size = task_kwargs.get('goal_buffer_size', 20)
    self.goal_from_buffer_prob = task_kwargs.get('goal_from_buffer_prob', 0.0)
    print('-------------------------------Setting task parameters-------------------------------')
    print('accepted_dist_to_bounds: %f # Allowable distance to joint limits'
          % self.accepted_dist_to_bounds)
    print('accepted_error: %f # Allowable distance from target coordinates'
          % self.accepted_error)
    print('reach_target_bonus_reward: %f # Bonus reward upon reaching target'
          % self.reach_target_bonus_reward)
    print('reach_bounds_penalty: %f # Reward penalty when reaching joint limit'
          % self.reach_bounds_penalty)
    print('contact_penalty: %f # Reward penalty for collision'
          % self.contact_penalty)
    print('episodes_per_goal: %d # Number of episodes before generating another random goal'
          % self.episodes_per_goal)
    print('goal_buffer_size: %d # Number of goals to store in the buffer to be reused later'
          % self.goal_buffer_size)
    print('goal_from_buffer_prob: %f # Probability of selecting a random goal from the goal buffer, value between 0 and 1'
          % self.goal_from_buffer_prob)
    print('-------------------------------------------------------------------------------------')
    assert self.accepted_dist_to_bounds >= 0.0, 'Allowable distance to joint limits should be positive'
    assert self.accepted_error >= 0.0, 'Accepted error to end coordinates should be positive'
    assert self.reach_target_bonus_reward >= 0.0, 'Reach target bonus reward should be positive'
    assert self.contact_penalty >= 0.0, 'Contact penalty should be positive'
    assert self.reach_bounds_penalty >= 0.0, 'Reach bounds penalty should be positive'
    assert isinstance(self.episodes_per_goal, int), \
        f'Episodes per goal should be an integer, current type: {type(self.episodes_per_goal)}'
    assert self.episodes_per_goal >= 1, \
        'Episodes per goal should be greater than or equal to 1, i.e. episodes_per_goal >= 1'
    assert isinstance(self.goal_buffer_size, int), \
        f'Goal buffer size should be an integer, current type: {type(self.goal_buffer_size)}'
    assert self.goal_buffer_size > 0, \
        'Goal buffer size should be greater than or equal to 1, i.e. goal_buffer_size >= 1'
    assert 0 <= self.goal_from_buffer_prob <= 1, \
        'Probability of selecting goal from buffer should be between 0 and 1'
    self._max_time_step = max_time_step
    lobot_desc_share_path = get_package_share_directory('lobot_description')
    arm_urdf_path = os.path.join(lobot_desc_share_path, 'robots/arm_standalone.urdf')
    self._fk = fk.ForwardKinematics(arm_urdf_path)
    self.target_coords = self.__generate_target_coords()
    self.coords_buffer = deque(maxlen=self.goal_buffer_size)
    self.coords_buffer.append(self.target_coords)
    target_x = self.target_coords[0]
    target_y = self.target_coords[1]
    target_z = self.target_coords[2]
    if isinstance(robot, LobotArmSim):  # Check if is gazebo
        # Spawn the target marker if it is gazebo
        print(f'Spawning to: {(target_x, target_y, target_z)}')
        spawn_success = ut_gazebo.create_marker(node, target_x, target_y, target_z, diameter=0.004)
    self.previous_coords = numpy.array([0.0, 0.0, 0.0])
    self.__reset_count = 0
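A minimal construction sketch for the task_kwargs variant above, assuming a task class named ArmRandomGoalTask purely for illustration (the real class name is not shown in this section) and an already-initialised node and robot; the dictionary keys match the task_kwargs entries read in __init__:

task_kwargs = {
    'accepted_error': 0.002,            # metres from the target that counts as reached
    'reach_target_bonus_reward': 10.0,
    'reach_bounds_penalty': 5.0,
    'contact_penalty': 5.0,
    'episodes_per_goal': 4,
    'goal_buffer_size': 20,
    'goal_from_buffer_prob': 0.25,
}
task = ArmRandomGoalTask(node, robot, task_kwargs=task_kwargs, max_time_step=500)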
def compute_reward(self, joint_states: numpy.ndarray, arm_state: ArmState) -> Tuple[float, Dict]:
    assert len(joint_states) == 3, \
        f'Expected 3 values for joint states, but got {len(joint_states)} values instead'
    coords_get_result = self.__get_coords(joint_states)
    assert len(coords_get_result) == 3, \
        f'Expected 3 values after getting coordinates, but got {len(coords_get_result)} values instead'
    current_coords = coords_get_result
    assert arm_state != ArmState.Undefined, 'Arm state cannot be undefined, please check logic'

    # Give 0 reward on initial state
    # if numpy.array_equal(self.previous_coords, numpy.array([0.0, 0.0, 0.0])):
    #     # print('Initial state detected, giving 0 reward')
    #     reward = 0.0
    # else:
    reward = self.__calc_dist_change(self.previous_coords, current_coords)

    # Normalise rewards
    mag_target = numpy.linalg.norm(self.initial_coords - self.target_coords)
    normalised_reward = reward / mag_target
    # Scale up normalised reward slightly such that the total reward is between 0 and 10 instead of between 0 and 1
    normalised_reward *= 10
    # Scale up reward so that it is not so small if not normalised
    normal_scaled_reward = reward * 100
    # Calculate current distance to goal (for information purposes only)
    dist = numpy.linalg.norm(current_coords - self.target_coords)
    reward_info = {
        'normalised_reward': normalised_reward,
        'normal_reward': normal_scaled_reward,
        'distance_to_goal': dist,
        'target_coords': self.target_coords,
        'current_coords': current_coords
    }
    if self.normalise_reward:
        reward = normalised_reward
    else:
        reward = normal_scaled_reward

    # Calculate exponential reward component
    if self.exp_rew_scaling is not None:
        exp_reward = self.__calc_exponential_reward(self.previous_coords, current_coords)
        reward_info['exp_reward'] = exp_reward
        reward += exp_reward
    else:
        reward_info['exp_reward'] = 0.0

    # Add reward noise
    if self.reward_noise_mu is not None and self.reward_noise_sigma is not None:
        rew_noise = numpy.random.normal(self.reward_noise_mu, self.reward_noise_sigma)
        if self.reward_noise_decay is not None:
            self.reward_noise_sigma = self.reward_noise_sigma * math.exp(-self.reward_noise_decay)
        reward_info['rew_noise'] = rew_noise
        reward += rew_noise
    else:
        reward_info['rew_noise'] = 0.0

    self.previous_coords = current_coords

    # Reward shaping logic
    # Check if it has reached the target destination
    if arm_state == ArmState.Reached:
        # If reached target destination and is continuous run, we generate another set of coordinates
        # This has to be after the __calc_dist_change function because that uses self.target_coords to calculate
        if self.continuous_run:
            self.target_coords_ik, self.target_coords = self.__get_target_coords()
            print(f'Moving to [{self.target_coords[0]:.6f}, {self.target_coords[1]:.6f}, {self.target_coords[2]:.6f}], '
                  f'Given by joint values [{self.target_coords_ik[0]:.6f}, {self.target_coords_ik[1]:.6f}, {self.target_coords_ik[2]:.6f}]')
            ut_gazebo.create_marker(self.node, self.target_coords[0], self.target_coords[1],
                                    self.target_coords[2], diameter=0.004)
        reward += self.reach_target_bonus_reward
    # Check if it has approached any joint limits
    if arm_state == ArmState.ApproachJointLimits:
        reward -= self.reach_bounds_penalty
    # Check for collision
    if arm_state == ArmState.Collision:
        reward -= self.contact_penalty
    if arm_state == ArmState.Timeout:
        reward -= self.timeout_penalty
    return reward, reward_info
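The private helper __calc_dist_change is not shown in this section; a plausible sketch, consistent with how compute_reward uses it (a positive value when the end effector moves closer to self.target_coords, so the normalised reward accumulates to roughly 1 over a successful episode), is:

def __calc_dist_change(self, coords_init, coords_next):
    # Assumed form: reward is the reduction in Euclidean distance to the goal over one step
    dist_before = numpy.linalg.norm(coords_init - self.target_coords)
    dist_after = numpy.linalg.norm(coords_next - self.target_coords)
    return dist_before - dist_after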
def __init__(self, node: rclpy.node.Node, robot, max_time_step: int = 500,
             accepted_dist_to_bounds=0.001, accepted_error=0.001,
             reach_target_bonus_reward=0.0, reach_bounds_penalty=0.0,
             contact_penalty=0.0, timeout_penalty=0.0, episodes_per_goal=1,
             goal_buffer_size=20, goal_from_buffer_prob=0.0,
             num_adjacent_goals=0, is_validation=False, random_goal_seed=None,
             random_goal_file=None, normalise_reward=False, continuous_run=False,
             reward_noise_mu=None, reward_noise_sigma=None, reward_noise_decay=None,
             exp_rew_scaling=None):
    self.node = node
    self.robot = robot
    self._max_time_step = max_time_step
    self.accepted_dist_to_bounds = accepted_dist_to_bounds
    self.accepted_error = accepted_error
    self.reach_target_bonus_reward = reach_target_bonus_reward
    self.reach_bounds_penalty = reach_bounds_penalty
    self.contact_penalty = contact_penalty
    self.timeout_penalty = timeout_penalty
    self.episodes_per_goal = episodes_per_goal
    self.goal_buffer_size = goal_buffer_size
    self.goal_from_buffer_prob = goal_from_buffer_prob
    self.num_adjacent_goals = num_adjacent_goals
    self.is_validation = is_validation
    self.random_goal_seed = random_goal_seed
    self.random_goal_file = random_goal_file
    self.normalise_reward = normalise_reward
    self.continuous_run = continuous_run
    self.reward_noise_mu = reward_noise_mu
    self.reward_noise_sigma = reward_noise_sigma
    self.original_reward_noise_sigma = reward_noise_sigma
    self.reward_noise_decay = reward_noise_decay
    self.exp_rew_scaling = exp_rew_scaling
    print('-------------------------------Setting task parameters-------------------------------')
    print('max_time_step: %8d # Maximum time step before stopping the episode'
          % self._max_time_step)
    print('accepted_dist_to_bounds: %8.7f # Allowable distance to joint limits (radians)'
          % self.accepted_dist_to_bounds)
    print('accepted_error: %8.7f # Allowable distance from target coordinates (metres)'
          % self.accepted_error)
    print('reach_target_bonus_reward: %8.7f # Bonus reward upon reaching target'
          % self.reach_target_bonus_reward)
    print('reach_bounds_penalty: %8.7f # Reward penalty when reaching joint limit'
          % self.reach_bounds_penalty)
    print('contact_penalty: %8.7f # Reward penalty for collision'
          % self.contact_penalty)
    print('timeout_penalty: %8.7f # Reward penalty for timeout'
          % self.timeout_penalty)
    print('episodes_per_goal: %8d # Number of episodes before generating another random goal'
          % self.episodes_per_goal)
    print('goal_buffer_size: %8d # Number of goals to store in the buffer to be reused later'
          % self.goal_buffer_size)
    print('goal_from_buffer_prob: %8.7f # Probability of selecting a random goal from the goal buffer, value between 0 and 1'
          % self.goal_from_buffer_prob)
    print('num_adjacent_goals: %8d # Number of nearby goals to be generated for each randomly generated goal'
          % self.num_adjacent_goals)
    print(f'random_goal_seed: {str(self.random_goal_seed):8} # Seed used to generate the random goals')
    print(f'random_goal_file: {self.random_goal_file} # Path to the numpy save file containing the random goals')
    print('is_validation: %8r # Whether this is a validation run, if true will print which points failed and how many reached'
          % self.is_validation)
    print('normalise_reward: %8r # Perform reward normalisation, this happens before reward bonus and penalties'
          % self.normalise_reward)
    print('continuous_run: %8r # Continuously run the simulation, even after it reaches the destination'
          % self.continuous_run)
    print(f'reward_noise_mu: {self.reward_noise_mu} # Reward noise mean (reward noise follows gaussian distribution)')
    print(f'reward_noise_sigma: {self.reward_noise_sigma} # Reward noise standard deviation, recommended 0.5')
    print(f'reward_noise_decay: {self.reward_noise_decay} # Constant for exponential reward noise decay '
          f'(recommended 0.31073, decays to 0.002 in 20 steps)')
    print(f'exp_rew_scaling: {self.exp_rew_scaling} # Constant for exponential reward scaling '
          f'(None by default, recommended 5.0, cumulative exp_reward = 29.48)')
    print('-------------------------------------------------------------------------------------')
    assert self.accepted_dist_to_bounds >= 0.0, 'Allowable distance to joint limits should be positive'
    assert self.accepted_error >= 0.0, 'Accepted error to end coordinates should be positive'
    assert self.reach_target_bonus_reward >= 0.0, 'Reach target bonus reward should be positive'
    assert self.contact_penalty >= 0.0, 'Contact penalty should be positive'
    assert self.timeout_penalty >= 0.0, 'Timeout penalty should be positive'
    assert self.reach_bounds_penalty >= 0.0, 'Reach bounds penalty should be positive'
    assert isinstance(self.episodes_per_goal, int), \
        f'Episodes per goal should be an integer, current type: {type(self.episodes_per_goal)}'
    assert self.episodes_per_goal >= 1, \
        'Episodes per goal should be greater than or equal to 1, i.e. episodes_per_goal >= 1'
    assert isinstance(self.goal_buffer_size, int), \
        f'Goal buffer size should be an integer, current type: {type(self.goal_buffer_size)}'
    assert self.goal_buffer_size > 0, \
        'Goal buffer size should be greater than or equal to 1, i.e. goal_buffer_size >= 1'
    assert 0 <= self.goal_from_buffer_prob <= 1, \
        'Probability of selecting goal from buffer should be between 0 and 1'
    assert isinstance(self.num_adjacent_goals, int), \
        f'Number of adjacent goals should be an integer, current type: {type(self.num_adjacent_goals)}'
    assert self.num_adjacent_goals >= 0, \
        f'Number of adjacent goals should be positive, current value: {self.num_adjacent_goals}'
    if self.random_goal_seed is not None:
        assert isinstance(self.random_goal_seed, int), \
            f'Random goal seed should be an integer, current type: {type(self.random_goal_seed)}'
    if self.random_goal_file is not None:
        assert os.path.exists(self.random_goal_file)
    if reward_noise_decay is not None:
        assert self.reward_noise_mu is not None and self.reward_noise_sigma is not None
        assert isinstance(self.reward_noise_mu, float) and isinstance(self.reward_noise_sigma, float)
    if exp_rew_scaling is not None:
        assert isinstance(self.exp_rew_scaling, float), \
            f'Exponential reward scaling factor should be a float, current type: {type(self.exp_rew_scaling)}'

    lobot_desc_share_path = get_package_share_directory('lobot_description')
    arm_urdf_path = os.path.join(lobot_desc_share_path, 'robots/arm_standalone.urdf')
    self._fk = fk.ForwardKinematics(arm_urdf_path)
    self.coords_buffer = deque(maxlen=self.goal_buffer_size)
    self.target_coords_ik, self.target_coords = self.__get_target_coords()
    target_x = self.target_coords[0]
    target_y = self.target_coords[1]
    target_z = self.target_coords[2]
    if isinstance(robot, LobotArmSim):  # Check if is gazebo
        # Spawn the target marker if it is gazebo
        print(f'Spawning to: {(target_x, target_y, target_z)}')
        spawn_success = ut_gazebo.create_marker(node, target_x, target_y, target_z, diameter=0.004)
    self.previous_coords = None
    self.initial_coords = None
    self.__reset_count: int = 0
    self.fail_points = []
    self.__reach_count: int = 0
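A hedged validation-run sketch for the keyword-argument variant above, again using a placeholder class name (ArmRandomGoalTask) and assuming node and robot are already set up elsewhere:

task = ArmRandomGoalTask(
    node, robot,
    max_time_step=500,
    accepted_error=0.001,
    episodes_per_goal=1,
    is_validation=True,        # save failed goals to failed_points.pkl on shutdown
    random_goal_seed=42,       # reproducible goal sequence
    reward_noise_mu=0.0,
    reward_noise_sigma=0.5,    # recommended value from the parameter printout above
    reward_noise_decay=0.31073,
)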