def test_step_both(self):
    # Create two instances and send different actions to them.
    # Verify that both go towards their target.
    robot1 = SimFinger(finger_type="trifingerpro")
    robot2 = SimFinger(finger_type="trifingerpro")

    start_position = np.array([0.0, 0.7, -1.5] * 3)
    robot1.reset_finger_positions_and_velocities(start_position)
    robot2.reset_finger_positions_and_velocities(start_position)

    action1 = robot1.Action(position=[0.5, 0.7, -1.5] * 3)
    action2 = robot2.Action(position=[-0.1, 0.7, -1.5] * 3)

    for i in range(1000):
        t1 = robot1.append_desired_action(action1)
        t2 = robot2.append_desired_action(action2)
        obs1 = robot1.get_observation(t1)
        obs2 = robot2.get_observation(t2)

        if i > 1:
            self.assertTrue((obs2.position != obs1.position).any())

    self.assertLess(np.linalg.norm(action1.position - obs1.position), 0.1)
    self.assertLess(np.linalg.norm(action2.position - obs2.position), 0.1)
def test_constant_torque(self):
    """Compare two runs sending a constant torque.

    In each run the finger is reset to a fixed position and a constant
    torque is applied for a number of steps.  The final observation of
    each run is compared.  If the simulation behaves deterministically,
    the observations should be equal.
    """
    finger = SimFinger(finger_type="fingerone")

    start_position = [0.5, -0.7, -1.5]
    action = finger.Action(torque=[0.3, 0.3, 0.3])

    def run():
        finger.reset_finger_positions_and_velocities(start_position)
        for _ in range(30):
            finger._set_desired_action(action)
            finger._step_simulation()
        return finger._get_latest_observation()

    first_run = run()
    second_run = run()

    np.testing.assert_array_equal(first_run.torque, second_run.torque)
    np.testing.assert_array_equal(first_run.position, second_run.position)
    np.testing.assert_array_equal(first_run.velocity, second_run.velocity)
def execute_random_motion(*, finger_name, nb_timesteps, enable_visualization):
    """Move the robot with a random torque for the given number of steps.

    A new torque is sampled every 50 time steps and applied until the
    next one is drawn.
    """
    finger = SimFinger(
        finger_type=finger_name,
        enable_visualization=enable_visualization,
    )
    cube = collision_objects.Block()

    for t in range(nb_timesteps):
        if t % 50 == 0:
            torque = np.random.uniform(low=-0.36, high=0.36, size=(9,))
        finger.append_desired_action(finger.Action(torque=torque))

    del finger
    del cube
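# Example invocation of the above (a sketch; the finger type and step
# count are arbitrary illustration values, not requirements of the
# function -- the torque is 9-dimensional, so a three-finger type is
# assumed):
#
#     execute_random_motion(
#         finger_name="trifingerpro",
#         nb_timesteps=500,
#         enable_visualization=False,
#     )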
def test_step_one(self):
    # Create two instances, send actions and step only one.  Verify that
    # the other one does not move.
    robot1 = SimFinger(finger_type="trifingerpro")
    robot2 = SimFinger(finger_type="trifingerpro")

    start_position = np.array([0.0, 0.7, -1.5] * 3)
    robot1.reset_finger_positions_and_velocities(start_position)
    robot2.reset_finger_positions_and_velocities(start_position)

    action = robot1.Action(torque=[0.3, 0.3, 0.3] * 3)
    for _ in range(30):
        robot1._set_desired_action(action)
        robot1._step_simulation()
    obs1 = robot1._get_latest_observation()
    obs2 = robot2._get_latest_observation()

    self.assertTrue((start_position != obs1.position).any())
    np.testing.assert_array_equal(start_position, obs2.position)
class TriFingerPush(gym.Env):
    """A gym environment to enable training on any of the valid robots,
    real or simulated, for the task of pushing.
    """

    def __init__(
        self,
        control_rate_s,
        finger_type,
        enable_visualization,
    ):
        """Initializes the constituents of the pushing environment.

        The constructor sets up the finger robot depending on the finger
        type, sets up the observation and action spaces, and provides a
        way to synchronize robots being trained independently.

        Args:
            control_rate_s (float): the rate (in seconds) at which the
                step method of the env will run.  The actual robot
                controller may run at a higher rate, so this is used to
                compute the number of robot control updates per
                environment step.
            finger_type (string): name of the finger type.  To get a list
                of the valid finger types, call
                :meth:`.finger_types_data.get_valid_finger_types`.
            enable_visualization (bool): whether the simulation env is to
                be visualized.
        """
        #: an instance of the simulated robot depending on the desired
        #: robot type
        self.finger = SimFinger(
            finger_type=finger_type,
            enable_visualization=enable_visualization,
        )

        self.num_fingers = finger_types_data.get_number_of_fingers(
            finger_type)

        #: the number of times the same action is to be applied to
        #: the robot in one step
        self.steps_per_control = int(
            round(control_rate_s / self.finger.time_step_s))
        assert (abs(control_rate_s
                    - self.steps_per_control * self.finger.time_step_s)
                <= 0.000001)

        #: the types of observations that should be a part of the
        #: environment's observed state
        self.observations_keys = [
            "joint_positions",
            "joint_velocities",
            "action_joint_positions",
            "goal_position",
            "object_position",
        ]

        #: the size of each observation type in the observation keys
        #: (in the same order)
        self.observations_sizes = [
            3 * self.num_fingers,
            3 * self.num_fingers,
            3 * self.num_fingers,
            3,
            3,
        ]

        # Set up the observation and action spaces for the environment.
        # The unscaled spaces have custom bounds for each observation or
        # action type, whereas all values in the scaled observation and
        # action spaces lie between -1 and 1.
        self.spaces = FingerSpaces(
            num_fingers=self.num_fingers,
            observations_keys=self.observations_keys,
            observations_sizes=self.observations_sizes,
            separate_goals=False,
        )

        self.unscaled_observation_space = (
            self.spaces.get_unscaled_observation_space())
        self.unscaled_action_space = self.spaces.get_unscaled_action_space()

        self.observation_space = self.spaces.get_scaled_observation_space()
        self.action_space = self.spaces.get_scaled_action_space()

        #: a logger to enable logging of observations if desired
        self.logger = DataLogger()

        #: the object that has to be pushed
        self.block = collision_objects.Block()

        #: a marker to visualize the target goal position for the episode
        self.goal_marker = visual_objects.Marker(
            number_of_goals=1,
            goal_size=0.0325,
            initial_position=[0.19, 0.08, 0.0425],
        )

        self.reset()

    def _compute_reward(self, object_position, goal):
        """The reward function of the environment.

        Args:
            object_position (list): the position of the object at the
                current step
            goal (list): the desired goal for the episode

        Returns:
            the reward, and the done signal
        """
        dist = utils.compute_distance(object_position, goal)
        reward = -dist
        done = False

        return reward, done

    def _get_state(self, observation, action, log_observation=False):
        """Get the current observation from the env for the agent.

        Args:
            observation: the robot observation at the current step
            action (list): the action applied at the current step
            log_observation (bool): specify whether you want to log the
                observation

        Returns:
            observation (list): comprising of the observations
            corresponding to the key values in the observation_keys
        """
        joint_positions = observation.position
        joint_velocities = observation.velocity
        tip_positions = self.finger.pinocchio_utils.forward_kinematics(
            joint_positions)
        end_effector_position = np.concatenate(tip_positions)
        flat_goals = np.concatenate([self.goal] * self.num_fingers)
        if self.num_fingers == 1:
            flat_goals = self.goal
        end_effector_to_goal = list(
            np.subtract(flat_goals, end_effector_position))

        # Populate this observation dict, from which the observation types
        # that are finally used are selected depending on the keys used
        # for constructing the observation space of the environment.
        observation_dict = {}
        observation_dict["joint_positions"] = joint_positions
        observation_dict["joint_velocities"] = joint_velocities
        observation_dict["end_effector_position"] = end_effector_position
        observation_dict["end_effector_to_goal"] = end_effector_to_goal
        observation_dict["goal_position"] = self.goal
        observation_dict["object_position"], _ = self.block.get_state()
        observation_dict["action_joint_positions"] = action

        if log_observation:
            self.logger.append(
                observation_dict["joint_positions"],
                observation_dict["end_effector_position"],
                time.time(),
            )

        # Return only the observations corresponding to the keys that were
        # used for constructing the observation space.
        observation = [
            v for key in self.spaces.observations_keys
            for v in observation_dict[key]
        ]

        return observation

    def step(self, action):
        """The env step method.

        Args:
            action (list): the joint positions that have to be achieved

        Returns:
            the observation scaled to lie between [-1;1], the reward,
            the done signal, and info on if the agent was successful at
            the current step
        """
        # Unscale the action to the ranges of the action space of the
        # environment explicitly (as the prediction from the network lies
        # in the range [-1;1]).
        unscaled_action = utils.unscale(action, self.unscaled_action_space)

        # This is the control loop that sends the action for a number of
        # time steps, which depends on the actual control rate.
        finger_action = self.finger.Action(position=unscaled_action)
        state = None
        for _ in range(self.steps_per_control):
            t = self.finger.append_desired_action(finger_action)
            observation = self.finger.get_observation(t)
            if state is None:
                state = self._get_state(observation, unscaled_action, True)

        key_observation = state[self.spaces.key_to_index["object_position"]]

        reward, done = self._compute_reward(key_observation, self.goal)
        info = {"is_success": np.float32(done)}
        scaled_observation = utils.scale(state,
                                         self.unscaled_observation_space)
        return scaled_observation, reward, done, info

    def reset(self):
        """Episode reset.

        Returns:
            the observation (scaled to [-1;1]) from the env after the reset
        """
        # reset the finger to a random position
        action = sample.feasible_random_joint_positions_for_reaching(
            self.finger, self.spaces.action_bounds)
        observation = self.finger.reset_finger_positions_and_velocities(
            action)

        #: the episode target for the agent, sampled randomly for each
        #: episode
        self.goal = sample.random_position_in_arena(height_limits=0.0425)

        #: the position at which the object is initialized at the
        #: beginning of each episode
        self.block_position = sample.random_position_in_arena(
            height_limits=0.0425)

        self.goal_marker.set_state([self.goal])
        self.block.set_state(self.block_position, [0, 0, 0, 1])

        # log relevant information for replayability
        self.logger.new_episode(self.block_position, self.goal)

        return utils.scale(
            self._get_state(observation, action, True),
            self.unscaled_observation_space,
        )
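# A minimal usage sketch for TriFingerPush (the control rate and finger
# type below are illustrative assumptions, not values required by the
# environment):
#
#     env = TriFingerPush(
#         control_rate_s=0.02,
#         finger_type="trifingerpro",
#         enable_visualization=False,
#     )
#     observation = env.reset()
#     for _ in range(100):
#         # a trained policy would normally predict the action here;
#         # sampling from the scaled action space keeps the sketch
#         # self-contained
#         observation, reward, done, info = env.step(
#             env.action_space.sample())
#
# With control_rate_s=0.02 and, assuming a simulation time step of
# 0.001 s, steps_per_control evaluates to 20 robot control updates per
# environment step.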
class TestRobotEquivalentInterface(unittest.TestCase):
    """
    Test the methods of SimFinger that provide an interface equivalent to
    the RobotFrontend of robot_interfaces.
    """

    def setUp(self):
        self.finger = SimFinger(
            finger_type="fingerone",
            time_step=0.001,
            enable_visualization=False,
        )

        self.start_position = [0, -0.7, -1.5]
        self.finger.reset_finger_positions_and_velocities(
            self.start_position)

    def tearDown(self):
        # destroy the simulation to ensure that the next test starts with
        # a clean state
        del self.finger

    def test_timing_action_t_vs_observation_t(self):
        """Verify that observation_t is really not affected by action_t."""
        # Apply a max torque action for one step
        action = self.finger.Action(torque=[
            -self.finger.max_motor_torque,
            -self.finger.max_motor_torque,
            -self.finger.max_motor_torque,
        ])
        t = self.finger.append_desired_action(action)
        obs = self.finger.get_observation(t)

        # as obs_t is from just before action_t is applied, the position
        # should not yet have changed
        np.testing.assert_array_equal(self.start_position, obs.position)

        # after applying another action (even with zero torque), we should
        # see the effect
        t = self.finger.append_desired_action(self.finger.Action())
        obs = self.finger.get_observation(t)
        # new position should be less, as negative torque is applied
        np.testing.assert_array_less(obs.position, self.start_position)

    def test_timing_action_t_vs_observation_tplus1(self):
        """Verify that observation_{t+1} is affected by action_t."""
        # Apply a max torque action for one step
        action = self.finger.Action(torque=[
            -self.finger.max_motor_torque,
            -self.finger.max_motor_torque,
            -self.finger.max_motor_torque,
        ])
        t = self.finger.append_desired_action(action)
        obs = self.finger.get_observation(t + 1)

        # new position should be less, as negative torque is applied
        np.testing.assert_array_less(obs.position, self.start_position)

    def test_timing_observation_t_vs_tplus1(self):
        # Apply a max torque action for one step
        action = self.finger.Action(torque=[
            -self.finger.max_motor_torque,
            -self.finger.max_motor_torque,
            -self.finger.max_motor_torque,
        ])
        t = self.finger.append_desired_action(action)
        obs_t = self.finger.get_observation(t)
        obs_tplus1 = self.finger.get_observation(t + 1)

        # the newer position should be smaller, as negative torque is
        # applied
        np.testing.assert_array_less(obs_tplus1.position, obs_t.position)

    def test_timing_observation_t_multiple_times(self):
        """
        Verify that calling get_observation(t) multiple times always gives
        the same result.
        """
        # Apply a max torque action for one step
        action = self.finger.Action(torque=[
            -self.finger.max_motor_torque,
            -self.finger.max_motor_torque,
            -self.finger.max_motor_torque,
        ])
        t = self.finger.append_desired_action(action)
        obs_t1 = self.finger.get_observation(t)

        # observation should not change when calling multiple times with
        # the same t
        for i in range(10):
            obs_ti = self.finger.get_observation(t)
            np.testing.assert_array_equal(obs_t1.position, obs_ti.position)
            np.testing.assert_array_equal(obs_t1.velocity, obs_ti.velocity)
            np.testing.assert_array_equal(obs_t1.torque, obs_ti.torque)
            np.testing.assert_array_equal(obs_t1.tip_force,
                                          obs_ti.tip_force)

    def test_timing_observation_tplus1_multiple_times(self):
        """
        Verify that calling get_observation(t + 1) multiple times always
        gives the same result.
        """
        # Apply a max torque action for one step
        action = self.finger.Action(torque=[
            -self.finger.max_motor_torque,
            -self.finger.max_motor_torque,
            -self.finger.max_motor_torque,
        ])
        t = self.finger.append_desired_action(action)
        obs_t1 = self.finger.get_observation(t + 1)

        # observation should not change when calling multiple times with
        # the same t
        for i in range(10):
            obs_ti = self.finger.get_observation(t + 1)
            np.testing.assert_array_equal(obs_t1.position, obs_ti.position)
            np.testing.assert_array_equal(obs_t1.velocity, obs_ti.velocity)
            np.testing.assert_array_equal(obs_t1.torque, obs_ti.torque)
            np.testing.assert_array_equal(obs_t1.tip_force,
                                          obs_ti.tip_force)

    def test_exception_on_old_t(self):
        """Verify that get_observation raises an error for invalid t."""
        # verify that t < 0 is not accepted
        with self.assertRaises(ValueError):
            self.finger.get_observation(-1)

        # append two actions
        t1 = self.finger.append_desired_action(self.finger.Action())
        t2 = self.finger.append_desired_action(self.finger.Action())

        # it should be possible to get observations for t2 and t2 + 1 but
        # not for t1 or t2 + 2
        self.finger.get_observation(t2)
        self.finger.get_observation(t2 + 1)
        with self.assertRaises(ValueError):
            self.finger.get_observation(t1)
        with self.assertRaises(ValueError):
            self.finger.get_observation(t2 + 2)

    def test_observation_types(self):
        """Verify that all fields of the observation are np.ndarrays."""
        # Apply a max torque action for one step
        action = self.finger.Action(torque=[
            -self.finger.max_motor_torque,
            -self.finger.max_motor_torque,
            -self.finger.max_motor_torque,
        ])
        t = self.finger.append_desired_action(action)
        obs = self.finger.get_observation(t)

        # verify types
        self.assertIsInstance(obs.torque, np.ndarray)
        self.assertIsInstance(obs.position, np.ndarray)
        self.assertIsInstance(obs.velocity, np.ndarray)
        self.assertIsInstance(obs.tip_force, np.ndarray)

    def test_get_desired_action(self):
        # verify that t < 0 is not accepted
        with self.assertRaises(ValueError):
            self.finger.get_desired_action(-1)

        orig_action = self.finger.Action(torque=[1.0, 2.0, 3.0],
                                         position=[0.0, -1.0, -2.0])
        t = self.finger.append_desired_action(orig_action)
        action = self.finger.get_desired_action(t)

        np.testing.assert_array_equal(orig_action.torque, action.torque)
        np.testing.assert_array_equal(orig_action.position, action.position)

        # verify types
        self.assertIsInstance(action.torque, np.ndarray)
        self.assertIsInstance(action.position, np.ndarray)
        self.assertIsInstance(action.position_kd, np.ndarray)
        self.assertIsInstance(action.position_kp, np.ndarray)

    def test_get_applied_action(self):
        # verify that t < 0 is not accepted
        with self.assertRaises(ValueError):
            self.finger.get_applied_action(-1)

        desired_action = self.finger.Action(torque=[5, 5, 5])
        t = self.finger.append_desired_action(desired_action)
        applied_action = self.finger.get_applied_action(t)

        np.testing.assert_array_almost_equal(
            applied_action.torque,
            [
                self.finger.max_motor_torque,
                self.finger.max_motor_torque,
                self.finger.max_motor_torque,
            ],
        )

        # verify types
        self.assertIsInstance(applied_action.torque, np.ndarray)
        self.assertIsInstance(applied_action.position, np.ndarray)
        self.assertIsInstance(applied_action.position_kd, np.ndarray)
        self.assertIsInstance(applied_action.position_kp, np.ndarray)

    def test_get_timestamp_ms_001(self):
        t = self.finger.append_desired_action(self.finger.Action())
        first_stamp = self.finger.get_timestamp_ms(t)
        t = self.finger.append_desired_action(self.finger.Action())
        second_stamp = self.finger.get_timestamp_ms(t)

        # time step is set to 0.001, so the difference between two steps
        # should be 1 ms
        self.assertEqual(second_stamp - first_stamp, 1)

    def test_get_timestamp_ms_tplus1_001(self):
        t = self.finger.append_desired_action(self.finger.Action())
        stamp_t = self.finger.get_timestamp_ms(t)
        stamp_tp1 = self.finger.get_timestamp_ms(t + 1)

        # time step is set to 0.001, so the difference between two steps
        # should be 1 ms
        self.assertEqual(stamp_tp1 - stamp_t, 1)

    def test_get_timestamp_ms_invalid_t(self):
        # verify that t < 0 is not accepted
        with self.assertRaises(ValueError):
            self.finger.get_timestamp_ms(-1)

        t = self.finger.append_desired_action(self.finger.Action())

        with self.assertRaises(ValueError):
            self.finger.get_timestamp_ms(t - 1)
        with self.assertRaises(ValueError):
            self.finger.get_timestamp_ms(t + 2)

    def test_get_current_timeindex(self):
        # no time index before the first action
        with self.assertRaises(ValueError):
            self.finger.get_current_timeindex()

        t = self.finger.append_desired_action(self.finger.Action())
        self.assertEqual(self.finger.get_current_timeindex(), t)
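# The timing convention verified by the tests above, summarized as a
# sketch (this mirrors the tests; `finger` is assumed to be a SimFinger
# instance and `action` an Action):
#
#     t = finger.append_desired_action(action)
#     obs_t = finger.get_observation(t)        # state *before* action_t
#                                              # takes effect
#     obs_tp1 = finger.get_observation(t + 1)  # state *after* action_t
#                                              # has been applied
#
# Only the current time index t and t + 1 are valid arguments; querying
# an older index, or anything beyond t + 1, raises a ValueError.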
class TriFingerReach(gym.Env):
    """
    A gym environment to enable training on either the single-finger or
    the tri-finger robots for the task of reaching.
    """

    def __init__(
        self,
        control_rate_s,
        finger_type,
        enable_visualization,
        smoothing_params,
        use_real_robot=False,
        finger_config_suffix="0",
        synchronize=False,
    ):
        """Initializes the constituents of the reaching environment.

        The constructor sets up the finger robot depending on the finger
        type and on whether an instance of the simulated or the real
        robot is to be created.  It also sets up the observation and
        action spaces, smoothing for reducing jitter on the robot, and
        provides a way to synchronize robots being trained independently.

        Args:
            control_rate_s (float): the rate (in seconds) at which the
                step method of the env will run.  The actual robot
                controller may run at a higher rate, so this is used to
                compute the number of robot control updates per
                environment step.
            finger_type (string): name of the finger type.  To get a list
                of the valid finger types, call
                :meth:`.finger_types_data.get_valid_finger_types`.
            enable_visualization (bool): whether the simulation env is to
                be visualized.
            smoothing_params (dict):
                num_episodes (int): the total number of episodes for
                    which the training is performed
                start_after (float): the fraction of episodes after which
                    the smoothing of applied actions to the motors should
                    start
                final_alpha (float): the smoothing coefficient that is
                    reached at the end of the smoothing
                stop_after (float): the fraction of total episodes by
                    which the final alpha is to be reached, after which
                    the same final alpha is used for smoothing in the
                    remainder of the episodes
                is_test (bool, optional): include this for testing
            use_real_robot (bool): whether training is to be performed on
                the real robot ([default] False)
            finger_config_suffix: pass this if only one of the three
                fingers is to be trained.  Valid choices are
                [0, 120, 240] ([default] 0).
            synchronize (bool): set this to True to train independently
                on three fingers in separate processes while keeping them
                synchronized ([default] False).
        """
        #: an instance of a simulated or a real robot, depending on what
        #: is desired
        if use_real_robot:
            from pybullet_fingers.real_finger import RealFinger

            self.finger = RealFinger(
                finger_type=finger_type,
                finger_config_suffix=finger_config_suffix,
                enable_visualization=enable_visualization,
            )
        else:
            self.finger = SimFinger(
                finger_type=finger_type,
                enable_visualization=enable_visualization,
            )

        self.num_fingers = finger_types_data.get_number_of_fingers(
            finger_type)

        #: the number of times the same action is to be applied to the
        #: robot in one step
        self.steps_per_control = int(
            round(control_rate_s / self.finger.time_step_s))
        assert (abs(control_rate_s
                    - self.steps_per_control * self.finger.time_step_s)
                <= 0.000001)

        #: the types of observations that should be a part of the
        #: environment's observed state
        self.observations_keys = [
            "joint_positions",
            "joint_velocities",
            "goal_position",
            "action_joint_positions",
        ]

        #: the size of each observation type in the observation keys
        #: (in the same order)
        self.observations_sizes = [
            3 * self.num_fingers,
            3 * self.num_fingers,
            3 * self.num_fingers,
            3 * self.num_fingers,
        ]

        # Set up the observation and action spaces for the environment.
        # The unscaled spaces have custom bounds for each observation or
        # action type, whereas all values in the scaled observation and
        # action spaces lie between -1 and 1.
        self.spaces = FingerSpaces(
            num_fingers=self.num_fingers,
            observations_keys=self.observations_keys,
            observations_sizes=self.observations_sizes,
            separate_goals=True,
        )

        self.unscaled_observation_space = (
            self.spaces.get_unscaled_observation_space())
        self.unscaled_action_space = self.spaces.get_unscaled_action_space()

        self.observation_space = self.spaces.get_scaled_observation_space()
        self.action_space = self.spaces.get_scaled_action_space()

        #: a logger to enable logging of observations if desired
        self.logger = DataLogger()

        # set up smoothing
        if "is_test" in smoothing_params:
            self.smoothing_start_episode = 0
            self.smoothing_alpha = smoothing_params["final_alpha"]
            self.smoothing_increase_step = 0
            self.smoothing_stop_episode = math.inf
        else:
            self.smoothing_stop_episode = int(
                smoothing_params["num_episodes"]
                * smoothing_params["stop_after"])
            self.smoothing_start_episode = int(
                smoothing_params["num_episodes"]
                * smoothing_params["start_after"])
            num_smoothing_increase_steps = (
                self.smoothing_stop_episode
                - self.smoothing_start_episode)
            self.smoothing_alpha = 0
            self.smoothing_increase_step = (
                smoothing_params["final_alpha"]
                / num_smoothing_increase_steps)

        self.smoothed_action = None
        self.episode_count = 0

        #: a marker to visualize the target goal position for the
        #: episode, to which the tip link(s) of the robot should reach
        self.enable_visualization = enable_visualization
        if self.enable_visualization:
            self.goal_marker = visual_objects.Marker(
                number_of_goals=self.num_fingers)

        # set up synchronization if it is enabled
        self.synchronize = synchronize
        if synchronize:
            # align episode starts with the next full minute (using a
            # timedelta avoids overflowing the minute field at xx:59)
            now = datetime.datetime.now()
            self.next_start_time = (
                now + datetime.timedelta(minutes=1)).replace(
                    second=0, microsecond=0)
        else:
            self.next_start_time = None

        self.seed()
        self.reset()

    def _compute_reward(self, observation, goal):
        """The reward function of the environment.

        Args:
            observation (list): the observation at the current step
            goal (list): the desired goal for the episode

        Returns:
            the reward, and the done signal
        """
        joint_positions = observation[
            self.spaces.key_to_index["joint_positions"]]
        end_effector_positions = (
            self.finger.pinocchio_utils.forward_kinematics(
                np.array(joint_positions)))

        # TODO is the matrix norm really always the same as the vector
        # norm on flattened matrices?
        distance_to_goal = utils.compute_distance(end_effector_positions,
                                                  goal)
        reward = -distance_to_goal
        done = False

        return reward * self.steps_per_control, done

    def _get_state(self, observation, action, log_observation=False):
        """Get the current observation from the env for the agent.

        Args:
            observation: the robot observation at the current step
            action (list): the action applied at the current step
            log_observation (bool): specify whether you want to log the
                observation

        Returns:
            observation (list): comprising of the observations
            corresponding to the key values in the observation_keys
        """
        tip_positions = self.finger.pinocchio_utils.forward_kinematics(
            observation.position)
        end_effector_position = np.concatenate(tip_positions)
        joint_positions = observation.position
        joint_velocities = observation.velocity
        flat_goals = np.concatenate(self.goal)
        end_effector_to_goal = list(
            np.subtract(flat_goals, end_effector_position))

        # Populate this observation dict, from which the observation types
        # that are finally used are selected depending on the keys used
        # for constructing the observation space of the environment.
        observation_dict = {}
        observation_dict["end_effector_position"] = end_effector_position
        observation_dict["joint_positions"] = joint_positions
        observation_dict["joint_velocities"] = joint_velocities
        observation_dict["end_effector_to_goal"] = end_effector_to_goal
        observation_dict["goal_position"] = flat_goals
        observation_dict["action_joint_positions"] = action

        if log_observation:
            self.logger.append(joint_positions, end_effector_position,
                               time.time())

        # Return only the observations corresponding to the keys that were
        # used for constructing the observation space.
        observation = [
            v for key in self.observations_keys
            for v in observation_dict[key]
        ]

        return observation

    def step(self, action):
        """The env step method.

        Args:
            action (list): the joint positions that have to be achieved

        Returns:
            the observation scaled to lie between [-1;1], the reward,
            the done signal, and info on if the agent was successful at
            the current step
        """
        # Unscale the action to the ranges of the action space of the
        # environment explicitly (as the prediction from the network lies
        # in the range [-1;1]).
        unscaled_action = utils.unscale(action, self.unscaled_action_space)

        # Smooth the action by taking a weighted average with the previous
        # action, where the weight (the smoothing_alpha) is gradually
        # increased at every episode reset (see the reset method for
        # details).
        if self.smoothed_action is None:
            # no previous action exists yet, so start from the current one
            self.smoothed_action = unscaled_action

        self.smoothed_action = (
            self.smoothing_alpha * self.smoothed_action
            + (1 - self.smoothing_alpha) * unscaled_action)

        # This is the control loop that sends the action for a number of
        # time steps, which depends on the actual control rate.
        finger_action = self.finger.Action(position=self.smoothed_action)
        state = None
        for _ in range(self.steps_per_control):
            t = self.finger.append_desired_action(finger_action)
            observation = self.finger.get_observation(t)
            # get the observation from the first iteration (when the
            # action is applied the first time)
            if state is None:
                state = self._get_state(observation, self.smoothed_action,
                                        True)
        if self.synchronize:
            self.observation = observation

        reward, done = self._compute_reward(state, self.goal)
        info = {"is_success": np.float32(done)}
        scaled_observation = utils.scale(state,
                                         self.unscaled_observation_space)
        return scaled_observation, reward, done, info

    def reset(self):
        """Episode reset.

        Returns:
            the observation (scaled to [-1;1]) from the env after the reset
        """
        # Synchronize episode starts with wall time (freeze the robot at
        # the current position before starting the sleep).
        if self.next_start_time:
            try:
                t = self.finger.append_desired_action(
                    self.finger.Action(position=self.observation.position))
                self.finger.get_observation(t)
            except Exception:
                # on the very first reset there is no observation yet, so
                # the freezing step is skipped
                pass

            utils.sleep_until(self.next_start_time)
            self.next_start_time += datetime.timedelta(seconds=4)

        # update the smoothing parameters
        self.update_smoothing()
        self.episode_count += 1
        self.smoothed_action = None

        # reset the finger to a random position
        action = sample.feasible_random_joint_positions_for_reaching(
            self.finger, self.spaces.action_bounds)
        observation = self.finger.reset_finger_positions_and_velocities(
            action)

        # generate a random goal for the next episode
        target_joint_config = np.asarray(
            sample.feasible_random_joint_positions_for_reaching(
                self.finger, self.spaces.action_bounds))
        self.goal = self.finger.pinocchio_utils.forward_kinematics(
            target_joint_config)

        if self.enable_visualization:
            self.goal_marker.set_state(self.goal)

        # log relevant information for replayability
        self.logger.new_episode(target_joint_config, self.goal)

        return utils.scale(
            self._get_state(observation, action=action),
            self.unscaled_observation_space,
        )

    def update_smoothing(self):
        """
        Update the smoothing coefficient with which the applied action
        is smoothed.
        """
        if (self.smoothing_start_episode <= self.episode_count
                < self.smoothing_stop_episode):
            self.smoothing_alpha += self.smoothing_increase_step

        print("episode: {}, smoothing: {}".format(self.episode_count,
                                                  self.smoothing_alpha))
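# Worked example of the smoothing schedule configured above (the numbers
# are illustrative, not defaults): with
#
#     smoothing_params = {
#         "num_episodes": 1000,
#         "start_after": 0.25,
#         "stop_after": 0.75,
#         "final_alpha": 0.8,
#     }
#
# smoothing_start_episode = 250 and smoothing_stop_episode = 750, so
# smoothing_alpha grows from 0 by 0.8 / 500 = 0.0016 per episode between
# episodes 250 and 750 and then stays at 0.8.  In step(), the action sent
# to the robot is
#
#     smoothed_action = alpha * previous_smoothed_action
#                       + (1 - alpha) * unscaled_action
#
# i.e. an exponential moving average whose inertia increases over the
# course of training.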