def train(self, dataset, num_iter, writer, validation_dataset=None):
  """Train on dataset for a specific number of iterations."""
  del validation_dataset

  for i in range(num_iter):
    obs, act, _ = dataset.random_sample()

    # Get heightmap from RGB-D images.
    configs = act['camera_config']
    colormap, heightmap = self.get_heightmap(obs, configs)

    # Get training labels from data sample.
    pose0, pose1 = act['params']['pose0'], act['params']['pose1']
    p0_position, p0_rotation = pose0[0], pose0[1]
    p0 = utils.position_to_pixel(p0_position, self.bounds, self.pixel_size)
    p0_theta = -np.float32(
        utils.get_rot_from_pybullet_quaternion(p0_rotation)[2])
    p1_position, p1_rotation = pose1[0], pose1[1]
    p1 = utils.position_to_pixel(p1_position, self.bounds, self.pixel_size)
    p1_theta = -np.float32(
        utils.get_rot_from_pybullet_quaternion(p1_rotation)[2])

    # Regress the place rotation relative to the pick rotation.
    p1_theta = p1_theta - p0_theta
    p0_theta = 0

    # Concatenate color with depth images (depth repeated to three channels).
    input_image = np.concatenate(
        (colormap, heightmap[..., None], heightmap[..., None],
         heightmap[..., None]), axis=2)

    # Do data augmentation (perturb rotation and translation).
    input_image, _, rounded_pixels, _ = utils.perturb(input_image, [p0, p1])
    p0, p1 = rounded_pixels

    # Compute training losses.
    loss0 = self.pick_model.train(input_image, p0, theta=0)
    loss1 = self.place_model.train(input_image, p1, theta=0)
    loss2 = self.match_model.train(input_image, p0, p1, theta=p1_theta)

    with writer.as_default():
      tf.summary.scalar('pick_loss', self.pick_model.metric.result(),
                        step=self.total_iter + i)
      tf.summary.scalar('place_loss', self.place_model.metric.result(),
                        step=self.total_iter + i)
      tf.summary.scalar('match_loss', self.match_model.metric.result(),
                        step=self.total_iter + i)
    print(f'Train Iter: {self.total_iter + i} '
          f'Loss: {loss0:.4f} {loss1:.4f} {loss2:.4f}')

  self.total_iter += num_iter
  self.save()
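# A minimal usage sketch (assumed, not from the original code): `dataset` is a
# ravens.Dataset, `writer` a TF2 summary writer, and `agent` stands for an
# instance of this class; the log path is illustrative.
#
#   writer = tf.summary.create_file_writer('logs/example')
#   agent.train(dataset, num_iter=1000, writer=writer)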
def extract_x_y_theta(self, object_info, t_worldaug_world=None,
                      preserve_theta=False):
  """Extract in-plane theta."""
  object_position = object_info[0]
  object_quat_xyzw = object_info[1]

  if t_worldaug_world is not None:
    object_quat_wxyz = (object_quat_xyzw[3], object_quat_xyzw[0],
                        object_quat_xyzw[1], object_quat_xyzw[2])
    t_world_object = transformations.quaternion_matrix(object_quat_wxyz)
    t_world_object[0:3, 3] = np.array(object_position)
    t_worldaug_object = t_worldaug_world @ t_world_object

    object_quat_wxyz = transformations.quaternion_from_matrix(
        t_worldaug_object)
    if not preserve_theta:
      object_quat_xyzw = (object_quat_wxyz[1], object_quat_wxyz[2],
                          object_quat_wxyz[3], object_quat_wxyz[0])
    object_position = t_worldaug_object[0:3, 3]

  object_xy = object_position[0:2]
  object_theta = -np.float32(
      utils.get_rot_from_pybullet_quaternion(object_quat_xyzw)
      [2]) / self.theta_scale
  return np.hstack((object_xy, object_theta)).astype(
      np.float32), object_position, object_quat_xyzw
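# Note on the quaternion reshuffling above (a sketch, not original code):
# PyBullet orders quaternions as (x, y, z, w), while the `transformations`
# module expects (w, x, y, z). For example, with the identity rotation:
#
#   quat_xyzw = (0., 0., 0., 1.)                         # PyBullet order
#   quat_wxyz = (quat_xyzw[3],) + tuple(quat_xyzw[:3])   # -> (1., 0., 0., 0.)
#   transformations.quaternion_matrix(quat_wxyz)         # == np.eye(4)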
def get_six_dof_object(self, object_info, t_worldaug_world=None):
  """Calculate the pose of a 6-DoF object."""
  object_position = object_info[0]
  object_quat_xyzw = object_info[1]

  if t_worldaug_world is not None:
    object_quat_wxyz = (object_quat_xyzw[3], object_quat_xyzw[0],
                        object_quat_xyzw[1], object_quat_xyzw[2])
    t_world_object = transformations.quaternion_matrix(object_quat_wxyz)
    t_world_object[0:3, 3] = np.array(object_position)
    t_worldaug_object = t_worldaug_world @ t_world_object

    object_quat_wxyz = transformations.quaternion_from_matrix(
        t_worldaug_object)
    object_quat_xyzw = (object_quat_wxyz[1], object_quat_wxyz[2],
                        object_quat_wxyz[3], object_quat_wxyz[0])
    object_position = t_worldaug_object[0:3, 3]

  euler = utils.get_rot_from_pybullet_quaternion(object_quat_xyzw)
  roll = euler[0]
  pitch = euler[1]
  theta = -euler[2]

  return np.asarray([object_position[0], object_position[1],
                     object_position[2], roll, pitch, theta])
def base2origin(self, pos):
  """Transform a point from the robot base frame to the world (origin) frame."""
  pos = pos.copy()
  theta = utils.get_rot_from_pybullet_quaternion(self.base_rot)[2]
  x = np.cos(theta) * pos[0] - np.sin(theta) * pos[1] + self.base_pos[0]
  y = np.sin(theta) * pos[0] + np.cos(theta) * pos[1] + self.base_pos[1]
  z = pos[2] + self.base_pos[2]
  return np.array([x, y, z])
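# A hypothetical inverse of `base2origin` (a sketch assuming `base_pos` and
# `base_rot` keep the same meaning; not part of the original class): remove
# the base translation, then rotate by -theta.
def origin2base_sketch(pos, base_pos, base_rot):
  theta = utils.get_rot_from_pybullet_quaternion(base_rot)[2]
  dx, dy = pos[0] - base_pos[0], pos[1] - base_pos[1]
  x = np.cos(theta) * dx + np.sin(theta) * dy
  y = -np.sin(theta) * dx + np.cos(theta) * dy
  z = pos[2] - base_pos[2]
  return np.array([x, y, z])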
def get_six_dof_act(self, transform_params, heightmap, pose0, pose1):
  """Adjust SE(3) poses via the in-plane SE(2) augmentation transform."""
  p1_position, p1_rotation = pose1[0], pose1[1]
  p0_position, p0_rotation = pose0[0], pose0[1]

  if transform_params is not None:
    t_world_center, t_world_centernew = utils.get_se3_from_image_transform(
        *transform_params, heightmap, self.bounds, self.pixel_size)
    t_worldnew_world = t_world_centernew @ np.linalg.inv(t_world_center)
  else:
    t_worldnew_world = np.eye(4)

  p1_quat_wxyz = (p1_rotation[3], p1_rotation[0], p1_rotation[1],
                  p1_rotation[2])
  t_world_p1 = transformations.quaternion_matrix(p1_quat_wxyz)
  t_world_p1[0:3, 3] = np.array(p1_position)
  t_worldnew_p1 = t_worldnew_world @ t_world_p1

  p0_quat_wxyz = (p0_rotation[3], p0_rotation[0], p0_rotation[1],
                  p0_rotation[2])
  t_world_p0 = transformations.quaternion_matrix(p0_quat_wxyz)
  t_world_p0[0:3, 3] = np.array(p0_position)
  t_worldnew_p0 = t_worldnew_world @ t_world_p0

  # PICK FRAME, using 0 rotation due to suction rotational symmetry.
  t_worldnew_p0theta0 = t_worldnew_p0 * 1.0  # Copy, then zero the rotation.
  t_worldnew_p0theta0[0:3, 0:3] = np.eye(3)

  # PLACE FRAME, adjusted for this 0 rotation on pick.
  t_p0_p0theta0 = np.linalg.inv(t_worldnew_p0) @ t_worldnew_p0theta0
  t_worldnew_p1theta0 = t_worldnew_p1 @ t_p0_p0theta0

  # Convert the above rotation to Euler angles.
  quatwxyz_worldnew_p1theta0 = transformations.quaternion_from_matrix(
      t_worldnew_p1theta0)
  q = quatwxyz_worldnew_p1theta0
  quatxyzw_worldnew_p1theta0 = (q[1], q[2], q[3], q[0])
  p1_rotation = quatxyzw_worldnew_p1theta0
  p1_euler = utils.get_rot_from_pybullet_quaternion(p1_rotation)

  roll_scaled = p1_euler[0] / self.theta_scale
  pitch_scaled = p1_euler[1] / self.theta_scale
  p1_theta_scaled = -p1_euler[2] / self.theta_scale

  x = p1_position[0]
  y = p1_position[1]
  z = p1_position[2]

  return np.array([x, y, p1_theta_scaled, roll_scaled, pitch_scaled, z])
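# How the PLACE adjustment above works (a summary of the existing math, no new
# behavior): with T_p0 = [R0 | t0] and its zero-rotation copy [I | t0], the
# relative transform t_p0_p0theta0 = inv(T_p0) @ T_p0theta0 reduces to the pure
# rotation [R0^T | 0]. Right-multiplying T_p1 = [R1 | t1] by it therefore gives
# [R1 @ R0^T | t1]: the place rotation expressed relative to the pick rotation,
# with the place position unchanged. This is the SE(3) analogue of the SE(2)
# label convention in `train`: p1_theta -= p0_theta; p0_theta = 0.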
def reset(self, env):
  self.num_steps = 1
  self.goal = {'places': {}, 'steps': []}

  # Generate randomly shaped box.
  box_size = self.random_size(0.05, 0.15, 0.05, 0.15, 0.01, 0.06)

  # Add corner.
  dimx = (box_size[0] / 2 - 0.025 + 0.0025, box_size[0] / 2 + 0.0025)
  dimy = (box_size[1] / 2 + 0.0025, box_size[1] / 2 - 0.025 + 0.0025)
  corner_template = 'assets/corner/corner-template.urdf'
  replace = {'DIMX': dimx, 'DIMY': dimy}
  corner_urdf = self.fill_template(corner_template, replace)
  corner_size = (box_size[0], box_size[1], 0)
  corner_pose = self.random_pose(env, corner_size)
  env.add_object(corner_urdf, corner_pose, fixed=True)
  os.remove(corner_urdf)

  # Add possible placing poses.
  theta = utils.get_rot_from_pybullet_quaternion(corner_pose[1])[2]
  flipped_rotation = utils.get_pybullet_quaternion_from_rot(
      (0, 0, theta + np.pi))
  flipped_pose = (corner_pose[0], flipped_rotation)
  alt_x = (box_size[0] / 2) - (box_size[1] / 2)
  alt_y = (box_size[1] / 2) - (box_size[0] / 2)
  alt_position = (alt_x, alt_y, 0)
  alt_rotation0 = utils.get_pybullet_quaternion_from_rot((0, 0, np.pi / 2))
  alt_rotation1 = utils.get_pybullet_quaternion_from_rot((0, 0, 3 * np.pi / 2))
  alt_pose0 = utils.multiply(corner_pose, (alt_position, alt_rotation0))
  alt_pose1 = utils.multiply(corner_pose, (alt_position, alt_rotation1))
  self.goal['places'] = {
      0: corner_pose,
      1: flipped_pose,
      2: alt_pose0,
      3: alt_pose1
  }

  # Add box.
  box_template = 'assets/box/box-template.urdf'
  box_urdf = self.fill_template(box_template, {'DIM': box_size})
  box_pose = self.random_pose(env, box_size)
  box_id = env.add_object(box_urdf, box_pose)
  os.remove(box_urdf)
  self.color_random_brown(box_id)
  self.goal['steps'].append({box_id: (2 * np.pi, [0, 1, 2, 3])})
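# Structure of the goal produced above (a summary, not new behavior):
# `goal['places']` maps place-pose IDs {0, 1, 2, 3} to candidate box poses
# (the corner pose, its pi-flipped twin, and two alternates offset and rotated
# by pi/2 and 3*pi/2 relative to the corner), while `goal['steps']` holds a
# single {box_id: (rotational_symmetry, valid_place_ids)} entry.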
def act(self, obs, info, compute_error=False, gt_act=None):
  """Run inference and return best action given visual observations."""
  act = {'camera_config': self.camera_config, 'primitive': None}
  if not obs:
    return act

  # Get heightmap from RGB-D images.
  colormap, heightmap = self.get_heightmap(obs, self.camera_config)

  # Concatenate color with depth images (depth repeated to three channels).
  input_image = np.concatenate(
      (colormap, heightmap[..., None], heightmap[..., None],
       heightmap[..., None]), axis=2)

  # Attention model forward pass.
  attention = self.attention_model.forward(input_image)
  argmax = np.argmax(attention)
  argmax = np.unravel_index(argmax, shape=attention.shape)
  p0_pixel = argmax[:2]
  p0_theta = argmax[2] * (2 * np.pi / attention.shape[2])

  # Transport model forward pass.
  transport = self.transport_model.forward(input_image, p0_pixel)
  _, z, roll, pitch = self.rpz_model.forward(input_image, p0_pixel)
  argmax = np.argmax(transport)
  argmax = np.unravel_index(argmax, shape=transport.shape)

  # Index into the 3D discrete tensor to grab z, roll, pitch activations.
  z_best = z[:, argmax[0], argmax[1], argmax[2]][..., None]
  roll_best = roll[:, argmax[0], argmax[1], argmax[2]][..., None]
  pitch_best = pitch[:, argmax[0], argmax[1], argmax[2]][..., None]

  # Send through regressors for each of z, roll, pitch.
  z_best = self.rpz_model.z_regressor(z_best)[0, 0]
  roll_best = self.rpz_model.roll_regressor(roll_best)[0, 0]
  pitch_best = self.rpz_model.pitch_regressor(pitch_best)[0, 0]

  p1_pixel = argmax[:2]
  p1_theta = argmax[2] * (2 * np.pi / transport.shape[2])

  # Pixels to end effector poses.
  p0_position = utils.pixel_to_position(p0_pixel, heightmap, self.bounds,
                                        self.pixel_size)
  p1_position = utils.pixel_to_position(p1_pixel, heightmap, self.bounds,
                                        self.pixel_size)
  p1_position = (p1_position[0], p1_position[1], z_best)
  p0_rotation = utils.get_pybullet_quaternion_from_rot((0, 0, -p0_theta))
  p1_rotation = utils.get_pybullet_quaternion_from_rot(
      (roll_best, pitch_best, -p1_theta))

  if compute_error:
    gt_p0_position, gt_p0_rotation = gt_act['params']['pose0']
    gt_p1_position, gt_p1_rotation = gt_act['params']['pose1']
    gt_p0_pixel = np.array(
        utils.position_to_pixel(gt_p0_position, self.bounds, self.pixel_size))
    gt_p1_pixel = np.array(
        utils.position_to_pixel(gt_p1_position, self.bounds, self.pixel_size))

    self.p0_pixel_error(np.linalg.norm(gt_p0_pixel - np.array(p0_pixel)))
    self.p1_pixel_error(np.linalg.norm(gt_p1_pixel - np.array(p1_pixel)))

    gt_p0_theta = -np.float32(
        utils.get_rot_from_pybullet_quaternion(gt_p0_rotation)[2])
    gt_p1_theta = -np.float32(
        utils.get_rot_from_pybullet_quaternion(gt_p1_rotation)[2])

    self.p0_theta_error(
        abs((np.rad2deg(gt_p0_theta - p0_theta) + 180) % 360 - 180))
    self.p1_theta_error(
        abs((np.rad2deg(gt_p1_theta - p1_theta) + 180) % 360 - 180))
    return None

  return self.p0_p1_position_rotations_to_act(act, p0_position, p0_rotation,
                                              p1_position, p1_rotation)
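# Pattern used twice above (an illustrative, standalone sketch): the flat
# argmax over a 3D score map is converted back into (row, col, rotation-bin)
# indices; the rotation bin then maps to an angle in [0, 2*pi).
def _argmax_3d_sketch(scores):
  """Return the (i, j, k) index of the maximum entry of a 3D score map."""
  flat_idx = np.argmax(scores)
  return np.unravel_index(flat_idx, shape=scores.shape)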
def get_data_batch(self, dataset, augment=True):
  """Use dataset to extract and preprocess data.

  Supports adding a goal image, in which case the current and goal images
  are stacked together channel-wise (first 6 channels for current, last 6
  for goal) before data augmentation, to ensure consistency.

  Args:
    dataset: a ravens.Dataset (train or validation).
    augment: if True, perform data augmentation.

  Returns:
    A tuple of data for training: (input_image, p0, p0_theta, p1, p1_theta).
    The tuple additionally includes (z, roll, pitch) if self.six_dof.
    If self.use_goal_image, then the goal image is stacked with the current
    image in `input_image`. If splitting up current and goal images is
    desired, it should be done outside this method.
  """
  if self.use_goal_image:
    obs, act, _, goal = dataset.random_sample(goal_images=True)
  else:
    obs, act, _ = dataset.random_sample()

  # Get heightmap from RGB-D images, including goal images if specified.
  configs = act['camera_config']
  colormap, heightmap = self.get_heightmap(obs, configs)
  if self.use_goal_image:
    colormap_g, heightmap_g = self.get_heightmap(goal, configs)

  # Get training labels from data sample.
  pose0, pose1 = act['params']['pose0'], act['params']['pose1']
  p0_position, p0_rotation = pose0[0], pose0[1]
  p0 = utils.position_to_pixel(p0_position, self.bounds, self.pixel_size)
  p0_theta = -np.float32(
      utils.get_rot_from_pybullet_quaternion(p0_rotation)[2])
  p1_position, p1_rotation = pose1[0], pose1[1]
  p1 = utils.position_to_pixel(p1_position, self.bounds, self.pixel_size)
  p1_theta = -np.float32(
      utils.get_rot_from_pybullet_quaternion(p1_rotation)[2])

  # Concatenate color with depth images.
  input_image = self.concatenate_c_h(colormap, heightmap)

  # If using goal image, stack _with_ input_image before data augmentation.
  if self.use_goal_image:
    goal_image = self.concatenate_c_h(colormap_g, heightmap_g)
    input_image = np.concatenate((input_image, goal_image), axis=2)
    assert input_image.shape[2] == 12, input_image.shape

  # Do data augmentation (perturb rotation and translation).
  if augment:
    input_image, _, rounded_pixels, transform_params = utils.perturb(
        input_image, [p0, p1])
    p0, p1 = rounded_pixels

  if self.six_dof:
    if not augment:
      transform_params = None
    p0_theta, p1_theta, z, roll, pitch = self.get_six_dof(
        transform_params, heightmap, pose0, pose1, augment=augment)
    return input_image, p0, p0_theta, p1, p1_theta, z, roll, pitch
  else:
    # If using a goal image, it is stacked with `input_image` and split later.
    p1_theta = p1_theta - p0_theta
    p0_theta = 0
    return input_image, p0, p0_theta, p1, p1_theta
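# Channel layout produced by `get_data_batch` (a summary inferred from the
# code above): `concatenate_c_h` presumably yields a 6-channel image (3 RGB
# channels plus the depth heightmap repeated three times, as in `train`);
# with `use_goal_image`, the goal's 6 channels are appended, giving the 12
# channels checked by the assert.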
def get_six_dof(self, transform_params, heightmap, pose0, pose1,
                augment=True):
  """Adjust SE(3) poses via the in-plane SE(2) augmentation transform."""
  debug_visualize = False

  p1_position, p1_rotation = pose1[0], pose1[1]
  p0_position, p0_rotation = pose0[0], pose0[1]

  if debug_visualize:
    self.vis = utils.create_visualizer()
    self.transport_model.vis = self.vis

  if augment:
    t_world_center, t_world_centernew = utils.get_se3_from_image_transform(
        *transform_params, heightmap, self.bounds, self.pixel_size)

    if debug_visualize:
      label = 't_world_center'
      utils.make_frame(self.vis, label, h=0.05, radius=0.0012, o=1.0)
      self.vis[label].set_transform(t_world_center)

      label = 't_world_centernew'
      utils.make_frame(self.vis, label, h=0.05, radius=0.0012, o=1.0)
      self.vis[label].set_transform(t_world_centernew)

    t_worldnew_world = t_world_centernew @ np.linalg.inv(t_world_center)
  else:
    t_worldnew_world = np.eye(4)

  p1_quat_wxyz = (p1_rotation[3], p1_rotation[0], p1_rotation[1],
                  p1_rotation[2])
  t_world_p1 = transformations.quaternion_matrix(p1_quat_wxyz)
  t_world_p1[0:3, 3] = np.array(p1_position)
  t_worldnew_p1 = t_worldnew_world @ t_world_p1

  p0_quat_wxyz = (p0_rotation[3], p0_rotation[0], p0_rotation[1],
                  p0_rotation[2])
  t_world_p0 = transformations.quaternion_matrix(p0_quat_wxyz)
  t_world_p0[0:3, 3] = np.array(p0_position)
  t_worldnew_p0 = t_worldnew_world @ t_world_p0

  if debug_visualize:
    label = 't_worldnew_p1'
    utils.make_frame(self.vis, label, h=0.05, radius=0.0012, o=1.0)
    self.vis[label].set_transform(t_worldnew_p1)

    label = 't_world_p1'
    utils.make_frame(self.vis, label, h=0.05, radius=0.0012, o=1.0)
    self.vis[label].set_transform(t_world_p1)

    label = 't_worldnew_p0-0thetaoriginally'
    utils.make_frame(self.vis, label, h=0.05, radius=0.0021, o=1.0)
    self.vis[label].set_transform(t_worldnew_p0)

  # PICK FRAME, using 0 rotation due to suction rotational symmetry.
  t_worldnew_p0theta0 = t_worldnew_p0 * 1.0
  t_worldnew_p0theta0[0:3, 0:3] = np.eye(3)

  if debug_visualize:
    label = 'PICK'
    utils.make_frame(self.vis, label, h=0.05, radius=0.0021, o=1.0)
    self.vis[label].set_transform(t_worldnew_p0theta0)

  # PLACE FRAME, adjusted for this 0 rotation on pick.
  t_p0_p0theta0 = np.linalg.inv(t_worldnew_p0) @ t_worldnew_p0theta0
  t_worldnew_p1theta0 = t_worldnew_p1 @ t_p0_p0theta0

  if debug_visualize:
    label = 'PLACE'
    utils.make_frame(self.vis, label, h=0.05, radius=0.0021, o=1.0)
    self.vis[label].set_transform(t_worldnew_p1theta0)

  # Convert the above rotation to Euler angles.
  quatwxyz_worldnew_p1theta0 = transformations.quaternion_from_matrix(
      t_worldnew_p1theta0)
  q = quatwxyz_worldnew_p1theta0
  quatxyzw_worldnew_p1theta0 = (q[1], q[2], q[3], q[0])
  p1_rotation = quatxyzw_worldnew_p1theta0
  p1_euler = utils.get_rot_from_pybullet_quaternion(p1_rotation)

  roll = p1_euler[0]
  pitch = p1_euler[1]
  p1_theta = -p1_euler[2]
  p0_theta = 0
  z = p1_position[2]
  return p0_theta, p1_theta, z, roll, pitch
def extract_x_y_theta(self, object_info, t_worldaug_world=None,
                      preserve_theta=False, softbody=False):
  """Given either object OR action pose info, return stuff to put in GT observation.

  Note: only called from within this class, and in the 2-step case via
  subclassing.

  During training, there is normally data augmentation applied, so
  t_worldaug_world is NOT None, and augmentation is applied as if the
  'image' were adjusted. However, for actions, we preserve theta so
  `object_quat_xyzw` does not change, and we query index=2 for the z
  rotation. For most deformables tasks, we did not use rotations, hence
  theta=0.

  Get t_world_object, which is a 4x4 homogeneous matrix, then stick the
  position there (why?). Then we do the matrix multiplication with '@' to
  get the homogeneous matrix representation of the object after
  augmentation is applied.
  https://stackoverflow.com/questions/34142485/difference-between-numpy-dot-and-python-3-5-matrix-multiplication
  Then the position is extracted from the resulting homogeneous matrix.
  (And we ignore the z coordinate for that, but in fact for cloth vertices
  I would keep it.)

  Augmentation for cloth vertices? For now I'm following what they do for
  positions only (ignoring orientation), and returning updated (x, y, z)
  where the last value is the z (height) and not a rotation (which doesn't
  make sense for vertices anyway). Augmentations are all in SE(2), so the z
  coordinates should not change. Indeed, that's what I see after extracting
  positions from `t_worldaug_object`. And yes, with vertices, they are
  flattened in the same way regardless of whether data augmentation is
  used, so every third value represents a vertex height. So we get
  [v1_x, v1_y, v1_z, v2_x, v2_y, v2_z, ...] in the flattened version.

  Args:
    t_worldaug_world: Not None only if there's an augmentation and we need
      to change the ground truth pose as if we applied the augmentation.
      This only happens during (a) determining means/stds, and (b)
      training. During initialization and the action (loading) we do not
      use this.
    preserve_theta: Only True when `object_info` is an action pose.
    softbody: Only True when `object_info` is a vertex mesh of a PyBullet
      soft body.

  Returns:
    object_x_y_theta, pos, quat: the first is an SE(2) pose parameterized
    by three numbers, the xy position and a scalar rotation `theta`. In the
    cloth case, nothing is returned after the flattened vertices (no pos or
    quat), so that using the older API for cloth (which we should not
    be!) crashes the code.
  """
  # ------------------------------------------------------------------- #
  # Daniel: sanity checks to make sure we're handling the soft body ID
  # correctly. The last `if` on `softbody` helps to distinguish vertex
  # info from actions, which are also length 2.
  # ------------------------------------------------------------------- #
  if (self.task in TASKS_SOFT) and (len(object_info) == 2) and softbody:
    nb_vertices, vert_pos_l = object_info
    assert nb_vertices == 100, \
        f'We should be using 100 vertices but have: {nb_vertices}'
    assert nb_vertices == len(vert_pos_l), \
        f'{nb_vertices}, {len(vert_pos_l)}'
    vert_pos_np = np.array(vert_pos_l)  # (100, 3)

    if t_worldaug_world is not None:
      augmented_vertices = []
      # For each vertex, apply data augmentation.
      for i in range(vert_pos_np.shape[0]):
        vertex_position = vert_pos_np[i, :]
        # Use identity quaternion (w=1, others 0). Otherwise, follow
        # normal augmentation.
        t_world_object = transformations.quaternion_matrix((1, 0, 0, 0))
        t_world_object[0:3, 3] = np.array(vertex_position)
        t_worldaug_object = t_worldaug_world @ t_world_object
        new_v_position = t_worldaug_object[0:3, 3]
        augmented_vertices.append(new_v_position)
      # Stack everything.
      augmented_flattened = np.concatenate(
          augmented_vertices).flatten().astype(np.float32)
      return augmented_flattened
    else:
      vert_flattened = vert_pos_np.flatten().astype(np.float32)
      return vert_flattened

  # ------------------------------------------------------------------- #
  # Now proceed as usual, untouched from normal ravens code except for
  # documentation/assertions.
  object_position = object_info[0]
  object_quat_xyzw = object_info[1]

  if t_worldaug_world is not None:
    object_quat_wxyz = (object_quat_xyzw[3], object_quat_xyzw[0],
                        object_quat_xyzw[1], object_quat_xyzw[2])
    t_world_object = transformations.quaternion_matrix(object_quat_wxyz)
    t_world_object[0:3, 3] = np.array(object_position)

    # Daniel: pretty sure this has to be true. Upper-left 3x3 is rotation,
    # upper-right 3x1 is position.
    assert t_world_object.shape == (4, 4), \
        f'shape is not 4x4: {t_world_object.shape}'
    assert t_worldaug_world.shape == (4, 4), \
        f'shape is not 4x4: {t_worldaug_world.shape}'

    # Daniel: data augmentation. Then, extract `object_position` from the
    # augmented object.
    t_worldaug_object = t_worldaug_world @ t_world_object
    object_quat_wxyz = transformations.quaternion_from_matrix(
        t_worldaug_object)
    if not preserve_theta:
      object_quat_xyzw = (object_quat_wxyz[1], object_quat_wxyz[2],
                          object_quat_wxyz[3], object_quat_wxyz[0])
    object_position = t_worldaug_object[0:3, 3]

  object_xy = object_position[0:2]
  object_theta = -np.float32(
      utils.get_rot_from_pybullet_quaternion(object_quat_xyzw)
      [2]) / self.THETA_SCALE
  return np.hstack((object_xy, object_theta)).astype(
      np.float32), object_position, object_quat_xyzw
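# Usage sketch for the soft-body branch above (variable names here are
# illustrative): `object_info` for a cloth is (num_vertices, vertex_positions),
# and the return value is the flattened [v1_x, v1_y, v1_z, v2_x, ...] vector
# of length 3 * num_vertices.
#
#   feats = agent.extract_x_y_theta(cloth_info, softbody=True)
#   assert feats.shape == (300,)  # 100 vertices x 3 coordinates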