def get_sample(self, dataset, augment=True): """Get a dataset sample. Args: dataset: a ravens.Dataset (train or validation) augment: if True, perform data augmentation. Returns: tuple of data for training: (input_image, p0, p0_theta, p1, p1_theta) tuple additionally includes (z, roll, pitch) if self.six_dof if self.use_goal_image, then the goal image is stacked with the current image in `input_image`. If splitting up current and goal images is desired, it should be done outside this method. """ (obs, act, _, _), _ = dataset.sample() img = self.get_image(obs) # Get training labels from data sample. p0_xyz, p0_xyzw = act['pose0'] p1_xyz, p1_xyzw = act['pose1'] p0 = utils.xyz_to_pix(p0_xyz, self.bounds, self.pix_size) p0_theta = -np.float32(utils.quatXYZW_to_eulerXYZ(p0_xyzw)[2]) p1 = utils.xyz_to_pix(p1_xyz, self.bounds, self.pix_size) p1_theta = -np.float32(utils.quatXYZW_to_eulerXYZ(p1_xyzw)[2]) p1_theta = p1_theta - p0_theta p0_theta = 0 # Data augmentation. if augment: img, _, (p0, p1), _ = utils.perturb(img, [p0, p1]) return img, p0, p0_theta, p1, p1_theta
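# --- Illustrative sketch (not part of the original file) ---
# `utils.xyz_to_pix` above maps a world (x, y, z) position onto the
# top-down heightmap grid. A minimal version consistent with how
# `self.bounds` (a (3, 2) array of workspace limits in meters) and
# `self.pix_size` (meters per pixel) are used in this file might look
# like the following; treat the exact row/column convention as an
# assumption, not the library's definitive implementation.
def xyz_to_pix_sketch(position, bounds, pix_size):
  """Map a world position to integer (row, col) heightmap pixels."""
  u = int(np.round((position[1] - bounds[1, 0]) / pix_size))
  v = int(np.round((position[0] - bounds[0, 0]) / pix_size))
  return (u, v)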
def train(self, dataset, num_iter, writer, validation_dataset=None):
  """Train on dataset for a specific number of iterations."""
  del validation_dataset

  for i in range(num_iter):
    obs, act, _ = dataset.random_sample()

    # Get heightmap from RGB-D images.
    configs = act['camera_config']
    colormap, heightmap = self.get_heightmap(obs, configs)

    # Get training labels from data sample.
    pose0, pose1 = act['params']['pose0'], act['params']['pose1']
    p0_position, p0_rotation = pose0[0], pose0[1]
    p0 = utils.position_to_pixel(p0_position, self.bounds, self.pixel_size)
    p0_theta = -np.float32(
        utils.get_rot_from_pybullet_quaternion(p0_rotation)[2])
    p1_position, p1_rotation = pose1[0], pose1[1]
    p1 = utils.position_to_pixel(p1_position, self.bounds, self.pixel_size)
    p1_theta = -np.float32(
        utils.get_rot_from_pybullet_quaternion(p1_rotation)[2])
    p1_theta = p1_theta - p0_theta
    p0_theta = 0

    # Concatenate color with depth images (height channel repeated 3x).
    input_image = np.concatenate(
        (colormap,
         heightmap[..., None],
         heightmap[..., None],
         heightmap[..., None]),
        axis=2)

    # Do data augmentation (perturb rotation and translation).
    input_image, _, rounded_pixels, _ = utils.perturb(input_image, [p0, p1])
    p0, p1 = rounded_pixels

    # Compute training losses.
    loss0 = self.pick_model.train(input_image, p0, theta=0)
    loss1 = self.place_model.train(input_image, p1, theta=0)
    loss2 = self.match_model.train(input_image, p0, p1, theta=p1_theta)

    with writer.as_default():
      tf.summary.scalar('pick_loss', self.pick_model.metric.result(),
                        step=self.total_iter + i)
      tf.summary.scalar('place_loss', self.place_model.metric.result(),
                        step=self.total_iter + i)
      tf.summary.scalar('match_loss', self.match_model.metric.result(),
                        step=self.total_iter + i)

    print(f'Train Iter: {self.total_iter + i} '
          f'Loss: {loss0:.4f} {loss1:.4f} {loss2:.4f}')

  self.total_iter += num_iter
  self.save()
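# --- Illustrative sketch (not part of the original file) ---
# The inline 6-channel concatenation above reappears in later methods as
# `self.concatenate_c_h`. A helper with that behavior could be written as
# follows; the body is an assumption consistent with the inline code,
# which repeats the height channel, presumably so the single depth channel
# carries the same weight as the three color channels.
def concatenate_c_h_sketch(colormap, heightmap):
  """Stack RGB with the height channel repeated 3x -> (H, W, 6)."""
  return np.concatenate(
      (colormap,
       heightmap[..., None],
       heightmap[..., None],
       heightmap[..., None]),
      axis=2)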
def get_data_batch(self, dataset, augment=True):
  """Sample a batch of (observation, ground-truth action) pairs."""
  batch_obs = []
  batch_act = []

  for _ in range(self.batch_size):
    obs, act, _ = dataset.random_sample()

    # Get heightmap from RGB-D images.
    configs = act['camera_config']
    colormap, heightmap = self.get_heightmap(obs, configs)
    # self.show_images(colormap, heightmap)

    # Concatenate color with depth images (height channel repeated 3x).
    input_image = np.concatenate(
        (colormap,
         heightmap[..., None],
         heightmap[..., None],
         heightmap[..., None]),
        axis=2)
    # Or just use RGB: input_image = colormap

    # Apply augmentation.
    if augment:
      # Note: these pixels are placeholders, chosen only to keep the
      # perturb function happy; what we actually use below are the
      # transform parameters it returns.
      p0 = (160, 80)
      p1 = (160, 80)
      input_image, _, _, transform_params = utils.perturb(
          input_image, [p0, p1], set_theta_zero=False)
      t_world_center, t_world_centeraug = utils.get_se3_from_image_transform(
          *transform_params, heightmap, self.bounds, self.pixel_size)
      t_worldaug_world = t_world_centeraug @ np.linalg.inv(t_world_center)
    else:
      t_worldaug_world = np.eye(4)

    batch_obs.append(input_image)
    # This samples pick points from the object surface.
    batch_act.append(self.act_to_gt_act(act, t_worldaug_world))

    # Optional debug visualization:
    # import matplotlib.pyplot as plt
    # plt.imshow(input_image)
    # plt.scatter(p0[1], p0[0])
    # plt.scatter(p1[1], p1[0])
    # plt.show()

  batch_obs = np.array(batch_obs)
  batch_act = np.array(batch_act)
  return batch_obs, batch_act
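# --- Illustrative sketch (not part of the original file) ---
# The composition `t_worldaug_world = t_world_centeraug @ inv(t_world_center)`
# above re-expresses world-frame poses in the augmented world frame.
# Assuming poses are 4x4 homogeneous matrices built from a position vector
# and a 3x3 rotation, applying it to a ground-truth pose (roughly the job
# of `act_to_gt_act`) could look like this hypothetical helper:
def apply_aug_transform_sketch(t_worldaug_world, position, rotation):
  """Re-express a world-frame pose in the augmented world frame.

  Args:
    t_worldaug_world: 4x4 transform from world to augmented-world frame.
    position: (3,) object position in the world frame.
    rotation: 3x3 object rotation matrix in the world frame.

  Returns:
    (position, rotation) of the object in the augmented world frame.
  """
  t_world_obj = np.eye(4)
  t_world_obj[:3, :3] = rotation
  t_world_obj[:3, 3] = position
  t_worldaug_obj = t_worldaug_world @ t_world_obj
  return t_worldaug_obj[:3, 3], t_worldaug_obj[:3, :3]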
def get_sample(self, dataset, augment=True):
  """Get a dataset sample, including six-DoF (z, roll, pitch) labels."""
  (obs, act, _, _), _ = dataset.sample()
  img = self.get_image(obs)

  # Get training labels from data sample.
  p0_xyz, p0_xyzw = act['pose0']
  p1_xyz, p1_xyzw = act['pose1']
  p0 = utils.xyz_to_pix(p0_xyz, self.bounds, self.pix_size)
  p0_theta = -np.float32(utils.quatXYZW_to_eulerXYZ(p0_xyzw)[2])
  p1 = utils.xyz_to_pix(p1_xyz, self.bounds, self.pix_size)
  p1_theta = -np.float32(utils.quatXYZW_to_eulerXYZ(p1_xyzw)[2])
  p1_theta = p1_theta - p0_theta
  p0_theta = 0

  # Note: the six-DoF labels are only computed under augmentation, since
  # `get_six_dof` needs the perturb transforms; calling this method with
  # augment=False would leave (z, roll, pitch) undefined at the return.
  if augment:
    img, _, (p0, p1), transforms = utils.perturb(img, [p0, p1])
    p0_theta, p1_theta, z, roll, pitch = self.get_six_dof(
        transforms, img[:, :, 3], (p0_xyz, p0_xyzw), (p1_xyz, p1_xyzw))

  return img, p0, p0_theta, p1, p1_theta, z, roll, pitch
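# --- Illustrative note (not part of the original file) ---
# The call above passes `img[:, :, 3]` to `get_six_dof`, i.e. it assumes
# the fused image layout produced by `get_image`: channels 0..2 are RGB
# and channel 3 is the height channel (in this codebase the height
# channel is typically repeated across the remaining channels). Under
# that assumption, the height under the pick pixel can be read directly:
#   z_at_pick = img[p0[0], p0[1], 3]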
def get_data_batch(self, dataset, augment=True):
  """Use dataset to extract and preprocess data.

  Supports adding a goal image, in which case the current and goal
  images are stacked together channel-wise (first 6 for current, last
  6 for goal) before doing data augmentation, to ensure consistency.

  Args:
    dataset: a ravens.Dataset (train or validation)
    augment: if True, perform data augmentation.

  Returns:
    tuple of data for training:
      (input_image, p0, p0_theta, p1, p1_theta)
    tuple additionally includes (z, roll, pitch) if self.six_dof
    if self.use_goal_image, then the goal image is stacked with the
    current image in `input_image`. If splitting up current and goal
    images is desired, it should be done outside this method.
  """
  if self.use_goal_image:
    obs, act, _, goal = dataset.random_sample(goal_images=True)
  else:
    obs, act, _ = dataset.random_sample()

  # Get heightmap from RGB-D images, including goal images if specified.
  configs = act['camera_config']
  colormap, heightmap = self.get_heightmap(obs, configs)
  if self.use_goal_image:
    colormap_g, heightmap_g = self.get_heightmap(goal, configs)

  # Get training labels from data sample.
  pose0, pose1 = act['params']['pose0'], act['params']['pose1']
  p0_position, p0_rotation = pose0[0], pose0[1]
  p0 = utils.position_to_pixel(p0_position, self.bounds, self.pixel_size)
  p0_theta = -np.float32(
      utils.get_rot_from_pybullet_quaternion(p0_rotation)[2])
  p1_position, p1_rotation = pose1[0], pose1[1]
  p1 = utils.position_to_pixel(p1_position, self.bounds, self.pixel_size)
  p1_theta = -np.float32(
      utils.get_rot_from_pybullet_quaternion(p1_rotation)[2])

  # Concatenate color with depth images.
  input_image = self.concatenate_c_h(colormap, heightmap)

  # If using goal image, stack _with_ input_image before data augmentation.
  if self.use_goal_image:
    goal_image = self.concatenate_c_h(colormap_g, heightmap_g)
    input_image = np.concatenate((input_image, goal_image), axis=2)
    assert input_image.shape[2] == 12, input_image.shape

  # Do data augmentation (perturb rotation and translation).
  if augment:
    input_image, _, rounded_pixels, transform_params = utils.perturb(
        input_image, [p0, p1])
    p0, p1 = rounded_pixels

  if self.six_dof:
    if not augment:
      transform_params = None
    p0_theta, p1_theta, z, roll, pitch = self.get_six_dof(
        transform_params, heightmap, pose0, pose1, augment=augment)
    return input_image, p0, p0_theta, p1, p1_theta, z, roll, pitch
  else:
    # If using a goal image, it is stacked with `input_image` and
    # split up later, outside this method.
    p1_theta = p1_theta - p0_theta
    p0_theta = 0
    return input_image, p0, p0_theta, p1, p1_theta
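# --- Illustrative sketch (not part of the original file) ---
# The docstring above defers splitting current and goal images to the
# caller. Given the 12-channel stacking performed in this method (first
# 6 channels current, last 6 goal), the split is a channel slice, as the
# TransportGoal branch of train() below also does:
def split_current_and_goal_sketch(input_image):
  """Split a stacked (H, W, 12) image into current and goal halves."""
  half = input_image.shape[2] // 2  # 6 when a goal image is stacked.
  return input_image[:, :, :half], input_image[:, :, half:]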
def train(self, dataset, num_iter, writer):
  """Train on dataset for a specific number of iterations.

  Daniel: notice how little training data we use! One 'iteration' is
  simply one image and an associated action, drawn by (a) sampling a
  demo, then (b) sampling a time step within it. We do heavy data
  augmentation, but it's still just one real image.

  If using a goal image, we use a different random_sample method that
  also picks the LAST image of that episode, which is assigned as the
  goal image. This would likely not work for super long-horizon tasks,
  but maybe; (Agrawal et al., NeurIPS 2016) in the PokeBot paper
  actually got something like this 'greedy-style' planning to work.
  Otherwise we might have to do something like (Nair et al., ICRA 2017)
  in the follow-up work, where we feed in a target image for each time
  step, which would be the *next* image saved.

  For data augmentation with this goal image, I believe we should stack
  the current and goal images together, and THEN do augmentation. The
  perturb method will make sure placing pixels are preserved -- which
  for short-horizon environments usually means the goal image will
  contain most of the relevant information.

  When data augmenting, for both normal and goal-conditioned
  Transporters, the p1_theta (rotation) is the same, but pick points
  are correctly 'converted' to those appropriate for the augmented
  images.
  """
  for i in range(num_iter):
    if self.use_goal_image:
      obs, act, info, goal = dataset.random_sample(goal_images=True)
    else:
      obs, act, info = dataset.random_sample()

    # Get heightmap from RGB-D images.
    configs = act['camera_config']
    colormap, heightmap = self.get_heightmap(obs, configs)
    if self.use_goal_image:
      colormap_g, heightmap_g = self.get_heightmap(goal, configs)

    # Get training labels from data sample. (`p` is pybullet, imported
    # at module level as `import pybullet as p`.)
    pose0, pose1 = act['params']['pose0'], act['params']['pose1']
    p0_position, p0_rotation = pose0[0], pose0[1]
    p0 = utils.position_to_pixel(p0_position, self.bounds, self.pixel_size)
    p0_theta = -np.float32(p.getEulerFromQuaternion(p0_rotation)[2])
    p1_position, p1_rotation = pose1[0], pose1[1]
    p1 = utils.position_to_pixel(p1_position, self.bounds, self.pixel_size)
    p1_theta = -np.float32(p.getEulerFromQuaternion(p1_rotation)[2])
    p1_theta = p1_theta - p0_theta
    p0_theta = 0

    # Concatenate color with depth images.
    input_image = self.concatenate_c_h(colormap, heightmap)

    # If using goal image, stack _with_ input_image for data augmentation.
    if self.use_goal_image:
      goal_image = self.concatenate_c_h(colormap_g, heightmap_g)
      input_image = np.concatenate((input_image, goal_image), axis=2)
      assert input_image.shape[2] == 12, input_image.shape

    # Do data augmentation (perturb rotation and translation). Unpack
    # the rounded pixels, matching the four-value perturb signature used
    # elsewhere in this file (the original unpacked only two values here).
    original_pixels = (p0, p1)
    input_image, _, pixels, _ = utils.perturb(input_image, [p0, p1])
    p0, p1 = pixels

    # Optionally visualize images _after_ data augmentation (disabled).
    if False:  # pylint: disable=using-constant-test
      self.visualize_images(p0, p0_theta, p1, p1_theta, original_pixels,
                            colormap=colormap, heightmap=heightmap,
                            colormap_g=colormap_g, heightmap_g=heightmap_g,
                            input_image=input_image, before_aug=False)

    # Compute Attention training loss.
    if self.attn_no_targ and self.use_goal_image:
      maxdim = int(input_image.shape[2] / 2)
      input_only = input_image[:, :, :maxdim]
      loss0 = self.attention_model.train(input_only, p0, p0_theta)
    else:
      loss0 = self.attention_model.train(input_image, p0, p0_theta)
    with writer.as_default():
      tf.summary.scalar('attention_loss',
                        self.attention_model.metric.result(),
                        step=self.total_iter + i)

    # Compute Transport training loss.
    if isinstance(self.transport_model, Attention):
      loss1 = self.transport_model.train(input_image, p1, p1_theta)
    elif isinstance(self.transport_model, TransportGoal):
      half = int(input_image.shape[2] / 2)
      img_curr = input_image[:, :, :half]
      img_goal = input_image[:, :, half:]
      loss1 = self.transport_model.train(img_curr, img_goal, p0, p1,
                                         p1_theta)
    else:
      loss1 = self.transport_model.train(input_image, p0, p1, p1_theta)
    with writer.as_default():
      tf.summary.scalar('transport_loss',
                        self.transport_model.metric.result(),
                        step=self.total_iter + i)

    print(f'Train Iter: {self.total_iter + i} '
          f'Loss: {loss0:.4f} {loss1:.4f}')

  self.total_iter += num_iter
  self.save()
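# --- Illustrative usage sketch (not part of the original file) ---
# A hypothetical training driver for this agent. The names `agent`,
# `train_dataset`, and `log_dir` are placeholders, and the iteration
# counts are arbitrary; `tf.summary.create_file_writer` is the standard
# TF2 API for the `writer` argument used above.
#
# import tensorflow as tf
# writer = tf.summary.create_file_writer(log_dir)
# for _ in range(10):  # outer epochs (arbitrary)
#   agent.train(train_dataset, num_iter=1000, writer=writer)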