def get_mu_sigma(self, feat):
    # feat is B x C
    B, C = list(feat.shape)
    pred = self.mlp(feat)
    # pred = pred*0.1 # do not move so fast

    # mu is good to go
    # mu = pred[:,:3]
    mu = pred[:, :2]

    # sig needs some cleaning
    # sig_raw = pred[:,3:].reshape(B, 3, 3)
    sig_raw = pred[:, 2:].reshape(B, 2, 2)
    # sig_raw = sig_raw + 1e-6 * utils_geom.eye_3x3(B) # help it be diagonal
    sig_raw = sig_raw + 1e-8 * utils_geom.eye_2x2(B)  # help it be diagonal
    sig_raw = self.expm_softclip(sig_raw, L=5)

    # make it positive semi-definite
    # (r2p2 used a matrix exponential to do this step)
    sig = utils_basic.matmul2(sig_raw, sig_raw.permute(0, 2, 1))
    # sig = sig + 1e-4 * utils_geom.eye_3x3(B) # help it be diagonal
    # note: (-R) @ (-R)^T == R @ R^T, so matmul2(-sig_raw, -sig_raw^T) would
    # just reproduce sig; take the actual inverse instead
    sig_inv = sig.inverse()

    return mu, sig, sig_inv, sig_raw
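
# A minimal standalone sketch (plain torch, not part of the model) of the covariance
# construction above: any matrix R gives a symmetric positive semi-definite product
# R @ R^T, which is why the network only has to predict an unconstrained 2x2 block.
# The helper name below is hypothetical.
def _demo_cov_from_raw():
    import torch
    B = 4
    sig_raw = torch.randn(B, 2, 2)
    sig = torch.matmul(sig_raw, sig_raw.permute(0, 2, 1))
    # symmetric by construction
    assert torch.allclose(sig, sig.permute(0, 2, 1), atol=1e-6)
    # eigenvalues are non-negative (PSD)
    eigvals = torch.linalg.eigvalsh(sig)
    assert (eigvals > -1e-6).all()
    # with a small diagonal boost, the inverse is safe to take directly
    sig = sig + 1e-6 * torch.eye(2).unsqueeze(0)
    sig_inv = sig.inverse()
    return sig, sig_inv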

def get_mem_T_ref(B, Z, Y, X, bounds='default'):
    # sometimes we want the mat itself
    # note this is not a rigid transform
    # if not bounds=='default':
    #     xmin, xmax, ymin, ymax, zmin, zmax = bounds
    #     # these can be shaped B, or be scalars
    # else:
    #     xmin, xmax, ymin, ymax, zmin, zmax = XMIN, XMAX, YMIN, YMAX, ZMIN, ZMAX

    # for interpretability, let's construct this in two steps...

    # translation
    center_T_ref = utils_geom.eye_4x4(B)
    center_T_ref[:, 0, 3] = -hyp.XMIN
    center_T_ref[:, 1, 3] = -hyp.YMIN
    center_T_ref[:, 2, 3] = -hyp.ZMIN

    VOX_SIZE_X = (hyp.XMAX - hyp.XMIN) / float(X)
    VOX_SIZE_Y = (hyp.YMAX - hyp.YMIN) / float(Y)
    VOX_SIZE_Z = (hyp.ZMAX - hyp.ZMIN) / float(Z)

    # scaling
    mem_T_center = utils_geom.eye_4x4(B)
    mem_T_center[:, 0, 0] = 1. / VOX_SIZE_X
    mem_T_center[:, 1, 1] = 1. / VOX_SIZE_Y
    mem_T_center[:, 2, 2] = 1. / VOX_SIZE_Z

    mem_T_ref = utils_basic.matmul2(mem_T_center, center_T_ref)
    return mem_T_ref
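
# A minimal standalone check (assumed bound and grid values, plain torch) of the
# two-step mem_T_ref construction above: translate by -MIN, then scale by
# 1/voxel_size, so the metric bounds of the ref volume land exactly on voxel
# coordinates 0 and X (resp. Y, Z).
def _demo_mem_T_ref():
    import torch
    XMIN, XMAX, YMIN, YMAX, ZMIN, ZMAX = -8.0, 8.0, -2.0, 2.0, 0.0, 16.0  # assumed bounds
    X, Y, Z = 128, 32, 128
    # step 1: translate so the min corner sits at the origin
    center_T_ref = torch.eye(4)
    center_T_ref[0, 3], center_T_ref[1, 3], center_T_ref[2, 3] = -XMIN, -YMIN, -ZMIN
    # step 2: scale metric units into voxel units
    mem_T_center = torch.eye(4)
    mem_T_center[0, 0] = X / (XMAX - XMIN)
    mem_T_center[1, 1] = Y / (YMAX - YMIN)
    mem_T_center[2, 2] = Z / (ZMAX - ZMIN)
    mem_T_ref = mem_T_center @ center_T_ref
    # the min/max corners of the ref volume map to voxel coords (0,0,0) and (X,Y,Z)
    lo = mem_T_ref @ torch.tensor([XMIN, YMIN, ZMIN, 1.0])
    hi = mem_T_ref @ torch.tensor([XMAX, YMAX, ZMAX, 1.0])
    assert torch.allclose(lo, torch.tensor([0.0, 0.0, 0.0, 1.0]))
    assert torch.allclose(hi, torch.tensor([float(X), float(Y), float(Z), 1.0]))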

def convert_params_to_lrt(self, obj_len, obj_xyz_sce, obj_rot, cam_T_sce):
    # this borrows from utils_geom.convert_box_to_ref_T_obj
    B = list(obj_xyz_sce.shape)[0]
    obj_xyz_cam = utils_geom.apply_4x4(cam_T_sce, obj_xyz_sce.unsqueeze(1)).squeeze(1)
    # # compose with lengths
    # lrt = utils_geom.merge_lrt(obj_len, cam_T_obj)
    # return lrt

    rot0 = utils_geom.eye_3x3(B)
    # tra = torch.stack([x, y, z], axis=1)
    center_T_ref = utils_geom.merge_rt(rot0, -obj_xyz_cam)
    # center_T_ref is B x 4 x 4

    t0 = torch.zeros([B, 3])
    obj_T_center = utils_geom.merge_rt(obj_rot, t0)
    # this is B x 4 x 4

    # we want obj_T_ref:
    # first we translate to the center,
    # and then rotate around the origin
    obj_T_ref = utils_basic.matmul2(obj_T_center, center_T_ref)

    # return the inverse of this, so that we can transform obj corners into cam coords
    ref_T_obj = utils_geom.safe_inverse(obj_T_ref)
    # return ref_T_obj

    # compose with lengths
    lrt = utils_geom.merge_lrt(obj_len, ref_T_obj)
    return lrt
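
# A minimal sketch (assumed layout, plain torch) of what the lrt packing looks like:
# an "lrt" is a 19-dim vector per object, 3 box lengths followed by a flattened
# 4x4 ref_T_obj transform. Only the packing idea is shown here; the real
# utils_geom.merge_lrt may differ in details.
def _demo_pack_lrt():
    import torch
    B = 2
    obj_len = torch.rand(B, 3)               # lx, ly, lz
    ref_T_obj = torch.eye(4).repeat(B, 1, 1) # B x 4 x 4
    lrt = torch.cat([obj_len, ref_T_obj.reshape(B, 16)], dim=1)
    assert lrt.shape == (B, 19)
    return lrt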

def convert_box_to_ref_T_obj(box3D):
    # turn the box into obj_T_ref (i.e., obj_T_cam)
    B = list(box3D.shape)[0]

    # box3D is B x 9
    x, y, z, lx, ly, lz, rx, ry, rz = torch.unbind(box3D, axis=1)
    rot0 = eye_3x3(B)
    tra = torch.stack([x, y, z], axis=1)
    center_T_ref = merge_rt(rot0, -tra)
    # center_T_ref is B x 4 x 4

    t0 = torch.zeros([B, 3])
    rot = eul2rotm(rx, ry, rz)
    rot = torch.transpose(rot, 1, 2)  # other dir
    obj_T_center = merge_rt(rot, t0)
    # this is B x 4 x 4

    # we want obj_T_ref:
    # first we translate to the center,
    # and then rotate around the origin
    obj_T_ref = utils_basic.matmul2(obj_T_center, center_T_ref)

    # return the inverse of this, so that we can transform obj corners into cam coords
    ref_T_obj = obj_T_ref.inverse()
    return ref_T_obj
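
# A standalone sanity check (hypothetical helper, plain torch) of the logic above:
# obj_T_ref first subtracts the box center and then applies the transposed rotation,
# so the box center expressed in ref coords must land at the object-frame origin.
def _demo_box_center_to_origin():
    import math
    import torch
    x, y, z = 1.0, 2.0, 3.0
    ry = 0.3  # yaw only, for simplicity
    c, s = math.cos(ry), math.sin(ry)
    rot = torch.tensor([[c, 0.0, s],
                        [0.0, 1.0, 0.0],
                        [-s, 0.0, c]])
    center_T_ref = torch.eye(4)
    center_T_ref[:3, 3] = torch.tensor([-x, -y, -z])
    obj_T_center = torch.eye(4)
    obj_T_center[:3, :3] = rot.t()  # "other dir", as in the code above
    obj_T_ref = obj_T_center @ center_T_ref
    center_ref = torch.tensor([x, y, z, 1.0])
    assert torch.allclose(obj_T_ref @ center_ref,
                          torch.tensor([0.0, 0.0, 0.0, 1.0]), atol=1e-6)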

def apply_4x4_to_lrtlist(Y_T_X, lrtlist_X):
    B, N, D = list(lrtlist_X.shape)
    assert (D == 19)
    B2, E, F = list(Y_T_X.shape)
    assert (B2 == B)
    assert (E == 4 and F == 4)

    lenlist, rtlist_X = split_lrtlist(lrtlist_X)
    # rtlist_X is B x N x 4 x 4

    Y_T_Xs = Y_T_X.unsqueeze(1).repeat(1, N, 1, 1)
    Y_T_Xs_ = Y_T_Xs.view(B * N, 4, 4)
    rtlist_X_ = rtlist_X.view(B * N, 4, 4)
    rtlist_Y_ = utils_basic.matmul2(Y_T_Xs_, rtlist_X_)
    rtlist_Y = rtlist_Y_.view(B, N, 4, 4)
    lrtlist_Y = merge_lrtlist(lenlist, rtlist_Y)
    return lrtlist_Y
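
# A small sketch (plain torch, hypothetical shapes) of the batching trick used above:
# a single Y_T_X per batch element is repeated across the N objects so all the
# per-object rt matrices can be re-based with one batched matmul.
def _demo_rebase_rtlist():
    import torch
    B, N = 2, 5
    Y_T_X = torch.eye(4).repeat(B, 1, 1)
    Y_T_X[:, 0, 3] = 1.0  # shift everything by +1 in x
    rtlist_X = torch.eye(4).repeat(B, N, 1, 1)
    Y_T_Xs_ = Y_T_X.unsqueeze(1).repeat(1, N, 1, 1).view(B * N, 4, 4)
    rtlist_Y_ = torch.matmul(Y_T_Xs_, rtlist_X.view(B * N, 4, 4))
    rtlist_Y = rtlist_Y_.view(B, N, 4, 4)
    # every object's translation picked up the +1 shift
    assert torch.allclose(rtlist_Y[:, :, 0, 3], torch.ones(B, N))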

def get_mem_T_ref(B, Z, Y, X):
    # sometimes we want the mat itself
    # note this is not a rigid transform

    # for interpretability, let's construct this in two steps...

    # translation
    center_T_ref = utils_geom.eye_4x4(B)
    center_T_ref[:, 0, 3] = -XMIN
    center_T_ref[:, 1, 3] = -YMIN
    center_T_ref[:, 2, 3] = -ZMIN

    VOX_SIZE_X = (XMAX - XMIN) / float(X)
    VOX_SIZE_Y = (YMAX - YMIN) / float(Y)
    VOX_SIZE_Z = (ZMAX - ZMIN) / float(Z)

    # scaling
    mem_T_center = utils_geom.eye_4x4(B)
    mem_T_center[:, 0, 0] = 1. / VOX_SIZE_X
    mem_T_center[:, 1, 1] = 1. / VOX_SIZE_Y
    mem_T_center[:, 2, 2] = 1. / VOX_SIZE_Z

    mem_T_ref = utils_basic.matmul2(mem_T_center, center_T_ref)
    return mem_T_ref

def get_zoom_T_ref(lrt, Z, Y, X, additive_pad=0.0):
    # lrt is B x 19
    B, E = list(lrt.shape)
    assert (E == 19)
    lens, ref_T_obj = utils_geom.split_lrt(lrt)
    lx, ly, lz = lens.unbind(1)

    debug = False
    if debug:
        print('lx, ly, lz')
        print(lx)
        print(ly)
        print(lz)

    obj_T_ref = utils_geom.safe_inverse(ref_T_obj)
    # this is B x 4 x 4
    if debug:
        print('ok, got obj_T_ref:')
        print(obj_T_ref)

    # we want a tiny bit of padding
    # additive helps avoid nans with invalid objects
    # mult helps expand big objects
    lx = lx + additive_pad
    ly = ly + additive_pad * 0.5  # pad less in this dim, since it is usually pointless
    lz = lz + additive_pad
    # lx *= 1.1
    # ly *= 1.1
    # lz *= 1.1

    # translation
    center_T_obj_r = utils_geom.eye_3x3(B)
    center_T_obj_t = torch.stack([lx / 2., ly / 2., lz / 2.], dim=1)
    if debug:
        print('merging these:')
        print(center_T_obj_r.shape)
        print(center_T_obj_t.shape)
    center_T_obj = utils_geom.merge_rt(center_T_obj_r, center_T_obj_t)
    if debug:
        print('ok, got center_T_obj:')
        print(center_T_obj)

    # scaling
    Z_VOX_SIZE_X = (lx) / float(X)
    Z_VOX_SIZE_Y = (ly) / float(Y)
    Z_VOX_SIZE_Z = (lz) / float(Z)
    diag = torch.stack([
        1. / Z_VOX_SIZE_X,
        1. / Z_VOX_SIZE_Y,
        1. / Z_VOX_SIZE_Z,
        torch.ones([B], device=torch.device('cuda'))
    ], axis=1).view(B, 4)
    if debug:
        print('diag:')
        print(diag)
        print(diag.shape)
    zoom_T_center = torch.diag_embed(diag)
    if debug:
        print('ok, got zoom_T_center:')
        print(zoom_T_center)
        print(zoom_T_center.shape)

    # compose these
    zoom_T_obj = utils_basic.matmul2(zoom_T_center, center_T_obj)
    if debug:
        print('ok, got zoom_T_obj:')
        print(zoom_T_obj)
        print(zoom_T_obj.shape)

    zoom_T_ref = utils_basic.matmul2(zoom_T_obj, obj_T_ref)
    if debug:
        print('ok, got zoom_T_ref:')
        print(zoom_T_ref)
    return zoom_T_ref
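
# A minimal numeric check (assumed values, plain torch) of the zoom transform above:
# after translating by +l/2 and scaling by grid_size/l, an object-frame point at
# -l/2 lands on zoom coordinate 0 and a point at +l/2 lands on the far side of the grid.
def _demo_zoom_corners():
    import torch
    lx, ly, lz = 4.0, 2.0, 4.0
    X, Y, Z = 32, 16, 32
    center_T_obj = torch.eye(4)
    center_T_obj[:3, 3] = torch.tensor([lx / 2., ly / 2., lz / 2.])
    zoom_T_center = torch.diag(torch.tensor([X / lx, Y / ly, Z / lz, 1.0]))
    zoom_T_obj = zoom_T_center @ center_T_obj
    near = zoom_T_obj @ torch.tensor([-lx / 2., -ly / 2., -lz / 2., 1.0])
    far = zoom_T_obj @ torch.tensor([lx / 2., ly / 2., lz / 2., 1.0])
    assert torch.allclose(near, torch.tensor([0.0, 0.0, 0.0, 1.0]))
    assert torch.allclose(far, torch.tensor([float(X), float(Y), float(Z), 1.0]))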

def forward(self, feed):
    results = dict()

    if 'log_freq' not in feed.keys():
        feed['log_freq'] = None
    start_time = time.time()

    summ_writer = utils_improc.Summ_writer(writer=feed['writer'],
                                           global_step=feed['global_step'],
                                           set_name=feed['set_name'],
                                           log_freq=feed['log_freq'],
                                           fps=8)
    writer = feed['writer']
    global_step = feed['global_step']

    total_loss = torch.tensor(0.0).cuda()

    __p = lambda x: utils_basic.pack_seqdim(x, B)
    __u = lambda x: utils_basic.unpack_seqdim(x, B)
    __pb = lambda x: utils_basic.pack_boxdim(x, hyp.N)
    __ub = lambda x: utils_basic.unpack_boxdim(x, hyp.N)
    if hyp.aug_object_ent_dis:
        __pb_a = lambda x: utils_basic.pack_boxdim(
            x, hyp.max_obj_aug + hyp.max_obj_aug_dis)
        __ub_a = lambda x: utils_basic.unpack_boxdim(
            x, hyp.max_obj_aug + hyp.max_obj_aug_dis)
    else:
        __pb_a = lambda x: utils_basic.pack_boxdim(x, hyp.max_obj_aug)
        __ub_a = lambda x: utils_basic.unpack_boxdim(x, hyp.max_obj_aug)

    B, H, W, V, S, N = hyp.B, hyp.H, hyp.W, hyp.V, hyp.S, hyp.N
    PH, PW = hyp.PH, hyp.PW
    K = hyp.K
    BOX_SIZE = hyp.BOX_SIZE
    Z, Y, X = hyp.Z, hyp.Y, hyp.X
    Z2, Y2, X2 = int(Z / 2), int(Y / 2), int(X / 2)
    Z4, Y4, X4 = int(Z / 4), int(Y / 4), int(X / 4)
    D = 9

    tids = torch.from_numpy(np.reshape(np.arange(B * N), [B, N]))

    rgb_camXs = feed["rgb_camXs_raw"]
    pix_T_cams = feed["pix_T_cams_raw"]
    camRs_T_origin = feed["camR_T_origin_raw"]
    origin_T_camRs = __u(utils_geom.safe_inverse(__p(camRs_T_origin)))
    origin_T_camXs = feed["origin_T_camXs_raw"]

    camX0_T_camXs = utils_geom.get_camM_T_camXs(origin_T_camXs, ind=0)
    camRs_T_camXs = __u(
        torch.matmul(utils_geom.safe_inverse(__p(origin_T_camRs)),
                     __p(origin_T_camXs)))
    camXs_T_camRs = __u(utils_geom.safe_inverse(__p(camRs_T_camXs)))
    camX0_T_camRs = camXs_T_camRs[:, 0]
    camX1_T_camRs = camXs_T_camRs[:, 1]
    camR_T_camX0 = utils_geom.safe_inverse(camX0_T_camRs)

    xyz_camXs = feed["xyz_camXs_raw"]
    depth_camXs_, valid_camXs_ = utils_geom.create_depth_image(
        __p(pix_T_cams), __p(xyz_camXs), H, W)
    dense_xyz_camXs_ = utils_geom.depth2pointcloud(depth_camXs_, __p(pix_T_cams))

    xyz_camRs = __u(utils_geom.apply_4x4(__p(camRs_T_camXs), __p(xyz_camXs)))
    xyz_camX0s = __u(utils_geom.apply_4x4(__p(camX0_T_camXs), __p(xyz_camXs)))

    occXs = __u(utils_vox.voxelize_xyz(__p(xyz_camXs), Z, Y, X))
    occXs_to_Rs = utils_vox.apply_4x4s_to_voxs(camRs_T_camXs, occXs)
    occXs_to_Rs_45 = cross_corr.rotate_tensor_along_y_axis(occXs_to_Rs, 45)
    occXs_half = __u(utils_vox.voxelize_xyz(__p(xyz_camXs), Z2, Y2, X2))
    occRs_half = __u(utils_vox.voxelize_xyz(__p(xyz_camRs), Z2, Y2, X2))
    occX0s_half = __u(utils_vox.voxelize_xyz(__p(xyz_camX0s), Z2, Y2, X2))

    unpXs = __u(
        utils_vox.unproject_rgb_to_mem(__p(rgb_camXs), Z, Y, X, __p(pix_T_cams)))
    unpXs_half = __u(
        utils_vox.unproject_rgb_to_mem(__p(rgb_camXs), Z2, Y2, X2, __p(pix_T_cams)))
    unpX0s_half = __u(
        utils_vox.unproject_rgb_to_mem(
            __p(rgb_camXs), Z2, Y2, X2,
            utils_basic.matmul2(
                __p(pix_T_cams),
                utils_geom.safe_inverse(__p(camX0_T_camXs)))))
    unpRs = __u(
        utils_vox.unproject_rgb_to_mem(
            __p(rgb_camXs), Z, Y, X,
            utils_basic.matmul2(
                __p(pix_T_cams),
                utils_geom.safe_inverse(__p(camRs_T_camXs)))))
    unpRs_half = __u(
        utils_vox.unproject_rgb_to_mem(
            __p(rgb_camXs), Z2, Y2, X2,
            utils_basic.matmul2(
                __p(pix_T_cams),
                utils_geom.safe_inverse(__p(camRs_T_camXs)))))

    dense_xyz_camRs_ = utils_geom.apply_4x4(__p(camRs_T_camXs), dense_xyz_camXs_)
    inbound_camXs_ = utils_vox.get_inbounds(dense_xyz_camRs_, Z, Y, X).float()
    inbound_camXs_ = torch.reshape(inbound_camXs_, [B * S, 1, H, W])

    depth_camXs = __u(depth_camXs_)
    valid_camXs = __u(valid_camXs_) * __u(inbound_camXs_)

    summ_writer.summ_oneds('2D_inputs/depth_camXs',
                           torch.unbind(depth_camXs, dim=1), maxdepth=21.0)
    summ_writer.summ_oneds('2D_inputs/valid_camXs',
                           torch.unbind(valid_camXs, dim=1))
    summ_writer.summ_rgbs('2D_inputs/rgb_camXs', torch.unbind(rgb_camXs, dim=1))
    summ_writer.summ_occs('3D_inputs/occXs', torch.unbind(occXs, dim=1))
    summ_writer.summ_unps('3D_inputs/unpXs', torch.unbind(unpXs, dim=1),
                          torch.unbind(occXs, dim=1))
    occRs = __u(utils_vox.voxelize_xyz(__p(xyz_camRs), Z, Y, X))

    if hyp.do_eval_boxes:
        if hyp.dataset_name == "clevr_vqa":
            gt_boxes_origin_corners = feed['gt_box']
            gt_scores_origin = feed['gt_scores'].detach().cpu().numpy()
            classes = feed['classes']
            scores = gt_scores_origin
            tree_seq_filename = feed['tree_seq_filename']
            gt_boxes_origin = nlu.get_ends_of_corner(gt_boxes_origin_corners)
            gt_boxes_origin_end = torch.reshape(gt_boxes_origin, [hyp.B, hyp.N, 2, 3])
            gt_boxes_origin_theta = nlu.get_alignedboxes2thetaformat(gt_boxes_origin_end)
            gt_boxes_origin_corners = utils_geom.transform_boxes_to_corners(
                gt_boxes_origin_theta)
            gt_boxesR_corners = __ub(
                utils_geom.apply_4x4(camRs_T_origin[:, 0],
                                     __pb(gt_boxes_origin_corners)))
            gt_boxesR_theta = utils_geom.transform_corners_to_boxes(gt_boxesR_corners)
            gt_boxesR_end = nlu.get_ends_of_corner(gt_boxesR_corners)
        else:
            tree_seq_filename = feed['tree_seq_filename']
            tree_filenames = [
                join(hyp.root_dataset, i) for i in tree_seq_filename
                if i != "invalid_tree"
            ]
            invalid_tree_filenames = [
                join(hyp.root_dataset, i) for i in tree_seq_filename
                if i == "invalid_tree"
            ]
            num_empty = len(invalid_tree_filenames)
            trees = [pickle.load(open(i, "rb")) for i in tree_filenames]

            len_valid = len(trees)
            if len_valid > 0:
                gt_boxesR, scores, classes = nlu.trees_rearrange(trees)

            if num_empty > 0:
                gt_boxesR = np.concatenate([gt_boxesR, empty_gt_boxesR]) if len_valid > 0 else empty_gt_boxesR
                scores = np.concatenate([scores, empty_scores]) if len_valid > 0 else empty_scores
                classes = np.concatenate([classes, empty_classes]) if len_valid > 0 else empty_classes

            gt_boxesR = torch.from_numpy(gt_boxesR).cuda().float()  # torch.Size([2, 3, 6])
            gt_boxesR_end = torch.reshape(gt_boxesR, [hyp.B, hyp.N, 2, 3])
            gt_boxesR_theta = nlu.get_alignedboxes2thetaformat(gt_boxesR_end)  # torch.Size([2, 3, 9])
            gt_boxesR_corners = utils_geom.transform_boxes_to_corners(gt_boxesR_theta)

        class_names_ex_1 = "_".join(classes[0])
        summ_writer.summ_text('eval_boxes/class_names', class_names_ex_1)

        gt_boxesRMem_corners = __ub(utils_vox.Ref2Mem(__pb(gt_boxesR_corners), Z2, Y2, X2))
        gt_boxesRMem_end = nlu.get_ends_of_corner(gt_boxesRMem_corners)
        gt_boxesRMem_theta = utils_geom.transform_corners_to_boxes(gt_boxesRMem_corners)
        gt_boxesRUnp_corners = __ub(utils_vox.Ref2Mem(__pb(gt_boxesR_corners), Z, Y, X))
        gt_boxesRUnp_end = nlu.get_ends_of_corner(gt_boxesRUnp_corners)

        gt_boxesX0_corners = __ub(
            utils_geom.apply_4x4(camX0_T_camRs, __pb(gt_boxesR_corners)))
        gt_boxesX0Mem_corners = __ub(
            utils_vox.Ref2Mem(__pb(gt_boxesX0_corners), Z2, Y2, X2))
        gt_boxesX0Mem_theta = utils_geom.transform_corners_to_boxes(gt_boxesX0Mem_corners)
        gt_boxesX0Mem_end = nlu.get_ends_of_corner(gt_boxesX0Mem_corners)
        gt_boxesX0_end = nlu.get_ends_of_corner(gt_boxesX0_corners)
        gt_cornersX0_pix = __ub(
            utils_geom.apply_pix_T_cam(pix_T_cams[:, 0], __pb(gt_boxesX0_corners)))

        rgb_camX0 = rgb_camXs[:, 0]
        rgb_camX1 = rgb_camXs[:, 1]

        summ_writer.summ_box_by_corners('eval_boxes/gt_boxescamX0', rgb_camX0,
                                        gt_boxesX0_corners, torch.from_numpy(scores),
                                        tids, pix_T_cams[:, 0])
        unps_vis = utils_improc.get_unps_vis(unpX0s_half, occX0s_half)
        unp_vis = torch.mean(unps_vis, dim=1)
        unps_visRs = utils_improc.get_unps_vis(unpRs_half, occRs_half)
        unp_visRs = torch.mean(unps_visRs, dim=1)
        unps_visRs_full = utils_improc.get_unps_vis(unpRs, occRs)
        unp_visRs_full = torch.mean(unps_visRs_full, dim=1)
        summ_writer.summ_box_mem_on_unp('eval_boxes/gt_boxesR_mem', unp_visRs,
                                        gt_boxesRMem_end, scores, tids)

        unpX0s_half = torch.mean(unpX0s_half, dim=1)
        unpX0s_half = nlu.zero_out(unpX0s_half, gt_boxesX0Mem_end, scores)
        occX0s_half = torch.mean(occX0s_half, dim=1)
        occX0s_half = nlu.zero_out(occX0s_half, gt_boxesX0Mem_end, scores)
        summ_writer.summ_unp('3D_inputs/unpX0s', unpX0s_half, occX0s_half)

    if hyp.do_feat:
        featXs_input = torch.cat([occXs, occXs * unpXs], dim=2)
        featXs_input_ = __p(featXs_input)

        freeXs_ = utils_vox.get_freespace(__p(xyz_camXs), __p(occXs_half))
        freeXs = __u(freeXs_)
        visXs = torch.clamp(occXs_half + freeXs, 0.0, 1.0)

        mask_ = None
        if (type(mask_) != type(None)):
            assert (list(mask_.shape)[2:5] == list(featXs_input_.shape)[2:5])
        featXs_, feat_loss = self.featnet(featXs_input_, summ_writer,
                                          mask=__p(occXs))  # mask_)
        total_loss += feat_loss

        validXs = torch.ones_like(visXs)
        _validX00 = validXs[:, 0:1]
        _validX01 = utils_vox.apply_4x4s_to_voxs(camX0_T_camXs[:, 1:], validXs[:, 1:])
        validX0s = torch.cat([_validX00, _validX01], dim=1)
        validRs = utils_vox.apply_4x4s_to_voxs(camRs_T_camXs, validXs)
        visRs = utils_vox.apply_4x4s_to_voxs(camRs_T_camXs, visXs)

        featXs = __u(featXs_)
        _featX00 = featXs[:, 0:1]
        _featX01 = utils_vox.apply_4x4s_to_voxs(camX0_T_camXs[:, 1:], featXs[:, 1:])
        featX0s = torch.cat([_featX00, _featX01], dim=1)

        emb3D_e = torch.mean(featX0s[:, 1:], dim=1)
        vis3D_e_R = torch.max(visRs[:, 1:], dim=1)[0]
        emb3D_g = featX0s[:, 0]
        vis3D_g_R = visRs[:, 0]
        validR_combo = torch.min(validRs, dim=1).values

        summ_writer.summ_feats('3D_feats/featXs_input',
                               torch.unbind(featXs_input, dim=1), pca=True)
        summ_writer.summ_feats('3D_feats/featXs_output',
                               torch.unbind(featXs, dim=1),
                               valids=torch.unbind(validXs, dim=1), pca=True)
        summ_writer.summ_feats('3D_feats/featX0s_output',
                               torch.unbind(featX0s, dim=1),
                               valids=torch.unbind(torch.ones_like(validRs), dim=1),
                               pca=True)
        summ_writer.summ_feats('3D_feats/validRs',
                               torch.unbind(validRs, dim=1), pca=False)
        summ_writer.summ_feat('3D_feats/vis3D_e_R', vis3D_e_R, pca=False)
        summ_writer.summ_feat('3D_feats/vis3D_g_R', vis3D_g_R, pca=False)

    if hyp.do_munit:
        object_classes, filenames = nlu.create_object_classes(
            classes, [tree_seq_filename, tree_seq_filename], scores)
        if hyp.do_munit_fewshot:
            emb3D_e_R = utils_vox.apply_4x4_to_vox(camR_T_camX0, emb3D_e)
            emb3D_g_R = utils_vox.apply_4x4_to_vox(camR_T_camX0, emb3D_g)
            emb3D_R = emb3D_e_R
            emb3D_e_R_object, emb3D_g_R_object, validR_combo_object = nlu.create_object_tensors(
                [emb3D_e_R, emb3D_g_R], [validR_combo], gt_boxesRMem_end, scores,
                [BOX_SIZE, BOX_SIZE, BOX_SIZE])
            emb3D_R_object = (emb3D_e_R_object + emb3D_g_R_object) / 2
            content, style = self.munitnet.net.gen_a.encode(emb3D_R_object)
            objects_taken, _ = self.munitnet.net.gen_a.decode(content, style)
            styles = style
            contents = content
        elif hyp.do_3d_style_munit:
            emb3D_e_R = utils_vox.apply_4x4_to_vox(camR_T_camX0, emb3D_e)
            emb3D_g_R = utils_vox.apply_4x4_to_vox(camR_T_camX0, emb3D_g)
            emb3D_R = emb3D_e_R
            # st()
            emb3D_e_R_object, emb3D_g_R_object, validR_combo_object = nlu.create_object_tensors(
                [emb3D_e_R, emb3D_g_R], [validR_combo], gt_boxesRMem_end, scores,
                [BOX_SIZE, BOX_SIZE, BOX_SIZE])
            emb3D_R_object = (emb3D_e_R_object + emb3D_g_R_object) / 2

            camX1_T_R = camXs_T_camRs[:, 1]
            camX0_T_R = camXs_T_camRs[:, 0]
            assert hyp.B == 2
            assert emb3D_e_R_object.shape[0] == 2
            munit_loss, sudo_input_0, sudo_input_1, recon_input_0, recon_input_1, sudo_input_0_cycle, sudo_input_1_cycle, styles, contents, adin = self.munitnet(
                emb3D_R_object[0:1], emb3D_R_object[1:2])

            if hyp.store_content_style_range:
                if self.max_content == None:
                    self.max_content = torch.zeros_like(contents[0][0]).cuda() - 100000000
                if self.min_content == None:
                    self.min_content = torch.zeros_like(contents[0][0]).cuda() + 100000000
                if self.max_style == None:
                    self.max_style = torch.zeros_like(styles[0][0]).cuda() - 100000000
                if self.min_style == None:
                    self.min_style = torch.zeros_like(styles[0][0]).cuda() + 100000000
                self.max_content = torch.max(
                    torch.max(self.max_content, contents[0][0]), contents[1][0])
                self.min_content = torch.min(
                    torch.min(self.min_content, contents[0][0]), contents[1][0])
                self.max_style = torch.max(
                    torch.max(self.max_style, styles[0][0]), styles[1][0])
                self.min_style = torch.min(
                    torch.min(self.min_style, styles[0][0]), styles[1][0])

                data_to_save = {
                    'max_content': self.max_content.cpu().numpy(),
                    'min_content': self.min_content.cpu().numpy(),
                    'max_style': self.max_style.cpu().numpy(),
                    'min_style': self.min_style.cpu().numpy()
                }
                with open('content_style_range.p', 'wb') as f:
                    pickle.dump(data_to_save, f)
            elif hyp.is_contrastive_examples:
                if hyp.normalize_contrast:
                    content0 = (contents[0] - self.min_content) / (
                        self.max_content - self.min_content + 1e-5)
                    content1 = (contents[1] - self.min_content) / (
                        self.max_content - self.min_content + 1e-5)
                    style0 = (styles[0] - self.min_style) / (
                        self.max_style - self.min_style + 1e-5)
                    style1 = (styles[1] - self.min_style) / (
                        self.max_style - self.min_style + 1e-5)
                else:
                    content0 = contents[0]
                    content1 = contents[1]
                    style0 = styles[0]
                    style1 = styles[1]

                # euclid_dist_content = torch.sum(torch.sqrt((content0 - content1)**2))/torch.prod(torch.tensor(content0.shape))
                # euclid_dist_style = torch.sum(torch.sqrt((style0-style1)**2))/torch.prod(torch.tensor(style0.shape))
                euclid_dist_content = (content0 - content1).norm(2) / (content0.numel())
                euclid_dist_style = (style0 - style1).norm(2) / (style0.numel())

                content_0_pooled = torch.mean(
                    content0.reshape(list(content0.shape[:2]) + [-1]), dim=-1)
                content_1_pooled = torch.mean(
                    content1.reshape(list(content1.shape[:2]) + [-1]), dim=-1)
                euclid_dist_content_pooled = (content_0_pooled - content_1_pooled).norm(2) / (
                    content_0_pooled.numel())

                content_0_normalized = content0 / content0.norm()
                content_1_normalized = content1 / content1.norm()
                style_0_normalized = style0 / style0.norm()
                style_1_normalized = style1 / style1.norm()
                content_0_pooled_normalized = content_0_pooled / content_0_pooled.norm()
                content_1_pooled_normalized = content_1_pooled / content_1_pooled.norm()

                cosine_dist_content = torch.sum(content_0_normalized * content_1_normalized)
                cosine_dist_style = torch.sum(style_0_normalized * style_1_normalized)
                cosine_dist_content_pooled = torch.sum(
                    content_0_pooled_normalized * content_1_pooled_normalized)

                print("euclid dist [content, pooled-content, style]: ",
                      euclid_dist_content, euclid_dist_content_pooled, euclid_dist_style)
                print("cosine sim [content, pooled-content, style]: ",
                      cosine_dist_content, cosine_dist_content_pooled, cosine_dist_style)

            if hyp.run_few_shot_on_munit:
                if (global_step % 300) == 1 or (global_step % 300) == 0:
                    wrong = False
                    try:
                        precision_style = float(self.tp_style) / self.all_style
                        precision_content = float(self.tp_content) / self.all_content
                    except ZeroDivisionError:
                        wrong = True
                    if not wrong:
                        summ_writer.summ_scalar(
                            'precision/unsupervised_precision_style', precision_style)
                        summ_writer.summ_scalar(
                            'precision/unsupervised_precision_content', precision_content)
                    # st()
                    self.embed_list_style = defaultdict(lambda: [])
                    self.embed_list_content = defaultdict(lambda: [])
                    self.tp_style = 0
                    self.all_style = 0
                    self.tp_content = 0
                    self.all_content = 0
                    self.check = False
                elif not self.check and not nlu.check_fill_dict(
                        self.embed_list_content, self.embed_list_style):
                    print("Filling \n")
                    for index, class_val in enumerate(object_classes):
                        if hyp.dataset_name == "clevr_vqa":
                            class_val_content, class_val_style = class_val.split("/")
                        else:
                            class_val_content, class_val_style = [
                                class_val.split("/")[0],
                                class_val.split("/")[0]
                            ]
                        print(len(self.embed_list_style.keys()), "style class",
                              len(self.embed_list_content), "content class",
                              self.embed_list_content.keys())
                        if len(self.embed_list_style[class_val_style]) < hyp.few_shot_nums:
                            self.embed_list_style[class_val_style].append(
                                styles[index].squeeze())
                        if len(self.embed_list_content[class_val_content]) < hyp.few_shot_nums:
                            if hyp.avg_3d:
                                content_val = contents[index]
                                content_val = torch.mean(
                                    content_val.reshape([content_val.shape[1], -1]), dim=-1)
                                # st()
                                self.embed_list_content[class_val_content].append(content_val)
                            else:
                                self.embed_list_content[class_val_content].append(
                                    contents[index].reshape([-1]))
                else:
                    self.check = True
                    try:
                        print(float(self.tp_content) / self.all_content)
                        print(float(self.tp_style) / self.all_style)
                    except Exception as e:
                        pass
                    average = True
                    if average:
                        for key, val in self.embed_list_style.items():
                            if isinstance(val, type([])):
                                self.embed_list_style[key] = torch.mean(
                                    torch.stack(val, dim=0), dim=0)
                        for key, val in self.embed_list_content.items():
                            if isinstance(val, type([])):
                                self.embed_list_content[key] = torch.mean(
                                    torch.stack(val, dim=0), dim=0)
                    else:
                        for key, val in self.embed_list_style.items():
                            if isinstance(val, type([])):
                                self.embed_list_style[key] = torch.stack(val, dim=0)
                        for key, val in self.embed_list_content.items():
                            if isinstance(val, type([])):
                                self.embed_list_content[key] = torch.stack(val, dim=0)
                    for index, class_val in enumerate(object_classes):
                        class_val = class_val
                        if hyp.dataset_name == "clevr_vqa":
                            class_val_content, class_val_style = class_val.split("/")
                        else:
                            class_val_content, class_val_style = [
                                class_val.split("/")[0],
                                class_val.split("/")[0]
                            ]
                        style_val = styles[index].squeeze().unsqueeze(0)
                        if not average:
                            embed_list_val_style = torch.cat(
                                list(self.embed_list_style.values()), dim=0)
                            embed_list_key_style = list(
                                np.repeat(
                                    np.expand_dims(list(self.embed_list_style.keys()), 1),
                                    hyp.few_shot_nums, 1).reshape([-1]))
                        else:
                            embed_list_val_style = torch.stack(
                                list(self.embed_list_style.values()), dim=0)
                            embed_list_key_style = list(self.embed_list_style.keys())
                        embed_list_val_style = utils_basic.l2_normalize(
                            embed_list_val_style, dim=1).permute(1, 0)
                        style_val = utils_basic.l2_normalize(style_val, dim=1)
                        scores_styles = torch.matmul(style_val, embed_list_val_style)
                        index_key = torch.argmax(scores_styles, dim=1).squeeze()
                        selected_class_style = embed_list_key_style[index_key]
                        self.styles_prediction[class_val_style].append(selected_class_style)
                        if class_val_style == selected_class_style:
                            self.tp_style += 1
                        self.all_style += 1

                        if hyp.avg_3d:
                            content_val = contents[index]
                            content_val = torch.mean(
                                content_val.reshape([content_val.shape[1], -1]),
                                dim=-1).unsqueeze(0)
                        else:
                            content_val = contents[index].reshape([-1]).unsqueeze(0)
                        if not average:
                            embed_list_val_content = torch.cat(
                                list(self.embed_list_content.values()), dim=0)
                            embed_list_key_content = list(
                                np.repeat(
                                    np.expand_dims(list(self.embed_list_content.keys()), 1),
                                    hyp.few_shot_nums, 1).reshape([-1]))
                        else:
                            embed_list_val_content = torch.stack(
                                list(self.embed_list_content.values()), dim=0)
                            embed_list_key_content = list(self.embed_list_content.keys())
                        embed_list_val_content = utils_basic.l2_normalize(
                            embed_list_val_content, dim=1).permute(1, 0)
                        content_val = utils_basic.l2_normalize(content_val, dim=1)
                        scores_content = torch.matmul(content_val, embed_list_val_content)
                        index_key = torch.argmax(scores_content, dim=1).squeeze()
                        selected_class_content = embed_list_key_content[index_key]
                        self.content_prediction[class_val_content].append(selected_class_content)
                        if class_val_content == selected_class_content:
                            self.tp_content += 1
                        self.all_content += 1

            # st()
            munit_loss = hyp.munit_loss_weight * munit_loss

            recon_input_obj = torch.cat([recon_input_0, recon_input_1], dim=0)
            recon_emb3D_R = nlu.update_scene_with_objects(
                emb3D_R, recon_input_obj, gt_boxesRMem_end, scores)

            sudo_input_obj = torch.cat([sudo_input_0, sudo_input_1], dim=0)
            styled_emb3D_R = nlu.update_scene_with_objects(
                emb3D_R, sudo_input_obj, gt_boxesRMem_end, scores)

            styled_emb3D_e_X1 = utils_vox.apply_4x4_to_vox(camX1_T_R, styled_emb3D_R)
            styled_emb3D_e_X0 = utils_vox.apply_4x4_to_vox(camX0_T_R, styled_emb3D_R)

            emb3D_e_X1 = utils_vox.apply_4x4_to_vox(camX1_T_R, recon_emb3D_R)
            emb3D_e_X0 = utils_vox.apply_4x4_to_vox(camX0_T_R, recon_emb3D_R)
            emb3D_e_X1_og = utils_vox.apply_4x4_to_vox(camX1_T_R, emb3D_R)
            emb3D_e_X0_og = utils_vox.apply_4x4_to_vox(camX0_T_R, emb3D_R)

            emb3D_R_aug_diff = torch.abs(emb3D_R - recon_emb3D_R)

            summ_writer.summ_feat(f'aug_feat/og', emb3D_R)
            summ_writer.summ_feat(f'aug_feat/og_gen', recon_emb3D_R)
            summ_writer.summ_feat(f'aug_feat/og_aug_diff', emb3D_R_aug_diff)

            if hyp.cycle_style_view_loss:
                sudo_input_obj_cycle = torch.cat(
                    [sudo_input_0_cycle, sudo_input_1_cycle], dim=0)
                styled_emb3D_R_cycle = nlu.update_scene_with_objects(
                    emb3D_R, sudo_input_obj_cycle, gt_boxesRMem_end, scores)
                styled_emb3D_e_X0_cycle = utils_vox.apply_4x4_to_vox(
                    camX0_T_R, styled_emb3D_R_cycle)
                styled_emb3D_e_X1_cycle = utils_vox.apply_4x4_to_vox(
                    camX1_T_R, styled_emb3D_R_cycle)

        summ_writer.summ_scalar('munit_loss', munit_loss.cpu().item())
        total_loss += munit_loss

    if hyp.do_occ and hyp.occ_do_cheap:
        occX0_sup, freeX0_sup, _, freeXs = utils_vox.prep_occs_supervision(
            camX0_T_camXs, xyz_camXs, Z2, Y2, X2, agg=True)

        summ_writer.summ_occ('occ_sup/occ_sup', occX0_sup)
        summ_writer.summ_occ('occ_sup/free_sup', freeX0_sup)
        summ_writer.summ_occs('occ_sup/freeXs_sup', torch.unbind(freeXs, dim=1))
        summ_writer.summ_occs('occ_sup/occXs_sup', torch.unbind(occXs_half, dim=1))

        occ_loss, occX0s_pred_ = self.occnet(
            torch.mean(featX0s[:, 1:], dim=1), occX0_sup, freeX0_sup,
            torch.max(validX0s[:, 1:], dim=1)[0], summ_writer)
        occX0s_pred = __u(occX0s_pred_)
        total_loss += occ_loss

    if hyp.do_view:
        assert (hyp.do_feat)
        PH, PW = hyp.PH, hyp.PW
        sy = float(PH) / float(hyp.H)
        sx = float(PW) / float(hyp.W)
        assert (sx == 0.5)  # else we need a fancier downsampler
        assert (sy == 0.5)
        projpix_T_cams = __u(utils_geom.scale_intrinsics(__p(pix_T_cams), sx, sy))
        # st()
        if hyp.do_munit:
            feat_projX00 = utils_vox.apply_pixX_T_memR_to_voxR(
                projpix_T_cams[:, 0],
                camX0_T_camXs[:, 1],
                emb3D_e_X1,  # use feat1 to predict rgb0
                hyp.view_depth, PH, PW)
            feat_projX00_og = utils_vox.apply_pixX_T_memR_to_voxR(
                projpix_T_cams[:, 0],
                camX0_T_camXs[:, 1],
                emb3D_e_X1_og,  # use feat1 to predict rgb0
                hyp.view_depth, PH, PW)
            # only for checking the style
            styled_feat_projX00 = utils_vox.apply_pixX_T_memR_to_voxR(
                projpix_T_cams[:, 0],
                camX0_T_camXs[:, 1],
                styled_emb3D_e_X1,  # use feat1 to predict rgb0
                hyp.view_depth, PH, PW)
            if hyp.cycle_style_view_loss:
                styled_feat_projX00_cycle = utils_vox.apply_pixX_T_memR_to_voxR(
                    projpix_T_cams[:, 0],
                    camX0_T_camXs[:, 1],
                    styled_emb3D_e_X1_cycle,  # use feat1 to predict rgb0
                    hyp.view_depth, PH, PW)
        else:
            feat_projX00 = utils_vox.apply_pixX_T_memR_to_voxR(
                projpix_T_cams[:, 0],
                camX0_T_camXs[:, 1],
                featXs[:, 1],  # use feat1 to predict rgb0
                hyp.view_depth, PH, PW)

        rgb_X00 = utils_basic.downsample(rgb_camXs[:, 0], 2)
        rgb_X01 = utils_basic.downsample(rgb_camXs[:, 1], 2)
        valid_X00 = utils_basic.downsample(valid_camXs[:, 0], 2)

        view_loss, rgb_e, emb2D_e = self.viewnet(feat_projX00, rgb_X00, valid_X00,
                                                 summ_writer, "rgb")

        if hyp.do_munit:
            _, rgb_e, emb2D_e = self.viewnet(feat_projX00_og, rgb_X00, valid_X00,
                                             summ_writer, "rgb_og")
        if hyp.do_munit:
            styled_view_loss, styled_rgb_e, styled_emb2D_e = self.viewnet(
                styled_feat_projX00, rgb_X00, valid_X00, summ_writer, "recon_style")
            if hyp.cycle_style_view_loss:
                styled_view_loss_cycle, styled_rgb_e_cycle, styled_emb2D_e_cycle = self.viewnet(
                    styled_feat_projX00_cycle, rgb_X00, valid_X00, summ_writer,
                    "recon_style_cycle")

            rgb_input_1 = torch.cat([rgb_X01[1], rgb_X01[0], styled_rgb_e[0]], dim=2)
            rgb_input_2 = torch.cat([rgb_X01[0], rgb_X01[1], styled_rgb_e[1]], dim=2)
            complete_vis = torch.cat([rgb_input_1, rgb_input_2], dim=1)
            summ_writer.summ_rgb('munit/munit_recons_vis', complete_vis.unsqueeze(0))

        if not hyp.do_munit:
            total_loss += view_loss
        else:
            if hyp.basic_view_loss:
                total_loss += view_loss
            if hyp.style_view_loss:
                total_loss += styled_view_loss
            if hyp.cycle_style_view_loss:
                total_loss += styled_view_loss_cycle

    summ_writer.summ_scalar('loss', total_loss.cpu().item())

    if hyp.save_embed_tsne:
        for index, class_val in enumerate(object_classes):
            class_val_content, class_val_style = class_val.split("/")
            style_val = styles[index].squeeze().unsqueeze(0)
            self.cluster_pool.update(style_val, [class_val_style])
            print(self.cluster_pool.num)

        if self.cluster_pool.is_full():
            embeds, classes = self.cluster_pool.fetch()
            with open("offline_cluster" + '/%st.txt' % 'classes', 'w') as f:
                for index, embed in enumerate(classes):
                    class_val = classes[index]
                    f.write("%s\n" % class_val)
            f.close()
            with open("offline_cluster" + '/%st.txt' % 'embeddings', 'w') as f:
                for index, embed in enumerate(embeds):
                    # embed = utils_basic.l2_normalize(embed,dim=0)
                    print("writing {} embed".format(index))
                    embed_l_s = [str(i) for i in embed.tolist()]
                    embed_str = '\t'.join(embed_l_s)
                    f.write("%s\n" % embed_str)
            f.close()
            st()

    return total_loss, results
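
# A minimal standalone sketch (plain torch, hypothetical names and sizes) of the
# few-shot retrieval used inside forward() above: class prototype embeddings are
# L2-normalized, stacked into a D x num_classes matrix, and a query embedding is
# classified by cosine similarity via a single matmul + argmax.
def _demo_prototype_retrieval():
    import torch
    import torch.nn.functional as F
    prototypes = {'cube': torch.randn(8), 'sphere': torch.randn(8)}  # assumed classes
    keys = list(prototypes.keys())
    proto_mat = F.normalize(torch.stack(list(prototypes.values()), dim=0),
                            dim=1).permute(1, 0)  # D x C
    query = F.normalize(torch.randn(1, 8), dim=1)  # 1 x D
    scores = torch.matmul(query, proto_mat)        # 1 x C, cosine similarities
    predicted = keys[int(torch.argmax(scores, dim=1))]
    return predicted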

def get_gt_flow(obj_lrtlist_camRs,
                obj_scorelist,
                camRs_T_camXs,
                Z, Y, X,
                K=2,
                mod='',
                vis=True,
                summ_writer=None):
    # this constructs the flow field according to the given
    # box trajectories (obj_lrtlist_camRs), collected from a moving camR
    # (so they do not take egomotion into account),
    # and the egomotion (encoded in camRs_T_camXs).
    # so, we first generate the flow for all the objects,
    # then, in the background, put the ego flow

    N, B, S, D = list(obj_lrtlist_camRs.shape)
    assert (S == 2)  # as a flow util, this expects S=2

    flows = []
    masks = []
    for k in list(range(K)):
        obj_masklistR0 = utils_vox.assemble_padded_obj_masklist(
            obj_lrtlist_camRs[k, :, 0:1],
            obj_scorelist[k, :, 0:1],
            Z, Y, X,
            coeff=1.0)
        # this is B x 1(N) x 1(C) x Z x Y x X
        obj_mask0 = obj_masklistR0.squeeze(1)
        # this is B x 1 x Z x Y x X

        camR_T_cam0 = camRs_T_camXs[:, 0]
        camR_T_cam1 = camRs_T_camXs[:, 1]
        cam0_T_camR = utils_geom.safe_inverse(camR_T_cam0)
        cam1_T_camR = utils_geom.safe_inverse(camR_T_cam1)
        # camR0_T_camR1 = camR0_T_camRs[:,1]
        # camR1_T_camR0 = utils_geom.safe_inverse(camR0_T_camR1)
        # obj_masklistA1 = utils_vox.apply_4x4_to_vox(camR1_T_camR0, obj_masklistA0)
        # if vis and (summ_writer is not None):
        #     summ_writer.summ_occ('flow/obj%d_maskA0' % k, obj_masklistA0)
        #     summ_writer.summ_occ('flow/obj%d_maskA1' % k, obj_masklistA1)

        if vis and (summ_writer is not None):
            # summ_writer.summ_occ('flow/obj%d_mask0' % k, obj_mask0)
            summ_writer.summ_oned('flow/obj%d_mask0_%s' % (k, mod),
                                  torch.mean(obj_mask0, 3))

        _, ref_T_objs_list = utils_geom.split_lrtlist(obj_lrtlist_camRs[k])
        # this is B x S x 4 x 4
        ref_T_obj0 = ref_T_objs_list[:, 0]
        ref_T_obj1 = ref_T_objs_list[:, 1]
        obj0_T_ref = utils_geom.safe_inverse(ref_T_obj0)
        obj1_T_ref = utils_geom.safe_inverse(ref_T_obj1)
        # these are B x 4 x 4

        mem_T_ref = utils_vox.get_mem_T_ref(B, Z, Y, X)
        ref_T_mem = utils_vox.get_ref_T_mem(B, Z, Y, X)

        ref1_T_ref0 = utils_basic.matmul2(ref_T_obj1, obj0_T_ref)
        cam1_T_cam0 = utils_basic.matmul3(cam1_T_camR, ref1_T_ref0, camR_T_cam0)
        mem1_T_mem0 = utils_basic.matmul3(mem_T_ref, cam1_T_cam0, ref_T_mem)

        xyz_mem0 = utils_basic.gridcloud3D(B, Z, Y, X)
        xyz_mem1 = utils_geom.apply_4x4(mem1_T_mem0, xyz_mem0)

        xyz_mem0 = xyz_mem0.reshape(B, Z, Y, X, 3)
        xyz_mem1 = xyz_mem1.reshape(B, Z, Y, X, 3)

        # only use these displaced points within the obj mask
        # obj_mask03 = obj_mask0.view(B, Z, Y, X, 1).repeat(1, 1, 1, 1, 3)
        obj_mask0 = obj_mask0.view(B, Z, Y, X, 1)
        # xyz_mem1[(obj_mask03 < 1.0).bool()] = xyz_mem0
        # cond = (obj_mask03 < 1.0).float()
        cond = (obj_mask0 > 0.0).float()
        xyz_mem1 = cond * xyz_mem1 + (1.0 - cond) * xyz_mem0

        flow = xyz_mem1 - xyz_mem0
        flow = flow.permute(0, 4, 1, 2, 3)
        obj_mask0 = obj_mask0.permute(0, 4, 1, 2, 3)

        # if vis and k==0:
        if vis:
            summ_writer.summ_3D_flow('flow/gt_%d_%s' % (k, mod), flow, clip=4.0)

        masks.append(obj_mask0)
        flows.append(flow)

    # background: ego flow only
    camR_T_cam0 = camRs_T_camXs[:, 0]
    camR_T_cam1 = camRs_T_camXs[:, 1]
    cam0_T_camR = utils_geom.safe_inverse(camR_T_cam0)
    cam1_T_camR = utils_geom.safe_inverse(camR_T_cam1)

    mem_T_ref = utils_vox.get_mem_T_ref(B, Z, Y, X)
    ref_T_mem = utils_vox.get_ref_T_mem(B, Z, Y, X)

    cam1_T_cam0 = utils_basic.matmul2(cam1_T_camR, camR_T_cam0)
    mem1_T_mem0 = utils_basic.matmul3(mem_T_ref, cam1_T_cam0, ref_T_mem)

    xyz_mem0 = utils_basic.gridcloud3D(B, Z, Y, X)
    xyz_mem1 = utils_geom.apply_4x4(mem1_T_mem0, xyz_mem0)
    xyz_mem0 = xyz_mem0.reshape(B, Z, Y, X, 3)
    xyz_mem1 = xyz_mem1.reshape(B, Z, Y, X, 3)

    flow = xyz_mem1 - xyz_mem0
    flow = flow.permute(0, 4, 1, 2, 3)
    bkg_flow = flow

    # allow zero motion in the bkg
    any_mask = torch.max(torch.stack(masks, axis=0), axis=0)[0]
    masks.append(1.0 - any_mask)
    flows.append(bkg_flow)

    flows = torch.stack(flows, axis=0)
    masks = torch.stack(masks, axis=0)
    masks = masks.repeat(1, 1, 3, 1, 1, 1)
    flow = utils_basic.reduce_masked_mean(flows, masks, dim=0)

    if vis:
        summ_writer.summ_3D_flow('flow/gt_complete', flow, clip=4.0)

    # flow is shaped B x 3 x D x H x W
    return flow
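
# A toy standalone example (plain torch, assumed sizes) of the flow construction in
# get_gt_flow: displace memory-coordinate points by a rigid transform expressed in
# memory coordinates and take the difference; a pure translation yields a constant
# flow field.
def _demo_rigid_flow():
    import torch
    xyz_mem0 = torch.rand(64, 3) * 16.0  # random points inside a 16^3 grid
    mem1_T_mem0 = torch.eye(4)
    mem1_T_mem0[:3, 3] = torch.tensor([1.0, 0.0, 2.0])  # translate by (1, 0, 2) voxels
    xyz_h = torch.cat([xyz_mem0, torch.ones(len(xyz_mem0), 1)], dim=1)
    xyz_mem1 = (mem1_T_mem0 @ xyz_h.t()).t()[:, :3]
    flow = xyz_mem1 - xyz_mem0
    assert torch.allclose(flow, torch.tensor([1.0, 0.0, 2.0]).expand_as(flow))
    return flow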