def one2one_identity(self, im1, im2):
    normalized_im1 = T.normalize(im1, mean=self.mean, std=self.std)
    scale_im1, scale_ratio1 = T.scale(normalized_im1, short_size=self.base_size)
    input_im1 = T.center_crop(scale_im1, crop_size=self.crop_size)
    normalized_im2 = T.normalize(im2, mean=self.mean, std=self.std)
    scale_im2, scale_ratio2 = T.scale(normalized_im2, short_size=self.base_size)
    input_im2 = T.center_crop(scale_im2, crop_size=self.crop_size)
    batch = np.asarray([input_im1, input_im2], dtype=np.float32)
    scores = self.inference(batch, output_layer=self.prob_layer)
    return M.cosine_similarity(scores[0], scores[1])
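# M.cosine_similarity above is an external helper; a minimal NumPy sketch of
# what it presumably computes for this one-to-one identity check (assumption:
# plain cosine similarity between the two feature vectors):
import numpy as np

def cosine_similarity_sketch(a, b, eps=1e-12):
    a = np.asarray(a, dtype=np.float64).ravel()
    b = np.asarray(b, dtype=np.float64).ravel()
    # dot product over the product of L2 norms; eps guards against zero vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + eps))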
def __getitem__(self, index):
    img_path = os.path.join(self.img_dir, self.json_data[index]['img_fn'])
    img = np.array(cv2.imread(img_path), dtype=np.float32)
    keypoints = self.json_data[index]['keypoints']
    # normalization length: body size, head size, or a generic fallback
    if 'bodysize' in self.json_data[index]:
        norm = self.json_data[index]['bodysize']
    elif 'headsize' in self.json_data[index]:
        norm = self.json_data[index]['headsize']
    else:
        norm = self.json_data[index]['normalize']
    img, keypoints, ratio = self.trans(img, keypoints)
    label = np.zeros((self.s, self.s, self.num_kpt), dtype=np.float32)
    offset = np.zeros((self.s, self.s, self.num_kpt * 2), dtype=np.float32)
    for px in range(self.num_kpt):
        # skip keypoints that are invisible or fall outside the image
        if (keypoints[px * 3 + 2] == 0
                or keypoints[px * 3 + 0] <= 0 or keypoints[px * 3 + 0] >= self.size
                or keypoints[px * 3 + 1] <= 0 or keypoints[px * 3 + 1] >= self.size):
            continue
        grid_loc_x = int(keypoints[px * 3 + 0] // self.grid_size)
        grid_loc_y = int(keypoints[px * 3 + 1] // self.grid_size)
        label[grid_loc_y][grid_loc_x][px] = 1
        # sub-cell offsets in [0, 1), relative to the grid cell
        offset[grid_loc_y][grid_loc_x][px] = (keypoints[px * 3 + 0] % self.grid_size) / self.grid_size
        offset[grid_loc_y][grid_loc_x][self.num_kpt + px] = (keypoints[px * 3 + 1] % self.grid_size) / self.grid_size
    img1 = self._enhance(img.copy(), 1.0)
    img2 = self._enhance(img.copy(), 1.5)
    img3 = self._enhance(img.copy(), 2.0)
    img0 = normalize(to_tensor(img)).unsqueeze(dim=0)
    img1 = normalize(to_tensor(img1)).unsqueeze(dim=0)
    img2 = normalize(to_tensor(img2)).unsqueeze(dim=0)
    img3 = normalize(to_tensor(img3)).unsqueeze(dim=0)
    img = img0  # only the un-enhanced image is returned; img1-img3 are prepared but unused
    label = to_tensor(label)
    offset = to_tensor(offset)
    norm = norm * ratio
    return img, label, offset, norm
def __getitem__(self, index):
    img_path = os.path.join(self.data_root, self.image_set, self.img_list[index])
    img = np.array(cv2.imread(img_path), dtype=np.float32)
    mask_path = os.path.join(self.info_root, 'pose_mask', self.img_list[index].replace('.jpg', '.npy'))
    mask = np.load(mask_path)
    mask = np.array(mask, dtype=np.float32)
    kpt = self.kpt_list[index]
    center = self.center_list[index]
    scale = self.scale_list[index]
    img, mask, kpt, center = self.transformer(img, mask, kpt, center, scale)
    height, width, _ = img.shape
    out_h, out_w = height // self.stride, width // self.stride  # integer division keeps the resized dims ints
    mask = cv2.resize(mask, (out_w, out_h)).reshape((out_h, out_w, 1))
    heatmap = np.zeros((out_h, out_w, len(kpt[0]) + 1), dtype=np.float32)
    heatmap = generate_heatmap(heatmap, kpt, self.stride, self.sigma)
    heatmap[:, :, 0] = 1.0 - np.max(heatmap[:, :, 1:], axis=2)  # background channel
    heatmap = heatmap * mask
    vecmap = np.zeros((out_h, out_w, len(self.vec_pair[0]) * 2), dtype=np.float32)
    cnt = np.zeros((out_h, out_w, len(self.vec_pair[0])), dtype=np.int32)
    vecmap = generate_vector(vecmap, cnt, kpt, self.vec_pair, self.stride, self.theta)
    vecmap = vecmap * mask
    img = transforms.normalize(transforms.to_tensor(img), [128.0, 128.0, 128.0], [256.0, 256.0, 256.0])  # mean, std
    mask = transforms.to_tensor(mask)
    heatmap = transforms.to_tensor(heatmap)
    vecmap = transforms.to_tensor(vecmap)
    return img, heatmap, vecmap, mask
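# generate_heatmap and generate_vector are defined elsewhere; a minimal sketch
# of the usual Gaussian-heatmap encoding generate_heatmap presumably performs
# (assumptions: kpt is a list of persons, each a list of [x, y, vis] triplets,
# and channel 0 is reserved for background as in __getitem__ above):
import numpy as np

def gaussian_heatmap_sketch(heatmap, kpt, stride, sigma):
    h, w = heatmap.shape[:2]
    ys, xs = np.mgrid[0:h, 0:w].astype(np.float32)
    for person in kpt:
        for k, (x, y, vis) in enumerate(person):
            if vis > 1:  # assumption: vis > 1 means the keypoint is not visible
                continue
            cx, cy = x / stride, y / stride  # map to the stride-downsampled grid
            g = np.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2.0 * sigma ** 2))
            # keep the strongest response where people overlap
            heatmap[:, :, k + 1] = np.maximum(heatmap[:, :, k + 1], g)
    return heatmap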
def eval_im(self, im):
    im = im.astype(np.float32, copy=True)
    h, w = im.shape[:2]
    normalized_im = T.normalize(im, mean=self.mean, std=self.std)
    scale_ims, scale_ratios = T.multi_scale_by_max(normalized_im, scales=self.scales, image_flip=self.image_flip)
    score_map = np.zeros((h, w, self.class_num), dtype=np.float32)
    for _im, _ratio in zip(scale_ims, scale_ratios):
        if _ratio > 0:
            score_map += cv2.resize(self.scale_process(_im), (w, h))
        else:
            # a non-positive ratio marks a horizontally flipped input: flip the scores back
            score_map += cv2.resize(self.scale_process(_im), (w, h))[:, ::-1]
    score_map /= len(self.scales)
    if self.crf:
        tmp_data = np.asarray([im.transpose(2, 0, 1)], dtype=np.float32)
        tmp_score = np.asarray([score_map.transpose(2, 0, 1)], dtype=np.float32)
        self.crf.blobs['data'].reshape(*tmp_data.shape)
        self.crf.blobs['data'].data[...] = tmp_data
        self.crf.blobs['data_dim'].data[...] = [[[h, w]]]
        self.crf.blobs['score'].reshape(*tmp_score.shape)
        self.crf.blobs['score'].data[...] = tmp_score * self.crf_factor
        self.crf.forward()
        score_map = self.crf.blobs[self.prob_layer].data[0].transpose(1, 2, 0)
    return score_map.argmax(2)
def __getitem__(self, index):
    img_path = os.path.join(self.img_dir, self.json_data[index]['img_fn'])
    img = np.array(cv2.imread(img_path), dtype=np.float32)
    keypoints = self.json_data[index]['keypoints']
    if self.trans is not None:
        img, keypoints = self.trans(img, keypoints)
    label = np.zeros((self.s, self.s, self.num_kpt), dtype=np.float32)
    offset = np.zeros((self.s, self.s, self.num_kpt * 2), dtype=np.float32)
    for px in range(self.num_kpt):
        # skip keypoints that are invisible or fall outside the image
        if (keypoints[px * 3 + 2] == 0
                or keypoints[px * 3 + 0] <= 0 or keypoints[px * 3 + 0] >= self.size
                or keypoints[px * 3 + 1] <= 0 or keypoints[px * 3 + 1] >= self.size):
            continue
        grid_loc_x = int(keypoints[px * 3 + 0] // self.grid_size)
        grid_loc_y = int(keypoints[px * 3 + 1] // self.grid_size)
        label[grid_loc_y][grid_loc_x][px] = 1
        # sub-cell offsets in [0, 1), relative to the grid cell
        offset[grid_loc_y][grid_loc_x][px] = (keypoints[px * 3 + 0] % self.grid_size) / self.grid_size
        offset[grid_loc_y][grid_loc_x][self.num_kpt + px] = (keypoints[px * 3 + 1] % self.grid_size) / self.grid_size
    img = normalize(to_tensor(img))
    label = to_tensor(label)
    offset = to_tensor(offset)
    return img, label, offset
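# The label/offset encoding above can be inverted at inference time; a minimal
# sketch (assumption: take the arg-max grid cell per keypoint channel and add
# back the sub-cell offset, mirroring the encoding in __getitem__):
import numpy as np

def decode_keypoints_sketch(label, offset, grid_size, num_kpt):
    coords = np.zeros((num_kpt, 2), dtype=np.float32)
    for px in range(num_kpt):
        gy, gx = np.unravel_index(np.argmax(label[:, :, px]), label.shape[:2])
        coords[px, 0] = (gx + offset[gy, gx, px]) * grid_size            # x
        coords[px, 1] = (gy + offset[gy, gx, num_kpt + px]) * grid_size  # y
    return coords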
def process(filename):
    image, label, filename = read_tfrecord(filename)
    image = transforms.resize_and_crop_image(image, target_size=image_size)
    image = transforms.normalize(image, dtype=dtype)
    result = (image, label)
    if not drop_filename:
        result += (filename,)
    return result
def det_im(self, im):
    im = im.astype(np.float32, copy=True)
    normalized_im = T.normalize(im, mean=self.mean, std=self.std)
    scale_im, scale_ratio = T.scale(normalized_im, short_size=self.scales[0], max_size=self.max_sizes[0])
    input_data = scale_im.transpose(2, 0, 1)
    input_data = input_data.reshape((1,) + input_data.shape)
    self.net.blobs['data'].reshape(*input_data.shape)
    input_blob = {'data': input_data, 'rois': None}
    input_blob['im_info'] = np.array([[scale_im.shape[0], scale_im.shape[1], 1.0]], dtype=np.float32)
    self.net.blobs['im_info'].reshape(*input_blob['im_info'].shape)

    # do forward
    forward_kwargs = {'data': input_blob['data'].astype(np.float32, copy=False)}
    forward_kwargs['im_info'] = input_blob['im_info'].astype(np.float32, copy=False)
    output_blob = self.net.forward(**forward_kwargs)

    rois = self.net.blobs['rois'].data.copy()
    boxes = rois[:, 1:5]
    scores = output_blob['cls_prob']
    scores = scores.reshape(*scores.shape[:2])

    # Apply bounding-box regression deltas
    box_deltas = output_blob['bbox_pred']
    box_deltas = box_deltas.reshape(*box_deltas.shape[:2])
    pred_boxes = bbox_transform_inv(boxes, box_deltas)
    pred_boxes = clip_boxes(pred_boxes, scale_im.shape)

    objs = []
    for cls_ind, cls in enumerate(self.class_map[1:]):
        cls_ind += 1  # because we skipped background
        if cfg.TEST.AGNOSTIC:
            cls_boxes = pred_boxes[:, 4:8]
        else:
            cls_boxes = pred_boxes[:, cls_ind * 4:(cls_ind + 1) * 4]
        cls_scores = scores[:, cls_ind]
        dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])).astype(np.float32)
        inds = np.where(dets[:, 4] > self.conf_thresh)
        cls_dets = dets[inds]
        keep = nms(cls_dets, self.nms_thresh)
        dets_NMSed = cls_dets[keep, :]
        if self.box_vote:
            VOTEed = bbox_vote(dets_NMSed, cls_dets)
        else:
            VOTEed = dets_NMSed
        _obj = boxes_filter(VOTEed, bbox_id=cls_ind, class_name=cls, color=self.color_map[cls_ind],
                            scale=scale_ratio, thresh=self.conf_thresh)
        objs.extend(_obj)
    return objs
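# nms comes from the detection library; a minimal NumPy sketch of the greedy
# IoU-based NMS it presumably implements (assumption: dets is an (N, 5) array
# of [x1, y1, x2, y2, score]):
import numpy as np

def nms_sketch(dets, thresh):
    x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]  # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # intersection of the top box with the remaining boxes
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1 + 1) * np.maximum(0.0, yy2 - yy1 + 1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][iou <= thresh]  # drop boxes that overlap too much
    return keep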
def cls_batch(self, batch_ims):
    input_ims = []
    for im in batch_ims:
        im = im.astype(np.float32, copy=True)
        normalized_im = T.normalize(im, mean=self.mean, std=self.std)
        scale_im, scale_ratio = T.scale(normalized_im, short_size=self.base_size)
        input_ims.append(T.center_crop(scale_im, crop_size=self.crop_size))
    scores = self.inference(np.asarray(input_ims, dtype=np.float32), output_layer=self.prob_layer)
    return scores
def seg_im(self, im):
    """ Ignore self.scales; """
    im = im.astype(np.float32, copy=True)
    h, w = im.shape[:2]
    normalized_im = T.normalize(im, mean=self.mean, std=self.std)
    scale_im, scale_ratio = T.scale_by_max(normalized_im, long_size=self.crop_size)
    input_im = T.padding_im(scale_im, target_size=(self.crop_size, self.crop_size), borderType=cv2.BORDER_CONSTANT)
    output = self.inference(np.asarray([input_im], dtype=np.float32))
    score = output[0].transpose(1, 2, 0)
    score_map = cv2.resize(score, None, None, fx=1. / scale_ratio, fy=1. / scale_ratio)[:h, :w, :]
    return score_map.argmax(2)
def cls_im(self, im):
    im = im.astype(np.float32, copy=True)
    normalized_im = T.normalize(im, mean=self.mean, std=self.std)
    scale_im, scale_ratio = T.scale(normalized_im, short_size=self.base_size)
    crop_ims = []
    if self.crop_type == 'center' or self.crop_type == 'single':  # for single crop
        crop_ims.append(T.center_crop(scale_im, crop_size=self.crop_size))
    elif self.crop_type == 'mirror' or self.crop_type == 'multi':  # for 10 crops
        crop_ims.extend(T.mirror_crop(scale_im, crop_size=self.crop_size))
    else:
        crop_ims.append(scale_im)
    scores = self.inference(np.asarray(crop_ims, dtype=np.float32), output_layer=self.prob_layer)
    return np.sum(scores, axis=0)
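# T.mirror_crop is external; a minimal sketch of the classic 10-crop scheme it
# presumably implements (assumption: four corner crops plus a center crop, each
# also horizontally mirrored):
def mirror_crop_sketch(im, crop_size):
    h, w = im.shape[:2]
    c = crop_size
    starts = [(0, 0), (0, w - c), (h - c, 0), (h - c, w - c),
              ((h - c) // 2, (w - c) // 2)]  # corners + center
    crops = [im[y:y + c, x:x + c] for y, x in starts]
    crops += [crop[:, ::-1] for crop in crops]  # mirrored versions
    return crops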
def get_contacts(oracle, body_name, direction):
    # NOTE - Other objects will only have a fixed set of contacts and directions
    #print 'direction', direction[2]
    assert direction[2] == 0
    contacts = []
    direction = normalize(direction)
    aabb = oracle.get_aabb(body_name)
    radius = sqrt(oracle.get_radius2D2(body_name))
    #radius = body.GetLinks()[0].GetGeometries()[0].GetCylinderRadius()
    height = 2 * aabb.extents()[2]
    #height = body.GetLinks()[0].GetGeometries()[0].GetCylinderHeight()
    distance = radius + PUSH_SEPERATION
    z = -height / 2 + PUSH_HEIGHT
    tool_quat = quat_from_trans(get_tool_trans(oracle))
    manip_point = -distance * direction + np.array([0, 0, z]) + aabb.pos()
    for rotation in [0, PI]:  # NOTE - 2 hand trans can push in a given direction
        manip_quat = quat_dot(quat_look_at(-direction), quat_look_at(-unit_z()),
                              quat_from_angle_vector(rotation, unit_x()), tool_quat)  # Grip * Tool = Manip
        contacts.append(Contact(compute_grasp(trans_from_quat_point(manip_quat, manip_point), unit_trans()), direction))
    return contacts
def seg_im(self, im):
    """ Ignore self.scales; """
    im = im.astype(np.float32, copy=True)
    h, w = im.shape[:2]
    normalized_im = T.normalize(im, mean=self.mean, std=self.std)
    scale_im, scale_ratio = T.scale_by_max(normalized_im, long_size=self.crop_size)
    input_im = T.padding_im(scale_im, target_size=(self.crop_size, self.crop_size),
                            borderType=cv2.BORDER_CONSTANT, borderValue=(0.0, 0.0, 0.0))
    score = self.caffe_process(input_im)
    score_map = cv2.resize(score, None, None, fx=1. / scale_ratio, fy=1. / scale_ratio)[:h, :w, :]
    return score_map.argmax(2)
def eval_batch():
    # shuffle_conv1_channel()
    eval_len = len(SET_DICT)
    # eval_len = 1000
    accuracy = np.zeros(len(args.top_k))
    start_time = datetime.datetime.now()
    for i in xrange(eval_len - args.skip_num):
        im = cv2.imread(SET_DICT[i + args.skip_num]['path'])
        if (PIXEL_MEANS == np.array([103.52, 116.28, 123.675])).all() and \
                (PIXEL_STDS == np.array([57.375, 57.12, 58.395])).all():
            scale_im = T.pil_scale(Image.fromarray(im), args.base_size)
            scale_im = np.asarray(scale_im)
        else:
            scale_im, _ = T.scale(im, short_size=args.base_size)
        input_im = T.normalize(scale_im, mean=PIXEL_MEANS, std=PIXEL_STDS)
        crop_ims = []
        if args.crop_type == 'center':  # for single crop
            crop_ims.append(T.center_crop(input_im, crop_size=args.crop_size))
        elif args.crop_type == 'multi':  # for 10 crops
            crop_ims.extend(T.mirror_crop(input_im, crop_size=args.crop_size))
        else:
            crop_ims.append(input_im)

        score_vec = np.zeros(args.class_num, dtype=np.float32)
        iter_num = int(len(crop_ims) / args.batch_size)
        timer_pt1 = datetime.datetime.now()
        for j in xrange(iter_num):
            scores = CLS.inference(
                np.asarray(crop_ims, dtype=np.float32)[j * args.batch_size:(j + 1) * args.batch_size],
                output_layer=args.prob_layer
            )
            score_vec += np.sum(scores, axis=0)
        score_index = (-score_vec / len(crop_ims)).argsort()
        timer_pt2 = datetime.datetime.now()

        SET_DICT[i + args.skip_num]['evaluated'] = True
        SET_DICT[i + args.skip_num]['score_vec'] = score_vec / len(crop_ims)

        print 'Testing image: {}/{} {} {}/{} {}s' \
            .format(str(i + 1), str(eval_len - args.skip_num),
                    str(SET_DICT[i + args.skip_num]['path'].split('/')[-1]),
                    str(score_index[0]), str(SET_DICT[i + args.skip_num]['gt']),
                    str((timer_pt2 - timer_pt1).microseconds / 1e6 + (timer_pt2 - timer_pt1).seconds)),

        for j in xrange(len(args.top_k)):
            if SET_DICT[i + args.skip_num]['gt'] in score_index[:args.top_k[j]]:
                accuracy[j] += 1
            tmp_acc = float(accuracy[j]) / float(i + 1)
            if args.top_k[j] == 1:
                print '\ttop_' + str(args.top_k[j]) + ':' + str(tmp_acc),
            else:
                print 'top_' + str(args.top_k[j]) + ':' + str(tmp_acc)
    end_time = datetime.datetime.now()

    w = open(LOG_PTH, 'w')
    s1 = 'Evaluation process ends at: {}. \nTime cost is: {}. '.format(str(end_time), str(end_time - start_time))
    s2 = '\nThe model is: {}. \nThe val file is: {}. \n{} images have been tested, crop_type is: {}, base_size is: {}, ' \
         'crop_size is: {}.'.format(args.model_weights, args.val_file, str(eval_len - args.skip_num),
                                    args.crop_type, str(args.base_size), str(args.crop_size))
    s3 = '\nThe PIXEL_MEANS is: ({}, {}, {}), PIXEL_STDS is: ({}, {}, {}).' \
        .format(str(PIXEL_MEANS[0]), str(PIXEL_MEANS[1]), str(PIXEL_MEANS[2]),
                str(PIXEL_STDS[0]), str(PIXEL_STDS[1]), str(PIXEL_STDS[2]))
    s4 = ''
    for i in xrange(len(args.top_k)):
        _acc = float(accuracy[i]) / float(eval_len - args.skip_num)
        s4 += '\nAccuracy of top_{} is: {}; correct num is {}.'.format(str(args.top_k[i]), str(_acc), str(int(accuracy[i])))
    print s1, s2, s3, s4
    w.write(s1 + s2 + s3 + s4)
    w.close()

    if args.save_score_vec:
        w = open(LOG_PTH.replace('.txt', 'scorevec.txt'), 'w')
        for i in xrange(eval_len - args.skip_num):
            w.write(str(SET_DICT[i + args.skip_num]['score_vec']) + '\n')  # str() so the numpy array can be written
        w.close()
    print('DONE!')
def main():
    a = get_args()
    prev_enc = 0

    def train(i):
        loss = 0
        noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() if a.noise > 0 else None
        img_out = image_f(noise)

        if a.sharp != 0:
            lx = torch.mean(torch.abs(img_out[0, :, :, 1:] - img_out[0, :, :, :-1]))
            ly = torch.mean(torch.abs(img_out[0, :, 1:, :] - img_out[0, :, :-1, :]))
            loss -= a.sharp * (ly + lx)

        micro = 1 - a.macro if a.in_txt2 is None else False
        imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, trform_f, a.align, micro=micro)
        out_enc = model_clip.encode_image(imgs_sliced[-1])

        if a.diverse != 0:
            imgs_sliced = slice_imgs([image_f(noise)], a.samples, a.modsize, trform_f, a.align, micro=micro)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(out_enc, out_enc2, dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.in_img is not None and os.path.isfile(a.in_img):  # input image
            loss += sign * 0.5 * torch.cosine_similarity(img_enc, out_enc, dim=-1).mean()
        if a.in_txt is not None:  # input text
            loss += sign * torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
            if a.notext > 0:
                loss -= sign * a.notext * torch.cosine_similarity(txt_plot_enc, out_enc, dim=-1).mean()
        if a.in_txt0 is not None:  # subtract text
            loss += -sign * torch.cosine_similarity(txt_enc0, out_enc, dim=-1).mean()
        if a.sync > 0 and a.in_img is not None and os.path.isfile(a.in_img):  # image composition
            prog_sync = (a.steps // a.fstep - i) / (a.steps // a.fstep)
            loss += prog_sync * a.sync * sim_loss(F.interpolate(img_out, sim_size).float(), img_in, normalize=True).squeeze()
        if a.in_txt2 is not None:  # input text for micro details
            imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, trform_f, a.align, micro=True)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += sign * torch.cosine_similarity(txt_enc2, out_enc2, dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.expand > 0:
            nonlocal prev_enc  # prev_enc lives in the enclosing main(), not at module level
            if i > 0:
                loss += a.expand * torch.cosine_similarity(out_enc, prev_enc, dim=-1).mean()
            prev_enc = out_enc.detach()
        del img_out, imgs_sliced, out_enc
        torch.cuda.empty_cache()
        assert not isinstance(loss, int), ' Loss not defined, check the inputs'

        if a.prog is True:
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            if (a.sync > 0 and a.in_img is not None) or a.sharp != 0:
                img = img ** 1.3  # empirical tone mapping
            checkout(img, os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)), verbose=a.verbose)
            pbar.upd()

    # Load CLIP models
    model_clip, _ = clip.load(a.model)
    if a.verbose is True:
        print(' using model', a.model)
    xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])
    if a.multilang is True:
        model_lang = SentenceTransformer('clip-ViT-B-32-multilingual-v1').cuda()

    def enc_text(txt):
        if a.multilang is True:
            emb = model_lang.encode([txt], convert_to_tensor=True, show_progress_bar=False)
        else:
            emb = model_clip.encode_text(clip.tokenize(txt).cuda())
        return emb.detach().clone()

    if a.diverse != 0:
        a.samples = int(a.samples * 0.5)
    if a.sync > 0:
        a.samples = int(a.samples * 0.5)

    if a.transform is True:
        trform_f = transforms.transforms_custom
        a.samples = int(a.samples * 0.95)
    else:
        trform_f = transforms.normalize()

    out_name = []
    if a.in_txt is not None:
        if a.verbose is True:
            print(' ref text: ', basename(a.in_txt))
        if a.translate:
            translator = Translator()
            a.in_txt = translator.translate(a.in_txt, dest='en').text
            if a.verbose is True:
                print(' translated to:', a.in_txt)
        txt_enc = enc_text(a.in_txt)
        out_name.append(txt_clean(a.in_txt))
        if a.notext > 0:
            txt_plot = torch.from_numpy(plot_text(a.in_txt, a.modsize) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
            txt_plot_enc = model_clip.encode_image(txt_plot).detach().clone()

    if a.in_txt2 is not None:
        if a.verbose is True:
            print(' micro text:', basename(a.in_txt2))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt2 = translator.translate(a.in_txt2, dest='en').text
            if a.verbose is True:
                print(' translated to:', a.in_txt2)
        txt_enc2 = enc_text(a.in_txt2)
        out_name.append(txt_clean(a.in_txt2))

    if a.in_txt0 is not None:
        if a.verbose is True:
            print(' subtract text:', basename(a.in_txt0))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True:
                print(' translated to:', a.in_txt0)
        txt_enc0 = enc_text(a.in_txt0)
        out_name.append('off-' + txt_clean(a.in_txt0))

    if a.multilang is True:
        del model_lang

    if a.in_img is not None and os.path.isfile(a.in_img):
        if a.verbose is True:
            print(' ref image:', basename(a.in_img))
        img_in = torch.from_numpy(img_read(a.in_img) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
        img_in = img_in[:, :3, :, :]  # fix rgb channels
        in_sliced = slice_imgs([img_in], a.samples, a.modsize, transforms.normalize(), a.align, micro=False)[0]
        img_enc = model_clip.encode_image(in_sliced).detach().clone()
        if a.sync > 0:
            sim_loss = lpips.LPIPS(net='vgg', verbose=False).cuda()
            sim_size = [s // 2 for s in a.size]
            img_in = F.interpolate(img_in, sim_size).float()
        else:
            del img_in
        del in_sliced
        torch.cuda.empty_cache()
        out_name.append(basename(a.in_img).replace(' ', '_'))

    params, image_f = fft_image([1, 3, *a.size], resume=a.resume, decay_power=a.decay)
    image_f = to_valid_rgb(image_f, colors=a.colors)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = lr1 * 0.01
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)
    sign = 1. if a.invert is True else -1.

    if a.verbose is True:
        print(' samples:', a.samples)
    out_name = '-'.join(out_name)
    out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(a.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        train(i)

    os.system('ffmpeg -v warning -y -i %s\%%04d.jpg "%s.mp4"' % (tempdir, os.path.join(a.out_dir, out_name)))
    shutil.copy(img_list(tempdir)[-1], os.path.join(a.out_dir, '%s-%d.jpg' % (out_name, a.steps)))
    if a.save_pt is True:
        torch.save(params, '%s.pt' % os.path.join(a.out_dir, out_name))
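# The core of the guidance loss assembled in train() above, reduced to a
# minimal sketch (assumption: maximize cosine similarity between the text
# embedding and the embeddings of augmented image crops by minimizing its
# negative; the augmentation, sign, and extra weighting terms are omitted):
import torch

def clip_text_loss_sketch(model_clip, img_crops, txt_enc):
    out_enc = model_clip.encode_image(img_crops)             # (N, D) crop embeddings
    sim = torch.cosine_similarity(txt_enc, out_enc, dim=-1)  # per-crop similarity
    return -sim.mean()                                       # lower loss = closer to the text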
def eval_batch():
    eval_len = len(SET_DICT)
    accuracy = np.zeros(len(args.top_k))
    start_time = datetime.datetime.now()
    for i in xrange(eval_len - args.skip_num):
        im = cv2.imread(SET_DICT[i + args.skip_num]['path'])
        im = T.bgr2rgb(im)
        scale_im = T.pil_resize(Image.fromarray(im), args.base_size)
        normalized_im = T.normalize(np.asarray(scale_im) / 255.0, mean=PIXEL_MEANS, std=PIXEL_STDS)
        crop_ims = []
        if args.crop_type == 'center':  # for single crop
            crop_ims.append(T.center_crop(normalized_im, crop_size=args.crop_size))
        elif args.crop_type == 'multi':  # for 10 crops
            crop_ims.extend(T.mirror_crop(normalized_im, crop_size=args.crop_size))
        else:
            crop_ims.append(normalized_im)

        score_vec = np.zeros(args.class_num, dtype=np.float32)
        iter_num = int(len(crop_ims) / args.batch_size)
        timer_pt1 = datetime.datetime.now()
        for j in xrange(iter_num):
            input_data = np.asarray(crop_ims, dtype=np.float32)[j * args.batch_size:(j + 1) * args.batch_size]
            input_data = input_data.transpose(0, 3, 1, 2)  # NHWC -> NCHW
            input_data = torch.autograd.Variable(torch.from_numpy(input_data).cuda(), volatile=True)
            outputs = MODEL(input_data)
            scores = outputs.data.cpu().numpy()
            score_vec += np.sum(scores, axis=0)
        score_index = (-score_vec / len(crop_ims)).argsort() - 1
        timer_pt2 = datetime.datetime.now()

        SET_DICT[i + args.skip_num]['evaluated'] = True
        SET_DICT[i + args.skip_num]['score_vec'] = score_vec / len(crop_ims)

        print 'Testing image: {}/{} {} {}/{} {}s' \
            .format(str(i + 1), str(eval_len - args.skip_num),
                    str(SET_DICT[i + args.skip_num]['path'].split('/')[-1]),
                    str(score_index[0]), str(SET_DICT[i + args.skip_num]['gt']),
                    str((timer_pt2 - timer_pt1).microseconds / 1e6 + (timer_pt2 - timer_pt1).seconds)),

        for j in xrange(len(args.top_k)):
            if SET_DICT[i + args.skip_num]['gt'] in score_index[:args.top_k[j]]:
                accuracy[j] += 1
            tmp_acc = float(accuracy[j]) / float(i + 1)
            if args.top_k[j] == 1:
                print '\ttop_' + str(args.top_k[j]) + ':' + str(tmp_acc),
            else:
                print 'top_' + str(args.top_k[j]) + ':' + str(tmp_acc)
    end_time = datetime.datetime.now()

    w = open(LOG_PTH, 'w')
    s1 = 'Evaluation process ends at: {}. \nTime cost is: {}. '.format(str(end_time), str(end_time - start_time))
    s2 = '\nThe model is: {}. \nThe val file is: {}. \n{} images have been tested, crop_type is: {}, base_size is: {}, ' \
         'crop_size is: {}.'.format(args.model_weights, args.val_file, str(eval_len - args.skip_num),
                                    args.crop_type, str(args.base_size), str(args.crop_size))
    s3 = '\nThe PIXEL_MEANS is: ({}, {}, {}), PIXEL_STDS is: ({}, {}, {}).' \
        .format(str(PIXEL_MEANS[0]), str(PIXEL_MEANS[1]), str(PIXEL_MEANS[2]),
                str(PIXEL_STDS[0]), str(PIXEL_STDS[1]), str(PIXEL_STDS[2]))
    s4 = ''
    for i in xrange(len(args.top_k)):
        _acc = float(accuracy[i]) / float(eval_len - args.skip_num)
        s4 += '\nAccuracy of top_{} is: {}; correct num is {}.'.format(str(args.top_k[i]), str(_acc), str(int(accuracy[i])))
    print s1, s2, s3, s4
    w.write(s1 + s2 + s3 + s4)
    w.close()

    if args.save_score_vec:
        w = open(LOG_PTH.replace('.txt', 'scorevec.txt'), 'w')
        for i in xrange(eval_len - args.skip_num):
            w.write(str(SET_DICT[i + args.skip_num]['score_vec']) + '\n')  # str() so the numpy array can be written
        w.close()
    print('DONE!')
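# A minimal sketch of the top-k bookkeeping used by both eval scripts above
# (assumption: gt is an integer class label and score_vec holds per-class scores):
import numpy as np

def topk_hits_sketch(score_vec, gt, top_k=(1, 5)):
    ranked = (-score_vec).argsort()  # class indices, best first
    return [int(gt in ranked[:k]) for k in top_k]  # 1 if gt is within the top k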
import transforms as t
import numpy as np
import matplotlib.pyplot as plt
import torch

from hcat import dataloader  # assumption: dataloader comes from the hcat package
from hcat.unet import Unet_Constructor as GUnet
from hcat.loss import dice_loss

data = dataloader.stack(
    path='/home/chris/Desktop/ColorImages',
    joint_transforms=[
        t.to_float(),
        t.reshape(),
        t.random_crop([512, 512, 30]),
    ],
    image_transforms=[
        t.normalize([0.5, 0.5, 0.5, 0.5], [0.5, 0.5, 0.5, 0.5])
    ])

if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

print('Initializing Unet: ', end='')
unet = GUnet(image_dimensions=3,
             in_channels=4,
             out_channels=1,
             feature_sizes=[16, 32, 64, 128],
             kernel={'conv1': (3, 3, 2), 'conv2': (3, 3, 1)},
from math import sqrt

import numpy as np

from transforms import quat_dot, normalize, unit_x, quat_look_at, compute_grasp, trans_from_quat_point, \
    quat_from_angle_vector, unit_trans, quat_from_trans, unit_z, manip_trans_from_object_trans, trans_from_pose, \
    quat_transform_point
from manipulation.bodies.robot import get_tool_trans
from tools.numerical import PI
from tools.objects import str_object
from manipulation.constants import APPROACH_DISTANCE

APPROACH_VECTOR = APPROACH_DISTANCE * normalize(np.array([1, 0, -1]))  # TODO - move this elsewhere

class Contact(object):
    def __init__(self, contact_trans, direction, gripper_config=None, gripper_traj=None):  # TODO - fill in
        self.direction = direction
        self.grasp_trans = contact_trans  # TODO - rename self.contact_trans
        self.gripper_config = gripper_config
        self.gripper_traj = gripper_traj

    def __repr__(self):
        return self.__class__.__name__ + str_object(self.grasp_trans[:3, 3])

def manip_trans_from_pose_contact(pose, contact):
    return manip_trans_from_object_trans(trans_from_pose(pose.value), contact.grasp_trans)

# NOTE - cannot use center of object to infer approach vector because gripper might not be tangent
def approach_vector_from_pose_contact(pose, contact):
    # TODO - universal way of inferring approach_vector from manip_trans (probably not possible)
    approach_vector = quat_transform_point(quat_from_trans(manip_trans_from_pose_contact(pose, contact)), APPROACH_VECTOR)
    if contact.grasp_trans[0, 3] > 0:
        approach_vector[:2] *= -1
    return approach_vector
from math import sqrt

import numpy as np

from transforms import quat_dot, normalize, unit_x, quat_look_at, compute_grasp, trans_from_quat_point, \
    quat_from_angle_vector, unit_trans, quat_from_trans, unit_z, manip_trans_from_object_trans, trans_from_pose, \
    quat_transform_point
from manipulation.bodies.robot import get_tool_trans
from misc.numerical import PI
from misc.objects import str_object
from manipulation.constants import APPROACH_DISTANCE
import operator

APPROACH_VECTOR = APPROACH_DISTANCE * normalize(np.array([1, 0, -1]))  # TODO - move this elsewhere

class Contact(object):
    def __init__(self, contact_trans, direction, gripper_config=None, gripper_traj=None):  # TODO - fill in
        self.direction = direction
        self.grasp_trans = contact_trans  # TODO - rename self.contact_trans
        self.gripper_config = gripper_config
        self.gripper_traj = gripper_traj

    def __repr__(self):
        return self.__class__.__name__ + str_object(self.grasp_trans[:3, 3])

def manip_trans_from_pose_contact(pose, contact):
    return manip_trans_from_object_trans(trans_from_pose(pose.value), contact.grasp_trans)

# NOTE - cannot use center of object to infer approach vector because gripper might not be tangent
def approach_vector_from_pose_contact(pose, contact):
    # TODO - universal way of inferring approach_vector from manip_trans (probably not possible)
    approach_vector = quat_transform_point(quat_from_trans(manip_trans_from_pose_contact(pose, contact)), APPROACH_VECTOR)
    if contact.grasp_trans[0, 3] > 0:
        approach_vector[:2] *= -1
    return approach_vector
def main():
    a = get_args()

    # Load CLIP models
    model_clip, _ = clip.load(a.model)
    if a.verbose is True:
        print(' using model', a.model)
    xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])

    workdir = os.path.join(a.out_dir, basename(a.in_txt))
    workdir += '-%s' % a.model if 'RN' in a.model.upper() else ''
    os.makedirs(workdir, exist_ok=True)

    if a.diverse != 0:
        a.samples = int(a.samples * 0.5)

    if a.transform is True:
        trform_f = transforms.transforms_custom
        a.samples = int(a.samples * 0.95)
    else:
        trform_f = transforms.normalize()

    if a.in_txt0 is not None:
        if a.verbose is True:
            print(' subtract text:', basename(a.in_txt0))
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True:
                print(' translated to:', a.in_txt0)
        if a.multilang is True:
            model_lang = SentenceTransformer('clip-ViT-B-32-multilingual-v1').cuda()
            txt_enc0 = model_lang.encode([a.in_txt0], convert_to_tensor=True, show_progress_bar=False).detach().clone()
            del model_lang
        else:
            txt_enc0 = model_clip.encode_text(clip.tokenize(a.in_txt0).cuda()).detach().clone()

    # make init
    global params_start, params_ema
    params_shape = [1, 3, a.size[0], a.size[1] // 2 + 1, 2]
    params_start = torch.randn(*params_shape).cuda()  # random init
    params_ema = 0.
    if a.resume is not None and os.path.isfile(a.resume):
        if a.verbose is True:
            print(' resuming from', a.resume)
        params_start = load_params(a.resume).cuda()
        if a.keep > 0:
            params_ema = params_start[0].detach().clone()
    else:
        a.resume = 'init.pt'

    torch.save(params_start, 'init.pt')  # final init
    shutil.copy(a.resume, os.path.join(workdir, '000-%s.pt' % basename(a.resume)))

    prev_enc = 0

    def process(txt, num):
        sd = 0.01
        if a.keep > 0:
            sd = a.keep + (1 - a.keep) * sd
        params, image_f = fft_image([1, 3, *a.size], resume='init.pt', sd=sd, decay_power=a.decay)
        image_f = to_valid_rgb(image_f, colors=a.colors)

        if a.prog is True:
            lr1 = a.lrate * 2
            lr0 = a.lrate * 0.1
        else:
            lr0 = a.lrate
        optimizer = torch.optim.Adam(params, lr0)

        if a.verbose is True:
            print(' ref text: ', txt)
        if a.translate:
            translator = Translator()
            txt = translator.translate(txt, dest='en').text
            if a.verbose is True:
                print(' translated to:', txt)
        if a.multilang is True:
            model_lang = SentenceTransformer('clip-ViT-B-32-multilingual-v1').cuda()
            txt_enc = model_lang.encode([txt], convert_to_tensor=True, show_progress_bar=False).detach().clone()
            del model_lang
        else:
            txt_enc = model_clip.encode_text(clip.tokenize(txt).cuda()).detach().clone()
        if a.notext > 0:
            txt_plot = torch.from_numpy(plot_text(txt, a.modsize) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
            txt_plot_enc = model_clip.encode_image(txt_plot).detach().clone()
        else:
            txt_plot_enc = None

        out_name = '%03d-%s' % (num + 1, txt_clean(txt))
        out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
        tempdir = os.path.join(workdir, out_name)
        os.makedirs(tempdir, exist_ok=True)

        pbar = ProgressBar(a.steps // a.fstep)
        for i in range(a.steps):
            loss = 0
            noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() if a.noise > 0 else None
            img_out = image_f(noise)

            if a.sharp != 0:
                lx = torch.mean(torch.abs(img_out[0, :, :, 1:] - img_out[0, :, :, :-1]))
                ly = torch.mean(torch.abs(img_out[0, :, 1:, :] - img_out[0, :, :-1, :]))
                loss -= a.sharp * (ly + lx)

            imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, trform_f, a.align, micro=1.)
            out_enc = model_clip.encode_image(imgs_sliced[-1])
            loss -= torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
            if a.notext > 0:
                loss += a.notext * torch.cosine_similarity(txt_plot_enc, out_enc, dim=-1).mean()
            if a.diverse != 0:
                imgs_sliced = slice_imgs([image_f(noise)], a.samples, a.modsize, trform_f, a.align, micro=1.)
                out_enc2 = model_clip.encode_image(imgs_sliced[-1])
                loss += a.diverse * torch.cosine_similarity(out_enc, out_enc2, dim=-1).mean()
                del out_enc2
                torch.cuda.empty_cache()
            if a.expand > 0:
                nonlocal prev_enc  # prev_enc lives in the enclosing main(), not at module level
                if i > 0:
                    loss += a.expand * torch.cosine_similarity(out_enc, prev_enc, dim=-1).mean()
                prev_enc = out_enc.detach().clone()
            if a.in_txt0 is not None:  # subtract text
                loss += torch.cosine_similarity(txt_enc0, out_enc, dim=-1).mean()
            del img_out, imgs_sliced, out_enc
            torch.cuda.empty_cache()

            if a.prog is True:
                lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
                for g in optimizer.param_groups:
                    g['lr'] = lr_cur

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % a.fstep == 0:
                with torch.no_grad():
                    img = image_f(contrast=a.contrast).cpu().numpy()[0]
                if a.sharp != 0:
                    img = img ** 1.3  # empirical tone mapping
                checkout(img, os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)), verbose=a.verbose)
                pbar.upd()
                del img

        if a.keep > 0:
            global params_start, params_ema
            params_ema = ema(params_ema, params[0].detach().clone(), num + 1)
            torch.save((1 - a.keep) * params_start + a.keep * params_ema, 'init.pt')

        torch.save(params[0], '%s.pt' % os.path.join(workdir, out_name))
        shutil.copy(img_list(tempdir)[-1], os.path.join(workdir, '%s-%d.jpg' % (out_name, a.steps)))
        os.system('ffmpeg -v warning -y -i %s\%%04d.jpg "%s.mp4"' % (tempdir, os.path.join(workdir, out_name)))

    with open(a.in_txt, 'r', encoding="utf-8") as f:
        texts = f.readlines()
    texts = [tt.strip() for tt in texts if len(tt.strip()) > 0 and tt[0] != '#']
    if a.verbose is True:
        print(' total lines:', len(texts))
        print(' samples:', a.samples)

    for i, txt in enumerate(texts):
        process(txt, i)

    vsteps = int(a.length * 25 / len(texts))  # 25 fps
    tempdir = os.path.join(workdir, '_final')
    os.makedirs(tempdir, exist_ok=True)

    def read_pt(file):
        return torch.load(file).cuda()

    if a.verbose is True:
        print(' rendering complete piece')
    ptfiles = file_list(workdir, 'pt')
    pbar = ProgressBar(vsteps * len(ptfiles))
    for px in range(len(ptfiles)):
        params1 = read_pt(ptfiles[px])
        params2 = read_pt(ptfiles[(px + 1) % len(ptfiles)])
        params, image_f = fft_image([1, 3, *a.size], resume=params1, sd=1., decay_power=a.decay)
        image_f = to_valid_rgb(image_f, colors=a.colors)
        for i in range(vsteps):
            with torch.no_grad():
                img = image_f((params2 - params1) * math.sin(1.5708 * i / vsteps) ** 2)[0].permute(1, 2, 0)
                img = torch.clip(img * 255, 0, 255).cpu().numpy().astype(np.uint8)
            imsave(os.path.join(tempdir, '%05d.jpg' % (px * vsteps + i)), img)
            if a.verbose is True:
                cvshow(img)
            pbar.upd()

    os.system('ffmpeg -v warning -y -i %s\%%05d.jpg "%s.mp4"' % (tempdir, os.path.join(a.out_dir, basename(a.in_txt))))
    if a.keep > 0:
        os.remove('init.pt')
try:
    d_feats = np.load(d_feats_file)
except OSError as e:
    print('ERROR: File {} not found. Please follow the instructions to download the pre-computed features.'
          .format(d_feats_file))
    sys.exit()

# Load the query image
img = Image.open(dataset.get_query_filename(args.qidx))
# Crop the query ROI
img = img.crop(tuple(dataset.get_query_roi(args.qidx)))
# Apply transformations
img = trf.resize_image(img, 800)
I = trf.to_tensor(img)
I = trf.normalize(I, dict(rgb_means=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
I = I.unsqueeze(0).to(device)

# Forward pass to extract the features
with torch.no_grad():
    print('Extracting the representation of the query...')
    q_feat = model(I).numpy()
print('Done\n')

# Rank the database and visualize the top-k most similar images in the database
dataset.vis_top(d_feats, args.qidx, q_feat=q_feat, topk=args.topk, out_image_file='out.png')
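# dataset.vis_top is dataset-specific; a minimal sketch of the ranking it
# presumably relies on (assumption: the features are L2-normalized, so the dot
# product equals the cosine similarity):
import numpy as np

def rank_database_sketch(d_feats, q_feat, topk=5):
    sims = d_feats @ q_feat.ravel()    # (N,) similarity of each database image to the query
    return np.argsort(-sims)[:topk]    # indices of the top-k matches, best first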