def build_coco_batches(dataset, setname, T, input_H, input_W):
    """Write one ``.npz`` sample per referring sentence of a RefCOCO-style split.

    Each saved file holds the encoded sentence (``text_batch``), the image
    (``im_batch``), the boolean object mask (``mask_batch``) and the raw
    sentence (``sent_batch``).

    Args:
        dataset: one of ``'Gref'``, ``'unc'`` or ``'unc+'``; selects the REFER
            annotation set and names the output folder.
        setname: split name matched against each ref's ``'split'`` field.
            Image and mask are resized/padded only when it contains
            ``'train'``.
        T: sentence length passed to ``text_processing.preprocess_sentence``.
        input_H: target height for ``im_processing.resize_and_pad``.
        input_W: target width for ``im_processing.resize_and_pad``.

    Raises:
        ValueError: if ``dataset`` is not one of the supported keys.
    """
    im_dir = '/data/ryli/datasets/coco/images'
    im_type = 'train2014'
    vocab_file = './data/vocabulary_Gref.txt'

    data_prefix = dataset + '_' + setname
    data_folder = './' + dataset + '/' + setname + '_batch/'
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    # dataset key -> (REFER dataset name, splitBy)
    refer_variants = {
        'Gref': ('refcocog', 'google'),
        'unc': ('refcoco', 'unc'),
        'unc+': ('refcoco+', 'unc'),
    }
    if dataset not in refer_variants:
        raise ValueError('Unknown dataset %s' % dataset)
    refer_name, split_by = refer_variants[dataset]
    refer = REFER('./external/refer/data', dataset=refer_name,
                  splitBy=split_by)

    # Keep only the refs that belong to the requested split.
    refs = []
    for ref_id in refer.Refs:
        candidate = refer.Refs[ref_id]
        if candidate['split'] == setname:
            refs.append(candidate)

    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    is_train = 'train' in setname
    out_stem = data_folder + data_prefix + '_'
    n_batch = 0
    for ref in refs:
        im_name = 'COCO_' + im_type + '_' + str(ref['image_id']).zfill(12)
        im = skimage.io.imread('%s/%s/%s.jpg' % (im_dir, im_type, im_name))

        # Decode the annotation into a single-channel mask by taking the
        # maximum over the decoded mask's last axis.
        seg = refer.Anns[ref['ann_id']]['segmentation']
        rle = cocomask.frPyObjects(seg, im.shape[0], im.shape[1])
        mask = np.max(cocomask.decode(rle), axis=2).astype(np.float32)

        if is_train:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask, input_H, input_W)
        if im.ndim == 2:
            # Grayscale image: replicate the single channel three times.
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        # The same image/mask pair is saved once per sentence.
        for sentence in ref['sentences']:
            print('saving batch %d' % (n_batch + 1))
            sent = sentence['sent']
            text = text_processing.preprocess_sentence(sent, vocab_dict, T)
            np.savez(file=out_stem + str(n_batch) + '.npz',
                     text_batch=text,
                     im_batch=im,
                     mask_batch=(mask > 0),
                     sent_batch=[sent])
            n_batch += 1
def build_referit_batches(setname, T, input_H, input_W):
    """Write one ``.npz`` sample per (image crop, sentence) pair of ReferIt.

    Besides the ``.npz`` files, a tab-separated index of
    ``<batch number>\\t<image path>`` lines is written to
    ``./referit/trainval_list.txt``.

    Args:
        setname: split name; selects the query file and the output folder.
            Image and mask are resized/padded only when it contains
            ``'train'``.
        T: sentence length passed to ``text_processing.preprocess_sentence``.
        input_H: target height for ``im_processing.resize_and_pad``.
        input_W: target width for ``im_processing.resize_and_pad``.
    """
    # data directory
    im_dir = '/data/ryli/text_objseg/exp-referit/referit-dataset/images/'
    mask_dir = '/data/ryli/text_objseg/exp-referit/referit-dataset/mask/'
    query_file = './data/referit/referit_query_' + setname + '.json'
    vocab_file = './data/vocabulary_referit.txt'

    # saving directory
    data_folder = './referit/' + setname + '_batch/'
    data_prefix = 'referit_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    # load annotations; the context manager closes the file even if parsing
    # fails
    with open(query_file) as query_fp:
        query_dict = json.load(query_fp)
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    # collect training samples: one entry per sentence of every crop
    samples = []
    for name in query_dict:
        im_name = name.split('_', 1)[0] + '.jpg'
        mask_name = name + '.mat'
        for sent in query_dict[name]:
            samples.append((im_name, mask_name, sent))

    # save batches to disk; the index file is closed on every exit path,
    # including an exception raised while reading an image or mask
    num_batch = len(samples)
    with open('./referit/trainval_list.txt', 'w') as fp:
        for n_batch, (im_name, mask_name, sent) in enumerate(samples):
            print('saving batch %d / %d' % (n_batch + 1, num_batch))
            fp.write('%d\t%s%s\n' % (n_batch, im_dir, im_name))

            im = skimage.io.imread(im_dir + im_name)
            mask = load_gt_mask(mask_dir + mask_name).astype(np.float32)

            if 'train' in setname:
                im = skimage.img_as_ubyte(
                    im_processing.resize_and_pad(im, input_H, input_W))
                mask = im_processing.resize_and_pad(mask, input_H, input_W)
            if im.ndim == 2:
                # Grayscale image: replicate the single channel three times.
                im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

            text = text_processing.preprocess_sentence(sent, vocab_dict, T)
            np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
                     text_batch=text,
                     im_batch=im,
                     mask_batch=(mask > 0),
                     sent_batch=[sent])
def build_coco_batches(dataset, setname, input_H, input_W):
    """Write one ``.npz`` sample per referring sentence, with sentence encodings.

    All sentences of the split are encoded in a single ``encoder.encode``
    call; each saved file then holds the image name (``im_name_batch``), the
    sentence encoding (``encoding_batch``), the boolean object mask
    (``mask_batch``) and the raw sentence (``sent_batch``).

    Args:
        dataset: one of ``'Gref'``, ``'unc'`` or ``'unc+'``.
        setname: split name matched against each ref's ``'split'`` field.
            The mask is resized/padded only when it contains ``'train'``.
        input_H: target mask height for ``im_processing.resize_and_pad``.
        input_W: target mask width for ``im_processing.resize_and_pad``.

    Raises:
        ValueError: if ``dataset`` is not one of the supported keys.
    """
    im_dir = '/data/ryli/datasets/coco/images'
    im_type = 'train2014'
    vocab_file = '/data/ryli/rmi_phrasecut/data/vocabulary_Gref.txt'

    data_prefix = dataset + '_' + setname
    data_folder = './data/' + dataset + '/' + setname + '_batch/'
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    # dataset key -> (REFER dataset name, splitBy)
    refer_variants = {
        'Gref': ('refcocog', 'google'),
        'unc': ('refcoco', 'unc'),
        'unc+': ('refcoco+', 'unc'),
    }
    if dataset not in refer_variants:
        raise ValueError('Unknown dataset %s' % dataset)
    refer_name, split_by = refer_variants[dataset]
    refer = REFER('/data/ryli/rmi_phrasecut/external/refer/data',
                  dataset=refer_name, splitBy=split_by)

    # Keep only the refs that belong to the requested split.
    refs = []
    for ref_id in refer.Refs:
        candidate = refer.Refs[ref_id]
        if candidate['split'] == setname:
            refs.append(candidate)

    # Encode every sentence up front. The flattening order (refs, then their
    # sentences) is the same order the save loop below walks, so
    # encodings[n_batch] lines up with the n_batch-th saved sample.
    encoder = setup_encoder()
    sent_data = [
        sentence['sent'].decode('latin-1').strip()
        for ref in refs
        for sentence in ref['sentences']
    ]
    encodings = encoder.encode(sent_data)

    is_train = 'train' in setname
    out_stem = data_folder + data_prefix + '_'
    n_batch = 0
    for ref in refs:
        im_name = 'COCO_' + im_type + '_' + str(ref['image_id']).zfill(12)
        # The image is read only for its height/width, needed to decode the
        # segmentation; the pixels themselves are not stored.
        im = skimage.io.imread('%s/%s/%s.jpg' % (im_dir, im_type, im_name))

        seg = refer.Anns[ref['ann_id']]['segmentation']
        rle = cocomask.frPyObjects(seg, im.shape[0], im.shape[1])
        mask = np.max(cocomask.decode(rle), axis=2).astype(np.float32)
        if is_train:
            mask = im_processing.resize_and_pad(mask, input_H, input_W)

        for sentence in ref['sentences']:
            print('saving batch %d' % (n_batch + 1))
            sent = sentence['sent']
            np.savez(file=out_stem + str(n_batch) + '.npz',
                     im_name_batch=im_name,
                     encoding_batch=encodings[n_batch],
                     mask_batch=(mask > 0),
                     sent_batch=[sent])
            n_batch += 1
def preprocess_data(im, mask, sent, obj_id):
    """Turn one (image, colour mask, sentence, object id) into a sample dict.

    Relies on the module-level ``anchor_file``, ``object_color``, ``input_H``,
    ``input_W``, ``vocab_dict`` and ``T``.

    Args:
        im: image array handed to ``im_processing.resize_and_pad``.
        mask: colour-coded mask compared against ``object_color[obj_id]``.
        sent: referring sentence.
        obj_id: key into ``object_color`` selecting the target object.

    Returns:
        dict with keys ``text_batch``, ``im_batch``, ``mask_batch``,
        ``sent_batch``, ``label_bbox`` and ``true_bbox``.
    """
    anchors = io.read_anchors(anchor_file)

    # Binary mask of the requested object: channel 0 of the element-wise
    # comparison against the object's colour.
    target_color = object_color[obj_id]
    object_mask = np.asarray(((mask == target_color)[:, :, 0]))

    resized_im = skimage.img_as_ubyte(
        im_processing.resize_and_pad(im, input_H, input_W))
    resized_mask = im_processing.resize_and_pad(object_mask, input_H, input_W)

    boxes = im_processing.bboxes_from_masks(np.asarray(resized_mask))
    # Add columns 0-1 onto columns 2-3 in place (presumably converting
    # (x, y, w, h) into corner form; confirm against bboxes_from_masks).
    boxes[:, 2:4] += boxes[:, :2]
    label_bbox, true_bbox = processing_tools.preprocess_true_boxes(
        boxes, input_H, anchors)

    encoded = text_processing.preprocess_sentence(sent, vocab_dict, T)

    sample = {}
    sample['text_batch'] = np.asarray(encoded)
    sample['im_batch'] = np.asarray(resized_im)
    sample['mask_batch'] = (resized_mask > 0)
    sample['sent_batch'] = [sent]
    sample['label_bbox'] = label_bbox
    sample['true_bbox'] = true_bbox
    return sample
################################################################################
# Save training batches of N (sentence, image, label) samples to disk.
# The three buffers below are allocated once and overwritten for every batch.

text_seq_batch = np.zeros((T, N), dtype=np.int32)
imcrop_batch = np.zeros((N, input_H, input_W, 3), dtype=np.uint8)
# The builtin bool replaces the np.bool alias, which was removed in NumPy 1.24.
label_batch = np.zeros(N, dtype=bool)

if not os.path.isdir(data_folder):
    os.mkdir(data_folder)
for n_batch in range(num_batch):
    print('saving batch %d / %d' % (n_batch+1, num_batch))
    batch_begin = n_batch * N
    batch_end = (n_batch+1) * N
    for n_sample in range(batch_begin, batch_end):
        imname, description, label = shuffled_training_samples[n_sample]
        im = skimage.io.imread(image_dir + imname)

        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, input_H, input_W))
        if processed_im.ndim == 2:
            # Grayscale: the added trailing axis lets the (H, W, 1) image
            # broadcast over the 3 channels of imcrop_batch on assignment.
            processed_im = processed_im[:, :, np.newaxis]

        text_seq = text_processing.preprocess_sentence(description, vocab_dict, T)

        # Position of this sample inside the current batch.
        text_seq_batch[:, n_sample-batch_begin] = text_seq
        imcrop_batch[n_sample-batch_begin, ...] = processed_im
        label_batch[n_sample-batch_begin] = label

    np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
             text_seq_batch=text_seq_batch,
             imcrop_batch=imcrop_batch,
             label_batch=label_batch)
def test(iter, dataset, visualize, setname, dcrf, mu, tfmodel_folder,
         pre_emb=False, use_tree=False, neg_num=0.1):
    """Evaluate a saved LSCM snapshot on pre-built batches and print metrics.

    Prints precision at IoU thresholds 0.5-0.9, overall IoU (cumulative
    intersection over cumulative union) and mean IoU, and the same metrics
    after DenseCRF post-processing when ``dcrf`` is true.

    Args:
        iter: snapshot iteration; selects ``<dataset>_iter_<iter>.tfmodel``
            and names the visualization folder.
        dataset: dataset name; selects the batch folder, vocabulary size and
            embedding name.
        visualize: when true, call ``visualize_seg`` for every sample.
        setname: split name; selects ``./<dataset>/<setname>_batch/``.
        dcrf: when true, additionally evaluate DenseCRF-refined predictions.
        mu: value subtracted from the channel-reversed float image.
        tfmodel_folder: folder that holds the snapshot.
        pre_emb: build the model with ``emb_name`` (pretrained embeddings).
        use_tree: also feed ``graph_batch`` and ``height_batch`` to the model.
        neg_num: when different from 0.1, graph entries below 0.5 are
            overwritten with this value.
    """
    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    if visualize:
        save_dir = './' + dataset + '/visualization/' + str(iter) + '/'
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
    weights = os.path.join(tfmodel_folder,
                           dataset + '_iter_' + str(iter) + '.tfmodel')

    score_thresh = 1e-9
    eval_seg_iou_list = [.5, .6, .7, .8, .9]
    cum_I, cum_U = 0, 0
    mean_IoU, mean_dcrf_IoU = 0, 0
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    if dcrf:
        cum_I_dcrf, cum_U_dcrf = 0, 0
        seg_correct_dcrf = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.
    H, W = 320, 320
    vocab_size = 8226 if dataset == 'referit' else 21692
    emb_name = 'referit' if dataset == 'referit' else 'Gref'

    if pre_emb:
        # use pretrained embedding
        print("Use pretrained Embeddings.")
        model = LSCM_model(num_steps=30, H=H, W=W, mode='eval',
                           vocab_size=vocab_size, emb_name=emb_name)
    else:
        model = LSCM_model(num_steps=30, H=H, W=W, mode='eval',
                           vocab_size=vocab_size)

    # Load pretrained model
    snapshot_restorer = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    snapshot_restorer.restore(sess, weights)

    reader = data_reader.DataReader(data_folder, data_prefix, shuffle=False)
    NN = reader.num_batch
    # One progress tick per 1/50th of the data. Clamp to 1 so that fewer than
    # 50 batches does not produce a modulo by zero.
    tick = max(NN // 50, 1)
    for n_iter in range(NN):
        if n_iter % tick == 0:
            # Integer arithmetic keeps the printed digit an int ("3", not
            # "3.0") on Python 3 as well as Python 2.
            ticks_done = n_iter // tick
            if ticks_done % 5 == 0:
                sys.stdout.write(str(ticks_done // 5))
            else:
                sys.stdout.write('.')
            sys.stdout.flush()

        batch = reader.read_batch(is_log=False)
        text = batch['text_batch']
        im = batch['im_batch']
        mask = batch['mask_batch'].astype(np.float32)
        valid_idx = np.zeros([1], dtype=np.int32)
        graph = batch['graph_batch']
        height = batch['height_batch']
        # Index of the first non-zero token in the encoded sentence.
        for idx in range(text.shape[0]):
            if text[idx] != 0:
                valid_idx[0] = idx
                break
        if neg_num != 0.1:
            graph[graph < 0.5] = neg_num

        proc_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, H, W))
        proc_im_ = proc_im.astype(np.float32)
        proc_im_ = proc_im_[:, :, ::-1]  # reverse the channel order
        proc_im_ -= mu

        feed_dict = {
            model.words: np.expand_dims(text, axis=0),
            model.im: np.expand_dims(proc_im_, axis=0),
            model.valid_idx: np.expand_dims(valid_idx, axis=0)
        }
        if use_tree:
            feed_dict[model.graph_adj] = np.expand_dims(graph, axis=0)
            feed_dict[model.tree_height] = np.expand_dims(height, axis=0)
        scores_val, up_val, sigm_val = sess.run(
            [model.pred, model.up, model.sigm], feed_dict=feed_dict)

        up_val = np.squeeze(up_val)
        pred_raw = (up_val >= score_thresh).astype(np.float32)
        predicts = im_processing.resize_and_crop(pred_raw, mask.shape[0],
                                                 mask.shape[1])
        if dcrf:
            # Dense CRF post-processing
            sigm_val = np.squeeze(sigm_val)
            d = densecrf.DenseCRF2D(W, H, 2)
            U = np.expand_dims(-np.log(sigm_val), axis=0)
            U_ = np.expand_dims(-np.log(1 - sigm_val), axis=0)
            unary = np.concatenate((U_, U), axis=0)
            unary = unary.reshape((2, -1))
            d.setUnaryEnergy(unary)
            d.addPairwiseGaussian(sxy=3, compat=3)
            d.addPairwiseBilateral(sxy=20, srgb=3, rgbim=proc_im, compat=10)
            Q = d.inference(5)
            pred_raw_dcrf = np.argmax(Q, axis=0).reshape(
                (H, W)).astype(np.float32)
            predicts_dcrf = im_processing.resize_and_crop(
                pred_raw_dcrf, mask.shape[0], mask.shape[1])

        if visualize:
            sent = batch['sent_batch'][0]
            visualize_seg(im, mask, predicts, sent)
            if dcrf:
                visualize_seg(im, mask, predicts_dcrf, sent)

        I, U = eval_tools.compute_mask_IU(predicts, mask)
        # float() forces true division even for integer I and U on Python 2,
        # where I / U would otherwise floor to 0 or 1.
        iou = float(I) / U
        mean_IoU += iou
        cum_I += I
        cum_U += U
        for n_eval_iou, eval_seg_iou in enumerate(eval_seg_iou_list):
            seg_correct[n_eval_iou] += (iou >= eval_seg_iou)
        if dcrf:
            I_dcrf, U_dcrf = eval_tools.compute_mask_IU(predicts_dcrf, mask)
            iou_dcrf = float(I_dcrf) / U_dcrf
            mean_dcrf_IoU += iou_dcrf
            cum_I_dcrf += I_dcrf
            cum_U_dcrf += U_dcrf
            for n_eval_iou, eval_seg_iou in enumerate(eval_seg_iou_list):
                seg_correct_dcrf[n_eval_iou] += (iou_dcrf >= eval_seg_iou)
        seg_total += 1

    # Print results
    print('Segmentation evaluation (without DenseCRF):')
    result_str = ''
    for n_eval_iou in range(len(eval_seg_iou_list)):
        result_str += 'precision@%s = %f\n' % \
                      (str(eval_seg_iou_list[n_eval_iou]),
                       seg_correct[n_eval_iou] / seg_total)
    result_str += 'overall IoU = %f; mean IoU = %f\n' % (
        float(cum_I) / cum_U, mean_IoU / seg_total)
    print(result_str)
    if dcrf:
        print('Segmentation evaluation (with DenseCRF):')
        result_str = ''
        for n_eval_iou in range(len(eval_seg_iou_list)):
            result_str += 'precision@%s = %f\n' % \
                          (str(eval_seg_iou_list[n_eval_iou]),
                           seg_correct_dcrf[n_eval_iou] / seg_total)
        result_str += 'overall IoU = %f; mean IoU = %f\n' % (
            float(cum_I_dcrf) / cum_U_dcrf, mean_dcrf_IoU / seg_total)
        print(result_str)
def rmi_refvg_predictor(split='val', eval_img_count=-1,
                        out_path='output/eval_refvg/rmi',
                        model_iter=750000, dcrf=True, mu=the_mu):
    """Run a saved RMI snapshot over a RefVG split and collect packed masks.

    Args:
        split: split handed to ``RMIRefVGLoader``; also the output file stem.
        eval_img_count: when positive, appended to the output file name
            (``<split>_<count>.npy``). It is not used to limit the loop here.
        out_path: folder to save the predictions in; ``None`` skips saving.
        model_iter: snapshot iteration used to build the checkpoint path.
        dcrf: when true, the stored mask is the DenseCRF-refined prediction
            instead of the thresholded network output.
        mu: value subtracted from the channel-reversed float image.

    Returns:
        dict mapping ``img_id -> task_id -> {'pred_mask': packed_bits}``,
        where ``packed_bits`` is ``np.packbits`` of the boolean mask.
    """
    pretrained_model = './_rmi/refvg/tfmodel/refvg_resnet_RMI_iter_' + str(
        model_iter) + '.tfmodel'

    data_loader = RMIRefVGLoader(split=split)
    vocab_size = len(data_loader.vocab_dict)

    score_thresh = 1e-9
    H, W = 320, 320

    model = RMI_model(H=H, W=W, mode='eval', vocab_size=vocab_size,
                      weights='resnet')

    # Load pretrained model
    snapshot_restorer = tf.train.Saver()
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    snapshot_restorer.restore(sess, pretrained_model)

    predictions = dict()

    while not data_loader.is_end:
        img_id, task_id, im, mask, sent, text = data_loader.get_img_data(
            rand=False, is_train=False)
        mask = mask.astype(np.float32)

        proc_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, H, W))
        proc_im_ = proc_im.astype(np.float32)
        proc_im_ = proc_im_[:, :, ::-1]  # reverse the channel order
        proc_im_ -= mu

        scores_val, up_val, sigm_val = sess.run(
            [model.pred, model.up, model.sigm],
            feed_dict={
                model.words: np.expand_dims(text, axis=0),
                model.im: np.expand_dims(proc_im_, axis=0)
            })

        up_val = np.squeeze(up_val)
        pred_raw = (up_val >= score_thresh).astype(np.float32)
        predicts = im_processing.resize_and_crop(pred_raw, mask.shape[0],
                                                 mask.shape[1])
        pred_mask = predicts
        if dcrf:
            # Dense CRF post-processing
            sigm_val = np.squeeze(sigm_val)
            d = densecrf.DenseCRF2D(W, H, 2)
            U = np.expand_dims(-np.log(sigm_val), axis=0)
            U_ = np.expand_dims(-np.log(1 - sigm_val), axis=0)
            unary = np.concatenate((U_, U), axis=0)
            unary = unary.reshape((2, -1))
            d.setUnaryEnergy(unary)
            d.addPairwiseGaussian(sxy=3, compat=3)
            d.addPairwiseBilateral(sxy=20, srgb=3, rgbim=proc_im, compat=10)
            Q = d.inference(5)
            pred_raw_dcrf = np.argmax(Q, axis=0).reshape(
                (H, W)).astype(np.float32)
            predicts_dcrf = im_processing.resize_and_crop(
                pred_raw_dcrf, mask.shape[0], mask.shape[1])
            pred_mask = predicts_dcrf

        # The builtin bool replaces the np.bool alias, which was removed in
        # NumPy 1.24.
        pred_mask = np.packbits(pred_mask.astype(bool))
        predictions.setdefault(img_id, dict())[task_id] = {
            'pred_mask': pred_mask
        }
        # Single-argument print() gives the same space-separated output on
        # Python 2 and Python 3; the original print statement was Python 2 only.
        print('%s %s %s' % (data_loader.img_idx, img_id, task_id))

    if out_path is not None:
        print('rmi_refvg_predictor: saving predictions to %s ...' % out_path)
        if not os.path.exists(out_path):
            os.makedirs(out_path)
        fname = split
        if eval_img_count > 0:
            fname += '_%d' % eval_img_count
        fname += '.npy'
        f_path = os.path.join(out_path, fname)
        np.save(f_path, predictions)
    print('RMI refvg predictor done!')
    return predictions
# Save training batches of N (sentence, image, label) samples to disk.
# The buffers are allocated once and overwritten for every batch;
# text_seq_batch is expected to be allocated before this point.
imcrop_batch = np.zeros((N, input_H, input_W, 3), dtype=np.uint8)
# The builtin bool replaces the np.bool alias, which was removed in NumPy 1.24.
label_batch = np.zeros(N, dtype=bool)

if not os.path.isdir(data_folder):
    os.mkdir(data_folder)
for n_batch in range(num_batch):
    print('saving batch %d / %d' % (n_batch + 1, num_batch))
    batch_begin = n_batch * N
    batch_end = (n_batch + 1) * N
    for n_sample in range(batch_begin, batch_end):
        img_id, description, label = shuffled_training_samples[n_sample]

        # load image and get host url for image
        imname = coco.loadImgs(img_id)[0]['coco_url']
        im = skimage.io.imread(imname)

        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, input_H, input_W))
        if processed_im.ndim == 2:
            # Grayscale: the added trailing axis lets the (H, W, 1) image
            # broadcast over the 3 channels of imcrop_batch on assignment.
            processed_im = processed_im[:, :, np.newaxis]

        text_seq = text_processing.preprocess_sentence(description, vocab_dict, T)

        # Position of this sample inside the current batch.
        text_seq_batch[:, n_sample - batch_begin] = text_seq
        imcrop_batch[n_sample - batch_begin, ...] = processed_im
        label_batch[n_sample - batch_begin] = label

    np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
             text_seq_batch=text_seq_batch,
             imcrop_batch=imcrop_batch,
             label_batch=label_batch)
# Evaluation loop head: run the segmentation network on every (image, query)
# pair. The accumulators below are initialised here; they are not updated in
# this part of the loop.
cum_I, cum_U = 0, 0
eval_seg_iou_list = [.5, .6, .7, .8, .9]
seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
seg_total = 0
# Input buffers fed to the network; only column 0 of text_seq_val is written
# below.
text_seq_val = np.zeros((T, N), dtype=np.float32)
imcrop_val = np.zeros((N, input_H, input_W, 3), dtype=np.float32)
num_im = len(imlist)
for n_im in range(num_im):
    print('testing image %d / %d' % (n_im, num_im))
    imname = imlist[n_im]
    # Extract visual features from all proposals
    im = skimage.io.imread(image_dir + imname)
    processed_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, input_H, input_W))
    # Grayscale image: replicate the single channel three times.
    if processed_im.ndim == 2:
        processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))
    # The mean-subtracted image is broadcast into every slot of imcrop_val.
    imcrop_val[...] = processed_im.astype(np.float32) - segmodel.vgg_net.channel_mean
    for imcrop_name, _, description in flat_query_dict[imname]:
        mask = load_gt_mask(mask_dir + imcrop_name + '.mat').astype(np.float32)
        # Ground truth at the mask's own resolution and at the network input
        # resolution.
        labels = (mask > 0)
        processed_labels = im_processing.resize_and_pad(mask, input_H, input_W) > 0
        text_seq_val[:, 0] = text_processing.preprocess_sentence(description, vocab_dict, T)
        scores_val = sess.run(scores, feed_dict={
            text_seq_batch : text_seq_val,
            imcrop_batch : imcrop_val
        })
        scores_val = np.squeeze(scores_val)
def test(iter, dataset, visualize, setname, dcrf, mu, tfmodel_folder,
         model_name, pre_emb=False):
    """Run a saved segmentation snapshot on every frame of every expression.

    Videos, expressions and frame ids are read from the JSON file at
    ``args.meta``. No ground-truth mask is loaded here, so nothing is scored;
    the only outputs are the files written when ``visualize`` is true:

    * with ``dcrf``: the DenseCRF-refined 0/255 mask is written as an image
      to ``<args.visdir>/<vid>/<eid>/<fid>.png``;
    * without ``dcrf``: the raw ``model.sigm`` output is saved to
      ``<args.maskdir>/<vid>/<eid>/<fid>.npy``.

    Frames whose ``.png`` already exists are skipped.

    Args:
        iter: snapshot iteration; selects ``<dataset>_iter_<iter>.tfmodel``.
        dataset: dataset name; selects vocabulary size and embedding name.
        visualize: when true, write the per-frame outputs described above.
        setname: unused here; kept for interface compatibility.
        dcrf: when true, refine predictions with DenseCRF.
        mu: value subtracted from the float image.
        tfmodel_folder: folder that holds the snapshot.
        model_name: name handed to ``get_segmentation_model``.
        pre_emb: build the model with pretrained embeddings from
            ``args.embdir``.
    """
    if visualize:
        save_dir = './' + dataset + '/visualization/' + str(iter) + '/'
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
    weights = os.path.join(tfmodel_folder,
                           dataset + '_iter_' + str(iter) + '.tfmodel')
    print("Loading trained weights from {}".format(weights))

    T = 20  # truncated long sentence
    H, W = 320, 320
    vocab_size = 8803 if dataset == 'referit' else 12112
    emb_name = 'referit' if dataset == 'referit' else 'Gref'
    vocab_file = './data/vocabulary_Gref.txt'
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
    # Probabilities are clipped to [eps, 1 - eps] before taking logs.
    eps = 1e-7

    if pre_emb:
        # use pretrained embedding
        print("Use pretrained Embeddings.")
        model = get_segmentation_model(model_name, H=H, W=W, mode='eval',
                                       vocab_size=vocab_size,
                                       emb_name=emb_name,
                                       emb_dir=args.embdir)
    else:
        model = get_segmentation_model(model_name, H=H, W=W, mode='eval',
                                       vocab_size=vocab_size)

    # Load pretrained model
    snapshot_restorer = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    snapshot_restorer.restore(sess, weights)

    with open(args.meta) as meta_file:
        meta_expression = json.load(meta_file)
    videos = meta_expression['videos']

    # Videos are processed in reverse order of the metadata file.
    for vid_ind, vid in reversed(list(enumerate(videos.keys()))):
        print("Running on video {}/{}".format(vid_ind + 1, len(videos.keys())))
        expressions = videos[vid]['expressions']
        frame_ids = videos[vid]['frames']
        for eid in expressions:
            exp = expressions[eid]['exp']
            index = int(eid)
            vis_dir = os.path.join(args.visdir,
                                   str('{}/{}/'.format(vid, index)))
            mask_dir = os.path.join(args.maskdir,
                                    str('{}/{}/'.format(vid, index)))
            if not os.path.exists(vis_dir):
                os.makedirs(vis_dir)
            if not os.path.exists(mask_dir):
                os.makedirs(mask_dir)

            # Process text
            text = np.array(
                text_processing.preprocess_sentence(exp, vocab_dict, T))
            valid_idx = np.zeros([1], dtype=np.int32)
            # Index of the first non-zero token in the encoded sentence.
            for idx in range(text.shape[0]):
                if text[idx] != 0:
                    valid_idx[0] = idx
                    break

            for fid in frame_ids:
                vis_path = os.path.join(vis_dir, str('{}.png'.format(fid)))
                mask_path = os.path.join(mask_dir, str('{}.npy'.format(fid)))
                if os.path.exists(vis_path):
                    continue
                frame = load_frame_from_id(vid, fid)
                if frame is None:
                    continue

                proc_im = skimage.img_as_ubyte(
                    im_processing.resize_and_pad(frame, H, W))
                proc_im_ = proc_im.astype(np.float32)
                proc_im_ -= mu

                _scores_val, _up_val, sigm_val = sess.run(
                    [model.pred, model.up, model.sigm],
                    feed_dict={
                        model.words: np.expand_dims(text, axis=0),
                        model.im: np.expand_dims(proc_im_, axis=0),
                        model.valid_idx: np.expand_dims(valid_idx, axis=0)
                    })

                if dcrf:
                    # Dense CRF post-processing.
                    # Clip instead of adding an offset: with "+ 1e-7" a
                    # saturated probability of 1.0 made 1 - p negative and
                    # -log(1 - p) NaN.
                    prob = np.clip(np.squeeze(sigm_val), eps, 1 - eps)
                    d = densecrf.DenseCRF2D(W, H, 2)
                    U = np.expand_dims(-np.log(prob), axis=0)
                    U_ = np.expand_dims(-np.log(1 - prob), axis=0)
                    unary = np.concatenate((U_, U), axis=0)
                    unary = unary.reshape((2, -1))
                    d.setUnaryEnergy(unary)
                    d.addPairwiseGaussian(sxy=3, compat=3)
                    d.addPairwiseBilateral(sxy=20, srgb=3, rgbim=proc_im,
                                           compat=10)
                    Q = d.inference(5)
                    pred_raw_dcrf = np.argmax(Q, axis=0).reshape(
                        (H, W)).astype('uint8') * 255

                if visualize:
                    if dcrf:
                        cv2.imwrite(vis_path, pred_raw_dcrf)
                    else:
                        np.save(mask_path, np.array(sigm_val))
def build_a2d_batches(T, input_H, input_W, video=False):
    """Build the per-frame ``.npz`` samples of the A2D Sentences dataset.

    One sample is written for every annotated frame of every
    (video, instance, sentence) row of the annotation file, except frames
    whose ground-truth mask is all zeros. Train samples are resized and
    padded to ``input_H`` x ``input_W``; test samples are left untouched.

    Args:
        T: limit of number of words
        input_H: height of input frame of I3D backbone
        input_W: width of input frame of I3D backbone
        video: select consecutive frames or standalone frame
    """
    query_file = os.path.join(a2d_dir, 'a2d_annotation.txt')
    frame_dir = os.path.join(a2d_dir, 'Release/frames')
    vocab_file = os.path.join(root_dir, 'data/vocabulary_Gref.txt')

    dataset_name = 'a2d_sent_new'
    out_dataset_dir = os.path.join(root_dir, dataset_name)
    if not os.path.exists(out_dataset_dir):
        os.mkdir(out_dataset_dir)
    test_batch = os.path.join(out_dataset_dir, 'test_batch')
    train_batch = os.path.join(out_dataset_dir, 'train_batch')
    if not os.path.exists(test_batch):
        os.mkdir(test_batch)
    if not os.path.exists(train_batch):
        os.mkdir(train_batch)

    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
    test_prefix_list = []
    train_prefix_list = []
    split_dict = gen_split_dict()
    SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')

    with open(query_file, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip the first row of the annotation file
        total_count = 0
        test_count = 0
        train_count = 0
        all_zero_mask_count = 0
        for row in tqdm(reader):
            # each video belongs to test or train
            video_id = row[0]
            data_prefix = video_id
            is_test = split_dict[data_prefix] == 1
            if is_test:
                save_dir = test_batch
                test_prefix_list.append(data_prefix)
            else:
                save_dir = train_batch
                train_prefix_list.append(data_prefix)

            # load sentence
            instance_id = int(row[1])
            sent = row[2].lower()
            tokens = [
                tok for tok in SENTENCE_SPLIT_REGEX.split(sent.strip())
                if len(tok.strip()) > 0
            ]
            # remove the final full stop and restrict sentence within T words
            if tokens[-1] == '.':
                tokens = tokens[:-1]
            tokens = tokens[:T]
            n_sent = ' '.join(tokens).strip()
            n_sent = n_sent.encode('utf-8').decode("utf-8")
            text = text_processing.preprocess_sentence(n_sent, vocab_dict, T)

            # for each video, get all the gt masks of a certain instance
            masks, frame_ids = get_masks(video_id, instance_id)
            image_paths = [
                os.path.join(frame_dir, video_id, '{:0>5d}.png'.format(fid))
                for fid in frame_ids
            ]

            for frame_id, image_path, mask in zip(frame_ids, image_paths,
                                                  masks):
                # abandon all zero mask batch
                if np.sum(mask) == 0:
                    print("all zeros mask caught")
                    all_zero_mask_count += 1
                    continue

                if video:
                    # obtain 16 consecutive frames centered at the gt frame
                    frame_paths = frame_range(
                        frame_id=frame_id,
                        frame_dir=os.path.join(frame_dir, video_id))
                else:
                    # only use the gt frame
                    frame_paths = []

                frames = []
                if is_test:
                    # Test samples keep their original resolution.
                    count = test_count
                    test_count += 1
                    prefix = 'test_'
                    image = skimage.io.imread(image_path)
                    for frame_path in frame_paths:
                        frames.append(skimage.io.imread(frame_path))
                else:
                    # Train samples are resized and padded to the input size.
                    prefix = 'train_'
                    count = train_count
                    train_count += 1
                    image = skimage.img_as_ubyte(
                        im_processing.resize_and_pad(
                            skimage.io.imread(image_path), input_H, input_W))
                    mask = im_processing.resize_and_pad(mask, input_H, input_W)
                    for frame_path in frame_paths:
                        raw_frame = skimage.io.imread(frame_path)
                        frames.append(skimage.img_as_ubyte(
                            im_processing.resize_and_pad(
                                raw_frame, input_H, input_W)))

                if debug:
                    # Dump the image with everything outside the mask zeroed.
                    m0 = (mask[:, :, np.newaxis] > 0).astype(np.uint8)
                    m0 = np.concatenate([m0, m0, m0], axis=2)
                    debug_image = image * m0
                    skimage.io.imsave(
                        './debug/{}_{}_{}.png'.format(data_prefix, frame_id,
                                                      sent.replace(' ', '_')),
                        debug_image)

                # save batches
                out_file = os.path.join(
                    save_dir, dataset_name + '_' + prefix + str(count))
                np.savez(file=out_file,
                         text_batch=text,
                         mask_batch=(mask > 0),
                         sent_batch=[sent],
                         im_batch=image,
                         frame_id=frame_id,
                         frames=frames)
                total_count += 1

    print()
    print("num of all zeros masks is: {}".format(all_zero_mask_count))
label_coarse_batch = np.zeros((N, featmap_H, featmap_W, 1), dtype=np.bool) label_fine_batch = np.zeros((N, input_H, input_W, 1), dtype=np.bool) if not os.path.isdir(data_folder): os.mkdir(data_folder) for n_batch in range(num_batch): print('saving batch %d / %d' % (n_batch + 1, num_batch)) batch_begin = n_batch * N batch_end = (n_batch + 1) * N for n_sample in range(batch_begin, batch_end): imname, mask_name, description = shuffled_training_samples[n_sample] im = skimage.io.imread(image_dir + imname) mask = load_gt_mask(mask_dir + mask_name).astype(np.float32) processed_im = skimage.img_as_ubyte( im_processing.resize_and_pad(im, input_H, input_W)) if processed_im.ndim == 2: processed_im = processed_im[:, :, np.newaxis] processed_mask = im_processing.resize_and_pad(mask, input_H, input_W) subsampled_mask = skimage.transform.downscale_local_mean( processed_mask, (8, 8)) labels_fine = (processed_mask > 0) labels_coarse = (subsampled_mask > 0) text_seq = text_processing.preprocess_sentence(description, vocab_dict, T) text_seq_batch[:, n_sample - batch_begin] = text_seq imcrop_batch[n_sample - batch_begin, ...] = processed_im label_coarse_batch[n_sample - batch_begin,
def build_refvos_batch(setname, T, input_H, input_W, im_dir, mask_dir,
                       meta_expressions, save_dir, inrange=None):
    """Write one ``.npz`` sample per (expression, frame) pair of a Ref-VOS set.

    Samples are enumerated over videos, their expressions and their frames in
    the order of the metadata file. A sample is skipped when its image or
    mask file is missing, or when the target object's colour does not occur
    in the mask. File names use the sample's index in that enumeration, so
    skipped samples leave gaps in the numbering.

    Args:
        setname: split name; image and mask are resized/padded only when it
            contains ``'train'``.
        T: sentence length passed to ``text_processing.preprocess_sentence``.
        input_H: target height for ``im_processing.resize_and_pad``.
        input_W: target width for ``im_processing.resize_and_pad``.
        im_dir: root folder of the ``<vid>/<fid>.jpg`` frames.
        mask_dir: root folder of the ``<vid>/<fid>.png`` colour masks.
        meta_expressions: path of the JSON file with the ``'videos'`` table.
        save_dir: root folder; batches go to ``<save_dir>/refvos/<setname>_batch/``.
        inrange: optional iterable of sample indices to process; defaults to
            all samples.
    """
    vocab_file = './data/vocabulary_Gref.txt'

    print(save_dir)
    # saving directory
    data_folder = os.path.join(save_dir, 'refvos/' + setname + '_batch/')
    data_prefix = 'refvos_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    # load annotations; the context manager closes the file
    with open(meta_expressions) as meta_fp:
        query_dict = json.load(meta_fp)
    videos = query_dict['videos']

    samples = []
    for vid in videos:
        video = videos[vid]
        expressions = video['expressions']
        frames = video['frames']
        for eid in expressions:
            exp = expressions[eid]['exp']
            obj_id = expressions[eid]['obj_id']
            for fid in frames:
                im_name = os.path.join(vid, fid + '.jpg')
                mask_name = os.path.join(vid, fid + '.png')
                samples.append((im_name, mask_name, exp, obj_id))

    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    # save batches to disk
    num_batch = len(samples)
    if inrange is None:
        inrange = range(num_batch)
    for n_batch in inrange:
        print('saving batch %d / %d' % (n_batch + 1, num_batch))
        im_name, mask_name, sent, obj_id = samples[n_batch]
        im_path = os.path.join(im_dir, im_name)
        mask_path = os.path.join(mask_dir, mask_name)
        if not (os.path.exists(im_path) and os.path.exists(mask_path)):
            continue

        im = skimage.io.imread(im_path)
        # Drop a possible alpha channel of the colour-coded annotation.
        color_mask = skimage.io.imread(mask_path)[:, :, :3]
        mask_color = object_color[obj_id]
        mask_obj = np.asarray((color_mask == mask_color))
        if len(mask_obj.shape) == 0:
            # The comparison collapsed to a scalar instead of an
            # element-wise array.
            continue
        mask_obj = mask_obj[:, :, 0]
        if np.max(mask_obj) == 0:
            # Target object is not visible in this frame.
            print(im_name)
            continue

        if 'train' in setname:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask_obj = im_processing.resize_and_pad(mask_obj, input_H, input_W)
        if im.ndim == 2:
            # Grayscale image: replicate the single channel three times.
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        text = text_processing.preprocess_sentence(sent, vocab_dict, T)

        # Always store the selected object's mask. Previously non-train
        # splits stored (raw RGB annotation > 0), i.e. a 3-channel mask of
        # every annotated object.
        np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
                 text_batch=text,
                 im_batch=im,
                 mask_batch=(mask_obj > 0),
                 sent_batch=[sent])
def inference():
    """Evaluate the Caffe segmentation model described by ``test_config``.

    Writes the ``'val'`` network definition to ``./seg_model/test.prototxt``,
    loads ``test_config.pretrained_model`` on GPU ``test_config.gpu_id`` and
    runs every (image crop, description) pair from ``test_config.query_file``
    through the network.  Each prediction is thresholded at
    ``test_config.score_thresh``, resized back to the source image size and
    compared with the ground-truth mask.  Precision at IoU 0.5-0.9 and the
    overall (cumulative) IoU are printed; nothing is returned.
    """
    with open('./seg_model/test.prototxt', 'w') as f:
        f.write(str(seg_model.generate_model('val', test_config.N)))

    caffe.set_device(test_config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    net = caffe.Net('./seg_model/test.prototxt',
                    test_config.pretrained_model,
                    caffe.TEST)

    ############################################################################
    # Load annotations and bounding box proposals
    ############################################################################
    # Context managers close each handle as soon as the file is parsed.
    # ``test_config.imsize_file`` is no longer read: its contents were never
    # used, the image size is taken from the loaded image below.
    with open(test_config.query_file) as fp:
        query_dict = json.load(fp)
    with open(test_config.bbox_file) as fp:
        bbox_dict = json.load(fp)
    with open(test_config.imcrop_file) as fp:
        imcrop_dict = json.load(fp)
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(
        test_config.vocab_file)

    ############################################################################
    # Flatten the annotations
    ############################################################################
    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        for imcrop_name in imcrop_dict[imname]:
            # Check for queries first so that crops without any description
            # do not need a bounding-box entry.
            if imcrop_name not in query_dict:
                continue
            gt_bbox = bbox_dict[imcrop_name]
            for description in query_dict[imcrop_name]:
                flat_query_dict[imname].append(
                    (imcrop_name, gt_bbox, description))

    ############################################################################
    # Testing
    ############################################################################
    cum_I, cum_U = 0.0, 0.0
    eval_seg_iou_list = [0.5, 0.6, 0.7, 0.8, 0.9]
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.0

    # Pre-allocate arrays
    imcrop_val = np.zeros((test_config.N, test_config.input_H,
                           test_config.input_W, 3), dtype=np.float32)
    text_seq_val = np.zeros((test_config.T, test_config.N), dtype=np.int32)

    # Spatial features: every argument is a configuration constant, so the
    # batch is built once instead of once per image.
    spatial_val = processing_tools.generate_spatial_batch(
        test_config.N, test_config.featmap_H, test_config.featmap_W)
    spatial_val = spatial_val.transpose((0, 3, 1, 2))

    for imname in tqdm(imlist):
        # Extract visual features from all proposals
        im = skimage.io.imread(test_config.image_dir + imname)
        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, test_config.input_H,
                                         test_config.input_W))
        if processed_im.ndim == 2:
            # replicate a grayscale image into three channels
            processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))

        imcrop_val[...] = (processed_im.astype(np.float32)
                           - seg_model.channel_mean)
        # move the channel axis in front of the two spatial axes
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        for imcrop_name, _, description in flat_query_dict[imname]:
            mask = load_gt_mask(
                test_config.mask_dir + imcrop_name + '.mat').astype(np.float32)
            labels = (mask > 0)
            processed_labels = im_processing.resize_and_pad(
                mask, test_config.input_H, test_config.input_W) > 0

            # only column 0 of the text batch is filled with the query
            text_seq_val[:, 0] = text_processing.preprocess_sentence(
                description, vocab_dict, test_config.T)
            cont_val = text_processing.create_cont(text_seq_val)

            net.blobs['language'].data[...] = text_seq_val
            net.blobs['cont'].data[...] = cont_val
            net.blobs['image'].data[...] = imcrop_val_trans
            net.blobs['spatial'].data[...] = spatial_val
            net.blobs['label'].data[...] = processed_labels

            net.forward()

            upscores = np.squeeze(net.blobs['upscores'].data[...].copy())

            # Evaluate the segmentation performance of using bounding box
            # segmentation
            pred_raw = (upscores >= test_config.score_thresh).astype(
                np.float32)
            predicts = im_processing.resize_and_crop(
                pred_raw, im.shape[0], im.shape[1])
            I, U = eval_tools.compute_mask_IU(predicts, labels)
            cum_I += I
            cum_U += U
            this_IoU = I / float(U)
            for n_eval_iou, eval_seg_iou in enumerate(eval_seg_iou_list):
                seg_correct[n_eval_iou] += (this_IoU >= eval_seg_iou)
            seg_total += 1

    # Print results
    print('Final results on the whole test set')
    result_str = ''
    for n_eval_iou in range(len(eval_seg_iou_list)):
        result_str += 'precision@%s = %f\n' % \
            (str(eval_seg_iou_list[n_eval_iou]),
             seg_correct[n_eval_iou] / seg_total)
    result_str += 'overall IoU = %f\n' % (cum_I / cum_U)
    print(result_str)
for fi in range(start_frame_id, num_frames + start_frame_id): im_file = '/home/zhenyang/Workspace/data/drones/frames/' + video + '/%05d.jpg' % ( fi, ) ############################### # Run on the input image and query text text_seq_val = np.zeros((config.T, config.N), dtype=np.float32) imcrop_val = np.zeros((config.N, config.input_H, config.input_W, 3), dtype=np.float32) # Preprocess image and text im = skimage.io.imread(im_file) #im_, pad_h, pad_w, scale = resize_and_pad(im, config.input_H, config.input_W) #processed_im = skimage.img_as_ubyte(im_) processed_im = skimage.img_as_ubyte( im_processing.resize_and_pad(im, config.input_H, config.input_W)) if processed_im.ndim == 2: processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3)) imcrop_val[0, :] = processed_im.astype( np.float32) - segmodel.channel_mean imcrop_val = imcrop_val.transpose((0, 3, 1, 2)) imcrop_val = imcrop_val[:, ::-1, :, :] spatial_val = processing_tools.generate_spatial_batch( config.N, config.featmap_H, config.featmap_W) spatial_val = spatial_val.transpose((0, 3, 1, 2)) text_seq_val[:, 0] = text_processing.preprocess_sentence( query, vocab_dict, config.T) cont_val = text_processing.create_cont(text_seq_val)
def test(modelname, iter, dataset, weights, setname, dcrf, mu, tfmodel_folder):
    """Evaluate a released CMSA checkpoint on the saved batches of a split.

    Batches are read from ``./<dataset>/<setname>_batch/``.  Each image is
    resized/padded to 320x320, channel-reversed, mean-subtracted with ``mu``
    and fed to the model together with its tokenised query.  The up-sampled
    score map is thresholded, resized back to the mask size and compared
    with the ground truth.  Precision at IoU 0.5-0.9, overall (cumulative)
    IoU and mean IoU are printed, once without and, if ``dcrf`` is true,
    once with DenseCRF post-processing.  Nothing is returned.

    Args:
        modelname: Used to build the checkpoint file name.
        iter: Unused; kept for interface compatibility.
        dataset: Dataset name; selects folders and the vocabulary size.
        weights: Forwarded to ``CMSA_model``.
        setname: Split name, part of the batch folder and file prefix.
        dcrf: Whether to additionally evaluate DenseCRF-refined masks.
        mu: Value subtracted from the channel-reversed float image.
        tfmodel_folder: Currently ignored (see the note below).
    """
    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    # NOTE(review): the ``tfmodel_folder`` argument is overwritten here, so
    # the caller's value is never used.  Left unchanged so that the same
    # checkpoint as before is loaded.
    tfmodel_folder = './' + dataset + '/tfmodel/CMSA'
    pretrained_model = os.path.join(
        tfmodel_folder, dataset + '_' + modelname + '_release' + '.tfmodel')

    score_thresh = 1e-9
    eval_seg_iou_list = [.5, .6, .7, .8, .9]
    cum_I, cum_U = 0, 0
    mean_IoU, mean_dcrf_IoU = 0, 0
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    if dcrf:
        cum_I_dcrf, cum_U_dcrf = 0, 0
        seg_correct_dcrf = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.
    H, W = 320, 320
    vocab_size = 8803 if dataset == 'referit' else 12112

    model = CMSA_model(H=H, W=W, mode='eval',
                       vocab_size=vocab_size, weights=weights)

    # Load pretrained model
    snapshot_restorer = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    snapshot_restorer.restore(sess, pretrained_model)

    reader = data_reader.DataReader(data_folder, data_prefix, shuffle=False)

    NN = reader.num_batch
    # One progress tick per ~2% of the batches.  Clamped to at least 1 so
    # that fewer than 50 batches no longer cause a modulo by zero.
    tick_every = max(NN // 50, 1)
    print('test in', dataset, setname)
    for n_iter in range(NN):
        if n_iter % tick_every == 0:
            # integer arithmetic keeps the printed digit free of a '.0'
            tick = n_iter // tick_every
            if tick % 5 == 0:
                sys.stdout.write(str(tick // 5))
            else:
                sys.stdout.write('.')
            sys.stdout.flush()

        batch = reader.read_batch(is_log=False)
        text = batch['text_batch']
        im = batch['im_batch']
        mask = batch['mask_batch'].astype(np.float32)

        proc_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, H, W))
        proc_im_ = proc_im.astype(np.float32)
        proc_im_ = proc_im_[:, :, ::-1]  # reverse the channel order
        proc_im_ -= mu

        _, up_val, sigm_val = sess.run(
            [model.pred, model.up, model.sigm],
            feed_dict={
                model.words: np.expand_dims(text, axis=0),
                model.im: np.expand_dims(proc_im_, axis=0)
            })

        up_val = np.squeeze(up_val)
        pred_raw = (up_val >= score_thresh).astype(np.float32)
        predicts = im_processing.resize_and_crop(
            pred_raw, mask.shape[0], mask.shape[1])

        if dcrf:
            # Dense CRF post-processing
            sigm_val = np.squeeze(sigm_val)
            d = densecrf.DenseCRF2D(W, H, 2)
            # unary energies: label 1 uses -log(p), label 0 uses -log(1 - p)
            U = np.expand_dims(-np.log(sigm_val), axis=0)
            U_ = np.expand_dims(-np.log(1 - sigm_val), axis=0)
            unary = np.concatenate((U_, U), axis=0)
            unary = unary.reshape((2, -1))
            d.setUnaryEnergy(unary)
            d.addPairwiseGaussian(sxy=3, compat=3)
            d.addPairwiseBilateral(sxy=20, srgb=3, rgbim=proc_im, compat=10)
            Q = d.inference(5)
            pred_raw_dcrf = np.argmax(Q, axis=0).reshape(
                (H, W)).astype(np.float32)
            predicts_dcrf = im_processing.resize_and_crop(
                pred_raw_dcrf, mask.shape[0], mask.shape[1])

        I, U = eval_tools.compute_mask_IU(predicts, mask)
        # float() makes the ratio a true division regardless of whether the
        # module enables ``from __future__ import division``
        this_IoU = float(I) / U
        mean_IoU += this_IoU
        cum_I += I
        cum_U += U
        for n_eval_iou, eval_seg_iou in enumerate(eval_seg_iou_list):
            seg_correct[n_eval_iou] += (this_IoU >= eval_seg_iou)

        if dcrf:
            I_dcrf, U_dcrf = eval_tools.compute_mask_IU(predicts_dcrf, mask)
            this_IoU_dcrf = float(I_dcrf) / U_dcrf
            mean_dcrf_IoU += this_IoU_dcrf
            cum_I_dcrf += I_dcrf
            cum_U_dcrf += U_dcrf
            for n_eval_iou, eval_seg_iou in enumerate(eval_seg_iou_list):
                seg_correct_dcrf[n_eval_iou] += (this_IoU_dcrf >= eval_seg_iou)

        seg_total += 1

    # Print results
    print('Segmentation evaluation (without DenseCRF):')
    result_str = ''
    for n_eval_iou in range(len(eval_seg_iou_list)):
        result_str += 'precision@%s = %f\n' % \
            (str(eval_seg_iou_list[n_eval_iou]),
             seg_correct[n_eval_iou] / seg_total)
    result_str += 'overall IoU = %f; mean IoU = %f\n' % (
        float(cum_I) / cum_U, mean_IoU / seg_total)
    print(result_str)

    if dcrf:
        print('Segmentation evaluation (with DenseCRF):')
        result_str = ''
        for n_eval_iou in range(len(eval_seg_iou_list)):
            result_str += 'precision@%s = %f\n' % \
                (str(eval_seg_iou_list[n_eval_iou]),
                 seg_correct_dcrf[n_eval_iou] / seg_total)
        result_str += 'overall IoU = %f; mean IoU = %f\n' % (
            float(cum_I_dcrf) / cum_U_dcrf, mean_dcrf_IoU / seg_total)
        print(result_str)

    # release the resources held by the session
    sess.close()
eval_seg_iou_list = [.5, .6, .7, .8, .9] seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32) seg_total = 0 text_seq_val = np.zeros((T, N), dtype=np.float32) imcrop_val = np.zeros((N, input_H, input_W, 3), dtype=np.float32) num_im = len(imlist) for n_im in range(num_im): print('testing image %d / %d' % (n_im, num_im)) imname = imlist[n_im] # Extract visual features from all proposals im = skimage.io.imread(image_dir + imname) processed_im = skimage.img_as_ubyte( im_processing.resize_and_pad(im, input_H, input_W)) if processed_im.ndim == 2: processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3)) imcrop_val[...] = processed_im.astype( np.float32) - segmodel.vgg_net.channel_mean for imcrop_name, _, description in flat_query_dict[imname]: mask = load_gt_mask(mask_dir + imcrop_name + '.mat').astype(np.float32) labels = (mask > 0) processed_labels = im_processing.resize_and_pad( mask, input_H, input_W) > 0 text_seq_val[:, 0] = text_processing.preprocess_sentence( description, vocab_dict, T) scores_val = sess.run(scores, feed_dict={
def test(iter, dataset, visualize, setname, dcrf, mu, tfmodel_path, model_name, pre_emb=False):
    """Save per-frame prediction figures for a fixed list of videos.

    Restores the checkpoint at ``tfmodel_path`` into the model returned by
    ``get_segmentation_model`` and, for every expression of every video in
    the hard-coded ``sorted_video_key`` list, runs the model on every frame
    whose integer id is a multiple of 20.  For each such frame a three-panel
    matplotlib figure (frame, ``up_c4`` map, ``sigm`` map) is saved as
    ``<args.visdir>/<vid>_<eid>_<fid>.png``.

    NOTE(review): this function reads a global ``args`` (``args.embdir``,
    ``args.meta``, ``args.visdir``) and a name ``consitency_score`` that is
    not assigned anywhere in this function -- confirm both exist at module
    level, otherwise the ``plt.text`` call raises ``NameError``.

    NOTE(review): the IoU accumulators initialised below are never updated,
    because the evaluation code that used them is disabled; only
    ``seg_total`` is incremented.  Nothing is returned.
    """
    # data_folder / data_prefix are not used further down in this function
    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    if visualize:
        # the directory is created, but figures are written to args.visdir
        save_dir = './' + dataset + '/visualization/' + str(iter) + '/'
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
    # os.path.join with a single argument returns it unchanged
    weights = os.path.join(tfmodel_path)
    print("Loading trained weights from {}".format(weights))

    score_thresh = 1e-9
    eval_seg_iou_list = [.5, .6, .7, .8, .9]
    cum_I, cum_U = 0, 0
    mean_IoU, mean_dcrf_IoU = 0, 0
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    if dcrf:
        cum_I_dcrf, cum_U_dcrf = 0, 0
        seg_correct_dcrf = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.
    T = 20  # truncated long sentence
    H, W = 320, 320
    vocab_size = 8803 if dataset == 'referit' else 12112
    emb_name = 'referit' if dataset == 'referit' else 'refvos'
    vocab_file = './data/vocabulary_refvos.txt'
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
    IU_result = list()

    if pre_emb:
        # use pretrained embedding
        print("Use pretrained Embeddings.")
        model = get_segmentation_model(model_name, H=H, W=W,
                                       mode='eval',
                                       vocab_size=vocab_size,
                                       emb_name=emb_name,
                                       emb_dir=args.embdir)
    else:
        model = get_segmentation_model(model_name, H=H, W=W,
                                       mode='eval',
                                       vocab_size=vocab_size)

    # Load pretrained model
    snapshot_restorer = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    snapshot_restorer.restore(sess, weights)

    meta_expression = {}
    with open(args.meta) as meta_file:
        meta_expression = json.load(meta_file)
    videos = meta_expression['videos']
    plt.figure(figsize=[15, 4])
    # Fixed processing order of the video ids; every id is looked up in
    # `videos` below, so each one must be present in the meta file.
    sorted_video_key = ['a9f23c9150', '6cc8bce61a', '03fe6115d4', 'a46012c642', 'c42fdedcdd',
                        'ee9415c553', '7daa6343e6', '4fe6619a47', '0e8a6b63bb', '65e0640a2a',
                        '8939473ea7', 'b05faf54f7', '5d2020eff8', 'a00c3fa88e', '44e5d1a969',
                        'deed0ab4fc', 'b205d868e6', '48d2909d9e', 'c9ef04fe59', '1e20ceafae',
                        '0f3f8b2b2f', 'b83923fd72', 'cb06f84b6e', '17cba76927', '35d5e5149d',
                        '62bf7630b3', '0390fabe58', 'bf2d38aefe', '8b7b57b94d', '8d803e87f7',
                        'c16d9a4ade', '1a1dbe153e', 'd975e5f4a9', '226f1e10f7', '6cb5b08d93',
                        '77df215672', '466734bc5c', '94fa9bd3b5', 'f2a45acf1c', 'ba8823f2d2',
                        '06cd94d38d', 'b772ac822a', '246e38963b', 'b5514f75d8', '188cb4e03d',
                        '3dd327ab4e', '8e2e5af6a8', '450bd2e238', '369919ef49', 'a4bce691c6',
                        '64c6f2ed76', '0782a6df7e', '0062f687f1', 'c74fc37224', 'f7255a57d0',
                        '4f5b3310e3', 'e027ebc228', '30fe0ed0ce', '6a75316e99', 'a2948d4116',
                        '8273b59141', 'abae1ce57d', '621487be65', '45dc90f558', '9787f452bf',
                        'cdcfd9f93a', '4f6662e4e0', '853ca85618', '13ca7bbcfd', 'f143fede6f',
                        '92fde455eb', '0b0c90e21a', '5460cc540a', '182dbfd6ba', '85968ae408',
                        '541ccb0844', '43115c42b2', '65350fd60a', 'eb49ce8027', 'e11254d3b9',
                        '20a93b4c54', 'a0fc95d8fc', '696e01387c', 'fef7e84268', '72d613f21a',
                        '8c60938d92', '975be70866', '13c3cea202', '4ee0105885', '01c88b5b60',
                        '33e8066265', '8dea7458de', 'c280d21988', 'fd8cf868b2', '35948a7fca',
                        'e10236eb37', 'a1251195e7', 'b2256e265c', '2b904b76c9', '1ab5f4bbc5',
                        '47d01d34c8', 'd7a38bf258', '1a609fa7ee', '218ac81c2d', '9f16d17e42',
                        'fb104c286f', 'eb263ef128', '37b4ec2e1a', '0daaddc9da', 'cd69993923',
                        '31d3a7d2ee', '60362df585', 'd7ff44ea97', '623d24ce2b', '6031809500',
                        '54526e3c66', '0788b4033d', '3f4bacb16a', '06a5dfb511', '9f21474aca',
                        '7a19a80b19', '9a38b8e463', '822c31928a', 'd1ac0d8b81', 'eea1a45e49',
                        '9f429af409', '33c8dcbe09', '9da2156a73', '3be852ed44', '3674b2c70a',
                        '547416bda1', '4037d8305d', '29c06df0f2', '1335b16cf9', 'b7b7e52e02',
                        'bc9ba8917e', 'dab44991de', '9fd2d2782b', 'f054e28786', 'b00ff71889',
                        'eeb18f9d47', '559a611d86', 'dea0160a12', '257f7fd5b8', 'dc197289ef',
                        'c2bbd6d121', 'f3678388a7', '332dabe378', '63883da4f5', 'b90f8c11db',
                        'dce363032d', '411774e9ff', '335fc10235', '7775043b5e', '3e03f623bb',
                        '19cde15c4b', 'bf4cc89b18', '1a894a8f98', 'f7d7fb16d0', '61fca8cbf1',
                        'd69812339e', 'ab9a7583f1', 'e633eec195', '0a598e18a8', 'b3b92781d9',
                        'cd896a9bee', 'b7928ea5c0', '69c0f7494e', 'cc1a82ac2a', '39b7491321',
                        '352ad66724', '749f1abdf9', '7f26b553ae', '0c04834d61', 'd1dd586cfd',
                        '3b72dc1941', '39bce09d8d', 'cbea8f6bea', 'cc7c3138ff', 'd59c093632',
                        '68dab8f80c', '1e0257109e', '4307020e0f', '4b783f1fc5', 'ebe7138e58',
                        '1f390d22ea', '7a72130f21', 'aceb34fcbe', '9c0b55cae5', 'b58a97176b',
                        '152fe4902a', 'a806e58451', '9ce299a510', '97b38cabcc', 'f39c805b54',
                        '0620b43a31', '0723d7d4fe', '7741a0fbce', '7836afc0c2', 'a7462d6aaf',
                        '34564d26d8', '31e0beaf99']
    # sorted_video_key = ['6cc8bce61a']
    for vid_ind, vid in enumerate(sorted_video_key):
        # NOTE(review): the total shown is the number of videos in the meta
        # file, not the length of sorted_video_key.
        print("Running on video {}/{}".format(vid_ind + 1, len(videos.keys())))
        expressions = videos[vid]['expressions']
        # instance_ids = [expression['obj_id'] for expression_id in videos[vid]['expressions']]
        frame_ids = videos[vid]['frames']
        for eid in expressions:
            exp = expressions[eid]['exp']
            index = int(eid)
            vis_dir = args.visdir
            # mask_dir = os.path.join(args.maskdir, str('{}/{}/'.format(vid, index)))
            if not os.path.exists(vis_dir):
                os.makedirs(vis_dir)
            # if not os.path.exists(mask_dir):
            #     os.makedirs(mask_dir)
            avg_time = 0
            total_frame = 0
            # Process text
            text = np.array(text_processing.preprocess_sentence(exp, vocab_dict, T))
            # Index of the first non-zero token id; stays 0 when every entry
            # is zero.
            valid_idx = np.zeros([1], dtype=np.int32)
            for idx in range(text.shape[0]):
                if text[idx] != 0:
                    valid_idx[0] = idx
                    break
            for fid in frame_ids:
                frame_id = int(fid)
                # only every 20th frame id is processed
                if (frame_id % 20 != 0):
                    continue
                vis_path = os.path.join(vis_dir, str('{}_{}_{}.png'.format(vid,eid,fid)))
                frame = load_frame_from_id(vid, fid)
                if frame is None:
                    continue
                last_time = time.time()
                # im = frame.copy()
                im = frame
                # mask = np.array(frame, dtype=np.float32)
                proc_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, H, W))
                proc_im_ = proc_im.astype(np.float32)
                proc_im_ = proc_im_[:, :, ::-1]  # reverse the channel order
                proc_im_ -= mu
                scores_val, up_val, sigm_val, up_c4 = sess.run([model.pred,
                                                                model.up,
                                                                model.sigm,
                                                                model.up_c4, ],
                                                               feed_dict={
                                                                   model.words: np.expand_dims(text, axis=0),
                                                                   model.im: np.expand_dims(proc_im_, axis=0),
                                                                   model.valid_idx: np.expand_dims(valid_idx, axis=0)
                                                               })
                # scores_val = np.squeeze(scores_val)
                # pred_raw = (scores_val >= score_thresh).astype(np.float32)
                # Both maps go through `sigmoid` and are resized back to the
                # frame size before plotting.
                # NOTE(review): `sigm_val` comes from `model.sigm`; confirm it
                # is not already a sigmoid output before it is passed through
                # `sigmoid` again here.
                up_c4 = im_processing.resize_and_crop(sigmoid(np.squeeze(up_c4)), frame.shape[0], frame.shape[1])
                sigm_val = im_processing.resize_and_crop(sigmoid(np.squeeze(sigm_val)), frame.shape[0], frame.shape[1])
                up_val = np.squeeze(up_val)
                # if (not math.isnan(consitency_score) and consitency_score < 0.3):
                plt.clf()
                plt.subplot(1, 3, 1)
                plt.imshow(frame)
                plt.text(-0.7, -0.7, exp + str(consitency_score))
                plt.subplot(1, 3, 2)
                plt.imshow(up_c4)
                plt.subplot(1, 3, 3)
                plt.imshow(sigm_val)
                plt.savefig(vis_path)
                # NOTE: a large block of disabled code used to follow here.
                # It thresholded `up_val` into a binary mask, optionally
                # refined it with DenseCRF, wrote the mask via cv2.imwrite /
                # np.save / visualize_seg, and accumulated the IoU statistics
                # (cum_I, cum_U, mean_IoU, seg_correct and their *_dcrf
                # counterparts).  None of it is executed.
                seg_total += 1
def test(modelname, iter, dataset, visualize, weights, setname, dcrf, mu):
    """Evaluate a BRI checkpoint on the saved batches of one dataset split.

    Batches come from ``./<dataset>/<setname>_batch/``; the checkpoint path
    is built from ``dataset``, ``weights``, ``modelname`` and ``iter``.  For
    every batch the cumulative IoU is printed, and at the end precision at
    IoU 0.5-0.9 plus the overall IoU are printed, once for the raw
    predictions and, when ``dcrf`` is true, once more for the
    DenseCRF-refined ones.  When ``visualize`` is true the visualization
    directory is created.  Nothing is returned.

    Raises:
        ValueError: if ``modelname`` is not ``'BRI'``.
    """
    batch_dir = './' + dataset + '/' + setname + '_batch/'
    batch_prefix = dataset + '_' + setname
    if visualize:
        save_dir = './' + dataset + '/visualization/' + modelname + '_' + str(iter) + '/'
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
    checkpoint = './' + dataset + '/tfmodel_BRI/' + dataset + '_' + weights + '_' + modelname + '_iter_' + str(iter) + '.tfmodel'

    score_thresh = 1e-9
    thresholds = [.5, .6, .7, .8, .9]
    inter_sum, union_sum = 0, 0
    hits = np.zeros(len(thresholds), dtype=np.int32)
    if dcrf:
        inter_sum_crf, union_sum_crf = 0, 0
        hits_crf = np.zeros(len(thresholds), dtype=np.int32)
    n_samples = 0.
    H, W = 320, 320
    vocab_size = 8803 if dataset == 'referit' else 12112

    # only one architecture is supported
    if modelname != 'BRI':
        raise ValueError('Unknown model name %s' % (modelname))
    model = BRI_model(H=H, W=W, mode='eval', vocab_size=vocab_size, weights=weights)

    # Restore the trained weights into a fresh session
    restorer = tf.train.Saver()
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    restorer.restore(sess, checkpoint)

    reader = data_reader.DataReader(batch_dir, batch_prefix, shuffle=False)

    for _ in range(reader.num_batch):
        batch = reader.read_batch()
        text = batch['text_batch']
        im = batch['im_batch']
        mask = batch['mask_batch'].astype(np.float32)

        # resize/pad, reverse the channel order, subtract the mean
        proc_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, H, W))
        net_input = proc_im.astype(np.float32)[:, :, ::-1]
        net_input -= mu

        fetches = [model.pred, model.up, model.sigm]
        feeds = {
            model.words: np.expand_dims(text, axis=0),
            model.im: np.expand_dims(net_input, axis=0)
        }
        _, up_val, sigm_val = sess.run(fetches, feed_dict=feeds)

        pred_raw = (np.squeeze(up_val) >= score_thresh).astype(np.float32)
        predicts = im_processing.resize_and_crop(pred_raw, mask.shape[0], mask.shape[1])

        if dcrf:
            # Dense CRF post-processing
            sigm_val = np.squeeze(sigm_val)
            crf = Dcrf.DenseCRF2D(W, H, 2)
            energy_fg = np.expand_dims(-np.log(sigm_val), axis=0)
            energy_bg = np.expand_dims(-np.log(1 - sigm_val), axis=0)
            unary = np.concatenate((energy_bg, energy_fg), axis=0).reshape((2, -1))
            crf.setUnaryEnergy(unary)
            crf.addPairwiseGaussian(sxy=3, compat=3)
            crf.addPairwiseBilateral(sxy=20, srgb=3, rgbim=proc_im, compat=10)
            marginals = crf.inference(5)
            pred_raw_crf = np.argmax(marginals, axis=0).reshape((H, W)).astype(np.float32)
            predicts_crf = im_processing.resize_and_crop(pred_raw_crf, mask.shape[0], mask.shape[1])

        inter, union = eval_tools.compute_mask_IU(predicts, mask)
        inter_sum += inter
        union_sum += union
        msg = 'cumulative IoU = %f' % (inter_sum/union_sum)
        for k, thresh in enumerate(thresholds):
            hits[k] += (inter/union >= thresh)

        if dcrf:
            inter_crf, union_crf = eval_tools.compute_mask_IU(predicts_crf, mask)
            inter_sum_crf += inter_crf
            union_sum_crf += union_crf
            msg += '\tcumulative IoU (dcrf) = %f' % (inter_sum_crf/union_sum_crf)
            for k, thresh in enumerate(thresholds):
                hits_crf[k] += (inter_crf/union_crf >= thresh)

        print(msg)
        n_samples += 1

    # Print results
    print('Segmentation evaluation (without DenseCRF):')
    report = []
    for k, thresh in enumerate(thresholds):
        report.append('precision@%s = %f\n' % (str(thresh), hits[k]/n_samples))
    report.append('overall IoU = %f\n' % (inter_sum/union_sum))
    print(''.join(report))

    if dcrf:
        print('Segmentation evaluation (with DenseCRF):')
        report = []
        for k, thresh in enumerate(thresholds):
            report.append('precision@%s = %f\n' % (str(thresh), hits_crf[k]/n_samples))
        report.append('overall IoU = %f\n' % (inter_sum_crf/union_sum_crf))
        print(''.join(report))
# Collect training samples ################################################################################ training_samples = [] num_imcrop = len(imcrop_list) for n_imcrop in range(num_imcrop): if n_imcrop % 200 == 0: print('processing %d / %d' % (n_imcrop+1, num_imcrop)) imcrop_name = imcrop_list[n_imcrop] # Image and mask imname = imcrop_name.split('_', 1)[0] + '.jpg' mask_name = imcrop_name + '.mat' im = skimage.io.imread(image_dir + imname) mask = load_gt_mask(mask_dir + mask_name).astype(np.float32) processed_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, input_H, input_W)) if processed_im.ndim == 2: processed_im = processed_im[:, :, np.newaxis] processed_mask = im_processing.resize_and_pad(mask, input_H, input_W) subsampled_mask = skimage.transform.downscale_local_mean(processed_mask, (32, 32)) labels_fine = (processed_mask > 0) labels_coarse = (subsampled_mask > 0) for description in query_dict[imcrop_name]: text_seq = text_processing.preprocess_sentence(description, vocab_dict, T) training_samples.append((processed_im, text_seq, labels_coarse, labels_fine)) # Shuffle the training instances np.random.seed(3) shuffle_idx = np.random.permutation(len(training_samples))
def build_referit_batches(setname, T, input_H, input_W):
    """Write one ``.npz`` sample per usable ReferIt query of a split.

    Queries are read from ``./data/referit_query_<setname>.json``.  Each query
    is lower-cased, tokenised, stripped of a trailing ``'.'``, truncated to 20
    tokens and parsed with spaCy.  Queries that are empty, contain non-ASCII
    text or parse to more than 30 spaCy tokens are skipped.  Accepted samples
    are saved as ``./referit/<setname>_batch/referit_<setname>_<k>.npz`` with
    consecutive ``k`` and the keys ``text_batch``, ``im_batch``,
    ``mask_batch``, ``sent_batch``, ``graph_batch`` and ``height_batch``.

    Args:
        setname: Split name; when it contains ``'train'`` the image and the
            mask are resized/padded to ``input_H`` x ``input_W``.
        T: Length argument forwarded to
            ``text_processing.preprocess_spacy_sentence``.
        input_H: Target height used for training splits.
        input_W: Target width used for training splits.
    """
    # data directory
    im_dir = './data/referit/images/'
    mask_dir = './data/referit/mask/'
    query_file = './data/referit_query_' + setname + '.json'
    vocab_file = './data/vocabulary_spacy_referit.txt'

    # saving directory
    data_folder = './referit/' + setname + '_batch/'
    data_prefix = 'referit_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    # load annotations; the context manager closes the file handle promptly
    with open(query_file) as query_fp:
        query_dict = json.load(query_fp)
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    # collect training samples
    samples = []
    for name in query_dict:
        # the image file name is the part of the key before the first '_'
        im_name = name.split('_', 1)[0] + '.jpg'
        mask_name = name + '.mat'
        for sent in query_dict[name]:
            samples.append((im_name, mask_name, sent))

    # save batches to disk
    # spacy load
    nlp = spacy.load("en_core_web_sm")
    SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
    num_batch = len(samples)
    valid = 0  # index of the next file to write; counts accepted samples only
    for n_batch, (im_name, mask_name, sent) in enumerate(samples):
        print('saving batch %d / %d' % (n_batch + 1, num_batch))

        # Normalise the query first: it is cheap, and rejected queries then
        # never touch the image and mask files.
        words = SENTENCE_SPLIT_REGEX.split(sent.lower().strip())
        words = [w for w in words if len(w.strip()) > 0]
        # remove .
        if words and words[-1] == '.':
            words = words[:-1]
        if not words:
            # empty or punctuation-only query (used to raise IndexError when
            # there were no tokens at all)
            continue
        if len(words) > 20:
            words = words[:20]
        n_sent = ' '.join(words).strip()
        # NOTE: relies on Python 2 semantics -- calling .decode on a unicode
        # string first encodes it as ASCII, so non-ASCII queries raise
        # UnicodeEncodeError and are skipped.
        try:
            n_sent = n_sent.decode("utf-8")
        except UnicodeEncodeError:
            continue
        doc = nlp(n_sent)
        if len(doc) > 30:
            continue

        im = skimage.io.imread(im_dir + im_name)
        mask = load_gt_mask(mask_dir + mask_name).astype(np.float32)
        if 'train' in setname:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask, input_H, input_W)
        if im.ndim == 2:
            # replicate a grayscale image into three channels
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        text, graph, height = text_processing.preprocess_spacy_sentence(
            doc, vocab_dict, T)
        np.savez(file=data_folder + data_prefix + '_' + str(valid) + '.npz',
                 text_batch=text,
                 im_batch=im,
                 mask_batch=(mask > 0),
                 sent_batch=[n_sent],
                 graph_batch=graph,
                 height_batch=np.array([height], dtype=np.int32))
        valid += 1
def build_coco_batches(dataset, setname, T, input_H, input_W):
    """Write one ``.npz`` sample per usable sentence of a RefCOCO-style split.

    The references of ``setname`` are loaded through ``REFER``.  For each
    reference the image and its segmentation mask are prepared once.  Every
    sentence is lower-cased, tokenised, stripped of a trailing ``'.'``,
    truncated to 20 tokens and parsed with spaCy.  Sentences that are empty,
    contain non-ASCII text or parse to more than 30 spaCy tokens are skipped.
    Accepted samples are saved as
    ``./<dataset>/<setname>_batch/<dataset>_<setname>_<k>.npz`` with
    consecutive ``k`` and the keys ``text_batch``, ``im_batch``,
    ``mask_batch``, ``sent_batch``, ``graph_batch`` and ``height_batch``.

    Args:
        dataset: One of ``'Gref'``, ``'unc'`` or ``'unc+'``.
        setname: Split name; when it contains ``'train'`` the image and the
            mask are resized/padded to ``input_H`` x ``input_W``.
        T: Length argument forwarded to
            ``text_processing.preprocess_spacy_sentence``.
        input_H: Target height used for training splits.
        input_W: Target width used for training splits.

    Raises:
        ValueError: if ``dataset`` is not one of the supported names.
    """
    im_dir = './data/coco/images'
    im_type = 'train2014'
    vocab_file = './data/vocabulary_spacy_Gref.txt'

    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)
    print("data_folder:", data_folder)

    if dataset == 'Gref':
        refer = REFER('./external/refer/data', dataset='refcocog', splitBy='google')
    elif dataset == 'unc':
        refer = REFER('./external/refer/data', dataset='refcoco', splitBy='unc')
    elif dataset == 'unc+':
        refer = REFER('./external/refer/data', dataset='refcoco+', splitBy='unc')
    else:
        raise ValueError('Unknown dataset %s' % dataset)

    refs = [
        refer.Refs[ref_id] for ref_id in refer.Refs
        if refer.Refs[ref_id]['split'] == setname
    ]
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    n_batch = 0  # index of the next file to write; counts accepted samples
    # spacy load
    nlp = spacy.load("en_core_web_sm")
    SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
    for ref in refs:
        im_name = 'COCO_' + im_type + '_' + str(ref['image_id']).zfill(12)
        im = skimage.io.imread('%s/%s/%s.jpg' % (im_dir, im_type, im_name))
        seg = refer.Anns[ref['ann_id']]['segmentation']
        rle = cocomask.frPyObjects(seg, im.shape[0], im.shape[1])
        # merge the decoded masks along the last axis into a single mask
        mask = np.max(cocomask.decode(rle), axis=2).astype(np.float32)

        if 'train' in setname:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask, input_H, input_W)
        if im.ndim == 2:
            # replicate a grayscale image into three channels
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        for sentence in ref['sentences']:
            print('saving batch %d' % (n_batch + 1))
            sent = sentence['sent'].lower()
            words = SENTENCE_SPLIT_REGEX.split(sent.strip())
            words = [w for w in words if len(w.strip()) > 0]
            # remove .
            if words and words[-1] == '.':
                words = words[:-1]
            if not words:
                # empty or punctuation-only sentence (used to raise
                # IndexError when there were no tokens at all)
                continue
            if len(words) > 20:
                words = words[:20]
            n_sent = ' '.join(words).strip()
            # NOTE: relies on Python 2 semantics -- calling .decode on a
            # unicode string first encodes it as ASCII, so non-ASCII
            # sentences raise UnicodeEncodeError and are skipped.
            try:
                n_sent = n_sent.decode("utf-8")
            except UnicodeEncodeError:
                continue
            # Decode and parse exactly once: the second, unguarded decode and
            # the repeated spaCy parse produced the same result.
            doc = nlp(n_sent)
            if len(doc) > 30:
                continue

            text, graph, height = text_processing.preprocess_spacy_sentence(
                doc, vocab_dict, T)

            np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
                     text_batch=text,
                     im_batch=im,
                     mask_batch=(mask > 0),
                     sent_batch=[n_sent],
                     graph_batch=graph,
                     height_batch=np.array([height], dtype=np.int32))
            n_batch += 1
def inference(config):
    """Evaluate the Caffe segmentation model described by ``config``.

    Writes the ``'val'`` network definition to ``./seg_model/test.prototxt``,
    loads ``config.pretrained_model`` on GPU ``config.gpu_id`` and runs every
    (image crop, description) pair from ``config.query_file`` through the
    network.  Each prediction is thresholded at ``config.score_thresh``,
    resized back to the source image size and compared with the ground-truth
    mask.  Precision at IoU 0.5-0.9 and the overall (cumulative) IoU are
    printed; nothing is returned.

    Args:
        config: Object providing the attributes read below (``gpu_id``,
            ``pretrained_model``, ``query_file``, ``bbox_file``,
            ``imcrop_file``, ``vocab_file``, ``image_dir``, ``mask_dir``,
            ``N``, ``T``, ``input_H``, ``input_W``, ``featmap_H``,
            ``featmap_W``, ``score_thresh``); it is also passed to
            ``seg_model.generate_model``.
    """
    with open('./seg_model/test.prototxt', 'w') as f:
        f.write(str(seg_model.generate_model('val', config)))

    caffe.set_device(config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    net = caffe.Net('./seg_model/test.prototxt',
                    config.pretrained_model,
                    caffe.TEST)

    ############################################################################
    # Load annotations and bounding box proposals
    ############################################################################
    # Context managers close each handle as soon as the file is parsed.
    # ``config.imsize_file`` is no longer read: its contents were never used,
    # the image size is taken from the loaded image below.
    with open(config.query_file) as fp:
        query_dict = json.load(fp)
    with open(config.bbox_file) as fp:
        bbox_dict = json.load(fp)
    with open(config.imcrop_file) as fp:
        imcrop_dict = json.load(fp)
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(config.vocab_file)

    ############################################################################
    # Flatten the annotations
    ############################################################################
    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        for imcrop_name in imcrop_dict[imname]:
            # Check for queries first so that crops without any description
            # do not need a bounding-box entry.
            if imcrop_name not in query_dict:
                continue
            gt_bbox = bbox_dict[imcrop_name]
            for description in query_dict[imcrop_name]:
                flat_query_dict[imname].append(
                    (imcrop_name, gt_bbox, description))

    ############################################################################
    # Testing
    ############################################################################
    cum_I, cum_U = 0.0, 0.0
    eval_seg_iou_list = [0.5, 0.6, 0.7, 0.8, 0.9]
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.0

    # Pre-allocate arrays
    imcrop_val = np.zeros((config.N, config.input_H, config.input_W, 3),
                          dtype=np.float32)
    text_seq_val = np.zeros((config.T, config.N), dtype=np.int32)

    # Spatial features: every argument is a configuration constant, so the
    # batch is built once instead of once per image.
    spatial_val = processing_tools.generate_spatial_batch(
        config.N, config.featmap_H, config.featmap_W)
    spatial_val = spatial_val.transpose((0, 3, 1, 2))

    for imname in tqdm(imlist):
        # Extract visual features from all proposals
        im = skimage.io.imread(config.image_dir + imname)
        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, config.input_H, config.input_W))
        if processed_im.ndim == 2:
            # replicate a grayscale image into three channels
            processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))

        imcrop_val[...] = (processed_im.astype(np.float32)
                           - seg_model.channel_mean)
        # move the channel axis in front of the two spatial axes
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        for imcrop_name, _, description in flat_query_dict[imname]:
            mask = load_gt_mask(
                config.mask_dir + imcrop_name + '.mat').astype(np.float32)
            labels = (mask > 0)
            processed_labels = im_processing.resize_and_pad(
                mask, config.input_H, config.input_W) > 0

            # only column 0 of the text batch is filled with the query
            text_seq_val[:, 0] = text_processing.preprocess_sentence(
                description, vocab_dict, config.T)
            cont_val = text_processing.create_cont(text_seq_val)

            net.blobs['language'].data[...] = text_seq_val
            net.blobs['cont'].data[...] = cont_val
            net.blobs['image'].data[...] = imcrop_val_trans
            net.blobs['spatial'].data[...] = spatial_val
            net.blobs['label'].data[...] = processed_labels

            net.forward()

            upscores = np.squeeze(net.blobs['upscores'].data[...].copy())

            # Evaluate the segmentation performance of using bounding box
            # segmentation
            pred_raw = (upscores >= config.score_thresh).astype(np.float32)
            predicts = im_processing.resize_and_crop(
                pred_raw, im.shape[0], im.shape[1])
            I, U = eval_tools.compute_mask_IU(predicts, labels)
            cum_I += I
            cum_U += U
            this_IoU = I / float(U)
            for n_eval_iou, eval_seg_iou in enumerate(eval_seg_iou_list):
                seg_correct[n_eval_iou] += (this_IoU >= eval_seg_iou)
            seg_total += 1

    # Print results
    print('Final results on the whole test set')
    result_str = ''
    for n_eval_iou in range(len(eval_seg_iou_list)):
        result_str += 'precision@%s = %f\n' % \
            (str(eval_seg_iou_list[n_eval_iou]),
             seg_correct[n_eval_iou] / seg_total)
    result_str += 'overall IoU = %f\n' % (cum_I / cum_U)
    print(result_str)