Example #1
def build_coco_batches(dataset, setname, T, input_H, input_W):
    im_dir = '/data/ryli/datasets/coco/images'
    im_type = 'train2014'
    vocab_file = './data/vocabulary_Gref.txt'

    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    if dataset == 'Gref':
        refer = REFER('./external/refer/data',
                      dataset='refcocog',
                      splitBy='google')
    elif dataset == 'unc':
        refer = REFER('./external/refer/data',
                      dataset='refcoco',
                      splitBy='unc')
    elif dataset == 'unc+':
        refer = REFER('./external/refer/data',
                      dataset='refcoco+',
                      splitBy='unc')
    else:
        raise ValueError('Unknown dataset %s' % dataset)
    refs = [
        refer.Refs[ref_id] for ref_id in refer.Refs
        if refer.Refs[ref_id]['split'] == setname
    ]
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    n_batch = 0
    for ref in refs:
        im_name = 'COCO_' + im_type + '_' + str(ref['image_id']).zfill(12)
        im = skimage.io.imread('%s/%s/%s.jpg' % (im_dir, im_type, im_name))
        seg = refer.Anns[ref['ann_id']]['segmentation']
        rle = cocomask.frPyObjects(seg, im.shape[0], im.shape[1])
        mask = np.max(cocomask.decode(rle), axis=2).astype(np.float32)

        if 'train' in setname:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask, input_H, input_W)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        for sentence in ref['sentences']:
            print('saving batch %d' % (n_batch + 1))
            sent = sentence['sent']
            text = text_processing.preprocess_sentence(sent, vocab_dict, T)

            np.savez(file=data_folder + data_prefix + '_' + str(n_batch) +
                     '.npz',
                     text_batch=text,
                     im_batch=im,
                     mask_batch=(mask > 0),
                     sent_batch=[sent])
            n_batch += 1
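# NOTE: text_processing.load_vocab_dict_from_file and preprocess_sentence are
# used by every example here but are not shown. Below is a minimal sketch of
# what such helpers plausibly do, assuming a one-word-per-line vocabulary file,
# an '<unk>' entry for out-of-vocabulary words, and zero front-padding (the
# valid_idx scan in Example #9 suggests sentences are padded at the front).
import re

SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')

def load_vocab_dict_from_file(vocab_file):
    # One word per line; the line number becomes the word id.
    with open(vocab_file) as f:
        words = [line.strip() for line in f if line.strip()]
    return {w: i for i, w in enumerate(words)}

def preprocess_sentence(sentence, vocab_dict, T, unk='<unk>'):
    # Tokenize, map tokens to ids (unknown words fall back to the assumed
    # '<unk>' id), truncate to T tokens, and front-pad with 0.
    tokens = [t for t in SENTENCE_SPLIT_REGEX.split(sentence.lower().strip())
              if t.strip()]
    ids = [vocab_dict.get(t, vocab_dict.get(unk, 0)) for t in tokens][:T]
    return [0] * (T - len(ids)) + ids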
Example #2
def vectorizeLearntEmbd(args):
    # T, N, words, vocab, embd, vector_count and plot_dir are module-level
    # globals in the original script and are not shown in this snippet.
    if args.checkpoint == '':
        # Network
        if args.savefile == "det":
            vocab_size = 8803
            embedding_dim = 1000
            vocab_file = './exp-referit/data/vocabulary_referit.txt'
            vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
            pretrained_model = './exp-referit/tfmodel/referit_fc8_det_iter_25000.tfmodel'
        else:
            vocab_size = len(vocab)
            embedding_dim = len(embd[0])
            vocab_dict = {w: i for i, w in enumerate(vocab)}
            pretrained_model = './coco/tfmodel/cls_coco_glove_20000.tfmodel'

        # Inputs
        text_seq_batch = tf.placeholder(tf.int32, [T, N])
        embedem = embedding_layer(text_seq_batch, vocab_size, embedding_dim)  

        # Load pretrained model
        snapshot_restorer = tf.train.Saver(None)
        sess = tf.Session()
        snapshot_restorer.restore(sess, pretrained_model)     

        # Initialize arrays
        vectors = list()
        text_seq_val = np.zeros((T, N), dtype=np.int32)

        # Generate vector embeddings
        count = 0
        for word in words: 
            count += 1
            if count % 100 == 0: print("%d out of %d words processed" % (count, len(words)))

            # Preprocess word
            text_seq = text_processing.preprocess_sentence(word, vocab_dict, T)
            text_seq_val[:, 0] = text_seq

            # Extract LSTM language feature
            embedded_seq = sess.run(embedem, feed_dict={text_seq_batch:text_seq_val})
            temp = np.squeeze(np.transpose(embedded_seq))
            vectors.append(temp)

            if count == vector_count: break

        # Save vectors for easy recovery
        backup = args.savefile + "_TSNE_backup.npz"
        np.savez(os.path.join(plot_dir, backup), words=words, vectors=vectors)

    else:
        # Load saved vectors
        npzfile = np.load(os.path.join(plot_dir, args.checkpoint))
        vectors = npzfile['vectors']

    return vectors
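# NOTE: embedding_layer is not defined in the snippet above. A minimal
# TF 1.x sketch with the same call signature; the variable scope and names
# are assumptions and must match the restored checkpoint to work as-is.
import tensorflow as tf

def embedding_layer(text_seq_batch, vocab_size, embedding_dim,
                    name='embedding_layer'):
    # Look up a (T, N) batch of word ids in a learned embedding matrix,
    # producing a (T, N, embedding_dim) tensor.
    with tf.variable_scope(name):
        embedding_mat = tf.get_variable(
            'embedding', [vocab_size, embedding_dim], tf.float32)
        return tf.nn.embedding_lookup(embedding_mat, text_seq_batch)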
Example #3
    def __init__(self,
                 roidb_file,
                 vocab_file,
                 im_mean,
                 min_size=600,
                 max_size=1000,
                 T=20,
                 shuffle=True,
                 prefetch_num=8):

        print('Loading ROI data from file...', end='')
        sys.stdout.flush()
        if isinstance(roidb_file, list):
            roidb = []
            for fname in roidb_file:
                roidb += util.io.load_json(fname)
        else:
            if roidb_file.endswith('.json'):
                roidb = util.io.load_json(roidb_file)
            elif roidb_file.endswith('.npy'):
                roidb = util.io.load_numpy_obj(roidb_file)
            else:
                raise TypeError('unknown roidb format.')
        self.roidb = roidb
        print('Done.')

        self.vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
        self.im_mean = im_mean

        self.min_size = min_size
        self.max_size = max_size

        self.T = T

        self.shuffle = shuffle
        self.prefetch_num = prefetch_num

        self.n_batch = 0
        self.n_epoch = 0

        self.num_batch = len(self.roidb)

        # Start prefetching thread
        self.prefetch_queue = queue.Queue(maxsize=prefetch_num)
        self.prefetch_thread = threading.Thread(
            target=run_prefetch,
            args=(self.prefetch_queue, self.roidb, self.im_mean, self.min_size,
                  self.max_size, self.vocab_dict, self.T, self.num_batch,
                  self.shuffle))
        self.prefetch_thread.daemon = True
        self.prefetch_thread.start()
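# NOTE: run_prefetch (the thread target above) is not shown. A simplified
# sketch of such a worker loop; the batch construction is a stand-in, since
# the real loader is not part of this snippet.
import numpy as np

def run_prefetch(prefetch_queue, roidb, im_mean, min_size, max_size,
                 vocab_dict, T, num_batch, shuffle):
    # Runs on a daemon thread and keeps the bounded queue filled, so the
    # training loop only ever blocks on prefetch_queue.get().
    n = 0
    order = np.arange(num_batch)
    while True:
        if n == 0 and shuffle:
            order = np.random.permutation(num_batch)
        entry = roidb[order[n]]
        # The real loader would read entry's image, rescale it into the
        # [min_size, max_size] range, subtract im_mean, and encode its
        # sentence with vocab_dict up to length T.
        batch = {'roi': entry}
        prefetch_queue.put(batch, block=True)  # blocks while the queue is full
        n = (n + 1) % num_batch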
Example #4
def build_referit_batches(setname, T, input_H, input_W):
    # data directory
    im_dir = '/data/ryli/text_objseg/exp-referit/referit-dataset/images/'
    mask_dir = '/data/ryli/text_objseg/exp-referit/referit-dataset/mask/'
    query_file = './data/referit/referit_query_' + setname + '.json'
    vocab_file = './data/vocabulary_referit.txt'

    # saving directory
    data_folder = './referit/' + setname + '_batch/'
    data_prefix = 'referit_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)
    fp = open('./referit/trainval_list.txt', 'w')

    # load annotations
    query_dict = json.load(open(query_file))
    im_list = query_dict.keys()
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    # collect training samples
    samples = []
    for n_im, name in enumerate(im_list):
        im_name = name.split('_', 1)[0] + '.jpg'
        mask_name = name + '.mat'
        for sent in query_dict[name]:
            samples.append((im_name, mask_name, sent))

    # save batches to disk
    num_batch = len(samples)
    for n_batch in range(num_batch):
        print('saving batch %d / %d' % (n_batch + 1, num_batch))
        im_name, mask_name, sent = samples[n_batch]
        fp.write('%d\t%s%s\n' % (n_batch, im_dir, im_name))
        im = skimage.io.imread(im_dir + im_name)
        mask = load_gt_mask(mask_dir + mask_name).astype(np.float32)

        if 'train' in setname:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask, input_H, input_W)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        text = text_processing.preprocess_sentence(sent, vocab_dict, T)

        np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
                 text_batch=text,
                 im_batch=im,
                 mask_batch=(mask > 0),
                 sent_batch=[sent])
    fp.close()
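# NOTE: load_gt_mask is not shown. ReferIt masks are stored as .mat files;
# a plausible sketch (the 'segimg_t' field name and the ==0 convention are
# assumptions):
import scipy.io as sio

def load_gt_mask(mask_path):
    # One segment per .mat file; pixels equal to 0 mark the referred region.
    mat = sio.loadmat(mask_path)
    return (mat['segimg_t'] == 0)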
Example #5
def inference(config):
    with open('./seg_model/test.prototxt', 'w') as f:
        f.write(str(seg_model.generate_model('val', config)))

    caffe.set_device(config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    net = caffe.Net('./seg_model/test.prototxt',
                    config.pretrained_model,
                    caffe.TEST)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    query_dict = json.load(open(config.query_file))
    bbox_dict = json.load(open(config.bbox_file))
    imcrop_dict = json.load(open(config.imcrop_file))
    imsize_dict = json.load(open(config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(config.vocab_file)

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append((imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    cum_I, cum_U = 0.0, 0.0
    eval_seg_iou_list = [0.5, 0.6, 0.7, 0.8, 0.9]
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.0

    # Pre-allocate arrays
    imcrop_val = np.zeros((config.N, config.input_H, config.input_W, 3), dtype=np.float32)
    text_seq_val = np.zeros((config.T, config.N), dtype=np.int32)

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]

        # Extract visual features from all proposals
        im = skimage.io.imread(config.image_dir + imname)
        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, config.input_H, config.input_W))

        if processed_im.ndim == 2:
            processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))

        imcrop_val[...] = processed_im.astype(np.float32) - seg_model.channel_mean
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract spatial features
        spatial_val = processing_tools.generate_spatial_batch(config.N,
                                                              config.featmap_H,
                                                              config.featmap_W)
        spatial_val = spatial_val.transpose((0, 3, 1, 2))

        for imcrop_name, _, description in flat_query_dict[imname]:
            mask = load_gt_mask(config.mask_dir + imcrop_name + '.mat').astype(np.float32)
            labels = (mask > 0)
            processed_labels = im_processing.resize_and_pad(mask, config.input_H, config.input_W)
            processed_labels = processed_labels > 0

            text_seq_val[:, 0] = text_processing.preprocess_sentence(description, vocab_dict, config.T)
            cont_val = text_processing.create_cont(text_seq_val)

            net.blobs['language'].data[...] = text_seq_val
            net.blobs['cont'].data[...] = cont_val
            net.blobs['image'].data[...] = imcrop_val_trans
            net.blobs['spatial'].data[...] = spatial_val
            net.blobs['label'].data[...] = processed_labels

            net.forward()
            upscores = net.blobs['upscores'].data[...].copy()
            upscores = np.squeeze(upscores)

            # Evaluate the segmentation performance
            pred_raw = (upscores >= config.score_thresh).astype(np.float32)
            predicts = im_processing.resize_and_crop(pred_raw, im.shape[0], im.shape[1])
            I, U = eval_tools.compute_mask_IU(predicts, labels)
            cum_I += I
            cum_U += U
            this_IoU = I / float(U)
            for n_eval_iou in range(len(eval_seg_iou_list)):
                eval_seg_iou = eval_seg_iou_list[n_eval_iou]
                seg_correct[n_eval_iou] += (this_IoU >= eval_seg_iou)
            seg_total += 1


    # Print results
    print('Final results on the whole test set')
    result_str = ''
    for n_eval_iou in range(len(eval_seg_iou_list)):
        result_str += 'precision@%s = %f\n' % \
            (str(eval_seg_iou_list[n_eval_iou]), seg_correct[n_eval_iou]/seg_total)
    result_str += 'overall IoU = %f\n' % (cum_I/cum_U)
    print(result_str)
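# NOTE: eval_tools.compute_mask_IU is not shown. A sketch consistent with
# its use above (returns intersection and union pixel counts):
import numpy as np

def compute_mask_IU(pred, gt):
    # Both inputs are binary masks of the same shape.
    pred = pred.astype(bool)
    gt = gt.astype(bool)
    I = np.sum(pred & gt)
    U = np.sum(pred | gt)
    return I, U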
Example #6
# Model Params
T = 20
N = 10
input_H = 512; featmap_H = (input_H // 32)
input_W = 512; featmap_W = (input_W // 32)

################################################################################
# Load annotations
################################################################################

query_dict = json.load(open(query_file))
imsize_dict = json.load(open(imsize_file))
imcrop_list = list(query_dict.keys())  # list() so it can be indexed below
imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

################################################################################
# Collect training samples
################################################################################

training_samples = []
num_imcrop = len(imcrop_list)
for n_imcrop in range(num_imcrop):
    if n_imcrop % 200 == 0: print('processing %d / %d' % (n_imcrop+1, num_imcrop))
    imcrop_name = imcrop_list[n_imcrop]

    # Image and mask
    imname = imcrop_name.split('_', 1)[0] + '.jpg'
    mask_name = imcrop_name + '.mat'
    im = skimage.io.imread(image_dir + imname)
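# NOTE: processing_tools.generate_spatial_batch (used in Examples #5 and #11
# with the featmap_H/featmap_W defined above) is not shown. A sketch of one
# plausible implementation; the exact 8-d feature definition is an assumption:
import numpy as np

def generate_spatial_batch(N, featmap_H, featmap_W):
    # Per feature-map cell: (xmin, ymin, xmax, ymax, xctr, yctr, 1/W, 1/H)
    # in [-1, 1] normalized coordinates, tiled over the batch dimension.
    spatial = np.zeros((N, featmap_H, featmap_W, 8), dtype=np.float32)
    for h in range(featmap_H):
        for w in range(featmap_W):
            xmin, xmax = w / featmap_W * 2 - 1, (w + 1) / featmap_W * 2 - 1
            ymin, ymax = h / featmap_H * 2 - 1, (h + 1) / featmap_H * 2 - 1
            spatial[:, h, w, :] = [xmin, ymin, xmax, ymax,
                                   (xmin + xmax) / 2, (ymin + ymax) / 2,
                                   1.0 / featmap_W, 1.0 / featmap_H]
    return spatial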
Example #7
# Load pretrained model
snapshot_restorer = tf.train.Saver()
sess = tf.Session()
snapshot_restorer.restore(sess, pretrained_model)

################################################################################
# Load annotations
################################################################################

query_dict = json.load(open(query_file))
bbox_dict = json.load(open(bbox_file))
imcrop_dict = json.load(open(imcrop_file))
imsize_dict = json.load(open(imsize_file))
imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

################################################################################
# Flatten the annotations
################################################################################

flat_query_dict = {imname: [] for imname in imlist}
for imname in imlist:
    this_imcrop_names = imcrop_dict[imname]
    for imcrop_name in this_imcrop_names:
        gt_bbox = bbox_dict[imcrop_name]
        if imcrop_name not in query_dict:
            continue
        this_descriptions = query_dict[imcrop_name]
        for description in this_descriptions:
            flat_query_dict[imname].append((imcrop_name, gt_bbox, description))
Example #8
def inference(config):
    with open('./det_model/fc8.prototxt', 'w') as f:
        f.write(str(det_model.generate_fc8('val', config)))
    with open('./det_model/scores.prototxt', 'w') as f:
        f.write(str(det_model.generate_scores('val', config)))

    caffe.set_device(config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    fc8_net = caffe.Net('./det_model/fc8.prototxt', config.pretrained_model,
                        caffe.TEST)

    scores_net = caffe.Net('./det_model/scores.prototxt',
                           config.pretrained_model, caffe.TEST)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    query_dict = json.load(open(config.query_file))
    bbox_dict = json.load(open(config.bbox_file))
    imcrop_dict = json.load(open(config.imcrop_file))
    imsize_dict = json.load(open(config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(config.vocab_file)

    # Object proposals
    bbox_proposal_dict = {}
    for imname in imlist:
        bboxes = np.loadtxt(config.bbox_proposal_dir + imname[:-4] +
                            '.txt').astype(int).reshape((-1, 4))
        bbox_proposal_dict[imname] = bboxes

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append(
                    (imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    eval_bbox_num_list = [1, 10, 100]
    bbox_correct = np.zeros(len(eval_bbox_num_list), dtype=np.int32)
    bbox_total = 0

    # Pre-allocate arrays
    imcrop_val = np.zeros((config.N, config.input_H, config.input_W, 3),
                          dtype=np.float32)
    spatial_val = np.zeros((config.N, 8), dtype=np.float32)
    text_seq_val = np.zeros((config.T, config.N), dtype=np.int32)

    dummy_text_seq = np.zeros((config.T, config.N), dtype=np.int32)
    dummy_cont = np.zeros((config.T, config.N), dtype=np.int32)
    dummy_label = np.zeros((config.N, 1))

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]
        imsize = imsize_dict[imname]
        bbox_proposals = bbox_proposal_dict[imname]
        num_proposal = bbox_proposals.shape[0]
        assert (config.N >= num_proposal)

        # Extract visual features from all proposals
        im = skimage.io.imread(config.image_dir + imname)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))
        imcrop_val[:num_proposal,
                   ...] = im_processing.crop_bboxes_subtract_mean(
                       im, bbox_proposals, config.input_H,
                       det_model.channel_mean)
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract bounding box features from proposals
        spatial_val[:num_proposal, ...] = \
            processing_tools.spatial_feature_from_bbox(bbox_proposals, imsize)

        fc8_net.blobs['language'].data[...] = dummy_text_seq
        fc8_net.blobs['cont'].data[...] = dummy_cont
        fc8_net.blobs['image'].data[...] = imcrop_val_trans
        fc8_net.blobs['spatial'].data[...] = spatial_val
        fc8_net.blobs['label'].data[...] = dummy_label

        fc8_net.forward()
        fc8_val = fc8_net.blobs['fc8'].data[...].copy()

        # Extract textual features from sentences
        for imcrop_name, gt_bbox, description in flat_query_dict[imname]:
            proposal_IoUs = eval_tools.compute_bbox_iou(
                bbox_proposals, gt_bbox)

            # Extract language feature
            text = text_processing.preprocess_sentence(description, vocab_dict,
                                                       config.T)
            text_seq_val[...] = np.array(text, dtype=np.int32).reshape((-1, 1))

            cont_val = text_processing.create_cont(text_seq_val)

            scores_net.blobs['language'].data[...] = text_seq_val
            scores_net.blobs['cont'].data[...] = cont_val
            scores_net.blobs['img_feature'].data[...] = fc8_val
            scores_net.blobs['spatial'].data[...] = spatial_val
            scores_net.blobs['label'].data[...] = dummy_label

            scores_net.forward()

            scores_val = scores_net.blobs['scores'].data.copy()
            scores_val = scores_val[:num_proposal, ...].reshape(-1)

            # Sort the scores for the proposals
            if config.use_nms:
                top_ids = eval_tools.nms(bbox_proposals.astype(np.float32),
                                         scores_val, config.nms_thresh)
            else:
                top_ids = np.argsort(scores_val)[::-1]

            # Evaluate on bounding boxes
            for n_eval_num in range(len(eval_bbox_num_list)):
                eval_bbox_num = eval_bbox_num_list[n_eval_num]
                bbox_correct[n_eval_num] += \
                    np.any(proposal_IoUs[top_ids[:eval_bbox_num]] >= config.correct_iou_thresh)
            bbox_total += 1

    print('Final results on the whole test set')
    result_str = ''
    for n_eval_num in range(len(eval_bbox_num_list)):
        result_str += 'recall@%s = %f\n' % \
            (str(eval_bbox_num_list[n_eval_num]), bbox_correct[n_eval_num]/bbox_total)
    print(result_str)
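# NOTE: eval_tools.compute_bbox_iou is not shown. A sketch consistent with
# its use above (IoU of each proposal against one ground-truth box); the
# inclusive "+1" pixel convention is an assumption:
import numpy as np

def compute_bbox_iou(bboxes, target):
    # bboxes: (K, 4) proposals, target: (4,) box, both as [x1, y1, x2, y2].
    bboxes = np.asarray(bboxes, dtype=np.float32).reshape((-1, 4))
    target = np.asarray(target, dtype=np.float32)
    ix1 = np.maximum(bboxes[:, 0], target[0])
    iy1 = np.maximum(bboxes[:, 1], target[1])
    ix2 = np.minimum(bboxes[:, 2], target[2])
    iy2 = np.minimum(bboxes[:, 3], target[3])
    inter = np.maximum(ix2 - ix1 + 1, 0) * np.maximum(iy2 - iy1 + 1, 0)
    area_b = (bboxes[:, 2] - bboxes[:, 0] + 1) * (bboxes[:, 3] - bboxes[:, 1] + 1)
    area_t = (target[2] - target[0] + 1) * (target[3] - target[1] + 1)
    return inter / (area_b + area_t - inter)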
Example #9
def test(iter,
         dataset,
         visualize,
         setname,
         dcrf,
         mu,
         tfmodel_folder,
         model_name,
         pre_emb=False):
    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    if visualize:
        save_dir = './' + dataset + '/visualization/' + str(iter) + '/'
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
    weights = os.path.join(tfmodel_folder,
                           dataset + '_iter_' + str(iter) + '.tfmodel')
    print("Loading trained weights from {}".format(weights))

    score_thresh = 1e-9
    eval_seg_iou_list = [.5, .6, .7, .8, .9]
    cum_I, cum_U = 0, 0
    mean_IoU, mean_dcrf_IoU = 0, 0
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    if dcrf:
        cum_I_dcrf, cum_U_dcrf = 0, 0
        seg_correct_dcrf = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.
    T = 20  # maximum sentence length; longer sentences are truncated
    H, W = 320, 320
    vocab_size = 8803 if dataset == 'referit' else 12112
    emb_name = 'referit' if dataset == 'referit' else 'Gref'
    vocab_file = './data/vocabulary_Gref.txt'
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
    IU_result = list()

    if pre_emb:
        # use pretrained embedding
        print("Use pretrained Embeddings.")
        model = get_segmentation_model(model_name,
                                       H=H,
                                       W=W,
                                       mode='eval',
                                       vocab_size=vocab_size,
                                       emb_name=emb_name,
                                       emb_dir=args.embdir)
    else:
        model = get_segmentation_model(model_name,
                                       H=H,
                                       W=W,
                                       mode='eval',
                                       vocab_size=vocab_size)

    # Load pretrained model
    snapshot_restorer = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    snapshot_restorer.restore(sess, weights)

    meta_expression = {}
    with open(args.meta) as meta_file:
        meta_expression = json.load(meta_file)
    videos = meta_expression['videos']
    for vid_ind, vid in reversed(list(enumerate(videos.keys()))):
        print("Running on video {}/{}".format(vid_ind + 1, len(videos.keys())))
        expressions = videos[vid]['expressions']
        # instance_ids = [expression['obj_id'] for expression_id in videos[vid]['expressions']]
        frame_ids = videos[vid]['frames']
        for eid in expressions:
            exp = expressions[eid]['exp']
            index = int(eid)
            vis_dir = os.path.join(args.visdir,
                                   str('{}/{}/'.format(vid, index)))
            mask_dir = os.path.join(args.maskdir,
                                    str('{}/{}/'.format(vid, index)))
            if not os.path.exists(vis_dir):
                os.makedirs(vis_dir)
            if not os.path.exists(mask_dir):
                os.makedirs(mask_dir)
            avg_time = 0
            total_frame = 0
            # Process text
            text = np.array(
                text_processing.preprocess_sentence(exp, vocab_dict, T))
            valid_idx = np.zeros([1], dtype=np.int32)
            for idx in range(text.shape[0]):
                if text[idx] != 0:
                    valid_idx[0] = idx
                    break
            for fid in frame_ids:
                vis_path = os.path.join(vis_dir, str('{}.png'.format(fid)))
                mask_path = os.path.join(mask_dir, str('{}.npy'.format(fid)))
                if os.path.exists(vis_path):
                    continue
                frame = load_frame_from_id(vid, fid)
                if frame is None:
                    continue
                last_time = time.time()
                #                 im = frame.copy()
                im = frame
                #                 mask = np.array(frame, dtype=np.float32)

                proc_im = skimage.img_as_ubyte(
                    im_processing.resize_and_pad(im, H, W))
                proc_im_ = proc_im.astype(np.float32)
                # proc_im_ = proc_im_[:, :, ::-1]
                proc_im_ -= mu
                scores_val, up_val, sigm_val = sess.run(
                    [model.pred, model.up, model.sigm],
                    feed_dict={
                        model.words: np.expand_dims(text, axis=0),
                        model.im: np.expand_dims(proc_im_, axis=0),
                        model.valid_idx: np.expand_dims(valid_idx, axis=0)
                    })
                # scores_val = np.squeeze(scores_val)
                # pred_raw = (scores_val >= score_thresh).astype(np.float32)
                up_val = np.squeeze(up_val)
                pred_raw = (up_val >= score_thresh).astype('uint8') * 255
                #                 pred_raw = (up_val >= score_thresh).astype(np.float32)
                #                 predicts = im_processing.resize_and_crop(pred_raw, mask.shape[0], mask.shape[1])
                if dcrf:
                    # Dense CRF post-processing
                    sigm_val = np.squeeze(sigm_val) + 1e-7
                    d = densecrf.DenseCRF2D(W, H, 2)
                    U = np.expand_dims(-np.log(sigm_val), axis=0)
                    U_ = np.expand_dims(-np.log(1 - sigm_val), axis=0)
                    unary = np.concatenate((U_, U), axis=0)
                    unary = unary.reshape((2, -1))
                    d.setUnaryEnergy(unary)
                    d.addPairwiseGaussian(sxy=3, compat=3)
                    d.addPairwiseBilateral(sxy=20,
                                           srgb=3,
                                           rgbim=proc_im,
                                           compat=10)
                    Q = d.inference(5)
                    pred_raw_dcrf = np.argmax(Q, axis=0).reshape(
                        (H, W)).astype('uint8') * 255
#                     pred_raw_dcrf = np.argmax(Q, axis=0).reshape((H, W)).astype(np.float32)
#                     predicts_dcrf = im_processing.resize_and_crop(pred_raw_dcrf, mask.shape[0], mask.shape[1])
                if visualize:
                    if dcrf:
                        cv2.imwrite(vis_path, pred_raw_dcrf)
#                         np.save(mask_path, np.array(pred_raw_dcrf))
#                         visualize_seg(vis_path, im, exp, predicts_dcrf)
                    else:
                        np.save(mask_path, np.array(sigm_val))


#                         cv2.imwrite(vis_path, pred_raw)
#                         visualize_seg(vis_path, im, exp, predicts)
#                         np.save(mask_path, np.array(pred_raw))
# I, U = eval_tools.compute_mask_IU(predicts, mask)
# IU_result.append({'batch_no': n_iter, 'I': I, 'U': U})
# mean_IoU += float(I) / U
# cum_I += I
# cum_U += U
# msg = 'cumulative IoU = %f' % (cum_I / cum_U)
# for n_eval_iou in range(len(eval_seg_iou_list)):
#     eval_seg_iou = eval_seg_iou_list[n_eval_iou]
#     seg_correct[n_eval_iou] += (I / U >= eval_seg_iou)
# if dcrf:
#     I_dcrf, U_dcrf = eval_tools.compute_mask_IU(predicts_dcrf, mask)
#     mean_dcrf_IoU += float(I_dcrf) / U_dcrf
#     cum_I_dcrf += I_dcrf
#     cum_U_dcrf += U_dcrf
#     msg += '\tcumulative IoU (dcrf) = %f' % (cum_I_dcrf / cum_U_dcrf)
#     for n_eval_iou in range(len(eval_seg_iou_list)):
#         eval_seg_iou = eval_seg_iou_list[n_eval_iou]
#         seg_correct_dcrf[n_eval_iou] += (I_dcrf / U_dcrf >= eval_seg_iou)
# print(msg)
    seg_total += 1
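# NOTE: load_frame_from_id (used above and in Example #10) is not shown.
# A hypothetical loader; the frame root and naming scheme are assumptions
# (e.g. a YouTube-VOS-style layout <root>/<vid>/<fid>.jpg):
import os
import skimage.io

FRAME_ROOT = './data/frames'  # hypothetical location

def load_frame_from_id(vid, fid):
    path = os.path.join(FRAME_ROOT, vid, '{}.jpg'.format(fid))
    if not os.path.exists(path):
        return None  # callers skip missing frames
    return skimage.io.imread(path)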
Example #10
def test(iter, dataset, visualize, setname, dcrf, mu, tfmodel_path, model_name, pre_emb=False):
    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    if visualize:
        save_dir = './' + dataset + '/visualization/' + str(iter) + '/'
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
    weights = os.path.join(tfmodel_path)
    print("Loading trained weights from {}".format(weights))

    score_thresh = 1e-9
    eval_seg_iou_list = [.5, .6, .7, .8, .9]
    cum_I, cum_U = 0, 0
    mean_IoU, mean_dcrf_IoU = 0, 0
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    if dcrf:
        cum_I_dcrf, cum_U_dcrf = 0, 0
        seg_correct_dcrf = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.
    T = 20  # maximum sentence length; longer sentences are truncated
    H, W = 320, 320
    vocab_size = 8803 if dataset == 'referit' else 12112
    emb_name = 'referit' if dataset == 'referit' else 'refvos'
    vocab_file = './data/vocabulary_refvos.txt'
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
    IU_result = list()

    if pre_emb:
        # use pretrained embedding
        print("Use pretrained Embeddings.")
        model = get_segmentation_model(model_name, H=H, W=W,
                                       mode='eval', 
                                       vocab_size=vocab_size, 
                                       emb_name=emb_name, 
                                       emb_dir=args.embdir)
    else:
        model = get_segmentation_model(model_name, H=H, W=W,
                                       mode='eval', vocab_size=vocab_size)

    # Load pretrained model
    snapshot_restorer = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    snapshot_restorer.restore(sess, weights)
     
    meta_expression = {}
    with open(args.meta) as meta_file:
        meta_expression = json.load(meta_file)
    videos = meta_expression['videos']
    plt.figure(figsize=[15, 4])
    sorted_video_key = ['a9f23c9150', '6cc8bce61a', '03fe6115d4', 'a46012c642', 'c42fdedcdd', 'ee9415c553', '7daa6343e6', '4fe6619a47', '0e8a6b63bb', '65e0640a2a', '8939473ea7', 'b05faf54f7', '5d2020eff8', 'a00c3fa88e', '44e5d1a969', 'deed0ab4fc', 'b205d868e6', '48d2909d9e', 'c9ef04fe59', '1e20ceafae', '0f3f8b2b2f', 'b83923fd72', 'cb06f84b6e', '17cba76927', '35d5e5149d', '62bf7630b3', '0390fabe58', 'bf2d38aefe', '8b7b57b94d', '8d803e87f7', 'c16d9a4ade', '1a1dbe153e', 'd975e5f4a9', '226f1e10f7', '6cb5b08d93', '77df215672', '466734bc5c', '94fa9bd3b5', 'f2a45acf1c', 'ba8823f2d2', '06cd94d38d', 'b772ac822a', '246e38963b', 'b5514f75d8', '188cb4e03d', '3dd327ab4e', '8e2e5af6a8', '450bd2e238', '369919ef49', 'a4bce691c6', '64c6f2ed76', '0782a6df7e', '0062f687f1', 'c74fc37224', 'f7255a57d0', '4f5b3310e3', 'e027ebc228', '30fe0ed0ce', '6a75316e99', 'a2948d4116', '8273b59141', 'abae1ce57d', '621487be65', '45dc90f558', '9787f452bf', 'cdcfd9f93a', '4f6662e4e0', '853ca85618', '13ca7bbcfd', 'f143fede6f', '92fde455eb', '0b0c90e21a', '5460cc540a', '182dbfd6ba', '85968ae408', '541ccb0844', '43115c42b2', '65350fd60a', 'eb49ce8027', 'e11254d3b9', '20a93b4c54', 'a0fc95d8fc', '696e01387c', 'fef7e84268', '72d613f21a', '8c60938d92', '975be70866', '13c3cea202', '4ee0105885', '01c88b5b60', '33e8066265', '8dea7458de', 'c280d21988', 'fd8cf868b2', '35948a7fca', 'e10236eb37', 'a1251195e7', 'b2256e265c', '2b904b76c9', '1ab5f4bbc5', '47d01d34c8', 'd7a38bf258', '1a609fa7ee', '218ac81c2d', '9f16d17e42', 'fb104c286f', 'eb263ef128', '37b4ec2e1a', '0daaddc9da', 'cd69993923', '31d3a7d2ee', '60362df585', 'd7ff44ea97', '623d24ce2b', '6031809500', '54526e3c66', '0788b4033d', '3f4bacb16a', '06a5dfb511', '9f21474aca', '7a19a80b19', '9a38b8e463', '822c31928a', 'd1ac0d8b81', 'eea1a45e49', '9f429af409', '33c8dcbe09', '9da2156a73', '3be852ed44', '3674b2c70a', '547416bda1', '4037d8305d', '29c06df0f2', '1335b16cf9', 'b7b7e52e02', 'bc9ba8917e', 'dab44991de', '9fd2d2782b', 'f054e28786', 'b00ff71889', 'eeb18f9d47', '559a611d86', 'dea0160a12', '257f7fd5b8', 'dc197289ef', 'c2bbd6d121', 'f3678388a7', '332dabe378', '63883da4f5', 'b90f8c11db', 'dce363032d', '411774e9ff', '335fc10235', '7775043b5e', '3e03f623bb', '19cde15c4b', 'bf4cc89b18', '1a894a8f98', 'f7d7fb16d0', '61fca8cbf1', 'd69812339e', 'ab9a7583f1', 'e633eec195', '0a598e18a8', 'b3b92781d9', 'cd896a9bee', 'b7928ea5c0', '69c0f7494e', 'cc1a82ac2a', '39b7491321', '352ad66724', '749f1abdf9', '7f26b553ae', '0c04834d61', 'd1dd586cfd', '3b72dc1941', '39bce09d8d', 'cbea8f6bea', 'cc7c3138ff', 'd59c093632', '68dab8f80c', '1e0257109e', '4307020e0f', '4b783f1fc5', 'ebe7138e58', '1f390d22ea', '7a72130f21', 'aceb34fcbe', '9c0b55cae5', 'b58a97176b', '152fe4902a', 'a806e58451', '9ce299a510', '97b38cabcc', 'f39c805b54', '0620b43a31', '0723d7d4fe', '7741a0fbce', '7836afc0c2', 'a7462d6aaf', '34564d26d8', '31e0beaf99']
    # sorted_video_key = ['6cc8bce61a']
    for vid_ind, vid in enumerate(sorted_video_key):
        print("Running on video {}/{}".format(vid_ind + 1, len(videos.keys())))
        expressions = videos[vid]['expressions']
        # instance_ids = [expression['obj_id'] for expression_id in videos[vid]['expressions']]
        frame_ids = videos[vid]['frames']
        for eid in expressions:
            exp = expressions[eid]['exp']
            index = int(eid)
            vis_dir = args.visdir
#             mask_dir = os.path.join(args.maskdir, str('{}/{}/'.format(vid, index)))
            if not os.path.exists(vis_dir):
                os.makedirs(vis_dir)
#             if not os.path.exists(mask_dir):
#                 os.makedirs(mask_dir)
            avg_time = 0
            total_frame = 0
            # Process text
            text = np.array(text_processing.preprocess_sentence(exp, vocab_dict, T))
            valid_idx = np.zeros([1], dtype=np.int32)
            for idx in range(text.shape[0]):
                if text[idx] != 0:
                    valid_idx[0] = idx
                    break
            for fid in frame_ids:
                frame_id = int(fid)
                if (frame_id % 20 != 0):
                    continue
                vis_path = os.path.join(vis_dir, str('{}_{}_{}.png'.format(vid,eid,fid)))
                frame = load_frame_from_id(vid, fid)
                if frame is None:
                    continue
                last_time = time.time()
#                 im = frame.copy()
                im = frame
#                 mask = np.array(frame, dtype=np.float32)

                proc_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, H, W))
                proc_im_ = proc_im.astype(np.float32)
                proc_im_ = proc_im_[:, :, ::-1]
                proc_im_ -= mu
                scores_val, up_val, sigm_val, up_c4 = sess.run(
                    [model.pred, model.up, model.sigm, model.up_c4],
                    feed_dict={
                        model.words: np.expand_dims(text, axis=0),
                        model.im: np.expand_dims(proc_im_, axis=0),
                        model.valid_idx: np.expand_dims(valid_idx, axis=0)
                    })
                # scores_val = np.squeeze(scores_val)
                # pred_raw = (scores_val >= score_thresh).astype(np.float32)
                up_c4 = im_processing.resize_and_crop(sigmoid(np.squeeze(up_c4)), frame.shape[0], frame.shape[1])
                sigm_val = im_processing.resize_and_crop(sigmoid(np.squeeze(sigm_val)), frame.shape[0], frame.shape[1])
                up_val = np.squeeze(up_val)
                # if (not math.isnan(consistency_score) and consistency_score < 0.3):
                plt.clf()
                plt.subplot(1, 3, 1)
                plt.imshow(frame)
                # consistency_score is computed elsewhere in the original
                # script and is not defined in this snippet.
                plt.text(-0.7, -0.7, exp)
                plt.subplot(1, 3, 2)
                plt.imshow(up_c4)
                plt.subplot(1, 3, 3)
                plt.imshow(sigm_val)
                plt.savefig(vis_path)
#                 pred_raw = (up_val >= score_thresh).astype('uint8') * 255
#                 pred_raw = (up_val >= score_thresh).astype(np.float32)
#                 predicts = im_processing.resize_and_crop(pred_raw, mask.shape[0], mask.shape[1])
#                 if dcrf:
#                     # Dense CRF post-processing
#                     sigm_val = np.squeeze(sigm_val) + 1e-7
#                     d = densecrf.DenseCRF2D(W, H, 2)
#                     U = np.expand_dims(-np.log(sigm_val), axis=0)
#                     U_ = np.expand_dims(-np.log(1 - sigm_val), axis=0)
#                     unary = np.concatenate((U_, U), axis=0)
#                     unary = unary.reshape((2, -1))
#                     d.setUnaryEnergy(unary)
#                     d.addPairwiseGaussian(sxy=3, compat=3)
#                     d.addPairwiseBilateral(sxy=20, srgb=3, rgbim=proc_im, compat=10)
#                     Q = d.inference(5)
#                     pred_raw_dcrf = np.argmax(Q, axis=0).reshape((H, W)).astype('uint8') * 255
# #                     pred_raw_dcrf = np.argmax(Q, axis=0).reshape((H, W)).astype(np.float32)
# #                     predicts_dcrf = im_processing.resize_and_crop(pred_raw_dcrf, mask.shape[0], mask.shape[1])
#                 if visualize:
#                     if dcrf:
#                         cv2.imwrite(vis_path, pred_raw_dcrf)
# #                         np.save(mask_path, np.array(pred_raw_dcrf))
# #                         visualize_seg(vis_path, im, exp, predicts_dcrf)
#                     else:
#                         np.save(mask_path, np.array(sigm_val))
#                         cv2.imwrite(vis_path, pred_raw)
#                         visualize_seg(vis_path, im, exp, predicts)
#                         np.save(mask_path, np.array(pred_raw))
    # I, U = eval_tools.compute_mask_IU(predicts, mask)
    # IU_result.append({'batch_no': n_iter, 'I': I, 'U': U})
    # mean_IoU += float(I) / U
    # cum_I += I
    # cum_U += U
    # msg = 'cumulative IoU = %f' % (cum_I / cum_U)
    # for n_eval_iou in range(len(eval_seg_iou_list)):
    #     eval_seg_iou = eval_seg_iou_list[n_eval_iou]
    #     seg_correct[n_eval_iou] += (I / U >= eval_seg_iou)
    # if dcrf:
    #     I_dcrf, U_dcrf = eval_tools.compute_mask_IU(predicts_dcrf, mask)
    #     mean_dcrf_IoU += float(I_dcrf) / U_dcrf
    #     cum_I_dcrf += I_dcrf
    #     cum_U_dcrf += U_dcrf
    #     msg += '\tcumulative IoU (dcrf) = %f' % (cum_I_dcrf / cum_U_dcrf)
    #     for n_eval_iou in range(len(eval_seg_iou_list)):
    #         eval_seg_iou = eval_seg_iou_list[n_eval_iou]
    #         seg_correct_dcrf[n_eval_iou] += (I_dcrf / U_dcrf >= eval_seg_iou)
    # print(msg)
    seg_total += 1
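# NOTE: sigmoid (applied to the raw up_c4 / sigm_val scores above) is not
# defined in the snippet; a plain numpy version:
import numpy as np

def sigmoid(x):
    # Elementwise logistic function; may warn on extreme inputs but the
    # results saturate correctly toward 0 or 1.
    return 1.0 / (1.0 + np.exp(-x))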
Example #11
def inference():
    with open('./seg_model/test.prototxt', 'w') as f:
        f.write(str(seg_model.generate_model('val', test_config.N)))

    caffe.set_device(test_config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    net = caffe.Net('./seg_model/test.prototxt',
                    test_config.pretrained_model,
                    caffe.TEST)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    query_dict = json.load(open(test_config.query_file))
    bbox_dict = json.load(open(test_config.bbox_file))
    imcrop_dict = json.load(open(test_config.imcrop_file))
    imsize_dict = json.load(open(test_config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(test_config.vocab_file)

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append((imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    cum_I, cum_U = 0.0, 0.0
    eval_seg_iou_list = [0.5, 0.6, 0.7, 0.8, 0.9]
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.0

    # Pre-allocate arrays
    imcrop_val = np.zeros((test_config.N, test_config.input_H, test_config.input_W, 3), dtype=np.float32)
    text_seq_val = np.zeros((test_config.T, test_config.N), dtype=np.int32)

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]

        # Extract visual features from all proposals
        im = skimage.io.imread(test_config.image_dir + imname)
        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, test_config.input_H, test_config.input_W))

        if processed_im.ndim == 2:
            processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))

        imcrop_val[...] = processed_im.astype(np.float32) - seg_model.channel_mean
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract spatial features
        spatial_val = processing_tools.generate_spatial_batch(test_config.N,
                                                              test_config.featmap_H,
                                                              test_config.featmap_W)
        spatial_val = spatial_val.transpose((0, 3, 1, 2))

        for imcrop_name, _, description in flat_query_dict[imname]:
            mask = load_gt_mask(test_config.mask_dir + imcrop_name + '.mat').astype(np.float32)
            labels = (mask > 0)
            processed_labels = im_processing.resize_and_pad(mask, test_config.input_H, test_config.input_W)
            processed_labels = processed_labels > 0

            text_seq_val[:, 0] = text_processing.preprocess_sentence(description, vocab_dict, test_config.T)
            cont_val = text_processing.create_cont(text_seq_val)

            net.blobs['language'].data[...] = text_seq_val
            net.blobs['cont'].data[...] = cont_val
            net.blobs['image'].data[...] = imcrop_val_trans
            net.blobs['spatial'].data[...] = spatial_val
            net.blobs['label'].data[...] = processed_labels

            net.forward()
            upscores = net.blobs['upscores'].data[...].copy()
            upscores = np.squeeze(upscores)

            # Evaluate the segmentation performance
            pred_raw = (upscores >= test_config.score_thresh).astype(np.float32)
            predicts = im_processing.resize_and_crop(pred_raw, im.shape[0], im.shape[1])
            I, U = eval_tools.compute_mask_IU(predicts, labels)
            cum_I += I
            cum_U += U
            this_IoU = I / float(U)
            for n_eval_iou in range(len(eval_seg_iou_list)):
                eval_seg_iou = eval_seg_iou_list[n_eval_iou]
                seg_correct[n_eval_iou] += (this_IoU >= eval_seg_iou)
            seg_total += 1


    # Print results
    print('Final results on the whole test set')
    result_str = ''
    for n_eval_iou in range(len(eval_seg_iou_list)):
        result_str += 'precision@%s = %f\n' % \
            (str(eval_seg_iou_list[n_eval_iou]), seg_correct[n_eval_iou]/seg_total)
    result_str += 'overall IoU = %f\n' % (cum_I/cum_U)
    print(result_str)
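# NOTE: im_processing.resize_and_pad / resize_and_crop are used throughout
# but not shown. A plausible sketch: fit the image into the target size
# without changing aspect ratio, zero-pad the remainder, and invert that
# mapping for predictions. Rounding and interpolation details are assumptions.
import numpy as np
import skimage.transform

def resize_and_pad(im, input_h, input_w):
    # Scale so the image fits inside (input_h, input_w), then zero-pad
    # the bottom/right edges to the exact target size.
    scale = min(input_h / float(im.shape[0]), input_w / float(im.shape[1]))
    rh, rw = int(round(im.shape[0] * scale)), int(round(im.shape[1] * scale))
    out = np.zeros((input_h, input_w) + im.shape[2:], dtype=im.dtype)
    out[:rh, :rw, ...] = skimage.transform.resize(
        im, (rh, rw), preserve_range=True).astype(im.dtype)
    return out

def resize_and_crop(im, out_h, out_w):
    # Inverse of resize_and_pad: take the valid (unpadded) region and
    # scale it back to the original image size (out_h, out_w).
    scale = max(out_h / float(im.shape[0]), out_w / float(im.shape[1]))
    vh, vw = int(round(out_h / scale)), int(round(out_w / scale))
    return skimage.transform.resize(
        im[:vh, :vw, ...], (out_h, out_w), preserve_range=True).astype(im.dtype)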
Example #12
def inference(config):
    with open('./det_model/fc8.prototxt', 'w') as f:
        f.write(str(det_model.generate_fc8('val', config)))
    with open('./det_model/scores.prototxt', 'w') as f:
        f.write(str(det_model.generate_scores('val', config)))

    caffe.set_device(config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    fc8_net = caffe.Net('./det_model/fc8.prototxt',
                        config.pretrained_model,
                        caffe.TEST)

    scores_net = caffe.Net('./det_model/scores.prototxt',
                           config.pretrained_model,
                           caffe.TEST)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    query_dict = json.load(open(config.query_file))
    bbox_dict = json.load(open(config.bbox_file))
    imcrop_dict = json.load(open(config.imcrop_file))
    imsize_dict = json.load(open(config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(config.vocab_file)

    # Object proposals
    bbox_proposal_dict = {}
    for imname in imlist:
        bboxes = np.loadtxt(config.bbox_proposal_dir + imname[:-4] + '.txt').astype(int).reshape((-1, 4))
        bbox_proposal_dict[imname] = bboxes

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append((imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    eval_bbox_num_list = [1, 10, 100]
    bbox_correct = np.zeros(len(eval_bbox_num_list), dtype=np.int32)
    bbox_total = 0

    # Pre-allocate arrays
    imcrop_val = np.zeros((config.N, config.input_H, config.input_W, 3), dtype=np.float32)
    spatial_val = np.zeros((config.N, 8), dtype=np.float32)
    text_seq_val = np.zeros((config.T, config.N), dtype=np.int32)

    dummy_text_seq = np.zeros((config.T, config.N), dtype=np.int32)
    dummy_cont = np.zeros((config.T, config.N), dtype=np.int32)
    dummy_label = np.zeros((config.N, 1))

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]
        imsize = imsize_dict[imname]
        bbox_proposals = bbox_proposal_dict[imname]
        num_proposal = bbox_proposals.shape[0]
        assert(config.N >= num_proposal)

        # Extract visual features from all proposals
        im = skimage.io.imread(config.image_dir + imname)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))
        imcrop_val[:num_proposal, ...] = im_processing.crop_bboxes_subtract_mean(
            im, bbox_proposals, config.input_H, det_model.channel_mean)
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract bounding box features from proposals
        spatial_val[:num_proposal, ...] = \
            processing_tools.spatial_feature_from_bbox(bbox_proposals, imsize)

        fc8_net.blobs['language'].data[...] = dummy_text_seq
        fc8_net.blobs['cont'].data[...] = dummy_cont
        fc8_net.blobs['image'].data[...] = imcrop_val_trans
        fc8_net.blobs['spatial'].data[...] = spatial_val
        fc8_net.blobs['label'].data[...] = dummy_label

        fc8_net.forward()
        fc8_val = fc8_net.blobs['fc8'].data[...].copy()

        # Extract textual features from sentences
        for imcrop_name, gt_bbox, description in flat_query_dict[imname]:
            proposal_IoUs = eval_tools.compute_bbox_iou(bbox_proposals, gt_bbox)

            # Extract language feature
            text = text_processing.preprocess_sentence(description, vocab_dict, config.T)
            text_seq_val[...] = np.array(text, dtype=np.int32).reshape((-1, 1))

            cont_val = text_processing.create_cont(text_seq_val)

            scores_net.blobs['language'].data[...] = text_seq_val
            scores_net.blobs['cont'].data[...] = cont_val
            scores_net.blobs['img_feature'].data[...] = fc8_val
            scores_net.blobs['spatial'].data[...] = spatial_val
            scores_net.blobs['label'].data[...] = dummy_label

            scores_net.forward()

            scores_val = scores_net.blobs['scores'].data.copy()
            scores_val = scores_val[:num_proposal, ...].reshape(-1)

            # Sort the scores for the proposals
            if config.use_nms:
                top_ids = eval_tools.nms(bbox_proposals.astype(np.float32), scores_val, config.nms_thresh)
            else:
                top_ids = np.argsort(scores_val)[::-1]

            # Evaluate on bounding boxes
            for n_eval_num in range(len(eval_bbox_num_list)):
                eval_bbox_num = eval_bbox_num_list[n_eval_num]
                bbox_correct[n_eval_num] += \
                    np.any(proposal_IoUs[top_ids[:eval_bbox_num]] >= config.correct_iou_thresh)
            bbox_total += 1

    print('Final results on the whole test set')
    result_str = ''
    for n_eval_num in range(len(eval_bbox_num_list)):
        result_str += 'recall@%s = %f\n' % \
            (str(eval_bbox_num_list[n_eval_num]), bbox_correct[n_eval_num]/bbox_total)
    print(result_str)
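# NOTE: eval_tools.nms is not shown. A standard greedy non-maximum
# suppression sketch matching the call above (boxes as [x1, y1, x2, y2];
# returns kept indices, highest score first):
import numpy as np

def nms(boxes, scores, thresh):
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = np.argsort(scores)[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # Overlap of the current top box with every remaining box.
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1 + 1) * np.maximum(0.0, yy2 - yy1 + 1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][iou <= thresh]  # drop boxes that overlap too much
    return np.array(keep)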
Example #13
def build_coco_batches(dataset, setname, T, input_H, input_W):
    im_dir = './data/coco/images'
    im_type = 'train2014'
    vocab_file = './data/vocabulary_spacy_Gref.txt'

    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    print("data_folder:", data_folder)

    if dataset == 'Gref':
        refer = REFER('./external/refer/data',
                      dataset='refcocog',
                      splitBy='google')
    elif dataset == 'unc':
        refer = REFER('./external/refer/data',
                      dataset='refcoco',
                      splitBy='unc')
    elif dataset == 'unc+':
        refer = REFER('./external/refer/data',
                      dataset='refcoco+',
                      splitBy='unc')
    else:
        raise ValueError('Unknown dataset %s' % dataset)
    refs = [
        refer.Refs[ref_id] for ref_id in refer.Refs
        if refer.Refs[ref_id]['split'] == setname
    ]
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    n_batch = 0

    # spacy load
    nlp = spacy.load("en_core_web_sm")
    SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')

    for ref in refs:
        im_name = 'COCO_' + im_type + '_' + str(ref['image_id']).zfill(12)
        im = skimage.io.imread('%s/%s/%s.jpg' % (im_dir, im_type, im_name))
        seg = refer.Anns[ref['ann_id']]['segmentation']
        rle = cocomask.frPyObjects(seg, im.shape[0], im.shape[1])
        mask = np.max(cocomask.decode(rle), axis=2).astype(np.float32)

        if 'train' in setname:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask, input_H, input_W)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        for sentence in ref['sentences']:
            print('saving batch %d' % (n_batch + 1))
            sent = sentence['sent'].lower()
            words = SENTENCE_SPLIT_REGEX.split(sent.strip())
            words = [w for w in words if len(w.strip()) > 0]
            # remove .
            if words[-1] == '.':
                words = words[:-1]
            if len(words) > 20:
                words = words[:20]
            n_sent = ""
            for w in words:
                n_sent = n_sent + w + ' '
            n_sent = n_sent.strip()
            # Python 2 idiom: ensure the sentence is unicode before spaCy
            # (on Python 3, strings are already unicode).
            try:
                n_sent = n_sent.decode("utf-8")
            except UnicodeEncodeError:
                continue
            doc = nlp(n_sent)
            if len(doc) > 30:
                continue
            text, graph, height = text_processing.preprocess_spacy_sentence(
                doc, vocab_dict, T)

            np.savez(file=data_folder + data_prefix + '_' + str(n_batch) +
                     '.npz',
                     text_batch=text,
                     im_batch=im,
                     mask_batch=(mask > 0),
                     sent_batch=[n_sent],
                     graph_batch=graph,
                     height_batch=np.array([height], dtype=np.int32))
            n_batch += 1
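# NOTE: text_processing.preprocess_spacy_sentence returns (text, graph,
# height) but is not shown. One plausible reading: graph holds each token's
# dependency-head index and height the depth of the parse tree, e.g.:
import spacy

def dependency_graph_and_height(doc):
    # Head index per token (-1 marks the root, whose head is itself).
    graph = [tok.head.i if tok.head is not tok else -1 for tok in doc]
    def depth(tok):
        d = 0
        while tok.head is not tok:
            tok, d = tok.head, d + 1
        return d
    height = max(depth(tok) for tok in doc) + 1
    return graph, height

# Usage: graph, height = dependency_graph_and_height(nlp('man in red shirt'))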
Example #14
def build_referit_batches(setname, T, input_H, input_W):
    # data directory
    im_dir = './data/referit/images/'
    mask_dir = './data/referit/mask/'
    query_file = './data/referit_query_' + setname + '.json'
    vocab_file = './data/vocabulary_spacy_referit.txt'

    # saving directory
    data_folder = './referit/' + setname + '_batch/'
    data_prefix = 'referit_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    # load annotations
    query_dict = json.load(open(query_file))
    im_list = query_dict.keys()
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    # collect training samples
    samples = []
    for n_im, name in enumerate(im_list):
        im_name = name.split('_', 1)[0] + '.jpg'
        mask_name = name + '.mat'
        for sent in query_dict[name]:
            samples.append((im_name, mask_name, sent))

    # save batches to disk
    # spacy load
    nlp = spacy.load("en_core_web_sm")
    SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
    num_batch = len(samples)
    valid = 0
    for n_batch in range(num_batch):
        print('saving batch %d / %d' % (n_batch + 1, num_batch))
        im_name, mask_name, sent = samples[n_batch]
        im = skimage.io.imread(im_dir + im_name)
        mask = load_gt_mask(mask_dir + mask_name).astype(np.float32)

        if 'train' in setname:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask, input_H, input_W)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        sent = sent.lower()
        words = SENTENCE_SPLIT_REGEX.split(sent.strip())
        words = [w for w in words if len(w.strip()) > 0]
        # drop a trailing period and cap the sentence at 20 words
        if words and words[-1] == '.':
            words = words[:-1]
        if len(words) > 20:
            words = words[:20]
        n_sent = ' '.join(words)
        # skip sentences that are not valid UTF-8
        try:
            n_sent = n_sent.encode('utf-8').decode('utf-8')
        except UnicodeDecodeError:
            continue
        doc = nlp(n_sent)
        # skip sentences that spaCy tokenizes into more than 30 tokens
        if len(doc) > 30:
            continue

        text, graph, height = text_processing.preprocess_spacy_sentence(
            doc, vocab_dict, T)

        np.savez(file=data_folder + data_prefix + '_' + str(valid) + '.npz',
                 text_batch=text,
                 im_batch=im,
                 mask_batch=(mask > 0),
                 sent_batch=[n_sent],
                 graph_batch=graph,
                 height_batch=np.array([height], dtype=np.int32))
        valid += 1
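
Because SENTENCE_SPLIT_REGEX uses a capture group, re.split keeps the delimiters, so punctuation survives tokenization and the trailing-period check above can fire. A quick illustration:

import re

SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
words = [w for w in SENTENCE_SPLIT_REGEX.split('the man on the left.')
         if w.strip()]
print(words)  # ['the', 'man', 'on', 'the', 'left', '.']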
Example #15
    def __init__(self, config, use_category=True):

        option = '%s_%s' % (config.dataset, config.split)
        data_path = './data/raw/%s/data.json' % option
        vis_feat_path = './data/vis_feats/%s_ann_vis_feats.pkl' % config.dataset
        vocab_file = './data/word_embedding/vocabulary_72700.txt'
        info_print = config.info_print

        # load data
        self.use_category = use_category
        self.info_print = info_print
        with open(data_path) as data_file:
            self.data = json.load(data_file)
        self.vis_feat_path = vis_feat_path
        self.vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

        # index to word
        self.ix_to_word = self.data['ix_to_word']
        self.word_to_ix = self.data['word_to_ix']

        # restructure refs, anns, sentences and images into dicts keyed by id
        self.refs = self.to_dict('refs', 'ref_id')
        self.anns = self.to_dict('anns', 'ann_id')
        self.sents = self.to_dict('sentences', 'sent_id')
        self.images = self.to_dict('images', 'image_id')

        # collect ref_ids, ann_ids, image_ids
        self.ref_ids = list(self.refs.keys())
        self.ann_ids = list(self.anns.keys())
        self.image_ids = list(self.images.keys())

        self.print_info('We have %d images.' % len(self.image_ids))
        self.print_info('We have %d anns.' % len(self.ann_ids))
        self.print_info('We have %d refs.' % len(self.ref_ids))

        # collect ref_to_ann, ref_to_sents, ann_to_image, image_to_anns, etc
        self.ref_to_ann = self.key_to_key('ref', 'ann_id')
        if use_category:
            self.ref_to_cat = self.key_to_key('ref', 'category_id')
        self.ref_to_image = self.key_to_key('ref', 'image_id')
        self.ref_to_sents = self.key_to_key('ref', 'sent_ids')
        self.ann_to_image = self.key_to_key('ann', 'image_id')
        if use_category:
            self.ann_to_cat = self.key_to_key('ann', 'category_id')
        self.ann_to_box = self.key_to_key('ann', 'box')
        self.image_to_anns = self.key_to_key('image', 'ann_ids')
        self.image_to_refs = self.key_to_key('image', 'ref_ids')
        self.print_info('Mapping finished.')

        # collect visual and spatial features
        self.print_info('Collecting visual and spatial features...')
        self.ann_spa_feats = self.fetch_spa_feat()  # spatial feature
        self.ann_vis_feats = self.fetch_vis_feat()  # visual feature

        # collect sets of same-category and different-category anns
        if use_category:
            self.st_anns, self.dt_anns = self.fetch_nn_ids()

        # collect difference ("dif") features
        if use_category:
            self.print_info('Calculating dif features...')
            self.ann_spadif_feats = self.fetch_spadif_feat()  # spadif feature
            self.ann_visdif_feats = self.fetch_visdif_feat()  # visdif feature

        # collect train/val split ids
        self.print_info('Splitting image ids...')
        self.image_split_ids = {}
        self.batch_list = {}
        self.num_batch = {}
        self.epoch = {}
        split_list = ['train', 'val', 'test', 'testA', 'testB']
        for split in split_list:
            self.image_split_ids[split] = self.get_split_ids(split)
            self.batch_list[split] = []
            self.num_batch[split] = len(self.image_split_ids[split])
            self.epoch[split] = -1

        self.print_info('Initialization finished.')
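
The constructor relies on helpers (to_dict, key_to_key, print_info) that are defined elsewhere in the class. A minimal sketch of plausible implementations, assuming self.data[key] is a list of dicts:

    def to_dict(self, key, id_key):
        # index a list of records from self.data by their id field
        return {item[id_key]: item for item in self.data[key]}

    def key_to_key(self, key, target):
        # e.g. key_to_key('ref', 'ann_id') -> {ref_id: ann_id}
        records = getattr(self, key + 's')
        return {k: v[target] for k, v in records.items()}

    def print_info(self, msg):
        if self.info_print:
            print(msg)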
Example #16
def build_a2d_batches(T, input_H, input_W, video=False):
    """
    Build data batches of A2D Sentence dataset

    Args:
         T: maximum number of words kept per sentence
         input_H: height of the input frame for the I3D backbone
         input_W: width of the input frame for the I3D backbone
         video: if True, save 16 consecutive frames around the ground-truth
             frame; otherwise save only the standalone frame
    """

    query_file = os.path.join(a2d_dir, 'a2d_annotation.txt')
    frame_dir = os.path.join(a2d_dir, 'Release/frames')
    vocab_file = os.path.join(root_dir, 'data/vocabulary_Gref.txt')

    dataset_name = 'a2d_sent_new'
    out_dataset_dir = os.path.join(root_dir, dataset_name)
    if not os.path.exists(out_dataset_dir):
        os.mkdir(out_dataset_dir)
    test_batch = os.path.join(out_dataset_dir, 'test_batch')
    train_batch = os.path.join(out_dataset_dir, 'train_batch')
    if not os.path.exists(test_batch):
        os.mkdir(test_batch)
    if not os.path.exists(train_batch):
        os.mkdir(train_batch)

    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
    test_prefix_list = list()
    train_prefix_list = list()
    split_dict = gen_split_dict()
    SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')

    with open(query_file, 'r') as f:
        reader = csv.reader(f)
        next(reader)
        total_count = 0
        test_count = 0
        train_count = 0
        all_zero_mask_count = 0
        for row in tqdm(reader):
            # each video belongs to test or train
            video_id = row[0]
            data_prefix = video_id
            if split_dict[data_prefix] == 1:
                save_dir = test_batch
                test_prefix_list.append(data_prefix)
                test = True
            else:
                save_dir = train_batch
                train_prefix_list.append(data_prefix)
                test = False
            # load sentence
            instance_id = int(row[1])
            sent = row[2].lower()
            words = SENTENCE_SPLIT_REGEX.split(sent.strip())
            words = [w for w in words if len(w.strip()) > 0]
            # drop a trailing period and cap the sentence at T words
            if words and words[-1] == '.':
                words = words[:-1]
            if len(words) > T:
                words = words[:T]
            n_sent = ' '.join(words)
            text = text_processing.preprocess_sentence(n_sent, vocab_dict, T)

            image_paths = list()
            # for each video, get all the gt masks of a certain instance
            masks, frame_ids = get_masks(video_id, instance_id)

            for frame_id in frame_ids:
                image_path = os.path.join(frame_dir, video_id,
                                          '{:0>5d}.png'.format(frame_id))
                image_paths.append(image_path)

            for frame_id, image_path, mask in zip(frame_ids, image_paths,
                                                  masks):
                # abandon all zero mask batch
                if np.sum(mask) == 0:
                    print("all zeros mask caught")
                    all_zero_mask_count += 1
                    continue
                if video:
                    # obtain 16 consecutive frames centered at the gt frame
                    frame_paths = frame_range(frame_id=frame_id,
                                              frame_dir=os.path.join(
                                                  frame_dir, video_id))
                else:
                    # only use the gt frame
                    frame_paths = list()
                frames = list()
                if test:
                    count = test_count
                    test_count = test_count + 1
                    prefix = 'test_'
                    image = skimage.io.imread(image_path)
                    for frame_path in frame_paths:
                        frames.append(skimage.io.imread(frame_path))
                else:
                    prefix = 'train_'
                    count = train_count
                    train_count = train_count + 1
                    image = skimage.io.imread(image_path)
                    image = skimage.img_as_ubyte(
                        im_processing.resize_and_pad(image, input_H, input_W))
                    mask = im_processing.resize_and_pad(mask, input_H, input_W)
                    for frame_path in frame_paths:
                        frame = skimage.io.imread(frame_path)
                        frame = skimage.img_as_ubyte(
                            im_processing.resize_and_pad(
                                frame, input_H, input_W))
                        frames.append(frame)

                if debug:
                    m0 = mask[:, :, np.newaxis]
                    m0 = (m0 > 0).astype(np.uint8)
                    m0 = np.concatenate([m0, m0, m0], axis=2)
                    debug_image = image * m0
                    skimage.io.imsave(
                        './debug/{}_{}_{}.png'.format(data_prefix, frame_id,
                                                      sent.replace(' ', '_')),
                        debug_image)

                # save batches
                np.savez(file=os.path.join(
                    save_dir, dataset_name + '_' + prefix + str(count)),
                         text_batch=text,
                         mask_batch=(mask > 0),
                         sent_batch=[sent],
                         im_batch=image,
                         frame_id=frame_id,
                         frames=frames)
                total_count = total_count + 1

        print()
        print("num of all zeros masks is: {}".format(all_zero_mask_count))
Example #17
def build_refvos_batch(setname,
                       T,
                       input_H,
                       input_W,
                       im_dir,
                       mask_dir,
                       meta_expressions,
                       save_dir,
                       inrange=None):
    vocab_file = './data/vocabulary_Gref.txt'

    print(save_dir)
    # saving directory
    data_folder = os.path.join(save_dir, 'refvos/' + setname + '_batch/')
    data_prefix = 'refvos_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    # load annotations
    query_dict = json.load(open(meta_expressions))
    videos = query_dict['videos']
    samples = []
    for vid in videos:
        video = videos[vid]
        expressions = video['expressions']
        frames = video['frames']
        for eid in expressions:
            exp = expressions[eid]['exp']
            obj_id = expressions[eid]['obj_id']
            for fid in frames:
                im_name = os.path.join(vid, fid + '.jpg')
                mask_name = os.path.join(vid, fid + '.png')
                samples.append((im_name, mask_name, exp, obj_id))

    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    # save batches to disk
    num_batch = len(samples)
    batch_ind = 0
    if inrange is None:
        inrange = range(num_batch)
    for n_batch in inrange:
        print('saving batch %d / %d' % (n_batch + 1, num_batch))
        im_name, mask_name, sent, obj_id = samples[n_batch]
        im_path = os.path.join(im_dir, im_name)
        mask_path = os.path.join(mask_dir, mask_name)
        if not (os.path.exists(im_path) and os.path.exists(mask_path)):
            continue
        im = skimage.io.imread(im_path)
        mask = skimage.io.imread(mask_path)[:, :, :3]
        mask_color = object_color[obj_id]
        mask_obj = np.asarray(mask == mask_color)
        if mask_obj.ndim == 0:
            # comparison failed to broadcast against this mask; skip it
            continue
        mask_obj = mask_obj[:, :, 0]
        if np.max(mask_obj) == 0:
            print(im_name)
            continue
        if 'train' in setname:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask_obj, input_H, input_W)
        else:
            # use the binary object mask, not the raw RGB annotation
            mask = mask_obj
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        text = text_processing.preprocess_sentence(sent, vocab_dict, T)

        # files are named by the original sample index so that a partial
        # `inrange` run still produces consistent file names
        np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
                 text_batch=text,
                 im_batch=im,
                 mask_batch=(mask > 0),
                 sent_batch=[sent])
        batch_ind += 1
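
object_color maps an object id to the RGB value that object carries in the annotation PNGs. The real table comes from the dataset's palette; the mapping below is only an illustrative assumption following standard DAVIS-style indexed-PNG colors:

# hypothetical palette; replace with the dataset's actual color table
object_color = {
    '1': [128, 0, 0],
    '2': [0, 128, 0],
    '3': [128, 128, 0],
    '4': [0, 0, 128],
}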