def compute_targets(ground_truth, proposals, pos_thresh=0.7, neg_thresh=0.3):
    # Get list of videos.
    video_set = set(ground_truth['video-id'].unique()).intersection(
        proposals['video-id'].unique())

    # Group by video-id so per-video lookups are faster
    ground_truth_gbvn = ground_truth.groupby('video-id')
    proposals_gbvn = proposals.groupby('video-id')

    proposal_targets = proposals.copy()
    # label -1 for ignore
    proposal_labels = np.full(proposal_targets.shape[0], -1)
    proposal_tiou = np.full(proposal_targets.shape[0], -1.0)

    # For each video, compute tIoU scores between the proposals and the ground truth
    for videoid in video_set:
        ground_truth_videoid = ground_truth_gbvn.get_group(videoid)
        this_video_ground_truth_idx = ground_truth_videoid.reset_index()
        this_video_ground_truth = this_video_ground_truth_idx.loc[:, ['t-start', 't-end']].values

        proposals_videoid = proposals_gbvn.get_group(videoid)
        this_video_proposals = proposals_videoid.loc[:, ['t-start', 't-end']].values
        this_video_proposals_idx = proposals_videoid.index

        for idx, this_proposal in enumerate(this_video_proposals):
            tiou = segment_iou(this_proposal, this_video_ground_truth)
            argmax = tiou.argmax()
            row = this_video_proposals_idx[idx]
            if tiou[argmax] > pos_thresh:
                # foreground: copy the label of the best-matching ground truth
                proposal_labels[row] = this_video_ground_truth_idx.label[argmax]
            elif tiou[argmax] < neg_thresh:
                proposal_labels[row] = 0  # background
            proposal_tiou[row] = tiou[argmax]

    # Select samples: keep all positives and subsample negatives to roughly num_pos / class_num
    pos_idxs = np.where(proposal_labels > 0)[0]
    num_pos = pos_idxs.shape[0]
    neg_idxs = np.where(proposal_labels == 0)[0]
    class_num = max(proposal_labels)
    num_neg = min(int(num_pos / class_num), neg_idxs.shape[0])
    neg_idxs = np.random.permutation(neg_idxs)
    proposal_labels[neg_idxs[num_neg:]] = -1

    proposal_targets['label'] = proposal_labels
    proposal_targets['tiou'] = proposal_tiou
    proposal_targets = proposal_targets[
        proposal_targets.label != -1].reset_index()

    if DEBUG:
        # per-class sample counts (the 201 here assumes label ids 0..200)
        for l in range(201):
            num = sum(proposal_targets['label'].values == l)
            print(num, " samples for class ", l)
    return proposal_targets
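# Note: every example on this page calls a segment_iou helper that is not part of
# the listing. A minimal sketch of what it is assumed to compute -- temporal IoU of
# one [t-start, t-end] segment against an (N, 2) array of candidate segments:
import numpy as np

def segment_iou(target_segment, candidate_segments):
    """Assumed behaviour: temporal IoU of one segment against N candidates."""
    tt1 = np.maximum(target_segment[0], candidate_segments[:, 0])
    tt2 = np.minimum(target_segment[1], candidate_segments[:, 1])
    # Intersection length, clipped at zero for disjoint segments.
    intersection = np.clip(tt2 - tt1, 0, None)
    # Union = sum of both segment lengths minus the intersection.
    union = ((candidate_segments[:, 1] - candidate_segments[:, 0])
             + (target_segment[1] - target_segment[0]) - intersection)
    return intersection.astype(float) / union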
Example #2
def _get_pos_neg(split_path, annotations, vid, slide_window_size, sampling_sec,
                 anc_len_all, anc_cen_all, pos_thresh, neg_thresh):
    if os.path.isfile(os.path.join(split_path, vid + '_bn.npy')):
        print('video: {}'.format(vid))

        video_prefix = os.path.join(split_path, vid)

        # load feature
        # T x H
        # multiprocessing caused errors with torch tensors, so the features are loaded with numpy instead

        # resnet_feat = torch.from_numpy(
        #     np.load(video_prefix + '_resnet.npy')).float()
        # bn_feat = torch.from_numpy(
        #     np.load(video_prefix + '_bn.npy')).float()
        # if resnet_feat.size(0) != bn_feat.size(0):
        #     raise Exception(
        #         'number of frames does not match in feature!')
        # total_frame = bn_feat.size(0)

        resnet_feat = np.load(video_prefix + '_resnet.npy')
        bn_feat = np.load(video_prefix + '_bn.npy')
        if resnet_feat.shape[0] != bn_feat.shape[0]:
            raise Exception('number of frames does not match in feature!')
        total_frame = bn_feat.shape[0]

        window_start = 0
        window_end = slide_window_size
        window_start_t = window_start * sampling_sec
        window_end_t = window_end * sampling_sec
        pos_seg = defaultdict(list)
        neg_overlap = [0] * anc_len_all.shape[0]
        pos_collected = [False] * anc_len_all.shape[0]
        for j in range(anc_len_all.shape[0]):
            potential_match = []
            for ann_idx, ann in enumerate(annotations):
                seg = ann['segment']
                gt_start = seg[0] / sampling_sec
                gt_end = seg[1] / sampling_sec
                if gt_start > gt_end:
                    gt_start, gt_end = gt_end, gt_start
                if anc_cen_all[j] + anc_len_all[j] / 2. <= total_frame:
                    if (window_start_t <= seg[0] and
                            window_end_t + sampling_sec * 2 >= seg[1]):
                        overlap = segment_iou(
                            np.array([gt_start, gt_end]),
                            np.array([[
                                anc_cen_all[j] - anc_len_all[j] / 2.,
                                anc_cen_all[j] + anc_len_all[j] / 2.
                            ]]))

                        neg_overlap[j] = max(overlap, neg_overlap[j])

                        if not pos_collected[j] and overlap >= pos_thresh:
                            len_offset = math.log(
                                (gt_end - gt_start) / anc_len_all[j])
                            cen_offset = ((gt_end + gt_start) / 2. -
                                          anc_cen_all[j]) / anc_len_all[j]
                            potential_match.append(
                                (ann_idx, j, overlap, len_offset, cen_offset,
                                 ann['sentence_idx']))
                            pos_collected[j] = True

            filled = False
            for item in potential_match:
                if item[0] not in pos_seg:
                    filled = True
                    pos_seg[item[0]].append(tuple(item[1:]))
                    break

            if not filled and len(potential_match) > 0:
                # randomly choose one
                shuffle(potential_match)
                item = potential_match[0]
                pos_seg[item[0]].append(tuple(item[1:]))

        missing_prop = 0
        if len(pos_seg.keys()) != len(annotations):
            print('Some annotations in video {} do not have '
                  'any matching proposal'.format(video_prefix))
            missing_prop = len(annotations) - len(pos_seg.keys())

        neg_seg = []
        for oi, overlap in enumerate(neg_overlap):
            if overlap < neg_thresh:
                neg_seg.append((oi, overlap))

        npos_seg = 0
        for k in pos_seg:
            npos_seg += len(pos_seg[k])

        print('pos anc: {}, neg anc: {}'.format(npos_seg, len(neg_seg)))

        return video_prefix, total_frame, pos_seg, neg_seg, missing_prop
    else:
        return None
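# Note: anc_len_all / anc_cen_all are passed in pre-built. Judging from how the
# inference method below enumerates anchor centers per kernel size, a plausible
# (illustrative, not from the source) way to construct them is:
import math
import numpy as np

def build_anchors(kernel_list, slide_window_size, stride_factor):
    # One anchor per sliding position of each kernel size; centers follow the
    # same arange(kernel/2, T+1-kernel/2, ceil(kernel/stride_factor)) pattern
    # used in the inference code.
    anc_len_lst, anc_cen_lst = [], []
    for kernel_size in kernel_list:
        centers = np.arange(float(kernel_size) / 2.0,
                            float(slide_window_size + 1 - kernel_size / 2.0),
                            math.ceil(kernel_size / stride_factor))
        anc_cen_lst.append(centers)
        anc_len_lst.append(np.full(len(centers), float(kernel_size)))
    return np.concatenate(anc_len_lst), np.concatenate(anc_cen_lst)

# e.g. (values are made up): anc_len_all, anc_cen_all = build_anchors([1, 2, 4, 8], 480, 50)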
    def inference(self, x, actual_frame_length, sampling_sec,
                  min_prop_num, max_prop_num,
                  min_prop_num_before_nms, pos_thresh, stride_factor,
                  gated_mask=False):
        B, T, _ = x.size()
        dtype = x.data.type()

        x_rgb, x_flow = torch.split(x, 2048, 2)
        x_rgb = self.rgb_emb(x_rgb.contiguous())
        x_flow = self.flow_emb(x_flow.contiguous())

        x = torch.cat((x_rgb, x_flow), 2)

        x = self.emb_out(x)

        vis_feat, all_emb = self.vis_emb(x)
        # vis_feat = self.vis_dropout(vis_feat)

        # B x T x H -> B x H x T
        # for 1d conv
        vis_feat = vis_feat.transpose(1,2).contiguous()

        prop_lst = []
        for i, kernel in enumerate(self.prop_out):

            kernel_size = self.kernel_list[i]
            # no need to use a larger kernel size in this case; batch size is only 1
            if kernel_size <= actual_frame_length[0]:
                pred_o = kernel(vis_feat)
                anchor_c = Variable(torch.FloatTensor(np.arange(
                    float(kernel_size)/2.0,
                    float(T+1-kernel_size/2.0),
                    math.ceil(kernel_size/stride_factor)
                )).type(dtype))
                if anchor_c.size(0) != pred_o.size(-1):
                    raise Exception("size mismatch!")

                anchor_c = anchor_c.expand(B, 1, anchor_c.size(0))
                anchor_l = Variable(torch.FloatTensor(anchor_c.size()).fill_(kernel_size).type(dtype))

                pred_final = torch.cat((pred_o, anchor_l, anchor_c), 1)
                prop_lst.append(pred_final)
            else:
                print('skipping kernel sizes greater than {}'.format(
                    self.kernel_list[i]))
                break

        prop_all = torch.cat(prop_lst, 2)

        # assume 1st and 2nd are action prediction and overlap, respectively
        prop_all[:,:2,:] = F.sigmoid(prop_all[:,:2,:])


        pred_len = prop_all[:, 4, :] * torch.exp(prop_all[:, 2, :])
        pred_cen = prop_all[:, 5, :] + prop_all[:, 4, :] * prop_all[:, 3, :]

        nms_thresh_set = np.arange(0.9, 0.95, 0.05).tolist()  # effectively a single threshold, 0.9
        all_proposal_results = []

        # store positional encodings, size of B x 4,
        # the first B values are predicted starts,
        # second B values are predicted ends,
        # third B values are anchor starts,
        # last B values are anchor ends
        pred_start_lst = [] #torch.zeros(B * 4).type(dtype)
        pred_end_lst = []
        anchor_start_lst = []
        anchor_end_lst = []
        anchor_window_mask = [] #Variable(torch.zeros(B, T).type(dtype))
        gate_scores = [] #Variable(torch.zeros(B, 1).type(dtype))

        for b in range(B):
            crt_pred = prop_all.data[b]
            crt_pred_cen = pred_cen.data[b]
            crt_pred_len = pred_len.data[b]
            pred_masks = []
            batch_result = []
            crt_nproposal = 0
            nproposal = torch.sum(torch.gt(prop_all.data[b, 0, :], pos_thresh))
            nproposal = min(max(nproposal, min_prop_num_before_nms),
                            prop_all.size(-1))
            pred_results = np.empty((nproposal, 3))
            _, sel_idx = torch.topk(crt_pred[0], nproposal)
 
            start_t = time.time()
            for nms_thresh in nms_thresh_set:
                for prop_idx in range(nproposal):
                    # the clip might be truncated at the end, hence + sampling_sec * 2
                    original_frame_len = actual_frame_length[b].item() + sampling_sec * 2
                    pred_start_w = crt_pred_cen[sel_idx[prop_idx]] - crt_pred_len[sel_idx[prop_idx]] / 2.0
                    pred_end_w = crt_pred_cen[sel_idx[prop_idx]] + crt_pred_len[sel_idx[prop_idx]] / 2.0
                    pred_start = pred_start_w
                    pred_end = pred_end_w
                    if pred_start >= pred_end:
                        continue
                    if pred_end >= original_frame_len or pred_start < 0:
                        continue

                    hasoverlap = False
                    if crt_nproposal > 0:
                        if np.max(segment_iou(np.array([pred_start, pred_end]), pred_results[:crt_nproposal])) > nms_thresh:
                            hasoverlap = True

                    if not hasoverlap:
                        pred_bin_window_mask = torch.zeros(1, T, 1).type(dtype)
                        win_start = math.floor(max(min(pred_start, min(original_frame_len, T)-1), 0))
                        win_end = math.ceil(max(min(pred_end, min(original_frame_len, T)), 1))
                        # if win_start >= win_end:
                        #     print('length: {}, mask window start: {} >= window end: {}, skipping'.format(
                        #         original_frame_len, win_start, win_end,
                        #     ))
                        #     continue

                        pred_bin_window_mask[:, win_start:win_end] = 1
                        pred_masks.append(pred_bin_window_mask)

                        if self.learn_mask:
                            # 4, 5 are the indices for anchor length and center
                            anc_len = crt_pred[4, sel_idx[prop_idx]]
                            anc_cen = crt_pred[5, sel_idx[prop_idx]]
                            # only use the pos sample to train, could potentially use more sample for training mask, but this is easier to do
                            amask = torch.zeros(1, T).type(dtype)
                            amask[0, max(0, math.floor(anc_cen - anc_len / 2.)):
                                     min(T, math.ceil(anc_cen + anc_len / 2.))] = 1.
                            anchor_window_mask.append(amask)

                            pred_start_lst.append(torch.Tensor([pred_start_w]).type(dtype))
                            pred_end_lst.append(torch.Tensor([pred_end_w]).type(dtype))
                            anchor_start_lst.append(torch.Tensor(
                                [max(0, math.floor(anc_cen - anc_len / 2.))]).type(dtype))
                            anchor_end_lst.append(torch.Tensor(
                                [min(T, math.ceil(anc_cen + anc_len / 2.))]).type(dtype))

                            gate_scores.append(torch.Tensor([crt_pred[0, sel_idx[prop_idx]]]).type(dtype))

                        pred_results[crt_nproposal] = np.array([win_start,
                                                                win_end,
                                                                crt_pred[0, sel_idx[prop_idx]]])
                        crt_nproposal += 1

                    if crt_nproposal >= max_prop_num:
                        break

                if crt_nproposal >= min_prop_num:
                    break

            mid1_t = time.time()

            if len(pred_masks) == 0:  # fall back to an all-one window if nothing was proposed
                pred_masks.append(torch.ones(1, T, 1).type(dtype))
                # pred_results is a numpy array, so assign into row 0 rather than append
                pred_results[0] = np.array([0, min(original_frame_len, T), pos_thresh])
                crt_nproposal = 1

            pred_masks = Variable(torch.cat(pred_masks, 0))
            batch_x = x[b].unsqueeze(0).expand(pred_masks.size(0), x.size(1), x.size(2))

            if self.learn_mask:
                pe_pred_start = torch.cat(pred_start_lst, 0)
                pe_pred_end = torch.cat(pred_end_lst, 0)
                pe_anchor_start = torch.cat(anchor_start_lst, 0)
                pe_anchor_end = torch.cat(anchor_end_lst, 0)

                pe_locs = torch.cat((pe_pred_start, pe_pred_end, pe_anchor_start, pe_anchor_end), 0)
                pos_encs = positional_encodings(pe_locs, self.d_model // 4)
                npos = pos_encs.size(0)
                anchor_window_mask = Variable(torch.cat(anchor_window_mask, 0))
                in_pred_mask = torch.cat((pos_encs[:npos//4], pos_encs[npos//4:npos//4*2],
                                          pos_encs[npos//4 * 2:npos//4 * 3],
                                          pos_encs[npos//4 * 3:npos//4 * 4],
                                          anchor_window_mask), 1)
                pred_cont_masks  = self.mask_model(in_pred_mask).unsqueeze(2)

                if gated_mask:
                    gate_scores = Variable(torch.cat(gate_scores, 0).view(-1,1,1))
                    window_mask = (gate_scores * pred_masks
                                   + (1 - gate_scores) * pred_cont_masks)

                else:
                    window_mask = pred_cont_masks
            else:
                window_mask = pred_masks

            mid2_t = time.time()

            pred_sentence = []
            # use cap_batch as caption batch size
            cap_batch = math.ceil(480*256/T)
            for sent_i in range(math.ceil(window_mask.size(0)/cap_batch)):
                batch_start = sent_i*cap_batch
                batch_end = min((sent_i+1)*cap_batch, window_mask.size(0))
                pred_sentence += self.cap_model.greedy(batch_x[batch_start:batch_end],
                                                       window_mask[batch_start:batch_end], 20)

            pred_results = pred_results[:crt_nproposal]
            assert len(pred_sentence) == crt_nproposal, (
                "number of predicted sentences and proposals does not match"
            )

            for idx in range(len(pred_results)):
                batch_result.append((pred_results[idx][0],
                                     pred_results[idx][1],
                                     pred_results[idx][2],
                                     pred_sentence[idx]))
            all_proposal_results.append(tuple(batch_result))

            end_t = time.time()
            print('Processing time for tIoU: {:.2f}, mask: {:.2f}, caption: {:.2f}'.format(mid1_t-start_t, mid2_t-mid1_t, end_t-mid2_t))

        return all_proposal_results
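# Sanity check (made-up numbers): the offsets built in _get_pos_neg
# (len_offset = log(gt_len / anc_len), cen_offset = (gt_cen - anc_cen) / anc_len)
# are decoded above as pred_len = anc_len * exp(len_offset) and
# pred_cen = anc_cen + anc_len * cen_offset, so the decode inverts the encode.
import math

gt_start, gt_end = 12.0, 40.0      # hypothetical ground-truth segment
anc_cen, anc_len = 24.0, 32.0      # hypothetical anchor

len_offset = math.log((gt_end - gt_start) / anc_len)
cen_offset = ((gt_end + gt_start) / 2. - anc_cen) / anc_len

pred_len = anc_len * math.exp(len_offset)
pred_cen = anc_cen + anc_len * cen_offset

assert abs(pred_len - (gt_end - gt_start)) < 1e-9
assert abs(pred_cen - (gt_end + gt_start) / 2.) < 1e-9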
Example #4
def _get_pos_neg(split_path, annotations, vid,
                 slide_window_size, sampling_sec, anc_len_all,
                 anc_cen_all, pos_thresh, neg_thresh):
 
    if os.path.isfile(os.path.join(split_path, vid + '_bn.npy')):
        print('video: {}'.format(vid))

        video_prefix = os.path.join(split_path, vid)

        
        # (T,2048)  RGB
        resnet_feat = torch.from_numpy(np.load(video_prefix + '_resnet.npy')).float()
        
        # (T,1024) FLOW
        bn_feat = torch.from_numpy(np.load(video_prefix + '_bn.npy')).float()  

        if resnet_feat.size(0) != bn_feat.size(0):
            raise Exception('number of frames does not match in feature!')
          
        total_frame = bn_feat.size(0) 

       
        # not quite sure why this is multiplied by sampling_sec
        window_start = 0
        window_end = slide_window_size
        window_start_t = window_start * sampling_sec   # this effectively scales the window down
        window_end_t = window_end * sampling_sec
        
        
        pos_seg = defaultdict(list)  # missing keys default to an empty list
        neg_overlap = [0] * anc_len_all.shape[0]        # 6338
        pos_collected = [False] * anc_len_all.shape[0]  # 6338  
        
        # iterate over all predefined anchors
        for j in range(anc_len_all.shape[0]):
            potential_match = []
            for ann_idx, ann in enumerate(annotations):
                seg = ann['segment']
               
                # scale the GT range up to sampled-frame units; the anchors are defined
                # on the sampled timeline, so this keeps the two comparable
                gt_start = seg[0] / sampling_sec
                gt_end = seg[1] / sampling_sec
                if gt_start > gt_end:
                    gt_start, gt_end = gt_end, gt_start
                  
                # the anchor must stay within total_frame and the GT must lie inside the window
                if anc_cen_all[j] + anc_len_all[j] / 2. <= total_frame:
                    if window_start_t <= seg[0] and window_end_t + sampling_sec * 2 >= seg[1]:
                        overlap = segment_iou(np.array([gt_start, gt_end]), np.array([[
                            anc_cen_all[j] - anc_len_all[j] / 2.,
                            anc_cen_all[j] + anc_len_all[j] / 2.]]))

                        neg_overlap[j] = max(overlap, neg_overlap[j])

                        if not pos_collected[j] and overlap >= pos_thresh:
                            len_offset = math.log(
                                (gt_end - gt_start) / anc_len_all[j])
                            cen_offset = ((gt_end + gt_start) / 2. -
                                          anc_cen_all[j]) / anc_len_all[j]
                         
                          
                            potential_match.append(
                                (ann_idx, j, overlap, len_offset, cen_offset,
                                 ann['sentence_idx']))
                            pos_collected[j] = True

            # store the collected positive samples in pos_seg
            filled = False
            for item in potential_match:
                if item[0] not in pos_seg:
                    filled = True
                    pos_seg[item[0]].append(tuple(item[1:]))  # {'ann_idx': ( j, overlap, len_offset, cen_offset, ann['sentence_idx'])}
                    break

            if not filled and len(potential_match)>0:
                # randomly choose one
                shuffle(potential_match)
                item = potential_match[0]
                pos_seg[item[0]].append(tuple(item[1:]))

        # some GT segments have no matching anchor at all
        missing_prop = 0
        if len(pos_seg.keys()) != len(annotations):
            print('Some annotations in video {} do not have '
                  'any matching proposal'.format(video_prefix))
            missing_prop = len(annotations) - len(pos_seg.keys())

        neg_seg = []
        for oi, overlap in enumerate(neg_overlap):
            if overlap < neg_thresh:
                neg_seg.append((oi, overlap))

        npos_seg = 0
        for k in pos_seg:
            npos_seg += len(pos_seg[k])

        print('pos anc: {}, neg anc: {}'.format(npos_seg, len(neg_seg)))

        return video_prefix, total_frame, pos_seg, neg_seg, missing_prop  
    else:
        return None
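# A hedged sketch of how _get_pos_neg might be driven over a dataset split;
# everything here (collect_pos_neg_for_split, annotations_by_vid, vid_list) is
# illustrative and not taken from the source. The loop is sequential, in line
# with the earlier note that multiprocessing caused problems with torch tensors.
def collect_pos_neg_for_split(split_path, annotations_by_vid, vid_list,
                              slide_window_size, sampling_sec,
                              anc_len_all, anc_cen_all, pos_thresh, neg_thresh):
    results = {}
    for vid in vid_list:
        out = _get_pos_neg(split_path, annotations_by_vid[vid], vid,
                           slide_window_size, sampling_sec,
                           anc_len_all, anc_cen_all, pos_thresh, neg_thresh)
        if out is not None:  # None means the feature file was missing
            video_prefix, total_frame, pos_seg, neg_seg, missing_prop = out
            results[vid] = (total_frame, pos_seg, neg_seg, missing_prop)
    return results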