Example #1
    def initialize(self, image_file, box):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)  # align with training

        cx, cy, w, h = get_axis_aligned_bbox(box)
        target_pos = np.array([cx, cy])
        target_sz = np.array([w, h])

        self.state = self.siam_tracker.init(
            im, target_pos, target_sz, self.siam_net
        )  # init tracker
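
These snippets all lean on a pair of box-conversion helpers. Below is a minimal sketch of what they are assumed to do (the exact bodies vary between repos; the area-rescaling branch follows the common VOT convention for rotated boxes):

import numpy as np

def get_axis_aligned_bbox(region):
    """Convert an 8-value polygon or an [x, y, w, h] rect to (cx, cy, w, h)."""
    region = np.asarray(region, dtype=np.float64).flatten()
    if region.size == 8:  # rotated box given as x1,y1,...,x4,y4
        xs, ys = region[0::2], region[1::2]
        cx, cy = xs.mean(), ys.mean()
        # scale the bounding rect so its area matches the rotated box
        area_poly = (np.linalg.norm(region[0:2] - region[2:4]) *
                     np.linalg.norm(region[2:4] - region[4:6]))
        area_rect = (xs.max() - xs.min()) * (ys.max() - ys.min())
        s = np.sqrt(area_poly / area_rect)
        w = s * (xs.max() - xs.min()) + 1
        h = s * (ys.max() - ys.min()) + 1
    else:  # already [x, y, w, h]
        x, y, w, h = region
        cx, cy = x + w / 2, y + h / 2
    return cx, cy, w, h

def cxy_wh_2_rect(pos, sz):
    """(cx, cy), (w, h) -> 0-indexed [x, y, w, h]."""
    return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])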
Example #2
def track_tune(tracker, net, video, config):
    arch = config['arch']
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config['hp']  # scale_step, scale_penalty, w_influence, scale_lr

    tracker_path = join('test', (benchmark_name + resume.split('/')[-1].split('.')[0] +
                                 '_step_{:.4f}'.format(hp['scale_step']) +
                                 '_penalty_s_{:.4f}'.format(hp['scale_penalty']) +
                                 '_w_influence_{:.4f}'.format(hp['w_influence']) +
                                 '_scale_lr_{:.4f}'.format(hp['scale_lr'])).replace('.', '_'))  # avoid '.' in the path

    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    # create an empty result file so parallel workers skip this video
    if not os.path.exists(result_path):
        fin = open(result_path, 'w')
        fin.close()
    else:
        return tracker_path

    start_frame, lost_times, toc = 0, 0, 0

    regions = []  # results and per-frame states (1: init, 2: lost, 0: skip)
    image_files, gt = video['image_files'], video['gt']
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net, hp=hp)  # init tracker
            regions.append([float(1)] if 'VOT' in benchmark_name else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(location)

    with open(result_path, "w") as fin:
        for x in regions:
            p_bbox = x.copy()
            fin.write(
                ','.join([str(i + 1) if idx == 0 or idx == 1 else str(i) for idx, i in enumerate(p_bbox)]) + '\n')

    return tracker_path
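
The writer above shifts x and y by +1 because OTB-style annotations are 1-indexed (MATLAB convention), while the tracker works in 0-indexed image coordinates. A toy round trip of that convention (illustration only):

# a 0-indexed box produced by the tracker
location = [10.0, 20.0, 50.0, 40.0]  # x, y, w, h

# written as 1-indexed for OTB-style evaluation
line = ','.join(str(v + 1) if i in (0, 1) else str(v) for i, v in enumerate(location))
assert line == '11.0,21.0,50.0,40.0'

# a reader converts back by subtracting 1 from x and y
x, y, w, h = map(float, line.split(','))
assert [x - 1, y - 1, w, h] == location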
Example #3
def track(tracker, net, video, args):
    start_frame, lost_times, toc = 0, 0, 0

    # save result to evaluate
    if args.epoch_test:
        suffix = args.resume.split('/')[-1]
        suffix = suffix.split('.')[0]
        tracker_path = os.path.join('result', args.dataset, args.arch + suffix)
    else:
        tracker_path = os.path.join('result', args.dataset, args.arch)

    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in args.dataset:
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = join(video_path, video['name'] + '_001.txt')
    else:
        result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    if os.path.exists(result_path):
        return 0  # results already exist; skip (for multi-GPU testing)

    regions = []  # results and per-frame states (1: init, 2: lost, 0: skip)
    image_files, gt = video['image_files'], video['gt']
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)

        tic = cv2.getTickCount()

        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz,
                                 net)  # init tracker
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f],
                                 location) if 'VOT' in args.dataset else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append(2)
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic
    toc /= cv2.getTickFrequency()

    with open(result_path, "w") as fin:
        if 'VOT' in args.dataset:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')
        else:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([
                    str(i + 1) if idx == 0 or idx == 1 else str(i)
                    for idx, i in enumerate(p_bbox)
                ]) + '\n')

    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
        video['name'], toc, f / toc, lost_times))

    return lost_times
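
poly_iou drives the lost/re-init decision above. Here is a minimal stand-in, assuming both regions are reduced to axis-aligned boxes before computing IoU (the repo's real helper may intersect polygons exactly, so treat this as a sketch):

import numpy as np

def poly_iou(region1, region2):
    """IoU of two regions, each an [x, y, w, h] rect or an 8-value polygon."""
    def to_xyxy(r):
        r = np.asarray(r, dtype=np.float64).flatten()
        if r.size == 8:  # polygon -> axis-aligned bounding rect
            xs, ys = r[0::2], r[1::2]
            return xs.min(), ys.min(), xs.max(), ys.max()
        x, y, w, h = r
        return x, y, x + w, y + h

    ax1, ay1, ax2, ay2 = to_xyxy(region1)
    bx1, by1, bx2, by2 = to_xyxy(region2)
    iw = max(0.0, min(ax2, bx2) - max(ax1, bx1))
    ih = max(0.0, min(ay2, by2) - max(ay1, by1))
    inter = iw * ih
    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    return inter / union if union > 0 else 0.0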
Example #4
def track_tune(tracker, net, video, config):
    arch = config['arch']
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config['hp']  # small_sz, big_sz, penalty_k, window_influence, lr (adaptive size for VOT2017 and later)

    tracker_path = join('test',
                        (benchmark_name + resume.split('/')[-1].split('.')[0] +
                         '_small_size_{:.4f}'.format(hp['small_sz']) +
                         '_big_size_{:.4f}'.format(hp['big_sz']) +
                         '_penalty_k_{:.4f}'.format(hp['penalty_k']) +
                         '_w_influence_{:.4f}'.format(hp['window_influence']) +
                         '_scale_lr_{:.4f}'.format(hp['lr'])).replace(
                             '.', '_'))  # avoid '.' in the path

    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in benchmark_name:
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = join(video_path, video['name'] + '_001.txt')
    else:
        raise ValueError('Only VOT is supported')

    # create an empty result file so parallel workers skip this video
    if not os.path.exists(result_path):
        fin = open(result_path, 'w')
        fin.close()
    else:
        if benchmark_name.startswith('VOT'):
            return 0
        else:
            raise ValueError('Only VOT is supported')

    start_frame, lost_times, toc = 0, 0, 0
    regions = []  # results and per-frame states (1: init, 2: lost, 0: skip)
    image_files, gt = video['image_files'], video['gt']
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net,
                                 hp=hp)  # init tracker
            regions.append([float(1)] if 'VOT' in benchmark_name else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f],
                                 location) if 'VOT' in benchmark_name else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append([float(2)])
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append([float(0)])

    # return the raw per-frame regions; the tuning caller evaluates them directly
    if benchmark_name.startswith('VOT'):
        return regions
    else:
        raise ValueError('Only VOT is supported')
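
For concreteness: with resume='snapshot/checkpoint_e50.pth' and hp={'small_sz': 255, 'big_sz': 287, 'penalty_k': 0.021, 'window_influence': 0.35, 'lr': 0.45} (placeholder values), the path expression above yields test/VOT2018checkpoint_e50_small_size_255_0000_big_size_287_0000_penalty_k_0_0210_w_influence_0_3500_scale_lr_0_4500. Encoding every tuned hyperparameter in the directory name keeps runs distinguishable, and the empty-file check lets parallel tuning workers claim videos without a lock.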
Example #5
def track_tune(tracker, net, video, config):
    arch = config['arch']
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config['hp']  # scale_step, scale_penalty, w_influence, scale_lr

    tracker_path = join('test',
                        (benchmark_name + resume.split('/')[-1].split('.')[0] +
                         '_step_{:.4f}'.format(hp['scale_step']) +
                         '_penalty_s_{:.4f}'.format(hp['scale_penalty']) +
                         '_w_influence_{:.4f}'.format(hp['w_influence']) +
                         '_scale_lr_{:.4f}'.format(hp['scale_lr'])).replace(
                             '.', '_'))  # avoid '.' in the path

    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in benchmark_name:
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = join(video_path, video['name'] + '_001.txt')
    elif 'GOT10K' in benchmark_name:
        re_video_path = os.path.join(tracker_path, video['name'])
        if not exists(re_video_path):
            os.makedirs(re_video_path)
        result_path = os.path.join(re_video_path,
                                   '{:s}.txt'.format(video['name']))
    else:
        result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    # create an empty result file so parallel workers skip this video
    if not os.path.exists(result_path):
        fin = open(result_path, 'w')
        fin.close()
    else:
        if benchmark_name.startswith('OTB'):
            return tracker_path
        elif benchmark_name.startswith('VOT') or benchmark_name.startswith(
                'GOT10K'):
            return 0
        else:
            print('benchmark not supported yet')
            return

    start_frame, lost_times, toc = 0, 0, 0

    regions = []  # results and per-frame states (1: init, 2: lost, 0: skip)
    image_files, gt = video['image_files'], video['gt']
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net,
                                 hp=hp)  # init tracker
            regions.append([float(1)] if 'VOT' in benchmark_name else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f],
                                 location) if 'VOT' in benchmark_name else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append([float(2)])
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append([float(0)])

    # save results
    if 'OTB' in benchmark_name or 'LASOT' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([
                    str(i + 1) if idx == 0 or idx == 1 else str(i)
                    for idx, i in enumerate(p_bbox)
                ]) + '\n')
    elif 'VISDRONE' in benchmark_name or 'GOT10K' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([str(i) for i in p_bbox]) + '\n')
    elif 'VOT' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')

    if 'OTB' in benchmark_name or 'VIS' in benchmark_name or 'VOT' in benchmark_name or 'GOT10K' in benchmark_name:
        return tracker_path
    else:
        print('benchmark not supported yet')
Example #6
def track(siam_tracker, online_tracker, siam_net, video, args):
    start_frame, toc = 0, 0

    # save result to evaluate
    if args.epoch_test:
        suffix = args.resume.split('/')[-1]
        suffix = suffix.split('.')[0]
        tracker_path = os.path.join('result', args.dataset, args.arch + suffix)
    else:
        tracker_path = os.path.join('result', args.dataset, args.arch)

    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in args.dataset:
        baseline_path = os.path.join(tracker_path, 'baseline')
        video_path = os.path.join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = os.path.join(video_path, video['name'] + '_001.txt')
    else:
        result_path = os.path.join(tracker_path,
                                   '{:s}.txt'.format(video['name']))

    if os.path.exists(result_path):
        return  # results already exist; skip (for multi-GPU testing)

    regions = []
    lost = 0

    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):

        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)  # align with training
        rgb_im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)  # the online tracker consumes RGB

        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])

            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])

            state = siam_tracker.init(im, target_pos, target_sz,
                                      siam_net)  # init tracker

            if args.online:
                online_tracker.init(im,
                                    rgb_im,
                                    siam_net,
                                    target_pos,
                                    target_sz,
                                    True,
                                    dataname=args.dataset,
                                    resume=args.resume)

            # location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking
            if args.online:
                state = online_tracker.track(im, rgb_im, siam_tracker, state)
            else:
                state = siam_tracker.track(state, im)

            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f],
                                 location) if 'VOT' in args.dataset else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append(2)
                start_frame = f + 5
                lost += 1
        else:
            regions.append(0)

        toc += cv2.getTickCount() - tic

    with open(result_path, "w") as fin:
        if 'VOT' in args.dataset:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')
        elif 'OTB' in args.dataset or 'LASOT' in args.dataset:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([
                    str(i + 1) if idx == 0 or idx == 1 else str(i)
                    for idx, i in enumerate(p_bbox)
                ]) + '\n')
        elif 'VISDRONE' in args.dataset or 'GOT10K' in args.dataset or 'TN' in args.dataset:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([str(i) for i in p_bbox]) + '\n')

    toc /= cv2.getTickFrequency()
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps  Lost {}'.format(
        video['name'], toc, f / toc, lost))
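
A hypothetical driver for the function above, with argument names inferred from how args is used inside it (all values and builder calls are placeholders, not the repo's actual API):

from argparse import Namespace

# placeholder configuration; field names mirror the attributes read by track()
args = Namespace(arch='Ocean', dataset='VOT2019', resume='snapshot/checkpoint_e50.pth',
                 epoch_test=False, online=False, vis=False)

# siam_tracker, online_tracker, siam_net and the per-video dicts would come
# from the repo's own builders/loaders, e.g.:
# for video in videos:
#     track(siam_tracker, online_tracker, siam_net, video, args)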
Example #7
def main():
    net = models.__dict__[args.arch](anchors_nums=args.anchor_nums,
                                     cls_type=args.cls_type)
    net = load_pretrain(net, args.resume)
    net.eval()
    net = net.cuda()

    # prepare tracker
    info = edict()
    info.arch = args.arch
    info.cls_type = args.cls_type
    info.dataset = args.dataset
    info.epoch_test = args.epoch_test
    tracker = SiamRPN(info)

    dataset_root = os.path.join("/ssd", args.dataset)
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    model_name = args.resume.split('/')[-1].split('.')[0]
    total_lost = 0
    """
    eao will lower than origin version(0.393->0.390) due to the  
    number of digits after the decimal point
    """
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one specific video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                # if len(gt_bbox) == 4:
                #     gt_bbox = [gt_bbox[0], gt_bbox[1],
                #        gt_bbox[0], gt_bbox[1]+gt_bbox[3]-1,
                #        gt_bbox[0]+gt_bbox[2]-1, gt_bbox[1]+gt_bbox[3]-1,
                #        gt_bbox[0]+gt_bbox[2]-1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    #gt_bbox_ = [cx-(w-1)/2, cy-(h-1)/2, w, h]

                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz,
                                         net)  # init tracker
                    state["arch"] = args.arch
                    #tracker.init(img, gt_bbox_)
                    #pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    state = tracker.track(state, img)  # track
                    location = cxy_wh_2_rect(state['target_pos'],
                                             state['target_sz'])

                    #outputs = tracker.track(img)
                    pred_bbox = location
                    #overlap=poly_iou(gt_bbox,location)
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(
                        img, [np.array(gt_bbox, int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, int).reshape(
                                (-1, 1, 2))], True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one specific video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]

                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz,
                                         net)  # init tracker
                    state["arch"] = args.arch
                    #tracker.init(img, gt_bbox_)

                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    state = tracker.track(state, img)  # track
                    location = cxy_wh_2_rect(state['target_pos'],
                                             state['target_sz'])

                    pred_bbox = location

                    #outputs = tracker.track(img)
                    #pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(state['score'])
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(
                        img, (gt_bbox[0], gt_bbox[1]),
                        (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                        (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2],
                                   pred_bbox[1] + pred_bbox[3]), (0, 255, 255),
                                  3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        if x is None:
                            f.write('\n')
                        else:
                            f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('result', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
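
vot_float2str controls the result-file precision that the docstring blames for the small EAO drop (0.393 -> 0.390). A minimal stand-in, assuming it simply applies the printf-style template (the VOT toolkit's version may differ in how it trims digits):

def vot_float2str(template, value):
    """e.g. vot_float2str('%.4f', 1.23456) -> '1.2346'"""
    return template % float(value)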
Example #8
def main():
    # load config
    cfg_from_file(args.config)

    dataset_root = os.path.join('dataset', args.dataset)

    # create model
    net = ModelBuilder()
    checkpoint = torch.load(args.model)
    if 'state_dict' in checkpoint:
        net.load_state_dict(checkpoint['state_dict'])
    else:
        net.load_state_dict(checkpoint)
    net.cuda().eval()
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)

    model_name = args.save_name
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one specific video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    target_pos, target_sz = np.array([cx, cy]), np.array([w, h])
                    state = CGACD_init(img, target_pos, target_sz, net)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    state = CGACD_track(state, img)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    pred_polygon = [
                        pred_bbox[0], pred_bbox[1],
                        pred_bbox[0] + pred_bbox[2], pred_bbox[1],
                        pred_bbox[0] + pred_bbox[2],
                        pred_bbox[1] + pred_bbox[3], pred_bbox[0],
                        pred_bbox[1] + pred_bbox[3]
                    ]
                    overlap = vot_overlap(gt_bbox, pred_polygon,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    target_pos = state['target_pos']
                    target_sz = state['target_sz']
                    cv2.rectangle(img, (int(target_pos[0] - target_sz[0] / 2),
                                        int(target_pos[1] - target_sz[1] / 2)),
                                  (int(target_pos[0] + target_sz[0] / 2),
                                   int(target_pos[1] + target_sz[1] / 2)),
                                  (0, 255, 0), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2,
                                cv2.LINE_AA)
                    cv2.imshow(video.name, img)
                    cv2.moveWindow(video.name, 100, 100)
                    key = cv2.waitKey(1)
                    if key == 27:
                        break
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('result', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one specific video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    if 'OTB' in args.dataset:
                        target_pos, target_sz = rect1_2_cxy_wh(gt_bbox)
                    else:
                        cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                        target_pos, target_sz = np.array([cx, cy]), np.array([w, h])
                    state = CGACD_init(img, target_pos, target_sz, net)
                    if 'OTB' in args.dataset:
                        pred_bbox = cxy_wh_2_rect1(state['target_pos'],
                                                   state['target_sz'])
                    else:
                        pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                                  state['target_sz'])
                    pred_bboxes.append(pred_bbox)
                else:
                    state = CGACD_track(state, img)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    pred_bboxes.append(pred_bbox)
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    target_pos = state['target_pos']
                    target_sz = state['target_sz']
                    cv2.rectangle(img, (int(target_pos[0] - target_sz[0] / 2),
                                        int(target_pos[1] - target_sz[1] / 2)),
                                  (int(target_pos[0] + target_sz[0] / 2),
                                   int(target_pos[1] + target_sz[1] / 2)),
                                  (0, 255, 0), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2,
                                cv2.LINE_AA)
                    cv2.imshow(video.name, img)
                    cv2.moveWindow(video.name, 100, 100)
                    key = cv2.waitKey(1)
                    if key == 27:
                        break
            toc /= cv2.getTickFrequency()
            if 'GOT-10k' == args.dataset:
                video_path = os.path.join('result', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('result', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
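
The OTB branch above switches to 1-indexed rect conversions. A sketch of the assumed helpers, mirroring cxy_wh_2_rect but shifted by one pixel (the bodies are an assumption based on the 0-/1-index naming):

import numpy as np

def rect1_2_cxy_wh(rect):
    """1-indexed [x, y, w, h] -> 0-indexed (cx, cy), (w, h)."""
    x, y, w, h = rect
    return np.array([x + w / 2 - 1, y + h / 2 - 1]), np.array([w, h])

def cxy_wh_2_rect1(pos, sz):
    """0-indexed (cx, cy), (w, h) -> 1-indexed [x, y, w, h]."""
    return np.array([pos[0] - sz[0] / 2 + 1, pos[1] - sz[1] / 2 + 1, sz[0], sz[1]])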
Example #9
def track_tune(tracker, net, video, config):
    arch = config['arch']
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config['hp']  # small_sz, big_sz, choose_thr, penalty_k, window_influence, lr

    tracker_path = join('test', (benchmark_name + resume.split('/')[-1].split('.')[0] +
                                     '_small_size_{:.4f}'.format(hp['small_sz']) +
                                     '_big_size_{:.4f}'.format(hp['big_sz']) +
                                     '_lambda_u_{:.4f}'.format(hp['choose_thr']) +
                                     '_lambda_s_{:.4f}'.format(hp['choose_thr']) +
                                     '_cyclic_thr_{:.4f}'.format(hp['choose_thr']) +
                                     '_choose_thr_{:.4f}'.format(hp['choose_thr']) +
                                     '_penalty_k_{:.4f}'.format(hp['penalty_k']) +
                                     '_w_influence_{:.4f}'.format(hp['window_influence']) +
                                     '_scale_lr_{:.4f}'.format(hp['lr'])).replace('.', '_'))  # avoid '.' in the path
    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in benchmark_name:
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = join(video_path, video['name'] + '_001.txt')
    elif 'GOT10K' in benchmark_name:
        re_video_path = os.path.join(tracker_path, video['name'])
        if not exists(re_video_path):
            os.makedirs(re_video_path)
        result_path = os.path.join(re_video_path, '{:s}.txt'.format(video['name']))
    else:
        result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    # create an empty result file so parallel workers skip this video
    if not os.path.exists(result_path):
        fin = open(result_path, 'w')
        fin.close()
    else:
        if benchmark_name.startswith('OTB'):
            return tracker_path
        elif benchmark_name.startswith('VOT') or benchmark_name.startswith('GOT10K'):
            return 0
        else:
            print('benchmark not supported yet')
            return

    start_frame, lost_times, toc = 0, 0, 0

    regions = []  # results and per-frame states (1: init, 2: lost, 0: skip)

    # for RGB-T split test

    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            mask_gt = None

            state = tracker.init(im, target_pos, target_sz, net, online=False, mask=mask_gt, debug=False, hp=hp)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append([float(1)] if 'VOT' in benchmark_name else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f], location) if 'VOT' in benchmark_name else 1

            polygon = state['polygon']
            if polygon is not None:
                polygon = [polygon[0][0], polygon[0][1], polygon[1][0], polygon[1][1], polygon[2][0], polygon[2][1],
                           polygon[3][0], polygon[3][1]]
                polygon = np.array(polygon)
            else:
                x1, y1, w, h = location
                x2, y2 = x1 + w, y1 + h
                polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])

            if poly_iou(np.array(location), np.array(polygon)) > state['choose_thr']:
                record = polygon
            else:
                x1, y1, w, h = location
                x2, y2 = x1 + w, y1 + h
                polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])
                record = polygon

            if 'VOT' not in benchmark_name:  # change polygon to [x, y, w, h]
                x1, y1, x2, y2 = record[0], record[1], record[4], record[5]
                record = np.array([x1, y1, x2 - x1 + 1, y2 - y1 + 1])

            if b_overlap > 0:
                regions.append(record)
            else:
                regions.append([float(2)])
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append([float(0)])

    # save results for OTB
    if 'OTB' in benchmark_name or 'LASOT' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                p_bbox = x.copy()
                fin.write(
                    ','.join([str(i + 1) if idx == 0 or idx == 1 else str(i) for idx, i in enumerate(p_bbox)]) + '\n')
    elif 'VISDRONE' in benchmark_name or 'GOT10K' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([str(i) for i in p_bbox]) + '\n')
    elif 'VOT' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')

    if 'OTB' in benchmark_name or 'VIS' in benchmark_name or 'VOT' in benchmark_name or 'GOT10K' in benchmark_name:
        return tracker_path
    else:
        print('benchmark not supported yet')
Example #10
def track_box(siam_tracker, online_tracker, siam_net, video, args):
    """
    track a benchmark with only box annoated
    attention: not for benchmark evaluation, just a demo
    """

    tracker_path = os.path.join('result', args.dataset, args.arch)

    if 'VOT' in args.dataset:
        baseline_path = os.path.join(tracker_path, 'baseline')
        video_path = os.path.join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = os.path.join(video_path, video['name'] + '_001.txt')
    else:
        result_path = os.path.join(tracker_path, '{:s}.txt'.format(video['name']))

    if os.path.exists(result_path):
        return  # results already exist; skip (for multi-GPU testing)

    regions = []
    b_overlaps, b_overlaps2, b_overlaps3 = [], [], []
    lost = 0
    start_frame, toc = 0, 0
    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if args.online:
            rgb_im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)  # align with training

        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            mask_gt = None

            state = siam_tracker.init(im, target_pos, target_sz, siam_net, online=args.online, mask=mask_gt, debug=args.debug)  # init siamese tracker

            if args.online:
                online_tracker.init(im, rgb_im, siam_net, target_pos, target_sz, True, dataname=args.dataset, resume=args.resume)

        elif f > start_frame:  # tracking
            if args.online:
                state = online_tracker.track(im, rgb_im, siam_tracker, state)
            else:
                state = siam_tracker.track(state, im, name=image_file)

            mask = state['mask']

            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f], location) if 'VOT' in args.dataset else 1
            polygon = state['polygon']

            if polygon is not None:
                polygon = [polygon[0][0], polygon[0][1], polygon[1][0], polygon[1][1], polygon[2][0], polygon[2][1], polygon[3][0], polygon[3][1]]
                polygon = np.array(polygon)
                # b_overlap2 = poly_iou(gt[f], polygon)
            else:
                x1, y1, w, h = location
                x2, y2 = x1 + w, y1 + h
                polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])

            if poly_iou(np.array(location), np.array(polygon)) > state['choose_thr']:
                record = polygon
                # b_overlaps3.append(b_overlap2)
            else:
                x1, y1, w, h = location
                x2, y2 = x1 + w, y1 + h
                polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])
                record = polygon
                # b_overlaps3.append(b_overlap)

            # print('b_overlap: {}, b_overlap2: {}'.format(b_overlap, b_overlap2))
            # b_overlaps.append(b_overlap)
            # b_overlaps2.append(b_overlap2)

            if 'VOT' not in args.dataset:  # change polygon to [x, y, w, h]
                x1, y1, x2, y2 = record[0], record[1], record[4], record[5]
                record = np.array([x1, y1, x2 - x1 + 1, y2 - y1 + 1])

            if b_overlap > 0:
                regions.append(record)
            else:
                regions.append(2)
                start_frame = f + 5
                lost += 1

            if args.vis:
                # map mask labels {0, 1} to colors (background stays black)
                COLORS = np.random.randint(128, 255, size=(1, 3), dtype="uint8")
                COLORS = np.vstack([[0, 0, 0], COLORS]).astype("uint8")
                mask = COLORS[mask]
                # alpha-blend the colored mask over the frame
                output = ((0.4 * im) + (0.6 * mask)).astype("uint8")
                cv2.imshow("mask", output)
                cv2.waitKey(1)

        toc += cv2.getTickCount() - tic

    # print('b_overlap: {}, b_overlap2: {}, b_overlap3: {}'.format(np.array(b_overlaps).mean(), np.array(b_overlaps2).mean(), np.array(b_overlaps3).mean()))

    with open(result_path, "w") as fin:
        if 'VOT' in args.dataset:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')
        elif 'OTB' in args.dataset or 'LASOT' in args.dataset:
            for x in regions:
                p_bbox = x.copy()
                fin.write(
                    ','.join([str(i + 1) if idx == 0 or idx == 1 else str(i) for idx, i in enumerate(p_bbox)]) + '\n')
        elif 'VISDRONE' in args.dataset or 'GOT10K' in args.dataset:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([str(i) for i in p_bbox]) + '\n')

    toc /= cv2.getTickFrequency()
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps'.format(video['name'], toc, f / toc))