def detect_instance(score_map, mask, class_id, max_fragment_size=0):
    # Convert pixel-wise instance masks into detection-style outputs:
    # one (score, class id, binary mask) triplet per connected component.

    pred_score = []
    pred_label = []
    pred_mask = []

    for ag_score, ag_mask, ag_class in zip(score_map, mask, class_id):
        if np.sum(ag_mask) < 1:
            continue
        # Connected-component analysis: split the mask into individual
        # segments and drop channel 0 (background) of the one-hot map.
        segments = pyutils.to_one_hot(
            measure.label(ag_mask, connectivity=1, background=0))[1:]

        for seg_mask in segments:
            # Fragments smaller than max_fragment_size are kept but scored 0,
            # which effectively suppresses them downstream.
            if np.sum(seg_mask) < max_fragment_size:
                pred_score.append(0)
            else:
                pred_score.append(np.max(ag_score * seg_mask))
            pred_label.append(ag_class)
            pred_mask.append(seg_mask)

    return {
        'score': np.stack(pred_score, 0),
        'mask': np.stack(pred_mask, 0),
        'class': np.stack(pred_label, 0)
    }
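

def _example_detect_instance():
    # Illustrative sketch only (not called by the pipeline): shows the shapes
    # detect_instance expects, using toy inputs. All sizes, scores and class
    # ids below are arbitrary assumptions for demonstration.
    K, H, W = 2, 8, 8
    toy_scores = np.random.rand(K, H, W).astype(np.float32)
    toy_masks = np.zeros((K, H, W), dtype=np.int64)
    toy_masks[0, 1:4, 1:4] = 1      # one square blob in the first mask
    toy_masks[1, 5:8, 5:8] = 1      # one square blob in the second mask
    toy_classes = np.array([3, 7])  # one (arbitrary) class id per mask

    out = detect_instance(toy_scores, toy_masks, toy_classes,
                          max_fragment_size=4)
    # One detection per connected component: 'score', 'class' and 'mask' are
    # stacked along axis 0.
    return out['score'].shape, out['class'].shape, out['mask'].shape
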
def cluster_centroids(centroids, displacement, thres=2.5):
    # thres: displacement-strength threshold for grouping centroids
    # (see the supplementary material).

    # Per-pixel displacement magnitude; pixels with weak displacement are the
    # candidate centroid regions.
    dp_strength = np.sqrt(displacement[1]**2 + displacement[0]**2)
    height, width = dp_strength.shape

    weak_dp_region = dp_strength < thres

    # Connected components of the weak-displacement region define the clusters.
    dp_label = measure.label(weak_dp_region, connectivity=1, background=0)
    dp_label_1d = dp_label.reshape(-1)

    # Flatten each pixel's centroid coordinates (row, col) to a 1-D index and
    # look up which weak-displacement component that centroid falls into.
    centroids_1d = centroids[0] * width + centroids[1]

    clusters_1d = dp_label_1d[centroids_1d]

    # Remap the cluster ids to a contiguous range, then expand the cluster map
    # into a one-hot stack of per-cluster masks.
    cluster_map = imutils.compress_range(
        clusters_1d.reshape(height, width) + 1)

    return pyutils.to_one_hot(cluster_map)
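

def _example_cluster_centroids():
    # Illustrative sketch only (not called by the pipeline): a toy 4x4 field
    # where a band of strong displacement splits the weak-displacement region
    # in two, so the two centroids end up in different clusters. All values
    # below are arbitrary assumptions for demonstration.
    H, W = 4, 4
    displacement = np.zeros((2, H, W), dtype=np.float32)
    displacement[0, :, 2] = 5.0  # strong vertical displacement along one column

    rows = np.zeros((H, W), dtype=np.int64)
    cols = np.zeros((H, W), dtype=np.int64)
    rows[:, :2], cols[:, :2] = 1, 1  # left pixels converge to centroid (1, 1)
    rows[:, 2:], cols[:, 2:] = 2, 3  # right pixels converge to centroid (2, 3)
    centroids = np.stack([rows, cols], 0)

    instance_map = cluster_centroids(centroids, displacement)
    # instance_map stacks one binary (H, W) map per centroid cluster along
    # axis 0; here it has two channels.
    return instance_map.shape
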
def _work_gpu(process_id, model, dataset, args):
    # Per-GPU worker: predicts instance segmentation results for its shard of
    # the dataset and saves one .npy file per image to args.ins_seg_out_dir.
    n_gpus = torch.cuda.device_count()
    databin = dataset[process_id]
    data_loader = DataLoader(databin,
                             shuffle=False,
                             num_workers=args.num_workers // n_gpus,
                             pin_memory=False)

    with torch.no_grad(), cuda.device(process_id):

        model.cuda()

        for iter, pack in tqdm(enumerate(data_loader), total=len(databin)):
            img_name = pack['name'][0]
            path = os.path.join(args.ins_seg_out_dir, img_name + '.npy')
            # Skip images whose output file already exists (allows resuming).
            if not os.path.exists(path):
                os.makedirs(os.path.dirname(path), exist_ok=True)
                size = np.asarray(pack['size'])

                # Predict the boundary (edge) map and the displacement field.
                edge, dp = model(pack['img'][0].cuda(non_blocking=True))

                dp = dp.cpu().numpy()

                # Load the precomputed CAMs and their foreground class keys.
                cam_dict = np.load(args.cam_out_dir + '/' + img_name + '.npy',
                                   allow_pickle=True).item()

                cams = cam_dict['cam'].cuda()
                keys = cam_dict['keys']

                # Group pixels by the centroid their displacement converges to,
                # then split each CAM into per-instance score maps.
                centroids = find_centroids_with_refinement(dp)
                instance_map = cluster_centroids(centroids, dp)
                instance_cam = separte_score_by_mask(cams, instance_map)

                # Propagate the per-instance scores with a random walk that is
                # restricted by the predicted boundaries.
                rw = indexing.propagate_to_edge(instance_cam,
                                                edge,
                                                beta=args.beta,
                                                exp_times=args.exp_times,
                                                radius=5)

                # Upsample to the original image size and normalize to [0, 1].
                rw_up = F.interpolate(
                    rw, scale_factor=4, mode='bilinear',
                    align_corners=False)[:, 0, :size[0], :size[1]]
                rw_up = rw_up / torch.max(rw_up)

                # Prepend a constant background plane so that pixels scoring
                # below the threshold are assigned to the background by the
                # argmax below.
                rw_up_bg = F.pad(rw_up, (0, 0, 0, 0, 1, 0),
                                 value=args.ins_seg_bg_thres)

                num_classes = len(keys)
                num_instances = instance_map.shape[0]

                # Argmax over the (background + instance) planes yields a label
                # map; expand it back into one binary mask per instance/class
                # pair, dropping channel 0 (background).
                instance_shape = torch.argmax(rw_up_bg, 0).cpu().numpy()
                instance_shape = pyutils.to_one_hot(
                    instance_shape,
                    maximum_val=num_instances * num_classes + 1)[1:]
                instance_class_id = np.repeat(keys, num_instances)

                # Fragments smaller than 1% of the image area are suppressed.
                detected = detect_instance(rw_up.cpu().numpy(),
                                           instance_shape,
                                           instance_class_id,
                                           max_fragment_size=size[0] * size[1] * 0.01)

                np.save(path, detected)

                # Coarse progress indicator, printed by the last worker only at
                # roughly every quarter of its shard.
                if process_id == n_gpus - 1 and iter % (len(databin) //
                                                        4) == 0:
                    print("%d " % ((5 * iter + 1) // (len(databin) // 4)),
                          end='')
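

def _example_launch(model, dataset, args):
    # Illustrative sketch only (not called anywhere in this file): _work_gpu
    # indexes dataset[process_id], so the caller is expected to pass a dataset
    # that has already been split into one bin per GPU. mp.spawn then runs
    # _work_gpu(rank, model, dataset, args) once per visible GPU.
    import torch.multiprocessing as mp  # local import: used only by this sketch
    n_gpus = torch.cuda.device_count()
    mp.spawn(_work_gpu, nprocs=n_gpus, args=(model, dataset, args), join=True)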