Example #1
    def __init__(self,
                 image,
                 phrase,
                 model,
                 transform=None,
                 imsize=256,
                 max_query_len=128,
                 bert_model='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(bert_model,
                                                       do_lower_case=True)
        self.query_len = max_query_len
        self.phrase = phrase
        self.imsize = imsize
        self.model = model
        ## letterbox the image to a square of side imsize, remembering the
        ## scale ratio and padding offsets so predictions can be mapped back
        img, _, ratio, dw, dh = letterbox(image, None, self.imsize)
        if transform is not None:
            ## normalize / convert to tensor, then add a batch dimension
            self.img = transform(img).unsqueeze(0)
        else:
            ## fall back to the raw letterboxed image so self.img is always set
            self.img = img
        self.dw = dw
        self.dh = dh
        self.ratio = ratio
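
Example #1 prepares a single image/phrase pair for inference: it letterboxes the image, stores the padding offsets, and loads a BERT tokenizer. A minimal usage sketch, assuming a hypothetical class name (`InferenceSample`), standard ImageNet normalization, and a local image path; the wrapper only stores `model`, so a placeholder is passed here:

import numpy as np
from PIL import Image
from torchvision import transforms

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],  # ImageNet statistics
                         std=[0.229, 0.224, 0.225]),
])
image = np.array(Image.open('demo.jpg').convert('RGB'))  # hypothetical path
sample = InferenceSample(image, 'the dog on the left',   # hypothetical class name
                         model=None,                     # grounding model goes here
                         transform=transform)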
Example #2
    def __getitem__(self, idx):
        if self.testmode:
            img, phrase = self.pull_item(idx)
        else:
            img, phrase, bbox = self.pull_item(idx)

        phrase = phrase.lower()
        if self.augment:
            augment_flip, augment_hsv, augment_affine = True, True, True

        ## there seems to be a bug in the torchvision resize transform, so the
        ## resize is done separately in advance
        h, w = img.shape[0], img.shape[1]
        if self.augment:
            ## random horizontal flip
            if augment_flip and random.random() > 0.5:
                img = cv2.flip(img, 1)
                bbox[0], bbox[2] = w - bbox[2] - 1, w - bbox[0] - 1
                ## mirror 'left'/'right' in the phrase to match the flipped
                ## image; the sentinel token prevents double replacement
                phrase = phrase.replace('right', '*&^special^&*').replace(
                    'left', 'right').replace('*&^special^&*', 'left')
            ## random intensity, saturation change
            if augment_hsv:
                fraction = 0.50
                img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR),
                                       cv2.COLOR_BGR2HSV)
                S = img_hsv[:, :, 1].astype(np.float32)
                V = img_hsv[:, :, 2].astype(np.float32)
                ## scale saturation by a random factor in [0.5, 1.5]
                a = (random.random() * 2 - 1) * fraction + 1
                S *= a
                if a > 1:
                    np.clip(S, a_min=0, a_max=255, out=S)
                ## scale value (intensity) by an independent random factor
                a = (random.random() * 2 - 1) * fraction + 1
                V *= a
                if a > 1:
                    np.clip(V, a_min=0, a_max=255, out=V)

                img_hsv[:, :, 1] = S.astype(np.uint8)
                img_hsv[:, :, 2] = V.astype(np.uint8)
                img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR),
                                   cv2.COLOR_BGR2RGB)
            img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
            bbox[0], bbox[2] = bbox[0] * ratio + dw, bbox[2] * ratio + dw
            bbox[1], bbox[3] = bbox[1] * ratio + dh, bbox[3] * ratio + dh
            ## random affine transformation
            if augment_affine:
                img, _, bbox, M = random_affine(img, None, bbox, \
                    degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10))
        else:  ## should be inference, or specified training
            img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
            if not self.testmode:
                bbox[0], bbox[2] = bbox[0] * ratio + dw, bbox[2] * ratio + dw
                bbox[1], bbox[3] = bbox[1] * ratio + dh, bbox[3] * ratio + dh

        ## Norm, to tensor
        if self.transform is not None:
            img = self.transform(img)

        phrase = self.tokenize_phrase(phrase)
        if not self.testmode:
            ## draw the ground-truth box as a binary mask, then downsample it
            ## to strides 8/16/32 to supervise each level of a feature pyramid
            ## (img is now a C x H x W tensor; the letterboxed image is square,
            ## so size(1) and size(2) are interchangeable here)
            object_map = Image.new("L", (img.size(1), img.size(2)))
            object_map_ = ImageDraw.Draw(object_map)
            bbox_ = list(map(int, list(bbox)))
            object_map_.rectangle(bbox_, fill="white")
            object_map_8 = (np.array(
                object_map.resize((img.size(1) // 8, img.size(2) // 8),
                                  Image.BILINEAR)) > 1).astype(int)
            object_map_16 = (np.array(
                object_map.resize((img.size(1) // 16, img.size(2) // 16),
                                  Image.BILINEAR)) > 1).astype(int)
            object_map_32 = (np.array(
                object_map.resize((img.size(1) // 32, img.size(2) // 32),
                                  Image.BILINEAR)) > 1).astype(int)
        if self.testmode:
            return img, np.array(phrase, dtype=int), \
                np.array(ratio, dtype=np.float32), \
                np.array(dw, dtype=np.float32), \
                np.array(dh, dtype=np.float32), self.images[idx][0]
        else:
            return img, object_map_32, object_map_16, object_map_8, \
                np.array(phrase, dtype=int), np.array(bbox, dtype=np.float32)
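
The multi-scale `object_map_*` targets in Example #2 are worth isolating. A self-contained sketch of the same idea, with illustrative names (`build_object_maps`, the sample box) that are not part of the dataset class:

import numpy as np
from PIL import Image, ImageDraw

def build_object_maps(bbox, imsize=256, strides=(8, 16, 32)):
    ## rasterize the box as a white rectangle on a black canvas, then
    ## downsample and binarize once per pyramid stride
    canvas = Image.new("L", (imsize, imsize))
    ImageDraw.Draw(canvas).rectangle(list(map(int, bbox)), fill="white")
    return [
        (np.array(canvas.resize((imsize // s, imsize // s),
                                Image.BILINEAR)) > 1).astype(int)
        for s in strides
    ]

maps8, maps16, maps32 = build_object_maps([40, 60, 180, 200])  # sample box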
Example #3
    def __getitem__(self, idx):
        img, phrase, bbox = self.pull_item(idx)
        phrase = phrase.lower()
        if self.augment:
            augment_flip, augment_hsv, augment_affine = True, True, True

        ## there seems to be a bug in the torchvision resize transform, so the
        ## resize is done separately in advance
        h, w = img.shape[0], img.shape[1]
        if self.augment:
            ## random horizontal flip
            if augment_flip and random.random() > 0.5:
                img = cv2.flip(img, 1)
                bbox[0], bbox[2] = w - bbox[2] - 1, w - bbox[0] - 1
                ## mirror 'left'/'right' in the phrase to match the flipped
                ## image; the sentinel token prevents double replacement
                phrase = phrase.replace('right', '*&^special^&*').replace(
                    'left', 'right').replace('*&^special^&*', 'left')
            ## random intensity, saturation change
            if augment_hsv:
                fraction = 0.50
                img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR),
                                       cv2.COLOR_BGR2HSV)
                S = img_hsv[:, :, 1].astype(np.float32)
                V = img_hsv[:, :, 2].astype(np.float32)
                ## scale saturation by a random factor in [0.5, 1.5]
                a = (random.random() * 2 - 1) * fraction + 1
                S *= a
                if a > 1:
                    np.clip(S, a_min=0, a_max=255, out=S)
                ## scale value (intensity) by an independent random factor
                a = (random.random() * 2 - 1) * fraction + 1
                V *= a
                if a > 1:
                    np.clip(V, a_min=0, a_max=255, out=V)

                img_hsv[:, :, 1] = S.astype(np.uint8)
                img_hsv[:, :, 2] = V.astype(np.uint8)
                img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR),
                                   cv2.COLOR_BGR2RGB)
            img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
            bbox[0], bbox[2] = bbox[0] * ratio + dw, bbox[2] * ratio + dw
            bbox[1], bbox[3] = bbox[1] * ratio + dh, bbox[3] * ratio + dh
            ## random affine transformation
            if augment_affine:
                img, _, bbox, M = random_affine(img, None, bbox, \
                    degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10))
        else:  ## should be inference, or specified training
            img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
            bbox[0], bbox[2] = bbox[0] * ratio + dw, bbox[2] * ratio + dw
            bbox[1], bbox[3] = bbox[1] * ratio + dh, bbox[3] * ratio + dh

        ## Norm, to tensor
        if self.transform is not None:
            img = self.transform(img)
        if self.lstm:
            ## fixed-length vocabulary ids; the mask marks non-padding tokens
            phrase = self.tokenize_phrase(phrase)
            word_id = phrase
            word_mask = np.array(word_id > 0, dtype=int)
        else:
            ## encode the phrase into BERT input ids and an attention mask
            examples = read_examples(phrase, idx)
            features = convert_examples_to_features(examples=examples,
                                                    seq_length=self.query_len,
                                                    tokenizer=self.tokenizer)
            word_id = features[0].input_ids
            word_mask = features[0].input_mask
        if self.testmode:
            return img, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \
                np.array(bbox, dtype=np.float32), np.array(ratio, dtype=np.float32), \
                np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), \
                self.images[idx][0]
        else:
            return img, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \
                np.array(bbox, dtype=np.float32)
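
The `read_examples` / `convert_examples_to_features` pair used above comes from the repository's own BERT helpers. A rough equivalent with the Hugging Face tokenizer, as an assumption (padding and special-token details may differ slightly from the original helpers):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)
enc = tokenizer('the dog on the left',  # the query phrase
                max_length=128,         # query_len in the dataset class
                padding='max_length',
                truncation=True)
word_id, word_mask = enc['input_ids'], enc['attention_mask']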
Example #4
    def __getitem__(self, idx):
        img, phrase, bbox, mask, img_path, clust_label = self.pull_item(idx)
        phrase = phrase.lower()
        ## center of mass of the segmentation mask; [::-1] converts the
        ## (row, col) result into (x, y) order
        center_gt = np.array(ndimage.measurements.center_of_mass(mask)[::-1],
                             dtype=np.float32)
        mask_origin = mask
        image_origin = img

        if self.augment:
            augment_flip, augment_hsv, augment_affine = True, True, True

        ## there seems to be a bug in the torchvision resize transform, so the
        ## resize is done separately in advance
        h, w = img.shape[0], img.shape[1]
        if self.augment:
            ## random horizontal flip
            if augment_flip and random.random() > 0.5:
                img = cv2.flip(img, 1)
                mask = cv2.flip(mask, 1)
                bbox[0], bbox[2] = w - bbox[2] - 1, w - bbox[0] - 1
                center_gt[0] = w - center_gt[0] - 1

                ## mirror 'left'/'right' in the phrase to match the flipped
                ## image; the sentinel token prevents double replacement
                phrase = phrase.replace('right', '*&^special^&*').replace(
                    'left', 'right').replace('*&^special^&*', 'left')
            ## random intensity, saturation change
            if augment_hsv:
                fraction = 0.50
                img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR),
                                       cv2.COLOR_BGR2HSV)
                S = img_hsv[:, :, 1].astype(np.float32)
                V = img_hsv[:, :, 2].astype(np.float32)
                ## scale saturation by a random factor in [0.5, 1.5]
                a = (random.random() * 2 - 1) * fraction + 1
                S *= a
                if a > 1:
                    np.clip(S, a_min=0, a_max=255, out=S)
                ## scale value (intensity) by an independent random factor
                a = (random.random() * 2 - 1) * fraction + 1
                V *= a
                if a > 1:
                    np.clip(V, a_min=0, a_max=255, out=V)

                img_hsv[:, :, 1] = S.astype(np.uint8)
                img_hsv[:, :, 2] = V.astype(np.uint8)
                img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR),
                                   cv2.COLOR_BGR2RGB)
            img, mask, ratio, dw, dh = letterbox(img, mask, self.imsize)
            bbox[0], bbox[2] = bbox[0] * ratio + dw, bbox[2] * ratio + dw
            bbox[1], bbox[3] = bbox[1] * ratio + dh, bbox[3] * ratio + dh

            center_gt[0] = center_gt[0] * ratio + dw
            center_gt[1] = center_gt[1] * ratio + dh
            center = center_gt

            ## random affine transformation
            if augment_affine:
                img, mask, bbox, center, M = random_affine(img, mask, bbox, center, \
                    degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10))
        else:  ## should be inference, or specified training
            img, mask, ratio, dw, dh = letterbox(img, mask, self.imsize)
            bbox[0], bbox[2] = bbox[0] * ratio + dw, bbox[2] * ratio + dw
            bbox[1], bbox[3] = bbox[1] * ratio + dh, bbox[3] * ratio + dh

            center_gt[0] = center_gt[0] * ratio + dw
            center_gt[1] = center_gt[1] * ratio + dh
            center = center_gt

        if self.gaussian is not None:
            ## jitter the supervision point with Gaussian noise, then clamp it
            ## back inside the letterboxed (non-padding) region
            gap = np.random.normal(0, 1, 2) * self.gaussian
            center = center + gap
            center[0] = min(max(center[0], dw), self.imsize - dw)
            center[1] = min(max(center[1], dh), self.imsize - dh)

        ## Norm, to tensor
        if self.transform is not None:
            img = self.transform(img)
        if self.lstm:
            phrase = self.tokenize_phrase(phrase)
            word_id = phrase
            word_mask = np.zeros(word_id.shape)
        else:
            ## encode phrase to bert input
            examples = read_examples(phrase, idx)
            features = convert_examples_to_features(examples=examples,
                                                    seq_length=self.query_len,
                                                    tokenizer=self.tokenizer)
            word_id = features[0].input_ids
            word_mask = features[0].input_mask
        if self.testmode:
            return img, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \
                np.array(bbox, dtype=np.float32), np.array(ratio, dtype=np.float32), \
                np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), \
                self.images[idx][0], mask, np.array(center, dtype=np.float32), \
                phrase, mask_origin, image_origin
        else:
            return img, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \
                np.array(bbox, dtype=np.float32), mask, \
                np.array(center, dtype=np.float32), \
                np.array(clust_label, dtype=np.float32)
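
All of the examples above share the sentinel-based phrase mirroring applied under horizontal flip. Extracted into a standalone helper (the function name is illustrative, not from the source):

def swap_left_right(phrase):
    ## the sentinel prevents the 'right' -> 'left' -> 'right' double swap
    return (phrase.replace('right', '*&^special^&*')
                  .replace('left', 'right')
                  .replace('*&^special^&*', 'left'))

assert swap_left_right('left of the right lamp') == 'right of the left lamp'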