Example #1
    def __init__(self,
                 in_channels,
                 charset=DefaultCharset(),
                 inner_channels=512,
                 max_size=32,
                 height=1,
                 gt_as_output=None,
                 step_dropout=0,
                 **kwargs):
        super(AttentionDecoder, self).__init__()

        self.inner_channels = inner_channels
        self.encode = self._init_encoder(in_channels)

        self.max_size = max_size
        self.charset = charset
        self.height = height
        self.decoder = AttentionRNNCell(inner_channels, max_size + height,
                                        len(charset))
        self.step_dropout = step_dropout

        self.onehot_embedding_x = nn.Embedding(max_size, max_size)
        self.onehot_embedding_x.weight.data = torch.eye(max_size)
        self.onehot_embedding_y = nn.Embedding(height, height)
        self.onehot_embedding_y.weight.data = torch.eye(height)

        self.gt_as_output = gt_as_output
        self.loss_function = nn.NLLLoss(reduction='none')
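The two nn.Embedding layers above are frozen one-hot lookup tables: assigning torch.eye to the weight makes row i the one-hot vector for index i, so the embedding turns a position index into a one-hot coordinate feature. A self-contained sketch of the trick:

import torch
import torch.nn as nn

max_size = 8
onehot = nn.Embedding(max_size, max_size)
onehot.weight.data = torch.eye(max_size)  # row i = one-hot vector for index i

idx = torch.tensor([0, 3, 7])
print(onehot(idx))  # each row has a single 1 at the queried position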
Example #2
class SequenceRecognitionRepresenter(Configurable):
    charset = State(default=DefaultCharset())

    def __init__(self, cmd={}, **kwargs):
        self.load_all(**kwargs)

    def label_to_string(self, label):
        return self.charset.label_to_string(label)

    def represent(self, batch, pred):
        images, labels = batch['image'], batch['label']
        mask = torch.ones(pred.shape[0], dtype=torch.int).to(pred.device)

        for i in range(pred.shape[1]):
            mask = (1 -
                    (pred[:, i] == self.charset.blank).type(torch.int)) * mask
            pred[:, i] = pred[:, i] * mask + self.charset.blank * (1 - mask)

        output = []
        for i in range(labels.shape[0]):
            label_str = self.label_to_string(labels[i])
            pred_str = self.label_to_string(pred[i])
            # Debug branch (disabled by the `False` guard); flip it to inspect
            # mismatched predictions interactively.
            if False and label_str != pred_str:
                print('label: %s , pred: %s' % (label_str, pred_str))
                img = (np.clip(
                    images[i].cpu().data.numpy().transpose(1, 2, 0) + 0.5, 0,
                    1) * 255).astype('uint8')
                webcv2.imshow(
                    '【 pred: <%s> , label: <%s> 】' % (pred_str, label_str),
                    np.array(img, dtype=np.uint8))
                if webcv2.waitKey() == ord('q'):
                    continue
            output.append({'label_string': label_str, 'pred_string': pred_str})

        return output
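The masking loop above forces every position after the first blank back to blank, so the decoded string stops at the first blank. The same logic in isolation:

import torch

blank = 0
pred = torch.tensor([[5, 2, 0, 7],   # the 7 sits after the first blank
                     [1, 0, 3, 0]])
mask = torch.ones(pred.shape[0], dtype=torch.int)
for i in range(pred.shape[1]):
    mask = (1 - (pred[:, i] == blank).type(torch.int)) * mask
    pred[:, i] = pred[:, i] * mask + blank * (1 - mask)
print(pred)  # tensor([[5, 2, 0, 0], [1, 0, 0, 0]])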
Example #3
class SequenceRecognitionVisualizer(Configurable):
    charset = State(default=DefaultCharset())

    def __init__(self, cmd={}, **kwargs):
        self.eager = cmd.get('eager_show', False)
        self.load_all(**kwargs)

    def visualize(self, batch, output, interested):
        return self.visualize_batch(batch, output)

    def visualize_batch(self, batch, output):
        images, labels, lengths = batch['image'], batch['label'], batch[
            'length']
        for i in range(images.shape[0]):
            image = NormalizeImage.restore(images[i])
            gt = self.charset.label_to_string(labels[i])
            webcv2.imshow(output[i]['pred_string'] + '_' + str(i) + '_' + gt,
                          image)
            # folder = 'images/dropout/lexicon/'
            # np.save(folder + output[i]['pred_string'] + '_' + gt + '_' + batch['data_ids'][i], image)
        webcv2.waitKey()
        return {
            'image': (np.clip(
                batch['image'][0].cpu().data.numpy().transpose(1, 2, 0) + 0.5,
                0, 1) * 255).astype('uint8')
        }
Example #4
    def __init__(self,
                 charset=DefaultCharset(),
                 inner_channels=256,
                 in_channels=256,
                 need_reduce=False,
                 reduce_func=None,
                 loss_func='pytorch'):

        super().__init__()
        rnn_input = in_channels
        if need_reduce:
            rnn_input = inner_channels
        self.rnn = nn.Sequential(
            BidirectionalLSTM(rnn_input, inner_channels, inner_channels),
            BidirectionalLSTM(inner_channels, inner_channels, len(charset)))
        self.inner_channels = inner_channels
        if need_reduce:
            if reduce_func == 'conv':
                self.fpn2rnn = self._init_conv(in_channels)
            elif reduce_func == 'pooling':  # need_reduce is already true here
                self.fpn2rnn = self._init_pooling()
        self.softmax = nn.Softmax(dim=-1)  # explicit dim; class scores assumed on the last axis
        if loss_func == 'pytorch':
            self.ctc_loss = nn.CTCLoss(zero_infinity=True)
        else:
            self.ctc_loss = CTCLoss()
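For reference, nn.CTCLoss expects log-probabilities of shape (T, N, C) plus per-sample input and target lengths; a self-contained sketch of feeding it (the shapes and lengths here are made up):

import torch
import torch.nn as nn

T, N, C = 20, 2, 11                      # time steps, batch, classes (blank = 0)
log_probs = torch.randn(T, N, C).log_softmax(2)
targets = torch.randint(1, C, (N, 5), dtype=torch.long)
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 5, dtype=torch.long)

ctc = nn.CTCLoss(zero_infinity=True)     # same flag as above
print(ctc(log_probs, targets, input_lengths, target_lengths).item())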
Example #5
    def __init__(self,
                 in_channels,
                 charset=DefaultCharset(),
                 inner_channels=256,
                 stride=1,
                 blank=0,
                 **kwargs):
        super(CTCDecoder2D, self).__init__()
        self.charset = charset
        from ops import ctc_loss_2d
        self.ctc_loss = ctc_loss_2d

        self.inner_channels = inner_channels
        self.pred_mask = nn.Sequential(
            nn.AvgPool2d(kernel_size=(stride, stride),
                         stride=(stride, stride)),
            nn.Conv2d(in_channels, inner_channels, kernel_size=3, padding=1),
            nn.Conv2d(inner_channels, 1, kernel_size=1), nn.Softmax(dim=2))

        self.pred_classify = nn.Sequential(
            nn.AvgPool2d(kernel_size=(stride, stride),
                         stride=(stride, stride)),
            nn.Conv2d(in_channels, inner_channels, kernel_size=3, padding=1),
            nn.Conv2d(inner_channels, len(charset), kernel_size=1))
        self.blank = blank
        # Smallest positive float, kept as a buffer so it moves with the module
        # across devices; presumably used to guard log(0) in ctc_loss_2d.
        self.tiny = torch.tensor(torch.finfo().tiny, requires_grad=False)
        self.register_buffer('saved_tiny', self.tiny)
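How ctc_loss_2d consumes saved_tiny is not shown here, but registering torch.finfo().tiny is the usual guard against log(0): clamp probabilities to the smallest positive float before taking the log. A minimal sketch of that pattern:

import torch

tiny = torch.finfo(torch.float32).tiny   # smallest positive normal float
probs = torch.tensor([0.0, 1e-45, 0.5])
print(probs.clamp(min=tiny).log())       # finite everywhere, no -inf from log(0)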
Example #6
    def __init__(self,
                 in_channels,
                 charset=DefaultCharset(),
                 inner_channels=256,
                 use_resnet=False,
                 bias=False):
        super(SegRecognizer, self).__init__()
        self.use_resnet = use_resnet
        self.mask = nn.Sequential(
            nn.Conv2d(in_channels, inner_channels, kernel_size=3, padding=1),
            nn.Conv2d(inner_channels, 1, kernel_size=1, padding=0),
            nn.Sigmoid())
        self.classify = nn.Sequential(
            nn.Conv2d(in_channels, inner_channels, kernel_size=3, padding=1),
            nn.Conv2d(inner_channels, len(charset), kernel_size=1, padding=0))

        # for FPN
        self.toplayer = nn.Conv2d(2048,
                                  256,
                                  kernel_size=1,
                                  stride=1,
                                  padding=0)  # to reduce channels
        # upsample
        self.up5 = nn.Upsample(scale_factor=2, mode='nearest')
        self.up4 = nn.Upsample(scale_factor=2, mode='nearest')
        self.up3 = nn.Upsample(scale_factor=2, mode='nearest')

        self.smooth1 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)
        self.smooth2 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)
        self.smooth3 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)

        self.latlayer1 = nn.Conv2d(1024,
                                   256,
                                   kernel_size=1,
                                   stride=1,
                                   padding=0)
        self.latlayer2 = nn.Conv2d(512,
                                   256,
                                   kernel_size=1,
                                   stride=1,
                                   padding=0)
        self.latlayer3 = nn.Conv2d(256,
                                   256,
                                   kernel_size=1,
                                   stride=1,
                                   padding=0)
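toplayer, latlayer*, the upsample modules, and smooth* are the standard FPN top-down pieces; the merge itself happens in forward (not shown), so the wiring below is the textbook pattern rather than this repo's exact code:

import torch
import torch.nn as nn
import torch.nn.functional as F

# Hypothetical backbone features (N, C, H, W), ResNet-like channel counts.
c4 = torch.randn(1, 1024, 16, 16)
c5 = torch.randn(1, 2048, 8, 8)

toplayer = nn.Conv2d(2048, 256, kernel_size=1)
latlayer1 = nn.Conv2d(1024, 256, kernel_size=1)
smooth1 = nn.Conv2d(256, 256, kernel_size=1)

p5 = toplayer(c5)                                        # reduce channels
p4 = latlayer1(c4) + F.interpolate(p5, scale_factor=2,
                                   mode='nearest')       # lateral + upsample
p4 = smooth1(p4)
print(p4.shape)  # torch.Size([1, 256, 16, 16])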
Example #7
    def __init__(self,
                 in_channels,
                 charset=DefaultCharset(),
                 inner_channels=256,
                 **kwargs):
        super(CTCDecoder, self).__init__()
        self.ctc_loss = nn.CTCLoss(reduction='mean')
        self.inner_channels = inner_channels
        self.encode = self._init_encoder(in_channels)

        self.pred_conv = nn.Conv2d(inner_channels,
                                   len(charset),
                                   kernel_size=1,
                                   bias=True,
                                   padding=0)
        self.softmax = nn.LogSoftmax(dim=1)

        self.blank = kwargs.get('blank', 0)
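Decoding is not part of this __init__; for context, greedy CTC decoding takes the per-step argmax, collapses repeats, then drops blanks. A self-contained sketch:

import torch

def ctc_greedy_decode(log_probs, blank=0):
    # log_probs: (T, C). Collapse repeated indices, then remove blanks.
    best = log_probs.argmax(dim=1).tolist()
    out, prev = [], None
    for idx in best:
        if idx != prev and idx != blank:
            out.append(idx)
        prev = idx
    return out

log_probs = torch.log_softmax(torch.randn(10, 5), dim=1)
print(ctc_greedy_decode(log_probs))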
Example #8
class ImageCropper(Configurable):
    charset = State(default=DefaultCharset())
    max_size = State(default=32)
    image_size = State(default=[64, 512])
    mode = State(default='resize')

    RGB_MEAN = np.array([122.67891434, 116.66876762, 104.00698793])

    def __init__(self, cmd={}, **kwargs):
        self.load_all(**kwargs)
        self.resize = ResizeImage(self.image_size, self.mode)

    def is_vertical(self, height, width):
        return height > width * 1.5

    def ensure_horizontal(self, image):
        if self.is_vertical(*image.shape[:2]):
            image = np.flip(np.swapaxes(image, 0, 1), 0)
        return image

    def crop(self, image, poly):
        box = min_area_rect(poly)

        # Round to integer pixel sizes; cv2.warpPerspective expects an int dsize.
        w = int(round(np.linalg.norm(box[1] - box[0])))
        h = int(round(np.linalg.norm(box[2] - box[1])))

        src = box.astype('float32')
        dst = np.array([(0, 0), (w, 0), (w, h), (0, h)], 'float32')
        mat = cv2.getPerspectiveTransform(src, dst)

        image = cv2.warpPerspective(image, mat, (w, h))
        image = image.astype('float32')

        image = self.ensure_horizontal(image)
        image = self.resize(image)

        image -= self.RGB_MEAN
        image /= 255.

        return image
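crop is a standard four-point perspective rectification: estimate the box size from edge lengths, map the box corners onto an axis-aligned rectangle, and warp. The same OpenCV calls on a synthetic image:

import cv2
import numpy as np

image = np.zeros((100, 200, 3), dtype=np.uint8)
# Four corners of a rotated text box, ordered tl, tr, br, bl.
src = np.array([(20, 30), (120, 40), (115, 80), (15, 70)], dtype='float32')
w = int(round(np.linalg.norm(src[1] - src[0])))
h = int(round(np.linalg.norm(src[2] - src[1])))
dst = np.array([(0, 0), (w, 0), (w, h), (0, h)], dtype='float32')
mat = cv2.getPerspectiveTransform(src, dst)
crop = cv2.warpPerspective(image, mat, (w, h))
print(crop.shape)  # (h, w, 3)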
Example #9
class SequenceRecognitionEvaluationRepresenter(Configurable):
    charset = State(default=DefaultCharset())

    def __init__(self, cmd={}, **kwargs):
        self.load_all(**kwargs)

    def label_to_string(self, label):
        return self.charset.label_to_string(label)

    def represent(self, batch, pred):
        images, labels, lengths = batch
        mask = torch.ones(pred.shape[0], dtype=torch.int, device=pred.device)

        for i in range(pred.shape[1]):
            mask = (1 -
                    (pred[:, i] == self.charset.blank).type(torch.int)) * mask
            pred[:, i] = pred[:, i] * mask + self.charset.blank * (1 - mask)

        output = []
        for i in range(images.shape[0]):
            pred_str = self.label_to_string(pred[i])
            output.append(pred_str)
        return output
Example #10
class MakeRecognitionLabel(DataProcess):
    charset = State(default=DefaultCharset())
    max_size = State(default=32)

    def process(self, data):
        assert 'gt' in data, '`gt` in data is required by this process'
        gt = data['gt']
        label = self.gt_to_label(gt)
        data['label'] = label
        if label.sum() == 0:
            raise ValueError('Empty Label')  # FIXME: wrap in a dedicated exception class.

        length = len(gt)
        if self.max_size is not None:
            length = min(length, self.max_size)
        length = np.array(length, dtype=np.int32)
        data['length'] = length
        return data

    def gt_to_label(self, gt, image=None):
        if self.max_size is not None:
            return self.charset.string_to_label(gt)[:self.max_size]
        else:
            return self.charset.string_to_label(gt)
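string_to_label and label_to_string act as a character-index codec. DefaultCharset's real implementation lives elsewhere in the repo; a toy stand-in (hypothetical, with index 0 reserved for the blank) shows the contract MakeRecognitionLabel relies on:

import numpy as np

class ToyCharset:
    def __init__(self, chars='abc'):
        self.blank = 0            # index 0 reserved for the blank symbol
        self.chars = chars

    def string_to_label(self, s):
        return np.array([self.chars.index(c) + 1 for c in s], dtype=np.int32)

    def label_to_string(self, label):
        return ''.join(self.chars[i - 1] for i in label if i != self.blank)

cs = ToyCharset()
print(cs.string_to_label('cab'))         # [3 1 2]
print(cs.label_to_string([3, 1, 2, 0]))  # 'cab'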
Example #11
class CropFileDataset(data.Dataset, Configurable):
    file_pattern = State()
    charset = State(default=DefaultCharset())
    max_size = State(default=32)
    image_size = State(default=[64, 512])
    mode = State(default='resize')

    RGB_MEAN = np.array([122.67891434, 116.66876762, 104.00698793])

    def __init__(self, file_pattern=None, cmd={}, **kwargs):
        self.load_all(**kwargs)
        self.file_pattern = file_pattern or self.file_pattern
        self.resize = ResizeImage(self.image_size, self.mode)
        self.prepare()

    def prepare(self):
        self.file_paths = glob.glob(self.file_pattern)
        assert len(self.file_paths) > 0

        self.gt = []
        for file_name in self.file_paths:
            base_name = os.path.basename(file_name)
            names = base_name.split('_')
            gt_with_suffix = '_'.join(names[1:])
            assert gt_with_suffix.endswith('.jpg')
            gt = gt_with_suffix[:gt_with_suffix.rindex('.jpg')]
            self.gt.append(gt)
        self.num_samples = len(self.file_paths)
        return self

    def is_vertical(self, height, width):
        return height > width * 1.5

    def ensure_horizontal(self, image):
        if self.is_vertical(*image.shape[:2]):
            image = np.flip(np.swapaxes(image, 0, 1), 0)
        return image

    def __getitem__(self, index, retry=0):
        if index >= self.num_samples:
            index = index % self.num_samples
        file_path = self.file_paths[index]
        gt = self.gt[index]

        image = cv2.imread(file_path, cv2.IMREAD_COLOR)
        if image is None and retry < 3:
            # Unreadable file: fall back to the next sample (bounded retries).
            return self.__getitem__(index + 1, retry + 1)
        image = image.astype('float32')
        image = self.ensure_horizontal(image)
        image = self.resize(image)

        image -= self.RGB_MEAN
        image /= 255.
        length = np.array(min(len(gt), self.max_size), dtype=np.int32)

        image = torch.from_numpy(image).permute(2, 0, 1).float()
        label = self.gt_to_label(gt, image)
        return image, label, length

    def gt_to_label(self, gt, image=None):
        return self.charset.string_to_label(gt)[:self.max_size]

    def __len__(self):
        return self.num_samples

    @classmethod
    def restore(cls, data):
        data = data.permute(1, 2, 0).to('cpu').data.numpy()
        data = data * 255.
        data += cls.RGB_MEAN
        # Clip before the uint8 cast to avoid wrap-around on out-of-range values.
        return np.clip(data, 0, 255).astype(np.uint8)
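Two details of this dataset are easy to check in isolation: prepare assumes file names of the form <prefix>_<gt>.jpg, and restore inverts the normalization applied in __getitem__. A self-contained sketch of both (the example path is made up):

import os

import numpy as np

# File-name parsing: everything after the first '_' (minus '.jpg') is the gt.
base_name = os.path.basename('/data/crops/000123_hello_world.jpg')
gt_with_suffix = '_'.join(base_name.split('_')[1:])    # 'hello_world.jpg'
print(gt_with_suffix[:gt_with_suffix.rindex('.jpg')])  # 'hello_world'

# Normalization round trip: restore undoes (image - RGB_MEAN) / 255.
RGB_MEAN = np.array([122.67891434, 116.66876762, 104.00698793])
image = np.random.randint(0, 256, (4, 4, 3)).astype('float32')
norm = (image - RGB_MEAN) / 255.
print(np.allclose(norm * 255. + RGB_MEAN, image))  # True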