def __init__(self,
             in_channels,
             charset=DefaultCharset(),
             inner_channels=512,
             max_size=32,
             height=1,
             gt_as_output=None,
             step_dropout=0,
             **kwargs):
    super(AttentionDecoder, self).__init__()
    self.inner_channels = inner_channels
    self.encode = self._init_encoder(in_channels)
    self.max_size = max_size
    self.charset = charset
    self.height = height
    # The attention cell consumes the encoded feature plus a positional
    # encoding of size max_size + height (x and y one-hots) at every step.
    self.decoder = AttentionRNNCell(inner_channels, max_size + height,
                                    len(charset))
    self.step_dropout = step_dropout
    # Identity-initialized embeddings act as one-hot position encodings.
    self.onehot_embedding_x = nn.Embedding(max_size, max_size)
    self.onehot_embedding_x.weight.data = torch.eye(max_size)
    self.onehot_embedding_y = nn.Embedding(height, height)
    self.onehot_embedding_y.weight.data = torch.eye(height)
    self.gt_as_output = gt_as_output
    self.loss_function = nn.NLLLoss(reduction='none')
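# Illustration (not part of the model code): a minimal, self-contained
# sketch of the identity-initialized embedding trick above. Looking up
# index i in an nn.Embedding whose weight is the identity matrix returns
# the i-th one-hot row, so the embedding behaves as a one-hot position
# encoding. The names `max_size` and `positions` here are illustrative.
import torch
import torch.nn as nn

max_size = 4
onehot = nn.Embedding(max_size, max_size)
onehot.weight.data = torch.eye(max_size)

positions = torch.tensor([0, 2, 3])
print(onehot(positions))
# tensor([[1., 0., 0., 0.],
#         [0., 0., 1., 0.],
#         [0., 0., 0., 1.]], ...)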
class SequenceRecognitionRepresenter(Configurable):
    charset = State(default=DefaultCharset())

    def __init__(self, cmd={}, **kwargs):
        self.load_all(**kwargs)

    def label_to_string(self, label):
        return self.charset.label_to_string(label)

    def represent(self, batch, pred):
        images, labels = batch['image'], batch['label']
        # Once a blank is predicted for a sample, force every later step of
        # that sample to blank as well (greedy end-of-sequence cut-off).
        mask = torch.ones(pred.shape[0], dtype=torch.int).to(pred.device)
        for i in range(pred.shape[1]):
            mask = (1 - (pred[:, i] == self.charset.blank).type(torch.int)) * mask
            pred[:, i] = pred[:, i] * mask + self.charset.blank * (1 - mask)
        output = []
        for i in range(labels.shape[0]):
            label_str = self.label_to_string(labels[i])
            pred_str = self.label_to_string(pred[i])
            if False and label_str != pred_str:  # debug visualization, disabled
                print('label: %s , pred: %s' % (label_str, pred_str))
                img = (np.clip(
                    images[i].cpu().data.numpy().transpose(1, 2, 0) + 0.5,
                    0, 1) * 255).astype('uint8')
                webcv.imshow(
                    '【 pred: <%s> , label: <%s> 】' % (pred_str, label_str),
                    np.array(img, dtype=np.uint8))
                if webcv.waitKey() == ord('q'):
                    continue
            output.append({
                'label_string': label_str,
                'pred_string': pred_str
            })
        return output
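# Illustration (not part of the model code): the blank cut-off above on a
# toy tensor, assuming the blank index is 0 as elsewhere in this file.
# Everything after a sample's first predicted blank is overwritten with
# blank, so trailing garbage never reaches the decoded string.
import torch

blank = 0
pred = torch.tensor([[3, 5, 0, 7],
                     [2, 4, 6, 1]])
mask = torch.ones(pred.shape[0], dtype=torch.int)
for i in range(pred.shape[1]):
    mask = (1 - (pred[:, i] == blank).int()) * mask
    pred[:, i] = pred[:, i] * mask + blank * (1 - mask)
print(pred)
# tensor([[3, 5, 0, 0],
#         [2, 4, 6, 1]])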
class SequenceRecognitionVisualizer(Configurable):
    charset = State(default=DefaultCharset())

    def __init__(self, cmd={}, **kwargs):
        self.eager = cmd.get('eager_show', False)
        self.load_all(**kwargs)

    def visualize(self, batch, output, interested):
        return self.visualize_batch(batch, output)

    def visualize_batch(self, batch, output):
        images, labels, lengths = (batch['image'], batch['label'],
                                   batch['length'])
        for i in range(images.shape[0]):
            image = NormalizeImage.restore(images[i])
            gt = self.charset.label_to_string(labels[i])
            # Window title encodes prediction, batch index, and ground truth.
            webcv2.imshow(
                output[i]['pred_string'] + '_' + str(i) + '_' + gt, image)
            # folder = 'images/dropout/lexicon/'
            # np.save(folder + output[i]['pred_string'] + '_' + gt + '_' +
            #         batch['data_ids'][i], image)
        webcv2.waitKey()
        return {
            'image': (np.clip(
                batch['image'][0].cpu().data.numpy().transpose(1, 2, 0) + 0.5,
                0, 1) * 255).astype('uint8')
        }
def __init__(self,
             charset=DefaultCharset(),
             inner_channels=256,
             in_channels=256,
             need_reduce=False,
             reduce_func=None,
             loss_func='pytorch'):
    super().__init__()
    rnn_input = in_channels
    if need_reduce:
        rnn_input = inner_channels
    # Two stacked bidirectional LSTMs; the second projects to charset size.
    self.rnn = nn.Sequential(
        BidirectionalLSTM(rnn_input, inner_channels, inner_channels),
        BidirectionalLSTM(inner_channels, inner_channels, len(charset)))
    self.inner_channels = inner_channels
    if need_reduce:
        # Reduce FPN features to the RNN input width, by convolution or pooling.
        if reduce_func == 'conv':
            self.fpn2rnn = self._init_conv(in_channels)
        elif reduce_func == 'pooling':
            self.fpn2rnn = self._init_pooling()
    self.softmax = nn.Softmax()
    if loss_func == 'pytorch':
        self.ctc_loss = nn.CTCLoss(zero_infinity=True)
    else:
        self.ctc_loss = CTCLoss()
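# Illustration (not part of the model code): a minimal stand-in for the
# BidirectionalLSTM wrapper assumed above — a bidirectional nn.LSTM
# followed by a linear projection, operating on (T, N, C) sequences as
# is conventional for CRNN-style recognizers. All names and sizes here
# are illustrative, not the repo's actual implementation.
import torch
import torch.nn as nn

class BiLSTMSketch(nn.Module):
    def __init__(self, n_in, n_hidden, n_out):
        super().__init__()
        self.rnn = nn.LSTM(n_in, n_hidden, bidirectional=True)
        self.fc = nn.Linear(n_hidden * 2, n_out)  # 2x: both directions

    def forward(self, x):          # x: (T, N, n_in)
        out, _ = self.rnn(x)       # (T, N, 2 * n_hidden)
        return self.fc(out)        # (T, N, n_out)

seq = torch.randn(16, 2, 256)      # T=16 steps, batch of 2
stack = nn.Sequential(BiLSTMSketch(256, 256, 256),
                      BiLSTMSketch(256, 256, 37))  # 37 ~ toy charset size
print(stack(seq).shape)            # torch.Size([16, 2, 37])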
def __init__(self,
             in_channels,
             charset=DefaultCharset(),
             inner_channels=256,
             stride=1,
             blank=0,
             **kwargs):
    super(CTCDecoder2D, self).__init__()
    self.charset = charset
    from ops import ctc_loss_2d
    self.ctc_loss = ctc_loss_2d
    self.inner_channels = inner_channels
    # Predicts, for every column, a distribution over rows: which vertical
    # position carries the character at that horizontal step.
    self.pred_mask = nn.Sequential(
        nn.AvgPool2d(kernel_size=(stride, stride), stride=(stride, stride)),
        nn.Conv2d(in_channels, inner_channels, kernel_size=3, padding=1),
        nn.Conv2d(inner_channels, 1, kernel_size=1),
        nn.Softmax(dim=2))  # normalize over the height dimension
    # Per-pixel classification over the charset.
    self.pred_classify = nn.Sequential(
        nn.AvgPool2d(kernel_size=(stride, stride), stride=(stride, stride)),
        nn.Conv2d(in_channels, inner_channels, kernel_size=3, padding=1),
        nn.Conv2d(inner_channels, len(charset), kernel_size=1))
    self.blank = blank
    # Smallest positive float, registered as a buffer so it follows the
    # module across devices; used to keep logarithms numerically safe.
    self.tiny = torch.tensor(torch.finfo().tiny, requires_grad=False)
    self.register_buffer('saved_tiny', self.tiny)
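# Illustration (not part of the model code): one hypothetical way the two
# heads above could combine — the actual reduction lives inside the
# ops.ctc_loss_2d kernel, which is not shown here. Weighting the per-pixel
# class scores by the column-wise height attention and summing over rows
# yields ordinary (N, C, W) sequence logits suitable for 1D CTC.
import torch

N, C, H, W = 2, 37, 8, 32
mask = torch.softmax(torch.randn(N, 1, H, W), dim=2)   # per-column, over H
classify = torch.randn(N, C, H, W)                     # per-pixel class scores
sequence_logits = (classify * mask).sum(dim=2)         # marginalize height
print(sequence_logits.shape)  # torch.Size([2, 37, 32])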
def __init__(self,
             in_channels,
             charset=DefaultCharset(),
             inner_channels=256,
             use_resnet=False,
             bias=False):
    super(SegRecognizer, self).__init__()
    self.use_resnet = use_resnet
    # Foreground (text) probability map.
    self.mask = nn.Sequential(
        nn.Conv2d(in_channels, inner_channels, kernel_size=3, padding=1),
        nn.Conv2d(inner_channels, 1, kernel_size=1, padding=0),
        nn.Sigmoid())
    # Per-pixel character classification map.
    self.classify = nn.Sequential(
        nn.Conv2d(in_channels, inner_channels, kernel_size=3, padding=1),
        nn.Conv2d(inner_channels, len(charset), kernel_size=1, padding=0))
    # FPN layers.
    self.toplayer = nn.Conv2d(
        2048, 256, kernel_size=1, stride=1, padding=0)  # reduce channels
    # Top-down upsampling.
    self.up5 = nn.Upsample(scale_factor=2, mode='nearest')
    self.up4 = nn.Upsample(scale_factor=2, mode='nearest')
    self.up3 = nn.Upsample(scale_factor=2, mode='nearest')
    # Smooth the merged feature maps.
    self.smooth1 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)
    self.smooth2 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)
    self.smooth3 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)
    # Lateral connections from the backbone stages.
    self.latlayer1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0)
    self.latlayer2 = nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0)
    self.latlayer3 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)
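# Illustration (not part of the model code): the standard FPN top-down
# pass these layers imply — the repo's actual forward is not shown here.
# Reduce the deepest backbone feature with the top layer, then repeatedly
# upsample and add the lateral projection of the next-shallower stage.
# Tensor shapes below are illustrative ResNet-like sizes.
import torch
import torch.nn as nn

toplayer = nn.Conv2d(2048, 256, kernel_size=1)
latlayer1 = nn.Conv2d(1024, 256, kernel_size=1)
up = nn.Upsample(scale_factor=2, mode='nearest')
smooth1 = nn.Conv2d(256, 256, kernel_size=1)

c5 = torch.randn(1, 2048, 4, 16)   # deepest backbone feature
c4 = torch.randn(1, 1024, 8, 32)   # one stage shallower, 2x resolution
p5 = toplayer(c5)
p4 = smooth1(up(p5) + latlayer1(c4))
print(p4.shape)  # torch.Size([1, 256, 8, 32])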
def __init__(self, in_channels, charset=DefaultCharset(),
             inner_channels=256, **kwargs):
    super(CTCDecoder, self).__init__()
    self.ctc_loss = nn.CTCLoss(reduction='mean')
    self.inner_channels = inner_channels
    self.encode = self._init_encoder(in_channels)
    # 1x1 convolution projecting encoded features to per-class scores.
    self.pred_conv = nn.Conv2d(
        inner_channels, len(charset), kernel_size=1, bias=True, padding=0)
    self.softmax = nn.LogSoftmax(dim=1)
    self.blank = kwargs.get('blank', 0)
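# Illustration (not part of the model code): minimal usage of nn.CTCLoss
# as configured above. It expects log-probabilities shaped (T, N, C),
# integer targets, and per-sample input/target lengths; the blank class
# defaults to index 0. All shapes here are illustrative only.
import torch
import torch.nn as nn

T, N, C = 32, 2, 37          # time steps, batch, classes (blank = 0)
log_probs = torch.randn(T, N, C).log_softmax(dim=2)
targets = torch.randint(1, C, (N, 10), dtype=torch.long)
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 10, dtype=torch.long)

loss = nn.CTCLoss(reduction='mean')(log_probs, targets,
                                    input_lengths, target_lengths)
print(loss.item())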
class ImageCropper(Configurable):
    charset = State(default=DefaultCharset())
    max_size = State(default=32)
    image_size = State(default=[64, 512])
    mode = State(default='resize')
    RGB_MEAN = np.array([122.67891434, 116.66876762, 104.00698793])

    def __init__(self, cmd={}, **kwargs):
        self.load_all(**kwargs)
        self.resize = ResizeImage(self.image_size, self.mode)

    def is_vertical(self, height, width):
        return height > width * 1.5

    def ensure_horizontal(self, image):
        # Rotate vertical text regions by 90 degrees so they read horizontally.
        if self.is_vertical(*image.shape[:2]):
            image = np.flip(np.swapaxes(image, 0, 1), 0)
        return image

    def crop(self, image, poly):
        # Rectify the (possibly rotated) text region into an upright patch.
        box = min_area_rect(poly)
        w = int(round(np.linalg.norm(box[1] - box[0])))
        h = int(round(np.linalg.norm(box[2] - box[1])))
        src = box.astype('float32')
        dst = np.array([(0, 0), (w, 0), (w, h), (0, h)], 'float32')
        mat = cv2.getPerspectiveTransform(src, dst)
        image = cv2.warpPerspective(image, mat, (w, h))
        image = image.astype('float32')
        image = self.ensure_horizontal(image)
        image = self.resize(image)
        image -= self.RGB_MEAN
        image /= 255.
        return image
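# Illustration (not part of the model code): the rectification step above
# on its own — map a quadrilateral onto an upright w x h patch via a
# perspective transform. The image and box values are hand-picked toys.
import cv2
import numpy as np

image = np.full((100, 200, 3), 128, dtype=np.uint8)
box = np.array([(30, 20), (150, 35), (145, 70), (25, 55)], dtype='float32')

w = int(round(np.linalg.norm(box[1] - box[0])))
h = int(round(np.linalg.norm(box[2] - box[1])))
dst = np.array([(0, 0), (w, 0), (w, h), (0, h)], dtype='float32')

mat = cv2.getPerspectiveTransform(box, dst)
patch = cv2.warpPerspective(image, mat, (w, h))
print(patch.shape)  # (h, w, 3)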
class SequenceRecognitionEvaluationRepresenter(Configurable):
    charset = State(default=DefaultCharset())

    def __init__(self, cmd={}, **kwargs):
        self.load_all(**kwargs)

    def label_to_string(self, label):
        return self.charset.label_to_string(label)

    def represent(self, batch, pred):
        images, labels, lengths = batch
        # Same blank cut-off as in SequenceRecognitionRepresenter: once a
        # blank appears, every later step of that sample becomes blank.
        mask = torch.ones(pred.shape[0], dtype=torch.int)
        for i in range(pred.shape[1]):
            mask = (1 - (pred[:, i] == self.charset.blank).type(torch.int)) * mask
            pred[:, i] = pred[:, i] * mask + self.charset.blank * (1 - mask)
        output = []
        for i in range(images.shape[0]):
            pred_str = self.label_to_string(pred[i])
            output.append(pred_str)
        return output
class MakeRecognitionLabel(DataProcess):
    charset = State(default=DefaultCharset())
    max_size = State(default=32)

    def process(self, data):
        assert 'gt' in data, '`gt` in data is required by this process'
        gt = data['gt']
        label = self.gt_to_label(gt)
        data['label'] = label
        if label.sum() == 0:
            raise ValueError('Empty Label')  # FIXME: package into a class.
        length = len(gt)
        if self.max_size is not None:
            length = min(length, self.max_size)
        data['length'] = np.array(length, dtype=np.int32)
        return data

    def gt_to_label(self, gt, image=None):
        # Truncate labels longer than max_size when a limit is set.
        if self.max_size is not None:
            return self.charset.string_to_label(gt)[:self.max_size]
        return self.charset.string_to_label(gt)
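# Illustration (not part of the model code): a toy stand-in for the
# charset mapping assumed above. DefaultCharset is repo-specific; this
# sketch only shows the convention the rest of the file relies on —
# blank at index 0, characters mapped to 1..N, labels truncated to
# max_size by slicing.
import numpy as np

class ToyCharset:
    def __init__(self, chars='abc'):
        self.blank = 0
        self.index = {c: i + 1 for i, c in enumerate(chars)}

    def string_to_label(self, s):
        return np.array([self.index[c] for c in s], dtype=np.int64)

charset = ToyCharset()
print(charset.string_to_label('cab')[:2])  # [3 1]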
class CropFileDataset(data.Dataset, Configurable):
    file_pattern = State()
    charset = State(default=DefaultCharset())
    max_size = State(default=32)
    image_size = State(default=[64, 512])
    mode = State(default='resize')
    RGB_MEAN = np.array([122.67891434, 116.66876762, 104.00698793])

    def __init__(self, file_pattern=None, cmd={}, **kwargs):
        self.load_all(**kwargs)
        self.file_pattern = file_pattern or self.file_pattern
        self.resize = ResizeImage(self.image_size, self.mode)
        self.prepare()

    def prepare(self):
        self.file_paths = glob.glob(self.file_pattern)
        assert len(self.file_paths) > 0
        self.gt = []
        for file_name in self.file_paths:
            # File names look like '<id>_<gt>.jpg'; everything after the
            # first underscore (minus the extension) is the ground truth.
            base_name = os.path.basename(file_name)
            names = base_name.split('_')
            gt_with_suffix = '_'.join(names[1:])
            assert gt_with_suffix.endswith('.jpg')
            gt = gt_with_suffix[:gt_with_suffix.rindex('.jpg')]
            self.gt.append(gt)
        self.num_samples = len(self.file_paths)
        return self

    def is_vertical(self, height, width):
        return height > width * 1.5

    def ensure_horizontal(self, image):
        if self.is_vertical(*image.shape[:2]):
            image = np.flip(np.swapaxes(image, 0, 1), 0)
        return image

    def __getitem__(self, index, retry=0):
        if index >= self.num_samples:
            index = index % self.num_samples
        file_path = self.file_paths[index]
        gt = self.gt[index]
        image = cv2.imread(file_path, cv2.IMREAD_COLOR).astype('float32')
        image = self.ensure_horizontal(image)
        image = self.resize(image)
        image -= self.RGB_MEAN
        image /= 255.
        length = np.array(min(len(gt), self.max_size), dtype=np.int32)
        image = torch.from_numpy(image).permute(2, 0, 1).float()
        label = self.gt_to_label(gt, image)
        return image, label, length

    def gt_to_label(self, gt, image=None):
        return self.charset.string_to_label(gt)[:self.max_size]

    def __len__(self):
        return self.num_samples

    @classmethod
    def restore(cls, data):
        # Invert the normalization applied in __getitem__.
        data = data.permute(1, 2, 0).to('cpu').data.numpy()
        data = data * 255.
        data += cls.RGB_MEAN
        return data.astype(np.uint8)
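# Illustration (not part of the model code): a standalone check of the
# '<id>_<gt>.jpg' parsing used in prepare(). The sample file name is
# illustrative; underscores inside the ground truth survive the split
# because only the first segment is discarded.
import os

def gt_from_name(file_name):
    base_name = os.path.basename(file_name)
    gt_with_suffix = '_'.join(base_name.split('_')[1:])
    assert gt_with_suffix.endswith('.jpg')
    return gt_with_suffix[:gt_with_suffix.rindex('.jpg')]

print(gt_from_name('/data/crops/000017_hello_world.jpg'))  # hello_world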