Example #1
def build_dataloader(cfg, is_train=True):
    type_name = cfg.DATASETS.TYPE
    model_type = cfg.BASE.TYPE
    # No need here to distinguish whether the dataset is for detection or recognition; just fill in the required paths
    if is_train:
        '''Use a letter to encode the type of the model'''
        if model_type == 'R':
            alphabet = Alphabet(cfg.ADDRESS.ALPHABET)
        else:
            alphabet = Alphabet()

        train_data_dir = cfg.ADDRESS.TRAIN_DATA_DIR
        train_anno_dir = cfg.ADDRESS.TRAIN_GT_DIR
        val_data_dir = cfg.ADDRESS.VAL_DATA_DIR
        val_anno_dir = cfg.ADDRESS.VAL_GT_DIR

        train_set = get_dataset(cfg, type_name, train_data_dir, train_anno_dir, split='train', alphabet=alphabet)
        val_set = get_dataset(cfg, type_name, val_data_dir, val_anno_dir, split='val', alphabet=alphabet)

        train_dataloader = get_dataloader(cfg, type_name, dataset=train_set, split='train')
        val_dataloader = get_dataloader(cfg, type_name, dataset=val_set, split='val')

        # Check that the batch size is compatible with the number of GPUs
        images_per_batch = cfg.MODEL.BATCH_SIZE
        num_gpus = int(cfg.BASE.NUM_GPUS)
        assert (
                images_per_batch % num_gpus == 0
        ), "IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format(
            images_per_batch, num_gpus)
        images_per_gpu = images_per_batch // num_gpus

        if images_per_gpu > 5:
            logger = logging.getLogger(__name__)
            logger.warning(
                "A high number of images per GPU may cause out-of-memory errors; "
                "if that happens, lower BATCH_SIZE and adjust the learning rate "
                "and other settings that may affect accuracy."
            )

        return train_dataloader, val_dataloader

    else:
        if model_type == 'R':
            alphabet = Alphabet(cfg.ADDRESS.ALPHABET)
        else:
            alphabet = Alphabet()

        test_data_dir = cfg.ADDRESS.TEST_DATA_DIR
        test_anno_dir = cfg.ADDRESS.TEST_GT_DIR
        test_set = get_dataset(cfg, type_name, test_data_dir, test_anno_dir, split='test', alphabet=alphabet)

        test_dataloader = get_dataloader(cfg, type_name, dataset=test_set, split='test')
        return test_dataloader
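A minimal call-site sketch for this loader, assuming cfg is a yacs-style CfgNode exposing the fields read above; get_default_cfg and the loop body are hypothetical placeholders, not taken from the project:

# Hypothetical usage of build_dataloader; the names below are placeholders.
cfg = get_default_cfg()                          # assumed helper returning the project's CfgNode
train_loader, val_loader = build_dataloader(cfg, is_train=True)
test_loader = build_dataloader(cfg, is_train=False)
for batch in train_loader:                       # batch structure depends on get_dataloader
    pass                                         # training step goes here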
Example #2
    def __init__(self, opt):
        nn.Module.__init__(self)

        from alphabet.alphabet import Alphabet
        self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))
        self.opt = opt

        # self.stn = SpatialTransformer(self.opt)
        self.cnn = self.getCNN_cap()
        self.rnn = self.getEncoder()
        # Attention(input_size, hidden_size, num_classes, num_embeddings); cf. Example #3
        # self.attention = Attention(self.n_class, 256, 128, 256)
        self.attention = Attention(256, 256, self.n_class, 128)


        # Spatial transformer localization-network
        self.localization = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=7),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(True),
            nn.Conv2d(8, 10, kernel_size=5),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(True)
        )

        # Regressor for the 3 * 2 affine matrix
        self.fc_loc = nn.Sequential(
            nn.Linear(10 * 4 * 21, 32),
            nn.ReLU(True),
            nn.Linear(32, 3 * 2)
        )

        # Initialize the weights/bias with identity transformation
        self.fc_loc[2].weight.data.fill_(0)
        self.fc_loc[2].bias.data = torch.FloatTensor([1, 0, 0, 0, 1, 0])
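The localization network and fc_loc regressor above are only half of a spatial transformer; the warping step is not shown in this snippet. A minimal sketch of how they are typically combined, following the standard PyTorch STN recipe (the method name stn and the use of F.affine_grid / F.grid_sample are assumptions, not taken from the project):

from torch.nn import functional as F  # needed for the grid operations below

    def stn(self, x):
        # Hypothetical helper, not part of the original snippet.
        xs = self.localization(x)                  # conv/pool features of the input image
        xs = xs.view(-1, 10 * 4 * 21)              # must match the fc_loc input size above
        theta = self.fc_loc(xs).view(-1, 2, 3)     # predicted 2x3 affine matrix
        grid = F.affine_grid(theta, x.size(), align_corners=False)
        return F.grid_sample(x, grid, align_corners=False)   # rectified input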
Example #3
File: models.py  Project: happog/FudanOCR
    def __init__(self, opt):
        nn.Module.__init__(self)

        from alphabet.alphabet import Alphabet
        self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))

        self.blstm = BLSTM(512, 256)
        self.attention = Attention(input_size=512, hidden_size=256, num_classes=self.n_class, num_embeddings=128)
Example #4
    def __init__(self, opt):
        super(CRNN, self).__init__()
        self.opt = opt
        '''alphabet'''
        from alphabet.alphabet import Alphabet
        self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET)) + 1
        '''cnn'''
        self.cnn = self.getCNN()
        '''rnn'''
        self.rnn = self.getRNN()
Example #5
    def __init__(self, opt=None):
        nn.Module.__init__(self)
        self.opt = opt

        from alphabet.alphabet import Alphabet
        self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))

        self.cnn = ResNet(num_in=opt.IMAGE.IMG_CHANNEL, block=BasicBlock, layers=[1, 2, 5, 3])  # (BS,6,40)
        self.encoder = Encoder()  # (40,BS,512)
        self.decoder = Attention(opt=opt)
Example #6
    def __init__(self, opt):
        nn.Module.__init__(self)
        self.opt = opt

        from alphabet.alphabet import Alphabet
        self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))

        self.cnn = CapsNet(E=10)  # (BS,6,40)
        self.encoder = Encoder()  # (40,BS,512)
        self.decoder = Attention(opt)

        self.fc = nn.Linear(272, 512, bias=True)
        self.relu = nn.ReLU()
Example #7
    def __init__(self, opt):
        super(Attention, self).__init__()
        self.attention_cell = AttentionCell()

        from alphabet.alphabet import Alphabet
        self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))

        self.generator = nn.Linear(512, self.n_class)
        self.char_embeddings = Parameter(torch.randn(self.n_class + 1, 128))
        '''used for conv_feats'''
        self.conv = nn.Conv2d(512, 512, 3, 1, 1)
        self.bn = nn.BatchNorm2d(512)
        self.relu = nn.ReLU()
Example #8
File: models.py  Project: dun933/FudanOCR
    def __init__(self, opt):
        nn.Module.__init__(self)

        from alphabet.alphabet import Alphabet
        self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))
        self.opt = opt

        self.stn = SpatialTransformer(self.opt)
        self.cnn = self.getCNN()
        self.rnn = self.getEncoder()
        # Attention(input_size, hidden_size, num_classes, num_embeddings); cf. Example #3
        # self.attention = Attention(self.n_class, 256, 128, 256)
        self.attention = Attention(256, 256, self.n_class, 128)
Example #9
    def loadTool(self):
        '''
        Load the corresponding components according to the model type.
        '''
        if self.opt.BASE.TYPE == 'R':
            self.alphabet = Alphabet(self.opt.ADDRESS.ALPHABET)
            if self.opt.BASE.MODEL in ('GRCNN', 'CRNN', 'CAPSOCR2'):
                from utils.strLabelConverterForCTC import strLabelConverterForCTC
                self.converter = strLabelConverterForCTC(self.alphabet.str)
            else:
                from utils.strLabelConverterForAttention import strLabelConverterForAttention
                self.converter = strLabelConverterForAttention(self.alphabet.str)
            self.highestAcc = 0
            self.val_times = 0
Example #10
    def __init__(self, opt):
        nn.Module.__init__(self)

        from alphabet.alphabet import Alphabet
        self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))
        self.opt = opt

        # self.stn = SpatialTransformer(self.opt)
        self.cnn = self.getCNN_sr()
        self.rnn = self.getEncoder()
        # Attention(input_size, hidden_size, num_classes, num_embeddings); cf. Example #3
        # self.attention = Attention(self.n_class, 256, 128, 256)
        self.attention = Attention(256, 256, self.n_class, 128)

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()

        # ========= ConvCaps Layers
        for d in range(1, 2):
            '''self-routing capsule layer'''
            self.conv_layers.append(
                SelfRouting2d(num_caps,
                              num_caps,
                              caps_size,
                              caps_size,
                              kernel_size=3,
                              stride=1,
                              padding=1,
                              pose_out=True))
            '''bn'''
            self.norm_layers.append(nn.BatchNorm2d(caps_size * num_caps))
        '''activation output'''
        self.conv_a = nn.Conv2d(8 * planes,
                                num_caps,
                                kernel_size=3,
                                stride=1,
                                padding=1,
                                bias=False)
        '''pose variables'''
        self.conv_pose = nn.Conv2d(8 * planes,
                                   num_caps * caps_size,
                                   kernel_size=3,
                                   stride=1,
                                   padding=1,
                                   bias=False)
        '''two batch norms'''
        self.bn_a = nn.BatchNorm2d(num_caps)
        self.bn_pose = nn.BatchNorm2d(num_caps * caps_size)
Example #11
    def predict(self, texts, embedding_alphabet: AlphabetEmbeddings,
                label_alphabet: Alphabet, batch_size):
        lens = len(texts)
        batch_num = (lens + batch_size - 1) // batch_size
        ans = []
        for i in range(batch_num):
            start = i * batch_size
            end = min(start + batch_size, lens)
            part = texts[start:end]
            part, lengths, mask = embedding_alphabet.add_padding_tensor(
                part, gpu=self.gpu)
            pred = self.forward(part, lengths, mask)
            pred = torch.argmax(pred, dim=-1, keepdim=False)
            pred = pred.tolist()
            pred = label_alphabet.get_instance(pred)
            ans.extend(pred)
        return ans
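A minimal sketch of driving this method, assuming the model and both alphabets were built as in Example #15; the input sentences and batch size are illustrative:

# Hypothetical call; seq_alphabet / label_alphabet come from the setup shown in Example #15.
texts = ['an absorbing documentary', 'a tedious mess']
labels = model.predict(texts,
                       embedding_alphabet=seq_alphabet,   # provides add_padding_tensor
                       label_alphabet=label_alphabet,     # provides get_instance
                       batch_size=16)
print(labels)   # decoded label strings, one per input text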
Example #12
File: models.py  Project: happog/FudanOCR
    def __init__(self, opt):
        nn.Module.__init__(self)
        self.opt = opt

        from alphabet.alphabet import Alphabet
        self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))

        self.fe = Feature_Extractor(strides=[(1, 1), (2, 2), (1, 1), (2, 2),
                                             (1, 1), (1, 1)],
                                    compress_layer=False,
                                    input_shape=[1, 32, 128])

        scales = self.fe.Iwantshapes()

        self.cam = CAM(scales=scales, maxT=25, depth=8, num_channels=64)

        self.dtd = DTD(
            nclass=self.n_class,
            nchannel=512,
            dropout=0.3,
        )
Example #13
File: crann.py  Project: dun933/FudanOCR
    def __init__(self, opt):
        # self.alphabet = Alphabet(self.opt.ADDRESS.RECOGNITION.ALPHABET)
        from alphabet.alphabet import Alphabet

        # self.nclass = len(alphabet)
        self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET)) + 1
        # self.n_class = len(alphabet)
        self.crann_config = json.loads(json.dumps(opt))

        n_class = self.n_class
        crann_config = self.crann_config

        print("crann_config's value is ", crann_config)
        print(type(crann_config))

        super(newCRANN, self).__init__()
        self.ngpu = crann_config['BASE']['NUM_GPUS']
        cnn_conf = crann_config['CNN']
        print('Constructing {}'.format(cnn_conf['MODEL']))
        self.cnn = ConvNets.__dict__[cnn_conf['MODEL']]()

        rnn_conf = crann_config['RNN']
        print('Constructing {}'.format(rnn_conf['MODEL']))
        self.rnn = SeqNets.__dict__[rnn_conf['MODEL']](rnn_conf, n_class)
Example #14
    def __init__(self, opt):

        from alphabet.alphabet import Alphabet

        # self.nclass = len(alphabet)
        self.nclass = len(Alphabet(opt.ADDRESS.ALPHABET))
        self.nh = opt.nh
        self.targetH = opt.targetH
        self.targetW = opt.targetW
        self.BidirDecoder = opt.BidirDecoder
        self.inputDataType = opt.inputDataType
        self.maxBatch = opt.maxBatch
        self.CUDA = opt.CUDA
        self.nc = opt.IMAGE.IMG_CHANNEL

        # def __init__(self, nc, nclass, nh, targetH, targetW, BidirDecoder=False,
        # 	inputDataType='torch.cuda.FloatTensor', maxBatch=256, CUDA=True):
        '''
        Initialize the MORAN model, which consists of two parts: MORN and ASRN.

        :param int nc: number of image channels
        :param int nclass: number of characters in the alphabet
        :param int nh: size of the LSTM hidden state
        :param int targetH: target image height after rectification by MORN
        :param int targetW: target image width after rectification by MORN
        :param bool BidirDecoder: whether to use a bidirectional LSTM decoder
        :param str inputDataType: input tensor data type
        :param int maxBatch: maximum batch size
        :param bool CUDA: whether to use CUDA
        '''

        super(newMORAN, self).__init__()
        self.MORN = MORN(self.nc, self.targetH, self.targetW,
                         self.inputDataType, self.maxBatch, self.CUDA)
        self.ASRN = ASRN(self.targetH, self.nc, self.nclass, self.nh,
                         self.BidirDecoder, self.CUDA)
Example #15
    # Fix the random seed of Pytorch when using GPU.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.random_state)
        torch.cuda.manual_seed(args.random_state)

    # Fix the random seed of Pytorch when using CPU.
    torch.manual_seed(args.random_state)
    torch.random.manual_seed(args.random_state)

    # get dataset and alphabets
    dataset = DataIOSST2(config['data'])
    if config['use_pre_embedding']:
        seq_alphabet = AlphabetEmbeddings(**config['embedding'])
        seq_alphabet.load_embeddings_from_file()
    else:
        seq_alphabet = AlphabetEmbeddings(**config['embedding'])
        seq_alphabet.add_instance(dataset.train_word)
    label_alphabet = Alphabet('label', False, False)
    label_alphabet.add_instance(dataset.train_label)

    # get model
    if args.load is not None:
        model = torch.load(args.load)
    else:
        model = ModelFactory.get_model(config, args, seq_alphabet,
                                       label_alphabet)

    process = Process(config, args, dataset, model, seq_alphabet,
                      label_alphabet)
    process.train()