Exemplo n.º 1
0
    def __init__(self,
                 casia_root_dir='/vol/corpora/document-image-analysis/casia',
                 train_split=(1,),
                 test_split=(2,),
                 local_mode=True):
        '''
        Constructor

        @param casia_root_dir: full path to the CASIA root dir
        @param train_split: folder ids whose characters form the train split
                            (any iterable of ints; default: (1,))
        @param test_split: folder ids whose characters form the test split
                           (any iterable of ints; default: (2,))
        @param local_mode: if True, casia_root_dir is replaced by a local
                           development path
        '''
        # NOTE(review): local_mode defaults to True and silently overrides
        # the supplied root dir with a machine-specific path -- consider
        # defaulting it to False.
        if local_mode:
            casia_root_dir = '/Users/mellome1992/Documents/LocalRepository/phocnet/src/gnt_utils/dataset'

        # load the dataset and keep the character-code mapping
        character_list, code_map = DatasetLoader.load_casia(casia_root_dir)
        self.code_map = code_map

        # assign each character to train or test according to its Gnt
        # folder id (tuples instead of mutable list defaults)
        self.train_list = [
            character for character in character_list
            if int(character.get_folder_id()) in train_split
        ]
        self.test_list = [
            character for character in character_list
            if int(character.get_folder_id()) in test_split
        ]

        # compute a mapping from class string to class id
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit([
            character.get_label()
            for character in self.train_list + self.test_list
        ])
Exemplo n.º 2
0
    def __init__(self,
                 rimes_root_dir,
                 embedding_config=None,
                 ignore_diacrits=False,
                 fixed_image_size=None,
                 min_image_width_height=30):
        '''
        Constructor

        @param rimes_root_dir: full path to the Rimes root dir
        @param embedding_config: configuration of embedding (only phoc available)
                                 -unigrams: alnum36 or all
                                 -type: phoc
                                 -levels: embedding levels
                                 (default: all unigrams, phoc, levels (1, 2, 4, 8))
        @param ignore_diacrits: diacrits are mapped to ascii chars
        @param fixed_image_size: resize images to a fixed size
        @param min_image_width_height: the minimum height or width a word image
                                       has to have
        '''
        # use a sentinel instead of a mutable dict default argument, which
        # would be shared (and mutable) across all instances
        if embedding_config is None:
            embedding_config = {'unigrams': 'all',
                                'type': 'phoc',
                                'levels': (1, 2, 4, 8)}

        # sanity checks
        if embedding_config['unigrams'] not in ['alnum36', 'all']:
            raise ValueError('Unknown unigram definition')
        if embedding_config['type'] not in ['phoc']:
            raise ValueError('embedding must be phoc')

        # class members
        self.embedding_config = embedding_config
        self.fixed_image_size = fixed_image_size
        self.min_image_width_height = min_image_width_height

        # load the dataset
        self.train_list, self.test_list = DatasetLoader.load_rimes(
            path=rimes_root_dir, ignore_diacrits=ignore_diacrits)

        # compute string embeddings

        # determine the unigram alphabet: either the fixed a-z0-9 set or
        # all unigrams occurring in the transcriptions
        if embedding_config['unigrams'] == 'alnum36':
            self.unigrams = [
                chr(i) for i in np.hstack([
                    np.arange(ord('a'),
                              ord('z') + 1),
                    np.arange(ord('0'),
                              ord('9') + 1)
                ])
            ]
        elif embedding_config['unigrams'] == 'all':
            self.unigrams = get_unigrams_from_strings([
                word.get_transcription()
                for word in self.train_list + self.test_list
            ])
        else:
            raise ValueError('Unknown unigram type')

        if embedding_config['type'] == 'phoc':
            self.train_embeddings = build_phoc_descriptor(
                words=[word.get_transcription() for word in self.train_list],
                phoc_unigrams=self.unigrams,
                unigram_levels=embedding_config['levels'])
            self.test_embeddings = build_phoc_descriptor(
                words=[word.get_transcription() for word in self.test_list],
                phoc_unigrams=self.unigrams,
                unigram_levels=embedding_config['levels'])
        else:
            raise ValueError('Unknown embedding type')

        # compute a mapping from class string to class id
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit([
            word.get_transcription()
            for word in self.train_list + self.test_list
        ])
Exemplo n.º 3
0
    def __init__(self,
                 gw_root_dir,
                 split_idx=1,
                 embedding_config=None,
                 fixed_image_size=None,
                 min_image_width_height=30):
        '''
        Constructor

        @param gw_root_dir: full path to the GW root dir
        @param split_idx: the index of the CV split to be used
        @param embedding_config: configuration of embedding (only phoc available)
                                 -unigrams: alnum36 or all
                                 -type: phoc
                                 -levels: embedding levels
                                 (default: alnum36 unigrams, phoc, levels (1, 2, 4, 8))
        @param fixed_image_size: resize images to a fixed size
        @param min_image_width_height: the minimum height or width a word image
                                       has to have
        '''
        # use a sentinel instead of a mutable dict default argument, which
        # would be shared (and mutable) across all instances
        if embedding_config is None:
            embedding_config = {'unigrams': 'alnum36',
                                'type': 'phoc',
                                'levels': (1, 2, 4, 8)}

        # sanity checks
        if embedding_config['unigrams'] not in ['alnum36', 'all']:
            raise ValueError('Unknown unigram definition')
        if embedding_config['type'] not in ['phoc']:
            raise ValueError('embedding must be phoc')
        if split_idx is None:
            raise ValueError('You need to choose a cv split')

        # class members
        self.embedding_config = embedding_config
        self.fixed_image_size = fixed_image_size
        self.min_image_width_height = min_image_width_height

        # load split ids for cross validation
        split_ids = np.load(os.path.join(gw_root_dir,
                                         'almazan_cv_indices.npy'))

        # load the dataset
        word_list = DatasetLoader.load_gw(path=gw_root_dir)

        # words whose split id matches split_idx form the test set,
        # all others form the train set
        self.train_list = [
            word for word, split_id in zip(word_list, split_ids)
            if split_id != split_idx
        ]

        self.test_list = [
            word for word, split_id in zip(word_list, split_ids)
            if split_id == split_idx
        ]

        # compute string embeddings

        # determine the unigram alphabet: either the fixed a-z0-9 set or
        # all unigrams occurring in the transcriptions
        if embedding_config['unigrams'] == 'alnum36':
            self.unigrams = [
                chr(i) for i in np.hstack([
                    np.arange(ord('a'),
                              ord('z') + 1),
                    np.arange(ord('0'),
                              ord('9') + 1)
                ])
            ]
        elif embedding_config['unigrams'] == 'all':
            self.unigrams = get_unigrams_from_strings([
                word.get_transcription()
                for word in self.train_list + self.test_list
            ])
        else:
            raise ValueError('Unknown unigram type')

        if embedding_config['type'] == 'phoc':
            self.train_embeddings = build_phoc_descriptor(
                words=[word.get_transcription() for word in self.train_list],
                phoc_unigrams=self.unigrams,
                unigram_levels=embedding_config['levels'])
            self.test_embeddings = build_phoc_descriptor(
                words=[word.get_transcription() for word in self.test_list],
                phoc_unigrams=self.unigrams,
                unigram_levels=embedding_config['levels'])
        else:
            raise ValueError('Unknown embedding type')

        # compute a mapping from class string to class id
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit([
            word.get_transcription()
            for word in self.train_list + self.test_list
        ])
    def __init__(self,
                 iam_root_dir,
                 embedding_config=None,
                 fixed_image_size=None,
                 min_image_width_height=30,
                 remove_punctuation=True):
        '''
        Constructor

        @param iam_root_dir: full path to the IAM root dir
        @param embedding_config: configuration of embedding (only phoc available)
                                 -unigrams: alnum36 or all
                                 -type: phoc
                                 -levels: embedding levels
                                 (default: alnum36 unigrams, phoc, levels (1, 2, 4, 8))
        @param fixed_image_size: resize images to a fixed size
        @param min_image_width_height: the minimum height or width a word image
                                       has to have
        @param remove_punctuation: remove punctuation elements in train and test list
        '''
        # use a sentinel instead of a mutable dict default argument, which
        # would be shared (and mutable) across all instances
        if embedding_config is None:
            embedding_config = {'unigrams': 'alnum36',
                                'type': 'phoc',
                                'levels': (1, 2, 4, 8)}

        # sanity checks
        if embedding_config['unigrams'] not in ['alnum36', 'all']:
            raise ValueError('Unknown unigram definition')
        if embedding_config['type'] not in ['phoc']:
            raise ValueError('embedding must be phoc')

        # class members
        self.embedding_config = embedding_config
        self.fixed_image_size = fixed_image_size
        self.min_image_width_height = min_image_width_height
        self.iam_root_dir = iam_root_dir

        # load the dataset
        self.train_list, self.test_list = DatasetLoader.load_iam(path=iam_root_dir)

        # optionally drop words whose transcription is a single
        # punctuation character
        if remove_punctuation:
            punctuation = string.punctuation
            self.train_list = WordList([
                elem for elem in self.train_list
                if elem.get_transcription() not in punctuation
            ])
            self.test_list = WordList([
                elem for elem in self.test_list
                if elem.get_transcription() not in punctuation
            ])

        # compute string embeddings

        # determine the unigram alphabet: either the fixed a-z0-9 set or
        # all unigrams occurring in the transcriptions
        if embedding_config['unigrams'] == 'alnum36':
            self.unigrams = [
                chr(i) for i in np.hstack([
                    np.arange(ord('a'),
                              ord('z') + 1),
                    np.arange(ord('0'),
                              ord('9') + 1)
                ])
            ]
        elif embedding_config['unigrams'] == 'all':
            self.unigrams = get_unigrams_from_strings([
                word.get_transcription()
                for word in self.train_list + self.test_list
            ])
        else:
            raise ValueError('Unknown unigram type')

        if embedding_config['type'] == 'phoc':
            self.train_embeddings = build_phoc_descriptor(
                words=[word.get_transcription() for word in self.train_list],
                phoc_unigrams=self.unigrams,
                unigram_levels=embedding_config['levels'])
            self.test_embeddings = build_phoc_descriptor(
                words=[word.get_transcription() for word in self.test_list],
                phoc_unigrams=self.unigrams,
                unigram_levels=embedding_config['levels'])
        else:
            raise ValueError('Unknown embedding type')

        # compute a mapping from class string to class id
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit([
            word.get_transcription()
            for word in self.train_list + self.test_list
        ])