# Imports assumed by the snippets below. LabelEncoder matches the
# scikit-learn API used here; the remaining helpers
# (build_phoc_descriptor, get_unigrams_from_strings, DatasetLoader,
# WordList, LineListIO) belong to this code base and their import paths
# are not shown in the original excerpts.
import os
import string

import numpy as np
from sklearn.preprocessing import LabelEncoder

    def lexicon(self):
        '''
        returns the closed lexicon (train + test)
        '''

        unique_word_strings = np.unique(self.label_encoder.classes_)

        unique_word_embeddings = build_phoc_descriptor(words=unique_word_strings,
                                                       phoc_unigrams=self.unigrams,
                                                       unigram_levels=self.embedding_config['levels'])

        class_ids = self.label_encoder.transform(unique_word_strings)

        return unique_word_strings, unique_word_embeddings, class_ids
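# A minimal sketch of how a PHOC (pyramidal histogram of characters)
# descriptor is typically assembled, to illustrate what
# build_phoc_descriptor returns above. This is a simplified stand-in,
# not the project's implementation; the real region-assignment rule may
# differ in detail.
import numpy as np

def simple_phoc(word, unigrams, levels=(1, 2, 4, 8)):
    char_idx = {c: i for i, c in enumerate(unigrams)}
    phoc = np.zeros(sum(levels) * len(unigrams))
    n = len(word)
    offset = 0
    for level in levels:
        for k, char in enumerate(word):
            if char not in char_idx:
                continue  # characters outside the unigram set are skipped
            occ_lo, occ_hi = k / n, (k + 1) / n  # normalized char occupancy
            for region in range(level):
                reg_lo, reg_hi = region / level, (region + 1) / level
                overlap = min(occ_hi, reg_hi) - max(occ_lo, reg_lo)
                # a character belongs to a region if at least half of its
                # occupancy interval falls into that region
                if overlap / (occ_hi - occ_lo) >= 0.5:
                    phoc[offset + region * len(unigrams) + char_idx[char]] = 1
        offset += level * len(unigrams)
    return phoc

# e.g. simple_phoc('ab', ['a', 'b'], levels=(1, 2)) -> [1, 1, 1, 0, 0, 1]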
    def get_qry_strings(self):
        '''
        returns query_strings, embeddings and ids for QbS (query-by-string) word spotting
        '''

        # load predefined queries
        qry_string_path = os.path.join(self.bot_root_dir, 'Botany_Test_QryStrings.lst')
        qry_string_lines = LineListIO.read_list(qry_string_path)
        qry_strings = [q.lower() for q in qry_string_lines]

        qry_string_ids = self.label_encoder.transform(qry_strings)

        qry_string_embeddings = build_phoc_descriptor(
            words=qry_strings,
            phoc_unigrams=self.unigrams,
            unigram_levels=self.embedding_config['levels'])

        return qry_strings, qry_string_embeddings, qry_string_ids
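# Hedged usage sketch: ranking test images for each query string returned
# above. The arrays test_outputs (PHOC estimates predicted for the test
# images) and test_class_ids are hypothetical placeholders for a trained
# model's output; only the cosine-ranking logic is illustrated.
import numpy as np
from scipy.spatial.distance import cdist

def rank_qbs(qry_string_embeddings, qry_string_ids, test_outputs, test_class_ids):
    # cosine distance between every query PHOC and every image PHOC estimate
    dists = cdist(qry_string_embeddings, test_outputs, metric='cosine')
    rankings = np.argsort(dists, axis=1)  # most similar image first
    # an image is relevant to a query if their class ids match
    relevance = np.asarray(test_class_ids)[rankings] == np.asarray(qry_string_ids)[:, None]
    return rankings, relevance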
    def get_qry_strings(self):
        '''
        returns query_strings, embeddings and ids for QbS (query-by-string) word spotting
        '''
        qry_strings = np.unique([word.get_transcription() for word in self.test_list])

        qry_string_embeddings = build_phoc_descriptor(words=qry_strings,
                                                      phoc_unigrams=self.unigrams,
                                                      unigram_levels=self.embedding_config['levels'])

        # keep only strings that yield a valid (non-zero) PHOC encoding
        non_zero_embeddings = np.where(np.sum(qry_string_embeddings, axis=1) > 0)
        qry_strings = qry_strings[non_zero_embeddings]
        qry_string_embeddings = qry_string_embeddings[non_zero_embeddings]


        qry_string_ids = self.label_encoder.transform(qry_strings)

        return qry_strings, qry_string_embeddings, qry_string_ids
    def lexicon(self):
        '''
        returns the closed lexicon (train + test)
        '''

        unique_word_strings = np.unique(self.label_encoder.classes_)

        unique_word_embeddings = build_phoc_descriptor(words=unique_word_strings,
                                                       phoc_unigrams=self.unigrams,
                                                       unigram_levels=self.embedding_config['levels'])


        # keep only strings that yield a valid (non-zero) PHOC encoding
        non_zero_embeddings = np.where(np.sum(unique_word_embeddings, axis=1) > 0)
        unique_word_strings = unique_word_strings[non_zero_embeddings]
        unique_word_embeddings = unique_word_embeddings[non_zero_embeddings]


        class_ids = self.label_encoder.transform(unique_word_strings)

        return unique_word_strings, unique_word_embeddings, class_ids
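# Why the zero-filtering above matters: a transcription whose characters
# all fall outside the unigram set (e.g. '1742' under purely alphabetic
# unigrams) yields an all-zero PHOC, and cosine distances against a zero
# vector are undefined. A quick check using the simple_phoc sketch above:
#
#   phoc = simple_phoc('1742', unigrams=list('abcdefghijklmnopqrstuvwxyz'))
#   assert phoc.sum() == 0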
    def get_qry_strings(self):
        '''
        returns query_strings, embeddings and ids for QbS (query-by-string) word spotting
        '''
        qry_strings = np.unique([word.get_transcription() for word in self.test_list])

        with open(os.path.join(self.iam_root_dir, 'swIAM.txt')) as sw_file:
            stop_words = sw_file.read().split(',')

        qry_strings = np.array([word for word in qry_strings if word not in stop_words])

        qry_string_embeddings = build_phoc_descriptor(words=qry_strings,
                                                      phoc_unigrams=self.unigrams,
                                                      unigram_levels=self.embedding_config['levels'])


        # keep only strings that yield a valid (non-zero) PHOC encoding
        non_zero_embeddings = np.where(np.sum(qry_string_embeddings, axis=1) > 0)
        qry_strings = qry_strings[non_zero_embeddings]
        qry_string_embeddings = qry_string_embeddings[non_zero_embeddings]

        qry_string_ids = self.label_encoder.transform(qry_strings)
        return qry_strings, qry_string_embeddings, qry_string_ids
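# The stop-word filtering above assumes swIAM.txt stores the stop words
# comma-separated on a single line, e.g. 'a,an,and,the,...'. Note that
# str.split(',') does not strip whitespace, so a trailing newline in the
# file would stick to the last token; a defensive variant is:
#
#   stop_words = [w.strip() for w in sw_file.read().split(',')]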
    def __init__(self,
                 rimes_root_dir,
                 embedding_config={
                     'unigrams': 'all',
                     'type': 'phoc',
                     'levels': (1, 2, 4, 8)
                 },
                 ignore_diacrits=False,
                 fixed_image_size=None,
                 min_image_width_height=30):
        '''
        Constructor

        @param rimes_root_dir: full path to the Rimes root dir
        @param embedding_config: configuration of embedding (only phoc available)
                                 -unigrams: alnum36 or all
                                 -type: phoc
                                 -levels: embedding levels 
        @param ignore_diacrits: if True, diacritics are mapped to ASCII characters
        @param fixed_image_size: resize images to a fixed size
        @param min_image_width_height: the minimum height or width a word image
                                       has to have
        '''

        # sanity checks
        if embedding_config['unigrams'] not in ['alnum36', 'all']:
            raise ValueError('Unknown unigram definition')
        if embedding_config['type'] not in ['phoc']:
            raise ValueError('embedding must be phoc')

        # class members
        self.embedding_config = embedding_config
        self.fixed_image_size = fixed_image_size
        self.min_image_width_height = min_image_width_height

        # load the dataset
        self.train_list, self.test_list = DatasetLoader.load_rimes(
            path=rimes_root_dir, ignore_diacrits=ignore_diacrits)

        # compute string embeddings

        # extract unigrams from train split
        if embedding_config['unigrams'] == 'alnum36':
            self.unigrams = [
                chr(i) for i in np.hstack([
                    np.arange(ord('a'),
                              ord('z') + 1),
                    np.arange(ord('0'),
                              ord('9') + 1)
                ])
            ]
        elif embedding_config['unigrams'] == 'all':
            self.unigrams = get_unigrams_from_strings([
                word.get_transcription()
                for word in self.train_list + self.test_list
            ])
        else:
            raise ValueError('Unknown unigram type')

        if embedding_config['type'] == 'phoc':
            self.train_embeddings = build_phoc_descriptor(
                words=[word.get_transcription() for word in self.train_list],
                phoc_unigrams=self.unigrams,
                unigram_levels=embedding_config['levels'])
            self.test_embeddings = build_phoc_descriptor(
                words=[word.get_transcription() for word in self.test_list],
                phoc_unigrams=self.unigrams,
                unigram_levels=embedding_config['levels'])
        else:
            raise ValueError('Unknown embedding type')

        # compute a mapping from class string to class id
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit([
            word.get_transcription()
            for word in self.train_list + self.test_list
        ])
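# Hypothetical usage of the Rimes dataset class this constructor belongs
# to (the class name RimesDataset and the root path are assumptions):
#
#   dataset = RimesDataset(rimes_root_dir='/path/to/rimes', ignore_diacrits=True)
#   words, embeddings, class_ids = dataset.lexicon()
#   # PHOC dimensionality: sum(levels) regions x number of unigrams
#   assert embeddings.shape[1] == sum((1, 2, 4, 8)) * len(dataset.unigrams)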
    def __init__(self,
                 gw_root_dir,
                 split_idx=1,
                 embedding_config={
                     'unigrams': 'alnum36',
                     'type': 'phoc',
                     'levels': (1, 2, 4, 8)
                 },
                 fixed_image_size=None,
                 min_image_width_height=30):
        '''
        Constructor

        @param gw_root_dir: full path to the GW root dir
        @param split_idx: the index of the CV split to be used
        @param embedding_config: configuration of embedding (only phoc available)
                                 -unigrams: alnum36 or all
                                 -type: phoc
                                 -levels: embedding levels
        @param fixed_image_size: resize images to a fixed size
        @param min_image_width_height: the minimum height or width a word image
                                       has to have
        '''

        # sanity checks
        if embedding_config['unigrams'] not in ['alnum36', 'all']:
            raise ValueError('Unknown unigram definition')
        if embedding_config['type'] not in ['phoc']:
            raise ValueError('embedding must be phoc')
        if split_idx is None:
            raise ValueError('You need to choose a cv split')

        # class members
        self.embedding_config = embedding_config
        self.fixed_image_size = fixed_image_size
        self.min_image_width_height = min_image_width_height

        # load split ids for cross validation
        split_ids = np.load(os.path.join(gw_root_dir,
                                         'almazan_cv_indices.npy'))

        # load the dataset
        word_list = DatasetLoader.load_gw(path=gw_root_dir)

        self.train_list = [
            word for word, split_id in zip(word_list, split_ids)
            if split_id != split_idx
        ]

        self.test_list = [
            word for word, split_id in zip(word_list, split_ids)
            if split_id == split_idx
        ]

        # compute string embeddings

        # extract unigrams from train split
        if embedding_config['unigrams'] == 'alnum36':
            self.unigrams = [
                chr(i) for i in np.hstack([
                    np.arange(ord('a'),
                              ord('z') + 1),
                    np.arange(ord('0'),
                              ord('9') + 1)
                ])
            ]
        elif embedding_config['unigrams'] == 'all':
            self.unigrams = get_unigrams_from_strings([
                word.get_transcription()
                for word in self.train_list + self.test_list
            ])
        else:
            raise ValueError('Unknown unigram type')

        if embedding_config['type'] == 'phoc':
            self.train_embeddings = build_phoc_descriptor(
                words=[word.get_transcription() for word in self.train_list],
                phoc_unigrams=self.unigrams,
                unigram_levels=embedding_config['levels'])
            self.test_embeddings = build_phoc_descriptor(
                words=[word.get_transcription() for word in self.test_list],
                phoc_unigrams=self.unigrams,
                unigram_levels=embedding_config['levels'])
        else:
            raise ValueError('Unknown embedding type')

        # compute a mapping from class string to class id
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit([
            word.get_transcription()
            for word in self.train_list + self.test_list
        ])
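# Hypothetical cross-validation loop over the Almazan splits (the class
# name GWDataset and a fold numbering of 1..4 are assumptions; the split
# ids actually stored in almazan_cv_indices.npy are authoritative):
#
#   for split_idx in range(1, 5):
#       dataset = GWDataset(gw_root_dir='/path/to/gw', split_idx=split_idx)
#       # train on dataset.train_list, evaluate on dataset.test_list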
    def __init__(self,
                 iam_root_dir,
                 embedding_config={'unigrams': 'alnum36', 'type': 'phoc', 'levels': (1, 2, 4, 8)},
                 fixed_image_size=None,
                 min_image_width_height=30,
                 remove_punctuation=True):
        '''
        Constructor

        @param iam_root_dir: full path to the IAM root dir
        @param embedding_config: configuration of embedding (only phoc available)
                                 -unigrams: alnum36 or all
                                 -type: phoc
                                 -levels: embedding levels
        @param fixed_image_size: resize images to a fixed size
        @param min_image_width_height: the minimum height or width a word image
                                       has to have
        @param remove_punctuation: remove punctuation elements in train and test list
        '''

        # sanity checks
        if embedding_config['unigrams'] not in ['alnum36', 'all']:
            raise ValueError('Unknown unigram definition')
        if embedding_config['type'] not in ['phoc']:
            raise ValueError('embedding must be phoc')

        # class members
        self.embedding_config = embedding_config
        self.fixed_image_size = fixed_image_size
        self.min_image_width_height = min_image_width_height
        self.iam_root_dir = iam_root_dir

        # load the dataset
        self.train_list, self.test_list = DatasetLoader.load_iam(path=iam_root_dir)

        if remove_punctuation:
            # drop words whose transcription is just a punctuation mark
            punctuation = string.punctuation
            self.train_list = WordList([elem for elem in self.train_list
                                        if elem.get_transcription() not in punctuation])
            self.test_list = WordList([elem for elem in self.test_list
                                       if elem.get_transcription() not in punctuation])


        # compute string embeddings

        # extract unigrams from train split
        if embedding_config['unigrams'] == 'alnum36':
            self.unigrams = [chr(i) for i in np.hstack([np.arange(ord('a'), ord('z') + 1),
                                                        np.arange(ord('0'), ord('9') + 1)])]
        elif embedding_config['unigrams'] == 'all':
            self.unigrams = get_unigrams_from_strings([word.get_transcription()
                                                       for word in self.train_list + self.test_list])
        else:
            raise ValueError('Unknown unigram type')


        if embedding_config['type'] == 'phoc':
            self.train_embeddings = build_phoc_descriptor(
                words=[word.get_transcription() for word in self.train_list],
                phoc_unigrams=self.unigrams,
                unigram_levels=embedding_config['levels'])
            self.test_embeddings = build_phoc_descriptor(
                words=[word.get_transcription() for word in self.test_list],
                phoc_unigrams=self.unigrams,
                unigram_levels=embedding_config['levels'])
        else:
            raise ValueError('Unknown embedding type')


        # compute a mapping from class string to class id
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit([word.get_transcription() for word in self.train_list + self.test_list])
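# Hypothetical usage of the IAM dataset class built above (the class name
# IAMDataset is an assumption). The fitted LabelEncoder maps between
# transcriptions and class ids in both directions:
#
#   dataset = IAMDataset(iam_root_dir='/path/to/iam')
#   ids = dataset.label_encoder.transform(['the'])
#   assert dataset.label_encoder.inverse_transform(ids)[0] == 'the'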