def lexicon(self):
    '''
    Returns the closed lexicon (train + test).

    Builds PHOC embeddings for every unique word string known to the
    label encoder. Words whose PHOC embedding is the all-zero vector
    (none of their characters occur in the unigram set) are dropped,
    matching the filtering performed by the other lexicon/query methods
    in this file, so downstream similarity computations never see a
    zero vector.

    @return: tuple (unique_word_strings, unique_word_embeddings, class_ids)
    '''
    unique_word_strings = np.unique(self.label_encoder.classes_)
    unique_word_embeddings = build_phoc_descriptor(
        words=unique_word_strings,
        phoc_unigrams=self.unigrams,
        unigram_levels=self.embedding_config['levels'])
    # make sure all strings result in a valid phoc encoding (no zero strings)
    non_zero_embeddings = np.where(np.sum(unique_word_embeddings, axis=1) > 0)
    unique_word_strings = unique_word_strings[non_zero_embeddings]
    unique_word_embeddings = unique_word_embeddings[non_zero_embeddings]
    class_ids = self.label_encoder.transform(unique_word_strings)
    return unique_word_strings, unique_word_embeddings, class_ids
def get_qry_strings(self):
    '''
    Returns query strings, PHOC embeddings and class ids for
    query-by-string (QbS) word spotting.

    The queries are read from the predefined
    Botany_Test_QryStrings.lst file and lower-cased before embedding.

    @return: tuple (qry_strings, qry_string_embeddings, qry_string_ids)
    '''
    # load predefined queries; os.path.join is robust against a missing
    # trailing separator on bot_root_dir (plain '+' concatenation is not)
    qry_string_path = os.path.join(self.bot_root_dir,
                                   'Botany_Test_QryStrings.lst')
    qry_string_lines = LineListIO.read_list(qry_string_path)
    qry_strings = [qry.lower() for qry in qry_string_lines]
    qry_string_ids = self.label_encoder.transform(qry_strings)
    qry_string_embeddings = build_phoc_descriptor(
        words=qry_strings,
        phoc_unigrams=self.unigrams,
        unigram_levels=self.embedding_config['levels'])
    return qry_strings, qry_string_embeddings, qry_string_ids
def get_qry_strings(self):
    '''
    Returns query strings, PHOC embeddings and class ids for
    query-by-string (QbS) word spotting.

    Queries are the unique transcriptions of the test split; strings
    whose PHOC embedding is the zero vector are discarded.

    @return: tuple (qry_strings, qry_string_embeddings, qry_string_ids)
    '''
    transcriptions = [word.get_transcription() for word in self.test_list]
    qry_strings = np.unique(transcriptions)
    qry_string_embeddings = build_phoc_descriptor(
        words=qry_strings,
        phoc_unigrams=self.unigrams,
        unigram_levels=self.embedding_config['levels'])
    # keep only strings with a valid (non-zero) phoc encoding
    valid_rows = np.where(qry_string_embeddings.sum(axis=1) > 0)
    qry_strings = qry_strings[valid_rows]
    qry_string_embeddings = qry_string_embeddings[valid_rows]
    qry_string_ids = self.label_encoder.transform(qry_strings)
    return qry_strings, qry_string_embeddings, qry_string_ids
def lexicon(self):
    '''
    Returns the closed lexicon (train + test).

    Embeds every unique word string known to the label encoder with a
    PHOC descriptor and drops the entries whose embedding is the zero
    vector (no character of the word is in the unigram set).

    @return: tuple (unique_word_strings, unique_word_embeddings, class_ids)
    '''
    lexicon_strings = np.unique(self.label_encoder.classes_)
    lexicon_embeddings = build_phoc_descriptor(
        words=lexicon_strings,
        phoc_unigrams=self.unigrams,
        unigram_levels=self.embedding_config['levels'])
    # keep only strings with a valid (non-zero) phoc encoding
    valid_rows = np.where(lexicon_embeddings.sum(axis=1) > 0)
    lexicon_strings = lexicon_strings[valid_rows]
    lexicon_embeddings = lexicon_embeddings[valid_rows]
    class_ids = self.label_encoder.transform(lexicon_strings)
    return lexicon_strings, lexicon_embeddings, class_ids
def get_qry_strings(self):
    '''
    Returns query strings, PHOC embeddings and class ids for
    query-by-string (QbS) word spotting.

    Queries are the unique test-split transcriptions with the official
    IAM stop words (swIAM.txt) removed; strings whose PHOC embedding is
    the zero vector are discarded.

    @return: tuple (qry_strings, qry_string_embeddings, qry_string_ids)
    '''
    qry_strings = np.unique(
        [word.get_transcription() for word in self.test_list])
    # load the comma-separated stop word list; strip each entry so a
    # trailing newline in the file cannot leave an unmatchable stop word
    # (e.g. 'the\n'). A set gives O(1) membership tests below.
    with open(os.path.join(self.iam_root_dir, 'swIAM.txt')) as sw_file:
        stop_words = {sw.strip() for sw in sw_file.read().split(',')}
    qry_strings = np.array(
        [word for word in qry_strings if word not in stop_words])
    qry_string_embeddings = build_phoc_descriptor(
        words=qry_strings,
        phoc_unigrams=self.unigrams,
        unigram_levels=self.embedding_config['levels'])
    # make sure all strings result in a valid phoc encoding (no zero strings)
    non_zero_embeddings = np.where(np.sum(qry_string_embeddings, axis=1) > 0)
    qry_strings = qry_strings[non_zero_embeddings]
    qry_string_embeddings = qry_string_embeddings[non_zero_embeddings]
    qry_string_ids = self.label_encoder.transform(qry_strings)
    return qry_strings, qry_string_embeddings, qry_string_ids
def __init__(self,
             rimes_root_dir,
             embedding_config=None,
             ignore_diacrits=False,
             fixed_image_size=None,
             min_image_width_height=30):
    '''
    Constructor

    @param rimes_root_dir: full path to the Rimes root dir
    @param embedding_config: configuration of embedding (only phoc available)
        -unigrams: alnum36 or all
        -type: phoc
        -levels: embedding levels
        Defaults to {'unigrams': 'all', 'type': 'phoc', 'levels': (1, 2, 4, 8)}.
    @param ignore_diacrits: diacrits are mapped to ascii chars
    @param fixed_image_size: resize images to a fixed size
    @param min_image_width_height: the minimum height or width a word image
        has to have
    '''
    # None sentinel instead of a mutable default argument: the former
    # dict default would be a single shared object across all calls
    if embedding_config is None:
        embedding_config = {'unigrams': 'all',
                            'type': 'phoc',
                            'levels': (1, 2, 4, 8)}
    # sanity checks
    if embedding_config['unigrams'] not in ['alnum36', 'all']:
        raise ValueError('Unknown unigram definition')
    if embedding_config['type'] not in ['phoc']:
        raise ValueError('embedding must be phoc')
    # class members
    self.embedding_config = embedding_config
    self.fixed_image_size = fixed_image_size
    self.min_image_width_height = min_image_width_height
    # load the dataset
    self.train_list, self.test_list = DatasetLoader.load_rimes(
        path=rimes_root_dir, ignore_diacrits=ignore_diacrits)
    # compute string embeddings
    # extract unigrams from train split
    if embedding_config['unigrams'] == 'alnum36':
        # lower-case letters a-z followed by digits 0-9
        self.unigrams = [
            chr(i) for i in np.hstack([np.arange(ord('a'), ord('z') + 1),
                                       np.arange(ord('0'), ord('9') + 1)])]
    elif embedding_config['unigrams'] == 'all':
        self.unigrams = get_unigrams_from_strings(
            [word.get_transcription()
             for word in self.train_list + self.test_list])
    else:
        raise ValueError('Unknown unigram type')
    if embedding_config['type'] == 'phoc':
        self.train_embeddings = build_phoc_descriptor(
            words=[word.get_transcription() for word in self.train_list],
            phoc_unigrams=self.unigrams,
            unigram_levels=embedding_config['levels'])
        self.test_embeddings = build_phoc_descriptor(
            words=[word.get_transcription() for word in self.test_list],
            phoc_unigrams=self.unigrams,
            unigram_levels=embedding_config['levels'])
    else:
        raise ValueError('Unknown embedding type')
    # compute a mapping from class string to class id
    self.label_encoder = LabelEncoder()
    self.label_encoder.fit(
        [word.get_transcription()
         for word in self.train_list + self.test_list])
def __init__(self,
             gw_root_dir,
             split_idx=1,
             embedding_config=None,
             fixed_image_size=None,
             min_image_width_height=30):
    '''
    Constructor

    @param gw_root_dir: full path to the GW root dir
    @param split_idx: the index of the CV split to be used
    @param embedding_config: configuration of embedding (only phoc available)
        -unigrams: alnum36 or all
        -type: phoc
        -levels: embedding levels
        Defaults to {'unigrams': 'alnum36', 'type': 'phoc',
        'levels': (1, 2, 4, 8)}.
    @param fixed_image_size: resize images to a fixed size
    @param min_image_width_height: the minimum height or width a word image
        has to have
    '''
    # None sentinel instead of a mutable default argument: the former
    # dict default would be a single shared object across all calls
    if embedding_config is None:
        embedding_config = {'unigrams': 'alnum36',
                            'type': 'phoc',
                            'levels': (1, 2, 4, 8)}
    # sanity checks
    if embedding_config['unigrams'] not in ['alnum36', 'all']:
        raise ValueError('Unknown unigram definition')
    if embedding_config['type'] not in ['phoc']:
        raise ValueError('embedding must be phoc')
    if split_idx is None:
        raise ValueError('You need to choose a cv split')
    # class members
    self.embedding_config = embedding_config
    self.fixed_image_size = fixed_image_size
    self.min_image_width_height = min_image_width_height
    # load split ids for cross validation
    split_ids = np.load(os.path.join(gw_root_dir, 'almazan_cv_indices.npy'))
    # load the dataset and partition it into train/test by split id
    word_list = DatasetLoader.load_gw(path=gw_root_dir)
    self.train_list = [word for word, split_id in zip(word_list, split_ids)
                       if split_id != split_idx]
    self.test_list = [word for word, split_id in zip(word_list, split_ids)
                      if split_id == split_idx]
    # compute string embeddings
    # extract unigrams from train split
    if embedding_config['unigrams'] == 'alnum36':
        # lower-case letters a-z followed by digits 0-9
        self.unigrams = [
            chr(i) for i in np.hstack([np.arange(ord('a'), ord('z') + 1),
                                       np.arange(ord('0'), ord('9') + 1)])]
    elif embedding_config['unigrams'] == 'all':
        self.unigrams = get_unigrams_from_strings(
            [word.get_transcription()
             for word in self.train_list + self.test_list])
    else:
        raise ValueError('Unknown unigram type')
    if embedding_config['type'] == 'phoc':
        self.train_embeddings = build_phoc_descriptor(
            words=[word.get_transcription() for word in self.train_list],
            phoc_unigrams=self.unigrams,
            unigram_levels=embedding_config['levels'])
        self.test_embeddings = build_phoc_descriptor(
            words=[word.get_transcription() for word in self.test_list],
            phoc_unigrams=self.unigrams,
            unigram_levels=embedding_config['levels'])
    else:
        raise ValueError('Unknown embedding type')
    # compute a mapping from class string to class id
    self.label_encoder = LabelEncoder()
    self.label_encoder.fit(
        [word.get_transcription()
         for word in self.train_list + self.test_list])
def __init__(self,
             iam_root_dir,
             embedding_config=None,
             fixed_image_size=None,
             min_image_width_height=30,
             remove_punctuation=True):
    '''
    Constructor

    @param iam_root_dir: full path to the IAM root dir
    @param embedding_config: configuration of embedding (only phoc available)
        -unigrams: alnum36 or all
        -type: phoc
        -levels: embedding levels
        Defaults to {'unigrams': 'alnum36', 'type': 'phoc',
        'levels': (1, 2, 4, 8)}.
    @param fixed_image_size: resize images to a fixed size
    @param min_image_width_height: the minimum height or width a word image
        has to have
    @param remove_punctuation: remove punctuation elements in train and
        test list
    '''
    # None sentinel instead of a mutable default argument: the former
    # dict default would be a single shared object across all calls
    if embedding_config is None:
        embedding_config = {'unigrams': 'alnum36',
                            'type': 'phoc',
                            'levels': (1, 2, 4, 8)}
    # sanity checks
    if embedding_config['unigrams'] not in ['alnum36', 'all']:
        raise ValueError('Unknown unigram definition')
    if embedding_config['type'] not in ['phoc']:
        raise ValueError('embedding must be phoc')
    # class members
    self.embedding_config = embedding_config
    self.fixed_image_size = fixed_image_size
    self.min_image_width_height = min_image_width_height
    self.iam_root_dir = iam_root_dir
    # load the dataset
    self.train_list, self.test_list = DatasetLoader.load_iam(
        path=iam_root_dir)
    if remove_punctuation:
        # drop single-punctuation-character tokens from both splits
        punctuation = string.punctuation
        self.train_list = WordList(
            [elem for elem in self.train_list
             if elem.get_transcription() not in punctuation])
        self.test_list = WordList(
            [elem for elem in self.test_list
             if elem.get_transcription() not in punctuation])
    # compute string embeddings
    # extract unigrams from train split
    if embedding_config['unigrams'] == 'alnum36':
        # lower-case letters a-z followed by digits 0-9
        self.unigrams = [
            chr(i) for i in np.hstack([np.arange(ord('a'), ord('z') + 1),
                                       np.arange(ord('0'), ord('9') + 1)])]
    elif embedding_config['unigrams'] == 'all':
        self.unigrams = get_unigrams_from_strings(
            [word.get_transcription()
             for word in self.train_list + self.test_list])
    else:
        raise ValueError('Unknown unigram type')
    if embedding_config['type'] == 'phoc':
        self.train_embeddings = build_phoc_descriptor(
            words=[word.get_transcription() for word in self.train_list],
            phoc_unigrams=self.unigrams,
            unigram_levels=embedding_config['levels'])
        self.test_embeddings = build_phoc_descriptor(
            words=[word.get_transcription() for word in self.test_list],
            phoc_unigrams=self.unigrams,
            unigram_levels=embedding_config['levels'])
    else:
        raise ValueError('Unknown embedding type')
    # compute a mapping from class string to class id
    self.label_encoder = LabelEncoder()
    self.label_encoder.fit(
        [word.get_transcription()
         for word in self.train_list + self.test_list])