def remove_invalid_files(apps, schema_editor):
    """Migration helper: delete obsolete per-page files from every available book.

    Iterates all pages of all books and removes each of a fixed list of
    legacy image/annotation files if it exists on disk.
    """
    # FIX: the list is loop-invariant — build it once instead of once per page
    # (the original also rebound the loop variable `f` to the path, which was
    # confusing; use distinct names here).
    obsolete_files = [
        'annotation.json',
        'binary_cropped.png',
        'binary_cropped_preview.jpg',
        'binary_deskewed.png',
        'binary_deskewed_preview.jpg',
        'binary_original.png',
        'binary_original_preview.jpg',
        'color_cropped.jpg',
        'color_cropped_preview.jpg',
        'color_deskewed.jpg',
        'color_deskewed_preview.jpg',
        'gray_cropped.jpg',
        'gray_cropped_preview.jpg',
        'gray_deskewed.jpg',
        'gray_deskewed_preview.jpg',
        'gray_original.jpg',
        'gray_original_preview.jpg',
        'connected_components_deskewed.pkl',
    ]
    books = DatabaseBook.list_available()
    for book in tqdm(books, "Removing old files"):
        for page in book.pages():
            for name in obsolete_files:
                path = page.local_file_path(name)
                if os.path.exists(path):
                    os.remove(path)
def dataset_by_locked_pages(
        n_train, locks: List[LockState], shuffle: bool = True,
        datasets: List[DatabaseBook] = None
) -> Tuple[List[PcGts], List[PcGts]]:
    """Collect PcGts ground-truth files from locked pages and split train/val.

    Raises EmptyDataSetException when no files are found, or when a
    fractional `n_train` yields an empty train or validation partition.
    Raises ValueError if a requested dataset does not exist on disk.
    """
    logger.info("Finding PcGts files with valid ground truth")
    sources = datasets if datasets else DatabaseBook.list_available()
    files = []
    for source in sources:
        logger.debug("Listing files of dataset '{}'".format(source.book))
        if not source.exists():
            raise ValueError("Dataset '{}' does not exist at '{}'".format(
                source.book, source.local_path()))
        # Load every page whose lock state matches the requested locks.
        files.extend(PcGts.from_file(p.file('pcgts'))
                     for p in source.pages_with_lock(locks))
    if not files:
        raise EmptyDataSetException()
    if shuffle:
        random.shuffle(files)
    split = int(len(files) * n_train)
    train, val = files[:split], files[split:]
    # A fractional split must leave at least one file on each side.
    if 0 < n_train < 1 and (not train or not val):
        raise EmptyDataSetException()
    return train, val
def remove_word_and_neume_connector_layer(apps, schema_editor):
    """Migration: flatten the word layer into per-line syllables and rewrite
    connector references (`refID` -> `syllableID`/`neumeID`) in all pcgts files.

    Raises ValueError for files with more than one neume connector per
    syllable connector, which must be converted manually.
    """
    books = DatabaseBook.list_available()
    for book in books:
        for page in book.pages():
            pcgts_file = page.file('pcgts')
            try:
                if not pcgts_file.exists():
                    continue
                with open(pcgts_file.local_path(), 'r') as f:
                    pcgts = json.load(f)
                # FIX: the original rebound `page` here, shadowing the loop
                # variable from `book.pages()` — use a distinct name.
                page_data = pcgts['page']
                if not page_data:
                    continue
                # Merge each word's syllables directly into its text line.
                for text_region in page_data.get('textRegions', []):
                    for text_line in text_region.get('textLines', []):
                        words = text_line.get('words', [])
                        text_line['syllables'] = text_line.get('syllables', [])
                        if not words:
                            continue
                        for word in words:
                            text_line['syllables'] += word.get('syllables', [])
                # Rewrite annotation connectors to direct ID references.
                annotations = page_data.get('annotations', {})
                for connection in annotations.get('connections', []):
                    for syllable_connector in connection.get(
                            'syllableConnectors', []):
                        if 'refID' in syllable_connector:
                            syllable_connector[
                                'syllableID'] = syllable_connector['refID']
                        neume_connectors = syllable_connector.get(
                            'neumeConnectors', [])
                        if len(neume_connectors) == 0:
                            continue
                        elif len(neume_connectors) == 1:
                            syllable_connector['neumeID'] = neume_connectors[
                                0]['refID']
                        else:
                            raise ValueError(
                                "Cannot convert {}. Neume connector has {} neume connectors. "
                                "You need to manually convert this file. "
                                "".format(pcgts_file.local_path(),
                                          len(neume_connectors)))
                with open(pcgts_file.local_path(), 'w') as f:
                    json.dump(pcgts, f)
            except Exception:
                logger.error(
                    "Exception occurred during processing of page {}".format(
                        pcgts_file.local_path()))
                # FIX: bare `raise` preserves the original traceback exactly
                # (the original `raise e` re-raised from the except binding).
                raise
def to_train_val(
        self, locks: List[LockState], shuffle: bool = True,
        books: List[DatabaseBook] = None
) -> Tuple[List[PcGts], List[PcGts]]:
    """Produce the train/validation PcGts split for this configuration.

    When `includeAllTrainingData` is set, every available book is used
    regardless of the `books` argument.
    """
    selected = DatabaseBook.list_available() if self.includeAllTrainingData else books
    return dataset_by_locked_pages(self.nTrain, locks, shuffle, selected)
def pcgts_to_relative_coords(apps, schema_editor):
    """Migration: convert absolute pixel coordinates in every pcgts file to
    coordinates relative to the page's original color image size."""
    books = DatabaseBook.list_available()
    for book in tqdm(books, "Converting to relative coords"):
        for page in book.pages():
            pcgts_file = page.file('pcgts')
            # FIX: skip pages without a pcgts file BEFORE opening the image;
            # the original opened the color image first, doing wasted work
            # (and potentially failing) for pages that are skipped anyway.
            if not pcgts_file.exists():
                continue
            # FIX: close the image handle promptly via context manager
            # instead of leaking the open file.
            with Image.open(page.file('color_original').local_path()) as img:
                size = img.size
            with open(pcgts_file.local_path()) as f:
                j = json.load(f)
            was_local = to_relative_coords(j, size)
            if not was_local:
                with open(pcgts_file.local_path(), 'w') as f:
                    json.dump(j, f)
def pcgts_update_version(apps, schema_editor):
    """Migration: upgrade every existing pcgts file to schema version 1,
    rewriting it in place only when an upgrade actually occurred."""
    version = 1
    all_books = DatabaseBook.list_available()
    for book in tqdm(all_books, "Converting to pcgts version {}".format(version)):
        for page in book.pages():
            pcgts_file = page.file('pcgts')
            if not pcgts_file.exists():
                continue
            path = pcgts_file.local_path()
            with open(path) as fp:
                data = json.load(fp)
            # Only touch the file when update_pcgts reports a change.
            if update_pcgts(data, target_version=version):
                with open(path, 'w') as fp:
                    json.dump(data, fp, indent=2)
def fix_dataset_params(apps, schema_editor):
    """Migration: run fix_file on every model's dataset_params.json, both for
    book-local models and for the default models in internal storage."""
    # Book-local models.
    for book in DatabaseBook.list_available():
        if not os.path.exists(book.local_models_path()):
            continue
        for alg in os.listdir(book.local_models_path()):
            # FIX: the original wrapped this in a no-op single-argument
            # os.path.join(); local_models_path(alg) already yields the path.
            alg_dir = book.local_models_path(alg)
            for model in os.listdir(alg_dir):
                fix_file(os.path.join(alg_dir, model, 'dataset_params.json'))
    # Default models shipped in internal storage.
    default_models = os.path.join(BASE_DIR, 'internal_storage', 'default_models')
    if os.path.exists(default_models):
        for t in os.listdir(default_models):
            t_dir = os.path.join(default_models, t)
            for alg in os.listdir(t_dir):
                fix_file(os.path.join(t_dir, alg, 'dataset_params.json'))
def extract_from_pcgts(pcgts: PcGts):
    """Record the normalized hyphenation of every word on the page into the
    module-level `dictionary`, keyed by the word without hyphens."""
    for line in pcgts.page.all_text_lines():
        for token in line.sentence.text().split():
            hyphenated = normalize(token)
            dictionary[hyphenated.replace("-", "")] = hyphenated


def extract_from_book(book: DatabaseBook):
    """Extract hyphenation entries from every page of a single book."""
    for page in tqdm(book.pages(), desc="Processing {}".format(book.book)):
        extract_from_pcgts(page.pcgts())


# Select the books to process: explicit list from the CLI, or all available.
books = ([DatabaseBook(b) for b in args.books]
         if args.books is not None else DatabaseBook.list_available())
print("Processing {} books".format(len(books)))
for book in books:
    print("Processing book {}".format(book.book))
    extract_from_book(book)

print("Extracted {} words".format(len(dictionary)))
# Write one "plain-word  hyphenated-word" pair per line.
with open(args.output, 'w') as f:
    for word, hyphen in dictionary.items():
        f.write("{:20s} {}\n".format(word, hyphen))