def test_gen_queryimages(self, tmpdir):
    """Build a ``NotesQuery`` over every note of a generated collection.

    Smoke test: there is no assertion, so it passes as long as creating
    the collection, listing the note ids and constructing the query do
    not raise. The resulting query object is printed for inspection.
    """
    col_dir = tmpdir.mkdir("collection")
    test_col = gen_test_collection(col_dir)
    ocr = OCR(col=test_col)
    # Ask for the id column explicitly instead of ``select *``: only the
    # first column of each row is used as the note id, so fetching the
    # remaining columns is wasted work and depends on column order.
    all_note_ids = ocr.col.db.list("select id from notes")
    q_images = NotesQuery(col=test_col, note_ids=all_note_ids)
    print(q_images)
def test_query_noteids(self, tmpdir):
    """Check that a ``NotesQuery`` restricted to two ids yields those notes.

    Asserts that exactly two notes come back and that each returned
    note's ``note_id`` is one of the requested ids.
    """
    collection = gen_test_collection(tmpdir.mkdir("collection"))
    # The OCR instance is constructed as in the sibling tests, although
    # nothing below reads from it.
    ocr = OCR(col=collection)
    wanted_ids = [1601851621708, 1601851571572]
    query = NotesQuery(col=collection, note_ids=wanted_ids)
    assert len(query.notes) == 2
    assert all(note.note_id in wanted_ids for note in query.notes)
def test_clean_ocr_text(self):
    """Compare ``OCR.clean_ocr_text`` output against a fixed expectation.

    The raw text contains runs of blank lines and runs of two to four
    colons; the expected text has single newlines and single colons.
    """
    raw_text = (
        "this is some text: with a result\n\n\nThis is some double colon :: with result"
        "\n\nwithout spaces::new word\none space:: new word\n\n\n\none space before ::new word\n"
        "triple ::: new word\n\n\n\n\nquadruple ::::newword"
    )
    wanted = (
        "this is some text: with a result\nThis is some double colon : with result\n"
        "without spaces:new word\none space: new word\none space before :new word\n"
        "triple : new word\nquadruple :newword"
    )
    assert OCR.clean_ocr_text(raw_text) == wanted
def test_add_ocr_field_then_remove_text_new_field(self, tmpdir):
    """Run OCR with ``text_output_location="new_field"`` and then undo it.

    Smoke test without assertions: it passes when both
    ``run_ocr_on_notes`` and ``remove_ocr_on_notes`` complete for the
    two given note ids.
    """
    collection = gen_test_collection(tmpdir.mkdir("collection"))
    engine = OCR(col=collection, text_output_location="new_field")
    target_ids = [1601851571572, 1601851621708]
    engine.run_ocr_on_notes(note_ids=target_ids)
    engine.remove_ocr_on_notes(note_ids=target_ids)
def test_unbatched_single_threaded(self):
    """Time the un-batched OCR path with one thread.

    Runs ``ocr._ocr_unbatched_process`` over ``self.IMG_PTHS`` through
    ``timeit``, reports the ``OMP_THREAD_LIMIT`` environment variable
    (or its absence) and returns the measured time.
    """
    console.print("Starting un-batched single threaded")
    engine = OCR(col=None, progress=None, languages=["eng"],
                 num_threads=1, use_batching=False)
    _, elapsed = timeit(engine._ocr_unbatched_process, self.IMG_PTHS)

    # Read the variable only after the run has finished, as before.
    thread_limit = os.environ.get("OMP_THREAD_LIMIT")
    if thread_limit is None:
        console.print("No thread limit found.")
    else:
        console.print(
            f"OMP_THREAD_LIMIT = {thread_limit}")
    return elapsed
def test_batched_multi_threaded(self):
    """Time the batched OCR path with four threads.

    Runs ``ocr._ocr_batch_process`` over ``self.batched_txts`` through
    ``timeit``, reports the ``OMP_THREAD_LIMIT`` environment variable
    (or its absence) and returns the measured time.
    """
    console.print("Starting batched multi threaded")
    engine = OCR(col=None, progress=None, languages=["eng"],
                 num_threads=4, use_batching=True)
    _, elapsed = timeit(engine._ocr_batch_process, self.batched_txts)

    # Read the variable only after the run has finished, as before.
    thread_limit = os.environ.get("OMP_THREAD_LIMIT")
    if thread_limit is None:
        console.print("No thread limit found.")
    else:
        console.print(
            f"OMP_THREAD_LIMIT = {thread_limit}")
    return elapsed
def test_run_ocr_on_notes_unbatched_multithreaded(self, tmpdir):
    """Run OCR on two notes with batching off and four threads.

    Smoke test without assertions: it passes when ``run_ocr_on_notes``
    completes without raising.
    """
    collection = gen_test_collection(tmpdir.mkdir("collection"))
    engine = OCR(col=collection, use_batching=False, num_threads=4)
    target_ids = [1601851571572, 1601851621708]
    engine.run_ocr_on_notes(note_ids=target_ids)
def test_run_ocr_on_collection(self, tmpdir):
    """Run OCR over every note of a freshly generated test collection.

    Smoke test without assertions: it passes when ``run_ocr_on_query``
    completes without raising.
    """
    col_dir = tmpdir.mkdir("collection")
    test_col = gen_test_collection(col_dir)
    ocr = OCR(col=test_col)
    # Ask for the id column explicitly instead of ``select *``: only the
    # first column of each row is used as the note id, so fetching the
    # remaining columns is wasted work and depends on column order.
    all_note_ids = ocr.col.db.list("select id from notes")
    ocr.run_ocr_on_query(note_ids=all_note_ids)
def test_ocr_img_without_lang(self, img_pth, expected):
    """OCR one image without passing ``languages`` and compare the text.

    The raw OCR output is stripped, passed through
    ``OCR.clean_ocr_text``, stripped again and compared with the
    stripped ``expected`` text.
    """
    raw_text = OCR._ocr_img(str(img_pth.absolute()), num_threads=1)
    cleaned = OCR.clean_ocr_text(raw_text.strip())
    assert cleaned.strip() == expected.strip()
class TestOCR:
    """Tests for ``OCR`` and ``NotesQuery`` against the bundled test data.

    The class body runs at import time: it collects the annotated images
    and their ``.txt`` transcriptions from ``TESTDATA_DIR/annotated_imgs``,
    checks that there are equally many of each, and points pytesseract at
    the executable reported by ``OCR.path_to_tesseract()``.
    """

    all_img_files = list(Path(TESTDATA_DIR, "annotated_imgs").glob("*"))
    # Images and annotations are each sorted and later paired positionally.
    # NOTE(review): this presumably relies on every image sharing its stem
    # with its .txt file -- confirm against the test data.
    img_pths = sorted([f for f in all_img_files if f.suffix in [".png", ".jpg", ".tiff", ".tif", ".jpeg"]])
    annot_pths = sorted([f for f in all_img_files if f.suffix == ".txt"])
    annot_txts = [f.read_text(encoding="utf-8") for f in annot_pths]
    assert len(img_pths) == len(annot_pths)

    tesseract_cmd = OCR.path_to_tesseract()
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    def test_collection_ok(self, tmpdir):
        """The generated test collection passes its own ``basicCheck``."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        assert test_col.basicCheck()

    @pytest.mark.parametrize(["img_pth", "expected"], list(zip(img_pths, annot_txts)))
    def test_ocr_img_with_lang(self, img_pth, expected):
        """OCR one image with ``languages=["eng"]`` and compare the text."""
        img = str(img_pth.absolute())
        ocr_result = OCR._ocr_img(img, num_threads=1, languages=["eng"])
        cleaned_result = OCR.clean_ocr_text(ocr_result).strip()
        expected = expected.strip()
        assert cleaned_result == expected

    @pytest.mark.parametrize(["img_pth", "expected"], list(zip(img_pths, annot_txts)))
    def test_ocr_img_without_lang(self, img_pth, expected):
        """OCR one image without passing ``languages`` and compare the text."""
        img = str(img_pth.absolute())
        ocr_result = OCR._ocr_img(img, num_threads=1).strip()
        cleaned_result = OCR.clean_ocr_text(ocr_result).strip()
        expected = expected.strip()
        assert cleaned_result == expected

    def test_gen_queryimages(self, tmpdir):
        """Build a ``NotesQuery`` over every note and print it (smoke test)."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col)
        # Query the id column explicitly: only the first column of each
        # row is used, so ``select *`` fetched the other columns for nothing.
        all_note_ids = ocr.col.db.list("select id from notes")
        q_images = NotesQuery(col=test_col, note_ids=all_note_ids)
        print(q_images)

    def test_query_noteids(self, tmpdir):
        """A ``NotesQuery`` limited to two ids returns exactly those notes."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col)
        note_ids = [1601851621708, 1601851571572]
        q_images = NotesQuery(col=test_col, note_ids=note_ids)
        assert len(q_images.notes) == 2
        for note in q_images.notes:
            assert note.note_id in note_ids

    def test_run_ocr_on_collection(self, tmpdir):
        """Run OCR over every note of the test collection (smoke test)."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col)
        # Query the id column explicitly: only the first column of each
        # row is used, so ``select *`` fetched the other columns for nothing.
        all_note_ids = ocr.col.db.list("select id from notes")
        ocr.run_ocr_on_query(note_ids=all_note_ids)

    def test_run_ocr_on_notes_batched_multithreaded(self, tmpdir):
        """Run OCR on two notes with batching on and four threads."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col, use_batching=True, num_threads=4)
        ocr.run_ocr_on_notes(note_ids=[1601851571572, 1601851621708])

    def test_run_ocr_on_notes_batched_single_threaded(self, tmpdir):
        """Run OCR on two notes with batching on and one thread."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col, use_batching=True, num_threads=1)
        ocr.run_ocr_on_notes(note_ids=[1601851571572, 1601851621708])

    def test_run_ocr_on_notes_unbatched_multithreaded(self, tmpdir):
        """Run OCR on two notes with batching off and four threads."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col, use_batching=False, num_threads=4)
        ocr.run_ocr_on_notes(note_ids=[1601851571572, 1601851621708])

    def test_run_ocr_on_notes_unbatched_singlethreaded(self, tmpdir):
        """Run OCR on two notes with batching off and one thread."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col, use_batching=False, num_threads=1)
        ocr.run_ocr_on_notes(note_ids=[1601851571572, 1601851621708])

    def test_add_ocr_field_then_remove_text_tooltip(self, tmpdir):
        """Add OCR text with the ``"tooltip"`` output location, then remove it."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col, text_output_location="tooltip")
        note_ids = [1601851571572, 1601851621708]
        ocr.run_ocr_on_notes(note_ids=note_ids)
        ocr.remove_ocr_on_notes(note_ids=note_ids)

    def test_add_ocr_field_then_remove_text_new_field(self, tmpdir):
        """Add OCR text with the ``"new_field"`` output location, then remove it."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col, text_output_location="new_field")
        note_ids = [1601851571572, 1601851621708]
        ocr.run_ocr_on_notes(note_ids=note_ids)
        ocr.remove_ocr_on_notes(note_ids=note_ids)

    def test_clean_ocr_text(self):
        """``OCR.clean_ocr_text`` output matches a fixed expected string.

        The input has runs of blank lines and of two to four colons; the
        expected text has single newlines and single colons.
        """
        input_str = (
            "this is some text: with a result\n\n\nThis is some double colon :: with result"
            "\n\nwithout spaces::new word\none space:: new word\n\n\n\none space before ::new word\n"
            "triple ::: new word\n\n\n\n\nquadruple ::::newword"
        )
        expected_output = (
            "this is some text: with a result\nThis is some double colon : with result\n"
            "without spaces:new word\none space: new word\none space before :new word\n"
            "triple : new word\nquadruple :newword"
        )
        output = OCR.clean_ocr_text(input_str)
        assert output == expected_output
import logging
from pathlib import Path

from anki import Collection

from anki_ocr.ocr import SCRIPT_DIR, OCR

# Stand-alone driver: opens the "User 1" test profile's collection and runs
# OCR over every note in it, writing results with the "new_field" output
# location.
if __name__ == '__main__':
    logging_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(format=logging_format, level=logging.INFO)

    # Not to be run inside Anki
    PROFILE_HOME = Path(SCRIPT_DIR.parent, "tests/User 1")
    cpath = PROFILE_HOME / "collection.anki2"

    collection = Collection(str(cpath), log=True)  # Collection is locked from here on

    ocr = OCR(col=collection, text_output_location="new_field")
    # Query the id column explicitly: only the first column of each row is
    # used as the note id, so ``select *`` fetched the other columns for
    # nothing and depended on column order.
    all_note_ids = ocr.col.db.list("select id from notes")
    ocr.run_ocr_on_query(note_ids=all_note_ids)
    # NOTE(review): the collection is never closed here; the two calls below
    # were already disabled in the original script -- confirm whether an
    # explicit close is wanted before re-enabling.
    # collection.close(save=True)
    # ocr.remove_ocr_on_notes(note_ids_c)
class TestPerformance:
    """Timing runs for the batched/un-batched and single/multi-threaded OCR paths.

    The class body runs at import time: it lists the ``*.png`` files in
    ``IMGS_DIR``, writes their paths to ``IMGS_DIR/imgs.txt`` and creates
    the batch files through ``gen_batched_txts``. Each test returns the
    time reported by ``timeit``.

    NOTE(review): pytest flags test functions that return a non-None
    value; the return values are kept because callers outside this view
    may use them -- confirm.
    """

    test_img_pths = list(Path(TESTDATA_DIR, "annotated_imgs").glob("*"))

    tesseract_cmd = OCR.path_to_tesseract()
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    IMG_PTHS = [png.absolute() for png in IMGS_DIR.glob("*.png")]
    NUM_IMGS = len(IMG_PTHS)
    TXT_PATH = Path(IMGS_DIR, "imgs.txt")
    # One image path per line.
    TXT_PATH.write_text("\n".join(map(str, IMG_PTHS)))

    BATCH_SIZE = 10
    console.log(f"BATCH_SIZE : {BATCH_SIZE}")
    console.log(f"Number of images = {NUM_IMGS}")
    batched_txts, batched_txts_dir = gen_batched_txts(img_pths=IMG_PTHS,
                                                      batch_size=BATCH_SIZE)
    console.log(
        f"Generated {len(batched_txts)} batches of max {BATCH_SIZE} images")

    @staticmethod
    def _report_thread_limit():
        """Print ``OMP_THREAD_LIMIT`` from the environment, or that it is unset."""
        thread_limit = os.environ.get("OMP_THREAD_LIMIT")
        if thread_limit is None:
            console.print("No thread limit found.")
        else:
            console.print(
                f"OMP_THREAD_LIMIT = {thread_limit}")

    def test_batched_single_threaded(self):
        """Time ``_ocr_batch_process`` over the batch files with one thread."""
        console.print("Starting batched single threaded")
        engine = OCR(col=None, progress=None, languages=["eng"],
                     num_threads=1, use_batching=True)
        _, elapsed = timeit(engine._ocr_batch_process, self.batched_txts)
        self._report_thread_limit()
        return elapsed

    def test_batched_multi_threaded(self):
        """Time ``_ocr_batch_process`` over the batch files with four threads."""
        console.print("Starting batched multi threaded")
        engine = OCR(col=None, progress=None, languages=["eng"],
                     num_threads=4, use_batching=True)
        _, elapsed = timeit(engine._ocr_batch_process, self.batched_txts)
        self._report_thread_limit()
        return elapsed

    def test_unbatched_single_threaded(self):
        """Time ``_ocr_unbatched_process`` over the images with one thread."""
        console.print("Starting un-batched single threaded")
        engine = OCR(col=None, progress=None, languages=["eng"],
                     num_threads=1, use_batching=False)
        _, elapsed = timeit(engine._ocr_unbatched_process, self.IMG_PTHS)
        self._report_thread_limit()
        return elapsed

    def test_unbatched_multi_threaded(self):
        """Time ``_ocr_unbatched_process`` over the images with four threads."""
        console.print("Starting un-batched multi threaded")
        engine = OCR(col=None, progress=None, languages=["eng"],
                     num_threads=4, use_batching=False)
        _, elapsed = timeit(engine._ocr_unbatched_process, self.IMG_PTHS)
        self._report_thread_limit()
        return elapsed