def extract_json_for_tkbs_from_toc_file(
        toc_folder_path="resources_for_tests\\1914-11-06",
        images_and_xmls_folder_path="resources_for_tests\\output\\1914-11-06",
        author="test_user",
        description="pipeline"):
    """Assemble the Transkribus upload payload for one legacy TOC folder.

    Loads the legacy document metadata from *toc_folder_path*, reads every
    page image and PAGE-XML file (as raw bytes) from
    *images_and_xmls_folder_path*, and returns a pair:

    - a JSON string describing the document ("md" metadata + "pageList"), and
    - a list of per-page dicts with 'img'/'xml' upload tuples of the form
      (filename, raw_bytes, 'application/octet-stream').
    """
    doc = Document()
    doc.load_legacy_data(os.path.join(toc_folder_path))
    page_images = doc.img_names_by_pgnum()
    page_xmls = doc.pxml_names_by_pgnum()
    title = extract_title_from_TOC_xml(os.path.join(toc_folder_path, "TOC.xml"))

    def _read_bytes(fname):
        # Binary read: both images and XMLs are shipped as raw octet streams.
        with open(os.path.join(images_and_xmls_folder_path, fname), 'rb') as fh:
            return fh.read()

    img_objects = {num: _read_bytes(fname) for num, fname in page_images.items()}
    xml_objects = {num: _read_bytes(fname) for num, fname in page_xmls.items()}

    payload = {
        "md": {
            "title": title,
            "author": author,
            "description": description
        },
        "pageList": {
            "pages": [{
                "fileName": fname,
                "pageXmlName": page_xmls[num],
                "pageNr": int(num)
            } for num, fname in page_images.items()]
        }
    }
    json_as_str = json.dumps(payload)

    img_and_xml_list = [{
        'img': (fname, img_objects[num], 'application/octet-stream'),
        'xml': (page_xmls[num], xml_objects[num], 'application/octet-stream')
    } for num, fname in page_images.items()]
    return json_as_str, img_and_xml_list
def upload_a_folder(sfolder):
    """Upload one legacy folder to Transkribus and run the processing chain.

    Steps: load legacy data, upload images/XMLs, line detection, optional
    HTR/OCR, final download, then garbage-line cleanup.  Early exits return
    the "Done OKAY" message when a stage is deliberately skipped.

    Relies on module-level globals: config, tkbs, tkbs_subfolder, v (verbose
    flag).  Returns a status string: "Done OKAY <folder>" on success/skip,
    "Done with ERRORs <folder>" on failure.
    """
    user = config.username
    outfolder = os.path.join(config.src_path, tkbs_subfolder)
    prep_dir(outfolder)
    legacy_output = os.path.join(config.src_path, "legacy_output")
    collec = config.collection_id
    HTRmodelid = config.htr_model_id
    infolder = sfolder
    OkayMessage = "Done OKAY " + infolder
    ErrorMessage = "Done with ERRORs " + infolder
    # BUGFIX: p must be pre-bound — the except handler below used to read
    # p.doc_title, which raised NameError if the failure happened before
    # Document() was constructed.
    p = None
    try:
        if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
            return ErrorMessage
        start = str(datetime.datetime.now().strftime("%y-%m-%d-%H-%M"))
        print(start + " - " + infolder)
        v and print("--- CREATING DATA to upload ---")
        p = Document()
        p.load_legacy_data(infolder)
        uniquename = p.doc_title + "_" + start
        firstexportdir = sfolder.replace(config.src_path, legacy_output)
        if not os.path.isdir(firstexportdir):
            # Stage-1 (legacy -> transkribus conversion) has not run yet.
            print(
                p.doc_title + " Skipping... TKBS output missing under " +
                firstexportdir +
                "\nRun stage-1 script first, to convert legacy to transkribus format."
            )
            return OkayMessage
        v and print(p.doc_title + "--- UPLOADING data to server --- from " +
                    firstexportdir)
        docid = upload(collec, firstexportdir, p.img_names_by_pgnum(),
                       p.pxml_names_by_pgnum(), p.title, user,
                       "pipeline test", tkbs)
        if docid <= 0:
            print(p.doc_title + "ERROR - document failed to upload " + p.title)
            return ErrorMessage
        v and print(p.doc_title + "--- GETTING page ids ---")
        docjson = get_doc(collec, docid, tkbs)
        pageids = p.load_tkbs_page_ids(docjson)
        if config.line_detection is not None and \
                config.line_detection.upper() == "SKIP":
            v and print(p.doc_title + "Skipping from Line Detection and on...")
            return OkayMessage
        v and print(p.doc_title + "--- LINE DETECTION ---")
        detection_status = line_detect(collec, docid, pageids, tkbs)
        if not detection_status:
            print(p.doc_title + "ERROR - document failed line detection " +
                  p.title)
            return ErrorMessage
        # An HTR model id shorter than 2 chars is treated as "not configured".
        if len(HTRmodelid) < 2:
            v and print(p.doc_title + "Skipping from Htr and on...")
            return OkayMessage
        v and print(p.doc_title + "--- RUNNING OCR ---")
        dictionary = ""
        if config.htr_lang_model is not None and config.htr_lang_model:
            dictionary = "trainDataLanguageModel"
            v and print(p.doc_title + "Using trainDataLanguageModel")
        ocr_status = run_ocr(collec, HTRmodelid, dictionary, str(docid),
                             pageids, tkbs)
        if not ocr_status:
            print(p.doc_title + "ERROR - document failed ocr " + p.title +
                  " with status " + str(ocr_status))
            return ErrorMessage
        v and print(p.doc_title +
                    "--- FINAL DOWNLOAD after OCR for TEI export ---")
        otarget_dir = os.path.join(
            outfolder, uniquename + "_" + str(collec) + "_" + str(docid))
        ocrdocjson = download(collec, str(docid), otarget_dir, tkbs,
                              p.tkbs_meta_filename)
        pageids = p.load_tkbs_page_ids(ocrdocjson)
        # User override for the garbage-line width, falling back to the
        # configured default if it is missing or not an integer.
        try:
            width = int(config.user_garbage_line_width)
        except (TypeError, ValueError, AttributeError):
            width = config.default_garbage_line_width
        if width > 0:
            v and print(p.doc_title + "--- DELETING GARBAGE TEXT ---")
            for num, fname in p.pxml_names_by_pgnum().items():
                fullname = os.path.join(otarget_dir, fname)
                delete_garbage_text(fullname, width)
        return OkayMessage
    except Exception as e:
        doc_title = p.doc_title if p is not None else ""
        print(doc_title + "ERROR in upload_a_folder ")
        print(e)
        print(doc_title + "END ERROR \n\n")
        return ErrorMessage
def upload_pipeline(config):
    """Run the full Transkribus upload pipeline over every TOC sub-folder.

    For each sub-folder of config.src_path containing a TOC.xml: load the
    legacy data, upload it, run line detection, optionally run HTR/OCR, and
    download the result for TEI export.  Per-folder failures are logged and
    the loop continues with the next folder.

    Relies on the module-level verbose flag ``v`` and the helper functions
    (upload, get_doc, line_detect, run_ocr, download, ...).
    """
    folders_to_be_uploaded = find_sub_folders_with_toc_file(config.src_path)
    outfolder = os.path.join(config.src_path, "transkribus_output")
    prep_dir(outfolder)
    legacy_output = os.path.join(config.src_path, "legacy_output")
    collec = config.collection_id
    user = config.username
    key = config.password
    HTRmodelid = config.htr_model_id
    disable_warnings(InsecureRequestWarning)
    tkbs = TranskribusClient(sServerUrl="https://transkribus.eu/TrpServer")
    tkbs.auth_login(user, key, True)
    # BUGFIX: logout is now in a finally block so the server session is
    # released even if something escapes the per-folder error handling.
    try:
        for sfolder in folders_to_be_uploaded:
            try:
                if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
                    continue
                infolder = sfolder
                start = str(
                    datetime.datetime.now().strftime("%y-%m-%d-%H-%M"))
                print(start + " - " + infolder)
                v and print("--- CREATING DATA to upload ---")
                p = Document()
                p.load_legacy_data(infolder)
                uniquename = p.doc_title + "_" + start
                firstexportdir = sfolder.replace(config.src_path,
                                                 legacy_output)
                if not os.path.isdir(firstexportdir):
                    # Stage-1 conversion output is missing for this folder.
                    print(
                        "Skipping... TKBS output missing under " +
                        firstexportdir +
                        "\nRun stage-1 script first, to convert legacy to transkribus format."
                    )
                    continue
                v and print("--- UPLOADING data to server ---")
                v and print("from " + firstexportdir)
                docid = upload(collec, firstexportdir,
                               p.img_names_by_pgnum(),
                               p.pxml_names_by_pgnum(), p.title, user,
                               "pipeline test", tkbs)
                if docid <= 0:
                    print("ERROR - document failed to upload " + p.title)
                    continue
                v and print("--- GETTING page ids ---")
                docjson = get_doc(collec, docid, tkbs)
                pageids = p.load_tkbs_page_ids(docjson)
                if config.line_detection is not None and \
                        config.line_detection.upper() == "SKIP":
                    v and print("Skipping from Line Detection and on...")
                    continue
                v and print("--- LINE DETECTION ---")
                detection_status = line_detect(collec, docid, pageids, tkbs)
                if not detection_status:
                    print("ERROR - document failed line detection " + p.title)
                    continue
                # An HTR model id shorter than 2 chars means "no HTR".
                if len(HTRmodelid) < 2:
                    v and print("Skipping from Htr and on...")
                    continue
                v and print("--- RUNNING OCR ---")
                dictionary = ""
                if config.htr_lang_model is not None and config.htr_lang_model:
                    dictionary = "trainDataLanguageModel"
                    v and print("Using trainDataLanguageModel")
                ocr_status = run_ocr(collec, HTRmodelid, dictionary,
                                     str(docid), pageids, tkbs)
                if not ocr_status:
                    print("ERROR - document failed ocr " + p.title +
                          " with status " + str(ocr_status))
                    continue
                v and print(
                    "--- FINAL DOWNLOAD after OCR for TEI export ---")
                otarget_dir = os.path.join(
                    outfolder,
                    uniquename + "_" + str(collec) + "_" + str(docid))
                ocrdocjson = download(collec, str(docid), otarget_dir, tkbs,
                                      p.tkbs_meta_filename)
                pageids = p.load_tkbs_page_ids(ocrdocjson)
                # User override of the garbage-line width; fall back to the
                # default when missing or non-numeric.
                try:
                    width = int(config.user_garbage_line_width)
                except (TypeError, ValueError, AttributeError):
                    width = config.default_garbage_line_width
                if width > 0:
                    v and print("--- DELETING GARBAGE TEXT ---")
                    for num, fname in p.pxml_names_by_pgnum().items():
                        fullname = os.path.join(otarget_dir, fname)
                        delete_garbage_text(fullname, width)
            except Exception as e:
                # Log and keep going: one bad folder must not stop the run.
                print("ERROR in upload_pipeline main loop ")
                print(e)
                print("END ERROR \n\n")
        print("DONE. Output is under " + outfolder)
    finally:
        tkbs.auth_logout()
# --- Script fragment (continues before and after this view): export a single
# --- document, connect to the Transkribus server, upload, download once to
# --- obtain page ids, then start line detection.
# NOTE(review): relies on names bound earlier in the file (exportdir,
# outfolder, p, v) — confirm against the preceding section.
prep_dir(exportdir)
p.export_tkbs_format(exportdir)
v and print("--- CONNECTING to server ---")
# Hard-coded credentials / ids for a manual test run.
user = "******"  #CHANGE THIS
key = "<password>"  #CHANGE THIS
collec = "17989"  #CHANGE THIS
tkbs = TranskribusClient(sServerUrl="https://transkribus.eu/TrpServer")
tkbs.auth_login(user, key, True)
#HTRmodelname = 'Test'
HTRmodelid = "10168"  #CHANGE THIS
#dictName = "Hebrew_Test.dict"  #CHANGE THIS
#print("session id: " + tkbs.getSessionId() + "\n=================")
v and print("--- UPLOADING data to server ---")
docid = upload(collec, exportdir, p.img_names_by_pgnum(),
               p.pxml_names_by_pgnum(), p.title, user, "pipeline test", tkbs)
if docid <= 0:
    print("ERROR - document failed to upload " + p.title)
    sys.exit(1)
# First download is only to learn the server-assigned page ids.
v and print("--- DOWNLOADING-1 doc for page ids ---")
tempdowndir = os.path.join(outfolder, "tempdowndir")
prep_dir(tempdowndir)
target_dir = os.path.join(tempdowndir,
                          p.title + "_" + str(collec) + "_" + str(docid))
docjson = download(collec, str(docid), target_dir, tkbs,
                   p.tkbs_meta_filename)
pageids = p.load_tkbs_page_ids(docjson)
v and print("--- LINE DETECTION ---")
detection_status = line_detect(collec, docid, pageids, tkbs)