Example #1
def make_pxml(res=None, f1=None, f2=None):
    log(0, "pxml")
    p = Document()
    # factors, paper and config are module-level globals: a list of
    # (res, f1, f2) triples, the directory containing TOC.xml, and the
    # pipeline configuration dict (see the sketch after this example).
    for f in factors:
        p.set_factors(f[0], f[1], f[2])
    # an explicit triple passed as arguments overrides the defaults
    if res is not None:
        p.set_factors(res, f1, f2)
    p.load_legacy_data(paper)
    p.export_tkbs_format(os.path.join(paper, config['pxml_dir']))
    log(1, "pxml")
Example #2
def get_page_ids_from_document_id(collection_id, document_id, tkbs_client):
    # This function is slow because it has to download the document from
    # Transkribus; there seems to be no way to extract the page ids without
    # downloading it. If such a way exists, using it would speed this code up.
    now = datetime.now()
    current_time = now.strftime("%H-%M-%S")
    temp_folder_name = "temp_folder_for_page_id_" + current_time
    download(collection_id, document_id, temp_folder_name, tkbs_client)
    trp_json_path = os.path.join(temp_folder_name, "trp.json")
    data = read_tkbs_json_file(trp_json_path)
    p = Document()
    page_ids = p.load_tkbs_page_ids(data)
    delete_directory(temp_folder_name)
    return page_ids
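
A usage sketch, logging in the same way Example #8 does. The credentials and document id are placeholders; the collection id reuses the one from Example #8.

# Hypothetical usage: fetch the page ids of one document.
tkbs_client = TranskribusClient(sServerUrl="https://transkribus.eu/TrpServer")
tkbs_client.auth_login("user@example.com", "<password>", True)
page_ids = get_page_ids_from_document_id("17989", "123456", tkbs_client)
print(page_ids)
tkbs_client.auth_logout()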
Example #3
def convert_legacy_folder_to_tkbs_format(src_path, dst_path):
    try:
        p = Document()
        p.load_legacy_data(src_path)
        p.export_tkbs_format(dst_path)
    except Exception as e:
        print("ERROR in convert_legacy_folder_to_tkbs_format with src_path " +
              src_path)
        print(e)
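
This wrapper pairs naturally with find_sub_folders_with_toc_file from the pipelines below. A sketch of a batch conversion run, with src_path hypothetical and the output layout mirroring the one Example #4 expects:

# Hypothetical batch run: convert every TOC.xml folder under src_path.
src_path = r'C:\_test_\in_0105'
for folder in find_sub_folders_with_toc_file(src_path):
    dst = os.path.join(src_path, "output",
                       os.path.basename(os.path.normpath(folder)))
    convert_legacy_folder_to_tkbs_format(folder, dst)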
Example #4
def upload_pipeline(config):
    folders_to_be_uploaded = find_sub_folders_with_toc_file(config.src_path)
    #print(folders_to_be_uploaded)
    for folder in folders_to_be_uploaded:
        tkbs_client = connect_to_tkbs(config)
        # converter_output_folder is where legacy_to_tkbs_converter saved
        # this folder's output
        converter_output_folder = os.path.join(
            config.src_path, "output",
            os.path.basename(os.path.normpath(folder)))
        print(converter_output_folder)
        json_as_str, img_and_xml_list = extract_json_for_tkbs_from_toc_file(
            toc_folder_path=folder,
            images_and_xmls_folder_path=converter_output_folder,
            author=config.username,
            description="pipeline")
        document_id = upload(config.collection_id, tkbs_client, json_as_str,
                             img_and_xml_list, config)

        print("** Document uploaded **")

        # TODO: add a condition to check whether line detection is needed
        # (see the sketch after this example)
        if True:
            detection_status = line_detection(config.collection_id,
                                              document_id, tkbs_client, config)
            if not detection_status:
                print("ERROR - document failed line detection, document id " +
                      str(document_id))
                continue

        print("Line detection done...")

        if config.htr_model_id != "":
            run_ocr(config.collection_id, config.htr_model_id, "", document_id,
                    tkbs_client, config)

        print("OCR done...")

        if config.dst_path != "":
            dest_folder = os.path.join(
                config.dst_path, "output",
                os.path.basename(os.path.normpath(folder)))
            print(dest_folder)
            download(config.collection_id, document_id, dest_folder,
                     tkbs_client, config)

        print("** Document downloaded **")
        time.sleep(40)
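
The TODO above asks for a condition that decides whether server-side line detection is needed. One plausible check, purely an illustration and not the project's API, is to look for existing TextLine elements in the converter's PAGE XML output before asking the server to detect lines:

# Hypothetical helper for the TODO above: skip server-side line
# detection when the local PAGE XML already contains TextLine elements.
import glob
import xml.etree.ElementTree as ET

def needs_line_detection(pagexml_folder):
    for xml_path in glob.glob(os.path.join(pagexml_folder, "*.xml")):
        tree = ET.parse(xml_path)
        # PAGE XML namespaces vary by schema version, so match on the
        # local tag name rather than the fully qualified one.
        for elem in tree.iter():
            if elem.tag.rsplit('}', 1)[-1] == "TextLine":
                return False  # lines are already present
    return True

upload_pipeline could then replace the placeholder with
if needs_line_detection(converter_output_folder):.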
Example #5
def extract_json_for_tkbs_from_toc_file(
        toc_folder_path="resources_for_tests\\1914-11-06",
        images_and_xmls_folder_path="resources_for_tests\\output\\1914-11-06",
        author="test_user",
        description="pipeline"):
    p = Document()
    p.load_legacy_data(toc_folder_path)
    page_images, page_xmls = p.img_names_by_pgnum(), p.pxml_names_by_pgnum()
    title = extract_title_from_TOC_xml(os.path.join(toc_folder_path,
                                                    "TOC.xml"))
    img_objects = {}

    for key, value in page_images.items():
        with open(os.path.join(images_and_xmls_folder_path, value),
                  'rb') as file:
            img_objects[key] = file.read()

    xml_objects = {}
    for key, value in page_xmls.items():
        with open(os.path.join(images_and_xmls_folder_path, value),
                  'rb') as file:
            xml_objects[key] = file.read()

    d = {
        "md": {
            "title": title,
            "author": author,
            "description": description
        },
        "pageList": {
            "pages": [{
                "fileName": value,
                "pageXmlName": page_xmls[key],
                "pageNr": int(key)
            } for key, value in page_images.items()]
        }
    }

    json_as_str = json.dumps(d)

    img_and_xml_list = [{
        'img': (value, img_objects[key], 'application/octet-stream'),
        'xml': (page_xmls[key], xml_objects[key], 'application/octet-stream')
    } for key, value in page_images.items()]
    return json_as_str, img_and_xml_list
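
For reference, a hypothetical call with the default test resources, and the shape of the JSON the function builds:

# Hypothetical usage, assuming the resources_for_tests folders named in
# the default arguments exist locally:
json_as_str, img_and_xml_list = extract_json_for_tkbs_from_toc_file()
doc = json.loads(json_as_str)
print(doc["md"]["title"] + " - " + str(len(img_and_xml_list)) + " page(s)")
# json_as_str has the shape:
# {"md": {"title": ..., "author": ..., "description": ...},
#  "pageList": {"pages": [{"fileName": ..., "pageXmlName": ...,
#                          "pageNr": 1}, ...]}}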
Example #6
def upload_a_folder(sfolder):
    user = config.username
    outfolder = os.path.join(config.src_path, tkbs_subfolder)
    prep_dir(outfolder)
    legacy_output = os.path.join(config.src_path, "legacy_output")
    collec = config.collection_id
    HTRmodelid = config.htr_model_id
    infolder = sfolder
    OkayMessage = "Done OKAY " + infolder
    ErrorMessage = "Done with ERRORs " + infolder

    try:
        if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
            return (ErrorMessage)

        start = str(datetime.datetime.now().strftime("%y-%m-%d-%H-%M"))
        print(start + " - " + infolder)
        v and print("---   CREATING DATA to upload  ---")
        p = Document()
        p.load_legacy_data(infolder)
        uniquename = p.doc_title + "_" + start
        firstexportdir = sfolder.replace(config.src_path, legacy_output)
        if not os.path.isdir(firstexportdir):
            print(
                p.doc_title + " Skipping... TKBS output missing under " +
                firstexportdir +
                "\nRun stage-1 script  first, to convert legacy to transkribus format."
            )
            return (OkayMessage)
        v and print(p.doc_title +
                    "---   UPLOADING data to server       --- from " +
                    firstexportdir)
        docid = upload(collec, firstexportdir, p.img_names_by_pgnum(),
                       p.pxml_names_by_pgnum(), p.title, user, "pipeline test",
                       tkbs)
        if docid <= 0:
            print(p.doc_title + "ERROR - document failed to upload " + p.title)
            return (ErrorMessage)

        v and print(p.doc_title + "---   GETTING page ids       ---")
        docjson = get_doc(collec, docid, tkbs)
        pageids = p.load_tkbs_page_ids(docjson)

        if (config.line_detection is not None
                and config.line_detection.upper() == "SKIP"):
            v and print(p.doc_title + "Skipping from Line Detection and on...")
            return (OkayMessage)

        v and print(p.doc_title + "---   LINE DETECTION          ---")
        detection_status = line_detect(collec, docid, pageids, tkbs)
        if not detection_status:
            print(p.doc_title + "ERROR - document failed line detection " +
                  p.title)
            return (ErrorMessage)

        if len(HTRmodelid) < 2:
            v and print(p.doc_title + "Skipping from Htr and on...")
            return (OkayMessage)

        v and print(p.doc_title + "---   RUNNING OCR          ---")
        #            ocr_status = run_ocr_with_options(collec, HTRmodelid, "", str(446788), {}, tkbs)
        dictionary = ""
        if config.htr_lang_model:
            dictionary = "trainDataLanguageModel"
            v and print(p.doc_title + "Using trainDataLanguageModel")
        ocr_status = run_ocr(collec, HTRmodelid, dictionary, str(docid),
                             pageids, tkbs)
        if not ocr_status:
            print(p.doc_title + "ERROR - document failed ocr " + p.title +
                  " with status " + str(ocr_status))
            return (ErrorMessage)

        v and print(p.doc_title +
                    "---   FINAL DOWNLOAD after OCR for TEI export        ---")
        otarget_dir = os.path.join(
            outfolder, uniquename + "_" + str(collec) + "_" + str(docid))
        ocrdocjson = download(collec, str(docid), otarget_dir, tkbs,
                              p.tkbs_meta_filename)
        pageids = p.load_tkbs_page_ids(ocrdocjson)

        width = config.default_garbage_line_width
        try:
            width = int(config.user_garbage_line_width)
        except (AttributeError, TypeError, ValueError):
            width = config.default_garbage_line_width
        if width > 0:
            v and print(p.doc_title +
                        "---   DELETING GARBAGE TEXT         ---")
            for num, fname in p.pxml_names_by_pgnum().items():
                fullname = os.path.join(otarget_dir, fname)
                delete_garbage_text(fullname, width)

        return (OkayMessage)
    except Exception as e:
        print("ERROR in upload_a_folder for " + infolder)
        print(e)
        print("END ERROR \n\n")
        return (ErrorMessage)
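
The width-parsing block above is repeated verbatim in Example #7; it could be factored into one helper. A sketch, assuming only the config attributes already used in these examples:

# Hypothetical helper for the repeated width-parsing block above.
def garbage_line_width(config):
    try:
        return int(config.user_garbage_line_width)
    except (AttributeError, TypeError, ValueError):
        # fall back when the user value is missing or not a number
        return config.default_garbage_line_width

Both pipelines could then use width = garbage_line_width(config).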
Example #7
def upload_pipeline(config):
    folders_to_be_uploaded = find_sub_folders_with_toc_file(config.src_path)
    outfolder = os.path.join(config.src_path, "transkribus_output")
    prep_dir(outfolder)
    legacy_output = os.path.join(config.src_path, "legacy_output")
    collec = config.collection_id
    user = config.username
    key = config.password
    HTRmodelid = config.htr_model_id
    disable_warnings(InsecureRequestWarning)
    tkbs = TranskribusClient(sServerUrl="https://transkribus.eu/TrpServer")
    tkbs.auth_login(user, key, True)

    for sfolder in folders_to_be_uploaded:
        try:
            if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
                continue
            infolder = sfolder

            start = str(datetime.datetime.now().strftime("%y-%m-%d-%H-%M"))
            print(start + " - " + infolder)
            v and print("---   CREATING DATA to upload  ---")
            p = Document()
            p.load_legacy_data(infolder)
            uniquename = p.doc_title + "_" + start
            firstexportdir = sfolder.replace(config.src_path, legacy_output)
            if not os.path.isdir(firstexportdir):
                print(
                    "Skipping... TKBS output missing under " + firstexportdir +
                    "\nRun stage-1 script  first, to convert legacy to transkribus format."
                )
                continue
            v and print("---   UPLOADING data to server       ---")
            v and print("from " + firstexportdir)
            docid = upload(collec, firstexportdir, p.img_names_by_pgnum(),
                           p.pxml_names_by_pgnum(), p.title, user,
                           "pipeline test", tkbs)
            if docid <= 0:
                print("ERROR - document failed to upload " + p.title)
                continue

            v and print("---   GETTING page ids       ---")
            docjson = get_doc(collec, docid, tkbs)
            pageids = p.load_tkbs_page_ids(docjson)

            if (config.line_detection is not None
                    and config.line_detection.upper() == "SKIP"):
                v and print("Skipping from Line Detection and on...")
                continue

            v and print("---   LINE DETECTION          ---")
            detection_status = line_detect(collec, docid, pageids, tkbs)
            if not detection_status:
                print("ERROR - document failed line detection " + p.title)
                continue

            if len(HTRmodelid) < 2:
                v and print("Skipping from Htr and on...")
                continue

            v and print("---   RUNNING OCR          ---")
            #            ocr_status = run_ocr_with_options(collec, HTRmodelid, "", str(446788), {}, tkbs)
            dictionary = ""
            if config.htr_lang_model:
                dictionary = "trainDataLanguageModel"
                v and print("Using trainDataLanguageModel")
            ocr_status = run_ocr(collec, HTRmodelid, dictionary, str(docid),
                                 pageids, tkbs)
            if not ocr_status:
                print("ERROR - document failed ocr " + p.title +
                      " with status " + str(ocr_status))
                continue

            v and print(
                "---   FINAL DOWNLOAD after OCR for TEI export        ---")
            otarget_dir = os.path.join(
                outfolder, uniquename + "_" + str(collec) + "_" + str(docid))
            ocrdocjson = download(collec, str(docid), otarget_dir, tkbs,
                                  p.tkbs_meta_filename)
            pageids = p.load_tkbs_page_ids(ocrdocjson)

            width = config.default_garbage_line_width
            try:
                width = int(config.user_garbage_line_width)
            except (AttributeError, TypeError, ValueError):
                width = config.default_garbage_line_width
            if width > 0:
                v and print("---   DELETING GARBAGE TEXT         ---")
                for num, fname in p.pxml_names_by_pgnum().items():
                    fullname = os.path.join(otarget_dir, fname)
                    delete_garbage_text(fullname, width)

        except Exception as e:
            print("ERROR in upload_pipeline main loop ")
            print(e)
            print("END ERROR \n\n")

    print("DONE. Output is under " + outfolder)
    tkbs.auth_logout()
Example #8
        seconds = 80 * len(pids)
        return wait_for_jobstatus(jobid, seconds, mytkbs)
    except Exception as e:
        print("ERROR in run_ocr for docid " + str(mydocid))
        print(e)
        print("END ERROR \n\n")


v = True

infolder = r'C:\_test_\in_0105'  #CHANGE THIS
outfolder = r'C:\_test_\out'  #CHANGE THIS

v and print("---   CREATING DATA to upload  ---")
p = Document()
#p.set_factors(150, 1.7238, 0.67)
p.load_legacy_data(infolder)

exportdir = os.path.join(outfolder, "pagexml_for_upload")
prep_dir(exportdir)
p.export_tkbs_format(exportdir)

v and print("---   CONNECTING to server    ---")
user = "******"  #CHANGE THIS
key = "<password>"  #CHANGE THIS
collec = "17989"  #CHANGE THIS
tkbs = TranskribusClient(sServerUrl="https://transkribus.eu/TrpServer")
tkbs.auth_login(user, key, True)
#HTRmodelname = 'Test'
HTRmodelid = "10168"  #CHANGE THIS
Example #9
def export_pipeline(config):
    folders_to_be_exported = find_sub_folders_with_toc_file(config.src_path)
    tkbs_topfolder = os.path.join(config.src_path, "transkribus_output")
    exportfolder = prep_dir(os.path.join(config.src_path,
                                         "transkribus_export"))
    if config.export_csv:
        csvfolder_byregion = prep_dir(
            os.path.join(exportfolder, 'csv_by_region'))
        csvfolder_byarticle = prep_dir(
            os.path.join(exportfolder, 'csv_by_article'))
    if config.export_plaintext:
        plaintextfolder = prep_dir(os.path.join(exportfolder, 'plaintext'))
        plaintextfolder_byarticle = prep_dir(
            os.path.join(exportfolder, 'plaintext_by_article'))
    if config.export_tei:
        teifolder = prep_dir(os.path.join(exportfolder, 'tei'))
    for sfolder in folders_to_be_exported:
        try:
            if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
                continue
            infolder = sfolder

            start = str(datetime.datetime.now().strftime("%y-%m-%d-%H-%M"))
            print(start + " - " + infolder)  # + "\n==============")
            v and print("---   LOADING Legacy data ---")
            p = Document()
            p.load_legacy_data(infolder)
            tkbsfolder = find_latest_folder(tkbs_topfolder, p.doc_title)
            p.load_tkbs_data(tkbsfolder)  #FIX
            p.load_legacy_articles(p.legacy_metafile)
            p.match_legacy_articles()

            if config.export_tei:
                v and print("---   TEI export           ---")
                p.export_tei(teifolder)

            if config.export_plaintext:
                v and print("---   PLAINTEXT export     ---")
                p.export_plaintext(plaintextfolder)
                p.export_plaintext_articles(plaintextfolder_byarticle)

            if config.export_csv:
                v and print("---   CSV export           ---")
                p.export_csv_articles(csvfolder_byarticle)
                p.export_csv_regions(csvfolder_byregion)

        except Exception as e:
            print("ERROR in export_pipeline main loop ")
            print(e)
            print("END ERROR \n\n")

    print("DONE. Output is under " + exportfolder)
def convert_legacy_folder_to_tkbs_format(src_path, dst_path):
    p = Document()
    p.load_legacy_data(src_path)
    p.export_tkbs_format(dst_path)
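
export_pipeline and the upload pipelines above only ever read attributes off config, so any attribute bag works. A sketch with types.SimpleNamespace covering the attributes these examples access; every value is hypothetical except the collection and model ids, which reuse the placeholders from Example #8.

# Hypothetical config object covering the attributes read above.
from types import SimpleNamespace

config = SimpleNamespace(
    src_path=r'C:\_test_\in_0105',
    dst_path=r'C:\_test_\out',
    username="user@example.com",
    password="<password>",
    collection_id="17989",
    htr_model_id="10168",
    htr_lang_model=False,
    line_detection=None,
    user_garbage_line_width="",
    default_garbage_line_width=0,
    export_csv=True,
    export_plaintext=True,
    export_tei=True,
)
export_pipeline(config)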