Python AbbyyOnlineSdk.AbbyyOnlineSdk示例，AbbyyOnlineSdk.AbbyyOnlineSdk, ocrsdk.com Python示例

示例#1

0

显示文件

def main(image_file, output_file):
    """Run OCR on ``image_file`` and write the result to ``output_file``.

    Rebinds the module-level ``processor`` to a fresh ``AbbyyOnlineSdk``
    instance and configures it with ``setup_processor()`` before checking
    the input. A missing input file is reported on stdout.
    """
    global processor
    processor = AbbyyOnlineSdk()
    setup_processor()

    # Guard clause: nothing to recognize when the input is not a regular file.
    if not os.path.isfile(image_file):
        print("No such file: {}".format(image_file))
        return

    # The language is passed as an empty string; the output format is "txt".
    recognize_file(image_file, output_file, "", "txt")

示例#2

0

显示文件

 def __init__(self, indir, outdir, pages, language):
     """Store the job parameters and make sure the output directory exists.

     Creates an ``AbbyyOnlineSdk`` client with empty credentials, records
     the input/output directories, page selection and language, and fixes
     the output format to ``'txt'``.
     """
     sdk = AbbyyOnlineSdk()
     sdk.ApplicationId = ""
     sdk.Password = ""
     self.processor = sdk

     self.outputFormat = 'txt'
     self.language = language
     self.indir = indir
     self.pages = pages
     self.outdir = outdir

     # Create the output directory only when it is not already present.
     output_exists = os.path.exists(outdir)
     if not output_exists:
         os.makedirs(outdir)

示例#3

0

显示文件

文件： watcher_cocr.py 项目： divyanshofficials/hocr_watcher

def main_process(src, lang='English', output_format='txt'):
    """Recognize the image at ``src`` and report whether it was processed.

    The destination path is derived from ``src`` by replacing ``'.jpg'``
    with ``'.txt'`` and ``'images'`` with ``'dst'``. The module-level
    ``processor`` is rebound to a new SDK client and configured first.

    Returns:
        True when ``src`` exists and was handed to ``recognize_file``,
        False when ``src`` is not a file.
    """
    global processor
    processor = AbbyyOnlineSdk.AbbyyOnlineSdk()
    setup_processor()

    # Derive the output location from the input path by plain substitution.
    dst = src.replace('.jpg', '.txt').replace('images', 'dst')

    if not os.path.isfile(src):
        print("No such file: {}".format(src))
        return False

    recognize_file(src, dst, lang, output_format)
    return True

示例#4

0

显示文件

文件： process_xml.py 项目： vinhnguyent090/ERPNext-OCR

def recognizeFile(filePath, resultFilePath, language, outputFormat):
    """Upload a file for recognition, wait for the task, and download the result.

    The SDK client is configured from the environment: ``ABBYY_APPID`` and
    ``ABBYY_PWD`` supply the credentials when set, and ``http_proxy``
    enables an HTTP proxy handler.

    Args:
        filePath: path of the file passed to ``ProcessImage``.
        resultFilePath: path the result is downloaded to.
        language: value assigned to ``ProcessingSettings.Language``.
        outputFormat: value assigned to ``ProcessingSettings.OutputFormat``.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    processor = AbbyyOnlineSdk()

    if "ABBYY_APPID" in os.environ:
        processor.ApplicationId = os.environ["ABBYY_APPID"]

    if "ABBYY_PWD" in os.environ:
        processor.Password = os.environ["ABBYY_PWD"]

    # Proxy settings
    if "http_proxy" in os.environ:
        proxyString = os.environ["http_proxy"]
        print("Using proxy at %s" % proxyString)
        processor.Proxy = urllib2.ProxyHandler({"http": proxyString})

    print("Uploading..")
    settings = ProcessingSettings()
    settings.Language = language
    settings.OutputFormat = outputFormat
    task = processor.ProcessImage(filePath, settings)
    if task is None:
        print("Error")
        return
    print("Id = %s" % task.Id)
    print("Status = %s" % task.Status)

    # Wait for the task to be completed.
    # Note: it's recommended that your application waits at least 2 seconds
    # before making the first getTaskStatus request and also between such
    # requests for the same task. Making requests more often will not improve
    # your application performance.
    # Note: if your application queues several files and waits for them
    # it's recommended that you use listFinishedTasks instead (which is
    # described at
    # http://ocrsdk.com/documentation/apireference/listFinishedTasks/).
    sys.stdout.write("Waiting..")
    sys.stdout.flush()  # no newline was written, so push the text out now
    while task.IsActive():
        time.sleep(5)
        sys.stdout.write(".")
        sys.stdout.flush()  # show each progress dot as it happens
        task = processor.GetTaskStatus(task)

    print("Status = %s" % task.Status)

    if task.Status == "Completed":
        if task.DownloadUrl is not None:
            processor.DownloadResult(task, resultFilePath)
            print("Result was written to %s" % resultFilePath)
        else:
            # Previously this case ended silently with no result file.
            print("Task completed but no download URL was returned")
    else:
        print("Error processing task")

示例#5

0

显示文件

文件： process.py 项目： bobby-m-h/ocrsdk.com

def main():
    """Command-line entry point: recognize one file named on the command line.

    Rebinds the module-level ``processor`` to a new SDK client, configures
    it, parses the arguments produced by ``create_parser()``, and runs
    recognition when the source file exists.
    """
    global processor
    processor = AbbyyOnlineSdk()
    setup_processor()

    cli = create_parser().parse_args()
    source_file, target_file, language, output_format = (
        cli.source_file,
        cli.target_file,
        cli.language,
        cli.format,
    )

    # Guard clause: report a missing source file and stop.
    if not os.path.isfile(source_file):
        print("No such file: {}".format(source_file))
        return

    recognize_file(source_file, target_file, language, output_format)

示例#6

0

显示文件

文件： process.py 项目： ricardosouzamorais/abbyy-ocr-process

def main():
    """Process every non-hidden file under ``./input`` into ``./output``.

    Rebinds the module-level ``processor`` to a new SDK client, configures
    it, and builds the processing options from the parsed command-line
    arguments. Each input file is handed to ``process_file`` together with
    an output path of the form ``output/<stem>.<extension>``.

    The extension comes from ``format_extension`` for the chosen output
    format, except that the ``processTextField`` operation always uses the
    ``xml`` extension.
    """
    global processor
    processor = AbbyyOnlineSdk()

    setup_processor()

    args = create_parser().parse_args()

    options = {
        "language": args.language,
        "operation": args.operation,
        "outputFormat": args.format,
        "textType": args.textType,
    }

    input_folder = os.path.join(os.getcwd(), 'input')
    output_folder = os.path.join(os.getcwd(), 'output')
    extension = format_extension[options['outputFormat']]
    if options['operation'] == 'processTextField':
        extension = format_extension['xml']

    for root, _dirs, files in os.walk(input_folder):
        for file_name in files:
            stem = os.path.splitext(file_name)[0]
            # Skip hidden files such as ".DS_Store".
            if stem.startswith('.'):
                continue

            # os.walk descends into sub-directories, so the file lives under
            # `root`, not necessarily directly under `input_folder`.
            source_path = os.path.join(root, file_name)
            target_path = os.path.join(
                output_folder, '.'.join((stem, extension)))

            if os.path.isfile(source_path):
                process_file(options, source_path, target_path)
            else:
                print("No such file: {}".format(source_path))

示例#7

0

显示文件

# Usage: process.py <input file> <output file> [-language <Language>] [-pdf|-txt|-rtf|-docx|-xml]

import argparse
import base64
import getopt
import MultipartPostHandler
import os
import re
import sys
import time
import urllib2
import urllib

from AbbyyOnlineSdk import *

# Module-level SDK client shared by the rest of the script.
processor = AbbyyOnlineSdk()

# Credentials are assigned directly. The environment-variable lookups
# (ABBYY_APPID / ABBYY_PWD) are disabled in this script.
processor.ApplicationId = 'Email Signature Finder and Parser'
processor.Password = '******'

# Proxy settings: use an HTTP proxy when http_proxy is present.
if "http_proxy" in os.environ:
    proxyString = os.environ["http_proxy"]
    print("Using proxy at %s" % proxyString)
    processor.Proxy = urllib2.ProxyHandler({"http": proxyString})


# Recognize a file at filePath and save result to resultFilePath

示例#8

0

显示文件

文件： Pdf2Excel.py 项目： EigenLaw/Pdf2Excel

class Pdf2Excel(object):
    """
    Build ABBYY OCR into a Pdf2Excel package.

    Environment demands: argparse, AbbyyOnlineSdk.

    Instantiating the class runs the whole workflow: it configures the SDK
    client from the environment, prompts for a PDF path on stdin, submits
    the file for recognition and reports the elapsed time.
    """

    # SDK client shared by all instances. Methods must reach it through
    # ``self.processor``: a class attribute is not visible as a bare name
    # inside a method body.
    processor = AbbyyOnlineSdk()

    def setup_processor(self):
        """Configure the SDK client from environment variables.

        Reads ``ABBYY_APPID`` and ``ABBYY_PWD`` for the credentials and
        ``http_proxy`` / ``https_proxy`` for proxy settings. Variables that
        are not set are skipped.
        """
        if "ABBYY_APPID" in os.environ:
            self.processor.ApplicationId = os.environ["ABBYY_APPID"]

        if "ABBYY_PWD" in os.environ:
            self.processor.Password = os.environ["ABBYY_PWD"]

        # Proxy settings
        if "http_proxy" in os.environ:
            proxy_string = os.environ["http_proxy"]
            print("Using http proxy at {}".format(proxy_string))
            self.processor.Proxies["http"] = proxy_string

        if "https_proxy" in os.environ:
            proxy_string = os.environ["https_proxy"]
            print("Using https proxy at {}".format(proxy_string))
            self.processor.Proxies["https"] = proxy_string

    def recognize_file(self, file_path, result_file_path, language, output_format):
        """Recognize the file at ``file_path`` and save the result.

        Args:
            file_path: path of the file to upload.
            result_file_path: path the result is downloaded to.
            language: value assigned to ``ProcessingSettings.Language``.
            output_format: value assigned to ``ProcessingSettings.OutputFormat``.

        Returns:
            None. Progress and errors are reported on stdout.
        """
        print("Uploading..")
        settings = ProcessingSettings()
        settings.Language = language
        settings.OutputFormat = output_format
        task = self.processor.process_image(file_path, settings)
        if task is None:
            print("Error")
            return
        if task.Status == "NotEnoughCredits":
            print("Not enough credits to process the document. Please add more pages to your application's account.")
            return

        print("Id = {}".format(task.Id))
        print("Status = {}".format(task.Status))

        # Wait for the task to be completed
        print("Waiting..")
        # Note: it's recommended that your application waits at least 2 seconds
        # before making the first getTaskStatus request and also between such requests
        # for the same task. Making requests more often will not improve your
        # application performance.
        # Note: if your application queues several files and waits for them
        # it's recommended that you use listFinishedTasks instead (which is described
        # at http://ocrsdk.com/documentation/apireference/listFinishedTasks/).

        while task.is_active():
            time.sleep(5)
            print(".")
            task = self.processor.get_task_status(task)

        print("Status = {}".format(task.Status))

        if task.Status == "Completed":
            if task.DownloadUrl is not None:
                self.processor.download_result(task, result_file_path)
                print("Result was written to {}".format(result_file_path))
        else:
            print("Error processing task")

    @staticmethod
    def create_parser():
        """Return the argument parser for the command-line interface.

        The parser takes ``source_file`` and ``target_file`` positionals, a
        ``-l/--language`` option (default ``English``) and one of the
        mutually exclusive format flags, stored in ``format`` (default
        ``txt``).
        """
        parser = argparse.ArgumentParser(description="Recognize a file via web service")
        parser.add_argument('source_file')
        parser.add_argument('target_file')

        parser.add_argument('-l', '--language', default='English', help='Recognition language (default: %(default)s)')
        group = parser.add_mutually_exclusive_group()
        group.add_argument('-txt', action='store_const', const='txt', dest='format', default='txt')
        group.add_argument('-pdf', action='store_const', const='pdfSearchable', dest='format')
        group.add_argument('-rtf', action='store_const', const='rtf', dest='format')
        group.add_argument('-docx', action='store_const', const='docx', dest='format')
        group.add_argument('-xml', action='store_const', const='xml', dest='format')

        return parser

    # Alternative: ``from process import *`` would load equivalent functions;
    # the methods above were excerpted from process.py and run step by step.
    def __init__(self, result="myworkbook.xlsx", Language="English", typefile="xlsx"):
        """Prompt for a PDF path and run recognition on it.

        Args:
            result: path the recognized output is written to.
            Language: recognition language; the list is at
                https://ocrsdk.com/documentation/specifications/recognition-languages/
            typefile: export format; the list is at
                https://ocrsdk.com/documentation/specifications/export-formats/
        """
        self.setup_processor()

        path = input("Your Pdf file path:  like'C:/data/1.pdf'")
        start = time.time()
        # Simplified Chinese is "ChinesePRC" and Traditional Chinese is
        # "ChineseTaiwan"; open question from the author: how should mixed
        # Chinese/English documents be configured?
        self.recognize_file(path, result, Language, typefile)
        T1 = time.time() - start
        # NOTE(review): the message hard-codes "22 page pdf" regardless of
        # the document actually processed.
        print('Total time spend %d seconds for 22 page pdf' % T1)

示例#9

0

显示文件

文件： abbyy_recognition.py 项目： tboenig/financial_disclosure_scraping

import AbbyyOnlineSdk as abbyy
import local_settings as ls
# Re-execute local_settings so that edits made to it since the first import
# are picked up (``reload`` is the Python 2 builtin).
reload(ls)

# <codecell>

# Directory holding the documents: <repo_dir>/download_PFDs, where repo_dir
# comes from local_settings.
repo_dir = ls.repo_dir
doc_dir = os.path.join(repo_dir, 'download_PFDs')

# <codecell>

# Names of all entries in the document directory.
doc_files = os.listdir(doc_dir)

# <codecell>

# SDK client used to talk to the OCR service.
processor = abbyy.AbbyyOnlineSdk()

# <codecell>

# Credentials are taken from local_settings rather than hard-coded here.
processor.ApplicationId = ls.app_id
processor.Password = ls.app_password

# <codecell>

# Processing settings object; left at whatever values its constructor sets.
settings = abbyy.ProcessingSettings()

# <codecell>

# Bare expression with no effect when run as a script; presumably it displayed
# the current language as the notebook cell output (TODO confirm).
settings.Language

# <codecell>

示例#10

0

显示文件

# Element ids of the form inputs, in the order the OCR fields are typed in.
_FORM_FIELD_IDS = (
    "ctl00_ContentPlaceHolder1_txt_tbc",
    "ctl00_ContentPlaceHolder1_txt_name",
    "ctl00_ContentPlaceHolder1_txt_email",
    "ctl00_ContentPlaceHolder1_txt_mobno",
    "ctl00_ContentPlaceHolder1_txt_gender",
    "ctl00_ContentPlaceHolder1_txt_licenceno",
    "ctl00_ContentPlaceHolder1_txt_girno",
    "ctl00_ContentPlaceHolder1_txt_panno",
    "ctl00_ContentPlaceHolder1_txt_Hadd",
    "ctl00_ContentPlaceHolder1_txt_Hcity",
    "ctl00_ContentPlaceHolder1_txt_Hpin",
    "ctl00_ContentPlaceHolder1_txt_HState",
    "ctl00_ContentPlaceHolder1_txt_Oadd",
    "ctl00_ContentPlaceHolder1_txt_Ocity",
    "ctl00_ContentPlaceHolder1_txt_Opincode",
    "ctl00_ContentPlaceHolder1_txt_loanapproval",
    "ctl00_ContentPlaceHolder1_txt_menno",
    "ctl00_ContentPlaceHolder1_txt_af",
    "ctl00_ContentPlaceHolder1_txt_nri",
    "ctl00_ContentPlaceHolder1_txt_cp",
)


def _log_unfilled_page(header, page_label, fields):
    """Append one "page not filled" record to the error log file."""
    with open("error_seethatosee.txt", "a") as log:
        log.write(header)
        log.write(page_label)
        log.write("\n")
        log.write(str(len(fields)) + "\n")
        log.write("\n")
        log.write(str(fields) + "\n")
        log.write("Please Check the above page because it was not filled \n ")


def main():
    """Log in, OCR each page image, and type the recognized fields into the form.

    For each of the 150 pages (indices 0-149) the function selects the page
    in the page-jump dropdown, downloads the page image, inverts it, runs it
    through ``recognize_file`` and reads ``result.txt``. When the result has
    exactly three lines, the text is split on ``-`` into fields that are
    entered into the form inputs and submitted. Pages that cannot be filled
    are recorded in ``error_seethatosee.txt`` and skipped.

    Rebinds the module-level ``processor`` on every page.
    """
    global processor

    # ------------------------------------ GLOBAL ------------------------------------ #
    browser = Browser('chrome')
    url = "https://goo.gl/gTWejF"
    browser.visit(url)

    # ------------------------------------ LOGIN ------------------------------------- #
    # NOTE(review): the user name and password are hard-coded in the source.
    browser.find_by_id("ctl00_ContentPlaceHolder1_txt_Uname").fill(
        "C31052018427")  # user name
    browser.find_by_id("ctl00_ContentPlaceHolder1_txt_pass").fill(
        "5SSGS")  # password
    browser.find_by_id(
        "ctl00_ContentPlaceHolder1_btnsubmit").click()  # submit to log in

    # --------------------------------- FORM FILLING --------------------------------- #
    no_box = browser.find_by_xpath(
        '//*[@id="ctl00_ContentPlaceHolder1_drp_pagejump"]/option')
    print("#----------------------START HERE -----------------------------------#")

    # Wall-clock start; the elapsed time printed after each page is measured
    # from here.
    tic = time.time()
    print("#------------------ INFORMATION ON THE PAGE ARE : -----------------#")
    print("THE LENGTH OF THE TOTAL PAGES ARE: ")
    print(len(no_box))

    # Every path through the loop body advances by exactly one page, so the
    # page index is driven by the loop itself.
    for page_index in range(150):
        # Reset per page so the error log never reports the previous page's
        # fields (or fails on an undefined name on the first page).
        last_arr = []

        # Re-query the dropdown options on every page before clicking.
        no_box = browser.find_by_xpath(
            '//*[@id="ctl00_ContentPlaceHolder1_drp_pagejump"]/option')
        no_box[page_index].click()
        time.sleep(1)

        # ----------------------- IMAGE URL FOR DOWNLOAD ----------------------- #
        image_url = browser.find_by_id(
            'ctl00_ContentPlaceHolder1_MainImg')['src']

        # ----------------------- SAVING IMAGE VIA URLLIB ---------------------- #
        urllib.urlretrieve(image_url, "locol1.png")
        time.sleep(0.5)
        image = Image.open('locol1.png')
        inverted_image = PIL.ImageOps.invert(image)
        time.sleep(0.5)
        inverted_image.save('final.png')
        time.sleep(0.5)

        processor = AbbyyOnlineSdk()

        setup_processor()

        source_file = 'final.png'
        target_file = 'result.txt'
        language = 'English'
        output_format = 'txt'

        if os.path.isfile(source_file):
            recognize_file(source_file, target_file, language, output_format)
        else:
            print("No such file: {}".format(source_file))

        with open('result.txt', 'r') as res:
            text = res.readlines()
        # Number of lines that consist solely of a newline.
        print(text.count("\n"))
        # The line count comes from the list already read, instead of
        # re-opening the file without closing it.
        num_lines = len(text)
        print(num_lines)

        if num_lines != 3:
            print("--------------------THERE IS AN ERROR CHECK FOR THE ERROR---------------")
            _log_unfilled_page(
                "-----------------------LESS COMPONENT ERROR (CASE 2)-------------------- \n ",
                str(page_index + 1),
                last_arr)
            continue

        raw = "".join(text)
        # Python 2 byte-string handling: interpret escape sequences, then
        # drop every non-ASCII character.
        ascii_text = raw.decode('unicode_escape').encode('ascii', 'ignore')
        segments = [part.replace(" " * 7, "")
                    for part in str(ascii_text).split("-")]
        time.sleep(1)
        print(len(segments))
        segments = [part.replace("\n", "") for part in segments]

        # The segment at index 14 holds several space-separated values.
        flattened = []
        for position, part in enumerate(segments):
            if position == 14:
                flattened.extend(part.split(" "))
            else:
                flattened.append(part)
        last_arr = [part for part in flattened if part != '']

        print(last_arr)
        print(len(last_arr))
        print("The page that is filling now is  :  " + str(page_index + 1))
        print("----------Filling starts from here-------------------")
        try:
            time.sleep(1)
            # Index explicitly (rather than zip) so a short field list raises
            # IndexError and the page is logged instead of being submitted
            # half filled.
            for position, field_id in enumerate(_FORM_FIELD_IDS):
                browser.find_by_id(field_id).fill(last_arr[position].strip())
                time.sleep(1)

            # ----------------------------- SUBMISSION ----------------------------- #
            browser.find_by_id(
                "ctl00_ContentPlaceHolder1_btnsubmit").click()  # submit
            elapsed = time.time() - tic  # end minus start, so it is positive
            print("")
            print("THE TIME TAKEN TO COMPLETE THE FORM IS : ")
            print(elapsed)
            print("#------------------------ PAGE COMPLETED SUCCESSFULLY--------------------------#")
            time.sleep(1)
        except Exception:
            # Deliberately broad: any failure while filling skips the page,
            # but Ctrl-C / SystemExit are no longer swallowed.
            print(
                "------------THERe WAS AN ERROR SO PLEASE CHECK THE PAGE FOR THE ERROR IN THE ERROR LOG---------"
            )
            _log_unfilled_page(
                "--------------LESS ELEMENT ERROR (CASE 1)-------------------- \n ",
                "----->  " + " " * 7 + str(page_index + 1) + " " * 5 +
                "<---THis is the page that is not filled",
                last_arr)