示例#1
0
 def __init__(self, indir, outdir, pages, language):
     """Create the OCR client, remember the job parameters, ensure ``outdir`` exists.

     Args:
         indir: stored as ``self.indir``.
         outdir: stored as ``self.outdir``; the directory is created when missing.
         pages: stored as ``self.pages``.
         language: stored as ``self.language``.
     """
     # The SDK client starts out with blank credentials.
     sdk_client = AbbyyOnlineSdk()
     sdk_client.ApplicationId = ""
     sdk_client.Password = ""
     self.processor = sdk_client

     # Output is always requested as plain text.
     self.outputFormat = 'txt'
     self.language = language
     self.indir = indir
     self.pages = pages
     self.outdir = outdir

     output_dir_missing = not os.path.exists(outdir)
     if output_dir_missing:
         os.makedirs(outdir)
示例#2
0
def main(image_file, output_file):
    """Recognize ``image_file`` as plain text and store the result in ``output_file``.

    A new SDK client is bound to the module-level ``processor`` and configured
    with ``setup_processor()`` before the input path is checked.
    """
    global processor
    processor = AbbyyOnlineSdk()
    setup_processor()

    # Guard clause: report a missing input and stop.
    if not os.path.isfile(image_file):
        print("No such file: {}".format(image_file))
        return

    # Language is passed as an empty string, output format as "txt".
    recognize_file(image_file, output_file, "", "txt")
示例#3
0
def recognizeFile(filePath, resultFilePath, language, outputFormat):
    """Upload a file to the OCR service, wait for the task, and save the result.

    Credentials are taken from the ``ABBYY_APPID`` / ``ABBYY_PWD`` environment
    variables when set, and an HTTP proxy from ``http_proxy``.

    Args:
        filePath: path of the file to upload.
        resultFilePath: path the recognition result is downloaded to.
        language: value assigned to ``ProcessingSettings.Language``.
        outputFormat: value assigned to ``ProcessingSettings.OutputFormat``.

    Returns:
        None. Progress and failures are reported on stdout.
    """
    processor = AbbyyOnlineSdk()

    if "ABBYY_APPID" in os.environ:
        processor.ApplicationId = os.environ["ABBYY_APPID"]

    if "ABBYY_PWD" in os.environ:
        processor.Password = os.environ["ABBYY_PWD"]

    # Proxy settings
    if "http_proxy" in os.environ:
        proxyString = os.environ["http_proxy"]
        print("Using proxy at %s" % proxyString)
        processor.Proxy = urllib2.ProxyHandler({"http": proxyString})

    print("Uploading..")
    settings = ProcessingSettings()
    settings.Language = language
    settings.OutputFormat = outputFormat
    task = processor.ProcessImage(filePath, settings)
    if task is None:
        print("Error")
        return
    print("Id = %s" % task.Id)
    print("Status = %s" % task.Status)

    # Wait for the task to be completed.
    # stdout is flushed after every write so the progress dots show up while
    # waiting instead of all at once when the next newline is printed.
    sys.stdout.write("Waiting..")
    sys.stdout.flush()
    # Note: it's recommended that your application waits at least 2 seconds
    # before making the first getTaskStatus request and also between such requests
    # for the same task. Making requests more often will not improve your
    # application performance.
    # Note: if your application queues several files and waits for them
    # it's recommended that you use listFinishedTasks instead (which is described
    # at http://ocrsdk.com/documentation/apireference/listFinishedTasks/).

    while task.IsActive():
        time.sleep(5)
        sys.stdout.write(".")
        sys.stdout.flush()
        task = processor.GetTaskStatus(task)

    print("Status = %s" % task.Status)

    if task.Status == "Completed":
        if task.DownloadUrl is not None:
            processor.DownloadResult(task, resultFilePath)
            print("Result was written to %s" % resultFilePath)
        else:
            # Previously this case ended silently with no result file.
            print("Error: task completed without a download URL")
    else:
        print("Error processing task")
def main_process(src, lang='English', output_format='txt'):
    """Recognize the image at ``src`` and write the text to a derived path.

    The destination is ``src`` with its extension replaced by ``.txt`` and
    every occurrence of ``images`` in the path replaced by ``dst``.

    Args:
        src: path of the image to recognize.
        lang: recognition language passed to ``recognize_file``.
        output_format: output format passed to ``recognize_file``.

    Returns:
        True when ``src`` exists and recognition was attempted, False otherwise.
    """
    global processor
    processor = AbbyyOnlineSdk.AbbyyOnlineSdk()
    setup_processor()

    # Replace the real extension instead of str.replace('.jpg', '.txt'): the
    # old form left any input not spelled exactly ".jpg" (".JPG", ".png", ...)
    # unchanged, so the OCR output could be written over the source image, and
    # it also rewrote ".jpg" appearing in directory names.
    stem, _extension = os.path.splitext(src)
    dst = (stem + '.txt').replace('images', 'dst')

    if os.path.isfile(src):
        recognize_file(src, dst, lang, output_format)
        return True

    print("No such file: {}".format(src))
    return False
示例#5
0
def main():
    """Command-line entry point: recognize one source file into one target file.

    Installs a fresh SDK client as the module-level ``processor``, configures it,
    parses the command line, and hands the file to ``recognize_file``.
    """
    global processor
    processor = AbbyyOnlineSdk()

    setup_processor()

    cli = create_parser().parse_args()

    # Read every option up front, before touching the file system.
    source_path = cli.source_file
    target_path = cli.target_file
    ocr_language = cli.language
    ocr_format = cli.format

    if not os.path.isfile(source_path):
        print("No such file: {}".format(source_path))
        return

    recognize_file(source_path, target_path, ocr_language, ocr_format)
def recognize_file(file_path, result_file_path, language, output_format):
    """Submit ``file_path`` for recognition and download the finished result.

    Uses the module-level ``processor``. Progress and failures are printed;
    nothing is returned.

    Args:
        file_path: path of the file to upload.
        result_file_path: where the recognition result is saved.
        language: assigned to the processing settings' ``Language``.
        output_format: assigned to the processing settings' ``OutputFormat``.
    """
    print("Uploading..")
    ocr_settings = AbbyyOnlineSdk.ProcessingSettings()
    ocr_settings.Language = language
    ocr_settings.OutputFormat = output_format

    job = processor.process_image(file_path, ocr_settings)
    if job is None:
        print("Error")
        return
    if job.Status == "NotEnoughCredits":
        print(
            "Not enough credits to process the document. Please add more pages to your application's account."
        )
        return

    print("Id = {}".format(job.Id))
    print("Status = {}".format(job.Status))

    # Poll until the service reports the task as no longer active.
    # The service guidance is to wait at least 2 seconds before the first
    # getTaskStatus request and between requests for the same task; polling
    # faster does not help. When queueing several files, listFinishedTasks
    # (https://ocrsdk.com/documentation/apireference/listFinishedTasks/) is
    # the recommended alternative.
    print("Waiting..")
    while job.is_active():
        time.sleep(5)
        print(".")
        job = processor.get_task_status(job)

    print("Status = {}".format(job.Status))

    if job.Status != "Completed":
        print("Error processing task")
        return

    # A completed task without a download URL produces no output and no message.
    if job.DownloadUrl is None:
        return

    processor.download_result(job, result_file_path)
    print("Result was written to {}".format(result_file_path))
def main():
    """Batch entry point: process every file under ``./input`` into ``./output``.

    Options come from the command line (``create_parser``). Each input file is
    passed to ``process_file`` together with an output path built from the
    file's base name and the extension for the chosen output format
    (always the ``xml`` extension for the ``processTextField`` operation).
    Files whose base name starts with ``.`` are skipped.
    """
    global processor
    processor = AbbyyOnlineSdk()

    setup_processor()

    args = create_parser().parse_args()

    options = {
        "language": args.language,
        "operation": args.operation,
        "outputFormat": args.format,
        "textType": args.textType,
    }

    input_folder = os.path.join(os.getcwd(), 'input')
    output_folder = os.path.join(os.getcwd(), 'output')
    extension = format_extension[options['outputFormat']]
    if options['operation'] == 'processTextField':
        extension = format_extension['xml']

    for root, _dirs, file_names in os.walk(input_folder):
        for file_name in file_names:
            base_name = os.path.splitext(file_name)[0]
            if base_name.startswith('.'):
                continue

            # os.walk reports file names relative to `root`, not to the top
            # folder; joining with `input_folder` pointed at the wrong path
            # for anything inside a subdirectory.
            input_path = os.path.join(root, file_name)
            output_path = os.path.join(
                output_folder, '.'.join((base_name, extension)))

            if os.path.isfile(input_path):
                process_file(options, input_path, output_path)
            else:
                print("No such file: {}".format(input_path))
示例#8
0
# Usage: process.py <input file> <output file> [-language <Language>] [-pdf|-txt|-rtf|-docx|-xml]

import argparse
import base64
import getopt
import MultipartPostHandler
import os
import re
import sys
import time
import urllib2
import urllib

from AbbyyOnlineSdk import *

processor = AbbyyOnlineSdk()

# Credentials: the ABBYY_APPID / ABBYY_PWD environment variables take
# precedence; the literals are only the fallback when a variable is unset.
# NOTE(review): credentials should not be kept in source; prefer the env vars.
processor.ApplicationId = os.environ.get("ABBYY_APPID", "frameConverter")
processor.Password = os.environ.get("ABBYY_PWD", "******")

# Proxy settings
if "http_proxy" in os.environ:
    proxyString = os.environ["http_proxy"]
    print("Using proxy at %s" % proxyString)
    # The proxy used to be announced but never installed on the client.
    processor.Proxy = urllib2.ProxyHandler({"http": proxyString})
示例#9
0
# Usage: process.py <input file> <output file> [-language <Language>] [-pdf|-txt|-rtf|-docx|-xml]

import argparse
import base64
import getopt
import MultipartPostHandler
import os
import re
import sys
import time
import urllib2
import urllib

from AbbyyOnlineSdk import *

processor = AbbyyOnlineSdk()

# Application id: applied only when ABBYY_APPID is present in the environment.
try:
    processor.ApplicationId = os.environ["ABBYY_APPID"]
except KeyError:
    pass

# Password: applied only when ABBYY_PWD is present in the environment.
try:
    processor.Password = os.environ["ABBYY_PWD"]
except KeyError:
    pass

# Proxy settings: route HTTP traffic through http_proxy when it is defined.
try:
    proxyString = os.environ["http_proxy"]
except KeyError:
    pass
else:
    print("Using proxy at %s" % proxyString)
    processor.Proxy = urllib2.ProxyHandler({"http": proxyString})


# Recognize a file at filePath and save result to resultFilePath
示例#10
0
import argparse
import base64
import getopt
import MultipartPostHandler
import os
import re
import sys
import time
import urllib2
import urllib

from AbbyyOnlineSdk import *


processor = AbbyyOnlineSdk()

# Credentials: the ABBYY_APPID / ABBYY_PWD environment variables take
# precedence; the literals are only the fallback when a variable is unset.
# NOTE(review): credentials should not be kept in source; prefer the env vars.
processor.ApplicationId = os.environ.get(
    "ABBYY_APPID", 'Email Signature Finder and Parser')
processor.Password = os.environ.get("ABBYY_PWD", '******')

# Proxy settings
if "http_proxy" in os.environ:
    proxyString = os.environ["http_proxy"]
    print("Using proxy at %s" % proxyString)
    processor.Proxy = urllib2.ProxyHandler({"http": proxyString})


# Recognize a file at filePath and save result to resultFilePath
示例#11
0
class AbbyyPdfTextExtractor:
    """Send per-page PDF files through the OCR service and store the text.

    Pages are read from ``<indir>/<page>.pdf`` and the recognized text is
    written to ``<outdir>/<page>.txt``; ``extractPages`` then converts the
    newlines in each output file to ``<br />``.
    """

    logger = ProcessLogger.getLogger('Abbyy')

    def __init__(self, indir, outdir, pages, language):
        """Create the OCR client and make sure the output directory exists.

        Args:
            indir: directory containing the ``<page>.pdf`` inputs.
            outdir: directory receiving the ``<page>.txt`` outputs; created
                when missing.
            pages: number of pages; pages ``1..pages`` are processed.
            language: recognition language passed to the service.
        """
        self.processor = AbbyyOnlineSdk()
        # Credentials start blank; see setApplicationCredentials().
        self.processor.ApplicationId = ""
        self.processor.Password = ""
        self.outputFormat = 'txt'
        self.language = language
        self.indir = indir
        self.pages = pages
        self.outdir = outdir
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)

    def setApplicationCredentials(self, appid, password):
        """Set the application id and password on the OCR client."""
        self.processor.ApplicationId = appid
        self.processor.Password = password

    def processPdfPage(self, page):
        """Recognize ``<indir>/<page>.pdf`` into ``<outdir>/<page>.txt``.

        Input PDFs are named ``1.pdf``, ``2.pdf``, ... after their page number.

        Returns:
            True when the output file was written (either the recognized text
            or an "Error in processing" message), False when nothing was
            written for this page.
        """
        infile = os.path.join(self.indir, "%d.pdf" % page)
        outfile = os.path.join(self.outdir, "%d.txt" % page)
        settings = ProcessingSettings()
        settings.Language = self.language
        settings.OutputFormat = self.outputFormat
        self.logger.info('Processing %s', infile)
        task = self.processor.ProcessImage(infile, settings)
        if task is None:
            self.logger.error('Error in getting task')
            return False
        self.logger.info('Task Id: %s, status %s', task.Id, task.Status)

        # Wait for the task to be completed.
        # Note: it's recommended that your application waits at least 2 seconds
        # before making the first getTaskStatus request and also between such requests
        # for the same task. Making requests more often will not improve your
        # application performance.
        # Note: if your application queues several files and waits for them
        # it's recommended that you use listFinishedTasks instead (which is described
        # at http://ocrsdk.com/documentation/apireference/listFinishedTasks/).
        while task.IsActive():
            time.sleep(5)
            sys.stdout.write(".")
            sys.stdout.flush()  # make the progress dot visible immediately
            task = self.processor.GetTaskStatus(task)

        self.logger.info('Task Status: %s', task.Status)

        if task.Status == "Completed":
            if task.DownloadUrl is None:
                # Nothing to download, so no output file exists for this page.
                self.logger.error('Task completed without a download URL')
                return False
            self.processor.DownloadResult(task, outfile)
            self.logger.info('Result written to %s', outfile)
            return True

        # Failed task: record the status in the output file instead.
        with open(outfile, 'w') as op:
            op.write("Error in processing: %s" % task.Status)
        self.logger.error('Error processing task')
        return True

    def extractPages(self):
        """Process pages ``1..self.pages`` and convert newlines to ``<br />``.

        Pages for which no output file was produced are logged and skipped;
        previously they either raised on the missing file or re-converted a
        stale file left over from an earlier run.
        """
        for page in range(1, self.pages + 1):
            written = self.processPdfPage(page)
            outputFileName = os.path.join(self.outdir, str(page) + ".txt")
            # `is False` keeps overrides that still return None working.
            if written is False or not os.path.isfile(outputFileName):
                self.logger.error('No output produced for page %s', page)
                continue
            with open(outputFileName, 'r') as infile:
                content = infile.read()
            with open(outputFileName, 'w') as outfile:
                outfile.write(self.nl2br(content))

    def nl2br(self, s):
        """Return ``s`` with ``<br />`` inserted before every newline."""
        return '<br />\n'.join(s.split('\n'))
示例#12
0
import argparse
import base64
import getopt
import MultipartPostHandler
import os
import re
import sys
import time
import urllib2
import urllib

from AbbyyOnlineSdk import *


processor = AbbyyOnlineSdk()

# Credentials: the ABBYY_APPID / ABBYY_PWD environment variables take
# precedence; the literals are only the fallback when a variable is unset.
# NOTE(review): credentials should not be kept in source; prefer the env vars.
processor.ApplicationId = os.environ.get("ABBYY_APPID", "frameConverter")
processor.Password = os.environ.get("ABBYY_PWD", "******")

# Proxy settings
if "http_proxy" in os.environ:
    proxyString = os.environ["http_proxy"]
    print("Using proxy at %s" % proxyString)
    # The proxy used to be announced but never installed on the client.
    processor.Proxy = urllib2.ProxyHandler({"http": proxyString})
示例#13
0
文件: OCR.py 项目: armant/babeml
import argparse
import base64
import getopt
import MultipartPostHandler
import os
import re
import sys
import time
import urllib2
import urllib

from AbbyyOnlineSdk import *


processor = AbbyyOnlineSdk()

# Credentials: the ABBYY_APPID / ABBYY_PWD environment variables take
# precedence; the literals are only the fallback when a variable is unset.
# NOTE(review): credentials should not be kept in source; prefer the env vars.
processor.ApplicationId = os.environ.get("ABBYY_APPID", 'babe(m)l')
processor.Password = os.environ.get("ABBYY_PWD", '******')

# Proxy settings
if "http_proxy" in os.environ:
    proxyString = os.environ["http_proxy"]
    print("Using proxy at %s" % proxyString)
    processor.Proxy = urllib2.ProxyHandler({"http": proxyString})
import AbbyyOnlineSdk as abbyy
import local_settings as ls
# Re-run the local_settings module so edits to it are picked up without
# restarting the session.
reload(ls)

# <codecell>

# Locate the document folder inside the repository named by local_settings.
repo_dir = ls.repo_dir
doc_dir = os.path.join(repo_dir, 'download_PFDs')

# <codecell>

# Names of all entries in the document folder.
doc_files = os.listdir(doc_dir)

# <codecell>

# OCR client from the AbbyyOnlineSdk module (imported as `abbyy`).
processor = abbyy.AbbyyOnlineSdk()

# <codecell>

# Credentials are read from local_settings rather than written here.
processor.ApplicationId = ls.app_id
processor.Password = ls.app_password

# <codecell>

settings = abbyy.ProcessingSettings()

# <codecell>

# Bare expression: the value is discarded when run as a script.
# NOTE(review): presumably kept to display the default language in the
# notebook; confirm before removing.
settings.Language

# <codecell>
示例#15
0
# Ids of the form inputs, in the order the OCR'd values are filled in.
_FORM_FIELD_IDS = (
    "ctl00_ContentPlaceHolder1_txt_tbc",
    "ctl00_ContentPlaceHolder1_txt_name",
    "ctl00_ContentPlaceHolder1_txt_email",
    "ctl00_ContentPlaceHolder1_txt_mobno",
    "ctl00_ContentPlaceHolder1_txt_gender",
    "ctl00_ContentPlaceHolder1_txt_licenceno",
    "ctl00_ContentPlaceHolder1_txt_girno",
    "ctl00_ContentPlaceHolder1_txt_panno",
    "ctl00_ContentPlaceHolder1_txt_Hadd",
    "ctl00_ContentPlaceHolder1_txt_Hcity",
    "ctl00_ContentPlaceHolder1_txt_Hpin",
    "ctl00_ContentPlaceHolder1_txt_HState",
    "ctl00_ContentPlaceHolder1_txt_Oadd",
    "ctl00_ContentPlaceHolder1_txt_Ocity",
    "ctl00_ContentPlaceHolder1_txt_Opincode",
    "ctl00_ContentPlaceHolder1_txt_loanapproval",
    "ctl00_ContentPlaceHolder1_txt_menno",
    "ctl00_ContentPlaceHolder1_txt_af",
    "ctl00_ContentPlaceHolder1_txt_nri",
    "ctl00_ContentPlaceHolder1_txt_cp",
)


def _parse_ocr_fields(text_lines):
    """Split the OCR'd lines on "-" and return (segment count, non-empty values)."""
    joined = ''.join(text_lines)
    # Byte string in, ASCII out: escape sequences are interpreted and any
    # non-ASCII characters are dropped.
    ascii_text = joined.decode('unicode_escape').encode('ascii', 'ignore')
    segments = [re.sub('       ', '', part) for part in str(ascii_text).split("-")]
    segments = [re.sub('\n', '', part) for part in segments]

    values = []
    for index, part in enumerate(segments):
        if index == 14:
            # Segment 14 holds several space-separated values.
            values.extend(part.split(" "))
        else:
            values.append(part)
    return len(segments), [value for value in values if value != '']


def _append_error_log(header, page_line, values):
    """Append one "page not filled" record to error_seethatosee.txt."""
    with open("error_seethatosee.txt", "a") as log:
        log.write(header)
        log.write(page_line)
        log.write("\n")
        log.write(str(len(values)) + "\n")
        log.write("\n")
        log.write(str(values) + "\n")
        log.write("Please Check the above page because it was not filled \n ")


def main():
    """Log in, then for up to 150 pages: OCR the page image and fill the form.

    For each page the image is downloaded, colour-inverted, recognized into
    ``result.txt`` and, when the result has exactly three lines, split into
    values that are typed into the form fields and submitted. Pages that
    cannot be filled are recorded in ``error_seethatosee.txt``.
    """
    global processor

    # ------------------------------------ GLOBAL ------------------------------------#
    browser = Browser('chrome')
    url = "https://goo.gl/gTWejF"
    browser.visit(url)

    # ------------------------------------ LOGIN -------------------------------------#
    # NOTE(review): login credentials are hard-coded; move them out of source.
    browser.find_by_id("ctl00_ContentPlaceHolder1_txt_Uname").fill(
        "C31052018427")  # user name
    browser.find_by_id("ctl00_ContentPlaceHolder1_txt_pass").fill(
        "5SSGS")  # password
    browser.find_by_id(
        "ctl00_ContentPlaceHolder1_btnsubmit").click()  # submit to log in

    # ------------------------------------ FORM FILLING ------------------------------#
    page_options_xpath = '//*[@id="ctl00_ContentPlaceHolder1_drp_pagejump"]/option'
    no_box = browser.find_by_xpath(page_options_xpath)
    print("#----------------------START HERE -----------------------------------#")

    # Wall-clock start of the run. time.clock() measured CPU time on Unix
    # (and no longer exists in current Python), which is meaningless for a
    # script that mostly sleeps and waits on the network.
    tic = time.time()
    print("#------------------ INFORMATION ON THE PAGE ARE : -----------------#")
    print("THE LENGTH OF THE TOTAL PAGES ARE: ")
    print(len(no_box))

    source_file = 'final.png'
    target_file = 'result.txt'
    language = 'English'
    output_format = 'txt'

    for page_index in range(150):
        # Re-query the page selector on every iteration.
        no_box = browser.find_by_xpath(page_options_xpath)
        if page_index >= len(no_box):
            # Fewer pages than the fixed limit: stop instead of raising IndexError.
            break

        no_box[page_index].click()
        time.sleep(1)

        # -------------------------------- DOWNLOAD AND INVERT THE PAGE IMAGE --------#
        image_url = browser.find_by_id(
            'ctl00_ContentPlaceHolder1_MainImg')['src']
        urllib.urlretrieve(image_url, "locol1.png")
        time.sleep(0.5)
        image = Image.open('locol1.png')
        inverted_image = PIL.ImageOps.invert(image)
        time.sleep(0.5)
        inverted_image.save(source_file)
        time.sleep(0.5)

        processor = AbbyyOnlineSdk()

        setup_processor()

        # Remove the previous page's result so a failed recognition cannot
        # cause the previous page's text to be typed into this page.
        if os.path.exists(target_file):
            os.remove(target_file)

        if os.path.isfile(source_file):
            recognize_file(source_file, target_file, language, output_format)
        else:
            print("No such file: {}".format(source_file))

        text = []
        if os.path.isfile(target_file):
            with open(target_file, 'r') as res:
                text = res.readlines()
        lin_cnt = text.count("\n")
        print(lin_cnt)
        num_lines = len(text)
        print(num_lines)

        # Reset per page so the error records below never reference an
        # undefined name or the previous page's values.
        last_arr = []

        if num_lines == 3:
            segment_count, last_arr = _parse_ocr_fields(text)
            time.sleep(1)
            print(segment_count)
            print(last_arr)
            print(len(last_arr))
            print("The page that is filling now is  :  " + str(page_index + 1))
            print("----------Filling starts from here-------------------")
            try:
                time.sleep(1)
                for position, field_id in enumerate(_FORM_FIELD_IDS):
                    browser.find_by_id(field_id).fill(
                        last_arr[position].strip())
                    time.sleep(1)

                # ---------------------------- SUBMISSION ------------------------#
                browser.find_by_id(
                    "ctl00_ContentPlaceHolder1_btnsubmit").click()  # submit
                toc = time.time()
                elapsed = toc - tic  # was tic - toc, i.e. always negative
                print("")
                print("THE TIME TAKEN TO COMPLETE THE FORM IS : ")
                print(elapsed)
                print("#------------------------ PAGE COMPLETED SUCCESSFULLY--------------------------#")
                time.sleep(1)
            except Exception:
                # Too few values or a browser failure: record the page and go
                # on. KeyboardInterrupt is no longer swallowed.
                print(
                    "------------THERe WAS AN ERROR SO PLEASE CHECK THE PAGE FOR THE ERROR IN THE ERROR LOG---------"
                )
                _append_error_log(
                    "--------------LESS ELEMENT ERROR (CASE 1)-------------------- \n ",
                    "----->  " + "       " + str(page_index + 1) + "     " +
                    "<---THis is the page that is not filled",
                    last_arr)
        else:
            print("--------------------THERE IS AN ERROR CHECK FOR THE ERROR---------------")
            _append_error_log(
                "-----------------------LESS COMPONENT ERROR (CASE 2)-------------------- \n ",
                str(page_index + 1),
                last_arr)
示例#16
0
import argparse
import base64
import getopt
import MultipartPostHandler
import os
import re
import sys
import time
import urllib2
import urllib

from AbbyyOnlineSdk import *


processor = AbbyyOnlineSdk()

# Application id: applied only when ABBYY_APPID is present in the environment.
try:
    processor.ApplicationId = os.environ["ABBYY_APPID"]
except KeyError:
    pass

# Password: applied only when ABBYY_PWD is present in the environment.
try:
    processor.Password = os.environ["ABBYY_PWD"]
except KeyError:
    pass

# Proxy settings: route HTTP traffic through http_proxy when it is defined.
try:
    proxyString = os.environ["http_proxy"]
except KeyError:
    pass
else:
    print("Using proxy at %s" % proxyString)
    processor.Proxy = urllib2.ProxyHandler({"http": proxyString})


# Recognize a file at filePath and save result to resultFilePath
示例#17
0
# -*- coding: utf-8 -*-
import argparse
import os
import time

from AbbyyOnlineSdk import *

# Module-level SDK client. The Pdf2Excel methods below refer to this bare
# name `processor`; the class also defines its own `processor` attribute.
processor = AbbyyOnlineSdk()

class Pdf2Excel(object):
    """
    build ABBYY OCR into a Pdf2Excel apckage,
    Environment Demands: argparse, AbbyyOnlineSdk
    """
    
    processor = AbbyyOnlineSdk()

    def setup_processor(self):
    	if "ABBYY_APPID" in os.environ:
    		processor.ApplicationId = os.environ["ABBYY_APPID"]

    	if "ABBYY_PWD" in os.environ:
    		processor.Password = os.environ["ABBYY_PWD"]

    	# Proxy settings
    	if "http_proxy" in os.environ:
    		proxy_string = os.environ["http_proxy"]
    		print("Using http proxy at {}".format(proxy_string))
    		processor.Proxies["http"] = proxy_string

    	if "https_proxy" in os.environ:
示例#18
0
# Usage: process.py <input file> <output file> [-language <Language>] [-pdf|-txt|-rtf|-docx|-xml]

import argparse
import base64
import getopt
import MultipartPostHandler
import os
import re
import sys
import time
import urllib2
import urllib

from AbbyyOnlineSdk import *

processor = AbbyyOnlineSdk()

# Credentials: the ABBYY_APPID / ABBYY_PWD environment variables take
# precedence; the literals are only the fallback when a variable is unset.
# NOTE(review): credentials should not be kept in source; prefer the env vars.
processor.ApplicationId = os.environ.get(
    "ABBYY_APPID", 'Email Signature Finder and Parser')
processor.Password = os.environ.get("ABBYY_PWD", '******')

# Proxy settings
if "http_proxy" in os.environ:
    proxyString = os.environ["http_proxy"]
    print("Using proxy at %s" % proxyString)
    processor.Proxy = urllib2.ProxyHandler({"http": proxyString})


# Recognize a file at filePath and save result to resultFilePath
示例#19
0
class Pdf2Excel(object):
    """Package the ABBYY OCR sample script as a PDF-to-spreadsheet converter.

    Environment demands: ``argparse`` and ``AbbyyOnlineSdk``.

    Constructing an instance is interactive: it prompts for the PDF path on
    stdin and runs the recognition immediately.
    """

    # SDK client shared by all instances; configured by setup_processor().
    processor = AbbyyOnlineSdk()

    # The helper methods below were copied from the sample's process.py so the
    # class can be used without `from process import *`.
    def __init__(self, result="myworkbook.xlsx", Language="English", typefile="xlsx"):
        """Prompt for a PDF path and recognize it into ``result``.

        Args:
            result: path of the output file.
            Language: recognition language; see
                https://ocrsdk.com/documentation/specifications/recognition-languages/
            typefile: export format; see
                https://ocrsdk.com/documentation/specifications/export-formats/
        """
        self.setup_processor()

        path = input("Your Pdf file path:  like'C:/data/1.pdf'")
        start = time.time()
        # Simplified Chinese is "ChinesePRC" and Traditional Chinese is
        # "ChineseTaiwan"; how to configure mixed Chinese/English input is
        # still an open question.
        self.recognize_file(path, result, Language, typefile)
        T1 = time.time() - start
        print('Total time spend %d seconds for 22 page pdf' % T1)

    def setup_processor(self):
        """Configure the shared client from the environment.

        Reads ``ABBYY_APPID`` and ``ABBYY_PWD`` for the credentials and
        ``http_proxy`` / ``https_proxy`` for proxies; unset variables leave
        the corresponding setting untouched.
        """
        # Use the class's own client; the bare name `processor` only resolved
        # when a module-level variable of that name happened to exist.
        client = self.processor

        if "ABBYY_APPID" in os.environ:
            client.ApplicationId = os.environ["ABBYY_APPID"]

        if "ABBYY_PWD" in os.environ:
            client.Password = os.environ["ABBYY_PWD"]

        # Proxy settings
        if "http_proxy" in os.environ:
            proxy_string = os.environ["http_proxy"]
            print("Using http proxy at {}".format(proxy_string))
            client.Proxies["http"] = proxy_string

        if "https_proxy" in os.environ:
            proxy_string = os.environ["https_proxy"]
            print("Using https proxy at {}".format(proxy_string))
            client.Proxies["https"] = proxy_string

    def recognize_file(self, file_path, result_file_path, language, output_format):
        """Recognize the file at ``file_path`` and save the result.

        Args:
            file_path: path of the file to upload.
            result_file_path: where the recognition result is saved.
            language: assigned to the processing settings' ``Language``.
            output_format: assigned to the processing settings' ``OutputFormat``.

        Returns:
            None. Progress and failures are printed.
        """
        client = self.processor

        print("Uploading..")
        settings = ProcessingSettings()
        settings.Language = language
        settings.OutputFormat = output_format
        task = client.process_image(file_path, settings)
        if task is None:
            print("Error")
            return
        if task.Status == "NotEnoughCredits":
            print("Not enough credits to process the document. Please add more pages to your application's account.")
            return

        print("Id = {}".format(task.Id))
        print("Status = {}".format(task.Status))

        # Wait for the task to be completed
        print("Waiting..")
        # Note: it's recommended that your application waits at least 2 seconds
        # before making the first getTaskStatus request and also between such requests
        # for the same task. Making requests more often will not improve your
        # application performance.
        # Note: if your application queues several files and waits for them
        # it's recommended that you use listFinishedTasks instead (which is described
        # at http://ocrsdk.com/documentation/apireference/listFinishedTasks/).

        while task.is_active():
            time.sleep(5)
            print(".")
            task = client.get_task_status(task)

        print("Status = {}".format(task.Status))

        if task.Status == "Completed":
            if task.DownloadUrl is not None:
                client.download_result(task, result_file_path)
                print("Result was written to {}".format(result_file_path))
        else:
            print("Error processing task")

    @staticmethod
    def create_parser():
        """Build the command-line parser of the original sample script.

        Declared static because it takes no ``self``; without the decorator it
        could not be called on an instance.
        """
        parser = argparse.ArgumentParser(description="Recognize a file via web service")
        parser.add_argument('source_file')
        parser.add_argument('target_file')

        parser.add_argument('-l', '--language', default='English', help='Recognition language (default: %(default)s)')
        group = parser.add_mutually_exclusive_group()
        group.add_argument('-txt', action='store_const', const='txt', dest='format', default='txt')
        group.add_argument('-pdf', action='store_const', const='pdfSearchable', dest='format')
        group.add_argument('-rtf', action='store_const', const='rtf', dest='format')
        group.add_argument('-docx', action='store_const', const='docx', dest='format')
        group.add_argument('-xml', action='store_const', const='xml', dest='format')

        return parser
示例#20
0
#!/usr/bin/python
import os
import sys
import json
from lxml import etree
import dateutil.parser
from AbbyyOnlineSdk import *  

def getconfig(cfg):
    """Return the value of the first ``config.txt`` line that starts with ``cfg``.

    ``config.txt`` is looked up in the directory containing this module (the
    current directory when the module path has no directory part) and is
    expected to hold ``KEY=value`` lines.

    Args:
        cfg: text the wanted line starts with, normally the full key name.

    Returns:
        Everything after the first ``=`` on the matching line, with the
        line's surrounding whitespace removed. Values may contain ``=``.

    Raises:
        IndexError: if no line starts with ``cfg`` or the matching line has
            no ``=``.
    """
    config_path = (os.path.dirname(__file__) or ".") + "/config.txt"
    # `with` closes the file; it used to be left open.
    with open(config_path) as config_file:
        for line in config_file:
            if not line.startswith(cfg):
                continue
            # Split on the first "=" only, so a value that itself contains
            # "=" is no longer cut short.
            _key, separator, value = line.strip().partition("=")
            if not separator:
                raise IndexError("config.txt entry for %r has no '='" % (cfg,))
            return value
    raise IndexError("no entry starting with %r in config.txt" % (cfg,))

processor = AbbyyOnlineSdk()
# Environment variables win; config.txt is consulted only when a variable is
# unset or empty.
processor.ApplicationId = os.environ.get("ABBYY_APPID") or getconfig("ABBYY_APPID")
processor.Password = os.environ.get("ABBYY_PWD") or getconfig("ABBYY_PWD")

class OCR:
    def __init__(self, imageid):
        self.runocr(imageid)
        self.lines = self.process(etree.parse(open(imageid + ".xml")).getroot())

    def runocr(self, imageid):
        if os.path.exists(imageid + ".xml"):
            return
	settings = ProcessingSettings()
	settings.Language = "English"
	settings.OutputFormat = "xml"
	settings.ImageSource = "photo"
	settings.profile = "textExtraction"
	task = processor.ProcessImage(imageid+".jpg", settings)
	if task == None:
		return
示例#21
0
# Usage: process.py <input file> <output file> [-language <Language>] [-pdf|-txt|-rtf|-docx|-xml]

import argparse
import base64
import getopt
import MultipartPostHandler
import os
import re
import sys
import time
import urllib2
import urllib

from AbbyyOnlineSdk import *

processor = AbbyyOnlineSdk()

# Credentials come from the ABBYY_APPID / ABBYY_PWD environment variables.
# The application id and password had been pasted in as the *names* of the
# variables to look up, so they were never applied to the client and the
# secret was exposed in source.
# NOTE(review): the password that was committed here should be rotated.
if "ABBYY_APPID" in os.environ:
    processor.ApplicationId = os.environ["ABBYY_APPID"]

if "ABBYY_PWD" in os.environ:
    processor.Password = os.environ["ABBYY_PWD"]

# Proxy settings
if "http_proxy" in os.environ:
    proxyString = os.environ["http_proxy"]
    print("Using proxy at %s" % proxyString)
    processor.Proxy = urllib2.ProxyHandler({"http": proxyString})


# Recognize a file at filePath and save result to resultFilePath