def main(image_file, output_file):
    """Run plain-text recognition on ``image_file`` and save it to ``output_file``.

    Rebinds the module-level ``processor`` to a fresh ``AbbyyOnlineSdk`` and
    configures it through ``setup_processor()`` before doing anything else.
    When ``image_file`` is not an existing file, a message is printed and
    nothing is recognized.
    """
    global processor
    processor = AbbyyOnlineSdk()
    setup_processor()

    # Guard clause: bail out early when there is nothing to recognize.
    if not os.path.isfile(image_file):
        print("No such file: {}".format(image_file))
        return

    # Language is passed as an empty string and the output format is "txt".
    recognize_file(image_file, output_file, "", "txt")
def __init__(self, indir, outdir, pages, language):
    """Store the job parameters and make sure the output directory exists.

    A new ``AbbyyOnlineSdk`` client is created with an empty application id
    and an empty password; the output format is fixed to ``'txt'``.
    ``outdir`` is created (including parents) when it does not exist yet.
    """
    client = AbbyyOnlineSdk()
    client.ApplicationId = ""
    client.Password = ""
    self.processor = client

    self.outputFormat = 'txt'
    self.language = language
    self.indir = indir
    self.pages = pages
    self.outdir = outdir

    output_dir_missing = not os.path.exists(self.outdir)
    if output_dir_missing:
        os.makedirs(self.outdir)
def main_process(src, lang='English', output_format='txt'):
    """Recognize the image at ``src`` and write the result next to a derived path.

    The destination path is ``src`` with every ``'.jpg'`` replaced by
    ``'.txt'`` and every ``'images'`` replaced by ``'dst'``.

    Returns ``True`` when ``src`` exists and recognition was started,
    ``False`` (after printing a message) when ``src`` is not a file.
    """
    global processor
    processor = AbbyyOnlineSdk.AbbyyOnlineSdk()
    setup_processor()

    # Derive the destination from the source path by plain substring replacement.
    dst = src.replace('.jpg', '.txt').replace('images', 'dst')

    if not os.path.isfile(src):
        print("No such file: {}".format(src))
        return False

    recognize_file(src, dst, lang, output_format)
    return True
def recognizeFile(filePath, resultFilePath, language, outputFormat):
    """Upload ``filePath`` for recognition and save the result to ``resultFilePath``.

    A fresh ``AbbyyOnlineSdk`` client is configured from the environment:
    ``ABBYY_APPID`` / ``ABBYY_PWD`` supply the credentials when present and
    ``http_proxy`` enables an HTTP proxy handler.

    Parameters:
        filePath: path of the image/document handed to ``ProcessImage``.
        resultFilePath: path handed to ``DownloadResult`` for the output.
        language: value assigned to ``ProcessingSettings.Language``.
        outputFormat: value assigned to ``ProcessingSettings.OutputFormat``.

    Returns ``None`` in every case; progress and errors are reported on stdout.
    """
    processor = AbbyyOnlineSdk()

    if "ABBYY_APPID" in os.environ:
        processor.ApplicationId = os.environ["ABBYY_APPID"]
    if "ABBYY_PWD" in os.environ:
        processor.Password = os.environ["ABBYY_PWD"]

    # Proxy settings
    if "http_proxy" in os.environ:
        proxyString = os.environ["http_proxy"]
        print("Using proxy at %s" % proxyString)
        processor.Proxy = urllib2.ProxyHandler({"http": proxyString})

    print("Uploading..")
    settings = ProcessingSettings()
    settings.Language = language
    settings.OutputFormat = outputFormat

    task = processor.ProcessImage(filePath, settings)
    if task is None:
        print("Error")
        return

    print("Id = %s" % task.Id)
    print("Status = %s" % task.Status)

    # Wait for the task to be completed.
    sys.stdout.write("Waiting..")
    sys.stdout.flush()  # make the progress indicator visible immediately

    # Note: it's recommended that your application waits at least 2 seconds
    # before making the first getTaskStatus request and also between such
    # requests for the same task. Making requests more often will not improve
    # your application performance.
    # Note: if your application queues several files and waits for them it's
    # recommended that you use listFinishedTasks instead (which is described
    # at http://ocrsdk.com/documentation/apireference/listFinishedTasks/).
    while task.IsActive():
        time.sleep(5)
        sys.stdout.write(".")
        sys.stdout.flush()
        task = processor.GetTaskStatus(task)

    print("Status = %s" % task.Status)

    if task.Status == "Completed":
        # A completed task without a download URL is silently skipped,
        # exactly as before.
        if task.DownloadUrl is not None:
            processor.DownloadResult(task, resultFilePath)
            print("Result was written to %s" % resultFilePath)
    else:
        print("Error processing task")
def main():
    """Command-line entry point: recognize one file named on the command line.

    Rebinds the module-level ``processor``, configures it via
    ``setup_processor()``, parses the arguments with ``create_parser()`` and
    forwards source, target, language and format to ``recognize_file``.
    Prints a message instead when the source file does not exist.
    """
    global processor
    processor = AbbyyOnlineSdk()
    setup_processor()

    args = create_parser().parse_args()
    source, target, lang, fmt = (
        args.source_file,
        args.target_file,
        args.language,
        args.format,
    )

    if not os.path.isfile(source):
        print("No such file: {}".format(source))
        return

    recognize_file(source, target, lang, fmt)
def main():
    """Process every file found under ``./input`` and write results to ``./output``.

    Rebinds the module-level ``processor``, configures it via
    ``setup_processor()`` and builds the ``options`` dict (``language``,
    ``operation``, ``outputFormat``, ``textType``) from the parsed command
    line. The output extension is looked up in ``format_extension`` by the
    chosen output format, except that the ``processTextField`` operation
    always uses the ``xml`` extension.

    Files whose name starts with ``'.'`` are skipped. Each remaining file is
    handed to ``process_file`` together with an output path of the form
    ``<output>/<stem>.<extension>``.
    """
    global processor
    processor = AbbyyOnlineSdk()
    setup_processor()

    args = create_parser().parse_args()
    options = {
        "language": args.language,
        "operation": args.operation,
        "outputFormat": args.format,
        "textType": args.textType,
    }

    input_folder = os.path.join(os.getcwd(), 'input')
    output_folder = os.path.join(os.getcwd(), 'output')

    extension = format_extension[options['outputFormat']]
    if options['operation'] == 'processTextField':
        extension = format_extension['xml']

    for root, _dirs, files in os.walk(input_folder):
        for file_name in files:
            stem = os.path.splitext(file_name)[0]
            if stem.startswith('.'):
                continue

            # os.walk yields names relative to ``root``, not to the top
            # folder; joining with ``input_folder`` made every file in a
            # sub-directory look like it did not exist.
            input_path = os.path.join(root, file_name)
            # Results stay in one flat output folder, as before, so
            # same-named files from different sub-directories share a path.
            output_path = os.path.join(output_folder,
                                       '.'.join((stem, extension)))

            if os.path.isfile(input_path):
                process_file(options, input_path, output_path)
            else:
                print("No such file: {}".format(input_path))
# Usage: process.py <input file> <output file> [-language <Language>] [-pdf|-txt|-rtf|-docx|-xml] import argparse import base64 import getopt import MultipartPostHandler import os import re import sys import time import urllib2 import urllib from AbbyyOnlineSdk import * processor = AbbyyOnlineSdk() #if "ABBYY_APPID" in os.environ: processor.ApplicationId = 'Email Signature Finder and Parser' #os.environ["ABBYY_APPID"] #if "ABBYY_PWD" in os.environ: processor.Password = '******' #os.environ["ABBYY_PWD"] # Proxy settings if "http_proxy" in os.environ: proxyString = os.environ["http_proxy"] print "Using proxy at %s" % proxyString processor.Proxy = urllib2.ProxyHandler({"http": proxyString}) # Recognize a file at filePath and save result to resultFilePath
class Pdf2Excel(object):
    """Convert a document (e.g. a PDF) to a spreadsheet via ABBYY Cloud OCR.

    Environment requirements: ``argparse`` and the ``AbbyyOnlineSdk`` module.

    Instantiating the class is the whole workflow: the constructor configures
    the client, prompts for the input path, runs recognition and prints the
    elapsed time.
    """

    # One client shared by all instances; configured by ``setup_processor``.
    processor = AbbyyOnlineSdk()

    def setup_processor(self):
        """Configure the shared client from environment variables.

        ``ABBYY_APPID`` / ``ABBYY_PWD`` set the credentials when present;
        ``http_proxy`` / ``https_proxy`` fill the matching ``Proxies`` entry.
        """
        # The client is a class attribute, so it must be reached through
        # ``self``; a bare ``processor`` is not visible from method scope.
        processor = self.processor

        if "ABBYY_APPID" in os.environ:
            processor.ApplicationId = os.environ["ABBYY_APPID"]
        if "ABBYY_PWD" in os.environ:
            processor.Password = os.environ["ABBYY_PWD"]

        # Proxy settings
        if "http_proxy" in os.environ:
            proxy_string = os.environ["http_proxy"]
            print("Using http proxy at {}".format(proxy_string))
            processor.Proxies["http"] = proxy_string

        if "https_proxy" in os.environ:
            proxy_string = os.environ["https_proxy"]
            print("Using https proxy at {}".format(proxy_string))
            processor.Proxies["https"] = proxy_string

    def recognize_file(self, file_path, result_file_path, language, output_format):
        """Recognize the file at ``file_path`` and save the result.

        Parameters:
            file_path: path handed to ``process_image``.
            result_file_path: path handed to ``download_result``.
            language: value assigned to ``ProcessingSettings.Language``.
            output_format: value assigned to ``ProcessingSettings.OutputFormat``.

        Returns ``None``; progress and errors are printed.
        """
        processor = self.processor

        print("Uploading..")
        settings = ProcessingSettings()
        settings.Language = language
        settings.OutputFormat = output_format

        task = processor.process_image(file_path, settings)
        if task is None:
            print("Error")
            return
        if task.Status == "NotEnoughCredits":
            print("Not enough credits to process the document. Please add more pages to your application's account.")
            return

        print("Id = {}".format(task.Id))
        print("Status = {}".format(task.Status))

        # Wait for the task to be completed.
        print("Waiting..")
        # Note: it's recommended that your application waits at least 2
        # seconds before making the first getTaskStatus request and also
        # between such requests for the same task. Making requests more often
        # will not improve your application performance.
        # Note: if your application queues several files and waits for them
        # it's recommended that you use listFinishedTasks instead (which is
        # described at
        # http://ocrsdk.com/documentation/apireference/listFinishedTasks/).
        while task.is_active():
            time.sleep(5)
            print(".")
            task = processor.get_task_status(task)

        print("Status = {}".format(task.Status))

        if task.Status == "Completed":
            if task.DownloadUrl is not None:
                processor.download_result(task, result_file_path)
                print("Result was written to {}".format(result_file_path))
        else:
            print("Error processing task")

    @staticmethod
    def create_parser():
        """Build the command-line parser (source, target, language, format).

        Takes no ``self``; declared static so it can be called on the class
        or on an instance.
        """
        parser = argparse.ArgumentParser(description="Recognize a file via web service")
        parser.add_argument('source_file')
        parser.add_argument('target_file')

        parser.add_argument('-l', '--language', default='English',
                            help='Recognition language (default: %(default)s)')

        group = parser.add_mutually_exclusive_group()
        group.add_argument('-txt', action='store_const', const='txt', dest='format', default='txt')
        group.add_argument('-pdf', action='store_const', const='pdfSearchable', dest='format')
        group.add_argument('-rtf', action='store_const', const='rtf', dest='format')
        group.add_argument('-docx', action='store_const', const='docx', dest='format')
        group.add_argument('-xml', action='store_const', const='xml', dest='format')

        return parser

    # ``from process import *`` alone would also work: it loads the functions
    # above, which were excerpted from process.py and run step by step.

    def __init__(self, result="myworkbook.xlsx", Language="English", typefile="xlsx"):
        """Prompt for an input path, recognize it and report the elapsed time.

        Language values are listed at
        https://ocrsdk.com/documentation/specifications/recognition-languages/
        and typefile (export format) values at
        https://ocrsdk.com/documentation/specifications/export-formats/

        Parameters:
            result: path of the output file.
            Language: recognition language passed to ``recognize_file``.
            typefile: output format passed to ``recognize_file``.
        """
        self.setup_processor()
        path = input("Your Pdf file path: like'C:/data/1.pdf'")
        start = time.time()
        # Simplified Chinese (ChinesePRC) and Traditional Chinese
        # (ChineseTaiwan) are recognized; how to configure mixed
        # Chinese/English input is still an open question.
        self.recognize_file(path, result, Language, typefile)
        elapsed = time.time() - start
        print('Total time spend %d seconds for 22 page pdf' % elapsed)
# Notebook-style script: each "# <codecell>" marker separates one cell.
# Sets up an ABBYY client from values stored in the local_settings module.
import AbbyyOnlineSdk as abbyy
import local_settings as ls
reload(ls)  # re-import so edits to local_settings are picked up in this session

# <codecell>

# Directory holding the documents, located inside the repository directory.
# NOTE(review): ``os`` is used here but not imported in the visible cells -
# presumably imported earlier; confirm.
repo_dir = ls.repo_dir
doc_dir = os.path.join(repo_dir, 'download_PFDs')

# <codecell>

# Names of all entries in the document directory.
doc_files = os.listdir(doc_dir)

# <codecell>

processor = abbyy.AbbyyOnlineSdk()

# <codecell>

# Credentials come from local_settings rather than being written here.
processor.ApplicationId = ls.app_id
processor.Password = ls.app_password

# <codecell>

settings = abbyy.ProcessingSettings()

# <codecell>

# Bare expression: has no effect in a script; presumably left from showing
# the value in a notebook cell.
settings.Language

# <codecell>
def main():
    """Log in to the data-entry site, OCR each page image and fill its form.

    For each page index 0..149 (stopping early when the page selector offers
    fewer options) the function:

    1. selects the page and downloads the page image to ``locol1.png``;
    2. saves a colour-inverted copy as ``final.png``;
    3. sends it through ``recognize_file`` into ``result.txt``;
    4. when the result has exactly three lines, splits the text on ``-`` into
       field values and types them into the form, then submits;
    5. otherwise, or when filling fails, appends a record to
       ``error_seethatosee.txt``.

    Rebinds the module-level ``processor`` on every page. Returns ``None``.
    """
    global processor

    # Ids of the form inputs, in the order the OCR'd values are consumed.
    field_ids = (
        "ctl00_ContentPlaceHolder1_txt_tbc",
        "ctl00_ContentPlaceHolder1_txt_name",
        "ctl00_ContentPlaceHolder1_txt_email",
        "ctl00_ContentPlaceHolder1_txt_mobno",
        "ctl00_ContentPlaceHolder1_txt_gender",
        "ctl00_ContentPlaceHolder1_txt_licenceno",
        "ctl00_ContentPlaceHolder1_txt_girno",
        "ctl00_ContentPlaceHolder1_txt_panno",
        "ctl00_ContentPlaceHolder1_txt_Hadd",
        "ctl00_ContentPlaceHolder1_txt_Hcity",
        "ctl00_ContentPlaceHolder1_txt_Hpin",
        "ctl00_ContentPlaceHolder1_txt_HState",
        "ctl00_ContentPlaceHolder1_txt_Oadd",
        "ctl00_ContentPlaceHolder1_txt_Ocity",
        "ctl00_ContentPlaceHolder1_txt_Opincode",
        "ctl00_ContentPlaceHolder1_txt_loanapproval",
        "ctl00_ContentPlaceHolder1_txt_menno",
        "ctl00_ContentPlaceHolder1_txt_af",
        "ctl00_ContentPlaceHolder1_txt_nri",
        "ctl00_ContentPlaceHolder1_txt_cp",
    )
    page_option_xpath = '//*[@id="ctl00_ContentPlaceHolder1_drp_pagejump"]/option'
    error_log_path = "error_seethatosee.txt"

    def log_unfilled_page(header, page_line, values):
        """Append one failure record to the error log."""
        with open(error_log_path, "a") as log:
            log.write(header)
            log.write(page_line)
            log.write("\n")
            log.write(str(len(values)) + "\n")
            log.write("\n")
            log.write(str(values) + "\n")
            log.write(
                "Please Check the above page because it was not filled \n ")

    browser = Browser('chrome')
    browser.visit("https://goo.gl/gTWejF")

    # ---- Login ----
    # NOTE(review): credentials are hard-coded in the source; consider
    # loading them from configuration or the environment.
    browser.find_by_id("ctl00_ContentPlaceHolder1_txt_Uname").fill(
        "C31052018427")
    browser.find_by_id("ctl00_ContentPlaceHolder1_txt_pass").fill("5SSGS")
    browser.find_by_id("ctl00_ContentPlaceHolder1_btnsubmit").click()

    # ---- Form filling ----
    no_box = browser.find_by_xpath(page_option_xpath)
    print("#----------------------START HERE -----------------------------------#")
    # Taken once, so the time reported after each form is cumulative.
    # NOTE(review): time.clock() is gone in Python 3.8+; this file targets
    # Python 2 (urllib.urlretrieve, str.decode), so it is kept.
    tic = time.clock()
    print("#------------------ INFORMATION ON THE PAGE ARE : -----------------#")
    print("THE LENGTH OF THE TOTAL PAGES ARE: ")
    print(len(no_box))

    page_index = 0
    while 0 <= page_index <= 149:
        # Re-query the options on every pass, as the original did.
        no_box = browser.find_by_xpath(page_option_xpath)
        if page_index >= len(no_box):
            # Fewer pages than the hard-coded limit: stop instead of
            # failing with an IndexError.
            break
        no_box[page_index].click()
        time.sleep(1)

        # ---- Download the page image and save an inverted copy ----
        image_url = browser.find_by_id(
            'ctl00_ContentPlaceHolder1_MainImg')['src']
        urllib.urlretrieve(image_url, "locol1.png")
        time.sleep(0.5)
        image = Image.open('locol1.png')
        inverted_image = PIL.ImageOps.invert(image)
        time.sleep(0.5)
        inverted_image.save('final.png')
        time.sleep(0.5)

        # ---- OCR the inverted image into result.txt ----
        processor = AbbyyOnlineSdk()
        setup_processor()
        source_file = 'final.png'
        target_file = 'result.txt'
        language = 'English'
        output_format = 'txt'
        if os.path.isfile(source_file):
            recognize_file(source_file, target_file, language, output_format)
        else:
            print("No such file: {}".format(source_file))

        with open('result.txt', 'r') as res:
            text = res.readlines()
        lin_cnt = text.count("\n")  # number of lines that are only a newline
        print(lin_cnt)
        # Same count the original obtained by re-opening the file, without
        # leaving a second handle open.
        num_lines = len(text)
        print(num_lines)

        # Start from an empty list on every page so that the CASE 2 record
        # never reports values left over from an earlier page (or fails
        # with a NameError on the first page).
        last_arr = []

        if num_lines == 3:
            joined = "".join(text[:num_lines])
            ascii_text = joined.decode('unicode_escape').encode(
                'ascii', 'ignore')
            pieces = str(ascii_text).split("-")
            final = [re.sub(' ', '', piece) for piece in pieces]
            time.sleep(1)
            print(len(final))
            final1 = [re.sub('\n', '', piece) for piece in final]

            flist1 = []
            for position, piece in enumerate(final1):
                if position == 14:
                    # NOTE(review): spaces were already removed above, so
                    # this split yields a single element - confirm intent.
                    flist1.extend(piece.split(" "))
                else:
                    flist1.append(piece)

            last_arr = [value for value in flist1 if value != '']
            print(last_arr)
            print(len(last_arr))
            print("The page that is filling now is : " + str(page_index + 1))
            print("----------Filling starts from here-------------------")
            try:
                time.sleep(1)
                # An IndexError here means the OCR produced too few values;
                # it is recorded as CASE 1 below.
                for position, field_id in enumerate(field_ids):
                    browser.find_by_id(field_id).fill(
                        last_arr[position].strip())
                    time.sleep(1)

                # ---- Submit ----
                browser.find_by_id(
                    "ctl00_ContentPlaceHolder1_btnsubmit").click()
                toc = time.clock()
                elapsed = toc - tic  # was tic - toc, i.e. always negative
                print("")
                print("THE TIME TAKEN TO COMPLETE THE FORM IS : ")
                print(elapsed)
                print("#------------------------ PAGE COMPLETED SUCCESSFULLY--------------------------#")
                time.sleep(1)
            except Exception:
                # Best effort: record the page and move on. Catching
                # ``Exception`` rather than everything lets Ctrl-C stop
                # the run.
                print(
                    "------------THERe WAS AN ERROR SO PLEASE CHECK THE PAGE FOR THE ERROR IN THE ERROR LOG---------"
                )
                log_unfilled_page(
                    "--------------LESS ELEMENT ERROR (CASE 1)-------------------- \n ",
                    "-----> " + " " + str(page_index + 1) + " " +
                    "<---THis is the page that is not filled",
                    last_arr)
        else:
            print("--------------------THERE IS AN ERROR CHECK FOR THE ERROR---------------")
            log_unfilled_page(
                "-----------------------LESS COMPONENT ERROR (CASE 2)-------------------- \n ",
                str(page_index + 1),
                last_arr)

        # Every outcome (success, CASE 1, CASE 2) advances to the next page.
        page_index = page_index + 1