def My_OCR(url):
    """Read a short code such as ``AB12-34CD`` from the image at *url*.

    The image is submitted to the Azure Computer Vision Read API using the
    ``OCR_SUBSCRIPTION_KEY`` and ``OCR_ENDPOINT`` environment variables, and
    the operation is polled once per second until it leaves the
    "notStarted"/"running" states.

    Only recognized lines of at most 8 characters are considered.  The first
    one that starts with 2-4 characters from ``[0-9A-Z]``, a ``.`` or ``-``,
    and 2-4 more such characters is returned with every ``.`` replaced by
    ``-``.  An empty string is returned when nothing qualifies or the
    operation did not succeed.
    """
    credentials = CognitiveServicesCredentials(os.getenv('OCR_SUBSCRIPTION_KEY'))
    client = ComputerVisionClient(os.getenv('OCR_ENDPOINT'), credentials)

    # The operation id is the last path segment of the Operation-Location header.
    submission = client.read(url, raw=True)
    operation_id = submission.headers["Operation-Location"].split("/")[-1]

    # Poll until the service reports a final state.
    pending_states = ["notStarted", "running"]
    outcome = client.get_read_result(operation_id)
    while outcome.status in pending_states:
        time.sleep(1)
        outcome = client.get_read_result(operation_id)

    succeeded = OperationStatusCodes.succeeded
    short_lines = []
    if outcome.status == succeeded:
        short_lines = [
            line.text
            for page in outcome.analyze_result.read_results
            for line in page.lines
            if len(line.text) <= 8
        ]

    code_pattern = re.compile("[0-9A-Z]{2,4}[.-]{1}[0-9A-Z]{2,4}")
    candidates = [candidate for candidate in short_lines if code_pattern.match(candidate)]
    if candidates:
        return candidates[0].replace('.', '-')
    return ""
class cos5year_vision:
    """Context-managed wrapper around the Azure Computer Vision Read API.

    The client is created in ``__enter__`` from the module-level ``endpoint``
    and ``subscription_key`` and dropped again in ``__exit__``, so the object
    is meant to be used as ``with cos5year_vision() as vision: ...``.
    """

    # Client instance; only set while inside the ``with`` block.
    computervision_client = None

    # --- Internal helpers (only called from within the class) ---

    def __enter__(self):
        self.computervision_client = ComputerVisionClient(
            endpoint, CognitiveServicesCredentials(subscription_key))
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.computervision_client = None

    def __image_read(self, image_url):
        """Submit *image_url* to the Read API and return the operation id."""
        recognize_results = self.computervision_client.read(image_url, raw=True)
        operation_location_remote = recognize_results.headers["Operation-Location"]
        # The id is the last path segment of the Operation-Location URL.
        return operation_location_remote.split("/")[-1]

    def __get_read_result(self, operation_id):
        """Poll the operation until it finishes.

        Returns the list of per-page read results on success, or ``None``
        when the operation ended in any other state.
        """
        # TODO: do something about this unbounded polling loop (no timeout).
        while True:
            # Use the instance's client: the bare name ``computervision_client``
            # is not visible from inside a method and raised NameError (or
            # silently used an unrelated module-level global).
            get_text_results = self.computervision_client.get_read_result(operation_id)
            if get_text_results.status not in ['notStarted', 'running']:
                break
            time.sleep(1)

        if get_text_results.status == OperationStatusCodes.succeeded:
            return get_text_results.analyze_result.read_results
        return None

    # --- Public API ---

    def DetectTexts(self, image_url):
        """Run OCR on *image_url*.

        Returns one list per page, each containing a dict per recognized line
        with the keys ``'text'`` and ``'box'`` (the line's bounding box).
        Returns an empty list when the read operation did not succeed
        (previously this case raised ``TypeError`` while iterating ``None``).
        """
        operation_id = self.__image_read(image_url)
        res = self.__get_read_result(operation_id)
        if res is None:
            return []
        return [
            [{'text': line.text, 'box': line.bounding_box} for line in text_result.lines]
            for text_result in res
        ]
def read_image_text(self, client: ComputerVisionClient, remote_url: str):
    """Read text from the image at *remote_url* with the Azure OCR Read API.

    Submits the URL through *client*, then polls the operation once per
    second until it is neither "not started" nor "running".  Returns the
    final read-operation result when it succeeded, otherwise ``None``.
    """
    print(f"Calling read API on {remote_url}")

    # The operation id is the trailing path segment of the returned header.
    submission = client.read(remote_url, raw=True)
    operation_id = submission.headers['Operation-Location'].split('/')[-1]

    # Fetch once, then keep re-checking every second while still in progress.
    outcome = client.get_read_result(operation_id)
    while outcome.status in [OperationStatusCodes.not_started,
                             OperationStatusCodes.running]:
        time.sleep(1)
        outcome = client.get_read_result(operation_id)

    if outcome.status == OperationStatusCodes.succeeded:
        return outcome
    return None
def ocr1(img):
    """Run the Azure Read API on *img* and return the recognized text.

    Parameters:
        img: image reference passed straight to ``ComputerVisionClient.read``.

    Returns:
        * all recognized lines joined with single spaces, when the operation
          succeeded and produced text;
        * ``None`` when the operation did not succeed or produced no text;
        * ``str(exception)`` when submitting the request itself failed
          (kept for backward compatibility with existing callers).
    """
    #Cognitive Services endpoint and key
    cog_key = '<Your Primary Key here>'  #Paste your primary key here
    cog_endpoint = '<Endpoint url here>'  #Paste your endpoint here

    # Get a client for the computer vision service
    computervision_client = ComputerVisionClient(
        cog_endpoint, CognitiveServicesCredentials(cog_key))

    # Submit a request to read text in the image and get the operation ID
    try:
        recognize_handw_results = computervision_client.read(img, raw=True)
    except Exception as e:
        # NOTE: the error message is returned in-band, so callers cannot tell
        # it apart from OCR output by type alone.
        return str(e)

    operation_location_remote = recognize_handw_results.headers[
        "Operation-Location"]
    operation_id = operation_location_remote.split("/")[-1]

    # Poll once per second until the operation reaches a final state.
    while True:
        get_handw_text_results = computervision_client.get_read_result(
            operation_id)
        if get_handw_text_results.status not in ['notStarted', 'running']:
            break
        time.sleep(1)

    # Any non-success final state yields None instead of falling through to
    # an unassigned (or wrongly typed) result.
    if get_handw_text_results.status != OperationStatusCodes.succeeded:
        return None

    lines = [
        str(line.text)
        for text_result in get_handw_text_results.analyze_result.read_results
        for line in text_result.lines
    ]
    return " ".join(lines) if lines else None  #Result of OCR
def InvAns(strUrl):
    """OCR the invoice image at ``strUrl`` and extract a few invoice fields.

    Steps performed by the code below:

    1. Read ``COMPUTER_VISION_SUBSCRIPTION_KEY`` and
       ``COMPUTER_VISION_ENDPOINT`` from the environment; if either is
       missing a hint is printed and ``sys.exit()`` is called.
    2. Submit ``strUrl`` to the Read API and poll once per second until the
       status is neither 'notStarted' nor 'running'.
    3. For every recognized line, run a set of regex helpers and append the
       first truthy hit to one of the per-field lists (date, amounts, VAT
       percentage, invoice number, order number, company marker, shipping).
    4. Combine those lists into the returned dict with the keys ``date``,
       ``total_amount``, ``vat``, ``invoice_number``,
       ``purchase_order_number`` and ``supplier_name``.

    NOTE(review): the original indentation of this function was lost.  The
    nesting shown here is reconstructed from the control flow (the
    ``continue`` statements and the uses of ``sourceTxt`` require the
    per-line section to sit inside the inner ``for``); ``else`` clauses were
    attached to the nearest ``if``.  Please confirm against the upstream file.

    NOTE(review): the final assembly indexes ``nopA[0]``, ``klmA[0]``,
    ``hijA[0]`` and calls ``min(abcA)`` / ``max(defgA)`` unconditionally, so
    any of those lists being empty (e.g. when the OCR did not succeed)
    raises instead of returning a partial result.
    """
    if 'COMPUTER_VISION_SUBSCRIPTION_KEY' in os.environ:
        subscription_key = os.environ['COMPUTER_VISION_SUBSCRIPTION_KEY']
    else:
        print("\nSet the COMPUTER_VISION_SUBSCRIPTION_KEY environment variable.\n**Restart your shell or IDE for changes to take effect.**")
        sys.exit()
    if 'COMPUTER_VISION_ENDPOINT' in os.environ:
        endpoint = os.environ['COMPUTER_VISION_ENDPOINT']
    else:
        print("\nSet the COMPUTER_VISION_ENDPOINT environment variable.\n**Restart your shell or IDE for changes to take effect.**")
        sys.exit()
    computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))
    # print("===== Batch Read File - remote =====")
    remote_image_handw_text_url = strUrl
    recognize_handw_results = computervision_client.read(remote_image_handw_text_url, raw=True)
    # The operation id is the last path segment of the Operation-Location header.
    operation_location_remote = recognize_handw_results.headers["Operation-Location"]
    operation_id = operation_location_remote.split("/")[-1]
    # Poll until the operation leaves the in-progress states (no timeout).
    while True:
        get_handw_text_results = computervision_client.get_read_result(operation_id)
        if get_handw_text_results.status not in ['notStarted', 'running']:
            break
        time.sleep(1)
    d = {}          # result dict returned at the end
    abcA = []       # date strings
    defgA = []      # lists of parsed float amounts (one list per matching line)
    hijA = []       # VAT percentage strings such as "20%"
    klmA = []       # invoice-number hits
    nopA= []        # order-number hits
    qrsA = []       # company markers ("From:", "From" or "Invoice")
    uvwA = []       # shipping hits
    flat_list = []  # every recognized line text, in reading order
    if get_handw_text_results.status == OperationStatusCodes.succeeded:
        for text_result in get_handw_text_results.analyze_result.read_results:
            for sourceTxt in text_result.lines:
                # NOTE(review): new_d is filled but never read afterwards.
                new_d = {}
                new_d[sourceTxt.text] = sourceTxt.bounding_box
                flat_list.append(sourceTxt.text)
                # The helpers below are re-defined on every loop iteration;
                # their ``sourceTxt`` parameter is the line's text (a str) and
                # shadows the loop variable of the same name.
                def Get_shipping_in_pdf(sourceTxt):
                    """Return a shipping token for one line of text.

                    If the line matches "Shipping ... <digit> ...", its
                    whitespace-separated tokens are scanned and ``f_ans`` is
                    overwritten on every token, so the last token decides the
                    result (that token if numeric, otherwise '').  If the line
                    only contains "Shipping", that word is returned;
                    otherwise "0".
                    """
                    # print(cf)
                    shp = r"Shipping.*\d.*$"
                    shp1 = r"Shipping"
                    if (re.findall(shp, sourceTxt)):
                        ans = re.findall(shp, sourceTxt)[0]
                        f_ans_split = ans.split()
                        for i in f_ans_split:
                            if i.isnumeric():
                                print(i)
                                f_ans = i
                            else:
                                f_ans = ''
                    elif (re.findall(shp1, sourceTxt)):
                        f_ans = re.findall(shp1, sourceTxt)[0]
                    else:
                        f_ans = "0"
                    return f_ans
                def Get_company_in_pdf(sourceTxt):
                    """Return the first of "From:", "From", "Invoice" found in the line, else 0."""
                    frm = r"From:"
                    frm1 = r"From"
                    frm2= r"Invoice"
                    if(re.findall(frm, sourceTxt)):
                        f_ans = re.findall(frm, sourceTxt)[0]
                    elif(re.findall(frm1, sourceTxt)):
                        f_ans = re.findall(frm1, sourceTxt)[0]
                    elif(re.findall(frm2, sourceTxt)):
                        f_ans = re.findall(frm2, sourceTxt)[0]
                    else:
                        f_ans = 0
                    # print(f_ans)
                    return f_ans
                def Get_order_in_pdf(sourceTxt):
                    """Return an order-number candidate for one line, or ''.

                    For "Purchase...<digit>" / "Ord...<digit>" lines the tokens
                    are scanned with ``f_ans`` overwritten each time, so the
                    last token decides (numeric token or '').  For a line that
                    only matches "Ord" followed by a non-digit, the matched
                    text itself is returned.
                    """
                    prcnt = r"Purchase.*\d.*$"
                    prcnt1 = r"Ord.*\d.*$"
                    prcnt2 = r"Ord.*\D"
                    if (re.findall(prcnt, sourceTxt)):
                        ans = re.findall(prcnt, sourceTxt)[0]
                        f_ans_split = ans.split()
                        for i in f_ans_split:
                            if i.isnumeric():
                                f_ans = i
                            else:
                                f_ans = ''
                    elif(re.findall(prcnt1, sourceTxt)):
                        ans = re.findall(prcnt1, sourceTxt)[0]
                        f_ans_split = ans.split()
                        for i in f_ans_split:
                            if i.isnumeric():
                                f_ans = i
                            else:
                                f_ans = ''
                    elif (re.findall(prcnt2, sourceTxt)):
                        f_ans = re.findall(prcnt2, sourceTxt)[0]
                    else:
                        f_ans = ''
                    return f_ans
                def Get_invoice_in_pdf(sourceTxt):
                    """Return an invoice-number candidate for one line, or ''.

                    An "INV...-<digit>..." match is returned verbatim and takes
                    precedence; for "Invoice...<digit>" lines the last
                    whitespace token decides (numeric token or '').
                    """
                    prcnt = r"Invoice.*\d.*$"
                    prcnt1 = r"INV.*\-\d.*$"
                    f_ans = ''
                    if (re.findall(prcnt1, sourceTxt)):
                        f_ans = re.findall(prcnt1, sourceTxt)[0]
                    elif(re.findall(prcnt, sourceTxt)):
                        ans = re.findall(prcnt, sourceTxt)[0]
                        f_ans_split = ans.split()
                        for i in f_ans_split:
                            if i.isnumeric():
                                f_ans = i
                            else:
                                f_ans = ''
                    else:
                        f_ans = ''
                    return f_ans
                def Get_vat_in_document(sourceTxt):
                    """Return a trailing one- or two-digit percentage (e.g. "20%"), else 0."""
                    prcnt = r"([0-9][0-9]%$|[0-9]%$)"
                    if(re.findall(prcnt, sourceTxt)):
                        f_ans = re.findall(prcnt, sourceTxt)[0]
                    else:
                        f_ans = 0
                    return f_ans
                def Get_total_amount_in_pdf(sourceTxt):
                    """Return a list of floats parsed from amount-like text in the line.

                    The first matching pattern wins (comma-grouped, then
                    dotted, then space-grouped).  Commas are removed before
                    ``float()``; matches that still fail to parse are skipped.
                    The result is always a list (possibly empty).
                    """
                    lst = r"\d.*,\d{3}\.?\d*$"
                    dcm = r"\d.*\..?\d*$"
                    spc = r"\d.*\s\d{3}\.\d*$"
                    ans=[]
                    ttl = []
                    f_ans = []
                    if(re.findall(lst, sourceTxt)):
                        ans.extend(re.findall(lst, sourceTxt))
                    elif re.findall(dcm, sourceTxt):
                        ans.extend(re.findall(dcm, sourceTxt))
                    elif re.findall(spc, sourceTxt):
                        ans.extend(re.findall(spc, sourceTxt))
                    if ans:
                        for i in ans:
                            rem_c = i.split(',')
                            try:
                                ttl.append(float("".join(rem_c)))
                            except ValueError:
                                pass
                    if ttl == []:
                        pass
                    else:
                        f_ans.extend(ttl)
                    return f_ans
                def Get_date_in_pdf(sourceTxt):
                    """Return the first non-empty date-like string found in the line, or ''.

                    Tries day/month-name/year layouts (with and without a
                    comma, abbreviated and full month names); for every layout
                    that does not match, the numeric d/m/20yy fallback regex
                    is applied and its groups are concatenated.
                    """
                    # NOTE(review): some entries use calendar.month_name without
                    # the [1:] slice; index 0 of that sequence is an empty
                    # string, so those alternations also accept an empty month.
                    date_format = [
                        r"\d{2} (?:%s) \d{4}" % '|'.join(calendar.month_abbr[1:]),
                        r"\d{1} (?:%s) \d{4}" % '|'.join(calendar.month_abbr[1:]),
                        r"\d{2} (?:%s) \d{4}" % '|'.join(calendar.month_name),
                        r"\d{1} (?:%s) \d{4}" % '|'.join(calendar.month_name[1:]),
                        r"(?:%s) \d{2} \d{4}" % '|'.join(calendar.month_abbr[1:]),
                        r"(?:%s) \d{1} \d{4}" % '|'.join(calendar.month_abbr[1:]),
                        r"(?:%s) \d{2} \d{4}" % '|'.join(calendar.month_name),
                        r"(?:%s) \d{1} \d{4}" % '|'.join(calendar.month_name[1:]),r"\d{2} (?:%s), \d{4}" % '|'.join(calendar.month_abbr[1:]),
                        r"\d{1} (?:%s), \d{4}" % '|'.join(calendar.month_abbr[1:]),
                        r"\d{2} (?:%s), \d{4}" % '|'.join(calendar.month_name),
                        r"\d{1} (?:%s), \d{4}" % '|'.join(calendar.month_name[1:]),
                        r"(?:%s) \d{2}, \d{4}" % '|'.join(calendar.month_abbr[1:]),
                        r"(?:%s) \d{1}, \d{4}" % '|'.join(calendar.month_abbr[1:]),
                        r"(?:%s) \d{2}, \d{4}" % '|'.join(calendar.month_name),
                        r"(?:%s) \d{1}, \d{4}" % '|'.join(calendar.month_name[1:])]
                    dats = []
                    f_dats = ''
                    for i in date_format:
                        if(re.findall(i, sourceTxt)):
                            dats.extend(re.findall(i, sourceTxt))
                        else:
                            # Numeric fallback; findall yields one tuple of
                            # groups per match, joined back into one string.
                            ans = re.findall(r'([1-9]|1[0-9]|2[0-9]|3[0-1]|0[0-9])(.|-|\/)([1-9]|1[0-2]|2[0-9]|3[0-9])(.|-|\/)(20[0-9][0-9])',sourceTxt)
                            ans_f = [''.join(ans[i]) for i in range(len(ans))]
                            dats.extend(ans_f)
                    for sublist in dats:
                        if any(sublist):
                            f_dats = sublist
                            break
                    return f_dats
                # Run every extractor on the current line's text.
                abc = Get_date_in_pdf(sourceTxt.text)
                defg = Get_total_amount_in_pdf(sourceTxt.text)
                hij = Get_vat_in_document(sourceTxt.text)
                klm = Get_invoice_in_pdf(sourceTxt.text)
                nop = Get_order_in_pdf(sourceTxt.text)
                qrs = Get_company_in_pdf(sourceTxt.text)
                uvw = Get_shipping_in_pdf(sourceTxt.text)
                # Each line is filed under the first category that produced a
                # truthy value, in this priority order.
                # NOTE(review): defg is always a list, so ``defg==""`` is never
                # true and the first branch looks unreachable.
                if abc=="" and defg=="" and hij:
                    continue
                elif abc:
                    abcA.append(abc)
                    continue
                elif defg:
                    defgA.append(defg)
                    continue
                elif hij:
                    hijA.append(hij)
                elif klm:
                    klmA.append(klm)
                elif nop:
                    nopA.append(nop)
                elif qrs:
                    print(qrs)
                    qrsA.append(qrs)
                elif uvw:
                    uvwA.append(uvw)
    def get_index_ord(inp, lst):
        """Return the element that follows the first occurrence of ``inp`` in ``lst``."""
        indx = lst.index(inp)
        f_ans = lst[indx + 1]
        return f_ans
    def get_indx_purchase(inp, lst):
        """Return the first numeric string among ``inp`` and the three entries after it, or ''."""
        indx = int(lst.index(inp))
        f_ans = ''
        for i in lst[indx:indx+4]:
            if i.isnumeric():
                f_ans = i
                break
            else:
                f_ans =''
        return f_ans
    def get_company_name(inp,lst):
        """Pick a company name from the entries following ``inp`` in ``lst``.

        Iterates over the characters of the entry right after ``inp``; the
        result is overwritten per character, so the entry's last character
        decides: if it is numeric the entry after that one is returned,
        otherwise the entry itself.
        """
        idx = int(lst.index(inp))
        f_ans = ''
        f_indx = int(idx + 1)
        for i in lst[f_indx]:
            if i.isnumeric():
                f_ans = lst[f_indx + 1]
            else:
                f_ans = lst[f_indx]
        return f_ans
    def get_company_name_2(sublist, lst):
        """Resolve the supplier name using the collected markers in ``sublist``.

        Prefers "From:", then "From", then "Invoice" as the anchor passed to
        ``get_company_name``; with no marker the first OCR line is used.
        """
        f_ans = ''
        print(sublist)
        if "From:" in sublist:
            f_ans = get_company_name("From:", lst)
            print(f_ans)
        elif "From" in sublist:
            f_ans = get_company_name("From", lst)
        elif "Invoice" in sublist:
            f_ans = get_company_name("Invoice", lst)
        else:
            f_ans = lst[0]
        return f_ans
    def Get_shipping_cost(inp, lst):
        """Return the shipping cost as a numeric string, or 0.

        Uses the first collected shipping hit ("0" when there is none).  A
        numeric hit is returned directly; otherwise the OCR line following the
        hit in ``lst`` is returned if it is numeric, else 0.
        """
        f_ans = ''
        inpt = ''
        if inp:
            inpt = inp[0]
        else:
            inpt = "0"
        if inpt.isnumeric():
            f_ans = inpt
        else:
            indx = lst.index(inpt)
            f_indx = int(indx + 1)
            f_ans = lst[f_indx]
            if f_ans.isnumeric():
                f_ans = lst[f_indx]
            else:
                f_ans = 0
        return f_ans
    # Fallback values looked up by position in the flat list of OCR lines.
    order_2 = get_index_ord(nopA[0], flat_list)
    company_name = get_company_name_2(qrsA, flat_list)
    purchase_2 = get_indx_purchase("Invoice", flat_list)
    def vat_2(inp):
        """Estimate the VAT amount from the collected amounts.

        Takes the first float of every amount list, de-duplicates and sorts
        them, and returns the difference between the two largest values if
        that difference is itself one of the amounts; otherwise 0.
        """
        ans_vat = ''
        new_set = list(set([i[0] for i in inp]))
        new_set.sort()
        if new_set[-1] - new_set[-2] in new_set:
            ans_vat = new_set[-1] - new_set[-2]
        else:
            ans_vat = 0
        return ans_vat
    vat_2ans = vat_2(defgA)
    shipping = int(Get_shipping_cost(uvwA, flat_list))
    # print(shipping)
    d["date"] = min(abcA)
    d["total_amount"] = max(defgA)[0]
    # hijA[0][0:-1] strips the trailing character of the percentage string
    # before converting it; when percentage * total is 0 the amount-based
    # estimate from vat_2 is used instead.  Shipping is subtracted either way.
    d["vat"] = vat_2ans - shipping if int(hijA[0][0:-1]) /100 * d["total_amount"] == 0 else int(hijA[0][0:-1]) /100 * d["total_amount"] - shipping
    d["invoice_number"] = klmA[0] if klmA[0] else purchase_2
    d["purchase_order_number"] = nopA[0] if nopA[0].isnumeric() else order_2
    d["supplier_name"] = company_name
    # print("Apiyo", d)
    return d
import os
import time

from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials

# NOTE(security): a subscription key is committed in this file.  It should be
# rotated and removed; until then the environment variables below take
# precedence and the literals only act as a backward-compatible fallback.
subscription_key = os.environ.get(
    "COMPUTER_VISION_SUBSCRIPTION_KEY", "0f5bb668eab84237bd5742644f3cae02")
endpoint = os.environ.get(
    "COMPUTER_VISION_ENDPOINT",
    "https://ocr-test-for-python.cognitiveservices.azure.com/")

computervision_client = ComputerVisionClient(
    endpoint, CognitiveServicesCredentials(subscription_key))

# Image to read: either an http(s) URL or a path to a local file.
remote_image_handw_text_url = "0.jpg"

# Call the API with the raw response (allows you to get the operation
# location).  ``read`` only accepts a URL the service can fetch, so a local
# path such as "0.jpg" must be uploaded with ``read_in_stream`` instead.
if remote_image_handw_text_url.lower().startswith(("http://", "https://")):
    recognize_handw_results = computervision_client.read(
        remote_image_handw_text_url, raw=True)
else:
    with open(remote_image_handw_text_url, "rb") as image_stream:
        recognize_handw_results = computervision_client.read_in_stream(
            image_stream, raw=True)

# Get the operation location (URL with an ID at the end) from the response
operation_location_remote = recognize_handw_results.headers[
    "Operation-Location"]
# Grab the ID from the URL
operation_id = operation_location_remote.split("/")[-1]

# Call the "GET" API and wait for it to retrieve the results
while True:
    get_handw_text_results = computervision_client.get_read_result(
        operation_id)
    if get_handw_text_results.status not in ['notStarted', 'running']:
        break
    time.sleep(1)
mlpreview(url) # This requires two calls using batch_read_file() and # get_read_operation_result(). The call to batch_read_file() is # asynchronous. In the results of the call to # get_read_operation_result(), we need to check if the first call # completed with OperationStatusCodes before extracting the text # data. The results include the text as well as the bounding box # coordinates for the text. raw = True numberOfCharsInOperationId = 36 # Asynchronous call. try: rawHttpResponse = client.read(url, raw=raw) except Exception as e: catch_exception(e, url) # Get ID from returned headers. operationLocation = rawHttpResponse.headers["Operation-Location"] idLocation = len(operationLocation) - numberOfCharsInOperationId operationId = operationLocation[idLocation:] # Wait for the result. while True: result = client.get_read_result(operationId) if result.status not in [ OperationStatusCodes.not_started, OperationStatusCodes.running
''' END - Authenticate ''' # <snippet_read_call> ''' OCR: Read File using the Read API, extract text - remote This example will extract text in an image, then print results, line by line. This API call can also extract handwriting style text (not shown). ''' print("===== Read File - remote =====") # Get an image with text read_image_url = "https://raw.githubusercontent.com/MicrosoftDocs/azure-docs/master/articles/cognitive-services/Computer-vision/Images/readsample.jpg" # Call API with URL and raw response (allows you to get the operation location) read_response = computervision_client.read(read_image_url, raw=True) # </snippet_read_call> # <snippet_read_response> # Get the operation location (URL with an ID at the end) from the response read_operation_location = read_response.headers["Operation-Location"] # Grab the ID from the URL operation_id = read_operation_location.split("/")[-1] # Call the "GET" API and wait for it to retrieve the results while True: read_result = computervision_client.get_read_result(operation_id) if read_result.status not in ['notStarted', 'running']: break time.sleep(1)
''' END - Batch Read File - local '''

# <snippet_read_call>
''' Batch Read File, recognize printed text - remote This example will extract printed text in an image, then print results, line by line. This API call can also recognize handwriting (not shown). '''
print("===== Batch Read File - remote =====")
# Get an image with printed text
remote_image_printed_text_url = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/printed_text.jpg"

# Call API with URL and raw response (allows you to get the operation location)
recognize_printed_results = computervision_client.read(
    remote_image_printed_text_url, language='en', raw=True)
# </snippet_read_call>

# <snippet_read_response>
# Get the operation location (URL with an ID at the end) from the response
operation_location_remote = recognize_printed_results.headers[
    "Operation-Location"]
# Grab the ID from the URL
operation_id = operation_location_remote.split("/")[-1]

# Needed for the delay between polls; importing again is harmless if the
# module is already imported elsewhere in the file.
import time

# Call the "GET" API and wait for it to retrieve the results.
# The Read API used here (read / get_read_result) reports its in-progress
# states as 'notStarted' and 'running'.  The previous capitalized spellings
# ('NotStarted', 'Running') never matched, so the loop ended on the very
# first poll even though the operation was still pending.
while True:
    get_printed_text_results = computervision_client.get_read_result(
        operation_id)
    if get_printed_text_results.status not in ['notStarted', 'running']:
        break
    # Wait before polling again instead of hammering the service.
    time.sleep(1)
# Set credentials credentials = CognitiveServicesCredentials(key) # Create client client = ComputerVisionClient(endpoint, credentials) # change this URL to reflect the image that you would like to test. url = "https://azurecomcdn.azureedge.net/cvt-181c82bceabc9fab9ec6f3dca486738800e04b45a0b3c1268609c94f4d67173a/images/shared/cognitive-services-demos/analyze-image/analyze-6-thumbnail.jpg" # image_path = "images/computer_vision_ocr.png" lang = 'en' raw = True custom_headers = None # Read an image from a url rawHttpResponse = client.read(url, language=lang, custom_headers=custom_headers, raw=raw) # Uncomment the following code and comment out line 37 to read from image stream # with open(image_path, "rb") as image_stream: # rawHttpResponse = client.read_in_stream( # image=image_stream, language=lang, # # Raw will return the raw response which can be used to find the operation_id # raw=True # ) # Get ID from returned headers operationLocation = rawHttpResponse.headers["Operation-Location"] operationId = operationLocation.split('/')[-1] # SDK call while True: