Exemplo n.º 1
0
def My_OCR(url):
    """Run the Azure Read API on ``url`` and return the first code-like line.

    Credentials come from the ``OCR_SUBSCRIPTION_KEY`` and ``OCR_ENDPOINT``
    environment variables. Recognised lines of at most 8 characters are
    kept; the first one that starts with 2-4 characters from ``[0-9A-Z]``,
    then ``.`` or ``-``, then 2-4 more such characters is returned with
    every ``.`` replaced by ``-``. An empty string is returned when the
    operation does not succeed or no line qualifies.
    """
    key = os.getenv('OCR_SUBSCRIPTION_KEY')
    endpoint = os.getenv('OCR_ENDPOINT')
    client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(key))

    # The operation id is the last path segment of the Operation-Location URL.
    submission = client.read(url, raw=True)
    operation_id = submission.headers["Operation-Location"].split("/")[-1]

    # Poll once per second while the service still reports a pending state.
    pending = ("notStarted", "running")
    outcome = client.get_read_result(operation_id)
    while outcome.status in pending:
        time.sleep(1)
        outcome = client.get_read_result(operation_id)

    short_lines = []
    if outcome.status == OperationStatusCodes.succeeded:
        short_lines = [
            line.text
            for page in outcome.analyze_result.read_results
            for line in page.lines
            if len(line.text) <= 8
        ]

    pattern = re.compile("[0-9A-Z]{2,4}[.-]{1}[0-9A-Z]{2,4}")
    for candidate in short_lines:
        # match() anchors at the start of the line only, not at the end.
        if pattern.match(candidate):
            return candidate.replace('.', '-')
    return ""
Exemplo n.º 2
0
class cos5year_vision:
    """Context-managed wrapper around the Azure Computer Vision Read API.

    Entering the context creates a ``ComputerVisionClient`` from the
    module-level ``endpoint`` and ``subscription_key``; leaving it drops
    the reference again. Use it as::

        with cos5year_vision() as vision:
            pages = vision.DetectTexts(image_url)
    """

    computervision_client = None

    ## Internal helpers below (meant to be called from inside the class only)
    def __enter__(self):
        self.computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.computervision_client = None

    def __image_read(self, image_url):
        """Submit ``image_url`` to the Read API and return the operation id."""
        recognize_results = self.computervision_client.read(image_url, raw=True)
        # The id is the last path segment of the Operation-Location URL.
        operation_location_remote = recognize_results.headers["Operation-Location"]
        return operation_location_remote.split("/")[-1]

    def __get_read_result(self, operation_id):
        """Poll the Read operation until it finishes.

        Returns the ``read_results`` of the finished operation, or ``None``
        when the operation ended in any state other than succeeded.
        """
        ## TODO: do something about this unbounded polling loop
        while True:
            # Must go through self: the client lives on the instance, and a
            # bare ``computervision_client`` is a NameError (or an unrelated
            # module-level global).
            get_text_results = self.computervision_client.get_read_result(operation_id)
            if get_text_results.status not in ['notStarted', 'running']:
                break
            time.sleep(1)
        if get_text_results.status == OperationStatusCodes.succeeded:
            return get_text_results.analyze_result.read_results
        return None

    ## Public API below

    def DetectTexts(self, image_url):
        """Return the recognised text of ``image_url``.

        The result has one inner list per entry of the operation's
        ``read_results``; each inner list holds one
        ``{'text': ..., 'box': ...}`` dict per recognised line. An empty
        list is returned when the Read operation does not succeed.
        """
        operation_id = self.__image_read(image_url)
        res = self.__get_read_result(operation_id)
        if res is None:
            # Failed operation: report "no text" instead of iterating None.
            return []
        return [[{'text': line.text, 'box': line.bounding_box} for line in text_result.lines] for text_result in res]
Exemplo n.º 3
0
    def read_image_text(self, client: ComputerVisionClient, remote_url: str):
        """
        Submit ``remote_url`` to the Azure OCR Read API and wait for the outcome.

        Returns the final read-operation result when its status is
        ``succeeded``; returns ``None`` for any other final status.
        """
        print(f"Calling read API on {remote_url}")

        submission = client.read(remote_url, raw=True)

        # The operation id is the last path segment of Operation-Location.
        operation_id = submission.headers['Operation-Location'].split('/')[-1]

        # Poll the GET endpoint once per second while the job is pending.
        in_progress = (
            OperationStatusCodes.not_started,
            OperationStatusCodes.running,
        )
        outcome = client.get_read_result(operation_id)
        while outcome.status in in_progress:
            time.sleep(1)
            outcome = client.get_read_result(operation_id)

        if outcome.status == OperationStatusCodes.succeeded:
            return outcome
        return None
Exemplo n.º 4
0
def ocr1(img):
    """Run the Read API on ``img`` and return all recognised text as one string.

    Args:
        img: Passed unchanged as the first argument of
            ``ComputerVisionClient.read``.

    Returns:
        The recognised lines joined with single spaces, or ``None`` when
        the operation did not succeed or produced no lines. If submitting
        the request raises, the exception message is returned as a string
        instead (existing behaviour that callers may rely on).
    """
    # Cognitive Services endpoint and key
    cog_key = '<Your Primary Key here>'  # Paste your primary key here
    cog_endpoint = '<Endpoint url here>'  # Paste your endpoint here

    # Get a client for the computer vision service
    computervision_client = ComputerVisionClient(
        cog_endpoint, CognitiveServicesCredentials(cog_key))

    # Submit the read request; the operation ID comes back in a header.
    try:
        recognize_handw_results = computervision_client.read(img, raw=True)
    except Exception as e:
        return str(e)

    operation_location_remote = recognize_handw_results.headers[
        "Operation-Location"]
    operation_id = operation_location_remote.split("/")[-1]

    # Poll once per second until the operation leaves its pending states.
    while True:
        get_handw_text_results = computervision_client.get_read_result(
            operation_id)
        if get_handw_text_results.status not in ['notStarted', 'running']:
            break
        time.sleep(1)

    # A failed operation has no lines to collect. Returning here avoids
    # reading a result list that was never bound.
    if get_handw_text_results.status != OperationStatusCodes.succeeded:
        return None

    lines = [
        str(line.text)
        for text_result in get_handw_text_results.analyze_result.read_results
        for line in text_result.lines
    ]
    return " ".join(lines) if lines else None  # Result of OCR
Exemplo n.º 5
0
def InvAns(strUrl):
    """Run the Read API on ``strUrl`` and extract invoice fields from the text.

    The subscription key and endpoint are read from the
    ``COMPUTER_VISION_SUBSCRIPTION_KEY`` and ``COMPUTER_VISION_ENDPOINT``
    environment variables; when either is missing a hint is printed and
    ``sys.exit()`` is called. The read operation is polled once per second
    until its status is neither ``notStarted`` nor ``running``. Every
    recognised line is then run through the regex helpers defined below.

    Returns:
        A dict. When the operation succeeded it holds the keys ``"date"``,
        ``"total_amount"``, ``"vat"``, ``"invoice_number"``,
        ``"purchase_order_number"`` and ``"supplier_name"``; otherwise it
        is returned empty.

    NOTE(review): the success path evaluates ``nopA[0]``, ``hijA[0]``,
    ``klmA[0]``, ``min(abcA)`` and ``max(defgA)`` without checking that the
    lists are non-empty, so a document lacking any of those fields raises
    IndexError/ValueError instead of returning a partial result.
    """
    
    if 'COMPUTER_VISION_SUBSCRIPTION_KEY' in os.environ:
        subscription_key = os.environ['COMPUTER_VISION_SUBSCRIPTION_KEY']
    else:
        print("\nSet the COMPUTER_VISION_SUBSCRIPTION_KEY environment variable.\n**Restart your shell or IDE for changes to take effect.**")
        sys.exit()

    if 'COMPUTER_VISION_ENDPOINT' in os.environ:
        endpoint = os.environ['COMPUTER_VISION_ENDPOINT']
    else:
        print("\nSet the COMPUTER_VISION_ENDPOINT environment variable.\n**Restart your shell or IDE for changes to take effect.**")
        sys.exit()

    computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))


    # print("===== Batch Read File - remote =====")

    remote_image_handw_text_url = strUrl

    # raw=True gives access to the response headers, which carry the
    # operation location.
    recognize_handw_results = computervision_client.read(remote_image_handw_text_url,  raw=True)

    operation_location_remote = recognize_handw_results.headers["Operation-Location"]

    # The operation id is the last path segment of the Operation-Location URL.
    operation_id = operation_location_remote.split("/")[-1]

    # Poll once per second; there is no upper bound on the waiting time.
    while True:
        get_handw_text_results = computervision_client.get_read_result(operation_id)
        if get_handw_text_results.status not in ['notStarted', 'running']:
            break
        time.sleep(1)
    d = {}
    # One accumulator per extractor; the suffix-less names (abc, defg, ...)
    # further down are the per-line results that feed them.
    abcA = []   # dates
    defgA = []  # amount lists (each entry is a list of floats)
    hijA = []   # VAT percentages such as "20%"
    klmA = []   # invoice numbers
    nopA= []    # order / purchase-order values
    qrsA = []   # company markers ("From:", "From", "Invoice")
    uvwA = []   # shipping values
    flat_list = []  # text of every recognised line, in reading order
    if get_handw_text_results.status == OperationStatusCodes.succeeded:
        for text_result in get_handw_text_results.analyze_result.read_results:
            
            for sourceTxt in text_result.lines:
                # NOTE(review): new_d is filled but never read afterwards.
                new_d = {}
                new_d[sourceTxt.text] = sourceTxt.bounding_box
                flat_list.append(sourceTxt.text)

                # NOTE(review): the helpers below are re-defined on every
                # iteration of this loop; none of them closes over loop state.

                def Get_shipping_in_pdf(sourceTxt):
                    """Return the shipping value found in one line of text.

                    If the line contains "Shipping" followed by a digit, the
                    match is split on whitespace and the LAST token decides
                    the result (the token itself when numeric, else '').
                    A bare "Shipping" yields "Shipping"; anything else "0".
                    """
                    # print(cf)
                    shp = r"Shipping.*\d.*$"
                    shp1 = r"Shipping"
                    if (re.findall(shp, sourceTxt)):
                        ans = re.findall(shp, sourceTxt)[0]
                        f_ans_split = ans.split()
                        # f_ans is overwritten for every token, so only the
                        # final token's outcome survives the loop.
                        for i in f_ans_split:
                            if i.isnumeric():
                                print(i)
                                f_ans = i
                            else:
                                f_ans = ''
                    elif (re.findall(shp1, sourceTxt)):
                        f_ans = re.findall(shp1, sourceTxt)[0]   
                    else:
                        f_ans = "0"
                    return f_ans
                
                def Get_company_in_pdf(sourceTxt):
                    """Return the first marker found in the line, else 0.

                    Markers are tried in the order "From:", "From",
                    "Invoice"; the marker text itself is returned, not the
                    company name.
                    """
                    frm = r"From:"
                    frm1 = r"From"
                    frm2= r"Invoice"
                    if(re.findall(frm, sourceTxt)):
                        f_ans = re.findall(frm, sourceTxt)[0]
                    elif(re.findall(frm1, sourceTxt)):
                        f_ans = re.findall(frm1, sourceTxt)[0]
                    elif(re.findall(frm2, sourceTxt)):
                        f_ans = re.findall(frm2, sourceTxt)[0]
                    else:
                        f_ans = 0
                    # print(f_ans)    
                    return f_ans
                
                def Get_order_in_pdf(sourceTxt):
                    """Return the order value found in one line, else ''.

                    "Purchase...<digit>" and "Ord...<digit>" matches are
                    split on whitespace and the LAST token decides (numeric
                    token or ''). An "Ord..." match without that outcome is
                    returned as the matched text itself.
                    """
                    prcnt = r"Purchase.*\d.*$"
                    prcnt1 = r"Ord.*\d.*$"
                    prcnt2 = r"Ord.*\D"
                    if (re.findall(prcnt, sourceTxt)):
                        ans = re.findall(prcnt, sourceTxt)[0]
                        f_ans_split = ans.split()
                        # Last token wins (f_ans is overwritten each pass).
                        for i in f_ans_split:
                            if i.isnumeric():
                                f_ans = i
                            else:
                                f_ans = ''
                    elif(re.findall(prcnt1, sourceTxt)):
                        ans = re.findall(prcnt1, sourceTxt)[0]
                        f_ans_split = ans.split()
                        # Last token wins (f_ans is overwritten each pass).
                        for i in f_ans_split:
                            if i.isnumeric():
                                f_ans = i
                            else:
                                f_ans = ''
                    elif (re.findall(prcnt2, sourceTxt)):
                        f_ans = re.findall(prcnt2, sourceTxt)[0]   
                    else:
                        f_ans = ''
                    return f_ans

                def Get_invoice_in_pdf(sourceTxt):
                    """Return the invoice number found in one line, else ''.

                    An "INV...-<digit>..." match is returned whole. An
                    "Invoice...<digit>..." match is split on whitespace and
                    the LAST token decides (numeric token or '').
                    """
                    
                    prcnt = r"Invoice.*\d.*$"
                    prcnt1 =  r"INV.*\-\d.*$"
                    f_ans = ''
                    if (re.findall(prcnt1, sourceTxt)):
                        f_ans = re.findall(prcnt1, sourceTxt)[0]
                    elif(re.findall(prcnt, sourceTxt)):
                        ans = re.findall(prcnt, sourceTxt)[0]
                        f_ans_split = ans.split()
                        # Last token wins (f_ans is overwritten each pass).
                        for i in f_ans_split:
                            if i.isnumeric():
                                f_ans = i 
                            else:
                                f_ans = '' 
                    else:
                        f_ans = '' 
                    return f_ans

                def Get_vat_in_document(sourceTxt):
                    """Return a one- or two-digit percentage ending the line, else 0."""
                    
                    prcnt = r"([0-9][0-9]%$|[0-9]%$)"

                    if(re.findall(prcnt, sourceTxt)):
                        f_ans = re.findall(prcnt, sourceTxt)[0]
                    else:
                        f_ans = 0
        
                    return f_ans
                
                def Get_total_amount_in_pdf(sourceTxt):
                    """Return the amounts found in one line as a list of floats.

                    Three patterns are tried in order and only the first one
                    that matches is used. Commas are removed from each match
                    before float(); matches that still fail to convert are
                    skipped. The list is empty when nothing converts.
                    """
                    lst = r"\d.*,\d{3}\.?\d*$"   # comma before a 3-digit group
                    dcm = r"\d.*\..?\d*$"        # contains a literal '.'
                    spc = r"\d.*\s\d{3}\.\d*$"   # whitespace before a 3-digit group
                    ans=[]
                    ttl = []
                    f_ans = []
                    if(re.findall(lst, sourceTxt)):
                        ans.extend(re.findall(lst, sourceTxt))
                    elif re.findall(dcm, sourceTxt):
                        ans.extend(re.findall(dcm, sourceTxt))
                    elif re.findall(spc, sourceTxt):
                        ans.extend(re.findall(spc, sourceTxt))    
                    if ans:
                        for i in ans:
                            rem_c = i.split(',')
                            try:
                                ttl.append(float("".join(rem_c)))
                            except ValueError:
                                pass    
                
                    if ttl == []:
                        pass
                    else:
                        f_ans.extend(ttl)   
                        
                    return f_ans
                    
                def Get_date_in_pdf(sourceTxt):
                    """Return the first date-like substring of the line, else ''.

                    Sixteen day/month-name/year layouts are tried. For every
                    layout that does NOT match, a numeric
                    day-separator-month-separator-year pattern is applied
                    instead, so its results can be collected several times.
                    The first non-empty candidate is returned.
                    """
                    # NOTE(review): some entries join calendar.month_name
                    # without the [1:] slice. month_name[0] is the empty
                    # string, so those patterns get an empty alternative and
                    # can match with no month name at all.
                    date_format = [ r"\d{2} (?:%s) \d{4}" % '|'.join(calendar.month_abbr[1:]),  r"\d{1} (?:%s) \d{4}" % '|'.join(calendar.month_abbr[1:]), r"\d{2} (?:%s) \d{4}" % '|'.join(calendar.month_name),  r"\d{1} (?:%s) \d{4}" % '|'.join(calendar.month_name[1:]), r"(?:%s) \d{2} \d{4}" % '|'.join(calendar.month_abbr[1:]),  r"(?:%s) \d{1} \d{4}" % '|'.join(calendar.month_abbr[1:]), r"(?:%s) \d{2} \d{4}" % '|'.join(calendar.month_name),  r"(?:%s) \d{1} \d{4}" % '|'.join(calendar.month_name[1:]),r"\d{2} (?:%s), \d{4}" % '|'.join(calendar.month_abbr[1:]),  r"\d{1} (?:%s), \d{4}" % '|'.join(calendar.month_abbr[1:]), r"\d{2} (?:%s), \d{4}" % '|'.join(calendar.month_name),  r"\d{1} (?:%s), \d{4}" % '|'.join(calendar.month_name[1:]), r"(?:%s) \d{2}, \d{4}" % '|'.join(calendar.month_abbr[1:]),  r"(?:%s) \d{1}, \d{4}" % '|'.join(calendar.month_abbr[1:]), r"(?:%s) \d{2}, \d{4}" % '|'.join(calendar.month_name),  r"(?:%s) \d{1}, \d{4}" % '|'.join(calendar.month_name[1:])]

                    dats = []
                    f_dats = ''
                    


                    for i in date_format:
                        if(re.findall(i, sourceTxt)):
                            dats.extend(re.findall(i, sourceTxt))
                        else:
                            # Numeric fallback. findall returns one tuple of
                            # groups per match; the groups are glued back
                            # together below.
                            # NOTE(review): the '.' inside the separator
                            # groups is unescaped, so any character is
                            # accepted as a separator.
                            ans = re.findall(r'([1-9]|1[0-9]|2[0-9]|3[0-1]|0[0-9])(.|-|\/)([1-9]|1[0-2]|2[0-9]|3[0-9])(.|-|\/)(20[0-9][0-9])',sourceTxt)
                            ans_f = [''.join(ans[i]) for i in range(len(ans))]
                            dats.extend(ans_f)
                    for sublist in dats:

                        if any(sublist): 
                            f_dats = sublist
                            break   
                    return f_dats

                
                # Run every extractor on the current line.
                abc = Get_date_in_pdf(sourceTxt.text)
                defg = Get_total_amount_in_pdf(sourceTxt.text)
                hij = Get_vat_in_document(sourceTxt.text)
                klm = Get_invoice_in_pdf(sourceTxt.text)
                nop = Get_order_in_pdf(sourceTxt.text)
                qrs = Get_company_in_pdf(sourceTxt.text)
                uvw = Get_shipping_in_pdf(sourceTxt.text)
                
                
                # File the line under the FIRST extractor, in the order
                # below, that returned something truthy; later extractors
                # are ignored for this line.
                # NOTE(review): defg is always a list, so `defg==""` is
                # never true and this first branch cannot be taken.
                if abc=="" and defg=="" and hij:
                    continue
                elif abc:
                    abcA.append(abc)
                    continue
                elif  defg:
                    defgA.append(defg)
                    continue
                elif hij:
                    hijA.append(hij)
                elif klm:
                    klmA.append(klm)
                elif nop:
                    nopA.append(nop) 
                elif qrs:
                    print(qrs)
                    qrsA.append(qrs)
                # NOTE(review): Get_shipping_in_pdf returns the truthy
                # string "0" for lines without any shipping text, so such
                # lines are appended here as "0" as well.
                elif uvw:
                    uvwA.append(uvw) 


            # NOTE(review): the helpers below are defined inside the
            # per-page loop. If read_results is empty they are never
            # defined, and the Get_shipping_cost call further down raises
            # NameError.
            def get_index_ord(inp, lst): 
                """Return the entry that follows the first ``inp`` in ``lst``.

                Raises ValueError when ``inp`` is absent and IndexError
                when it is the last entry.
                """
            
                indx = lst.index(inp)
                f_ans = lst[indx + 1]
                return f_ans
            def get_indx_purchase(inp, lst):
                """Return the first numeric entry among ``inp`` and the three after it.

                Returns '' when none of those entries is numeric; raises
                ValueError when ``inp`` is not in ``lst``.
                """
                indx = int(lst.index(inp))
                f_ans = ''
                for i in lst[indx:indx+4]:
                    if i.isnumeric():
                        f_ans = i
                        break
                    else:
                        f_ans =''

                return f_ans
            def get_company_name(inp,lst):
                """Return the entry after ``inp``, or the one after that.

                The loop walks the items of ``lst[idx + 1]`` (its characters
                when that entry is a string) and overwrites f_ans each time,
                so the LAST item decides: numeric picks the entry two after
                ``inp``, otherwise the entry directly after it.
                """

                idx = int(lst.index(inp))
                f_ans = ''
                f_indx = int(idx + 1)
                for i in lst[f_indx]:
                    if i.isnumeric():
                        f_ans = lst[f_indx + 1]
                    else:
                        f_ans = lst[f_indx]
                return f_ans
            def get_company_name_2(sublist, lst):
                """Pick the company name using the markers collected in ``sublist``.

                Falls back to the first entry of ``lst`` when no marker was
                collected.
                """
                f_ans = ''
                print(sublist)
                if "From:" in sublist:
                    f_ans = get_company_name("From:", lst)
                    print(f_ans)
                elif "From" in sublist:
                    f_ans = get_company_name("From", lst)
                elif "Invoice" in sublist:
                    f_ans = get_company_name("Invoice", lst)
                else:
                    f_ans =  lst[0]
                return f_ans  
            def Get_shipping_cost(inp, lst):
                """Return the shipping cost from the first collected shipping value.

                A numeric first value is returned as is. Otherwise the value
                is looked up in ``lst`` and the following entry is returned
                if numeric, else 0. An empty ``inp`` is treated as "0".
                """
                f_ans = ''
                inpt = ''
                if inp:
                    inpt = inp[0]
                else:
                    inpt = "0"
            
                if inpt.isnumeric():
                    f_ans = inpt
                else:
                    # NOTE(review): lst.index raises ValueError when inpt is
                    # not one of the entries of lst.
                    indx = lst.index(inpt)
                    f_indx = int(indx + 1)
                    f_ans = lst[f_indx]
                    if f_ans.isnumeric():
                        f_ans = lst[f_indx]
                    else:
                        f_ans = 0
                return f_ans
            
            # Still inside the per-page loop: these are recomputed for every
            # page from the lists accumulated so far.
            # NOTE(review): nopA[0] can be a single token taken out of a
            # line, while flat_list holds whole lines, so the index() lookup
            # inside get_index_ord may raise ValueError.
            order_2 = get_index_ord(nopA[0], flat_list) 
            company_name = get_company_name_2(qrsA, flat_list)
            # NOTE(review): raises ValueError unless some line is exactly
            # "Invoice".
            purchase_2 = get_indx_purchase("Invoice", flat_list)
                
        def vat_2(inp):
            """Estimate the VAT as the gap between the two largest amounts.

            Uses the first float of every amount list. The gap is returned
            only if it is itself one of the amounts (exact float equality),
            otherwise 0. Raises IndexError with fewer than two distinct
            amounts.
            """
            ans_vat = ''
            new_set = list(set([i[0] for i in inp]))
            new_set.sort()
            if  new_set[-1] - new_set[-2] in new_set:
                ans_vat = new_set[-1] - new_set[-2] 
            else:
                ans_vat = 0
            return ans_vat
        vat_2ans = vat_2(defgA)
        shipping = int(Get_shipping_cost(uvwA, flat_list))
                
        
        # print(shipping)
        
        # min() on strings: the lexicographically smallest date text wins.
        d["date"] = min(abcA)
        # max() compares the amount lists element-wise; [0] takes the first
        # float of the largest list.
        d["total_amount"] = max(defgA)[0]
        # hijA[0][0:-1] drops the trailing '%'. A 0% rate falls back to the
        # vat_2 estimate; shipping is subtracted in both cases.
        d["vat"] = vat_2ans - shipping if int(hijA[0][0:-1]) /100 * d["total_amount"] == 0 else int(hijA[0][0:-1]) /100 * d["total_amount"] - shipping
        d["invoice_number"] = klmA[0] if klmA[0] else purchase_2
        d["purchase_order_number"] = nopA[0] if nopA[0].isnumeric() else order_2
        d["supplier_name"] = company_name
    # print("Apiyo", d)
    return d
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials
import time

# NOTE(review): a subscription key literal is committed to source here.
# Treat it as exposed: rotate it and load the replacement from the
# environment or a secret store instead of the file.
subscription_key = "0f5bb668eab84237bd5742644f3cae02"
endpoint = "https://ocr-test-for-python.cognitiveservices.azure.com/"

computervision_client = ComputerVisionClient(
    endpoint, CognitiveServicesCredentials(subscription_key))
# Despite the variable name this is a file name on the local disk, not a
# URL. read() only accepts a URL, so the file is uploaded as a stream.
remote_image_handw_text_url = "0.jpg"

# Call API with the image bytes and raw response (allows you to get the
# operation location). The request is sent inside read_in_stream(), so the
# file can be closed as soon as the call returns.
with open(remote_image_handw_text_url, "rb") as image_stream:
    recognize_handw_results = computervision_client.read_in_stream(
        image_stream, raw=True)

# Get the operation location (URL with an ID at the end) from the response
operation_location_remote = recognize_handw_results.headers[
    "Operation-Location"]
# Grab the ID from the URL
operation_id = operation_location_remote.split("/")[-1]

# Call the "GET" API once per second until the operation has left its
# pending states
while True:
    get_handw_text_results = computervision_client.get_read_result(
        operation_id)
    if get_handw_text_results.status not in ['notStarted', 'running']:
        break
    time.sleep(1)
Exemplo n.º 7
0
mlpreview(url)

# Reading text takes two SDK calls: client.read() submits the image and
# client.get_read_result() fetches the outcome. The call to read() is
# asynchronous, so in the results of get_read_result() we need to check,
# via OperationStatusCodes, whether the operation has completed before
# extracting the text data. The results include the text as well as the
# bounding box coordinates for the text.

raw = True
# Number of trailing characters of the Operation-Location value that are
# taken as the operation ID below.
numberOfCharsInOperationId = 36

# Asynchronous call.
# NOTE(review): if catch_exception() returns normally, rawHttpResponse is
# never bound and the header lookup below raises NameError. Confirm that
# catch_exception() terminates the script.
try:
    rawHttpResponse = client.read(url, raw=raw)
except Exception as e:
    catch_exception(e, url)

# Get ID from returned headers: the last numberOfCharsInOperationId
# characters of the Operation-Location value.

operationLocation = rawHttpResponse.headers["Operation-Location"]
idLocation = len(operationLocation) - numberOfCharsInOperationId
operationId = operationLocation[idLocation:]

# Wait for the result.

while True:
    result = client.get_read_result(operationId)
    if result.status not in [
            OperationStatusCodes.not_started, OperationStatusCodes.running
Exemplo n.º 8
0
'''
END - Authenticate
'''

# <snippet_read_call>
'''
OCR: Read File using the Read API, extract text - remote
This example will extract text in an image, then print results, line by line.
This API call can also extract handwriting style text (not shown).
'''
print("===== Read File - remote =====")
# Sample image that contains text
read_image_url = "https://raw.githubusercontent.com/MicrosoftDocs/azure-docs/master/articles/cognitive-services/Computer-vision/Images/readsample.jpg"

# Submit the URL; raw=True exposes the response headers, which carry the
# operation location
read_response = computervision_client.read(read_image_url, raw=True)
# </snippet_read_call>

# <snippet_read_response>
# The Operation-Location header is a URL whose final path segment is the ID
read_operation_location = read_response.headers["Operation-Location"]
operation_id = read_operation_location.rpartition("/")[2]

# Poll the "GET" API once per second while the operation is still pending
read_result = computervision_client.get_read_result(operation_id)
while read_result.status in ['notStarted', 'running']:
    time.sleep(1)
    read_result = computervision_client.get_read_result(operation_id)
Exemplo n.º 9
0
'''
END - Batch Read File - local
'''

# <snippet_read_call>
'''
Batch Read File, recognize printed text - remote
This example will extract printed text in an image, then print results, line by line.
This API call can also recognize handwriting (not shown).
'''
print("===== Batch Read File - remote =====")
# Get an image with printed text
remote_image_printed_text_url = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/printed_text.jpg"

# Call API with URL and raw response (allows you to get the operation location)
recognize_printed_results = computervision_client.read(
    remote_image_printed_text_url, language='en', raw=True)
# </snippet_read_call>

# <snippet_read_response>
# Get the operation location (URL with an ID at the end) from the response
operation_location_remote = recognize_printed_results.headers[
    "Operation-Location"]
# Grab the ID from the URL
operation_id = operation_location_remote.split("/")[-1]

# Imported here so that this snippet does not depend on an import made
# elsewhere for the pause between polls.
import time

# Call the "GET" API and wait for it to retrieve the results.
# The pending states of a read operation are spelled 'notStarted' and
# 'running'. With the capitalised spellings the test never matched and the
# loop stopped after the first poll, before the result was ready.
while True:
    get_printed_text_results = computervision_client.get_read_result(
        operation_id)
    if get_printed_text_results.status not in ['notStarted', 'running']:
        break
    # Wait between polls instead of calling the service in a tight loop.
    time.sleep(1)
Exemplo n.º 10
0
# Set credentials (key is not defined in this excerpt)
credentials = CognitiveServicesCredentials(key)

# Create client (endpoint is not defined in this excerpt)
client = ComputerVisionClient(endpoint, credentials)

# change this URL to reflect the image that you would like to test.
url = "https://azurecomcdn.azureedge.net/cvt-181c82bceabc9fab9ec6f3dca486738800e04b45a0b3c1268609c94f4d67173a/images/shared/cognitive-services-demos/analyze-image/analyze-6-thumbnail.jpg"
# image_path = "images/computer_vision_ocr.png"
lang = 'en'
# raw=True returns the raw response, whose headers carry the
# Operation-Location used below to find the operation ID.
raw = True
custom_headers = None

# Read an image from a url
rawHttpResponse = client.read(url, language=lang, custom_headers=custom_headers, raw=raw)

# Uncomment the following code and comment out the client.read(...) call
# above to read from an image stream instead
# with open(image_path, "rb") as image_stream:
#     rawHttpResponse = client.read_in_stream(
#         image=image_stream, language=lang,
#         # Raw will return the raw response which can be used to find the operation_id
#         raw=True
#     )

# Get ID from returned headers: the last path segment of Operation-Location
operationLocation = rawHttpResponse.headers["Operation-Location"]
operationId = operationLocation.split('/')[-1]

# SDK call
while True: