Example No. 1
def pdf_ocr(bucket_name, blob_names, data_dir="raw_ocr/"):
    mime_type = 'application/pdf'
    # How many pages should be grouped into each json output file.
    batch_size = 2
    client = vision.ImageAnnotatorClient()
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    async_requests = []

    print("Reading file(s): ")
    for blob_name in blob_names:
        source_uri = "gs://" + bucket_name + '/' + blob_name
        print("Reading {}...".format(source_uri))
        gcs_source = vision.GcsSource(uri=source_uri)
        input_config = vision.InputConfig(gcs_source=gcs_source,
                                          mime_type=mime_type)

        destination_uri = "gs://" + bucket_name + '/' + data_dir + blob_name
        print("Saving raw ocr data to {}...".format(destination_uri))
        gcs_destination = vision.GcsDestination(uri=destination_uri)
        output_config = vision.OutputConfig(gcs_destination=gcs_destination,
                                            batch_size=batch_size)

        async_request = vision.AsyncAnnotateFileRequest(
            features=[feature],
            input_config=input_config,
            output_config=output_config)

        async_requests.append(async_request)

    operation = client.async_batch_annotate_files(requests=async_requests)

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)
    print('Done')
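
A minimal usage sketch for the helper above (the bucket and blob names are placeholders, not from the original):

pdf_ocr(
    bucket_name="my-bucket",                   # placeholder bucket
    blob_names=["report.pdf", "invoice.pdf"],  # placeholder PDF blobs
    data_dir="raw_ocr/",                       # prefix for the JSON output
)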
Example No. 2
def async_detect_document(gcs_source_uri, gcs_destination_uri, number_of_pages):
    """OCR with PDF/TIFF as source files on GCS"""
    import json
    import re
    import os
    from google.cloud import vision
    from google.cloud import storage
    from google.protobuf.json_format import MessageToJson

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = number_of_pages

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)
    
    
    outputFileName = 'Json' + os.path.basename(gcs_source_uri) + '.json'

    # Download the converted object.
    storage_client = storage.Client()

    # Location in the cloud bucket.
    bucket_name = "mypdf_1"
    bucket = storage_client.bucket(bucket_name)

    blobs = [(blob, blob.updated)
             for blob in storage_client.list_blobs(bucket_name)]
    # Sort by the 'updated' timestamp and grab the most recent blob.
    latest = sorted(blobs, key=lambda tup: tup[1])[-1][0]
    string_data = latest.download_as_string()
    json_data = json.loads(string_data)
    with open(outputFileName, 'w') as outfile:
        json.dump(json_data, outfile)

    print("Successfully created the json file")
    return json_data
Example No. 3
def async_detect_document(gcs_source_uri, gcs_destination_uri, file_type):
    """OCR with PDF/TIFF as source files on GCS"""

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = file_type

    # How many pages should be grouped into each json output file.
    batch_size = 1

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source,
                                      mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(gcs_destination=gcs_destination,
                                        batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=1, the first response contains
    # only the first page of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json.loads(json_string)

    # The actual response for the first page of the input file.
    first_page_response = response['responses'][0]
    annotation = first_page_response['fullTextAnnotation']
    return annotation
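
The returned annotation is a plain dict parsed from the JSON output. Besides annotation['text'], it nests pages/blocks/paragraphs/words/symbols; a sketch of walking that structure, assuming the camelCase JSON layout these examples produce (URIs are placeholders):

annotation = async_detect_document('gs://my-bucket/doc.pdf',
                                   'gs://my-bucket/ocr-out/',
                                   'application/pdf')
print(annotation['text'])  # full text of the first page

# Reassemble each word from its symbols, page by page.
for page in annotation.get('pages', []):
    for block in page.get('blocks', []):
        for paragraph in block.get('paragraphs', []):
            for word in paragraph.get('words', []):
                print(''.join(s['text'] for s in word.get('symbols', [])))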
Example No. 4
def detect_text_from_pdf(gcs_source_uri, gcs_destination_uri):

    mime_type = 'application/pdf'
    batch_size = 50

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=10000)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)
    blob_list = list(bucket.list_blobs(prefix=prefix))

    # Process the first output file from GCS. Since we specified
    # batch_size=50, the first output file contains up to the first
    # 50 pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json.loads(json_string)
    
    full_text = ''
    for res in response['responses']:
        full_text = full_text + res['fullTextAnnotation']['text']

    return parse_text(full_text)
Example No. 5
def detect_pdf_text(gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS"""

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 50

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source,
                                      mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(gcs_destination=gcs_destination,
                                        batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    return blob_list
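
Since detect_pdf_text returns the output blobs rather than text, a caller can assemble the full text itself. A sketch, assuming each blob holds the JSON written above (URIs are placeholders):

blob_list = detect_pdf_text('gs://my-bucket/doc.pdf', 'gs://my-bucket/ocr-out/')
full_text = ''
for blob in blob_list:
    response = json.loads(blob.download_as_string())
    for page in response.get('responses', []):
        full_text += page.get('fullTextAnnotation', {}).get('text', '')
print(full_text)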
Example No. 6
def from_document(source_uri: str, destination_uri: str):
    mime_type = 'application/pdf'  # Supported mime_types are: 'application/pdf' and 'image/tiff'
    batch_size = 2  # How many pages should be grouped into each json output file.

    client = vision.ImageAnnotatorClient()
    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION  # The feature we are going to use
    )
    gcs_source = vision.GcsSource(uri=source_uri)  # The source of the files
    input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)  # Configuring the operation
    gcs_destination = vision.GcsDestination(uri=destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)
    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)
    operation = client.async_batch_annotate_files(
        requests=[async_request])
    print('Waiting for the operation to finish.')
    operation.result(timeout=360)
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    blob_names = [blob.name for blob in blob_list]

    # Download every output file. Since we specified batch_size=2,
    # each output file holds up to two page responses.
    text_response = []
    for output in blob_list:
        json_string = output.download_as_string()
        response = json.loads(json_string)

        # Collect the text annotation of every page in this batch.
        for page_response in response['responses']:
            text_response.append(page_response['fullTextAnnotation']['text'])
    return text_response, blob_names
Example No. 7
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    mime_type = 'application/pdf'
    batch_size = 100
    client = vision.ImageAnnotatorClient()
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source,
                                      mime_type=mime_type)
    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(gcs_destination=gcs_destination,
                                        batch_size=batch_size)
    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config)
    operation = client.async_batch_annotate_files(requests=[async_request])
    print('Waiting for operation to finish')
    operation.result(timeout=400)
Example No. 8
def async_detect_document(gcs_source_uri, gcs_destination_uri, run_ocr=False):
    """OCR with PDF/TIFF as source files on GCS"""
    import json
    import re
    from google.cloud import vision
    from google.cloud import storage

    # By default only the existing output files under gcs_destination_uri
    # are parsed; pass run_ocr=True to submit a new OCR request first.
    if run_ocr:

        # Supported mime_types are: 'application/pdf' and 'image/tiff'
        mime_type = 'application/pdf'

        client = vision.ImageAnnotatorClient()

        feature = vision.Feature(
            type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

        gcs_source = vision.GcsSource(uri=gcs_source_uri)
        input_config = vision.InputConfig(gcs_source=gcs_source,
                                          mime_type=mime_type)

        gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
        output_config = vision.OutputConfig(gcs_destination=gcs_destination)

        async_request = vision.AsyncAnnotateFileRequest(
            features=[feature],
            input_config=input_config,
            output_config=output_config)

        operation = client.async_batch_annotate_files(requests=[async_request])

        print('Waiting for the operation to finish.')
        operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Download every output file and print the text of each page
    # that produced a fullTextAnnotation.
    print('Full text:\n')

    for output in blob_list:

        json_string = output.download_as_string()
        response = json.loads(json_string)

        for page in response['responses']:
            if 'fullTextAnnotation' in page.keys():
                print(page['fullTextAnnotation']['text'])
Example No. 9
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file
    # (value assumed; the original left it undefined).
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(batch_size=batch_size, gcs_destination=gcs_destination)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    json_string = json_string.decode('iso8859-1')
    response = json.loads(json_string)

    # The actual response for the first page of the input file.
    first_page_response = response['responses'][0]
    annotation = first_page_response['fullTextAnnotation']

    #NAMEHINDI
    text = {}
    text['raw'] = annotation['text']
    namehindi = None
    try:
        newlist1 = []
        for xx in annotation['text'].split('\n'):
            newlist1.append(xx)
            newlist1 = list(filter(lambda x: len(x) > 1, newlist1))
        a = 0
        str2 = "To"

        for no in newlist1:
            if str2 in no:
                b = a
            a = a + 1
        namehindi = newlist1[b + 1]
        text['namehindi'] = namehindi

    # #NAMEENGLISH
    #     translator = Translator()
    #     print(namehindi)
    #     c = translator.translate(namehindi,dest='en')
    #     wordlist = text['raw'].split("\n")
    #
    #     name = get_close_matches(c.text, wordlist)
    #     text['NameEnglish'] = name[0]


    #GENDER
        gender = []
        female_str = {"Female", "महिला", "FEMALE", "స్త్రీ"}
        male_str = {"Male", "పురుషుడు", "MALE", "ਮਰਦ", "पुरुष", "male"}
        for wordlist in text['raw'].split('\n'):
            for g in female_str:
                if re.search(g, wordlist):

                    if g not in gender:
                        print(g)
                        gender.append(g)
        if not gender:

            for wordlist in text['raw'].split('\n'):
                for g in male_str:
                    if re.search(g, wordlist):
                        if g not in gender:
                            print("MALE" + g)
                            gender.append(g)

        if gender[0] == "MALE" or gender[0] == "FEMALE" or gender[0]=="Male":
            gender_string = gender[1] + "/" + gender[0]
        else:
            gender_string = gender[0] + "/" + gender[1]

        text["gender string"] = gender_string

    # Download date
        match = re.search(r'Dow\w+ Date[ :]*\d+[ -/]\d+[ -/]\d+', text['raw'])
        if match is not None:
            text["Downloaddate"] = match.group()

    # Issue date
        m = re.search(r'Iss\w+ Date[ :]*\d+[ -/]\d+[ -/]\d+', text['raw'])
        if m is not None:
            text["Issuedate"] = m.group()

    #ENG ADDRESS
        addres_hin = None
        try:
            newlist = []
            for xx in text['raw'].split('\n'):
                newlist.append(xx)
                newlist = list(filter(lambda x: len(x) > 0, newlist))
                a = 0
                str = "Address:"

                for no in newlist:
                    a = a + 1
                    c = re.search(r"(?<!\d)\d{6}(?!\d)", no)
                    # r"\(\d[- \d()]\d", line)[0]

                    if c:
                        d = a
                    if str in no:
                        b = a

            addres_hin = newlist[b]
            while (b < d - 1):
                addres_hin = addres_hin + "\n" + newlist[b + 1]
                b = b + 1

        except Exception:
            pass
        text['engAddress'] = addres_hin

    #VID
        g = None
        try:
            newlist = []
            for xx in text['raw'].split('\n'):
                newlist.append(xx)
            newlist = list(filter(lambda x: len(x) > 12, newlist))
            for no in newlist:
                if re.match("^[VID : 0-9]+$", no):
                    g = no
                    g = g.replace("VID:", "")
                    g = g.replace(" ", "")

                    g = ' '.join(re.findall(r'.{1,4}', g))


        except Exception:
            pass
        text['VID'] = g
    #ADHAAR NO
        aadharno = None
        try:
            newlist = []
            str = "XXXX"
            for xx in text['raw'].split('\n'):
                newlist.append(xx)
                newlist = list(filter(lambda x: len(x) > 11, newlist))
                for word in newlist:
                    if re.match("^[0-9 ]+$", word) or str in word and len(word) == 12:
                        aadharno = word
                        aadharno = aadharno.replace(" ", "")
                        aadharno = ' '.join(re.findall(r'.{1,4}', aadharno))
        except Exception:
            pass
        text['Adhaar no'] = aadharno

    # DOB
        birth_str = {"जन्म तिथि", "DOB", "ਜਨਮ ਮਿਤੀ", "పుట్టిన తేదీ", "DOB:", "పుట్టిన తిథి:", "Date of Birth"}
        for i in birth_str:
            for wordlist in text['raw'].split('\n'):
                if re.search(i, wordlist):
                    text["DOB"] = wordlist

        # address hindi
        address = None
        try:
            newlist = []
            for xx in text['raw'].split('\n'):
                newlist.append(xx)
                newlist = list(filter(lambda x: len(x) > 0, newlist))
                a = 0
                str_a=""
                str = ["पता:", "ਪਤਾ:","पत्ता","पत्ता:","చిరునామా:"]
                b = 0
                d = 0
                for no in newlist:
                    a = a + 1
                    c = re.search(r"(?<!\d)\d{6}(?!\d)", no)

                    if c:
                        d = a
                    for i in str:
                        if i in no:
                            str_a = i
                            b = a
                            print(b)
                    if d > b and b != 0:
                        break
                if d > b and b != 0:
                    break

            address = newlist[b]
            while (b < d - 1):
                address = address + "\n" + newlist[b + 1]
                b = b + 1
            text['hindiAddress'] = str_a + "\n" + address
        except Exception:
            traceback.print_exc()

        phone = None
        try:

            newlist = []

            for xx in text['raw'].split('\n'):
                newlist.append(xx)
                newlist = list(filter(lambda x: len(x) > 5, newlist))
                for word in newlist:
                    if re.match("^[0-9 ]+$", word) and len(word) == 10:
                        phone = word

            text['mobile no'] = phone
        except Exception:
            pass

    except Exception:
        traceback.print_exc()

    return text
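
Depending on what the OCR text matched, the dict returned above may carry the keys 'raw', 'namehindi', 'gender string', 'Downloaddate', 'Issuedate', 'engAddress', 'VID', 'Adhaar no', 'DOB', 'hindiAddress' and 'mobile no'. A usage sketch (URIs are placeholders):

fields = async_detect_document('gs://my-bucket/aadhaar.pdf',
                               'gs://my-bucket/ocr-out/')
for key in ('namehindi', 'gender string', 'DOB', 'Adhaar no', 'mobile no'):
    print(key, '->', fields.get(key))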
Example No. 10
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS"""
    import json
    import re
    from google.cloud import vision
    from google.cloud import storage

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source,
                                      mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(gcs_destination=gcs_destination,
                                        batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix, filtering out folders.
    blob_list = [
        blob for blob in list(bucket.list_blobs(prefix=prefix))
        if not blob.name.endswith('/')
    ]
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json.loads(json_string)

    # The actual response for the first page of the input file.
    first_page_response = response['responses'][0]
    annotation = first_page_response['fullTextAnnotation']

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print('Full text:\n')
    print(annotation['text'])
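
One pitfall when several output files are produced: list_blobs returns names in lexical order, so 'output-10-to-11.json' sorts before 'output-2-to-3.json'. A small sketch that orders blobs by the starting page encoded in the usual 'output-X-to-Y.json' suffix (the naming pattern is assumed from these examples):

def sort_output_blobs(blob_list):
    def start_page(blob):
        # Pull X out of names like '...output-3-to-4.json'; unmatched
        # names sort first.
        m = re.search(r'output-(\d+)-to-\d+', blob.name)
        return int(m.group(1)) if m else 0
    return sorted(blob_list, key=start_page)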
Example No. 11
def async_detect_document(userName, fileName):

    from google.cloud import vision
    from google.cloud import storage
    import json
    # from google.protobuf import json_format

    # For inspecting the result during testing:
    import sys
    # sys.stdout = open('ocr_pdf_result.txt','w')
    
    bucketName = "graduation_bucket"

    client = vision.ImageAnnotatorClient()

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 20

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source_uri = f"gs://{bucketName}/{fileName}"

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    # Check whether the folder gets created automatically.
    gcs_destination_uri = f"gs://{bucketName}/results/{userName}/{fileName}/"
    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)

    # batch_size can be passed here; per the comment above, the default is 20.
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    # print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # print("done")

    #==============================================================
    # Extract the fullText
    storage_client = storage.Client()

    prefix = f"results/{userName}/{fileName}/"
    bucket = storage_client.get_bucket(bucketName)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    # print('Output files:')
    for blob in blob_list:
        # print(blob.name)
        json_string = blob.download_as_string()
        response = json.loads(json_string)

        for page_response in response['responses']:
            globalVariable.fullText += page_response['fullTextAnnotation']['text']

    # print('Full text:\n')
    # print(globalVariable.fullText)

    return
Example No. 12
# Upload all the PDFs to a GCP bucket, then create blob_list.txt with the
# GCS location of each PDF on its own line; each PDF gets a matching
# prefix for its expected json response.
# On Windows: 'gsutil ls gs://$BUCKET_NAME/*.pdf > blob_list.txt' to save the file list
pdf_list = []
json_list = []
with open('blob_list.txt', 'r') as file1:
    for line in file1:
        temp_obj = line.strip()
        pdf_list.append(temp_obj)
        json_list.append(temp_obj[0:-4] + '-')

# Supported mime_types are: 'application/pdf' and 'image/tiff'
mime_type = 'application/pdf'

# Number of pages grouped into each json output file - Make this greater than the max document length so it fits in 1 json response.
batch_size = 100
client = vision.ImageAnnotatorClient()
feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

# Loops over all the items in the pdf_list
for i in range(0, len(pdf_list)):
    # Example gcs_source_uri = 'gs://BUCKET_NAME/225423441-Roberson-Joseph-A078-360-606-BIA-Nov-18-2013.pdf'
    gcs_source_uri = pdf_list[i]
    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source,
                                      mime_type=mime_type)

    # Example gcs_destination_uri = 'gs://BUCKET_NAME/225423441-Roberson-Joseph-A078-360-606-BIA-Nov-18-2013-output-X-to-Y.json'
    gcs_destination_uri = json_list[i]
    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(gcs_destination=gcs_destination,
                                        batch_size=batch_size)
    # Submit the request as in the examples above.
    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(requests=[async_request])
    print('Waiting for the operation to finish.')
    operation.result(timeout=420)
Example No. 13
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """
    Annotates PDF document with text detection and saves .txt file to local folder

    Parameters
    ----------
    gcs_source_uri : GCS path to the source PDF

    gcs_destination_uri : GCS prefix for the JSON output, which is then
        written out to local .txt files

    Returns
    -------
    None.

    """
    client = vision.ImageAnnotatorClient()

    batch_size = 10
    mime_type = 'application/pdf'
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source,
                                      mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(gcs_destination=gcs_destination,
                                        batch_size=batch_size)
    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(requests=[async_request])
    operation.result(timeout=180)

    storage_client = storage.Client()
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)
    bucket = storage_client.get_bucket(bucket_name)

    blob_list = list(bucket.list_blobs(prefix=prefix))

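    # Note: this loop starts at index 1, so blob_list[0] is never read;
    # whether that first blob should be skipped depends on what else is
    # stored under the destination prefix.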
    for n in range(1, len(blob_list)):
        output = blob_list[n]
        json_string = output.download_as_string()
        try:
            response = json.loads(json_string)

            first_page_response = response['responses'][0]
            annotation = first_page_response['fullTextAnnotation']

            # Write the first page's text of this output file locally.
            with open('chart{}.txt'.format(n), 'w+') as file:
                file.write(annotation['text'])
        except json.JSONDecodeError:
            print('jsondecode')
    return None
Example No. 14
def p2a_ocr_pdf(bucket, pdf_blob):
    """
    https://cloud.google.com/vision/docs/pdf
    """

    # define the input config
    gcs_source_uri = "gs://{}/{}".format(bucket.name, pdf_blob.name)

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source,
                                      mime_type=mime_type)

    # define output config
    pdf_id = pdf_blob.name.replace(".pdf",
                                   "")[:4]  # use the first 4 chars as pdf_id
    gcs_destination_uri = "gs://{}/{}".format(bucket.name, pdf_id + "_")

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(gcs_destination=gcs_destination,
                                        batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Once the request has completed, the output files under
    # gcs_destination_uri can be listed and parsed as in the examples above.

    # convert PDF to PNG files for annotation
    if ANNOTATION_MODE:
        convert_pdf2png(bucket, pdf_blob)
Example No. 15
def loadPdfText(fPath):
    gcs_source_uri = "gs://revaise.appspot.com/" + fPath
    print(gcs_source_uri)
    #gcs_source_uri = "gs://revaise.appspot.com/images/picture-ScienceVideo Game Deep RL0.8575049874802985"
    gcs_destination_uri = "gs://revaise.appspot.com/TextOutput/"
    gcs_destination = vision.GcsSource(uri=gcs_destination_uri)
    mime_type = "application/pdf"
    # mime_type = "image/png"
    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    print(gcs_source_uri)
    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source,
                                      mime_type=mime_type)
    print(input_config)
    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(gcs_destination=gcs_destination,
                                        batch_size=batch_size)
    print(output_config)
    import time
    time.sleep(1)
    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    storage_client = storage.Client()
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)
    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    txt = ""
    for output in blob_list:

        json_string = output.download_as_string()
        if json_string != b'':
            response = json.loads(json_string)

            # Append the text of every page response in this output file.
            for j in response['responses']:
                txt += j['fullTextAnnotation']['text'] + " "
    with open("park.txt", "w+") as f:
        f.write(txt)
    for blob in blob_list:
        blob.delete()
    print(txt)
    return txt