Пример #1
0
 def test_damaged_file_bytes_io_fails_autodetect(self, resource_group, location, form_recognizer_account, form_recognizer_account_key):
     """Expect ``ValueError`` for a stream whose bytes match no known file signature."""
     credential = AzureKeyCredential(form_recognizer_account_key)
     client = FormRecognizerClient(form_recognizer_account, credential)
     # These bytes deliberately match none of the magic file numbers.
     unrecognizable_stream = BytesIO(b"\x50\x44\x46\x55\x55\x55")
     with self.assertRaises(ValueError):
         client.begin_recognize_content(unrecognizable_stream)
Пример #2
0
    def test_content_multipage_table_span_pdf(self, resource_group, location,
                                              form_recognizer_account,
                                              form_recognizer_account_key):
        """Check the table layout recognized on each page of the multipage table PDF.

        The document is expected to produce two pages: the first with two
        tables (30x5 and 6x5) and the second with a single 24x5 table.
        """
        credential = AzureKeyCredential(form_recognizer_account_key)
        client = FormRecognizerClient(form_recognizer_account, credential)
        with open(self.multipage_table_pdf, "rb") as pdf_stream:
            poller = client.begin_recognize_content(pdf_stream)

        result = poller.result()
        self.assertEqual(len(result), 2)

        # Page 1: two tables, checked in order.
        first_page = result[0]
        self.assertEqual(first_page.page_number, 1)
        self.assertEqual(len(first_page.tables), 2)
        expected_shapes = [(30, 5), (6, 5)]
        for table, (rows, columns) in zip(first_page.tables, expected_shapes):
            self.assertEqual(table.row_count, rows)
            self.assertEqual(table.column_count, columns)
            self.assertEqual(table.page_number, 1)

        # Page 2: a single table.
        second_page = result[1]
        self.assertEqual(len(second_page.tables), 1)
        self.assertEqual(second_page.page_number, 2)
        only_table = second_page.tables[0]
        self.assertEqual(only_table.row_count, 24)
        self.assertEqual(only_table.column_count, 5)
        self.assertEqual(only_table.page_number, 2)

        self.assertFormPagesHasValues(result)
Пример #3
0
    def test_content_stream_transform_jpg(self, resource_group, location,
                                          form_recognizer_account,
                                          form_recognizer_account_key):
        """Compare the transformed form pages against the raw service response for a JPG.

        A ``cls`` callback captures both the deserialized
        ``AnalyzeOperationResult`` and the output of
        ``prepare_content_result`` so the two can be checked against each
        other once polling finishes.
        """
        credential = AzureKeyCredential(form_recognizer_account_key)
        client = FormRecognizerClient(form_recognizer_account, credential)
        with open(self.form_jpg, "rb") as image_file:
            image_bytes = image_file.read()

        captured = []

        def capture(raw_response, _, headers):
            # Store the raw result first and the transformed layout second.
            operation_result = client._client._deserialize(
                AnalyzeOperationResult, raw_response)
            captured.extend(
                [operation_result, prepare_content_result(operation_result)])

        poller = client.begin_recognize_content(image_bytes, cls=capture)
        poller.result()

        raw_result = captured[0]
        layout = captured[1]
        analysis = raw_result.analyze_result

        # Check form pages
        self.assertFormPagesTransformCorrect(
            layout, analysis.read_results, analysis.page_results)
Пример #4
0
 def test_damaged_file_passed_as_bytes_io(self, resource_group, location, form_recognizer_account, form_recognizer_account_key):
     """Expect ``HttpResponseError`` for a damaged stream that still starts with the PDF signature."""
     credential = AzureKeyCredential(form_recognizer_account_key)
     client = FormRecognizerClient(form_recognizer_account, credential)
     # The leading bytes are still correct for the data to be recognized as PDF.
     corrupt_pdf_stream = BytesIO(b"\x25\x50\x44\x46\x55\x55\x55")
     with self.assertRaises(HttpResponseError):
         client.begin_recognize_content(corrupt_pdf_stream)
 def test_content_authentication_bad_key(self, formrecognizer_test_endpoint,
                                         formrecognizer_test_api_key):
     """Expect ``ClientAuthenticationError`` when the client is built with the key ``"xxxx"``."""
     bad_credential = AzureKeyCredential("xxxx")
     client = FormRecognizerClient(formrecognizer_test_endpoint, bad_credential)
     with self.assertRaises(ClientAuthenticationError):
         client.begin_recognize_content(b"xx", content_type="application/pdf")
Пример #6
0
    def recognize_content(self):
        """Recognize the layout of a sample invoice and print what was found.

        Reads ``sample_forms/forms/Invoice_1.pdf``, submits its bytes through
        ``begin_recognize_content`` and, for every returned page, prints the
        page dimensions, each table with its cells, and each text line.
        Reads ``self.endpoint`` and ``self.key`` to build the client.
        """
        # [START recognize_content]
        from azure.core.credentials import AzureKeyCredential
        from azure.ai.formrecognizer import FormRecognizerClient
        form_recognizer_client = FormRecognizerClient(endpoint=self.endpoint, credential=AzureKeyCredential(self.key))
        with open("sample_forms/forms/Invoice_1.pdf", "rb") as f:
            poller = form_recognizer_client.begin_recognize_content(stream=f.read())
        contents = poller.result()

        for idx, content in enumerate(contents):
            # Show a 1-based page number; the raw enumerate index would label
            # the first page as "#0".
            print("----Recognizing content from page #{}----".format(idx + 1))
            print("Has width: {} and height: {}, measured with unit: {}".format(
                content.width,
                content.height,
                content.unit
            ))
            for table_idx, table in enumerate(content.tables):
                print("Table # {} has {} rows and {} columns".format(table_idx, table.row_count, table.column_count))
                for cell in table.cells:
                    print("...Cell[{}][{}] has text '{}' within bounding box '{}'".format(
                        cell.row_index,
                        cell.column_index,
                        cell.text,
                        format_bounding_box(cell.bounding_box)
                    ))
                    # [END recognize_content]
            for line_idx, line in enumerate(content.lines):
                print("Line # {} has word count '{}' and text '{}' within bounding box '{}'".format(
                    line_idx,
                    len(line.words),
                    line.text,
                    format_bounding_box(line.bounding_box)
                ))
            print("----------------------------------------")
Пример #7
0
    def test_content_continuation_token(self, resource_group, location,
                                        form_recognizer_account,
                                        form_recognizer_account_key):
        """Start a second poller from the first poller's continuation token and expect a result."""
        credential = AzureKeyCredential(form_recognizer_account_key)
        client = FormRecognizerClient(form_recognizer_account, credential)
        with open(self.form_jpg, "rb") as image_file:
            image_bytes = image_file.read()

        first_poller = client.begin_recognize_content(image_bytes)
        token = first_poller.continuation_token()

        resumed_poller = client.begin_recognize_content(
            image_bytes, continuation_token=token)
        self.assertIsNotNone(resumed_poller.result())

        # necessary so azure-devtools doesn't throw assertion error
        first_poller.wait()
Пример #8
0
 def test_content_authentication_bad_key(self, resource_group, location,
                                         form_recognizer_account,
                                         form_recognizer_account_key):
     """Expect ``ClientAuthenticationError`` when the client is built with the key ``"xxxx"``."""
     bad_credential = AzureKeyCredential("xxxx")
     client = FormRecognizerClient(form_recognizer_account, bad_credential)
     with self.assertRaises(ClientAuthenticationError):
         client.begin_recognize_content(b"xx", content_type="application/pdf")
Пример #9
0
    def test_content_multipage(self, resource_group, location, form_recognizer_account, form_recognizer_account_key):
        """Expect three populated form pages from the multipage invoice PDF."""
        credential = AzureKeyCredential(form_recognizer_account_key)
        client = FormRecognizerClient(form_recognizer_account, credential)
        with open(self.multipage_invoice_pdf, "rb") as pdf_file:
            invoice_bytes = pdf_file.read()

        pages = client.begin_recognize_content(invoice_bytes).result()

        self.assertEqual(len(pages), 3)
        self.assertFormPagesHasValues(pages)
 def test_content_bad_endpoint(self, formrecognizer_test_endpoint,
                               formrecognizer_test_api_key):
     """Expect ``ServiceRequestError`` when the client targets ``http://notreal.azure.com``."""
     with open(self.invoice_pdf, "rb") as pdf_file:
         invoice_bytes = pdf_file.read()
     with self.assertRaises(ServiceRequestError):
         # Both the client construction and the request run inside the
         # assertion context.
         credential = AzureKeyCredential(formrecognizer_test_api_key)
         unreachable_client = FormRecognizerClient(
             "http://notreal.azure.com", credential)
         unreachable_client.begin_recognize_content(invoice_bytes)
Пример #11
0
 def test_passing_bad_content_type_param_passed(self, resource_group, location, form_recognizer_account, form_recognizer_account_key):
     """Expect ``ValueError`` when ``content_type="application/jpeg"`` is passed with the invoice PDF."""
     credential = AzureKeyCredential(form_recognizer_account_key)
     client = FormRecognizerClient(form_recognizer_account, credential)
     with open(self.invoice_pdf, "rb") as pdf_file:
         invoice_bytes = pdf_file.read()
     with self.assertRaises(ValueError):
         client.begin_recognize_content(invoice_bytes, content_type="application/jpeg")
Пример #12
0
    def test_blank_page(self, resource_group, location, form_recognizer_account, form_recognizer_account_key):
        """Expect a non-``None`` result when recognizing content in the blank PDF."""
        credential = AzureKeyCredential(form_recognizer_account_key)
        client = FormRecognizerClient(form_recognizer_account, credential)

        with open(self.blank_pdf, "rb") as blank_stream:
            poller = client.begin_recognize_content(blank_stream)

        self.assertIsNotNone(poller.result())
Пример #13
0
    def test_auto_detect_unsupported_stream_content(self, resource_group, location, form_recognizer_account, form_recognizer_account_key):
        """Expect ``ValueError`` when the bytes of ``self.unsupported_content_py`` are submitted without a content type."""
        credential = AzureKeyCredential(form_recognizer_account_key)
        client = FormRecognizerClient(form_recognizer_account, credential)

        with open(self.unsupported_content_py, "rb") as source_file:
            unsupported_bytes = source_file.read()

        with self.assertRaises(ValueError):
            client.begin_recognize_content(unsupported_bytes)
Пример #14
0
    def test_content_stream_passing_url(self, resource_group, location,
                                        form_recognizer_account,
                                        form_recognizer_account_key):
        """Expect ``TypeError`` when a URL string is handed to the stream-based method."""
        credential = AzureKeyCredential(form_recognizer_account_key)
        client = FormRecognizerClient(form_recognizer_account, credential)

        with self.assertRaises(TypeError):
            client.begin_recognize_content(
                "https://badurl.jpg",
                content_type="application/json",
            )
Пример #15
0
 def test_content_authentication_successful_key(
         self, resource_group, location, form_recognizer_account,
         form_recognizer_account_key):
     """Recognize the invoice PDF with the account key and wait for the operation to finish."""
     credential = AzureKeyCredential(form_recognizer_account_key)
     client = FormRecognizerClient(form_recognizer_account, credential)
     with open(self.invoice_pdf, "rb") as pdf_file:
         invoice_bytes = pdf_file.read()
     # The test passes as long as neither the request nor the polling raises.
     client.begin_recognize_content(invoice_bytes).result()
Пример #16
0
 def test_passing_enum_content_type(self, resource_group, location, form_recognizer_account, form_recognizer_account_key):
     """Expect a non-``None`` result when the content type is given as ``FormContentType.application_pdf``."""
     credential = AzureKeyCredential(form_recognizer_account_key)
     client = FormRecognizerClient(form_recognizer_account, credential)
     with open(self.invoice_pdf, "rb") as pdf_file:
         invoice_bytes = pdf_file.read()
     poller = client.begin_recognize_content(invoice_bytes, content_type=FormContentType.application_pdf)
     self.assertIsNotNone(poller.result())
    def recognize_content(self):
        """Recognize the content of the selection-mark sample form and print it.

        For every returned page this prints the page dimensions, each table
        (size, location and cells), each text line with its words, a note for
        lines styled as handwriting with confidence above 0.8, and each
        selection mark.
        """
        sample_path = os.path.abspath(
            os.path.join(
                os.path.abspath(__file__),
                "..",
                "./sample_forms/forms/form_selection_mark.png",
            )
        )
        # [START recognize_content]
        from azure.core.credentials import AzureKeyCredential
        from azure.ai.formrecognizer import FormRecognizerClient

        endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
        key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

        credential = AzureKeyCredential(key)
        form_recognizer_client = FormRecognizerClient(endpoint=endpoint, credential=credential)
        with open(sample_path, "rb") as form_file:
            poller = form_recognizer_client.begin_recognize_content(form=form_file)
        form_pages = poller.result()

        # Count pages from 1 so the printed number is human-friendly.
        for page_number, page in enumerate(form_pages, start=1):
            print("----Recognizing content from page #{}----".format(page_number))
            print("Page has width: {} and height: {}, measured with unit: {}".format(
                page.width,
                page.height,
                page.unit
            ))

            for table_idx, table in enumerate(page.tables):
                print("Table # {} has {} rows and {} columns".format(
                    table_idx,
                    table.row_count,
                    table.column_count
                ))
                print("Table # {} location on page: {}".format(
                    table_idx,
                    format_bounding_box(table.bounding_box)
                ))
                for cell in table.cells:
                    print("...Cell[{}][{}] has text '{}' within bounding box '{}'".format(
                        cell.row_index,
                        cell.column_index,
                        cell.text,
                        format_bounding_box(cell.bounding_box)
                    ))

            for line_idx, line in enumerate(page.lines):
                print("Line # {} has word count '{}' and text '{}' within bounding box '{}'".format(
                    line_idx,
                    len(line.words),
                    line.text,
                    format_bounding_box(line.bounding_box)
                ))
                # Only lines carrying appearance data can be flagged as handwriting.
                if (line.appearance
                        and line.appearance.style_name == "handwriting"
                        and line.appearance.style_confidence > 0.8):
                    print("Text line '{}' is handwritten and might be a signature.".format(line.text))
                for word in line.words:
                    print("...Word '{}' has a confidence of {}".format(word.text, word.confidence))

            for mark in page.selection_marks:
                print("Selection mark is '{}' within bounding box '{}' and has a confidence of {}".format(
                    mark.state,
                    format_bounding_box(mark.bounding_box),
                    mark.confidence
                ))
            print("----------------------------------------")
Пример #18
0
    def test_content_stream_pdf(self, resource_group, location, form_recognizer_account, form_recognizer_account_key):
        """Expect the invoice PDF to yield one populated page whose first table is 2x6."""
        credential = AzureKeyCredential(form_recognizer_account_key)
        client = FormRecognizerClient(form_recognizer_account, credential)
        with open(self.invoice_pdf, "rb") as pdf_file:
            invoice_bytes = pdf_file.read()

        pages = client.begin_recognize_content(invoice_bytes).result()
        self.assertEqual(len(pages), 1)

        only_page = pages[0]
        self.assertEqual(only_page.page_number, 1)
        self.assertFormPagesHasValues(pages)

        first_table = only_page.tables[0]
        self.assertEqual(first_table.row_count, 2)
        self.assertEqual(first_table.column_count, 6)
        self.assertEqual(first_table.page_number, 1)
Пример #19
0
def main():
    """Run content recognition on a sample invoice image and display the annotated result.

    Loads environment variables from ``../../.env``, sends
    ``../../data/invoice2.png`` to the Form Recognizer service, passes the
    image and the recognition result to ``draw_blocks`` and shows the returned
    image in an OpenCV window until a key is pressed.

    Raises:
        KeyError: If ``COGNITIVE_SERVICE_KEY`` is not defined in the
            environment after the ``.env`` file has been loaded.
    """
    # The same file is uploaded to the service and loaded for drawing.
    image_path = "../../data/invoice2.png"

    load_dotenv("../../.env")
    # Index the environment directly so a missing key fails here, naming the
    # variable, instead of handing None to AzureKeyCredential.
    credential = AzureKeyCredential(os.environ["COGNITIVE_SERVICE_KEY"])

    form_recognizer_client = FormRecognizerClient(
        endpoint="https://ocrdemo1.cognitiveservices.azure.com/",
        credential=credential
    )
    with open(image_path, "rb") as f:
        invoice = f.read()
    poller = form_recognizer_client.begin_recognize_content(invoice)
    pages = poller.result()

    img = imread(image_path)
    img = draw_blocks(img, pages)
    cv2.imshow("img", img)
    cv2.waitKey(0)
Пример #20
0
    def recognize_content(self):
        """Recognize the content of the sample invoice PDF and print it.

        For every returned page this prints the page dimensions, each table
        with its cells, and each text line together with the confidence of
        every word on it.
        """
        sample_path = os.path.abspath(
            os.path.join(os.path.abspath(__file__), "..", "./sample_forms/forms/Invoice_1.pdf")
        )
        # [START recognize_content]
        from azure.core.credentials import AzureKeyCredential
        from azure.ai.formrecognizer import FormRecognizerClient

        endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
        key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

        credential = AzureKeyCredential(key)
        form_recognizer_client = FormRecognizerClient(endpoint=endpoint, credential=credential)
        with open(sample_path, "rb") as form_file:
            poller = form_recognizer_client.begin_recognize_content(form=form_file)
        form_pages = poller.result()

        # Count pages from 1 so the printed number is human-friendly.
        for page_number, page in enumerate(form_pages, start=1):
            print("----Recognizing content from page #{}----".format(page_number))
            print("Page has width: {} and height: {}, measured with unit: {}".format(
                page.width, page.height, page.unit))
            for table_idx, table in enumerate(page.tables):
                print("Table # {} has {} rows and {} columns".format(
                    table_idx, table.row_count, table.column_count))
                for cell in table.cells:
                    cell_box = format_bounding_box(cell.bounding_box)
                    print("...Cell[{}][{}] has text '{}' within bounding box '{}'".format(
                        cell.row_index, cell.column_index, cell.text, cell_box))
                    # [END recognize_content]
            for line_idx, line in enumerate(page.lines):
                line_box = format_bounding_box(line.bounding_box)
                print("Line # {} has word count '{}' and text '{}' within bounding box '{}'".format(
                    line_idx, len(line.words), line.text, line_box))
                for word in line.words:
                    print("...Word '{}' has a confidence of {}".format(word.text, word.confidence))
            print("----------------------------------------")