def test_analyze_labels_when_lookup_path_invalid(self):
    # Expecting failure when lookup path is invalid
    result, _ = autolabeling.analyze_labels(self.gt_path, self.file_path, self.analyze_result, self.key_field_names, "test")
    assert len(result['labels']) == 0
def test_analyze_labels_when_key_field_names_invalid(self):
    # Expecting failure when key field names are invalid
    result, _ = autolabeling.analyze_labels(self.gt_path, self.file_path, self.analyze_result, "", self.lookup_path)
    assert len(result['labels']) == 0
def test_analyze_labels_when_file_name_invalid(self):
    # Expecting failure when file name is invalid
    result, _ = autolabeling.analyze_labels(self.gt_path, [], self.analyze_result, self.key_field_names, self.lookup_path)
    assert result is None
def test_analyze_labels_when_gt_path_invalid(self):
    # Expecting failure when gt path is invalid
    result, _ = autolabeling.analyze_labels("test", self.file_path, self.analyze_result, self.key_field_names, self.lookup_path)
    assert result is None
def test_analyze_labels_when_valid(self):
    # Expecting success when all parameters are valid
    result, _ = autolabeling.analyze_labels(self.gt_path, self.file_path, self.analyze_result, self.key_field_names, self.lookup_path)
    assert result is not None
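# The tests above reference fixtures (self.gt_path, self.file_path,
# self.analyze_result, self.key_field_names, self.lookup_path) that are set up
# elsewhere in the suite. A minimal setUp sketch follows; every path and field
# name in it is a hypothetical placeholder rather than the project's real test
# data, and it assumes json is imported at module level as in the code under test.
def setUp(self):
    # Hypothetical placeholders only: swap in the suite's real fixture values.
    self.gt_path = "ground_truth/gt.csv"              # ground-truth file (placeholder)
    self.lookup_path = "lookup/fields_lookup.csv"     # lookup file (placeholder)
    self.file_path = "invoices/sample_invoice.pdf"    # document blob name (placeholder)
    self.key_field_names = ["invoice_date", "total"]  # key fields (placeholder)
    with open("ocr/sample_invoice.ocr.json") as f:    # stored OCR output (placeholder)
        self.analyze_result = json.load(f)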
def run(self, blob_name, skip_status_table=False, gt_df=None):
    """Run layout OCR on a document blob, then auto-label it.

    A document moves through statuses 'new' -> 'ocr-done' -> 'done' in the
    status table. When skip_status_table is True, the table is neither read
    nor updated. If gt_df is given, it is passed to autolabeling in place of
    app_settings.gt_path.
    """
    if self.container_client is not None and self.table_service is not None:
        file_content = storage_helpers.download_blob(self.container_client, blob_name)
        # Check document status to see if it was already processed
        doctype = blob_name.split('/')[0]
        file_name = blob_name.split('/')[-1]
        status = "new"
        if not skip_status_table:
            status = storage_helpers.query_entity_status(self.table_service, self.app_settings.status_table, doctype, file_name)
        # If status is 'done' we do nothing; if status is 'ocr-done' we only find labels
        if status != 'done':
            ocr_output_path = blob_name + '.ocr.json'
            analyze_result = None
            if status != 'ocr-done':
                # Creating OCR file for document
                logging.info(f"Creating OCR file for document {blob_name}...")
                analyze_result = fr_helpers.analyze_layout(self.app_settings.fr_region, self.app_settings.fr_key, file_content, blob_name)
                analyze_result_string = json.dumps(analyze_result)
                storage_helpers.upload_blob(self.container_client, ocr_output_path, analyze_result_string)
                # Updating status
                if not skip_status_table:
                    entity = {'PartitionKey': doctype, 'RowKey': file_name, 'status': 'ocr-done'}
                    if storage_helpers.insert_or_replace_entity(self.table_service, self.app_settings.status_table, entity):
                        logging.info(f"Updated {blob_name} status in status table.")
                    else:
                        logging.error(f"Could not update {blob_name} status in status table.")
            else:
                logging.info(f"OCR file for document {blob_name} already created, getting it from storage.")
                ocr_file = storage_helpers.download_blob(self.container_client, ocr_output_path, 'text')
                if ocr_file is not None:
                    analyze_result = json.loads(ocr_file)
            # Creating labels file for document
            if analyze_result is not None:
                key_field_names = self.fields
                labels_result, keys = autolabeling.analyze_labels(gt_df if gt_df is not None else self.app_settings.gt_path, blob_name, analyze_result, key_field_names, self.app_settings.lookup_path)
                logging.info(keys)
                if labels_result is not None and len(keys) > 1:
                    labels_output_path = blob_name + '.labels.json'
                    labels_result_string = json.dumps(labels_result)
                    storage_helpers.upload_blob(self.container_client, labels_output_path, labels_result_string)
                    # Updating status
                    if not skip_status_table:
                        entity = {'PartitionKey': doctype, 'RowKey': file_name, 'status': 'done'}
                        if storage_helpers.insert_or_replace_entity(self.table_service, self.app_settings.status_table, entity):
                            logging.info(f"Updated {blob_name} status in status table.")
                        else:
                            logging.error(f"Could not update {blob_name} status in status table.")
            else:
                logging.error(f"Could not continue processing for blob {blob_name} as analyze result is missing.")
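# A minimal usage sketch. The enclosing class is not named in this section, so
# `Processor`, its constructor argument, and the blob name below are all
# assumptions for illustration, not names taken from this file.
#
#     processor = Processor(app_settings)  # hypothetical class name and ctor
#     # Full pipeline: OCR the blob, then write <blob_name>.labels.json
#     processor.run("invoices/sample_invoice.pdf")
#     # Reprocess without touching the status table, labeling against an
#     # in-memory ground-truth DataFrame instead of app_settings.gt_path
#     processor.run("invoices/sample_invoice.pdf", skip_status_table=True, gt_df=gt_df)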