    def run(self, blob_name, skip_status_table=False, gt_df=None):
        if self.container_client is not None and self.table_service is not None:
            file_content = storage_helpers.download_blob(self.container_client, blob_name)

            # Check document status to see if it was already processed
            doctype = blob_name.split('/')[0]
            file_name = blob_name.split('/')[-1]
            status = "new"

            if not skip_status_table:
                status = storage_helpers.query_entity_status(self.table_service, self.app_settings.status_table, doctype, file_name)
            # If status is "done", we do nothing; if status is "ocr-done", we only find labels
            if status != 'done':

                ocr_output_path = blob_name + '.ocr.json'
                if status != 'ocr-done':
                    # Creating OCR file for document
                    logging.info(f"Creating OCR file for document {blob_name}...")
                    analyze_result = fr_helpers.analyze_layout(self.app_settings.fr_region, self.app_settings.fr_key, file_content, blob_name)
                    analyze_result_string = json.dumps(analyze_result)
                    storage_helpers.upload_blob(self.container_client, ocr_output_path, analyze_result_string)
                    # Updating status
                    if not skip_status_table:
                        entity = {'PartitionKey': doctype, 'RowKey': file_name, 'status': 'ocr-done'}
                        if storage_helpers.insert_or_replace_entity(self.table_service, self.app_settings.status_table, entity):
                            logging.info(f"Updated {blob_name} status in status table.")
                        else:
                            logging.error(f"Could not update {blob_name} status in status table.")
                else:
                    logging.info(f"OCR file for document {blob_name} already created, getting it from storage.")
                    ocr_file = storage_helpers.download_blob(self.container_client, ocr_output_path, 'text')
                    # Fall back to None if the OCR file could not be downloaded,
                    # so analyze_result is always bound before the check below
                    analyze_result = json.loads(ocr_file) if ocr_file is not None else None

                # Creating labels file for document
                if analyze_result is not None:
                    key_field_names = self.fields
                    labels_result, keys = autolabeling.analyze_labels(gt_df if gt_df is not None else self.app_settings.gt_path, blob_name, analyze_result, key_field_names, self.app_settings.lookup_path)
                    logging.info(keys)
                    if labels_result is not None and len(keys) > 1:
                        labels_output_path = blob_name + '.labels.json'
                        labels_result_string = json.dumps(labels_result)
                        storage_helpers.upload_blob(self.container_client, labels_output_path, labels_result_string)
                        # Updating status
                        if not skip_status_table:
                            entity = {'PartitionKey': doctype, 'RowKey': file_name, 'status': 'done'}
                            if storage_helpers.insert_or_replace_entity(self.table_service, self.app_settings.status_table, entity):
                                logging.info(f"Updated {blob_name} status in status table.")
                            else:
                                logging.error(f"Could not update {blob_name} status in status table.")

                else:
                    logging.error(f"Could not continue processing for blob {blob_name} as analyze result is missing.")
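
A hedged usage sketch of this method; the Pipeline class name, the app_settings object, and the blob path below are illustrative assumptions, not names from the source:

    # Hypothetical driver, assuming run() lives on a Pipeline-style class
    # that wires up container_client and table_service from app settings.
    pipeline = Pipeline(app_settings)
    pipeline.run("invoices/sample-001.pdf")                           # full pass, status tracked
    pipeline.run("invoices/sample-001.pdf", skip_status_table=True)   # ad-hoc rerun, no status lookup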
Example #2
    def train_and_save(self, doctype, training_data_path, use_label_file):

        training_type = "supervised" if use_label_file else "unsupervised"

        logging.info(f"{training_type} training started.")

        train_response = fr_helpers.train_model(self.app_settings.fr_region,
                                                self.app_settings.fr_key,
                                                training_data_path, doctype,
                                                use_label_file)

        if train_response is not None:

            logging.info(
                f"{training_type} training done. Creating results file...")
            model_details = utils.get_model_details(train_response,
                                                    training_type)

            if model_details is not None:
                autolabel_results, csv_output = utils.create_results_files(
                    model_details, doctype)
                autolabel_results_path = doctype + f'/autolabel_results_{training_type}.txt'
                storage_helpers.upload_blob(self.container_client,
                                            autolabel_results_path,
                                            autolabel_results)
                csv_output_path = doctype + f'/autolabel_{training_type}.csv'
                storage_helpers.upload_blob(self.container_client,
                                            csv_output_path, csv_output)
                logging.info("Done.")
                logging.info("Saving model details in models table...")
                entity = {
                    "PartitionKey":
                    self.app_settings.environment + '_' + training_type,
                    "RowKey": doctype,
                    "modelId": model_details['model_id'],
                    "status": model_details['status'],
                    "avgAccuracy": model_details['accuracy'],
                    "date": model_details['date'],
                    "fieldsAccuracy": str(model_details['fields_accuracy'])
                }
                storage_helpers.insert_or_replace_entity(
                    self.table_service, self.app_settings.models_table, entity)
                return model_details

        return None
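
A hedged call-site sketch; the trainer instance and the SAS URL variable are illustrative assumptions:

    # Hypothetical call site; `trainer` and `training_data_sas_url` are assumed names.
    model_details = trainer.train_and_save(
        doctype="invoices",
        training_data_path=training_data_sas_url,  # SAS URL to the training data container
        use_label_file=True)                       # True => supervised training
    if model_details is not None:
        logging.info(f"Trained model {model_details['model_id']} "
                     f"with average accuracy {model_details['accuracy']}.")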
Example #3
    def test_insert_or_replace_entity_when_table_invalid(self):
        
        # Expecting failure when table is invalid
        result = storage_helpers.insert_or_replace_entity(
                self.table_service, 
                "abcd",
                self.entity)

        assert result is False
Example #4
    def test_insert_or_replace_entity_when_valid(self):
        
        # Expecting success when all parameters are valid
        result = storage_helpers.insert_or_replace_entity(
                self.table_service, 
                self.table_status,
                self.entity)

        assert result is True
Example #5
    def test_insert_or_replace_entity_when_entity_invalid(self):
        
        entity_invalid = {'Name': self.blob_name, 'status': 'new'}
        # Expecting failure when entity is invalid
        result = storage_helpers.insert_or_replace_entity(
                self.table_service, 
                self.table_status,
                entity_invalid)

        assert result is False
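
These three tests presuppose a fixture that provisions table_service, table_status, blob_name, and entity. A minimal sketch of such a setup, assuming the legacy azure-cosmosdb-table SDK (the environment variable and table name are assumptions; only the PartitionKey/RowKey entity shape is dictated by the code under test):

    import os
    from azure.cosmosdb.table.tableservice import TableService

    def setup_method(self):
        # Assumed wiring for the fixtures the tests above reference.
        self.table_service = TableService(
            connection_string=os.environ["STORAGE_CONNECTION_STRING"])
        self.table_status = "status"
        self.blob_name = "invoices/sample-001.pdf"
        self.entity = {'PartitionKey': 'invoices',
                       'RowKey': self.blob_name.split('/')[-1],
                       'status': 'new'}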
Example #6
    def process_blobs(self, blobs, status, table_service, queue):
        logging.info("Adding files to processing queue...")
        messages = []
        for blob in blobs:
            # Add message to queue
            messages.append(blob)
            # Add file status in the status table
            doctype = blob.split('/')[0]
            file_name = blob.split('/')[-1]
            # If the status value is "keep", we keep the current status
            if status == 'keep':
                file_status = storage_helpers.query_entity_status(
                    table_service, self.app_settings.status_table, doctype,
                    file_name)
                if file_status is None:
                    file_status = 'new'
            else:
                file_status = status
            entity = {
                'PartitionKey': doctype,
                'RowKey': file_name,
                'status': file_status
            }
            if storage_helpers.insert_or_replace_entity(
                    table_service, self.app_settings.status_table, entity):
                logging.info(f"Updated {blob} status in status table.")
            else:
                logging.error(
                    f"Could not update {blob} status in status table.")
        if queue:
            try:
                queue.set(messages)
                logging.info(
                    f"Put {len(messages)} messages in processing queue.")
            except Exception as e:
                logging.error(
                    f"Error putting messages in processing queue: {e}")