def run(self, blob_name, skip_status_table=False, gt_df=None):
    """OCR a document blob, auto-label it, and track progress in the status table."""
    if self.container_client is not None and self.table_service is not None:
        file_content = storage_helpers.download_blob(self.container_client, blob_name)
        # Check document status to see if it was already processed
        doctype = blob_name.split('/')[0]
        file_name = blob_name.split('/')[-1]
        status = "new"
        if not skip_status_table:
            status = storage_helpers.query_entity_status(
                self.table_service, self.app_settings.status_table, doctype, file_name)
        # If status is "done", we do nothing; if status is "ocr-done", we only find labels
        if status != 'done':
            ocr_output_path = blob_name + '.ocr.json'
            analyze_result = None
            if status != 'ocr-done':
                # Creating OCR file for document
                logging.info(f"Creating OCR file for document {blob_name}...")
                analyze_result = fr_helpers.analyze_layout(
                    self.app_settings.fr_region, self.app_settings.fr_key, file_content, blob_name)
                analyze_result_string = json.dumps(analyze_result)
                storage_helpers.upload_blob(self.container_client, ocr_output_path, analyze_result_string)
                # Updating status
                if not skip_status_table:
                    entity = {'PartitionKey': doctype, 'RowKey': file_name, 'status': 'ocr-done'}
                    if storage_helpers.insert_or_replace_entity(
                            self.table_service, self.app_settings.status_table, entity):
                        logging.info(f"Updated {blob_name} status in status table.")
                    else:
                        logging.error(f"Could not update {blob_name} status in status table.")
            else:
                logging.info(f"OCR file for document {blob_name} already created, getting it from storage.")
                ocr_file = storage_helpers.download_blob(self.container_client, ocr_output_path, 'text')
                if ocr_file is not None:
                    analyze_result = json.loads(ocr_file)
            # Creating labels file for document
            if analyze_result is not None:
                key_field_names = self.fields
                labels_result, keys = autolabeling.analyze_labels(
                    gt_df if gt_df is not None else self.app_settings.gt_path,
                    blob_name, analyze_result, key_field_names, self.app_settings.lookup_path)
                logging.info(keys)
                if labels_result is not None and len(keys) > 1:
                    labels_output_path = blob_name + '.labels.json'
                    labels_result_string = json.dumps(labels_result)
                    storage_helpers.upload_blob(self.container_client, labels_output_path, labels_result_string)
                    # Updating status
                    if not skip_status_table:
                        entity = {'PartitionKey': doctype, 'RowKey': file_name, 'status': 'done'}
                        if storage_helpers.insert_or_replace_entity(
                                self.table_service, self.app_settings.status_table, entity):
                            logging.info(f"Updated {blob_name} status in status table.")
                        else:
                            logging.error(f"Could not update {blob_name} status in status table.")
            else:
                logging.error(f"Could not continue processing for blob {blob_name} as analyze result is missing.")
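# Hedged usage sketch (illustration only, not part of the source): one way run()
# might be driven from a queue-triggered Azure Function. The "Processor" class
# name and its no-argument constructor are assumptions; adapt to the real module.
import azure.functions as func

def main(msg: func.QueueMessage):
    blob_name = msg.get_body().decode('utf-8')  # e.g. "invoices/sample.pdf"
    processor = Processor()  # hypothetical constructor for the class defining run()
    processor.run(blob_name)  # OCR + auto-label; progress tracked in the status table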
def train_and_save(self, doctype, training_data_path, use_label_file):
    """Train a Form Recognizer model for a doctype and save its details to the models table."""
    if use_label_file:
        training_type = "supervised"
    else:
        training_type = "unsupervised"
    logging.info(f"{training_type} training started.")
    train_response = fr_helpers.train_model(
        self.app_settings.fr_region, self.app_settings.fr_key, training_data_path, doctype, use_label_file)
    if train_response is not None:
        logging.info(f"{training_type} training done. Creating results files...")
        model_details = utils.get_model_details(train_response, training_type)
        if model_details is not None:
            autolabel_results, csv_output = utils.create_results_files(model_details, doctype)
            autolabel_results_path = doctype + f'/autolabel_results_{training_type}.txt'
            storage_helpers.upload_blob(self.container_client, autolabel_results_path, autolabel_results)
            csv_output_path = doctype + f'/autolabel_{training_type}.csv'
            storage_helpers.upload_blob(self.container_client, csv_output_path, csv_output)
            logging.info("Done.")
            logging.info("Saving model details in models table...")
            entity = {
                "PartitionKey": self.app_settings.environment + '_' + training_type,
                "RowKey": doctype,
                "modelId": model_details['model_id'],
                "status": model_details['status'],
                "avgAccuracy": model_details['accuracy'],
                "date": model_details['date'],
                "fieldsAccuracy": str(model_details['fields_accuracy'])
            }
            storage_helpers.insert_or_replace_entity(
                self.table_service, self.app_settings.models_table, entity)
            return model_details
    return None
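# Hedged usage sketch (illustration only, not part of the source): training a
# supervised model for one doctype. "processor" is an instance of the class
# defining train_and_save(); the training_data_path value is a placeholder, since
# Form Recognizer training typically takes a SAS URL to the training container.
def example_train(processor):
    details = processor.train_and_save(
        doctype="invoices",  # hypothetical doctype
        training_data_path="<SAS URL to training data container>",  # placeholder
        use_label_file=True)  # True -> supervised training using .labels.json files
    if details is not None:
        logging.info(f"Trained model {details['model_id']}, avg accuracy {details['accuracy']}.")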
def test_insert_or_replace_entity_when_table_invalid(self):
    # Expecting failure when table is invalid
    result = storage_helpers.insert_or_replace_entity(
        self.table_service, "abcd", self.entity)
    assert not result
def test_insert_or_replace_entity_when_valid(self):
    # Expecting success when all parameters are valid
    result = storage_helpers.insert_or_replace_entity(
        self.table_service, self.table_status, self.entity)
    assert result
def test_insert_or_replace_entity_when_entity_invalid(self):
    # Expecting failure when the entity lacks the required PartitionKey/RowKey
    entity_invalid = {'Name': self.blob_name, 'status': 'new'}
    result = storage_helpers.insert_or_replace_entity(
        self.table_service, self.table_status, entity_invalid)
    assert not result
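# Hedged setup sketch (illustration only, not part of the source): how the
# fixtures used by the tests above might be created. The import path assumes
# storage_helpers wraps the azure-cosmosdb-table SDK; the table name, blob name,
# and connection-string variable are placeholders.
import os
from azure.cosmosdb.table.tableservice import TableService

def make_test_fixtures():
    table_service = TableService(connection_string=os.environ["STORAGE_CONNECTION_STRING"])
    table_status = "documentstatus"  # hypothetical status table name
    entity = {'PartitionKey': 'invoices', 'RowKey': 'sample.pdf', 'status': 'new'}
    return table_service, table_status, entity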
def process_blobs(self, blobs, status, table_service, queue):
    """Queue blobs for processing and set each file's status in the status table."""
    logging.info("Adding files to processing queue...")
    messages = []
    for blob in blobs:
        # Add message to queue
        messages.append(blob)
        # Add file status in the status table
        doctype = blob.split('/')[0]
        file_name = blob.split('/')[-1]
        # If the status value is "keep", we keep the current status
        if status == 'keep':
            file_status = storage_helpers.query_entity_status(
                table_service, self.app_settings.status_table, doctype, file_name)
            if file_status is None:
                file_status = 'new'
        else:
            file_status = status
        entity = {
            'PartitionKey': doctype,
            'RowKey': file_name,
            'status': file_status
        }
        if storage_helpers.insert_or_replace_entity(
                table_service, self.app_settings.status_table, entity):
            logging.info(f"Updated {blob} status in status table.")
        else:
            logging.error(f"Could not update {blob} status in status table.")
    if queue:
        try:
            queue.set(messages)
            logging.info(f"Put {len(messages)} messages in processing queue.")
        except Exception as e:
            logging.error(f"Error putting messages in processing queue: {e}")
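# Hedged usage sketch (illustration only, not part of the source): re-queueing
# all blobs of one doctype with a fresh "new" status. "processor" is an instance
# of the class defining process_blobs(); "queue_binding" stands for the Azure
# Functions output binding whose set() call is used above.
def example_enqueue(processor, table_service, queue_binding):
    blobs = ["invoices/a.pdf", "invoices/b.pdf"]  # hypothetical blob paths
    processor.process_blobs(blobs, "new", table_service, queue_binding)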