def fetch_sections_data(request, doc_id):
    """
    Return the sections data of a document.

    :param request: incoming request; the solution id is read from its session
    :param doc_id: id of the document whose sections are requested
    :return: dict with ``status`` and ``msg``; on success it also carries
             ``data`` (including ``need_review_count`` and
             ``attributes_extracted``), ``volume`` and ``review_state``
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        # Guard clause: nothing can be fetched for an empty document id.
        if doc_id == "":
            return {"status": "failure", "msg": "Failed to return document sections data"}

        solution_id = common.get_solution_from_session(request)
        # Third positional argument is entity_reqd=False.
        document_info = document_data(doc_id, solution_id, False)
        wrapper = document_info["data"]
        data = wrapper["data"]
        review_state = wrapper["review_state"]

        # NOTE(review): process_elements presumably updates `counts` in place,
        # since the values are read back right after the call - confirm.
        counts = {"extracted": 0, "reviewed": 0}
        data["elements"] = process_elements(data["elements"], counts)
        data['need_review_count'] = counts["reviewed"]
        data['attributes_extracted'] = counts["extracted"]

        return {"status": "success", "data": data,
                "volume": MOUNT_PATH, "review_state": review_state,
                "msg": "successfully returned document sections data"}
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {'status': 'failure',
                'msg': 'Internal error while submitting section data',
                'error': str(e)}
    finally:
        context.end_span()
def find_documents(request, collection, query, solution_id, projection_fields=None):
    """
    Query a collection and return the matching documents as summary dicts.

    :param request: object handed to ``get_pagination_details`` to resolve
                    sort field, sort order, skip and limit
    :param collection: name of the collection to query
    :param query: filter passed to ``MongoDbConn.find``
    :param solution_id: solution id, used when a confidence score has to be computed
    :param projection_fields: optional projection passed to ``MongoDbConn.find``
    :return: list of summary dicts; an empty list when the lookup fails
             (the error is logged, it is not raised)
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        cursor = MongoDbConn.find(collection, query, projection=projection_fields)
        sort_by, order_by_asc, skip, limit = get_pagination_details(
            request, sort_by='updated_ts', order_by_asc=-1, skip=0, limit=0)
        documents = []
        for document in cursor.sort(sort_by, order_by_asc).skip(skip).limit(limit):
            document.pop("_id", None)
            document = construct_json(document, DOCUMENT_SUMMARY_FIELDS)
            # Every type except "image" is treated as digital.
            document["is_digital"] = get_doc_type(document['extn']) != "image"
            if "confidence_score" not in document:
                document["confidence_score"] = get_confidence_score(
                    document, solution_id, document["is_digital"])
            document["is_failed"] = document["doc_state"] == "failed"
            document["review_text"] = get_review_text(document["doc_state"], document)
            documents.append(document)
        return documents
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        # Previously this path fell through and returned None; return an
        # empty list so callers always receive the documented type.
        return []
    finally:
        context.end_span()
def format_entity_data(entity, elements, review_data, enrich_data, rules_reqd=True):
    """
    Parse a JSON-encoded entity and format it via ``format_enriched_data``.

    :param entity: entity as a JSON string
    :param elements: passed through to ``format_enriched_data``
    :param review_data: passed through to ``format_enriched_data``
    :param enrich_data: passed through to ``format_enriched_data``
    :param rules_reqd: passed through to ``format_enriched_data``
    :return: result of ``format_enriched_data``, or ``{}`` when parsing or
             formatting raises (the error is logged)
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        parsed_entity = json.loads(entity)
        formatted = format_enriched_data(parsed_entity, elements, review_data,
                                         enrich_data, rules_reqd)
        return formatted
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {}
    finally:
        context.end_span()
def document_data(doc_id, solution_id, entity_reqd=True, rules_reqd=True):
    """
    Load one document and enrich it with type, template, score and entity data.

    :param doc_id: id of the document to load
    :param solution_id: solution id, used to resolve the template type
    :param entity_reqd: when true and the document has an ``entity``, the entity
                        is formatted and review counters are added; otherwise
                        the ``entity`` key is dropped
    :param rules_reqd: forwarded to ``format_entity_data`` and
                       ``get_review_state``; when true ``review_required`` is 0
    :return: ``{"status": "success", "msg": ..., "data": {"data": document,
             "volume": ..., "review_state": ...}}`` or a failure dict with
             ``"data": {}``
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        filter_query = {"doc_id": doc_id}
        projection = {"metadata": 1, "doc_id": 1, "confidence_score": 1,
                      "elements": 1, "doc_state": 1, "root_id": 1,
                      "entity": 1, "_id": 0}
        document = MongoDbConn.find_one(DOCUMENTS_COLLECTION, filter_query,
                                        projection=projection)
        if not document:
            # Report a missing document explicitly instead of letting the
            # subscript below fail with an opaque TypeError message.
            return {"status": "failure",
                    "msg": "Document not found for doc_id: " + str(doc_id),
                    "data": {}}

        metadata = document["metadata"]
        doc_type = get_doc_type(metadata["properties"]['extension'])
        document['document_confidence_score'] = get_doc_confidence_score(document, doc_type)
        template_id = metadata["template_info"]["id"]
        template_type = get_template_type(template_id, solution_id)
        document["doc_type"] = doc_type
        if doc_type == "email":
            document["attachments"] = add_email_info(document)
        document["template_type"] = template_type
        if "searchable_pdf" in metadata:
            document["searchable_pdf"] = metadata["searchable_pdf"]

        if "entity" in document and entity_reqd:
            entity_data_orgnl = json.loads(document["entity"])
            enrich_data = list(get_enrichments(entity_data_orgnl, "enrichments"))
            elements = get_all_elements(document["elements"], [])
            review_data = dict(attributes_extracted=0, review_required=0, confidence=0)
            review_data["entity_feedback"] = document.get("entity_feedback", [])
            document["entity"] = format_entity_data(document["entity"], elements,
                                                    review_data, enrich_data, rules_reqd)
            # NOTE(review): review_data is presumably filled in by the
            # formatting call above, since it is read right after - confirm.
            document['document_confidence_score'] = review_data["confidence"]
            document["attributes_extracted"] = review_data["attributes_extracted"]
            document["review_required"] = 0 if rules_reqd else review_data["review_required"]
            document = remove_items(document, ["entity_feedback"])
            document["elements"] = elements
        else:
            document.pop("entity", None)

        review_state = get_review_state(entity_reqd, rules_reqd, doc_type, template_type)
        data = {"data": document, "volume": MOUNT_PATH, "review_state": review_state}
        return {"status": "success", "msg": "document data", "data": data}
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": str(e), "data": {}}
    finally:
        context.end_span()
def update_queue_extracted_feedback(document, doc_id, state):
    """
    Move a document's queue entry for ``state`` to "In Progress" if needed.

    :param document: document dict; when falsy it is looked up by ``doc_id``
    :param doc_id: id used for the lookup when ``document`` is not supplied
    :param state: state passed to ``check_current_status`` / ``update_queue_status``
    :return: success dict, or a dict with ``"status": "failed"`` when an
             exception is raised (the error is logged)
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        # Fall back to a lookup only when the caller did not pass the document.
        doc = document or MongoDbConn.find_one(DOCUMENTS_COLLECTION, {"doc_id": doc_id})
        if "life_cycle" in doc:
            current_status = check_current_status(doc, state)
            if current_status and current_status != "In Progress":
                update_queue_status(doc, state, "In Progress", update_reqd=True)
        return {"status": "success", "msg": "Feedback submitted"}
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {"status": "failed", "msg": "Error updating queue status", "error": str(e)}
    finally:
        context.end_span()
def construct_table_data(headings, columns, domain_mapping):
    """
    Turn column-oriented table data into a list of rows ordered by line number.

    :param headings: list of heading dicts; ``final_column`` and/or ``column``
                     values are collected into the returned headings list
    :param columns: list of column dicts with ``name`` and ``value`` (a list of
                    cell dicts carrying a ``line`` key); the column named
                    ``subheaders`` is skipped
    :param domain_mapping: optional dict whose ``data`` entry is indexed by
                           heading position to find a ``map_to`` value
    :return: tuple ``(rows, headings_list)`` where each row maps a column name
             to its cell dict
    """
    headings_list = []
    column_name = "column"
    for heading in headings:
        if "final_column" in heading:
            headings_list.append(heading["final_column"])
            # Once any heading has a final column, match columns against it.
            column_name = "final_column"
        if "column" in heading:
            headings_list.append(heading["column"])

    # Default to an empty mapping so the lookup below never depends on an
    # unbound name when no usable domain mapping is supplied.
    # NOTE(review): this assumes rows are built even without a domain
    # mapping - confirm against the original indentation.
    domain_list = []
    if isinstance(domain_mapping, dict) and "data" in domain_mapping:
        domain_list = domain_mapping["data"]

    line_dict = {}
    for column in columns:
        col_name = column["name"]
        if col_name == "subheaders":
            continue

        domain_mapped = ""
        context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
        context.start_span(component=__name__)
        try:
            # Exactly one heading must match; anything else raises and the
            # column simply keeps an empty domain mapping.
            [col_heading_idx] = [idx for idx, value in enumerate(headings)
                                 if "".join(value[column_name]) == col_name]
            if "map_to" in domain_list[col_heading_idx]:
                domain_mapped = domain_list[col_heading_idx]["map_to"]
        # TODO raise specific exception
        except Exception as e:
            context.log(message=str(e), obj={"tb": traceback.format_exc()})
        finally:
            context.end_span()

        for val in column["value"]:
            value = construct_json(val, ["text", "score", "value_coordinates"])
            value["domain_mapping"] = domain_mapped
            if "score" not in value:
                value["score"] = 100
            line_dict.setdefault(val["line"], {})[col_name] = value

    # Line numbers may be strings; order rows numerically.
    column_list = [line_dict[line_num] for line_num in sorted(line_dict, key=int)]
    return column_list, headings_list
def download_document_json(request, doc_id):
    """
    Build a downloadable file from a document's data.

    :param request: incoming request; the solution id is read from its session
    :param doc_id: id of the document to download
    :return: result of ``download_file`` on success, the failure dict from
             ``document_data`` when loading fails, or an internal-error dict
             when an exception is raised
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        solution_id = common.get_solution_from_session(request)
        result = document_data(doc_id, solution_id)
        # Hand the failure response back untouched.
        if result["status"] != "success":
            return result
        # "elements" and "updated_ts" are left out of the download.
        exportable = remove_items(result["data"]["data"], ["elements", "updated_ts"])
        return download_file(exportable, doc_id)
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": "Internal Error occurred", "Error": str(e)}
    finally:
        context.end_span()
def process_complete_review(request, doc_id):
    """
    Submit pending feedback (if any), trigger the manual-review pipeline and
    close the matching queue entry.

    :param request: incoming request; its full path selects the review kind
                    and its body carries the feedback payload for text/entity paths
    :param doc_id: document id; replaced by the payload's ``doc_id`` for
                   text/entity paths
    :return: dict with ``status`` and ``msg`` (plus ``error`` on exceptions)
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="ERROR")
    context.start_span(component=__name__)
    try:
        solution_id = common.get_solution_from_session(request)
        path = request.get_full_path()
        is_text_path = "text/" in path

        if is_text_path or "entity/" in path:
            payload = json.loads(request.body.decode())
            doc_id = payload["doc_id"]
            if payload["feedback"]:
                submit_feedback = process_text_feedback if is_text_path else process_entity_feedback
                feedback_status = submit_feedback(request)
                if feedback_status["status"] != "success":
                    return {"status": "failure", "msg": "Failed to submit feedback"}

        query = {"doc_id": doc_id, "solution_id": solution_id}
        document = MongoDbConn.find_one(DOCUMENTS_COLLECTION, query)
        data = {"doc_id": doc_id, "pipeline_name": "manual_review",
                "root_id": document["root_id"]}
        if 'completeReview/review/' in path:
            data["object_type"] = ["document", "domain", "recommendation"]
            data["complete_review"] = True

        post_status = post(API_GATEWAY_POST_JOB_URI + PIPELINE["MANUAL_TRIGGER"],
                           {"solution_id": solution_id, "data": data})
        if post_status["status"] != "success":
            return {"status": "failure", "msg": "Error while posting review"}

        # First matching path marker wins; the order mirrors the precedence
        # of the review kinds.
        path_states = (("text/", "extracted"),
                       ("grouping/", "classified"),
                       ("entity/", "processed"),
                       ('review/', 'reviewed'))
        state = next((mapped for marker, mapped in path_states if marker in path), "")
        update_queue_status(document, state, "Closed", update_reqd=True)
        return {"status": "success", "msg": "Review completion Posted successfully"}
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {"status": "failure",
                "msg": "Internal Error occured while posting review",
                "error": str(e)}
    finally:
        context.end_span()
def get_document_details(request, doc_id, page_no):
    """
    Return the elements of one page of a document with mapping and rule info.

    :param request: incoming request; the solution id is read from its session
    :param doc_id: id of the document
    :param page_no: page number; converted with ``int()`` for the query
    :return: ``{"status": "success", "data": {"elements": [...], "entity": {}}}``
             or a failure dict with ``msg`` and ``error``
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        solution_id = common.get_solution_from_session(request)
        query = {"doc_id": doc_id, "page_no": int(page_no)}
        projection = {"solution_id": 0, "updated_ts": 0, "created_ts": 0,
                      "_id": 0, "doc_id": 0}
        elements = MongoDbConn.find(DOC_ELEMENTS_COLLECTION, query, projection=projection)
        document = MongoDbConn.find_one(DOCUMENTS_COLLECTION, {"doc_id": doc_id},
                                        projection={"doc_id": 1, "entity": 1})
        processed_rules = {}
        if "entity" in document:
            processed_rules = get_all_rules_processed(document["entity"])
        mapping_data = get_doc_mapping_from_template(doc_id, solution_id)

        element_list = []
        for element in elements:
            domain_mapping = get_domain_mapping(mapping_data,
                                                element_id=element["element_id"],
                                                section_id=element["section_id"])
            if element["type"] == "table":
                table = dict()
                # Both keys are needed. The former test
                # `"headings" and "columns" in element` only checked "columns",
                # so a table without headings raised KeyError.
                if "headings" in element and "columns" in element:
                    table["table"], table["headings"] = construct_table_data(
                        element["headings"], element["columns"], domain_mapping)
                element["tables"] = table
                element = remove_items(element, ["headings", "columns"])
            else:
                element["domain_mapping"] = ""
                if domain_mapping and isinstance(domain_mapping, dict) \
                        and "domain_mapping" in domain_mapping:
                    element["domain_mapping"] = domain_mapping["domain_mapping"]
                if processed_rules:
                    element["rules"] = get_rules_info(element["domain_mapping"],
                                                      processed_rules, solution_id,
                                                      element["text"])
            if "score" not in element:
                element["score"] = 0
            element_list.append(element)

        data = {"elements": element_list, "entity": {}}
        return {"status": "success", "data": data}
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": "Error occured while processing", "error": str(e)}
    finally:
        context.end_span()
def process_entity_feedback(request):
    """
    Post entity feedback from the request body and update the queue status.

    :param request: incoming request whose JSON body carries ``feedback`` and
                    ``doc_id``; the solution id is read from its session
    :return: "No changes to be saved" success dict when the feedback is empty,
             the queue-update result when posting succeeded on a path
             containing "feedback", otherwise the post result; a failure dict
             when an exception is raised
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        solution_id = common.get_solution_from_session(request)
        request_data = json.loads(request.body.decode())
        # Nothing to submit when the feedback is empty.
        if not request_data["feedback"]:
            return {"status": "success", "msg": "No changes to be saved"}

        request_data["request_type"] = "extract_entities"
        feedback_status = post_feedback(request_data, solution_id)
        posted_ok = feedback_status['status'] == 'success'
        if posted_ok and "feedback" in request.get_full_path():
            return update_queue_extracted_feedback(None, request_data["doc_id"], "processed")
        return feedback_status
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {"status": "failure",
                "msg": "Internal error occured in processing Feedback",
                "error": str(e)}
    finally:
        context.end_span()
def documents_data(solution_id, filter_obj=None):
    """
    Return the root, non-test documents of a solution with their total count.

    :param solution_id: solution whose documents are listed
    :param filter_obj: optional filter object; applied to the query via
                       ``apply_filters`` and also handed to ``find_documents``
    :return: success dict whose ``data`` holds ``config``, ``data`` and
             ``total_count``, or a failure dict with ``"data": {}``
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        # Documents without an "is_test" flag count as non-test documents.
        not_test = [{"is_test": False}, {"is_test": {"$exists": False}}]
        filter_query = {"solution_id": solution_id, "is_root": True, "$or": not_test}
        projection_fields = {field: 1 for field in DOCUMENT_SUMMARY_FIELDS}
        apply_filters(filter_obj, filter_query)
        documents = find_documents(filter_obj, DOCUMENTS_COLLECTION, filter_query,
                                   solution_id, projection_fields=projection_fields)
        documents_total_count = MongoDbConn.count(DOCUMENTS_COLLECTION, filter_query)
        resp = {'config': summary_config,
                'data': documents,
                'total_count': documents_total_count}
        return {"status": "success", "msg": "documents data", "data": resp}
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": str(e), "data": {}}
    finally:
        context.end_span()
def process_text_feedback(request):
    """
    Post text (element) feedback from the request body and update the queue status.

    :param request: incoming request whose JSON body carries ``feedback`` and
                    ``doc_id``; the solution id is read from its session
    :return: "No changes to be saved" success dict when the feedback is empty,
             the queue-update result when posting succeeded on a path
             containing "feedback", otherwise the post result; a failure dict
             when an exception is raised
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        solution_id = common.get_solution_from_session(request)
        payload = json.loads(request.body.decode())
        # Nothing to submit when the feedback is empty.
        if not payload["feedback"]:
            return {"status": "success", "msg": "No changes to be saved"}

        payload["feedback"] = reprocess_feedback(payload["feedback"])
        payload["request_type"] = "extract_elements"
        feedback_status = post_feedback(payload, solution_id)
        posted_ok = feedback_status['status'] == "success"
        if posted_ok and "feedback" in request.get_full_path():
            return update_queue_extracted_feedback(None, payload["doc_id"], "extracted")
        return feedback_status
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": "Internal error occurred while submitting feedback"}
    finally:
        context.end_span()
def save_threshold_data(solution_id, payload):
    """
    Post updated threshold data to the thresholds-update endpoint.

    :param solution_id: solution the thresholds belong to
    :param payload: request payload; its ``data`` entry is forwarded
    :return: dict with ``status`` and ``msg``
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        job_body = {"solution_id": solution_id, "data": payload['data']}
        endpoint = API_GATEWAY_POST_JOB_URI + DOCUMENT_ENDPOINT["thresholds_update"]
        post_status = post(endpoint, job_body)
        if post_status['status'] != 'success':
            return {"status": "failure", "msg": "Error while updating threshold data"}
        return {"status": "success", "msg": "Threshold data updated successfully"}
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": str(e)}
    finally:
        context.end_span()
def page_group_review(request, doc_id):
    """
    Serve (GET) or submit (POST) the page-group review of a document.

    GET returns the document's pages and page groups. POST derives group
    feedback from the request body, posts it when there is any, and then
    completes the review via ``process_complete_review``.

    :param request: incoming request; ``method`` selects the behaviour and the
                    solution id is read from its session
    :param doc_id: id of the document under review
    :return: dict with ``status`` and, depending on the outcome, ``data``,
             ``msg`` and ``error``
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        solution_id = common.get_solution_from_session(request)
        if request.method == "GET":
            query = {"doc_id": doc_id}
            projection = {"doc_id": 1, "solution_id": 1, "pages": 1, "page_groups": 1,
                          "metadata.properties": 1, "_id": 0}
            document = MongoDbConn.find_one(DOCUMENTS_COLLECTION, query, projection=projection)
            if document is None:
                return {"status": "failure", "msg": "Failed to return document data"}
            document["volume"] = MOUNT_PATH
            return {"status": "success", "data": document}

        if request.method == "POST":
            payload = json.loads(request.body.decode())
            query = {"doc_id": doc_id, "solution_id": solution_id}
            document = MongoDbConn.find_one(DOCUMENTS_COLLECTION, query)
            if not document:
                # Fail explicitly rather than through a TypeError on the
                # subscript below.
                return {'status': 'failure', 'msg': 'Document not found'}
            feedback_list = get_groups_feedback(payload, document["page_groups"])
            # With no feedback to post, go straight to completing the review.
            feedback_status = True
            if feedback_list:
                feedback_status = post_groups_feedback(feedback_list, doc_id, solution_id,
                                                       document["root_id"])
            if feedback_status:
                return process_complete_review(request, doc_id)
            return {'status': 'failure', 'msg': 'Error posting feedback'}

        # Other HTTP methods previously fell through and returned None.
        return {'status': 'failure', 'msg': 'Unsupported request method'}
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {'status': 'failure',
                'msg': 'Internal error while submitting review',
                'error': str(e)}
    finally:
        context.end_span()