def file_type(paths, file):
    """Save an uploaded file into the input folder and return its MIME type.

    The extension of ``file.filename`` (compared case-insensitively) selects
    one of the supported document types. A type is accepted only when its
    option in the ``document_type`` config section is enabled.

    Args:
        paths: dict holding ``"inputpath"``; on success ``"filepath"`` is set
            to the location the upload was saved to.
        file: uploaded file object exposing ``filename`` and ``save(path)``.

    Returns:
        The MIME type string on success, otherwise the dict
        ``{"code": -1, "message": "file type not allowed"}``.
    """

    def _enabled(option):
        # bool() of any non-empty string is True, so a configured "false" or
        # "0" used to switch the type *on*. Treat the usual "off" spellings
        # (and an empty value) as disabled instead.
        raw = config.get('document_type', option)
        if isinstance(raw, str):
            return raw.strip().lower() not in ("", "0", "false", "no", "off")
        return bool(raw)

    # All four options are read up front, in the original order, so a missing
    # option still fails regardless of which file type was uploaded.
    pdf_enabled = _enabled('pdf')
    docx_enabled = _enabled('docx')
    txt_enabled = _enabled('txt')
    csv_enabled = _enabled('csv')

    # (extension, enabled, MIME type)
    supported = (
        (".docx", docx_enabled,
         "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
        (".pdf", pdf_enabled, "application/pdf"),
        (".txt", txt_enabled, "text/plain"),
        (".csv", csv_enabled, "text/csv"),
    )

    # Lower-casing makes ".PDF"/".DOCX" behave like ".TXT"/".CSV" already did;
    # a missing filename simply matches nothing.
    filename = (file.filename or "").lower()
    for extension, enabled, doc_type in supported:
        if enabled and filename.endswith(extension):
            paths["filepath"] = paths["inputpath"] + "/form" + extension
            file.save(paths["filepath"])
            return doc_type

    return {"code": -1, "message": "file type not allowed"}
def _compile_form_pattern(fragment):
    """Compile one comma-separated table fragment into a whitespace-tolerant regex."""
    cleaned = fragment.replace("\"", "").replace("'", "").strip()
    # A callable replacement inserts the two characters ``\s*`` literally.
    # Passing "\s*" as a replacement *template* raises
    # ``re.error: bad escape \s`` on Python 3.7 and later.
    flexible = re.sub(r"\s+", lambda _match: r"\s*", cleaned)
    return re.compile(r"\s*" + flexible + r"\s*")


def discover_content_type(text):
    """Identify the form type of *text* using the form-identification table.

    The table is a ``|``-separated file in ``extdata/config`` next to this
    module; its file name comes from the ``api.form_identification_path``
    config option. By position, column 0 holds the form type, column 1 a
    comma-separated list of patterns and column 2 the module name. The first
    row whose patterns are all found in *text* wins.

    Args:
        text: text to classify; may be ``None`` or empty.

    Returns:
        ``[form_type, module_name]``. ``["UNKNOWN", ""]`` when *text* is
        empty/``None`` or no row matches.
    """
    form_identification_path = config.get('api', 'form_identification_path')
    config_file = "/".join(
        path.dirname(path.abspath(__file__)).split("/")
        + ["extdata", "config"]) + "/" + form_identification_path
    configdata = pd.read_csv(config_file, sep="|")

    form_type = "UNKNOWN"
    module_name = ""

    # Nothing can match empty input; checked after loading the table so a
    # missing or broken table still surfaces as before.
    if text is None or text == "":
        return [form_type, module_name]

    for row in range(len(configdata)):
        # .iloc is explicitly positional; integer ``Series[1]`` on a
        # label-indexed row is deprecated positional fallback in pandas.
        fragments = configdata.iloc[row, 1].split(",")
        # Every fragment is compiled and searched (no short-circuit) so an
        # invalid pattern in the table is still reported.
        found = [
            bool(_compile_form_pattern(fragment).search(text))
            for fragment in fragments
        ]
        if all(found):
            form_type = configdata.iloc[row, 0]
            module_name = configdata.iloc[row, 2]
            break

    if form_type == "UNKNOWN":
        module_name = ""
    return [form_type, module_name]
def run_ml_api():
    """Handle one upload request: store the file, validate it, process it.

    The first file found in ``request.files`` is saved into a fresh folder
    (``api.upload_folder`` + a random UUID), its type and size are checked,
    and on success it is handed to ``process_forms`` together with the
    parameters extracted from the request headers.

    Returns:
        A ``jsonify`` response. Validation failures carry ``code`` -1; any
        unexpected exception is logged and reported as ``code`` 1.
    """
    try:
        upload_folder = config.get('api', 'upload_folder')
        upload_dir = upload_folder + str(uuid.uuid4())

        if not request.files:
            return jsonify({"code": -1, "message": "no file received"})

        upload = request.files[next(iter(request.files))]
        paths = init_dir(upload_dir)

        # file_type returns the MIME type on success, or an error dict.
        doc_type = file_type(paths, upload)
        logger.info(doc_type)
        if isinstance(doc_type, dict):
            # NOTE(review): the folder created by init_dir is not flushed on
            # this path or on the size-exceeded path below, unlike the
            # returns inside process_forms. Confirm whether it should be.
            return jsonify(doc_type)

        if not verify_file_size(paths["filepath"]):
            result = {"code": -1, "message": "File size exceeded"}
        else:
            params = {
                "spam_flag": "",
                "structuredParsing_flag": "",
                "unstructuredParsing_flag": "",
                "doc_type": doc_type,
                "pass_header": "",
            }
            params = processing_headers(request.headers, params)
            result = process_forms(paths, params)

        logger.info(result)
        return jsonify(result)
    except Exception:
        # Was a bare ``except:`` with no logging. Keep the same client-facing
        # answer but record the cause, and let SystemExit/KeyboardInterrupt
        # propagate.
        logger.info(("\nstartTrace::::" + traceback.format_exc().strip() +
                     "::::endTrace").replace("\n", "\n$"))
        return jsonify({"code": 1, "message": "file not suported"})
def verify_file_size(filepath):
    """Tell whether the file at *filepath* is within the configured size limit.

    The limit comes from the ``api.max_file_size`` config option, given in
    megabytes and converted to bytes here. The measured size is logged.

    Returns:
        ``True`` when the file is no larger than the limit, else ``False``.
    """
    bytes_per_mb = 1024 * 1024
    limit_in_bytes = int(config.get('api', 'max_file_size')) * bytes_per_mb
    size_in_bytes = path.getsize(filepath)
    logger.info("Size of the file is %s" % size_in_bytes)
    return size_in_bytes <= limit_in_bytes
def _malfunction_keywords(raw):
    """Turn the configured keyword value into a list of lower-cased keywords."""
    if isinstance(raw, str):
        # Iterating a string yields single characters, which made almost any
        # entity text "match".
        # NOTE(review): assumes the option is a comma-separated list
        # (optionally quoted / bracketed) - confirm against the config file.
        raw = raw.split(",")
    cleaned = (str(item).strip().strip("\"'[]").strip().lower() for item in raw)
    # Drop empty entries: "" is a substring of every string.
    return [keyword for keyword in cleaned if keyword]


def checkMalfunctions(text, py_path):
    """Report whether *text* mentions one of the configured malfunction keywords.

    Whitespace runs in *text* are collapsed, the spaCy model under
    ``<py_path>/models/en_med_astragenica`` is run on it, and when at least
    one entity is labelled ``MALFUNCTIONS`` the entity texts are searched
    (case-insensitively) for the keywords from the
    ``unstructured.MALFUNCTION_KEYWORDS`` config option.

    Args:
        text: text to analyse.
        py_path: base directory containing the ``models`` folder.

    Returns:
        ``True`` if a keyword occurs in any entity text, else ``False``.
    """
    text = re.sub(r"\s+", " ", text)
    keywords = _malfunction_keywords(
        config.get('unstructured', 'MALFUNCTION_KEYWORDS'))

    model_path = py_path + "/models/en_med_astragenica"
    nlp = spacy.load(model_path)
    doc = nlp(text)

    labels = [entity.label_ for entity in doc.ents]
    if "MALFUNCTIONS" not in labels:
        return False

    # NOTE(review): as before, the texts of *all* entities are searched, not
    # only those labelled MALFUNCTIONS - confirm this is intended.
    entity_texts = [entity.text.lower() for entity in doc.ents]
    return any(
        keyword in entity_text
        for keyword in keywords
        for entity_text in entity_texts
    )
def unstructured_form_parsing(paths, text, unstructuredParsing_flag, spam_flag):
    """Send the text of a non-form document to the unstructured service and classify it.

    Steps, as written: when *text* is blank, re-read the OCR output for the
    PDF; run ``process_text`` and ``check_pqc``; POST ``<ocrpath>/test.txt``
    to the unstructured endpoint; pass the JSON answer through
    ``process_json`` and ``spam_identification``.

    Args:
        paths: dict with at least ``"filepath"``, ``"ocrpath"`` and
            ``"imagepath"``.
        text: extracted document text (may be blank).
        unstructuredParsing_flag: accepted but not used in this body.
        spam_flag: forwarded to ``spam_identification``.

    Returns:
        The dict from ``spam_identification`` unchanged when its message is
        ``"spam"`` or its code is 2. Otherwise the same dict with ``code`` 6,
        ``message`` ``"Non Form AE Case"`` and, when ``case_category_accu`` is
        at least 0.98, ``categories``. Any exception is logged and turned
        into ``{"code": 3, "message": ...}``.
    """
    logger.info("unstructured parsing in progress")
    x = {"code": None, "message": None, "spam_acc": None}
    try:
        if text.strip() == "":
            # NOTE(review): pdfObj and the file handles opened below are never
            # closed explicitly.
            pdfObj = pdfplumber.open(paths["filepath"])
            pagesObj = pdfObj.pages
            for page in pagesObj:
                if page.page_number == 1:
                    # First page: take the OCR text already on disk, then
                    # delete that file.
                    fileName = paths["ocrpath"] + "/ocred_text.txt"
                    file = open(fileName, "rb")
                    text = file.read().decode("ASCII")
                    os.remove(fileName)
                    logger.info(text)
                else:
                    # do ocr and append text
                    # NOTE(review): `page` is the page object, not a string, so
                    # this concatenation looks like it raises TypeError for
                    # every page after the first - confirm.
                    logger.info(
                        "unstructured: doing ocr for extended pages, page_num: "
                        + page)
                    imageObj = page.to_image(resolution=300)
                    imageObj.save(paths["imagepath"] + "/form.tiff",
                                  format="tiff")
                    fileName = paths["ocrpath"] + "/ocred.txt"
                    file = open(fileName, "w")
                    file.write(
                        str(
                            pytesseract.image_to_string(
                                Image.open(paths["imagepath"] + "/form.tiff"))))
                    file.close()
                    # NOTE(review): this reopens the file removed in the
                    # first-page branch and nothing is read from it or
                    # appended to `text` - the branch looks unfinished.
                    fileName = paths["ocrpath"] + "/ocred_text.txt"
                    file = open(fileName, "rb")
        # NOTE(review): the source formatting was lost; this statement is
        # assumed to sit at `try` level (it feeds every later step) - confirm.
        text = process_text(paths, text)
        case_category, case_category_accu, category_flag = check_pqc(
            paths, text)
        '''for normal unstructured'''
        # for unstructured api
        cioms_flag = None
        # url_unst1 is only printed; the request below uses url_unst.
        url_unst1 = config.get('unstructured', 'unstructure_api_url')
        print("----------------------------------------" + url_unst1)
        # NOTE(review): absolute, machine-specific path and a hard-coded port.
        config_path_uns = "/home/ubuntu/pvi-form-engine/structuredForms/py_generic/extdata/config/config.json"
        config_json_uns = json.load(open(config_path_uns))
        url_unst = config_json_uns[
            "base-generic-url"] + ":" + "9888/unstruct/live"
        print("----------------------------------------" + url_unst)
        # presumably test.txt is written by process_text/check_pqc - verify.
        files = {'file2': open(paths["ocrpath"] + "/test.txt", 'rb')}
        x = requests.post(url_unst,
                          files=files,
                          headers={
                              "PQC_FLAG": str(category_flag),
                              "cioms_flag": str(cioms_flag)
                          })
        x = x.json()
        logger.info("x from JSON")
        logger.info(x)
        y = process_json(x)
        x = spam_identification(x, y, spam_flag, text, paths)
        # Spam verdicts are returned untouched.
        if x["message"] == "spam" or x["code"] == 2:
            return x
        x["code"] = 6
        x["message"] = "Non Form AE Case"
        # Categories are reported only at or above the 0.98 threshold.
        if case_category_accu >= 0.98:
            x["categories"] = case_category
        return x
    except Exception:
        # Log the traceback with "$"-prefixed continuation lines and report a
        # generic failure.
        logger.info(("\nstartTrace::::" + traceback.format_exc().strip() +
                     "::::endTrace").replace("\n", "\n$"))
        x = {
            "code": 3,
            "message": "error came parsing unstructured AE or checking for spam"
        }
        return x
def _trim_pdf_to_page_limit(paths, passwd):
    """Decrypt the PDF at ``paths["filepath"]`` and cut it to the configured page limit."""
    pdf_info = pdfinfo_from_path(paths["filepath"], userpw=passwd)
    decrypt_pdf(paths, pdf_info, passwd)

    max_pages = int(config.get('api', 'max_file_pages'))
    if pdf_info["Pages"] <= max_pages:
        return

    # Keep exactly the configured number of pages. The old hard-coded 16
    # disagreed with the configured limit and failed with an (ignored)
    # IndexError whenever the document had fewer than 16 pages.
    reader = pfr(paths["filepath"], 'rb')  # opened once, not once per page
    writer = pfw()
    for page_index in range(max_pages):
        writer.addPage(reader.getPage(page_index))

    trimmed_path = paths["inputpath"] + "/trimmed_form.pdf"
    with open(trimmed_path, 'wb') as trimmed:
        writer.write(trimmed)
    os.rename(trimmed_path, paths["filepath"])


def process_forms(paths, params):
    """Run the parsing pipeline for one uploaded document.

    For PDFs the password is validated first and the file is decrypted and
    trimmed to the configured page limit (best effort). The content type is
    then detected and the document is routed to unstructured parsing,
    structured parsing, or - when structured parsing is switched off - a
    plain "form detected" answer.

    Args:
        paths: dict of working paths (``"filepath"``, ``"inputpath"``, ...).
        params: dict with ``"spam_flag"``, ``"structuredParsing_flag"``,
            ``"unstructuredParsing_flag"``, ``"doc_type"`` and
            ``"pass_header"``.

    Returns:
        The response dict of whichever stage answered.

    The working directory is flushed on every exit, including exceptions.
    """
    # The old calls passed the dict as a logging argument without a "%s", so
    # nothing was written. The password header is left out of the log.
    logger.info("list of all paths is %s" % (paths,))
    loggable_params = {
        key: value for key, value in params.items() if key != "pass_header"
    }
    logger.info("list of all params is %s" % (loggable_params,))

    try:
        # setting flag value
        flags = flag_setter(params["spam_flag"],
                            params["structuredParsing_flag"],
                            params["unstructuredParsing_flag"])
        spam_flag = flags[0]
        structuredParsing_flag = flags[1]
        unstructuredParsing_flag = flags[2]

        # checking password - never write the secret itself to the log
        logger.info("Pass header supplied: %s" % bool(params["pass_header"]))
        passwd = get_passwd(params["pass_header"])

        if params["doc_type"] == "application/pdf":
            response = validate_password(paths["filepath"], passwd)
            logger.info(response)
            if response['code'] is not None and response['message'] is not None:
                return response
            try:
                _trim_pdf_to_page_limit(paths, passwd)
            except Exception:
                # Trimming stays best-effort, but the failure is now recorded
                # instead of being silently discarded.
                logger.info(("\nstartTrace::::" +
                             traceback.format_exc().strip() +
                             "::::endTrace").replace("\n", "\n$"))

        result = find_content_type(paths, params["doc_type"])
        logger.info(result)
        # Anything other than a list is an error response to pass through.
        if not isinstance(result, list):
            return result
        content_type = result[0]
        module_name = result[1]
        text = result[2]

        # parsing
        if content_type in ["UNKNOWN", "", " "]:
            return unstructured_form_parsing(paths, text[0],
                                             unstructuredParsing_flag,
                                             spam_flag)

        # structured form parsing
        if not structuredParsing_flag:
            return {
                "model_type": content_type,
                "code": 5,
                "message": "Form is medwatch or CIOMS.",
            }
        return structured_form_parsing(paths, content_type, module_name)
    finally:
        # Previously repeated before each return and skipped on exceptions.
        flush_dir(paths)
def run_pvi_api():
    """Start ``app`` on the host and port configured in the ``api_url`` section."""
    host, port = (config.get('api_url', option) for option in ('url', 'port'))
    app.run(host, int(port))