Пример #1
0
def _document_type_enabled(option):
	"""Return whether the given option of the 'document_type' config section is switched on.

	`bool()` on a config string is True for any non-empty text, so a value such
	as "False" or "0" would wrongly enable the type. String values are therefore
	compared against the usual "off" spellings; any other non-empty value keeps
	counting as enabled, as it did before.
	"""
	raw = config.get('document_type', option)
	if isinstance(raw, str):
		return raw.strip().lower() not in ("", "0", "false", "no", "off")
	return bool(raw)


def file_type(paths, file):
	"""Store the uploaded file under a canonical name and report its MIME type.

	The extension of `file.filename` is matched case-insensitively against the
	supported types (docx, pdf, txt, csv). A type is only accepted when the
	matching option in the 'document_type' config section is enabled.

	Args:
		paths: dict containing "inputpath"; "filepath" is set to the saved
			location ("<inputpath>/form.<ext>") on success.
		file: uploaded file object exposing `.filename` and `.save(path)`.

	Returns:
		The MIME type string on success, otherwise the dict
		{"code": -1, "message": "file type not allowed"}.
	"""
	# A missing filename is treated like an unsupported extension.
	filename = (file.filename or "").lower()
	supported = (
		("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
		("pdf", "application/pdf"),
		("txt", "text/plain"),
		("csv", "text/csv"),
	)
	for extension, doc_type in supported:
		if filename.endswith("." + extension) and _document_type_enabled(extension):
			paths["filepath"] = paths["inputpath"] + "/form." + extension
			file.save(paths["filepath"])
			return doc_type
	return {"code": -1, "message": "file type not allowed"}
Пример #2
0
def discover_content_type(text):
	"""Identify the form type of `text` using the pipe-separated identification table.

	The table is read from "<this module's dir>/extdata/config/<form_identification_path>".
	Each row is read positionally: column 0 is the form type, column 1 a
	comma-separated list of patterns, column 2 the module name. A row matches
	when every one of its patterns is found in `text`; the first matching row wins.

	Patterns are used as regular expressions (they are not escaped). Quotes are
	removed and any run of whitespace inside a pattern matches optional whitespace.

	Args:
		text: extracted document text; "" or None yields no match.

	Returns:
		[form_type, module_name]; ["UNKNOWN", ""] when nothing matches.
	"""
	form_type = "UNKNOWN"
	module_name = ""
	form_identification_path = config.get('api', 'form_identification_path')
	config_file = "/".join(
		path.dirname(path.abspath(__file__)).split("/") + ["extdata", "config"]) + "/" + form_identification_path
	configdata = pd.read_csv(config_file, sep="|")

	if text == "" or text is None:
		return [form_type, module_name]

	def _matches(fragment):
		fragment = fragment.replace('"', "").replace("'", "").strip()
		# A callable replacement inserts the two characters "\s*" literally.
		# A "\s*" replacement *string* is rejected by re.sub on Python 3.7+
		# ("bad escape \s"), which made every call fail.
		fragment = re.sub(r"\s+", lambda _m: r"\s*", fragment)
		return re.search(r"\s*" + fragment + r"\s*", text) is not None

	# Plain tuples give positional column access without relying on the
	# deprecated integer-key fallback of Series.__getitem__.
	for row in configdata.itertuples(index=False, name=None):
		if all(_matches(fragment) for fragment in row[1].split(",")):
			form_type = row[0]
			module_name = row[2]
			break

	if form_type == "UNKNOWN":
		module_name = ""
	return [form_type, module_name]
Пример #3
0
def run_ml_api():
	"""Handle an upload request: store the file, validate it and run form processing.

	Steps: take the first uploaded file from `request.files`, save it through
	`file_type`, check its size with `verify_file_size`, then call
	`process_forms` with parameters built by `processing_headers`.

	Returns:
		The `jsonify` response of one of:
		- {"code": -1, "message": "no file received"} when nothing was uploaded,
		- the rejection dict returned by `file_type`,
		- {"code": -1, "message": "File size exceeded"},
		- the result of `process_forms`,
		- {"code": 1, "message": "file not suported"} on any unexpected error.
	"""
	try:
		x = {}
		upload_folder = config.get('api', 'upload_folder')
		# Named upload_path so it does not shadow the `path` name used elsewhere in the file.
		upload_path = upload_folder + str(uuid.uuid4())

		if not request.files:
			x["code"] = -1
			x["message"] = "no file received"
			return jsonify(x)
		print("identified")
		# Only the first uploaded file is processed.
		file = request.files[next(iter(request.files))]
		paths = init_dir(upload_path)
		doc_type_or_x = file_type(paths, file)
		logger.info(doc_type_or_x)
		print("file type done")
		# file_type returns a dict only when the file was rejected.
		if isinstance(doc_type_or_x, dict):
			return jsonify(doc_type_or_x)
		doc_type = doc_type_or_x
		print("doc type is identified")
		file_size_validity = verify_file_size(paths["filepath"])
		print("bbefore size")
		if not file_size_validity:
			x["code"] = -1
			x["message"] = "File size exceeded"
		else:
			params = {"spam_flag": "", "structuredParsing_flag": "", "unstructuredParsing_flag": "", "doc_type": doc_type, "pass_header": ""}
			params = processing_headers(request.headers, params)
			x = process_forms(paths, params)
			logger.info(x)

		return jsonify(x)
	except Exception:
		# Record the traceback so failures can be diagnosed; the client-facing
		# response is unchanged. SystemExit/KeyboardInterrupt now propagate.
		logger.exception("run_ml_api failed")
		return jsonify({"code":1, "message":"file not suported"})
Пример #4
0
def verify_file_size(filepath):
	"""Check the size of the file at `filepath` against the configured maximum.

	The limit is the 'max_file_size' option of the 'api' config section,
	given in megabytes. The measured size is logged.

	Returns:
		True when the file size is within the limit, False when it exceeds it.
	"""
	limit_mb = int(config.get('api', 'max_file_size'))
	limit_bytes = limit_mb * 1024 * 1024  # config value is in MB
	actual_bytes = path.getsize(filepath)
	logger.info("Size of the file is " + str(actual_bytes))
	return actual_bytes <= limit_bytes
Пример #5
0
def checkMalfunctions(text, py_path):
    """Report whether `text` mentions one of the configured malfunction keywords.

    The text is whitespace-normalised and run through the spaCy model at
    "<py_path>/models/en_med_astragenica". If any recognised entity carries the
    label "MALFUNCTIONS", every entity text is searched (case-insensitively)
    for each keyword from the 'MALFUNCTION_KEYWORDS' option of the
    'unstructured' config section.

    Args:
        text: document text to analyse.
        py_path: base directory that contains the "models" folder.

    Returns:
        True if a keyword occurs in an entity text and a "MALFUNCTIONS" entity
        exists, otherwise False.
    """
    text = re.sub(r"\s+", " ", text)
    keywords = config.get('unstructured', 'MALFUNCTION_KEYWORDS')
    if isinstance(keywords, str):
        # Iterating a string yields single characters, which made almost any
        # entity "match". Split it into keywords instead.
        # NOTE(review): comma-separated, optionally quoted values are assumed,
        # the same convention discover_content_type uses - confirm against the config.
        keywords = [kw.strip().strip("\"'").strip() for kw in keywords.split(",")]
        keywords = [kw for kw in keywords if kw]

    # NOTE(review): the model is loaded on every call; consider caching if this is hot.
    nlp = spacy.load(py_path + "/models/en_med_astragenica")
    entities = nlp(text).ents

    if not any(ent.label_ == "MALFUNCTIONS" for ent in entities):
        return False

    # As before, keywords are searched in the text of *all* entities,
    # not only the ones labelled MALFUNCTIONS.
    entity_texts = [ent.text.lower() for ent in entities]
    return any(kw.lower() in entity_text
               for kw in keywords
               for entity_text in entity_texts)
Пример #6
0
def unstructured_form_parsing(paths, text, unstructuredParsing_flag,
                              spam_flag):
    """Parse a document that matched no known form via the unstructured API.

    When `text` is blank, the text is rebuilt from OCR: page 1 is read from the
    existing "<ocrpath>/ocred_text.txt" (which is then deleted), and every
    further page of the PDF is rendered at 300 dpi, OCRed with pytesseract and
    appended. The text is then passed through `process_text` and `check_pqc`,
    "<ocrpath>/test.txt" is posted to the unstructured service, and the JSON
    reply is run through `process_json` and `spam_identification`.

    Args:
        paths: dict with at least "filepath", "ocrpath" and "imagepath".
        text: text already extracted from the document (may be blank).
        unstructuredParsing_flag: accepted for interface compatibility; not
            used inside this function.
        spam_flag: forwarded to `spam_identification`.

    Returns:
        dict: the spam result unchanged when its message is "spam" or its code
        is 2; otherwise the result with code 6 / "Non Form AE Case" (plus
        "categories" when the category accuracy is >= 0.98). On any exception,
        {"code": 3, "message": "error came parsing unstructured AE or checking for spam"}.
    """
    logger.info("unstructured parsing in progress")

    try:
        if text.strip() == "":
            with pdfplumber.open(paths["filepath"]) as pdfObj:
                for page in pdfObj.pages:
                    if page.page_number == 1:
                        fileName = paths["ocrpath"] + "/ocred_text.txt"
                        with open(fileName, "rb") as ocr_file:
                            # NOTE(review): ASCII decoding fails on any
                            # non-ASCII OCR output - confirm the encoding.
                            text = ocr_file.read().decode("ASCII")
                        os.remove(fileName)
                        logger.info(text)
                    else:
                        # OCR the extended page and append its text.
                        # Previously this branch raised TypeError (str + page)
                        # and never added the OCR output to `text`.
                        logger.info(
                            "unstructured: doing ocr for extended pages, page_num: "
                            + str(page.page_number))
                        image_path = paths["imagepath"] + "/form.tiff"
                        page.to_image(resolution=300).save(image_path,
                                                           format="tiff")
                        with Image.open(image_path) as page_image:
                            page_text = str(
                                pytesseract.image_to_string(page_image))
                        with open(paths["ocrpath"] + "/ocred.txt",
                                  "w") as ocr_file:
                            ocr_file.write(page_text)
                        text += "\n" + page_text

        text = process_text(paths, text)
        case_category, case_category_accu, category_flag = check_pqc(
            paths, text)

        cioms_flag = None
        url_unst1 = config.get('unstructured', 'unstructure_api_url')
        print("----------------------------------------" + url_unst1)
        # NOTE(review): the request goes to the URL built from this hard-coded
        # config file, not to url_unst1 read above - confirm which is intended.
        config_path_uns = "/home/ubuntu/pvi-form-engine/structuredForms/py_generic/extdata/config/config.json"
        with open(config_path_uns) as config_file:
            config_json_uns = json.load(config_file)
        url_unst = config_json_uns[
            "base-generic-url"] + ":" + "9888/unstruct/live"
        print("----------------------------------------" + url_unst)

        with open(paths["ocrpath"] + "/test.txt", 'rb') as upload:
            api_response = requests.post(url_unst,
                                         files={'file2': upload},
                                         headers={
                                             "PQC_FLAG": str(category_flag),
                                             "cioms_flag": str(cioms_flag)
                                         })
        x = api_response.json()
        logger.info("x from JSON")
        logger.info(x)

        y = process_json(x)
        x = spam_identification(x, y, spam_flag, text, paths)
        if x["message"] == "spam" or x["code"] == 2:
            return x
        x["code"] = 6
        x["message"] = "Non Form AE Case"
        if case_category_accu >= 0.98:
            x["categories"] = case_category
        return x

    except Exception:
        logger.info(("\nstartTrace::::" + traceback.format_exc().strip() +
                     "::::endTrace").replace("\n", "\n$"))
        x = {
            "code": 3,
            "message":
            "error came parsing unstructured AE or checking for spam"
        }
        return x
Пример #7
0
def process_forms(paths, params):
    """Route a stored document to structured or unstructured parsing.

    For PDFs the password is validated first; the file is then decrypted and,
    when it has more pages than the 'max_file_pages' option of the 'api'
    config section, trimmed to that many pages (best effort - failures are
    logged and processing continues). The content type found by
    `find_content_type` decides the parser. The working directories are
    flushed with `flush_dir` before every return.

    Args:
        paths: dict with at least "filepath" and "inputpath".
        params: dict with "spam_flag", "structuredParsing_flag",
            "unstructuredParsing_flag", "pass_header" and "doc_type".

    Returns:
        dict: the password-validation response when it carries a code and
        message; a non-list result of `find_content_type` as-is; the
        unstructured parser's response for unknown content; code 5 when
        structured parsing is disabled; otherwise the structured parser's
        response.
    """
    # Keep credentials out of the logs: the password header is masked.
    loggable_params = {
        key: ("***" if key == "pass_header" and value else value)
        for key, value in params.items()
    }
    logger.info("list of all paths is %s", paths)
    logger.info("list of all params is %s", loggable_params)
    x = {}
    # setting flag value
    flags = flag_setter(params["spam_flag"], params["structuredParsing_flag"],
                        params["unstructuredParsing_flag"])

    spam_flag = flags[0]
    structuredParsing_flag = flags[1]
    unstructuredParsing_flag = flags[2]

    # checking password (the value itself is never logged)
    logger.info("Pass header provided: %s", bool(params["pass_header"]))
    passwd = get_passwd(params["pass_header"])

    if params["doc_type"] == "application/pdf":
        response = validate_password(paths["filepath"], passwd)
        logger.info(response)

        if response['code'] is not None and response['message'] is not None:
            flush_dir(paths)
            return response
        try:
            pdf_info = pdfinfo_from_path(paths["filepath"], userpw=passwd)
            decrypt_pdf(paths, pdf_info, passwd)

            max_pages = int(config.get('api', 'max_file_pages'))
            if pdf_info["Pages"] > max_pages:
                # Trim to the configured limit (was a hard-coded 16) and read
                # the source PDF once instead of once per page.
                # presumably pfr/pfw are PDF reader/writer aliases - verify at the imports
                reader = pfr(paths["filepath"], 'rb')
                writer = pfw()
                for page_index in range(max_pages):
                    writer.addPage(reader.getPage(page_index))
                trimmed_path = paths["inputpath"] + "/trimmed_form.pdf"
                with open(trimmed_path, 'wb') as f:
                    writer.write(f)
                os.rename(trimmed_path, paths["filepath"])
        except Exception:
            # Still best effort, but no longer silent.
            logger.warning("PDF decrypt/trim step failed; continuing with the file as-is",
                           exc_info=True)

    result = find_content_type(paths, params["doc_type"])
    logger.info(result)
    # A non-list result is returned to the caller unchanged.
    if not isinstance(result, list):
        flush_dir(paths)
        return result

    content_type = result[0]
    module_name = result[1]
    text = result[2]
    # parsing
    if content_type in ["UNKNOWN", "", " "]:
        response = unstructured_form_parsing(paths, text[0],
                                             unstructuredParsing_flag,
                                             spam_flag)
        flush_dir(paths)
        return response

    # code for structured form parsing
    if not structuredParsing_flag:
        x["model_type"] = content_type
        x["code"] = 5
        x["message"] = "Form is medwatch or CIOMS."
        flush_dir(paths)
        return x

    response = structured_form_parsing(paths, content_type, module_name)
    flush_dir(paths)
    return response
Пример #8
0
def run_pvi_api():
	"""Call `app.run` with the 'url' and 'port' options of the 'api_url' config section.

	The port option is converted to int before it is passed on.
	"""
	host = config.get('api_url', 'url')
	raw_port = config.get('api_url', 'port')
	listen_port = int(raw_port)
	app.run(host, listen_port)