def training_dataset():
    # Load the documents from the training location and convert them into a dataset
    final_dataset = {}
    try:
        logger.debug("\n\n training_dataset : {}", tokenize_stem)
        # read the template (category) names from the training directory
        templates = [name for name in os.listdir(training_location)]
        # Load the data
        logger.info('\n Loading the dataset...\n')
        logger.info('\n templates : {}', templates)
        from sklearn import datasets
        final_dataset = datasets.load_files(training_location,
                                            description=None,
                                            categories=templates,
                                            load_content=True,
                                            encoding='ISO-8859-1',
                                            shuffle=False,
                                            random_state=42)
        logger.debug("\n type -> final_dataset : {}", type(final_dataset))
    except Exception as err:
        error_updation.exception_log(err, "error in final_dataset", str(''))
    return final_dataset
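
# A minimal usage sketch (illustrative only): sklearn's load_files returns a
# Bunch object, so downstream training code can read the documents and their
# template (folder) labels directly.
#
#   dataset = training_dataset()
#   # dataset.data          -> list of document contents (str)
#   # dataset.target        -> numeric label per document
#   # dataset.target_names  -> template folder names
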
def get_security_token():
    import requests as req
    token = ''
    try:
        sec_token_url = config['CLASSIFIER']['SEC_TOKEN_URL']
        sec_username = config['CLASSIFIER']['SEC_USERNAME']
        sec_password = config['CLASSIFIER']['SEC_PASSWORD']
        sec_grant_type = config['CLASSIFIER']['SEC_GRANT_TYPE']
        payload = {
            'username': sec_username,
            'password': sec_password,
            'grant_type': sec_grant_type
        }
        headers = {'Content-Type': 'application/x-www-form-urlencoded'}
        sec_reqst = req.request("GET", url=sec_token_url, data=payload,
                                headers=headers)
        if sec_reqst and sec_reqst.status_code == 200:
            sec_data = sec_reqst.json()
            token = 'bearer ' + sec_data['access_token']
    except Exception as error:
        error_updation.exception_log(error, " Security Token issue ", str(0))
    return token
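
# A hedged note: the token endpoint is assumed to return an OAuth2-style JSON
# body, e.g. {"access_token": "...", "token_type": "bearer", ...}; only
# "access_token" is consumed above. OAuth2 password grants are normally sent
# as POST, so the GET verb here relies on the target service accepting it.
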
def connect_classification(payload):
    import requests as req
    import json
    status_code = 0
    try:
        ipaddress_str = find_ipaddress()
        classifier_api = "http://%s:%s/%s" % (
            ipaddress_str, config['CLASSIFIER']['API_PORT_NUMBER'],
            config['CLASSIFIER']['CLASSIFIER_URL_NAME'])
        # serialise into a separate variable so payload['inbound_id'] stays
        # accessible below (dumping over payload itself broke those lookups)
        json_payload = json.dumps(payload)
        class_headers = {
            'Content-Type': "application/json",
            'cache-control': "no-cache"
        }
        reqst = req.request("POST", url=classifier_api, data=json_payload,
                            headers=class_headers)
        logger.debug("classifier_api: {} {}", classifier_api, json_payload)
        logger.debug("\n\n Connector reqst: {}", reqst)
        status_code = reqst.status_code
        if not reqst or reqst.status_code != 200:
            classifier_transactions.update_inbound_status(payload['inbound_id'])
            error_updation.custom_error_update_log(
                "Classification service is down",
                "Classification service is down", str(payload['inbound_id']))
        logger.debug("\n\n reqst_result: {}", reqst.status_code)
    except Exception as error:
        error_updation.exception_log(error, " Classification API request issue ",
                                     str(payload['inbound_id']))
    return status_code
def ocr_processing(img_file_name, inbound_id, auth_key):
    try:
        if os.path.exists(img_file_name):
            pytesseract.pytesseract.tesseract_cmd = pytesseract_install_loc
            logger.info(' \n\n Image processing with openCV & PIL .....')
            img = cv2.imread(img_file_name)  # read the image
            # light dilate/erode pass to clean up noise before OCR
            kernel = np.ones((1, 1), np.uint8)
            img = cv2.dilate(img, kernel, iterations=1)
            img = cv2.erode(img, kernel, iterations=1)
            os.chdir(image2text_path)
            new_image = os.path.normpath(
                os.path.join(
                    temp_directory,
                    str(inbound_id) + "_" + get_file_name(img_file_name) + ".jpg"))
            cv2.imwrite(new_image, img)
            img2txt = pytesseract.image_to_string(new_image)
            if os.path.exists(new_image):
                os.remove(new_image)
            return str(img2txt)
        else:
            logger.info(" Error : Image not found : {}", img_file_name)
    except Exception as error:
        error_updation.exception_log(error, "Image to Text conversion failed",
                                     str(inbound_id))
        classifier_transactions.update_inbound_status(inbound_id, auth_key)
        logger.debug("Image to Text conversion failed , Error : {}", error)
def spacy_entity_extraction(content):
    try:
        from nltk import word_tokenize
        # Detok is assumed to be nltk's TreebankWordDetokenizer; it may also
        # be imported at module level elsewhere in this file
        from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok
        import spacy
        nlp = spacy.load('en_core_web_md')
        # capitalise each token so spaCy's NER picks up proper nouns
        capitalized_text = [text.capitalize() for text in word_tokenize(content)]
        detokenizer = Detok()
        detokenized_text = detokenizer.detokenize(capitalized_text)
        nlp_document = nlp(detokenized_text)
        str_replace_dict = {}
        if len(nlp_document.ents) == 0:
            str2 = detokenized_text
        else:
            # wrap every non-CARDINAL entity in a highlight span
            for entities in nlp_document.ents:
                if entities.label_ != 'CARDINAL':
                    key = entities.text
                    str_replace_dict[key] = "<span class='imp'>" + key + '</span>'
            str2 = multiwordReplace(detokenized_text, str_replace_dict)
        return str2
    except Exception as e:
        error_updation.exception_log(e, "Error in entities_extraction :", str(''))
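
# A minimal usage sketch (output is illustrative; the exact entities found
# depend on the en_core_web_md model):
#
#   spacy_entity_extraction("invoice from acme corp in london")
#   # -> "Invoice From <span class='imp'>Acme Corp</span> In
#   #     <span class='imp'>London</span>"
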
def add_group(auth_key):
    # local result dict renamed so it no longer shadows the function name
    group_result = {'group_no': 0, 'error_msg': '', 'error_code': 0}
    cur_group_no = 0
    try:
        get_headers['Authorization'] = auth_key
        headers['Authorization'] = auth_key
        # fetch the current highest group id
        cur.execute("SELECT id FROM public.table_dfx_similiaritygroup WHERE id=(select max(id) from public.table_dfx_similiaritygroup);")
        rows = cur.fetchall()
        if rows:
            for row in rows:
                cur_group_no = row[0]
        group_dict["name"] = "Group_" + str(int(cur_group_no) + 1)
        query = """INSERT INTO table_dfx_similiaritygroup(name, status, parenttypeid, modeltypeid, createdtimestamp, groupfilelocation, isreverted, modifiedtimestamp)
                   VALUES(%(name)s, %(status)s, %(parenttypeid)s, %(modeltypeid)s, %(createdtimestamp)s, %(groupfilelocation)s, %(isreverted)s, %(modifiedtimestamp)s);"""
        cur.execute(query, group_dict)
        # read back the id of the newly inserted group
        cur.execute("SELECT id FROM public.table_dfx_similiaritygroup WHERE id=(select max(id) from public.table_dfx_similiaritygroup);")
        rows_ = cur.fetchall()
        for cur_group_no_ in rows_:
            logger.info(cur_group_no_[0])
            group_result['group_no'] = int(cur_group_no_[0])
    except Exception as error:
        group_result['error_code'] = 5
        group_result['error_msg'] += "find_group : Issue with Resource to find the unclassified documents group "
        error_updation.exception_log(
            error,
            str(group_result['error_code']) + group_result['error_msg'],
            str(group_result['error_code']))
        logger.debug("\n\n exception: {}", str(error))
    return group_result
def config_data():
    try:
        tmp_request_url = urllib.parse.urljoin(system_config_all, sysytem_config_api)
        resp = req.request(method='GET', url=tmp_request_url)
        # archive the existing config before writing a fresh one
        Filename_config = 'config_' + time.strftime("%Y-%m-%d %H%M%S") + '.ini'
        if os.path.exists('config.ini'):
            os.rename('config.ini', Filename_config)
            shutil.move(Filename_config, ini_archive_loc)
            logger.info("File renamed and moved")
        else:
            logger.debug("File not found")
        with open('config.ini', 'w+') as configuration:
            configuration.write('[CLASSIFIER]' + "\n")
            if resp and resp.status_code == 200:
                json_str_in_db = json.loads(resp.text)
                for json_str_values in json_str_in_db:
                    configuration_name = json_str_values['Name']
                    configuration_value = json_str_values["Value"]
                    # config_line renamed so it no longer shadows the global config parser
                    config_line = configuration_name + "=" + configuration_value + "\n"
                    configuration.write(config_line)
        return json.dumps(dic_class['Config_file_created'])
    except Exception as e:
        error_updation.exception_log(e, "Error in config data", str(''))
def convert2str(val, is_date):
    # second parameter renamed from "bool", which shadowed the builtin
    import xlrd
    try:
        if type(val) is float and is_date == 0:
            return str(int(val))
        elif type(val) is float and is_date == 1:
            # interpret the float as an Excel serial date (1900 date system)
            xldate = xlrd.xldate.xldate_as_datetime(val, 0)
            return xldate
    except Exception as error:
        error_updation.exception_log(
            error,
            "Error occurred in Excel date conversion : convert2str ERROR ",
            str(val))
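
# A minimal usage sketch for convert2str (values are illustrative):
#
#   convert2str(12345.0, 0)   # -> "12345" (plain numeric cell)
#   convert2str(43831.0, 1)   # -> datetime.datetime(2020, 1, 1, 0, 0)
#                             #    (Excel serial date, 1900 date system)
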
def pickle_save(model, path, filename):
    try:
        import pickle
        model_name = os.path.normpath(os.path.join(path, filename))
        with open(model_name, 'wb') as f:
            pickle.dump(model, f)
        logger.debug("\n\n Pickle dumped : {}", path)
        if os.path.exists(model_name):
            return model_name
        else:
            # OSError is portable; WindowsError only exists on Windows
            raise OSError(" Model build process is failed")
    except Exception as error:
        error_updation.exception_log(error, "Model build process is failed",
                                     str(filename))
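
# A minimal round-trip sketch (clf_model and model_dir are hypothetical
# placeholders, not names from this module):
#
#   import pickle
#   saved = pickle_save(clf_model, model_dir, "classifier.pkl")
#   with open(saved, 'rb') as f:
#       restored = pickle.load(f)   # same model object, ready to predict
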
def extract_value(search_pre_str, search_post_str, raw_text):
    search_str = ''
    try:
        if len(raw_text.strip()) >= 1 and search_pre_str is not None \
                and search_post_str is not None and raw_text is not None:
            # pre/post strings are treated as regex patterns delimiting the value
            value = re.search(search_pre_str + '(.*)' + search_post_str + '(.*)',
                              raw_text)
            if value:
                search_str = value.group(1)
    except Exception as e:
        error_updation.exception_log(e, "extract_value exception:",
                                     str(search_pre_str))
    logger.debug(" search_str: {}", search_str)
    return search_str
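
# A minimal usage sketch (strings are illustrative): the pre/post arguments
# act as regex delimiters around the value to capture.
#
#   text = "Invoice No: INV-001 Date: 2020-01-01"
#   extract_value('Invoice No:', 'Date:', text)   # -> ' INV-001 '
#   # callers such as processing_template() apply .strip() to the result
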
def processing_template(csvfile, input_txtfile, out_json_file):
    try:
        import csv
        with open(input_txtfile, "r+", errors='ignore', encoding='utf-8') as f:
            new_data = f.read().replace("\n", ' ')
        final_out = {}
        # each CSV row defines a field plus its begin/end delimiter strings
        with open(csvfile, 'r') as template_file:
            reader = csv.DictReader(template_file,
                                    fieldnames=("field", "begin", "end"))
            for row in reader:
                final_out[row["field"]] = extract_value(row["begin"], row["end"],
                                                        new_data).strip()
        with open(out_json_file, "w+", encoding='utf-8') as jsonfile:
            jsonfile.write(json.dumps(final_out))
        logger.info("processing_template , JSON : {}", out_json_file)
    except Exception as error:
        error_updation.exception_log(error, "Error in data extraction",
                                     str(input_txtfile))
    return out_json_file
def pdf_to_image(file_path, inbound_id, auth_key, no_of_pages=ocr_all_pages):
    import os
    file_content = ""
    try:
        if os.path.exists(file_path):
            import fitz
            doc = fitz.open(file_path)
            pdf_file_name = get_file_name(file_path)
            for i in range(no_of_pages):
                for img in doc.getPageImageList(i):
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)
                    png_file_name = os.path.normpath(
                        os.path.join(image_path,
                                     pdf_file_name + "_" + str(xref) + ".png"))
                    logger.info("\nPNG File Name : {}", png_file_name)
                    if pix.n < 5:  # this is GRAY or RGB
                        pix.writePNG(png_file_name)
                    else:  # CMYK: convert to RGB first
                        pix1 = fitz.Pixmap(fitz.csRGB, pix)
                        pix1.writePNG(png_file_name)
                        pix1 = None
                    pix = None
                    file_content = file_content + " " + ocr_processing(
                        png_file_name, inbound_id, auth_key)
                    # if os.path.exists(png_file_name):
                    #     os.remove(png_file_name)
        else:
            logger.info(" \n\n pdf file not found ")
    except Exception as error:
        error_updation.exception_log(
            error, "Error with Image processing pdf_to_image", str(inbound_id))
        classifier_transactions.update_inbound_status(inbound_id, auth_key)
    return file_content
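
# Note: doc.getPageImageList() and pix.writePNG() are legacy PyMuPDF names;
# current releases expose the same behaviour as get_page_images() and save().
# A hedged equivalent for newer PyMuPDF versions (an assumption, untested
# against this project's pinned version):
#
#   for img in doc.get_page_images(i):
#       pix = fitz.Pixmap(doc, img[0])
#       pix.save(png_file_name)   # replaces pix.writePNG(png_file_name)
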
def similarity(input_file, auth_key):
    sim_details = {'error_msg': '', 'error_code': 0, 'new_group': 0,
                   'group_no': 0, 'accuracy_rate': 0}
    group_no = 0
    try:
        if os.path.exists(input_file):
            training_dataset = []
            # collect every training text file; each entry is prefixed with its
            # group directory name so the best match maps back to a group
            doc_labels = [
                os.path.join(path, name)
                for path, subdirs, files in os.walk(sim_template)
                for name in files if name.endswith('.txt')
            ]
            logger.debug(" \n\n\n doc_labels : {}", doc_labels)
            remove_digits = str.maketrans('', '', digits)
            for file in doc_labels:
                with open(file, 'r', errors='ignore', encoding='utf-8') as f:
                    training_dataset.append(
                        ((os.path.basename(os.path.dirname(file)) + group_identifier +
                          (f.read()).translate(remove_digits)).replace("\n", ' ')) + "\n")
            logger.debug(" \n\n training_dataset: {}", training_dataset)
            with open(input_file, "r", errors='ignore', encoding='utf-8') as f:
                new_data = f.read().replace("\n", ' ').translate(remove_digits)
            logger.debug("Number of train docs : {}".format(len(training_dataset)))
            if len(training_dataset) > 0:
                train_docs = [[w.lower() for w in word_tokenize(line)]
                              for line in training_dataset]
                train_dictionary = gensim.corpora.Dictionary(train_docs)
                train_corpus = [train_dictionary.doc2bow(gen_doc)
                                for gen_doc in train_docs]
                logger.info("Creating TF-IDF model")
                tf_idf = gensim.models.TfidfModel(train_corpus)
                logger.info(tf_idf)
                sim_model_loc = config['CLASSIFIER']['SIM_MODEL_LOC']
                sims = gensim.similarities.Similarity(sim_model_loc,
                                                      tf_idf[train_corpus],
                                                      num_features=999999999)
                logger.info("sims: {}", sims)
                result = new_data_query(test_data=new_data,
                                        dictionary=train_dictionary,
                                        tf_idf_model=tf_idf, sims=sims)
                logger.debug("\n\n\n result: {}", result)
                # indices of the five most similar training documents, best first
                indices = np.asarray(result).argsort()[-5:][::-1]
                res = [result[_] for _ in indices]
                logger.debug("\n\n res : {}", res[0])
                sim_details['accuracy_rate'] = res[0]
                group_list = training_dataset[indices[0]].split(group_identifier)
                if len(group_list) > 0 and float(res[0]) >= float(sim_accuracy):
                    # confident match: reuse the existing group
                    group_no = group_list[0]
                    sim_details['group_no'] = group_no
                    logger.debug("\n\n\n Existing Group Number : {}", group_no)
                else:
                    # low similarity: create a brand-new group
                    group_details = classifier_transactions.add_group(auth_key)
                    group_no = int(group_details['group_no'])
                    sim_details['group_no'] = group_no
                    if group_no is not None and group_no != '' and group_no > 0:
                        sim_details['new_group'] = 1
                    else:
                        sim_details['error_msg'] = group_details['error_msg']
                        sim_details['error_code'] = group_details['error_code']
                    logger.debug("\n\n\n New Group Number : {}", group_no)
        else:
            sim_details['error_msg'] = (" similarity : Error in similarity() : "
                                        "Resource is not available in mentioned "
                                        "location " + input_file)
            sim_details['error_code'] = 4
            sim_details['group_no'] = group_no
    except Exception as err:
        sim_details['error_code'] = 4
        sim_details['error_msg'] = ("Exception occurred in similarity : Please check "
                                    "the input resource : " + input_file + " , " + str(err))
        error_updation.exception_log(err, str(sim_details), str(input_file))
    return sim_details
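
# new_data_query() is defined elsewhere in this module. A minimal sketch of
# how such a query is typically run against a gensim Similarity index (an
# assumption about its implementation, not a verbatim copy):
#
#   def new_data_query(test_data, dictionary, tf_idf_model, sims):
#       query_doc = [w.lower() for w in word_tokenize(test_data)]
#       query_bow = dictionary.doc2bow(query_doc)     # bag-of-words vector
#       return sims[tf_idf_model[query_bow]]          # similarity per train doc
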
def call_classiciation():
    import requests as req
    import json
    try:
        payload = {}
        payload['file_name'] = None
        payload['inbound_id'] = 0
        payload['IsTrainingSource'] = 0
        payload['is_unclassified'] = 1
        payload['Authorization'] = sec_token
        payload['FileLength'] = 0
        logger.debug('sec_token -- > {}', sec_token)
        get_headers['Authorization'] = sec_token
        ipaddress_str = find_ipaddress()
        port = config['CLASSIFIER']['API_PORT_NUMBER']
        if is_port_open(ipaddress_str, port):
            # first, look for an already-grouped unclassified document
            logger.debug("\n unclassifier_api {} get_headers {}", unclassifier_api,
                         get_headers)
            unclass_resp = req.request(method='GET', url=unclassifier_api,
                                       headers=get_headers)
            unclass_data = unclass_resp.json()
            logger.debug("\n\n unclass_data: {}", unclass_data)
            if unclass_resp and unclass_resp.status_code == 200 and unclass_data is not None and len(unclass_data) > 0:
                payload['unclass_id'] = unclass_data['Id']
                payload['file_name'] = unclass_data['FileLocation']
                payload['inbound_id'] = unclass_data['DiscoveryInBoundId']
                payload['IsTrainingSource'] = 0
                logger.debug(" \n\n Inclassification : payload {}", payload)
                connect_classification(payload)
            else:
                # otherwise, pull pending inbound documents from discovery
                logger.debug("\n classifier_api {} get_headers {}",
                             get_file_info_request, get_headers)
                resp = req.request(method='GET', url=get_file_info_request,
                                   headers=get_headers)
                discovery_file_details = resp.json()
                logger.debug(" \n\n json_data: {} ", discovery_file_details)
                if resp and resp.status_code == 200 and discovery_file_details is not None and len(discovery_file_details) > 0:
                    logger.debug(" \n\n template_details : {}", discovery_file_details)
                    for document in discovery_file_details:
                        try:
                            running_status = 1
                            payload = {}
                            payload['file_name'] = None
                            payload['unclass_id'] = 0
                            payload['inbound_id'] = 0
                            payload['IsTrainingSource'] = 0
                            payload['is_unclassified'] = 0
                            payload['Authorization'] = sec_token
                            logger.debug("\n\n document: {}", document)
                            json_str = json.loads(document['Value'])
                            crm_file_name = ''
                            discover_source_id = 0
                            dirpath = json_str.get("FileDirectoryName")
                            payload['inbound_id'] = document['Id']
                            payload['FileLength'] = json_str.get("FileLength")
                            payload['IsTrainingSource'] = 0
                            # CRM documents must be resolved to a local path first
                            if document['SourceTag'] == 'CRM' and json_str.get("FileFullPath").strip() != '' and json_str.get("Annotationid").strip() != '':
                                crm_document_reqst = req.request(
                                    method='GET', url=json_str.get("FileFullPath"))
                                crm_document_data = crm_document_reqst.json()
                                logger.debug("crm_document_data[0] {}",
                                             crm_document_data[0])
                                if crm_document_reqst and crm_document_reqst.status_code == 200 and crm_document_data is not None and os.path.exists(crm_document_data[0]):
                                    payload['file_name'] = crm_document_data[0]
                                else:
                                    error_updation.exception_log(
                                        " CRM Request : Unable to locate document ",
                                        " CRM Request : Unable to locate document ",
                                        str(document['Id']))
                            else:
                                payload['file_name'] = json_str.get("FileFullPath")
                            logger.debug(discover_source_id)
                            logger.debug("\n\n payload: {}", payload)
                            connect_classification(payload)
                        except Exception as error:
                            error_updation.exception_log(
                                " Classification API request issue ",
                                " Classification API service is down ",
                                str(payload['inbound_id']))
                else:
                    time.sleep(10)
        else:
            error_updation.exception_log(" Classification API service is down ",
                                         " ", str(0))
            time.sleep(10)
    except ConnectionError:
        pass


@app.route('/pause')
def pause_job():
    print("Pause")
    scheduler.pause()
    print(" Pause ->", scheduler.state)
    return jsonify("pause")


@app.route('/dfx_stop')
def stop_job():
def find_group(new_doc_name, auth_key):
    # initialised locally so the except branch can safely append to it
    group_details = {'error_msg': '', 'error_code': 0, 'group_no': -1,
                     'unclassified_file_name': '', 'new_group': 0}
    try:
        if os.path.exists(new_doc_name):
            group_no = 0
            get_group_details = document_similarity.similarity(new_doc_name,
                                                               auth_key)
            logger.debug("\n\n\n Find group: get_group_details {}",
                         get_group_details)
            group_details['accuracy_rate'] = get_group_details['accuracy_rate']
            group_details['group_no'] = get_group_details['group_no']
            group_no = get_group_details['group_no']
            group_details['new_group'] = get_group_details['new_group']
            logger.debug("\n\n\n Find group: group_no {}", group_no)
            if group_no is not None and int(group_no) > 0 and new_doc_name is not None and new_doc_name != '':
                logger.debug("\n\n find_group : Group No from similarity : {}",
                             group_no)
                logger.info("\n\n Group No : {}", group_no)
                template_path = os.path.normpath(
                    os.path.join(sim_template, str(group_no)))
                group_file_path = os.path.normpath(
                    os.path.join(sim_group_loc, str(group_no)))
                group_details['group_file_path'] = group_file_path
                group_details['doc_name'] = (str(group_no) + group_identifier +
                                             get_file_name(new_doc_name) +
                                             get_file_ext(new_doc_name))
                access_rights = 0o755
                try:
                    import shutil
                    template_text_file = os.path.normpath(
                        os.path.join(template_path, group_details['doc_name']))
                    logger.info(template_text_file)
                    dim_doc_location = os.path.normpath(
                        os.path.join(group_file_path, group_details['doc_name']))
                    if not os.path.exists(group_file_path):
                        os.makedirs(group_file_path, access_rights)
                    if not os.path.exists(template_path):
                        os.makedirs(template_path, access_rights)
                    if get_group_details['new_group'] == 1:
                        # brand-new group: the document becomes the group template
                        os.rename(new_doc_name, template_text_file)
                        shutil.copy(template_text_file, group_file_path)
                    else:
                        file_rename(new_doc_name, dim_doc_location)
                except OSError as os_err:
                    group_details['error_code'] = 3
                    group_details['error_msg'] += (
                        "find_group : Failed to create the directory , Dir : " +
                        template_path + " , " + group_file_path + ", " + str(os_err))
                group_details['group_no'] = group_no
                group_details['file_name'] = dim_doc_location
                logger.debug(
                    "\n\n\n ##### Unclassified Grouping : Find Group : {}, \n\n Group File temp loc: {}",
                    group_details, dim_doc_location)
            else:
                group_details['error_msg'] += get_group_details['error_msg']
                group_details['error_code'] += get_group_details['error_code']
        else:
            group_details['error_msg'] += " Resource or Directory is not available "
            group_details['error_code'] = 3
    except Exception as error:
        group_details['error_code'] = 3
        error_updation.exception_log(
            error, "EXCEPTION in find_group " + str(group_details['error_code']),
            str(new_doc_name))
        group_details['error_msg'] += "find_group : Issue with Resource to find the unclassified documents group "
    return group_details