def read_docx_text(filepath, supplierID):
    """Collect image, contact and supplier flags for a .docx file.

    The document text is extracted with ``docx2txt.process`` and handed to
    ``utility.email_phone_url`` and ``detect_supplier_info``; the file itself
    is handed to ``image_detect``.

    Args:
        filepath: Path of the document to inspect.
        supplierID: Forwarded unchanged to ``detect_supplier_info``.

    Returns:
        A ``(flags, details)`` tuple. ``flags`` is a three-item list:
        the first element of the ``image_detect`` result, the first element
        of the ``email_phone_url`` result, and ``[1]`` or ``[0]`` depending
        on whether a supplier was reported. ``details`` is the second element
        of the ``email_phone_url`` result (presumably a dict -- it is indexed
        by string keys here), extended with ``'image'`` and/or
        ``'suppliers'`` entries when those checks are positive.
    """
    image_result = image_detect(filepath)
    raw_text = docx2txt.process(filepath)

    # Split on commas, blank out newlines/tabs in every piece, and pass the
    # *repr of the resulting list* (brackets and quotes included) downstream.
    pieces = raw_text.rstrip('\n').split(',')
    flattened = str(
        [piece.replace('\n', ' ').replace('\t', ' ') for piece in pieces]
    )

    contact_result = utility.email_phone_url(flattened)
    contact_flags = contact_result[0]
    details = contact_result[1]

    # Supplier detection works on the untouched text, not the flattened one.
    supplier_result = detect_supplier_info(raw_text, supplierID)
    supplier_found = supplier_result[0]

    image_flags = image_result[0]
    if image_flags[0]:
        details['image'] = image_result[1]

    if supplier_found:
        details['suppliers'] = supplier_result[1]
        supplier_flag = [1]
    else:
        supplier_flag = [0]

    return [image_flags, contact_flags, supplier_flag], details
def read_rtf_text_catdoc(filepath):
    """Collect image, contact and supplier results for a file read via catdoc.

    The text is obtained by running the external ``catdoc -w`` command on
    ``filepath``. Unlike the sibling ``read_*_text`` functions, the raw return
    values of the three helpers are stored in the result list as-is.

    Args:
        filepath: Path of the document to inspect.

    Returns:
        A list ``[image, email_url_phone, supplier]`` holding the results of
        ``image_detect``, ``utility.email_phone_url`` and
        ``detect_supplier_info`` respectively.
    """
    import subprocess

    image = image_detect(filepath)

    # Run catdoc with an argument list instead of a shell command string so
    # that quotes, backticks or ``$()`` in the path cannot be interpreted by a
    # shell. communicate() also waits for the process and closes the pipe.
    try:
        proc = subprocess.Popen(
            ['catdoc', '-w', filepath],
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
    except OSError:
        # The previous shell-based call produced empty output when catdoc
        # could not be started; keep that best-effort behaviour.
        data_text = ''
    else:
        data_text, _ = proc.communicate()

    # Split on commas, blank out newlines in every piece, and pass the repr of
    # the resulting list (brackets and quotes included) downstream.
    pieces = data_text.rstrip('\n').split(',')
    text_oneline = str([piece.replace('\n', ' ') for piece in pieces])

    email_url_phone = utility.email_phone_url(text_oneline)
    # NOTE(review): the other readers call detect_supplier_info(text,
    # supplierID); this call passes the text only -- confirm the helper
    # accepts a single argument.
    supplier = detect_supplier_info(data_text)

    return [image, email_url_phone, supplier]
def read_pdf_text(filepath, supplierID):
    """Collect image, contact and supplier flags for a PDF file.

    The document text is taken from ``parser.from_file(filepath)["content"]``
    and handed to ``utility.email_phone_url`` and ``detect_supplier_info``;
    the file itself is handed to ``image_detect``.

    Args:
        filepath: Path of the PDF to inspect.
        supplierID: Forwarded unchanged to ``detect_supplier_info``.

    Returns:
        A ``(flagList, flagDetails)`` tuple. ``flagList`` is a three-item
        list: the first element of the ``image_detect`` result, the first
        element of the ``email_phone_url`` result, and ``[1]`` or ``[0]``
        depending on whether a supplier was reported. ``flagDetails`` is the
        second element of the ``email_phone_url`` result (presumably a dict),
        extended with ``'image'`` and/or ``'suppliers'`` entries when those
        checks are positive.
    """
    imageInfo = image_detect(filepath)

    parsed = parser.from_file(filepath)
    # The content can come back as None when no text could be extracted
    # (tika does this for image-only PDFs -- NOTE(review): assumes `parser`
    # is tika.parser). Fall back to an empty string instead of raising
    # TypeError on the concatenation the old code performed.
    data_text = parsed["content"] or ''

    # Split on commas, blank out newlines/tabs in every piece, and pass the
    # repr of the resulting list (brackets and quotes included) downstream.
    fragments = data_text.rstrip('\n').split(',')
    text_oneline = str(
        [frag.replace('\n', ' ').replace('\t', ' ') for frag in fragments]
    )

    email_url_phone = utility.email_phone_url(text_oneline)
    email_url_phoneFlags = email_url_phone[0]
    flagDetails = email_url_phone[1]

    # Supplier detection works on the extracted text, not the flattened one.
    supplierInfo = detect_supplier_info(data_text, supplierID)
    supplierBoolean = supplierInfo[0]

    imageBoolean = imageInfo[0]
    if imageBoolean[0]:
        flagDetails['image'] = imageInfo[1]

    if supplierBoolean:
        flagDetails['suppliers'] = supplierInfo[1]
        suppliers = [1]
    else:
        suppliers = [0]

    flagList = [imageBoolean, email_url_phoneFlags, suppliers]
    return flagList, flagDetails
def read_odt_text(filepath, supplierID):
    """Collect image, contact and supplier flags for an .odt file.

    The document text is obtained by running the external ``odt2txt`` command
    on ``filepath`` and handed to ``utility.email_phone_url`` and
    ``detect_supplier_info``; the file itself is handed to ``image_detect``.

    Args:
        filepath: Path of the document to inspect.
        supplierID: Forwarded unchanged to ``detect_supplier_info``.

    Returns:
        A ``(flags, details)`` tuple. ``flags`` is a three-item list:
        the first element of the ``image_detect`` result, the first element
        of the ``email_phone_url`` result, and ``[1]`` or ``[0]`` depending
        on whether a supplier was reported. ``details`` is the second element
        of the ``email_phone_url`` result (presumably a dict), extended with
        ``'image'`` and/or ``'suppliers'`` entries when those checks are
        positive.
    """
    image_result = image_detect(filepath)

    converter = Popen(['odt2txt', filepath], stdout=PIPE)
    out_bytes, _unused_err = converter.communicate()
    # Bytes that are not valid ASCII are silently dropped by this decode.
    raw_text = out_bytes.decode('ascii', 'ignore')

    # Split on commas, blank out newlines/tabs in every piece, and pass the
    # repr of the resulting list (brackets and quotes included) downstream.
    pieces = raw_text.rstrip('\n').split(',')
    flattened = str(
        [piece.replace('\n', ' ').replace('\t', ' ') for piece in pieces]
    )

    contact_result = utility.email_phone_url(flattened)
    contact_flags = contact_result[0]
    details = contact_result[1]

    # Supplier detection works on the converter output, not the flattened text.
    supplier_result = detect_supplier_info(raw_text, supplierID)
    supplier_found = supplier_result[0]

    image_flags = image_result[0]
    if image_flags[0]:
        details['image'] = image_result[1]

    if supplier_found:
        details['suppliers'] = supplier_result[1]
        supplier_flag = [1]
    else:
        supplier_flag = [0]

    return [image_flags, contact_flags, supplier_flag], details