def add_invoice(request):
    """Handle the invoice upload form and render the extracted fields.

    On a valid POST the uploaded file is stored via ``handle_uploaded_file``,
    passed to ``extract_data`` and the extracted mapping is regrouped: the
    five core keys stay at the top level, every other key is moved into a
    nested dict under ``'other'``.  For any other request method an unbound
    form is rendered.

    When extraction yields a falsy value (NOTE(review): invoice2data is
    expected to return ``False`` when no template matches -- confirm for the
    installed version) ``result`` stays ``None`` instead of the view failing
    with ``TypeError`` while iterating a bool.
    """
    core_fields = ('issuer', 'invoice_number', 'date', 'amount', 'currency')
    result = None

    if request.method == 'POST':
        form = browse(request.POST, request.FILES)
        if form.is_valid():
            uploaded = request.FILES['file']
            handle_uploaded_file(uploaded)  # store file in upload folder
            # path of the selected file inside the upload folder
            path = "pdfextractor/static/upload/" + str(uploaded)
            extracted = extract_data(path)  # extract data from invoice pdf file
            if extracted:
                # Core keys first (in extraction order), everything else
                # grouped under 'other'.
                result = {
                    key: value
                    for key, value in extracted.items()
                    if key in core_fields
                }
                result['other'] = {
                    key: value
                    for key, value in extracted.items()
                    if key not in core_fields
                }
    else:
        form = browse()

    context = {"form": form, "result": result}
    return render(request, 'add_invoice.html', context)
def read_file(filename, debug):
    """Extract one invoice from ``INPUT/`` and write it as JSON to ``OUTPUT/``.

    Args:
        filename: Name of the PDF inside the ``INPUT/`` folder.
        debug: When exactly ``True``, the PDF text is printed page by page
            (useful while writing templates).

    On success the result is written with ``to_json.write_to_file`` and, if
    the written JSON has no ``"date_due"`` key, one is added by passing the
    invoice date to ``helper_functions.add_month``.  On failure the file name
    is handed to ``helper_functions.append_error``.
    """
    source_path = 'INPUT/' + filename

    # If debug is active, get PDF as string for debugging/template creation
    if debug is True:
        with open(source_path, 'rb') as f:
            pdf = pdftotext.PDF(f)
            print('\n\n'.join(pdf))

    templates = read_templates('TEMPLATES/')
    result = extract_data(source_path, templates, pdftotextdef)

    # if pdf read successful write JSON file
    if result != False:
        json_path = 'OUTPUT/' + os.path.splitext(filename)[0] + '.json'
        to_json.write_to_file(result, json_path, '%Y-%m-%d')

        # checks if due_date present in JSON and if not sets due date 1 month
        # after invoice date
        with open(json_path, 'r+') as file:
            data = json.load(file)
            if "date_due" not in data:
                date_obj = datetime.strptime(data["date"], '%Y-%m-%d')
                data.update({
                    "date_due":
                    helper_functions.add_month(date_obj).strftime('%Y-%m-%d')
                })
                file.seek(0)
                json.dump(data, file, indent=4, sort_keys=True)
                # Rewriting in place: drop any bytes of the previous content
                # that extend past the new document.
                file.truncate()
    # else add file name to error list and move on
    else:
        helper_functions.append_error(filename)
def flipkartInvoice(path,request):
    """Extract a Flipkart invoice, store it, and return every stored row.

    ``path`` is appended to ``settings.MEDIA_ROOT`` to locate the PDF.  The
    extracted fields are saved as a new ``Userpdfdata`` row and the full
    table is returned as a list of value tuples.  ``request`` is not used.
    """
    pdf_location = str(settings.MEDIA_ROOT + path)
    fields = extract_data(pdf_location, input_module=pdftotext)

    # NOTE(review): buyer and seller are both taken from 'issuer' -- confirm
    # this is intended.
    record = Userpdfdata(
        buyer=fields['issuer'],
        invoice_number=fields['invoice_number'],
        seller=fields['issuer'],
        invoice_date=fields['date'],
        items=fields['desc'],
        digitalized="digitized",
    )
    record.save(force_insert=True)

    return list(Userpdfdata.objects.all().values_list())
def extract_invoice(filename):
    """Extract ``filename`` with the 'max' templates and return ``to_list`` of it.

    If extraction yields a falsy result, ``exit()`` is called and the
    interpreter terminates.
    """
    templates = invoice2data.extract.loader.read_templates(
        "env\\Lib\\site-packages\\invoice2data\\extract\\templates\\max\\")
    extracted = invoice2data.extract_data(filename, templates)

    if not extracted:
        exit()
    return to_list(extracted)
def parse_pdfs(self) -> List[Dict]:
    """Run ``extract_data`` on every PDF reported by ``self.fm``.

    Templates are read once via ``self.read_templates()`` and reused for all
    files.  The returned list holds one extraction result per PDF, in the
    order given by ``self.fm.get_pdf_list()``.
    """
    pdf_paths = self.fm.get_pdf_list()
    templates = self.read_templates()
    return [extract_data(pdf_path, templates=templates) for pdf_path in pdf_paths]
def amazonInvoice(path,request):
    """Extract an Amazon invoice, store it, and return every stored row.

    ``path`` is appended to ``settings.MEDIA_ROOT`` to locate the PDF.  The
    ``'description'`` value of each entry in ``'lines'`` is concatenated,
    each followed by a comma, and stored as ``items``.  ``request`` is not
    used.
    """
    pdf_location = str(settings.MEDIA_ROOT + path)
    fields = extract_data(pdf_location, input_module=pdftotext)

    # assumes each entry of 'lines' is a mapping -- TODO confirm
    item_text = "".join(
        line['description'] + ","
        for line in fields['lines']
        if 'description' in line
    )

    record = Userpdfdata(
        buyer=fields['issuer'],
        invoice_number=fields['invoice_number'],
        seller=fields['partner_name'],
        invoice_date=fields['date'],
        items=item_text,
        digitalized="digitized",
    )
    record.save(force_insert=True)

    return list(Userpdfdata.objects.all().values_list())
def mailExtract(request):
    """Handle an invoice upload, persist the extracted fields, and render them.

    On a valid POST the uploaded file is stored, passed to ``extract_data``
    and, if extraction produced data, saved as an ``Invoicelist`` row owned
    by ``request.user``.  Every extracted key that is not one of the five
    core fields is rendered as ``"key-value"`` and passed through
    ``Reverse`` before being stored in ``other``.

    The core fields are read by key.  The previous implementation placed
    them with positional ``list.insert`` calls while iterating the result,
    so which value ended up in which model field depended on the iteration
    order of the extracted dict.

    Raises:
        KeyError: if a core field is missing from a non-empty extraction
            result (the previous code raised ``IndexError`` in that case).
    """
    core_fields = ('issuer', 'invoice_number', 'date', 'amount', 'currency')
    result = None

    if request.method == 'POST':
        form = browse(request.POST, request.FILES)
        if form.is_valid():
            uploaded = request.FILES['file']
            handle_uploaded_file(uploaded)  # store file in upload folder
            # path of the selected file inside the upload folder
            path = "invoicedata/static/upload/" + str(uploaded)
            result = extract_data(path)  # extract data from file
            print(result)

            # A falsy result (e.g. no matching template) has nothing to save.
            if result:
                core = {name: result[name] for name in core_fields}
                other = [
                    str(key) + "-" + str(value)
                    for key, value in result.items()
                    if key not in core_fields
                ]
                nother = Reverse(other)
                print(core)
                print(nother)

                p = Invoicelist(issuer=core['issuer'],
                                invoice_number=core['invoice_number'],
                                amount=core['amount'],
                                date=core['date'],
                                currency=core['currency'],
                                other=nother,
                                author=request.user)
                p.save()
    else:
        form = browse()

    context = {"form": form, "result": result}
    return render(request, 'invoicedata/showdownloadedFile1.html', context)
def extract_multi(file_refrence, catagory):
    """Split a PDF, extract every resulting page, and return the combined HTML.

    ``pdf_splitter`` is called first, then every ``*.pdf`` in the processing
    folder is passed through ``extract_data`` and ``to_table``.  The returned
    string is the concatenated tables followed by a "Total" heading built
    from the module-level ``total``, which is reset to 0 before returning.

    NOTE(review): ``total`` is presumably updated by ``to_table`` -- verify.
    """
    global total

    pdf_splitter(file_refrence, catagory)
    page_paths = glob.glob(
        r"C:\Users\Shahrukh\Desktop\djangofilesupload\filesupload\pdf_processing\{}".format("*.pdf"))
    print(page_paths)

    templates = read_templates(
        r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates')
    tables = ''.join(
        to_table(extract_data(page_path, templates=templates), page_path)
        for page_path in page_paths
    )
    remove_file()

    ret_data = '{result}<h3 style="float:right">Total: {total}</h3>'.format(
        result=tables, total=total)
    total = 0
    return ret_data
def __init__(self, invDirectory, filename):
    """Populate the instance from the data extracted out of one invoice PDF.

    Every key of the extraction result becomes an attribute of the same
    name.  When the result is falsy, three blank lines are printed and
    ``issuer`` is set to the string ``"null"``.  ``filename`` is always
    stored on the instance.
    """
    templates = read_templates(cwd + '/tplf')
    extracted = extract_data(invDirectory + '/' + filename, templates=templates)
    print(extracted)
    print(filename)

    if not extracted:
        for _ in range(3):
            print()
        setattr(self, "issuer", "null")
    else:
        for attr_name, attr_value in extracted.items():
            setattr(self, attr_name, attr_value)

    setattr(self, "filename", filename)
def index(request):
    """Render the upload page; on a valid POST also show the extracted data.

    Non-POST requests get an unbound form and ``result`` of ``None``.  A POST
    with a valid form stores the uploaded file and passes whatever
    ``extract_data`` returns to the template as ``result``.
    """
    extracted = None

    if request.method != 'POST':
        upload_form = browse()
    else:
        upload_form = browse(request.POST, request.FILES)
        if upload_form.is_valid():
            uploaded = request.FILES['file']
            handle_uploaded_file(uploaded)  # store file in upload folder
            # path of the selected file inside the upload folder
            stored_path = "pdfextractor/static/upload/" + str(uploaded)
            extracted = extract_data(stored_path)  # extract data from file

    return render(request, 'pdfextractor/index.html',
                  {"form": upload_form, "result": extracted})
def thread_fn(file, dummy):
    """Extract one file and log a one-line quality report for it.

    The report is logged in yellow and appended to ``issue_list`` when any
    item was missed or the quantity check is not ``"Match"``; otherwise it
    is logged in green.  ``dummy`` is not used.
    """
    tesseract_cmd = [
        "tesseract", "-c",
        "tessedit_char_whitelist=/.: abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789",
    ]
    templates = read_templates('./templates')
    result, missed, corrected, issue_lines, qtyerr, noofitem = extract_data(
        r + file,
        input_module=INPUT_MODULE,
        templates=templates,
        cmdlist=tesseract_cmd,
        conv_cmdlist=None,
        tid=TID)

    # Local tallies; they are not read again inside this function.
    total_missed = 0
    total_corrected = 0
    total_line_with_issue = 0
    total_missed += missed
    total_corrected += corrected
    total_line_with_issue += len(issue_lines)

    report = f'=============================> missed: {missed} corrected: {corrected} line with issue: {len(issue_lines)} Qty issue: {qtyerr} Total Items: {noofitem}<============================'

    if missed == 0 and qtyerr == "Match":
        logger.error(CGREEN + report + CEND)
    else:
        logger.error(CYELLOW + report + CEND)
        issue_list.append(file + "\t" + report)
def extract_invoice_details(filename):
    """Extract one uploaded invoice and write the result as a JSON file.

    Files whose extension is not ``pdf`` are first converted with
    ``MakingSearchablePDFs.convert_image_to_searchable_pdf``.  The result of
    ``extract_data`` is serialized with ``json.dumps`` and written under
    ``output/processed/`` on success or ``output/failed/`` when extraction
    returned ``False``.  An empty ``filename`` is a no-op.

    The JSON string is what gets written: the previous code passed the raw
    result (a dict or ``False``) to ``file.write``, which raises
    ``TypeError`` on a text-mode file.
    """
    if filename == '':
        return

    # 1. Case for images: convert to a searchable PDF first.
    # 2. Case for pdfs: nothing to do.
    if filename.split('.')[-1] != 'pdf':
        filename = MakingSearchablePDFs.convert_image_to_searchable_pdf(
            filename)

    # YAML Template System
    source = 'input/uploads/' + filename
    templates = read_templates('Templates/')
    result = extract_data(source, templates=templates)
    print('\n', type(result))
    print('\n', result)

    # default=str so values json cannot encode natively are written as text.
    json_data = json.dumps(result, indent=4, sort_keys=True, default=str)

    if result != False:
        destination = 'output/processed/' + filename
        print(type(json_data), json_data)
    else:
        destination = 'output/failed/' + filename
        print('Failed for Processing of Invoice!!!')

    # NOTE(review): name_of_image is not defined in this function; it must
    # come from module scope -- confirm it exists.
    with open(destination + name_of_image + '_json.json', 'w') as file:
        file.write(json_data)
# Stand-alone script: read the templates from a fixed folder, run
# extract_data on one fixed sample PDF and print both the templates and the
# extraction result.
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import sys

# Absolute path of the invoice to process (machine-specific).
filename = 'C://Users//Shahrukh//Desktop//document_ai//invoiceX//pdf_input//sample_pg_6.pdf'
#filename = sys.argv[1]

templates = read_templates(
    'C://Users//Shahrukh//Desktop//document_ai//invoiceX//templates')
print(templates)

result = extract_data(filename, templates=templates)
#print("\n")
print(result)
#print("Working inside the jojo code")

# Preprocessing: re-arrange and re-formating the extracted output
# (The string literal below is disabled code kept for reference; it is never
# executed.)
''' date = result['date'].strftime('%d, %b %Y') total = result['total'] invoice_number = result['invoice_number'] addr_from = result['From_Address'] addr_to = result['To_Address'] '''
# Stand-alone script: read the templates from ./datasets and run extract_data
# on one sample invoice PDF from the same folder.
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import os

templates = read_templates('./datasets')
# The result is only bound to `result`; this script does not print or save it.
result = extract_data('datasets/MktPlace-Myntra.pdf', templates=templates)
def parse_pdf(pdf_path):
    """Return ``extract_data`` for ``pdf_path`` using the bundled templates.

    Templates are read from ``peco_assistant/data/templates`` on every call.
    """
    return extract_data(
        pdf_path,
        templates=read_templates('peco_assistant/data/templates'))
def submit_com():
    """Extract the file named in ``input_entry`` and write it as xml/csv/json.

    The output is written as ``output.<type>`` into the folder taken from
    ``folder_entry``; if that write raises anything, the file is written to
    the current working directory instead.  Progress and outcome are reported
    through ``status_entry``.

    NOTE(review): this block uses the Python 2 ``print`` statement, so it
    only parses on Python 2.
    """
    filename = input_entry.get()
    status_entry.delete(0, "")
    status_entry.insert(0, "Cheking for file")
    path = folder_entry.get()
    content = extract_data(filename)
    # Failure is detected by comparing the string form of the result.
    if str(content) == "False":
        status_entry.delete(0, "")
        status_entry.insert(0, "File Error!!!")
    else:
        file_type = list_option.get()
        file_type = file_type.lower()
        file_name = "output." + file_type
        if file_type == "xml":
            content = dicttoxml(content, custom_root='test', attr_type=False)
            try:
                # First choice: write into the user-selected folder.
                out_file_open = open(join(path, file_name), 'w')
                out_file_open.write(str(content))
                out_file_open.close()
                status_entry.delete(0, "")
                status_entry.insert(0, "Check Folder")
                input_entry.delete(0, "")
                print join(path, file_name)
            # NOTE(review): bare except -- any error (not only a bad folder)
            # triggers the fallback below.
            except:
                # Fallback: write into the current working directory.
                out_file_open = open((file_name), 'w')
                out_file_open.write(str(content))
                out_file_open.close()
                status_entry.delete(0, "")
                status_entry.insert(
                    0, ("Successfully written in " + file_name))
                input_entry.delete(0, "")
        if file_type == "csv":
            try:
                with open(join(path, file_name),
                          'wb') as file_opner:  # Just use 'w' mode in 3.x
                    # One header row from the dict keys, then one data row.
                    file_csv = csv.DictWriter(file_opner, content.keys())
                    file_csv.writeheader()
                    file_csv.writerow(content)
                    status_entry.delete(0, "")
                    status_entry.insert(0, "Check Folder")
                    input_entry.delete(0, "")
            except:
                with open((file_name),
                          'w') as file_opner:  # Just use 'w' mode in 3.x
                    file_csv = csv.DictWriter(file_opner, content.keys())
                    file_csv.writeheader()
                    file_csv.writerow(content)
                    status_entry.delete(0, "")
                    status_entry.insert(
                        0, ("Successfully written in " + file_name))
                    input_entry.delete(0, "")
        if file_type == "json":
            try:
                out_file_open = open(join(path, file_name), 'w')
                # NOTE(review): str(content) writes the Python repr of the
                # result, not json.dumps output -- confirm this is intended.
                out_file_open.write(str(content))
                out_file_open.close()
                status_entry.delete(0, "")
                status_entry.insert(0, "Check floder")
                input_entry.delete(0, "")
            except:
                out_file_open = open((file_name), 'w')
                out_file_open.write(str(content))
                out_file_open.close()
                status_entry.delete(0, "")
                status_entry.insert(
                    0, ("Successfully written in " + file_name))
                input_entry.delete(0, "")
"foreign_bruto": (worksheet.write_number, to_number), "local_base": (worksheet.write_number, to_number), "local_vat": (worksheet.write_number, to_number), "local_bruto": (worksheet.write_number, to_number), "exchange_rate": (worksheet.write_number, to_number) } #OCR count = 1 total = len([f for f in invoice_folder.iterdir()]) for invoice in invoice_folder.iterdir(): try: logger.info(" {} Processing ... {} / {}".format( invoice.name, count, total)) result = extract_data(str(invoice), templates=templates, input_module=tesseract4) if not result: result = {"document_number": str(invoice.name)} logger.info(" {} - No template".format(invoice.name)) else: result["document_number"] = invoice.name results.append(result) count += 1 except Exception as e: print('Error:', e) logger.exception(" {} - Exception occurred...".format(invoice.name)) continue try:
# Stand-alone script: build the invoice path and the template path from fixed
# (machine-specific) folders, print both, then run extract_data on the invoice.
import os
import invoice2data as ntd
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates

file_name = 'NT_01.pdf'
temp_name = 'pdf2inv.py'
file_path = os.path.join(
    r'C:\Users\fkhalil\primeStone\docrecog\sampleDocs\EURO DIESEL\B1959500485',
    file_name)
temp_path = os.path.join(r'C:\Users\fkhalil\primeStone\docrecog\templates',
                         temp_name)
print(file_path, temp_path)

# NOTE(review): temp_path names a single .py file; read_templates is given a
# folder everywhere else in this code base -- confirm this path is intended.
templates = read_templates(temp_path)
result = extract_data(file_path, templates=templates)
# Importing all the required libraries from invoice2data import extract_data from invoice2data.extract.loader import read_templates from invoice2data.input import pdftotext import pandas as pd # Importing custom template templates = read_templates('./template/') #print(templates) # Extract data from PDF result = extract_data('./data/pnlsheet.pdf', templates=templates, input_module=pdftotext) # Store the extracted data to a Data-frame df = pd.DataFrame(data=result) # Export Data-frame to a csv file df.to_csv('./data/invoice2data_simple.csv') ''' You can use any desired library to extract data from pdftotext, pdftotext, pdfminer, tesseract. It is optional and by default pdftotext will be used if not specified. The custom template named temp.yml is placed in the templates. You can remove the templates parameter in extract_data(). Default templates will be used '''
#path = "/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files"
# Open the MongoDB collection, then run extract_data on every PDF in the
# static pdf_files folder.  The lists filled here (filenames, path_filename,
# pdfFiles1, pdfFiles) are created earlier in the script.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["pdfinvdata"]
mycol = mydb["invoicedata"]

for filename in os.listdir(
        '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/'
):
    # Only PDF files are processed.
    if not filename.endswith(".pdf"):
        continue

    filenames.append(filename)
    pathname = '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/' + str(
        filename)
    path_filename.append(pathname)

    result = extract_data(pathname, read_template)
    pdfFiles1.append(result)  # raw result, including failures
    #print(type(result))
    #print("----------------------")

    # pdfFiles holds the extracted data, or the bare file name on failure.
    pdfFiles.append(filename if (result == False) else result)

#pdfFiles.sort()
#print(pdfFiles)
#print(filenames)
#print(path_filename)
i = 0
import json, pandas import csv, os import sys, subprocess HOME = os.environ['HOME'] template_folder = sys.argv[2].strip() invoices_folder = sys.argv[1].strip() invoice_files = [ os.path.join(invoices_folder, f) for f in os.listdir(invoices_folder) if os.path.isfile(os.path.join(invoices_folder, f)) ] print('Template:', template_folder) for each in invoice_files: if not '.pdf' in each: continue result = extract_data(each) if not result: cmd = ['invoice2data', '--template-folder', template_folder, each] session = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) result = session.communicate()[0].decode('UTF-8') print(result) result_json = json.loads(result) print(result_json) #result = extract_data('/Users/hithyshikrishnamurthy/Downloads/Pearson_Converted_3.split/Pearson_Converted_3.1.pdf') csvfile_name = HOME + '/Desktop/invoice_details.csv' first_write = True if os.path.exists(csvfile_name): first_write = False
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import json
import datetime
import argparse
import sys


def _json_default(value):
    """Serialize date/datetime values as ``YYYY-MM-DD``; reject anything else."""
    if isinstance(value, datetime.date):
        return value.strftime('%Y-%m-%d')
    raise TypeError(
        "Object of type {} is not JSON serializable".format(
            type(value).__name__))


def main(argv=None):
    """Extract one invoice and print its data as formatted JSON.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 1 when ``extract_data`` produced
        no data for the invoice.
    """
    # Initialize the arguments parser
    parser = argparse.ArgumentParser(description="Extract data from invoices")

    # Add the parameters positional/optional
    parser.add_argument('-t', '--templates_dirpath',
                        help="Templates directory path", type=str)
    # Without an invoice path there is nothing to extract, so fail at
    # argument parsing with a usage message.
    parser.add_argument('-i', '--invoice_path',
                        help="Invoice file path", type=str, required=True)

    # Parse the arguments
    args = parser.parse_args(argv)

    templates = read_templates(args.templates_dirpath)
    output_data = extract_data(args.invoice_path, templates=templates)

    # A falsy result cannot be indexed; report it instead of crashing.
    if not output_data:
        print("No data could be extracted from {}".format(args.invoice_path),
              file=sys.stderr)
        return 1

    # Print the formatted JSON data output; every date value (not only
    # 'date') is rendered as YYYY-MM-DD.
    print(json.dumps(output_data, indent=2, default=_json_default))
    return 0


if __name__ == '__main__':
    sys.exit(main())
def extract(
        name,
        templates_dir=r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates'):
    """Return the ``extract_data`` result for the file ``name``.

    Args:
        name: Path of the invoice file to process.
        templates_dir: Folder the templates are read from.  Defaults to the
            previously hard-coded location, so existing callers are
            unaffected; pass another folder to run on a different machine.
    """
    templates = read_templates(templates_dir)
    return extract_data(name, templates=templates)
# Stand-alone script: run extract_data on test.pdf.
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates

# NOTE(review): read_templates is given the PDF file itself rather than a
# template folder -- this looks unintended; confirm the template location.
templates = read_templates('test.pdf')
result = extract_data('test.pdf', templates=templates)
# st = "Total: 4.00 4,123.00"
# result = st.split()
# result.pop(1)
# print(" ".join(result))