Exemplos de read_templates em Python, exemplos de invoice2data.extract.loader.read_templates em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: account_invoice_import.py Projeto: CURE-EMR/oca10

 def invoice2data_parse_invoice(self, file_data):
     """Run the invoice2data library on the raw bytes of a PDF invoice.

     ``file_data`` is written to a temporary file created with ``mkstemp``.
     Templates are collected from the directory configured under
     ``invoice2data_templates_dir`` (when it exists) and from the library
     defaults unless ``invoice2data_exclude_built_in_templates`` is set;
     ``extract_data`` is then called on the temporary file.

     Raises ``UserError`` when ``extract_data`` raises any exception.

     NOTE(review): the temporary file is not removed in the visible part of
     this method -- confirm it is cleaned up further down.
     """
     logger.info('Trying to analyze PDF invoice with invoice2data lib')
     fd, file_name = mkstemp()
     try:
         os.write(fd, file_data)
     finally:
         # Always release the descriptor, even if the write fails.
         os.close(fd)
     # Transfer log level of Odoo to invoice2data
     loggeri2data.setLevel(logger.getEffectiveLevel())
     local_templates_dir = tools.config.get('invoice2data_templates_dir',
                                            False)
     logger.debug('invoice2data local_templates_dir=%s',
                  local_templates_dir)
     templates = []
     if local_templates_dir and os.path.isdir(local_templates_dir):
         templates += read_templates(local_templates_dir)
     exclude_built_in_templates = tools.config.get(
         'invoice2data_exclude_built_in_templates', False)
     if not exclude_built_in_templates:
         templates += read_templates()
     logger.debug('Calling invoice2data.extract_data with templates=%s',
                  templates)
     try:
         invoice2data_res = extract_data(file_name, templates=templates)
     # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
     # Python-2-only "except X, e" form.
     except Exception as e:
         raise UserError(
             _("PDF Invoice parsing failed. Error message: %s") % e)

Exemplo n.º 2

0

Exibir arquivo

def main(args=None):
    """Take folder or single file and analyze each.

    Parameters
    ----------
    args : argparse.Namespace, optional
        Pre-parsed arguments. When ``None`` the arguments are parsed from the
        command line with the parser returned by ``create_parser``.

    The process exits through ``sys.exit`` with the ``missed`` value reported
    for the last processed file (``0`` when no file was processed).
    """
    if args is None:
        parser = create_parser()
        args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # Both command lists are optional. Bind them unconditionally so the
    # extract_data call below never hits an unbound local when the option
    # was not given.
    cmdlist = args.cmdlist.split("+") if args.cmdlist else None
    imgcmd = args.imgcmd.split("+") if args.imgcmd else None

    templates = []
    # Load templates from external folder if set.
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))

    # Load internal templates, if not disabled.
    if not args.exclude_built_in_templates:
        templates += read_templates()
    output = []

    # Default exit status when there is no input file to process.
    missed = 0

    for f in args.input_files:
        try:
            res, missed, corrected, issue_lines, qtyerr, noofitem = extract_data(
                f.name,
                templates=templates,
                input_module=args.input_reader,
                cmdlist=cmdlist,
                conv_cmdlist=imgcmd,
                tid=args.tid,
            )
            if res:
                logger.info(res)
                output.append(res)
                if args.copy or args.move:
                    # The target name is the same for copy and move.
                    filename = args.filename.format(
                        date=res["date"],
                        invoice_number=res["invoice_number"],
                        desc=res["desc"],
                    )
                if args.copy:
                    shutil.copyfile(f.name, join(args.copy, filename))
                if args.move:
                    shutil.move(f.name, join(args.move, filename))
        finally:
            # Close the handle even when extraction or copying fails.
            f.close()

    generate_output(output, output_name=args.output_name, output_date_format=args.output_date_format, output_module=args.output_format)

    sys.exit(missed)

Exemplo n.º 3

0

Exibir arquivo

def main(args=None):
    """Command-line entry point: extract data from every input file.

    When ``args`` is ``None`` the command line is parsed with the parser
    from ``create_parser``. Every successfully extracted result is logged,
    collected, optionally copied and/or moved under a name built from
    ``args.filename``, and finally handed to the selected output module.
    """
    if args is None:
        args = create_parser().parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)

    input_module = input_mapping[args.input_reader]
    output_module = output_mapping[args.output_format]

    # External template folder first, then the built-in set unless disabled.
    templates = []
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))
    if not args.exclude_built_in_templates:
        templates += read_templates()

    def _target_name(fields):
        # Build the destination file name from the extracted fields.
        return args.filename.format(
            date=fields['date'].strftime('%Y-%m-%d'),
            invoice_number=fields['invoice_number'],
            desc=fields['desc'],
        )

    collected = []
    for handle in args.input_files:
        extracted = extract_data(handle.name,
                                 templates=templates,
                                 input_module=input_module)
        if extracted:
            logger.info(extracted)
            collected.append(extracted)
            if args.copy:
                copy_name = _target_name(extracted)
                shutil.copyfile(handle.name, join(args.copy, copy_name))
            if args.move:
                move_name = _target_name(extracted)
                shutil.move(handle.name, join(args.move, move_name))
        handle.close()

    if output_module is None:
        return
    output_module.write_to_file(collected, args.output_name,
                                args.output_date_format)

Exemplo n.º 4

0

Exibir arquivo

def read_file(filename, debug):
    """Extract one invoice from ``INPUT/`` and store the result as JSON.

    Parameters
    ----------
    filename : str
        Name of the invoice file inside the ``INPUT/`` folder.
    debug : bool
        When exactly ``True`` the text of the PDF is printed first, which
        helps when writing templates.

    On success the result is written to ``OUTPUT/<stem>.json``; if that JSON
    has no ``date_due`` key, one is added using
    ``helper_functions.add_month`` on the invoice date. On failure the file
    name is passed to ``helper_functions.append_error``.
    """
    input_path = 'INPUT/' + filename

    # If debug is active, get PDF as string for debugging/template creation
    if debug is True:
        with open(input_path, 'rb') as f:
            pdf = pdftotext.PDF(f)
        print('\n\n'.join(pdf))

    templates = read_templates('TEMPLATES/')
    result = extract_data(input_path, templates, pdftotextdef)
    # if pdf read successful write JSON file
    if result != False:
        json_path = 'OUTPUT/' + os.path.splitext(filename)[0] + '.json'
        to_json.write_to_file(result, json_path, '%Y-%m-%d')

        # checks if due_date present in JSON and if not sets due date 1 month after invoice date
        with open(json_path, 'r+') as json_file:
            data = json.load(json_file)
            if "date_due" not in data:
                date_obj = datetime.strptime(data["date"], '%Y-%m-%d')
                data["date_due"] = helper_functions.add_month(
                    date_obj).strftime('%Y-%m-%d')
                # Rewrite the file in place; truncate afterwards so no bytes
                # of the previous content survive if the new dump is shorter.
                json_file.seek(0)
                json.dump(data, json_file, indent=4, sort_keys=True)
                json_file.truncate()
    # else add file name to error list and move on
    else:
        helper_functions.append_error(filename)

Exemplo n.º 5

0

Exibir arquivo

def extract_data(invoicefile, templates=None, input_module=pdftotext, debug=False):
    """Extract structured data from ``invoicefile`` using the templates.

    The text returned by ``input_module.to_text`` is decoded as UTF-8 and
    offered to each template in order; the first template whose
    ``matches_input`` accepts the prepared text produces the result. If none
    matches, the first template whose ``template_name`` contains
    ``'default'`` is used as a fallback. Returns ``False`` when neither
    applies. ``templates`` defaults to ``read_templates()``; ``debug``
    switches the logging configuration to DEBUG level.
    """
    if debug:
        logging.basicConfig(level=logging.DEBUG)

    if templates is None:
        templates = read_templates()

    raw_text = input_module.to_text(invoicefile)
    extracted_str = raw_text.decode('utf-8')

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))
    for template in templates:
        prepared = template.prepare_input(extracted_str)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared)

    # No regular match: look for a template flagged as the default one.
    fallback = None
    for template in templates:
        if 'default' in template['template_name']:
            fallback = template
            break

    if fallback:
        logger.error("Falling back to default template.")
        prepared = fallback.prepare_input(extracted_str)
        return fallback.extract(prepared)

    logger.error('No template for %s', invoicefile)
    return False

Exemplo n.º 6

0

Exibir arquivo

def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Extract structured data from a PDF or image invoice.

    The invoice is converted to text by ``input_module`` and the text is
    tested against each template in turn; the first matching template
    performs the extraction.

    Parameters
    ----------
    invoicefile : str
        Path of the invoice file (example: "/home/duskybomb/pdf/invoice.pdf").
    templates : list, optional
        Template objects to test. Loaded with ``read_templates()`` when
        omitted.
    input_module : module, optional
        Object exposing ``to_text(path)`` whose result is decoded as UTF-8.
        Defaults to ``pdftotext``.

    Returns
    -------
    dict or False
        Whatever the matching template's ``extract`` returns, or ``False``
        when no template matches.

    Examples
    --------
    When using ``invoice2data`` as a library

    >>> from invoice2data.input import pdftotext
    >>> extract_data("invoice2data/test/pdfs/oyo.pdf", None, pdftotext)
    {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087',
     'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}

    """
    if templates is None:
        templates = read_templates()

    raw_output = input_module.to_text(invoicefile)
    extracted_str = raw_output.decode("utf-8")

    logger.debug("START pdftotext result ===========================")
    logger.debug(extracted_str)
    logger.debug("END pdftotext result =============================")

    template_count = len(templates)
    logger.debug("Testing {} template files".format(template_count))

    for candidate in templates:
        prepared = candidate.prepare_input(extracted_str)
        if not candidate.matches_input(prepared):
            continue
        return candidate.extract(prepared)

    logger.error("No template for %s", invoicefile)
    return False

Exemplo n.º 7

0

Exibir arquivo

Arquivo: filedata.py Projeto: shahrukhgellani/DemoFRE

def extract_multi(file_refrence, catagory):
    """Split a PDF, extract every resulting page and return the HTML report.

    ``pdf_splitter`` is run first, then every ``*.pdf`` found in the
    processing folder is extracted and rendered with ``to_table``. The
    module-level ``total`` is embedded in the report and reset to 0 before
    returning.
    """
    global total
    pdf_splitter(file_refrence, catagory)

    pages = glob.glob(r"C:\Users\Shahrukh\Desktop\djangofilesupload\filesupload\pdf_processing\{}".format("*.pdf"))
    print(pages)

    templates = read_templates(r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates')

    # Render the pages one by one, in the order glob returned them.
    fragments = []
    for page in pages:
        extracted = extract_data(page, templates=templates)
        fragments.append(to_table(extracted, page))
    result = ''.join(fragments)

    remove_file()
    ret_data = '{result}<h3 style="float:right">Total: {total}</h3>'.format(result=result, total=total)
    total = 0
    return ret_data

Exemplo n.º 8

0

Exibir arquivo

Arquivo: invoice_manipulation.py Projeto: joco8/Invoice-Handler

 def __init__(self, invDirectory, filename):
     """Extract the invoice and expose every extracted field as an attribute.

     When extraction yields nothing, ``issuer`` is set to ``"null"`` and
     ``filename`` to the given file name instead.
     """
     templates = read_templates(cwd + '/tplf')
     pdfFile = invDirectory + '/' + filename
     result = extract_data(pdfFile, templates=templates)
     print(result)
     print(filename)

     if not result:
         # Three blank lines separate failed files in the console output.
         for _ in range(3):
             print()
         setattr(self, "issuer", "null")
         setattr(self, "filename", filename)
         return

     for item, value in result.items():
         setattr(self, item, value)

Exemplo n.º 9

0

Exibir arquivo

def get_parsed_data(templates, extracted_str, partiallyExtracted):
    """Return the extraction of the first template matching ``extracted_str``.

    ``templates`` falls back to ``read_templates()`` when it is ``None``.
    ``partiallyExtracted`` is forwarded unchanged to the template's
    ``extract``. Returns ``False`` when no template matches.
    """
    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    candidates = read_templates() if templates is None else templates

    logger.debug('Testing {} template files'.format(len(candidates)))
    for template in candidates:
        prepared = template.prepare_input(extracted_str)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared, partiallyExtracted)

    return False

Exemplo n.º 10

0

Exibir arquivo

Arquivo: main.py Projeto: us241098/invoice2data

def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Extract invoice fields with the first template that matches.

    The text from ``input_module.to_text(invoicefile)`` is decoded as UTF-8
    and tested against ``templates`` (``read_templates()`` when omitted).
    Returns the matching template's extraction, or ``False`` if none match.
    """
    if templates is None:
        templates = read_templates()

    text_bytes = input_module.to_text(invoicefile)
    extracted_str = text_bytes.decode('utf-8')

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))
    for tpl in templates:
        cleaned = tpl.prepare_input(extracted_str)
        if tpl.matches_input(cleaned):
            break
    else:
        # Loop ended without a break: nothing matched.
        logger.error('No template for %s', invoicefile)
        return False

    return tpl.extract(cleaned)

Exemplo n.º 11

0

Exibir arquivo

Arquivo: test.py Projeto: aksonlyaks/invoice2data

def thread_fn(file, dummy):
    """Extract one invoice and log a one-line quality report for it.

    Parameters
    ----------
    file : str
        File name, appended to the module-level prefix ``r``.
    dummy
        Unused.

    Files with missed items or a quantity mismatch are logged in yellow and
    appended to the module-level ``issue_list``; all others are logged in
    green.

    NOTE(review): ``r`` is not defined in this function -- presumably a
    module-level path prefix; confirm.
    """
    cmdlist = ["tesseract", "-c", "tessedit_char_whitelist=/.: abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"]
    templates = read_templates('./templates')
    result, missed, corrected, issue_lines, qtyerr, noofitem = extract_data(r+file, input_module=INPUT_MODULE, templates=templates, cmdlist=cmdlist, conv_cmdlist=None, tid = TID)

    # The former total_* accumulators were locals that were never read, so
    # they have been removed; the per-file numbers go straight into the report.
    report = f'=============================> missed: {missed} corrected: {corrected} line with issue: {len(issue_lines)} Qty issue: {qtyerr} Total Items: {noofitem}<============================'
    if missed != 0 or qtyerr != "Match":
        logger.error(CYELLOW + report + CEND)
        issue_list.append(file+"\t"+report)
    else:
        logger.error(CGREEN + report + CEND)

Exemplo n.º 12

0

Exibir arquivo

def extract_invoice_details(filename):
    """Extract an uploaded invoice and write the result as a JSON file.

    Parameters
    ----------
    filename : str
        Name of the upload inside ``input/uploads/``. An empty string is
        ignored. Anything whose extension is not ``pdf`` is first converted
        with ``MakingSearchablePDFs.convert_image_to_searchable_pdf``.

    The JSON text is written under ``output/processed/`` when extraction
    succeeds and under ``output/failed/`` otherwise.
    """
    if filename == '':
        return

    # 1.Case for images: convert to a searchable PDF first.
    # 2.Case for pdfs: nothing to do.
    if filename.split('.')[-1] != 'pdf':
        filename = MakingSearchablePDFs.convert_image_to_searchable_pdf(
            filename)

    # YAML Template System
    source = 'input/uploads/' + filename
    templates = read_templates('Templates/')
    result = extract_data(source, templates=templates)
    print('\n', type(result))
    print('\n', result)

    # Serialise once, for both outcomes, so the write below always has a
    # string to write (the raw result is a dict or False, which
    # file.write() rejects). default=str covers non-JSON values such as dates.
    json_data = json.dumps(result,
                           indent=4,
                           sort_keys=True,
                           default=str)
    if result != False:
        destination = 'output/processed/' + filename
        print(type(json_data), json_data)
    else:
        destination = 'output/failed/' + filename
        print('Failed for Processing of Invoice!!!')

    # NOTE(review): name_of_image is not defined in this function --
    # presumably a module-level name; confirm it exists.
    with open(destination + name_of_image + '_json.json', 'w') as file:
        file.write(json_data)

Exemplo n.º 13

0

Exibir arquivo

import os
from pathlib import Path
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
from invoice2data.input import tesseract4
import xlsxwriter
from my_logger import logger

# Rename the shared logger so records from this script are tagged "FX".
logger.name = "FX"
# The working directory sits next to this script; the invoice and template
# folders are sub-folders of it.
work_dir_path = Path(__file__).parent / 'work_dir'
invoice_folder = work_dir_path / "original"
# Load the templates once at import time.
templates = read_templates(work_dir_path / "templates")

# file_name = "invoice-test.pdf"
#file_name = "57001194492019.tiff"
#file_path = work_dir_path / "original" / file_name
#result = extract_data(str(file_path), templates=templates, input_module=tesseract4)
#print(result)


#Excel setting
def no_change(val):
    """Return ``val`` unchanged (identity converter)."""
    return val


def to_number(val):
    """Convert ``val`` to a float after removing every ',' from it."""
    without_commas = val.replace(',', '')
    return float(without_commas)


# Create the output workbook 'invoices.xlsx' and add one worksheet to it.
workbook = xlsxwriter.Workbook('invoices.xlsx')
worksheet = workbook.add_worksheet()

Exemplo n.º 14

0

Exibir arquivo

Arquivo: filedata.py Projeto: shahrukhgellani/DemoFRE

def extract(name):
    """Extract the invoice at ``name`` with the templates from the fixed folder."""
    return extract_data(
        name,
        templates=read_templates(r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates'),
    )

Exemplo n.º 15

0

Exibir arquivo

Arquivo: pdfinvoice2data.py Projeto: senthilsweb/Invoicepdf2Data

from flask import Flask, render_template, request
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import yaml
import glob
import errno
import os
import pymongo
#import PyPDF2

# Flask application object for this module.
app = Flask(__name__)

# Templates are loaded once at import time from the "invoice_templates" folder.
read_template = read_templates(folder="invoice_templates")

# Module-level accumulators, all starting empty.
pdfFiles = []
pdfFiles1 = []
pdfFiles2 = []
filenames = []
path_filename = []

#path = "/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files"
# MongoDB handles: local server, database "pdfinvdata", collection "invoicedata".
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["pdfinvdata"]
mycol = mydb["invoicedata"]

for filename in os.listdir(
        '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/'
):
    if filename.endswith(".pdf"):
        filenames.append(filename)
        pathname = '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/' + str(

Exemplo n.º 16

0

Exibir arquivo

#!/Library/Frameworks/Python.framework/Versions/3.7/bin/python3

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import json
import sys

# First command-line argument: name of the invoice file.
# NOTE(review): raises IndexError when the script is run without arguments.
file = sys.argv[1]

# The file name is appended to a fixed directory.
filename = '/Applications/XAMPP/xamppfiles/htdocs/invoice-app/admin/' + file

# Load the templates from a fixed folder and run the extraction.
templates = read_templates('/Users/krzemson/Desktop/aed/tpl')
result = extract_data(filename, templates=templates)

# Emit the result as JSON on stdout.
# NOTE(review): json.dumps fails on values that are not JSON-serialisable
# (e.g. datetime objects) -- confirm the templates never produce them.
print(json.dumps(result))

Exemplo n.º 17

0

Exibir arquivo

Arquivo: invoice2data.py Projeto: hilali-msc/financial-documents-ocr-deep-learning

# Importing all the required libraries

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
from invoice2data.input import pdftotext
import pandas as pd

# Importing custom template
# Load every template found in ./template/
templates = read_templates('./template/')

#print(templates)

# Extract data from PDF
result = extract_data('./data/pnlsheet.pdf',
                      templates=templates,
                      input_module=pdftotext)

# Store the extracted data to a Data-frame
# NOTE(review): if result is a flat dict of scalar values, pandas raises
# ValueError here unless an index is passed; result may also be False when
# no template matches -- confirm the expected shape.
df = pd.DataFrame(data=result)

# Export Data-frame to a csv file
df.to_csv('./data/invoice2data_simple.csv')
''' 
You can use any desired library to extract data from pdftotext, pdftotext, pdfminer, tesseract. It is optional
and by default pdftotext will be used if not specified.

The custom template named temp.yml is placed in the templates. You can remove the templates parameter in
extract_data(). Default templates will be used

'''

Exemplo n.º 18

0

Exibir arquivo

Arquivo: test_cli.py Projeto: yucer/invoice2data

 def setUp(self):
     """Store the default templates and a fresh argument parser on the instance."""
     self.templates = read_templates()
     self.parser = create_parser()

Exemplo n.º 19

0

Exibir arquivo

Arquivo: extract_data.py Projeto: audacesk/blue_ml

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import json
import datetime
import argparse

if __name__ == '__main__':
    # Initialize the arguments parser
    parser = argparse.ArgumentParser(description="Extract data from invoices")

    # Add the parameters positional/optional
    parser.add_argument('-t','--templates_dirpath', help="Templates directory path", type=str)
    parser.add_argument('-i','--invoice_path', help="Invoice file path", type=str)

    # Parse the arguments
    args = parser.parse_args()

    templates = read_templates(args.templates_dirpath)

    output_data = extract_data(args.invoice_path, templates=templates)

    # A falsy result means nothing was extracted; stop with a readable
    # message instead of failing on output_data['date'] below.
    if not output_data:
        raise SystemExit('No data extracted from %s' % args.invoice_path)

    # datetime objects are not JSON-serialisable: store the date as text.
    date_time = output_data['date'].strftime('%Y-%m-%d')
    output_data['date'] = date_time
    # then print the formatted JSON data output
    print(json.dumps(output_data, indent=2))

Exemplo n.º 20

0

Exibir arquivo

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import os

# Load the templates from ./datasets and run them against one sample invoice
# stored in the same folder.
templates = read_templates('./datasets')
result = extract_data('datasets/MktPlace-Myntra.pdf', templates=templates)

Exemplo n.º 21

0

Exibir arquivo

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import sys

# Hard-coded sample invoice; the commented line below reads it from the
# command line instead.
filename = 'C://Users//Shahrukh//Desktop//document_ai//invoiceX//pdf_input//sample_pg_6.pdf'
#filename = sys.argv[1]
# Load the templates from a fixed folder and print them for inspection.
templates = read_templates(
    'C://Users//Shahrukh//Desktop//document_ai//invoiceX//templates')
print(templates)

# Run the extraction and print whatever it returns.
result = extract_data(filename, templates=templates)
#print("\n")
print(result)
#print("Working inside the jojo code")

# Preprocessing: re-arrange and re-formating the extracted output
'''
date = result['date'].strftime('%d, %b %Y')
total = result['total']
invoice_number = result['invoice_number']
addr_from = result['From_Address']
addr_to = result['To_Address']
'''

Exemplo n.º 22

0

Exibir arquivo

def parse_pdf(pdf_path):
    """Extract ``pdf_path`` using the templates bundled under peco_assistant."""
    return extract_data(
        pdf_path,
        templates=read_templates('peco_assistant/data/templates'),
    )

Exemplo n.º 23

0

Exibir arquivo

Arquivo: test_extraction.py Projeto: Chatbot123/pdftotext

 def setUp(self):
     """Store the default templates on the instance."""
     self.templates = read_templates()

Exemplo n.º 24

0

Exibir arquivo

Arquivo: test_cli.py Projeto: m3nu/invoice2data

 def setUp(self):
     """Store the default templates and a fresh argument parser on the instance."""
     self.templates = read_templates()
     self.parser = create_parser()

Exemplo n.º 25

0

Exibir arquivo

 def read_templates(self):
     """Return the templates read from ``settings.TEMPLATE_DIR``.

     The call inside resolves to the module-level ``read_templates``
     function, not to this method.
     """
     return read_templates(settings.TEMPLATE_DIR)

Exemplo n.º 26

0

Exibir arquivo

def extract_data(invoicefile, templates=None, input_module="png", cmdlist=None, conv_cmdlist=None, tid=None):
    """Extracts structured data from PDF/image invoices.

    This function uses the text extracted from a PDF file or image and
    pre-defined regex templates to find structured data.

    A template is first selected by ``tid``; when none is selected this
    way, every template is tried against the extracted text.

    Parameters
    ----------
    invoicefile : str
        path of electronic invoice file in PDF,JPEG,PNG (example: "/home/duskybomb/pdf/invoice.pdf")
    templates : list of template objects, optional
        Loaded with ``read_templates()`` when omitted.
    input_module : str, optional
        Key into ``input_mapping`` selecting the text-extraction module.
    cmdlist : list, optional
        Command list forwarded to ``to_text``; replaced by a copy of
        ``cmdlist_psm3`` / ``cmdlist_psm6`` when the selected template
        defines a ``psm`` option.
    conv_cmdlist : list, optional
        Conversion command list forwarded to ``to_text``; replaced by the
        selected template's ``imgcmd`` option when present.
    tid : str, optional
        Identifier compared with ``str()`` of each entry of a template's
        ``tid`` option.

    Returns
    -------
    tuple or False
        ``(output, missed, corrected, issue_lines, qtyerr, noofitem)``.
        The last five keep their defaults (``-1, -1, [], "", -1``) unless
        the template was selected by ``tid`` and has a ``decimal`` option,
        in which case they come from ``post_process``. ``output`` is ``[]``
        when no template matches. ``False`` is returned when any exception
        is raised inside the function.

    Notes
    -----
    When a template's ``psm`` option is not 3, element 6 of the module-level
    ``cmdlist_psm6`` list is overwritten before it is copied.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    try:
        t = None
        if templates is None:
            templates = read_templates()

        input_module = input_mapping[input_module]

        logger.error("Input tid is %s and Input module is %s", tid, input_module)
        # Select a template by tid; stop scanning once one has been found.
        for tt in templates:
            if t is not None:
                break
            if "tid" in tt.options:
                for tid_option in tt.options["tid"]:
                    if str(tid_option) == tid:
                        t = tt
                        logger.error(f'Template found based on tid {t.options["tid"]} {t["issuer"]}')

        if t is not None and "psm" in t.options:
            logger.error("PSM is %d", t.options["psm"])
            if str(t.options["psm"]) == "3":
                cmdlist = copy.deepcopy(cmdlist_psm3)
            else:
                cmdlist_psm6[6] = str(t.options["psm"])
                cmdlist = copy.deepcopy(cmdlist_psm6)

        if t is not None and "imgcmd" in t.options:
            logger.error("imgcmd is %s", t.options["imgcmd"])
            conv_cmdlist = t.options["imgcmd"]

        extracted_str = input_module.to_text(invoicefile, cmdlist=cmdlist, conv_cmdlist=conv_cmdlist).decode("utf-8")

        logger.debug("START pdftotext result ===========================")
        logger.error(extracted_str)
        logger.debug("END pdftotext result =============================")

        logger.debug("Testing {} template files".format(len(templates)))
        missed = -1
        corrected = -1
        issue_lines = []
        qtyerr = ""
        noofitem = -1
        output = []
        if t is None:
            for t in templates:
                optimized_str = t.prepare_input(extracted_str)

                if t.matches_input(optimized_str):
                    # Return the same 6-tuple as every other non-error
                    # path, so callers can always unpack six values.
                    output = t.extract(optimized_str)
                    return output, missed, corrected, issue_lines, qtyerr, noofitem
        else:
            optimized_str = t.prepare_input(extracted_str)
            output = t.extract(optimized_str)
            if "decimal" in t.options:
                missed, corrected, issue_lines, qtyerr, noofitem = post_process(output, t.options)

            return output, missed, corrected, issue_lines, qtyerr, noofitem

        logger.error("No template for %s", invoicefile)
        return output, missed, corrected, issue_lines, qtyerr, noofitem
    except Exception as ex:
        logger.error("Exception occured in invoice conversion "+ str(ex))

    # NOTE(review): callers that unpack six values will fail on this bare
    # False; kept as-is to preserve the existing error contract.
    return False

Exemplo n.º 27

0

Exibir arquivo

import os
import invoice2data as ntd
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
# Build the invoice path and the template path from fixed folders.
file_name = 'NT_01.pdf'
temp_name = 'pdf2inv.py'
file_path = os.path.join(
    r'C:\Users\fkhalil\primeStone\docrecog\sampleDocs\EURO DIESEL\B1959500485',
    file_name)
temp_path = os.path.join(r'C:\Users\fkhalil\primeStone\docrecog\templates',
                         temp_name)
print(file_path, temp_path)

# NOTE(review): temp_path ends in 'pdf2inv.py', a file name rather than a
# folder -- confirm read_templates accepts that.
templates = read_templates(temp_path)
result = extract_data(file_path, templates=templates)

Exemplo n.º 28

0

Exibir arquivo

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates

# NOTE(review): the invoice path 'test.pdf' is also passed to read_templates
# as the template location -- presumably a template folder was intended;
# confirm.
templates = read_templates('test.pdf')
result = extract_data('test.pdf', templates=templates)

# st = "Total: 4.00 4,123.00"
# result = st.split()
# result.pop(1)
# print(" ".join(result))

Exemplo n.º 29

0

Exibir arquivo

Arquivo: main.py Projeto: jfitoussi/invoice2data

def extract_data(invoicefile, templates=None, input_module=pdftotext, with_extracted_str=False, **kwargs):
    """Extracts structured data from PDF/image invoices.

    This function uses the text extracted from a PDF file or image and
    pre-defined regex templates to find structured data.

    Parameters
    ----------
    invoicefile : str
        path of electronic invoice file in PDF,JPEG,PNG (example: "/home/duskybomb/pdf/invoice.pdf")
    templates : list of template objects, optional
        Loaded with ``read_templates()`` when omitted.
    input_module : module, optional
        Object exposing ``to_text(path, **kwargs)`` whose result is decoded
        as UTF-8. Defaults to ``pdftotext``.
    with_extracted_str : bool, optional
        When true, the extracted text is returned alongside the data.
    **kwargs
        Forwarded unchanged to ``input_module.to_text``.

    Returns
    -------
    data or False, or a ``(data_or_False, extracted_str)`` tuple
        The value produced by ``data_from_template`` when it is truthy,
        otherwise ``False``. The tuple form is used when
        ``with_extracted_str`` is true.

    Examples
    --------
    When using ``invoice2data`` as a library

    >>> from invoice2data.input import pdftotext
    >>> extract_data("invoice2data/test/pdfs/oyo.pdf", None, pdftotext)
    {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087',
     'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}

    """
    if templates is None:
        templates = read_templates()

    extracted_str = input_module.to_text(invoicefile, **kwargs).decode('utf-8')
    # Replace U+00A0 and U+00C2 characters with plain spaces. The text is
    # already decoded, so these replace() calls cannot raise and need no
    # try/except around them.
    extracted_str = extracted_str.replace(u'\xa0', u' ').replace(u"\xc2", " ")

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))
    data = data_from_template(templates, extracted_str)
    # NOTE(review): data_from_template is called outside this try, so only
    # the truth test of `data` is guarded against ValueError -- confirm that
    # is the intent.
    try:
        if data:
            if with_extracted_str:
                return data, extracted_str
            return data
    except ValueError:
        logger.warning("Exception in parsing", exc_info=True)

    logger.warning('No template for %s', invoicefile)
    if with_extracted_str:
        return False, extracted_str
    return False