예제 #1
0
    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                 imagewriter=None, stripcontrol=False):
        """Set up converter state and load the model and tokenizer from disk.

        ``rsrcmgr``, ``outfp``, ``codec``, ``pageno`` and ``laparams`` are
        forwarded unchanged to ``PDFConverter.__init__``; ``imagewriter`` and
        ``stripcontrol`` are stored on the instance.

        The model definition is read from ``data/model.json``, its weights
        from ``data/model.h5`` and the tokenizer from
        ``data/tokenizer.pickle``. All three are relative paths.
        """
        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
                              laparams=laparams)
        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol

        # Per-document accumulators, all starting empty.
        self.textboxes = []
        self.page_width = []
        self.page_height = []
        self.classified = []
        self.classified_header = []
        self.classified_paragraph = []
        self.classified_section = []
        self.classified_subsection = []

        # Tree with a single root node.
        self.tree = Tree()
        self.tree.create_node("Documents", 'documents')
        self.num_tabs = 0
        self.write_header()

        # Assigned after write_header(), in the same order as before.
        self.headerExist = False
        self.in_li = False

        # A context manager closes the file even when read() raises; the
        # previous open()/close() pair leaked the handle in that case.
        with open('data/model.json', 'r') as json_file:
            loaded_model_json = json_file.read()
        self.model = model_from_json(loaded_model_json)
        self.model.load_weights("data/model.h5")

        # NOTE(review): pickle.load can execute arbitrary code; this file
        # must come from a trusted source.
        with open('data/tokenizer.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)
예제 #2
0
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
              showpageno=False, imagewriter=None):
     """Initialise the base converter and this instance's own state.

     ``rsrcmgr``, ``outfp``, ``codec``, ``pageno`` and ``laparams`` are
     forwarded to ``PDFConverter.__init__``; ``showpageno`` and
     ``imagewriter`` are kept as attributes of the same name.
     """
     PDFConverter.__init__(
         self, rsrcmgr, outfp,
         codec=codec, pageno=pageno, laparams=laparams,
     )
     # Running total, starting from zero.
     self.current_total_height = 0
     self.imagewriter = imagewriter
     self.showpageno = showpageno
예제 #3
0
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
              showpageno=False, imagewriter=None):
     """Initialise the base converter and create the blotter processor.

     ``rsrcmgr``, ``outfp``, ``codec``, ``pageno`` and ``laparams`` are
     forwarded to ``PDFConverter.__init__``. A ``BlotterProcessor`` is
     built around ``outfp`` and the result of its ``processDocument()``
     call is kept in ``self.coro``.
     """
     PDFConverter.__init__(
         self, rsrcmgr, outfp,
         codec=codec, pageno=pageno, laparams=laparams,
     )
     self.showpageno = showpageno
     self.imagewriter = imagewriter

     processor = BlotterProcessor(outfp)
     self.blotterProcessor = processor
     self.coro = processor.processDocument()
예제 #4
0
파일: pdfs.py 프로젝트: joshgagnon/nzlawapi
 def __init__(self, rsrcmgr, doc, codec='utf-8', pageno=1, laparams=None,
              imagewriter=None):
     """Initialise the base converter without an output file and keep ``doc``.

     ``None`` is passed to ``PDFConverter.__init__`` as its second
     positional argument; ``codec``, ``pageno`` and ``laparams`` are
     forwarded by keyword. ``doc``, ``laparams`` and ``imagewriter`` are
     stored on the instance and ``sizes`` starts as an empty list.
     """
     PDFConverter.__init__(
         self, rsrcmgr, None,
         codec=codec, pageno=pageno, laparams=laparams,
     )
     self.doc = doc
     self.laparams = laparams
     self.imagewriter = imagewriter
     self.sizes = []
예제 #5
0
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
              imagewriter=None, stripcontrol=False, document=None):
     """Initialise the base converter, store the options, write the header.

     ``rsrcmgr``, ``outfp``, ``codec``, ``pageno`` and ``laparams`` are
     forwarded to ``PDFConverter.__init__``. ``rsrcmgr``, ``document``,
     ``imagewriter`` and ``stripcontrol`` are stored on the instance
     before ``write_header()`` is called.
     """
     PDFConverter.__init__(
         self, rsrcmgr, outfp,
         codec=codec, pageno=pageno, laparams=laparams,
     )
     self.rsrcmgr = rsrcmgr
     self.document = document
     self.imagewriter = imagewriter
     self.stripcontrol = stripcontrol
     # Called last so every attribute above is already in place.
     self.write_header()
예제 #6
0
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
              imagewriter=None, stripcontrol=False):
     """Initialise the base converter, reset collectors, write the header.

     ``rsrcmgr``, ``outfp``, ``codec``, ``pageno`` and ``laparams`` are
     forwarded to ``PDFConverter.__init__``. ``imagewriter`` and
     ``stripcontrol`` are stored, three empty lists are created, and
     ``write_header()`` is called last.
     """
     PDFConverter.__init__(
         self, rsrcmgr, outfp,
         codec=codec, pageno=pageno, laparams=laparams,
     )
     self.stripcontrol = stripcontrol
     self.imagewriter = imagewriter

     # Collectors, all starting empty.
     self.page_height = []
     self.page_width = []
     self.textboxes = []

     self.write_header()
예제 #7
0
 def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None,
              showpageno=False):
     """Initialise the base converter and remember ``showpageno``.

     ``rsrcmgr`` and ``outfp`` are passed positionally, ``pageno`` and
     ``laparams`` by keyword, to ``PDFConverter.__init__``. No codec is
     passed.
     """
     PDFConverter.__init__(
         self, rsrcmgr, outfp, pageno=pageno, laparams=laparams,
     )
     self.showpageno = showpageno
예제 #8
0
 def __init__(self, rsrcmgr, outfp):
     """Initialise the converter and write the CSV header row to ``outfp``.

     A fresh ``LAParams`` with ``char_margin`` set to 0.1 and the codec
     ``'utf-8'`` are passed to ``PDFConverter.__init__``. A ``csv.writer``
     using ``'\\n'`` as line terminator is created on ``outfp`` and the
     six-column header row is written at once.
     """
     layout = LAParams()
     layout.char_margin = 0.1
     PDFConverter.__init__(
         self, rsrcmgr, outfp, codec='utf-8', laparams=layout,
     )

     self.lines = []
     self.boxes = []

     # Column titles emitted as the first CSV row.
     header_row = [
         "企業・事業場名称",
         "所在地",
         "公表日",
         "違反法条",
         "事案概要",
         "その他参考事項",
     ]
     self.writer = csv.writer(outfp, lineterminator='\n')
     self.writer.writerow(header_row)
예제 #9
0
 def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None,
              showpageno=False, imagewriter=None):
     """Initialise the base converter and start with an empty text buffer.

     ``rsrcmgr`` and ``outfp`` are passed positionally, ``pageno`` and
     ``laparams`` by keyword, to ``PDFConverter.__init__``. ``showpageno``
     and ``imagewriter`` are stored and ``outtext`` starts as ``''``.
     """
     PDFConverter.__init__(
         self, rsrcmgr, outfp, pageno=pageno, laparams=laparams,
     )
     self.outtext = ''
     self.imagewriter = imagewriter
     self.showpageno = showpageno
예제 #10
0
 def __init__(self, rsrcmgr, recorder, codec='utf-8', pageno=1,
              laparams=None, imagewriter=None, pages='all'):
     """Initialise the base converter on ``sys.stderr`` and keep ``recorder``.

     ``PDFConverter.__init__`` always receives ``outfp=sys.stderr``;
     ``codec``, ``pageno`` and ``laparams`` are forwarded by keyword.
     ``recorder`` and ``pages`` are stored on the instance.
     """
     # NOTE(review): ``imagewriter`` is accepted but not stored or used in
     # this constructor - confirm whether that is intentional.
     PDFConverter.__init__(
         self, rsrcmgr,
         outfp=sys.stderr, codec=codec, pageno=pageno, laparams=laparams,
     )
     self.pages = pages
     # Custom class, defined elsewhere.
     self.recorder = recorder
예제 #11
0
    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
        """Initialise the base converter and reset layout, font and text state.

        All arguments are forwarded to ``PDFConverter.__init__``. The layout
        mode starts as ``'normal'``, the y offset as 50, the current font as
        ``None`` and every stack/list as empty.
        """
        PDFConverter.__init__(
            self, rsrcmgr, outfp,
            codec=codec, pageno=pageno, laparams=laparams,
        )

        # Layout defaults.
        self._yoffset = 50
        self.layoutmode = 'normal'

        # Font tracking.
        self._fontstack = []
        self._font = None

        # Position stack and collected texts.
        self._texts = []
        self._posstack = []
예제 #12
0
 def __init__(self, rsrcmgr, doc, codec='utf-8', pageno=1, laparams=None,
              imagewriter=None):
     """Initialise the base converter without an output file and keep ``doc``.

     ``None`` is passed to ``PDFConverter.__init__`` as its second
     positional argument; ``codec``, ``pageno`` and ``laparams`` are
     forwarded by keyword. ``doc``, ``laparams`` and ``imagewriter`` are
     stored on the instance and ``sizes`` starts as an empty list.
     """
     PDFConverter.__init__(
         self, rsrcmgr, None,
         codec=codec, pageno=pageno, laparams=laparams,
     )
     self.sizes = []
     self.doc = doc
     self.laparams = laparams
     self.imagewriter = imagewriter
예제 #13
0
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
              imagewriter=None, stripcontrol=False):
     """Initialise the base converter and create the root content node.

     ``rsrcmgr``, ``outfp``, ``codec``, ``pageno`` and ``laparams`` are
     forwarded to ``PDFConverter.__init__``. ``imagewriter`` and
     ``stripcontrol`` are stored, and ``root`` is a new ``ContentNode``
     created with ``type="pages"``.
     """
     PDFConverter.__init__(
         self, rsrcmgr, outfp,
         codec=codec, pageno=pageno, laparams=laparams,
     )
     self.stripcontrol = stripcontrol
     self.imagewriter = imagewriter
     self.root = ContentNode(type="pages")
예제 #14
0
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
     """Wrap ``outfp`` in an encoding writer, reset state, call ``open()``.

     ``outfp`` is wrapped with the ``codecs`` stream writer for
     ``locale.getpreferredencoding()`` before it is handed to
     ``PDFConverter.__init__`` together with ``codec``, ``pageno`` and
     ``laparams``. ``self.open()`` is called once all attributes are set.
     """
     encoded_out = codecs.getwriter(locale.getpreferredencoding())(outfp)
     PDFConverter.__init__(
         self, rsrcmgr, encoded_out,
         codec=codec, pageno=pageno, laparams=laparams,
     )

     self.document_root = {'tag': 'pages'}
     self.stack = []

     tag_names = (
         "char pages page textbox textline textbox page rect polygon line figure curve textgrouplrtb textgrouptbrl"
     ).split()
     # One empty list per tag name; repeated names collapse into one key.
     self.taglists = {tag: [] for tag in tag_names}

     self.hasrun = False
     attribute_names = 'tag index id bbox rotate pageid text size sizes orientation fontname fontnames fontstyles fonts bboxes length height width'
     self.interesting_attributes = attribute_names.split(' ')

     self.open()
예제 #15
0
파일: PDF.py 프로젝트: mcspx/rpaframework
 def __init__(self, rsrcmgr, codec="utf-8", pageno=1, laparams=None,
              imagewriter=None, stripcontrol=False):
     """Initialise the base converter on ``sys.stdout`` and write the header.

     ``PDFConverter.__init__`` receives ``sys.stdout`` as its second
     positional argument; ``codec``, ``pageno`` and ``laparams`` are
     forwarded by keyword. A new ``RpaPdfDocument`` is created, ``figure``
     and ``current_page`` start as ``None``, ``imagewriter`` and
     ``stripcontrol`` are stored, and ``write_header()`` is called last.
     """
     PDFConverter.__init__(
         self, rsrcmgr, sys.stdout,
         codec=codec, pageno=pageno, laparams=laparams,
     )
     self.rpa_pdf_document = RpaPdfDocument()

     # Nothing is being processed yet.
     self.current_page = None
     self.figure = None

     self.stripcontrol = stripcontrol
     self.imagewriter = imagewriter

     self.write_header()
예제 #16
0
 def __init__(self, rsrcmgr):
     """Initialise the base converter with fixed settings and no pages.

     ``PDFConverter.__init__`` is called with ``None`` as its second
     positional argument, ``codec='utf-8'``, ``pageno=1`` and
     ``laparams=None``. ``pages`` starts as an empty dict.
     """
     PDFConverter.__init__(
         self, rsrcmgr, None,
         codec='utf-8', pageno=1, laparams=None,
     )
     self.pages = {}
예제 #17
0
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
     """Initialise the base converter and start with no collected text lines.

     All arguments are forwarded to ``PDFConverter.__init__``.
     """
     PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
     # Starts empty.
     self.textlines = []