Exemplo n.º 1
0
def get_tables(fh):
    """
    Return a list of 'tables' from the given file handle, where a table is a
    list of rows, and a row is a list of strings.
    """
    result = []
    doc, interpreter, device = initialize_pdf_miner(fh)
    # Materialise the page list once: the original walked
    # PDFPage.create_pages(doc) twice (once just to count pages).
    pages = list(PDFPage.create_pages(doc))
    doc_length = len(pages)
    for i, pdf_page in enumerate(pages):
        if not page_contains_tables(pdf_page, interpreter, device):
            # No tables on this page; nothing to extract.
            continue

        # Process the page and receive its LTPage layout object.
        interpreter.process_page(pdf_page)
        processed_page = device.get_result()

        (table, _) = page_to_tables(
            processed_page,
            extend_y=False,
            hints=[],
            atomise=True)
        crop_table(table)
        # Page numbers reported to Table are 1-based.
        result.append(Table(table, i + 1, doc_length, 1, 1))

    return result
Exemplo n.º 2
0
def GetScript(filename):
    # Parse a PDF script: page 1 is skipped, every later page is broken into
    # positioned text blocks which are fed to ParseLine() top-to-bottom.
    # Results are accumulated in module-level state (see ResetGlobals).
    # NOTE(review): Python 2 only (print statements).
    global scriptName
    ResetGlobals()
    scriptName = filename
    password = ""
    # Open a PDF file.
    fp = open(filename, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser, password)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        print "---Not translatable---"
        return
        #raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)

    # Set parameters for analysis.
    laparams = LAParams()
    laparams.boxes_flow = 2
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pgnum,page in enumerate(PDFPage.create_pages(document)):
        # Skip the first page -- presumably a title page; TODO confirm.
        if pgnum == 0:
            continue
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        text = []
        # NOTE(review): the loop variable shadows `page`; it is really a
        # layout element. Elements without get_text() are silently skipped
        # by the bare except below.
        for page in layout:
            try:
                if page.get_text().strip():
                    text.append(TextBlock(page.x0,page.y1,page.get_text().strip()))
            except:
                temp=5
            print ".",
        # Sort blocks top-to-bottom (PDF y grows upward); assumes TextBlock
        # exposes a `y` attribute -- TODO confirm against TextBlock.
        text.sort(key = lambda row:(-row.y))
        # Parse all of the "line" objects in each page
        for line in text:
            ParseLine(line.text, line.x)
Exemplo n.º 3
0
def calculate_locations(filename,keywords):
    # Scan every page of `filename` for each keyword and return a list of
    # LocationKeeper objects recording (keyword, coordinates, page number).
    # NOTE(review): Python 2 only (print statements, file() builtin).
    locations = []
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
    #Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.create_pages(document)
    pagenum = 0
    # Second reader (PyPDF-style) used only for each page's trim-box origin;
    # note this opens the file a second time and never closes that handle.
    reader = PdfFileReader(file(filename,"rb"))
    for page in pages:
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        page = reader.getPage(pagenum)
        x = page.trimBox[0].as_numeric()
        y = page.trimBox[1].as_numeric()
        # Handling special case: drop a positive x offset when y is negative.
        if  (x > 0 and y < 0):
                x = 0
#         print "At page = %s  X  = %s , y = %s"%(pagenum,x,y)
        for keyword in keywords:
            print '********************************'
            co_ordinates = get_location(keyword,layout,x,y)
            print'Keyword %s , location %s'%(keyword,co_ordinates)
            print '********************************'
            if co_ordinates != None :
                for location in co_ordinates:
                    print "PageNum-->%s"%pagenum
                    l = LocationKeeper(keyword,location,pagenum)
                    locations.append(l)
        pagenum+=1
    return locations
Exemplo n.º 4
0
def convert(url, pages=None):
    """Download the PDF at *url* and return its extracted text as a string.

    pages -- optional list of page numbers to restrict extraction to
             (passed as pagenos to PDFPage.get_pages).
    NOTE(review): Python 2 only (basestring, urllib2, StringIO used for bytes).
    """
    assert isinstance(url, basestring)
    assert pages == None or isinstance(pages, list)

    rscmng = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rscmng, retstr, codec='utf-8', laparams=LAParams())
    # Fetch the whole PDF into memory so pdfminer can seek in it.
    web_page = urllib2.urlopen(urllib2.Request(url))
    fp = StringIO(web_page.read())
    interpreter = PDFPageInterpreter(rscmng, device)

    pdf_pages = PDFPage.get_pages(
        fp,
        set(pages if pages != None else []),
        maxpages=0,
        password='',
        caching=True,
        check_extractable=True
    )

    for page in pdf_pages:
        interpreter.process_page(page)

    # Grab the accumulated text before tearing everything down.
    result = retstr.getvalue()

    fp.close()
    web_page.close()
    device.close()
    retstr.close()

    return result
Exemplo n.º 5
0
def pdf_to_text(page_object):
    """Extract text from an open PDF file object.

    Returns a list of strings, one per text box/line found in the layout,
    in document order.
    """
    parser = PDFParser(page_object)
    # Create a PDF document object that stores the document structure
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.initialize('')
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF page aggregator object
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_content = []
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        # Only text containers carry extractable text.  (The original
        # shadowed the builtin `object` and copied each string through a
        # one-element `trial` list; both removed.)
        for element in layout:
            if isinstance(element, (LTTextBox, LTTextLine)):
                text_content.append(element.get_text())
    return text_content
Exemplo n.º 6
0
def get_pdf_text(path):
    """ Reads a pdf file and returns a dict of the text where the
        index represents the page number.
        http://stackoverflow.com/a/20905381
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # change to utf-8 if the text comes out garbled
    codec = 'ascii'
    #codec = 'utf-8'
    laparams = LAParams()
    pages = {}
    # `pages` is presumably filled by the converter as it sees page breaks
    # (showpageno=True) -- TODO confirm against this TextConverter subclass.
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams, showpageno=True, pages=pages)
    # BUG FIX: the py2-only builtin file() replaced with open().
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    retstr.close()
    return pages
Exemplo n.º 7
0
def dumppdf(fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Return an XML dump (as a string) of selected parts of the PDF *fname*.

    objids  -- object ids to dump individually.
    pagenos -- zero-based page numbers whose contents/attrs to dump.
    dumpall -- dump every object in the document.
    codec   -- stream decoding mode; 'raw'/'binary' suppress the trailing newline.
    With no selection at all, the trailer dictionaries are dumped.
    """
    # BUG FIX: the py2-only builtin file() replaced with open().
    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
    res = ""
    if objids:
        # Dump each explicitly requested object.
        for objid in objids:
            obj = doc.getobj(objid)
            res += dumpxml(obj, codec=codec)
    if pagenos:
        for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
            if pageno in pagenos:
                if codec:
                    # With a codec, dump the decoded content streams.
                    for obj in page.contents:
                        obj = stream_value(obj)
                        res += dumpxml(obj, codec=codec)
                else:
                    res += dumpxml(page.attrs)
    if dumpall:
        res += dumpallobjs(doc, codec=codec)
    if (not objids) and (not pagenos) and (not dumpall):
        # Nothing specific requested: dump the trailer dictionaries.
        res += dumptrailers(doc)
    fp.close()
    if codec not in ('raw', 'binary'):
        res += '\n'
    return res
Exemplo n.º 8
0
    def parse_pdf_pdfminer(self, f, fpath):
        """Extract text from the open PDF *f* page by page with pdfminer and
        feed each page's text to self.parse_page().

        fpath is used only for handler reporting.  KeyboardInterrupt and
        SystemExit propagate; any other error is reported via the handler.
        """
        try:
            laparams = LAParams()
            laparams.all_texts = True
            rsrcmgr = PDFResourceManager()
            pagenos = set()

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            page_num = 0
            for page in PDFPage.get_pages(f, pagenos, check_extractable=True):
                page_num += 1

                # A fresh buffer/converter per page keeps page texts separate.
                retstr = StringIO()
                device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                data = retstr.getvalue()
                # BUG FIX: close the per-page converter too; the original
                # closed only the buffer and leaked one device per page.
                device.close()
                retstr.close()

                self.parse_page(fpath, data, page_num)
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
Exemplo n.º 9
0
def read_fields(pdffile):
    """Collect one (name, default, pageno, rect, field_type) tuple for every
    AcroForm field in *pdffile* and return them as a list."""
    outfields = list()
    fp = open(pdffile, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    # Map page object ids to 1-based page numbers.
    id_to_page = dict()
    for pageno, page in enumerate(PDFPage.create_pages(doc), start=1):
        id_to_page[page.pageid] = pageno
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    for raw_field in fields:
        field = resolve1(raw_field)
        name = field.get('T')
        value = field.get('V')
        rect = field.get('Rect')
        page = field.get('P')
        field_type = field.get('FT')
        logmessage("name is " + str(name) + " and FT is |" + str(field_type) + "|")
        # Fields without a page reference are attributed to page 1.
        pageno = id_to_page[page.objid] if page is not None else 1
        type_str = str(field_type)
        if type_str == '/Btn':
            # Checkbox/button: normalise the /Yes flag to a Yes/No default.
            default = "Yes" if value == '/Yes' else "No"
        elif type_str == '/Sig':
            # Signature fields default to the user's signature variable.
            default = '${ user.signature }'
        else:
            default = value if value is not None else word("something")
        outfields.append((name, default, pageno, rect, field_type))
    return outfields
Exemplo n.º 10
0
def convert_pdf_to_txt(path, output):
    """Extract the text of the PDF at *path*, write it to the file *output*
    and also return it as a string."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()

    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # BUG FIX: the py2-only builtin file() replaced with open().
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()

    # BUG FIX: write in text mode -- `text` is a str, and binary mode would
    # raise a TypeError on Python 3. A context manager guarantees the close.
    with open(output, 'w') as f:
        f.write(text)
    return text
Exemplo n.º 11
0
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Write an XML dump of selected parts of the PDF *fname* to *outfp*.

    objids  -- object ids to dump individually.
    pagenos -- zero-based page numbers whose contents/attrs to dump.
    dumpall -- dump every object in the document.
    With no selection at all, the trailer dictionaries are dumped.
    """
    # BUG FIX: the py2-only builtin file() replaced with open().
    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    doc.initialize(password)
    if objids:
        for objid in objids:
            obj = doc.getobj(objid)
            dumpxml(outfp, obj, codec=codec)
    if pagenos:
        for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
            if pageno in pagenos:
                if codec:
                    # With a codec, dump the decoded content streams.
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
    if dumpall:
        dumpallobjs(outfp, doc, codec=codec)
    if (not objids) and (not pagenos) and (not dumpall):
        dumptrailers(outfp, doc)
    fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
Exemplo n.º 12
0
    def convert_pdf_to_txt(self, path):
        """
        A very simple conversion function
        which returns text for parsing from PDF.

        path = The path to the file

        Returns the extracted text, or "" on any failure (logged).
        """
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = TextConverter(
                rsrcmgr, retstr, codec=codec, laparams=laparams)
            # BUG FIX: the py2-only builtin file() replaced with open().
            fp = open(path, 'rb')
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
            text = retstr.getvalue()
            fp.close()
            device.close()
            retstr.close()
            return text
        except Exception as e:
            # BUG FIX: the log call sat after `return` and was unreachable;
            # log the failure first, then return an empty string.
            self.logger.error(
                "Failed to PDF to text: " + str(e))
            return ""
Exemplo n.º 13
0
def pdf_to_txt(in_file):
	""" turn a PDF file to a TXT file (roughly processed)

	The output path is in_file with its last 3 characters replaced by 'txt'.
	NOTE(review): the output is opened in append mode once per text box, so
	re-running accumulates text; assumes in_file ends in a 3-char extension.
	"""
	# Open a PDF file.
	fp = open(in_file, 'rb')
	# Create a PDF parser object associated with the file object.
	parser = PDFParser(fp)
	# Create a PDF document object that stores the document structure.
	document = PDFDocument(parser)
	# Check if the document allows text extraction. If not, abort.
	if not document.is_extractable:
		raise PDFTextExtractionNotAllowed
	# Set parameters for analysis.
	laparams = LAParams()
	# Create a PDF resource manager object that stores shared resources.
	rsrcmgr = PDFResourceManager()
	# Create a PDF page aggregator object.
	device = PDFPageAggregator(rsrcmgr, laparams=laparams)
	# Create a PDF interpreter object.
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	for page in PDFPage.create_pages(document):
		interpreter.process_page(page)
		# Receive the LTPage object for the page.
		layout = device.get_result()
		for klass in layout:
			if isinstance(klass, LTTextBoxHorizontal):
				out_file = in_file[:-3] + 'txt'
				# NOTE(review): writes encoded bytes to a text-mode file --
				# this is Python 2 behaviour; on Python 3 it would raise.
				with open(out_file, 'a') as dst_file:
					text = klass.get_text().encode('utf-8')
					dst_file.write(text + '\n')
	return None
Exemplo n.º 14
0
def convert_pdf_to_txt(path):
	"""Return the full text of the PDF at *path*.

	Taken from Stack Overflow; see
	http://www.unixuser.org/~euske/python/pdfminer/programming.html and
	https://github.com/dpapathanasiou/pdfminer-layout-scanner/blob/master/layout_scanner.py
	"""
	rsrcmgr = PDFResourceManager()
	retstr = StringIO()
	codec = 'utf-8'
	laparams = LAParams()

	# BUG FIX: the py2-only builtin file() replaced with open().
	fp = open(path, 'rb')
	password = ""
	maxpages = 0
	caching = True
	pagenos = set()

	# Read text from pages
	device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
		interpreter.process_page(page)
	# BUG FIX: renamed the local `str` -- it shadowed the builtin.
	text = retstr.getvalue()

	fp.close()
	device.close()
	retstr.close()

	return text
Exemplo n.º 15
0
def get_layout(path):
	'''returns a list of every character in the document as well as its location

	Each list element is the LTPage layout object for one page.
	'''
	rsrcmgr = PDFResourceManager()
	laparams = LAParams()

	# BUG FIX: the py2-only builtin file() replaced with open().  Also
	# removed the StringIO buffer and codec the original created but never
	# used (the aggregator has no output stream).
	fp = open(path, 'rb')
	password = ""
	maxpages = 0
	caching = True
	pagenos = set()

	layout = []
	device = PDFPageAggregator(rsrcmgr, laparams=laparams)
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
		interpreter.process_page(page)
		# One LTPage object per page.
		layout.append(device.get_result())
	fp.close()
	device.close()

	return layout
Exemplo n.º 16
0
def pdf_read(pdf):
    """
    Use PDFMiner to extract text from pdf file.
    <PDFMiner even though more low-level but pretty good tool to read pdfs>

    Args:
        *pdf* (str) -- path to pdf file

    Returns:
        *text* (str) -- a text extracted from pdf

    """
    # initializing objects
    res_manager = PDFResourceManager()
    strio = StringIO()
    lps = LAParams()
    device = TextConverter(res_manager, strio, codec='utf-8', laparams=lps)
    interpreter = PDFPageInterpreter(res_manager, device)
    # BUG FIX: the py2-only builtin file() replaced with open()
    # ('rb' mode for reading a binary file).
    pdf_file = open(pdf, 'rb')
    for page in PDFPage.get_pages(pdf_file, maxpages=0, password='',
                                  caching=True, check_extractable=True):
        interpreter.process_page(page)
    # finishing up
    pdf_file.close()
    device.close()
    text = strio.getvalue()
    strio.close()
    return text
Exemplo n.º 17
0
    def __init__(self, file, password='', just_text=1, check_extractable=True, char_margin=1.0, line_margin=0.1, word_margin=0.1):
        """Parse *file* (an open file-like object) with pdfminer and append
        one processed-page result per page to this container.

        NOTE(review): self.append suggests the class subclasses list --
        confirm against the class header (outside this view).

        file              -- open binary file object for the PDF.
        password          -- document password, if any.
        just_text         -- truthy: call self._cleanup() after parsing.
        check_extractable -- skip parsing when the PDF forbids extraction.
        char/line/word_margin -- pdfminer layout-analysis tuning knobs.
        """
        self.parser = PDFParser(file)
        self.laparams = LAParams(char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)

        # The document/parser wiring differs between the pdfminer releases
        # used on Python 3 vs Python 2.
        if PYTHON_3:
            self.doc = PDFDocument()
            self.parser.set_document(self.doc)
            self.doc.set_parser(self.parser)
            self.doc.initialize(password)
        else:
            self.doc = PDFDocument(self.parser, password)

        if not check_extractable or self.doc.is_extractable:
            self.resmgr = PDFResourceManager()
            # Text output goes to a throwaway buffer; only per-page results
            # collected via append() are kept.
            self.device = TextConverter(self.resmgr, outfp=StringIO(), laparams=self.laparams)
            self.interpreter = PDFPageInterpreter(
               self.resmgr, self.device)

            # Page-iteration API also differs between pdfminer versions.
            if PYTHON_3:
                page_generator = self.doc.get_pages()
            else:
                page_generator = PDFPage.create_pages(self.doc)

            # NOTE(review): process_page() presumably returns the per-page
            # result appended here -- TODO confirm for this pdfminer fork.
            for page in page_generator:
                self.append(self.interpreter.process_page(page))
            self.metadata = self.doc.info
        if just_text:
            self._cleanup()
Exemplo n.º 18
0
def convert_pdf_to_txt(path):
	"""Return the text of (at most the first 120 pages of) the PDF at *path*."""
	rsrcmgr = PDFResourceManager()
	retstr = StringIO()
	codec = 'utf-8'
	laparams = LAParams()

	device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
	# BUG FIX: the py2-only builtin file() replaced with open().
	fp = open(path, 'rb')
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	password = ""
	maxpages = 120
	caching = True
	pagenos = set()

	for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
		interpreter.process_page(page)

	# BUG FIX: read the value *before* closing anything -- the original used
	# a bare except to re-read it when a close() failed -- and stop shadowing
	# the builtin `str`.
	text = retstr.getvalue()
	fp.close()
	device.close()
	retstr.close()

	return text
Exemplo n.º 19
0
def parsePDF(pdf_file):
    # Print the extracted text of the FIRST page of the PDF at path
    # `pdf_file`, then stop (note the `break` below). Returns nothing.
    # NOTE(review): Python 2 only (print statement, StringIO module).

    # Read the whole file into memory so pdfminer can seek in it.
    pdf_file = open(pdf_file, "r").read()

    # Cast to StringIO object
    from StringIO import StringIO

    memory_file = StringIO(pdf_file)

    # Create a PDF parser object associated with the StringIO object
    parser = PDFParser(memory_file)

    # Create a PDF document object that stores the document structure
    document = PDFDocument(parser)

    # Define parameters to the PDF device object
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    codec = "utf-8"

    # Create a PDF device object
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

    # Create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        data = retstr.getvalue()
        print data
        # Only the first page is processed.
        break
Exemplo n.º 20
0
    def run(path):
        # Parse the PDF at `path` into a Book of Page objects, one per PDF
        # page, each holding that page's extracted text. Timing is printed.
        # NOTE(review): Python 2 only (print statements, file() builtin).
        print "Calling parser :%s" % path

        t0 = time.clock()

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        fp = file(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        book = Book()
        i = 0
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching,
                                      check_extractable=True):
            page_tmp = Page()
            # The converter appends to one shared buffer; remember where
            # this page's text starts so it can be sliced out afterwards.
            # NOTE(review): the [begin_page:-1] slice drops the last
            # character of each page's text -- possibly intentional to trim
            # a trailing form feed; TODO confirm.
            begin_page = len(retstr.getvalue())
            interpreter.process_page(page)
            page_tmp.text = retstr.getvalue()[begin_page:-1]
            book.pages.append(page_tmp)
        fp.close()
        device.close()
        retstr.close()
        print "Parsing in:", time.clock() - t0
        return book
Exemplo n.º 21
0
def convert_pdf_to_txt(path):
    """
    Converts PDF to text using the pdfminer library

    path -- filesystem path of the PDF; returns the extracted text as a str.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = "utf-8"
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # BUG FIX: the py2-only builtin file() replaced with open().
    file_handle = open(path, "rb")
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    for page in PDFPage.get_pages(
        file_handle, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True
    ):
        interpreter.process_page(page)

    text = retstr.getvalue()

    file_handle.close()
    device.close()
    retstr.close()
    return text
Exemplo n.º 22
0
def pdf2xml(infile):
    '''
    Return a string of XML representation for given PDF file handle.
    Uses pdfminer to do the conversion and does some final post-processing.
    '''
    buf = StringIO()

    # Empirically determined layout tuning...
    params = LAParams()
    params.char_margin = 0.4

    # Standard pdfminer pipeline (see pdf2txt.py).
    manager = PDFResourceManager(caching=False)
    converter = XMLConverter(manager, buf, codec='utf-8', laparams=params)
    interp = PDFPageInterpreter(manager, converter)

    if page_api:
        # Newer pdfminer: iterate pages explicitly.
        for page in PDFPage.get_pages(infile, set()):
            interp.process_page(page)
    else:
        # Older pdfminer: single helper call.
        process_pdf(manager, converter, infile, set())

    infile.close()
    xml = buf.getvalue()
    # Post-processing: collapse the output onto one line.
    return xml.replace("\n", "")
    def Parse(self):
        # First check whether a cache file exists and whether it is newer
        # than the PDF; if so, load RawData from it instead of re-parsing.
        if not os.path.exists(parseCacheDir):
            os.makedirs(parseCacheDir)
        cacheFile = os.path.join(parseCacheDir, os.path.basename(self.pdfFileName) + '.cache')
        foundCache = (os.path.isfile(cacheFile) and \
                      os.path.getsize(cacheFile) > 0 and \
                      os.path.getmtime(cacheFile) > os.path.getmtime(self.pdfFileName))
        if (foundCache):
            fp = open(cacheFile, 'rb')
            self.RawData = pickle.load(fp)
            fp.close()
        else:
            fp = open(self.pdfFileName, 'rb')
            # Only the first page is parsed (maxpages=1).
            # NOTE(review): `laparams` is not defined in this method --
            # presumably a module-level default; TODO confirm.
            for page in PDFPage.get_pages(fp, None, maxpages=1):
                rsrcmgr = PDFResourceManager()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                layout = device.get_result()
                self.__readobj(layout._objs)
                # Flip y coordinates so they grow downward, using the page
                # height (layout.bbox[3]).
                for category in self.RawData.values():
                    self.__reverseYaxis(category, layout.bbox[3])
                # Write the freshly parsed data back to the cache.
                cacheFp = open(cacheFile, 'wb')
                pickle.dump(self.RawData, cacheFp)
                cacheFp.close()
            fp.close()

        # Post-processing over RawData regardless of cache hit.
        self.__calculateBoundary()
        self.__assignCharsAndLinesToCell()
        self.__processCells()
        return (self.effectiveFrom, self.__getResult())
Exemplo n.º 24
0
def extract_pdf(path, languages=None):
    """ Extract content from a PDF file. This will attempt to use PyPDF2
    to extract textual content first. If none is found, it'll send the file
    through OCR. """
    with open(path, 'rb') as fh:
        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource_manager, aggregator)
        document = PDFDocument(PDFParser(fh), '')
        result = {'pages': []}
        # Copy document metadata from the most recent info dictionary,
        # skipping 'pages' so it cannot clobber the page list below.
        if len(document.info):
            for key, value in document.info[-1].items():
                key = key.lower().strip()
                if key != 'pages':
                    result[key] = safe_text(value)

        if not document.is_extractable:
            log.warning("PDF not extractable: %s", path)
            return result

        # One converted-text entry per page, in document order.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            page_layout = aggregator.get_result()
            result['pages'].append(_convert_page(page_layout, languages))
        aggregator.close()
        return result
Exemplo n.º 25
0
    def extract_text(self):
        """Extract the text of self.local_file into self.text (a str)."""
        # BUG FIX: the py2-only builtin file() replaced with open(), and the
        # handle is now closed via a context manager instead of leaking.
        with open(self.local_file, 'rb') as fh:
            pdf_data = fh.read()
        pdf_stream = io.BytesIO(pdf_data)
        laparams = LAParams()
        resource_manager = PDFResourceManager(caching=True)
        codec = 'utf-8'
        output_stream = io.BytesIO()
        pagenos = set()

        device = TextConverter(
            resource_manager,
            output_stream,
            codec=codec,
            laparams=laparams,
        )

        interpreter = PDFPageInterpreter(
            resource_manager,
            device,
        )

        pages = PDFPage.get_pages(
            pdf_stream,
            pagenos,
            maxpages=0,
            caching=True,
            check_extractable=True,
        )

        for page in pages:
            interpreter.process_page(page)

        # The converter wrote utf-8 bytes; decode them for self.text.
        self.text = output_stream.getvalue().decode('utf8')
Exemplo n.º 26
0
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* and write it next to the input
    with a .txt extension."""
    temp = os.path.splitext(path)

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = "utf-8"
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # BUG FIX: the py2-only builtin file() replaced with open().
    fp = open(path, "rb")
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    for page in PDFPage.get_pages(
        fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True
    ):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()

    outputFile = temp[0] + ".txt"
    # BUG FIX: py2 `print x` statement replaced with the function form,
    # which behaves identically for a single argument on both Python 2 and 3.
    print(outputFile)

    ff = open(outputFile, "w")
    ff.write(text)
    ff.close()
Exemplo n.º 27
0
def extract_text_from_pdf(pdf_filename):
    """
    Function to extract the text from pdf documents using pdfminer

    Parameters:
    -----------
    pdf_filename -- string
        File name of the pdf document as string

    Returns:
    --------
    extracted_text -- string
        Text extracted from pdf as string
    """
    resource_manager = PDFResourceManager()
    return_string = StringIO()
    la_params = LAParams()
    device = TextConverter(resource_manager, return_string, codec='utf-8', laparams=la_params)
    # BUG FIX: the py2-only builtin file() replaced with open().
    fp = open(pdf_filename, 'rb')
    interpreter = PDFPageInterpreter(resource_manager, device)
    page_nos = set()

    for page in PDFPage.get_pages(fp, page_nos):
        interpreter.process_page(page)
    fp.close()

    device.close()
    extracted_text = return_string.getvalue()
    return_string.close()

    return extracted_text
Exemplo n.º 28
0
def pdf_from_url_to_txt(url, maxpages=0):
    """Download the PDF at *url* and return its extracted text.

    maxpages -- maximum pages to extract (0 means all).
    NOTE(review): Python 2 only (urllib2, StringIO used for byte content).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Open the url provided as an argument to the function and read the content
    f = urllib2.urlopen(urllib2.Request(url)).read()
    # Cast to StringIO object so pdfminer can seek in it
    fp = StringIO(f)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    # Grab the accumulated text before closing the output buffer.
    string = retstr.getvalue()
    retstr.close()
    return string
Exemplo n.º 29
0
def pdf_to_text(pdfname):
    """Return the text of the PDF at path *pdfname* (ascii codec).

    NOTE(review): Python 2 only (cStringIO, file() builtin).
    """
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    from cStringIO import StringIO

    # PDFMiner boilerplate
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    # codec = 'utf-8'
    codec = 'ascii'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Extract text
    fp = file(pdfname, 'rb')
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
    fp.close()

    # Get text from StringIO
    text = sio.getvalue()

    # Cleanup
    device.close()
    sio.close()

    return text
Exemplo n.º 30
0
 def fix_text(self, filename):
     """Extract and return the full text of the PDF at *filename*,
     using self.password (if set) and self.codec for decoding."""
     # Output buffer for the converter.
     pdfText = StringIO()
     fp = open(filename, 'rb')
     # Create a PDF parser object associated with the file object.
     parser = PDFParser(fp)
     # Create a PDF document object that stores the document structure.
     # Supply the password for initialization.
     if not self.password:
         document = PDFDocument(parser)
     else:
         document = PDFDocument(parser, self.password)
     # Check if the document allows text extraction. If not, abort.
     if not document.is_extractable:
         raise PDFTextExtractionNotAllowed
     # Create a PDF resource manager object that stores shared resources.
     rsrcmgr = PDFResourceManager()
     # Create a PDF device object.
     device = TextConverter(rsrcmgr, pdfText, codec=self.codec
             , laparams=LAParams(), imagewriter=None
             )
     # Create a PDF interpreter object.
     interpreter = PDFPageInterpreter(rsrcmgr, device)
     # Process each page contained in the document.
     for page in PDFPage.create_pages(document):
         interpreter.process_page(page)
     # NOTE(review): fp/device are not closed here; callers rely on GC.
     txt = pdfText.getvalue()
     return txt
Exemplo n.º 31
0
    #laparams = LAParams()
    laparams = None
    #password = ''
    #maxpages = 0
    manager = PDFResourceManager(caching=caching) 
    if case == 'txt' :
        output = io.StringIO()
        converter = TextConverter(manager, output, codec=codec, laparams=laparams)     
    if case == 'HTML' :
        output = io.BytesIO()
        converter = HTMLConverter(manager, output, codec=codec, laparams=laparams)
        
    interpreter = PDFPageInterpreter(manager, converter)   
    infile = open(fname, 'rb')

    for index,page in enumerate(PDFPage.get_pages(infile, pagenums,caching=caching, check_extractable=True)):
        interpreter.process_page(page)

    convertedPDF = output.getvalue()


    infile.close();
    converter.close();
    output.close()
    
    return convertedPDF

#%%
#file_path = 'Q:/DATA/AI/JelleBarkema/DP Mission Chief Project/Documents'
# Source directory holding the PDF files to convert (machine-specific path).
folder = 'd:/usr-profiles/chuang/Desktop/Dev/textmining/2_imf_docs/1_use_xmls/data/pdfs'
# Destination directory (relative to the working dir) for the extracted text.
dest_folder = 'data/txt'
Exemplo n.º 32
0
# Command-line driver: extract and print the text of the PDF given via --path.
# BUG FIX: the parser description was an unquoted bare expression
# (`description=Path of a document`) — a syntax error; it must be a string.
parser = argparse.ArgumentParser(description='Path of a document')
parser.add_argument('--path', dest='pathoffile')
args = parser.parse_args()
path = args.pathoffile

# Standard pdfminer pipeline: resource manager -> text converter -> interpreter.
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0   # 0 means "no page limit"
caching = True
pagenos = set()  # empty set means "all pages"
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                              caching=caching, check_extractable=True):
    interpreter.process_page(page)
text = retstr.getvalue()
print(text)
# Release the file handle and pdfminer resources.
fp.close()
device.close()
retstr.close()

'''
    DOWNLOADING STOPWORDS.
'''
nltk.download('stopwords')

'''
	GETTING RID OF UNWANTED PUNCTUATIONS.
'''
Exemplo n.º 33
0
def anotate_pdf(file_path, sht, query_dict):
    """Highlight every occurrence of the query strings in a PDF.

    Writes a copy of *file_path* named ``<name>_highlighted<ext>`` in the
    same directory, with one highlight annotation per match, then reports
    completion into cell B2 of *sht* (a spreadsheet sheet object).

    :param file_path: path of the source PDF.
    :param sht: sheet object used only for the completion message.
    :param query_dict: mapping of query string -> highlight color.
    """

    # preparing the output file name
    path = pathlib.Path(file_path).parent
    extension = pathlib.Path(file_path).suffix
    name = pathlib.Path(file_path).name[:-len(extension)]
    # NOTE(review): '\\' makes the output path Windows-only — consider
    # joining with pathlib instead.
    result_file = str(path) + '\\' + name + '_highlighted' + extension

    #=========================================================

    # create a parser object associated with the file object
    # NOTE(review): this file handle is never closed.
    parser = PDFParser(open(file_path, 'rb'))
    # create a PDFDocument object that stores the document structure
    doc = PDFDocument(parser)

    # Layout Analysis
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # create pdf layout - this is a list with the layout of every page
    layout = []
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout.append(device.get_result())

    # tooltip metadata attached to every created highlight
    m_meta = {"author": "AK", "contents": "HL text1"}

    outputStream = open(result_file, "wb")
    pdfInput = PdfFileReader(open(file_path, 'rb'), strict=True)
    pdfOutput = PdfFileWriter()

    npage = pdfInput.numPages
    for pgn in range(0, npage):
        for query in query_dict:
            # match coordinates for this query, indexed by page
            all_coor = []
            for page in layout:
                result = get_page_coordinates(page, query)
                all_coor.append(result)

            # NOTE(review): page_hl is (re)fetched once per query and only
            # inside this loop — if query_dict is empty, the addPage call
            # below raises NameError; confirm queries are always provided.
            page_hl = pdfInput.getPage(pgn)

            for item in all_coor[pgn]:
                # item holds the four bbox coordinates for one match
                highlight = create_highlight(item[0],
                                             item[1],
                                             item[2],
                                             item[3],
                                             m_meta,
                                             color=query_dict[query])
                highlight_ref = pdfOutput._addObject(highlight)

                # Append to the page's annotation array, creating it if absent.
                if "/Annots" in page_hl:
                    page_hl[NameObject("/Annots")].append(highlight_ref)
                else:
                    page_hl[NameObject("/Annots")] = ArrayObject(
                        [highlight_ref])

        pdfOutput.addPage(page_hl)

    # save HL to new file
    pdfOutput.write(outputStream)
    outputStream.close()
    sht.range('B2').value = f'File {name+extension} completed'
Exemplo n.º 34
0
    def convert(
        self,
        source_path: Optional[str] = None,
        trim: bool = True,
        pagenum: Optional[Union[int, str]] = None,
    ) -> None:
        """Parse source PDF into entities.

        These entities can be used for text searches or XML dumping for example. The
        conversion will be done automatically when using the dependent keywords
        directly.

        :param source_path: source PDF filepath
        :param trim: trim whitespace from the text is set to True (default)
        :param pagenum: Page number where search is performed on, defaults to `None`. (
            meaning all pages get converted)

        **Examples**

        **Robot Framework**

        .. code-block:: robotframework

            ***Settings***
            Library    RPA.PDF

            ***Tasks***
            Example Keyword
                Convert    /tmp/sample.pdf

        **Python**

        .. code-block:: python

            from RPA.PDF import PDF

            pdf = PDF()

            def example_keyword():
                pdf.convert("/tmp/sample.pdf")
        """
        # Make the requested document the active one (opens it if needed).
        self.ctx.switch_to_pdf(source_path)
        # Set of page numbers already converted — used to skip repeat work.
        converted_pages = self.active_pdf_document.has_converted_pages
        if pagenum is not None:
            pagenum = int(pagenum)
            if pagenum in converted_pages:
                return  # specific page already converted
        else:
            pages_count = self.active_pdf_document.reader.getNumPages()
            if len(converted_pages) >= pages_count:
                return  # all pages got converted already

        self.logger.debug(
            "Converting active PDF document page %s on: %s",
            pagenum if pagenum is not None else "<all>",
            self.active_pdf_document.path,
        )
        rsrcmgr = PDFResourceManager()
        # Lazily initialize the layout-analysis settings on first use.
        if not self.ctx.convert_settings:
            self.set_convert_settings()
        laparams = pdfminer.layout.LAParams(**self.ctx.convert_settings)
        device = Converter(
            self.active_pdf_document,
            rsrcmgr,
            laparams=laparams,
            trim=trim,
            logger=self.logger,
            # Also explicitly set by us when iterating pages for processing.
            pageno=pagenum if pagenum is not None else 1,
        )
        interpreter = pdfminer.pdfinterp.PDFPageInterpreter(rsrcmgr, device)

        # Look at all (nested) objects on each page.
        source_parser = PDFParser(self.active_pdf_document.fileobject)
        source_document = PDFDocument(source_parser)
        source_pages = PDFPage.create_pages(source_document)
        for idx, page in enumerate(source_pages, start=1):
            # Process relevant pages only if instructed like so.
            # (`pagenum` starts from 1 as well)
            if pagenum is None or idx == pagenum:
                if idx not in converted_pages:
                    # Skipping converted pages will leave this counter un-incremented,
                    # therefore we increment it explicitly.
                    device.pageno = idx
                    interpreter.process_page(page)
                    converted_pages.add(idx)

        device.close()
Exemplo n.º 35
0
                        #print(o.get_font())
                        for c in o._objs:
                            if isinstance(c, pdfminer.layout.LTChar):
                                #print(c,"fontname %s"%c.fontname)
                                break
            i += 1
        # if it's a container, recurse
        elif isinstance(obj, pdfminer.layout.LTFigure):
            parse_obj(obj._objs)
        else:
            pass


# Demo driver: dump layout/char information for every page of a sample PDF.
path = "/home/pavan/Downloads/Test2.pdf"
document = createPDFDoc(path)
device, interpreter = createDeviceInterpreter()
pages = PDFPage.create_pages(document)
# BUG FIX: generators have no `.next()` method in Python 3; use the
# builtin next() (works on Python 2.6+ as well).
interpreter.process_page(next(pages))
#layout = device.get_result()

# Walk every page, grab its aggregated layout and recurse into its objects.
for page in PDFPage.get_pages(open(path, 'rb'),
                              set(),
                              maxpages=0,
                              password="",
                              caching=True,
                              check_extractable=True):
    interpreter.process_page(page)
    layout = device.get_result()
    parse_obj(layout._objs)
#input('------------------------------------------------------')
Exemplo n.º 36
0
def extractHighlights2(filename, anno, verbose=True):
    '''Extract highlighted texts from a PDF

    Extract texts from PDF using pdftotext

    Args:
        filename: path of the PDF file to process.
        anno: annotation container exposing ``.hlpages`` (pages with
            highlights), ``.highlights`` (per-page annotation lists) and
            ``.meta`` (title/tags/author metadata).
        verbose: unused here; kept for interface compatibility.

    Returns:
        list of Anno objects, one per matched text box, numbered in
        extraction order via their ``num`` attribute.
    '''

    hlpages = anno.hlpages
    # Nothing highlighted anywhere — nothing to do.
    if len(hlpages) == 0:
        return []

    #--------------Get pdfmine instances--------------
    document, interpreter, device = init(filename)

    #----------------Loop through pages----------------
    hltexts = []

    for ii, page in enumerate(PDFPage.create_pages(document)):

        #------------Get highlights in page------------
        # hlpages holds 1-based page numbers; ii is 0-based.
        if len(hlpages) > 0 and ii + 1 in hlpages:

            annoii = anno.highlights[ii + 1]
            anno_total = len(annoii)
            anno_found = 0

            #------------Merge annos in single line------------
            annoii = mergeLine(annoii)

            #-----------Sort annotations vertically-----------
            annoii = sortAnnoY(annoii)

            # Only pages with highlights are laid out — saves work.
            interpreter.process_page(page)
            layout = device.get_result()
            page_height = layout.height

            #--------------Sort boxes diagnoally--------------
            objs = sortDiag(layout)

            #-----------------Refine ordering-----------------
            objs = fineTuneOrder(objs)

            #----------------Loop through boxes----------------
            for jj, objj in enumerate(objs):

                # Only text boxes can carry highlighted text.
                if type(objj)!=LTTextBox and\
                        type(objj)!=LTTextBoxHorizontal:
                    continue
                textjj, numjj = findStrFromBox2(annoii, objj, filename,
                                                page_height)

                if numjj > 0:
                    #--------------Attach text with meta--------------
                    authors = tools.getAuthorList(anno.meta)

                    textjj=Anno(textjj,\
                        ctime=getCtime(annoii),\
                        title=anno.meta['title'],\
                        page=ii+1,
                        citationkey=anno.meta['citationkey'],\
                        tags=anno.meta['tags'],
                        bbox=objj.bbox,
                        author=authors,
                        note_author=anno.meta['user_name'])

                    hltexts.append(textjj)

                #----------------Break if all found----------------
                anno_found += numjj
                if anno_total == anno_found:
                    break

    #----------------Number highlights----------------
    # Assign sequential 1-based numbers in extraction order.
    for ii, hlii in enumerate(hltexts):
        hlii.num = ii + 1

    return hltexts
Exemplo n.º 37
0
def main(argv):
    """pdfminer command-line entry point: convert PDFs to text/html/xml/tag.

    Returns 100 on usage error, None on success.
    """
    import getopt
    def usage():
        print(f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
               ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
               ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
               ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
               ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    # NOTE(review): option handling is order-dependent — '-n' sets
    # laparams to None, so a later '-A'/'-V'/'-M'/'-W'/'-L'/'-F' would
    # raise AttributeError (pre-existing upstream behavior).
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-P': password = v.encode('ascii')
        elif k == '-o': outfile = v
        elif k == '-t': outtype = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-c': encoding = v
        elif k == '-s': scale = float(v)
        elif k == '-R': rotation = int(v)
        elif k == '-Y': layoutmode = v
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-S': stripcontrol = True
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Infer the output type from the output file extension when not given.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout
    # Pick the converter device matching the requested output type.
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    # One shared device; a fresh interpreter per input file.
    for fname in args:
        with open(fname, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                page.rotate = (page.rotate+rotation) % 360
                interpreter.process_page(page)
    device.close()
    outfp.close()
    return
Exemplo n.º 38
0
def extract_text_from_pdf(pdf_path):
    """Extract text from an electricity-bill PDF and print parsed fields.

    Every field below is located by its literal label followed by a fixed
    number of characters, then cut down to the numeric/date value.
    NOTE(review): this assumes one specific bill layout; any missing label
    makes ``re.search`` return None and the following ``.group()`` raise
    AttributeError — confirm inputs always match the expected format.

    :param pdf_path: path of the bill PDF to read.
    """

    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)
        text = fake_file_handle.getvalue()
    converter.close()
    fake_file_handle.close()
    print(text)

    # Demand/fixed charge: value truncated to 2 decimal places.
    dem = re.search("Demand/fixed charge.{10}", text)
    dem = dem.group()
    dem = dem.replace("Demand/fixed charge", " ")
    dem = dem.split(".")
    dem = dem[0] + "." + dem[1][:2]
    print(dem)

    # Wheeling charges.
    wheel = re.search("Wheeling Charges.{10}", text)
    wheel = wheel.group()
    wheel = wheel.replace("Wheeling Charges", " ")
    wheel = wheel.split(".")
    wheel = wheel[0] + "." + wheel[1][:2]
    print(wheel)

    # Energy charge.
    ec = re.search("Energy charge.{10}", text)
    ec = ec.group()
    ec = ec.replace("Energy charge", " ")
    ec = ec.split(".")
    ec = ec[0] + "." + ec[1][:2]
    print(ec)

    # Government electricity duty: value follows a percentage, hence the
    # extra split on '%'.
    ge = re.search("Government Electricity Duty.{50}", text)
    ge = ge.group()
    ge = ge.replace("Government Electricity Duty", " ")
    ge = ge.split("%")
    ge = ge[1].split(".")
    ge = ge[0] + "." + ge[1][:2]
    print(ge)

    # State tax on sale of electricity: amount follows the word "unit".
    ma = re.search("Mah.Govt.Tax on sale of electricity.{50}", text)
    ma = ma.group()
    ma = ma.replace("Mah.Govt.Tax on sale of electricity", " ")
    ma = ma.split("unit")
    ma = ma[1].split(".")
    ma = ma[0] + "." + ma[1][:2]
    print(ma)

    # Current month's bill amount (label includes the "(A)" suffix).
    amt = re.search("Current month's bill amount.{50}", text)
    amt = amt.group()
    amt = amt.replace("Current month's bill amount(A)", " ")
    amt = amt.split(".")
    amt = amt[0] + "." + amt[1][:2]
    print(amt)

    # Digital payment discount.
    dp = re.search("Digital Payment Discount.{50}", text)
    dp = dp.group()
    dp = dp.replace("Digital Payment Discount", " ")
    dp = dp.split(".")
    dp = dp[0] + "." + dp[1][:2]
    print(dp)

    # Last payment date — formatted either d-m-Y or d.m.Y.
    pd = re.search("Payment received upto.{30}", text)
    pd = pd.group()
    pd = pd.replace("Payment received upto", " ")
    if "-" in pd:
        pd = pd.split("-")
        pd = pd[0] + "-" + pd[1] + "-" + pd[2][:4]
    else:
        pd = pd.split(".")
        pd = pd[0] + "." + pd[1] + "." + pd[2][:4]

    print(pd)

    # Last payment amount — same span with the date stripped out first.
    pr = re.search("Payment received upto.{30}", text)
    pr = pr.group()
    pr = pr.replace("Payment received upto", " ")
    pr = pr.replace(pd, " ")
    pr = pr.split(".")
    pr = pr[0] + "." + pr[1][:2]
    print(pr)

    # Current meter reading date (first date after the label).
    md = re.search("Meter reading date.{100}", text)
    md = md.group().split("-")
    md = md[0][-2:] + "-" + md[1] + "-" + md[2][:4]
    print(md)

    # Previous meter reading date (second date in the same span).
    pmd = re.search("Meter reading date.{100}", text)
    pmd = pmd.group().split("-")
    pmd = pmd[2][-2:] + "-" + pmd[3] + "-" + pmd[4][:4]
    print(pmd)

    # Security deposit held.
    sd = re.search("Your security deposit.{50}", text)
    sd = sd.group()
    sd = sd.replace("Your security deposit (SD) with us", " ")
    sd = sd.split(".")
    sd = sd[0] + "." + sd[1][:2]
    print(sd)

    # Total bill amount including delayed payment charges.
    dpc = re.search("Total bill amount with DPC.{30}", text)
    dpc = dpc.group()
    dpc = dpc.replace("Total bill amount with DPC", " ")
    dpc = dpc.split(".")
    dpc = dpc[0] + "." + dpc[1][:2]
    print(dpc)

    # Contract demand (kept to 5 decimals here, unlike the others).
    CoD = re.search("Contract Demand.{20}", text)
    CoD = CoD.group()
    CoD = CoD.replace("Contract Demand", " ").split(".")
    CoD = CoD[0] + "." + CoD[1][:5]
    print(CoD)

    # Power-factor penalty/incentive.
    PF = re.search("Power Factor.{40}", text)
    PF = PF.group().replace("Power Factor (PF) penalty/incentive",
                            " ").split(".")
    PF = PF[0] + "." + PF[1][:2]
    print(PF)

    # Meter serial number.
    number = re.search("Meter No..{7}", text)
    number = number.group().replace("Meter No.", " ")
    print(number)

    # Multiplying factor (single character after the label).
    mf = re.search("Multiplying Factor.{1}", text)
    mf = mf.group().replace("Multiplying Factor", " ")
    print(mf)

    # Present meter reading.
    redpr = re.search("Energy consumptionReadingPresent.{50}", text)
    redprk = redpr.group().replace("Energy consumptionReadingPresent",
                                   " ").split(".")
    z = redprk
    z = z[0] + "." + z[1][:2]
    print(z)

    # Previous meter reading — same span with the present reading removed.
    redprv = re.search("Energy consumptionReadingPresent.{50}", text)
    redprv = redprv.group().replace("Energy consumptionReadingPresent", " ")
    redprv = redprv.replace(z, " ")
    redprv = redprv.replace("Previous", " ").split(".")
    y = redprv
    y = y[0] + "." + y[1][:2]
    print(y)

    # Total consumption (kWh): value sits between "(kWh)" and "TOD";
    # the second half of the span is the actual number.
    total = re.search(" Factor1Energy consumption.{100}", text)
    total = total.group().split("TOD")
    total = total[0].split("(kWh)")
    total = total[1]
    n = len(total)
    m = int(n / 2)
    total = total[-m:]
    #total = "".join(total).split("(kWh)")
    print(total)
Exemplo n.º 39
0
def main(argv):
    """Legacy pdfminer command-line entry point (text/html/xml/tag output).

    Returns 100 on usage error, None on success.
    """
    import getopt
    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Infer the output type from the output file extension when not given.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        # BUG FIX: the Python-2-only `file()` builtin was used here (and
        # for the inputs below); `open()` is equivalent and works on
        # Python 2 and 3.
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    # Pick the converter device matching the requested output type.
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    # One shared device; a fresh interpreter per input file.
    for fname in args:
        fp = open(fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos,
                                      maxpages=maxpages, password=password,
                                      caching=caching, check_extractable=True):
            interpreter.process_page(page)
        fp.close()
    device.close()
    outfp.close()
    return
Exemplo n.º 40
0
 def processor_pdfminersix(self, pathfile):
     '''Open *pathfile* and return its pages as a list of PDFPage objects.

     The open handle is stored on ``self._tmp_file`` (pages may lazily
     read from it), so it is intentionally not closed here.
     '''
     handle = open(str(pathfile), 'rb')
     self._tmp_file = handle
     return list(PDFPage.get_pages(handle))
Exemplo n.º 41
0


# Process the report files in reverse (oldest-first) order.
listadoarchivos = listadoarchivos[::-1]


dfrow_list=[] # temporary holding of the rows before adding them to the DataFrame
csv_file = "provincias.csv"

for pdf_file in listadoarchivos:
    # Fresh pdfminer pipeline per file: aggregator collects page layouts.
    fp = open(pdf_file, 'rb')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp)

    # The report date is encoded in the filename, split on '_' or '-'.
    fecha = re.split('_|-',pdf_file)
    # "matutino" (morning) reports carry the previous day's data.
    # NOTE(review): plain day-number arithmetic — day "01" would become
    # "0"; confirm filenames never fall on the first of the month.
    if "matutino" in pdf_file:
        fecha[0] = str(int(fecha[0])-1)
        if len(fecha[0]) == 1:
            fecha[0] = "0" + str(fecha[0])
    fecha = fecha[0] + "/" + fecha[1] + "/" + fecha[2] 

    for page in pages:
        interpreter.process_page(page)
        layout = device.get_result()
        # Walk the page layout and split each text box into lines.
        for lobj in layout:
            if isinstance(lobj, LTTextBox):
                paragraph = lobj.get_text()
                paragraph = paragraph.split("\n")
Exemplo n.º 42
0
def print_and_write(txt):
    """Emit *txt* to stdout.

    BUG FIX: the original closed the module-level ``outputfile`` on every
    call (while the actual write was commented out), so the first call
    closed the handle and any later file use would fail. The stray close
    is removed; file output stays disabled as before.
    """
    print(txt)
    # TODO(review): re-enable file output once outputfile handling is fixed:
    # outputfile.write(txt)
    # output_txt.write('\n')

with open(sys.argv[1], 'rb') as f:
    # PDFPage.get_pages()にファイルオブジェクトを指定して、PDFPageオブジェクトを順に取得する。
    # 時間がかかるファイルは、キーワード引数pagenosで処理するページ番号(0始まり)のリストを指定するとよい。
    rank = False
    lanes = []
    thisHeat = ""
    number = 1
    semi = False
    seminumber = 1
    for page in PDFPage.get_pages(f):
        # print_and_write('\n====== ページ区切り ======\n')
        interpreter.process_page(page)  # ページを処理する。
        layout = device.get_result()  # LTPageオブジェクトを取得。

        # ページ内のテキストボックスのリストを取得する。
        boxes = find_textboxes_recursively(layout)

        # テキストボックスの左上の座標の順でテキストボックスをソートする。
        # y1(Y座標の値)は上に行くほど大きくなるので、正負を反転させている。
        boxes.sort(key=lambda b: (-b.y1, b.x0))
        
        for box in boxes:
            text = box.get_text()
            if prefix != "":
                if "Heat" in text and "of" in text:
Exemplo n.º 43
0
def get_text_from_pdf(pdfname, caption, skip_header, skip_footer):
    """Extract text from *pdfname* and reflow it into paragraphs.

    :param pdfname: path of the PDF to read.
    :param caption: word-count threshold under which a numbered line is
        treated as a heading and kept on its own line.
    :param skip_header: drop the first line of every page when True.
    :param skip_footer: drop the last line of every page when True.
    :returns: the reflowed text as one string.
    """
    texts = []

    # PERF: the resource manager and layout parameters are page-invariant,
    # so build them once instead of once per page (the original re-created
    # them inside the loop). A fresh StringIO/TextConverter pair is still
    # needed per page so each page's text is collected separately.
    rsrcmgr = PDFResourceManager()
    la_params = LAParams()
    la_params.detect_vertical = True

    # `with` guarantees the input handle is closed even on error.
    with open(pdfname, 'rb') as fp:
        for page in tqdm(
                PDFPage.get_pages(fp,
                                  pagenos=None,
                                  maxpages=0,
                                  password=None,
                                  caching=True,
                                  check_extractable=True)):
            out_fp = StringIO()
            device = TextConverter(rsrcmgr,
                                   out_fp,
                                   codec='utf-8',
                                   laparams=la_params)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)
            texts.append(out_fp.getvalue())
            device.close()
            out_fp.close()

    output = ""

    # Reflow each page's raw text.
    for text in tqdm(texts):
        lines = text.splitlines()
        replace_strs = [b'\x00']  # utf-8 byte sequences to strip out
        new_lines = []
        for line in lines:
            line_utf8 = line.encode('utf-8')
            for replace_str in replace_strs:
                line_utf8 = line_utf8.replace(replace_str, b'')
            line = line_utf8.decode()
            line = re.sub("[ ]+", " ", line)  # collapse runs of spaces
            line = line.strip()
            if len(line) == 0:
                continue  # skip blank lines
            if is_float(line):
                continue  # skip number-only lines (page numbers etc.)
            new_lines.append(line)

        for index in range(len(new_lines)):
            if index == 0 and skip_header:
                continue
            if index == len(new_lines) - 1 and skip_footer:
                continue
            line = new_lines[index]
            # Short numbered lines ("1.2 Title") are headings: own line.
            if is_float(line.split(".")[0]) and len(
                    line.split()) < caption and (not line.endswith(".")):
                output += str(line)
                output += "\r\n"
                continue

            if line.endswith("."):
                # sentence end: keep the line break
                output += str(line)
                output += "\r\n"
            elif line.endswith("-"):
                # hyphenated continuation: join with the next line
                output += str(line[:-1])
            elif line.endswith(":"):
                # a formula/list follows: keep the line break
                output += str(line)
                output += "\r\n"
            else:
                # otherwise it is a word break inside a sentence
                output += str(line)
                output += " "

    return output
Exemplo n.º 44
0
    def pdf2txt(self):
        '''Convert ``self.input_path`` (a PDF) into a UTF-8 text file.

        If ``self.output_path`` is not set, ``<input>_trans.txt`` next to
        the input file is used and stored back on the instance.

        return : str, text File path
        '''

        # input options
        password = ''
        pagenos = set()   # empty -> all pages
        maxpages = 0      # 0 -> no page limit

        # output options
        imagewriter = None
        rotation = 0
        codec = 'UTF-8'
        pageno = 1
        scale = 1
        caching = True
        showpageno = True
        laparams = LAParams()

        infp = open(self.input_path, "rb")

        # Default the output path next to the input file.
        # (Fixes: `is None` instead of `== None`; both branches opened the
        # file identically, so the open() is hoisted out of the if/else.)
        if self.output_path is None:
            self.output_path = self.input_path[:-4] + '_trans.txt'
        outfp = open(self.output_path, "w", encoding='UTF8')

        # total page count, used to size the progress bar
        parser = PDFParser(infp)
        document = PDFDocument(parser)
        page_total_num = resolve1(document.catalog['Pages'])['Count']

        rsrcmgr = PDFResourceManager(caching=caching)

        # pdf -> text converter
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)

        # pdf -> text interpreter
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # pdf -> text start
        with tqdm(total=page_total_num) as pbar:
            for page in PDFPage.get_pages(infp,
                                          pagenos,
                                          maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):

                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)

                pbar.update(1)

        print('[INFO] pdf -> text')

        # Release pdfminer resources and both file handles
        # (the original never closed the device).
        device.close()
        outfp.close()
        infp.close()

        return self.output_path
Exemplo n.º 45
0
# Create a PDF document object that stores the document structure.
# (`parser` is the PDFParser instance created earlier in this script.)
document = PDFDocument(parser)


# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()

# Create a PDF device object.
# NOTE: PDFDevice is a no-op base device — this first pass only walks the
# pages without producing output.
device = PDFDevice(rsrcmgr)

# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Process each page contained in the document.
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)


# Second pass: repeat with a page aggregator to get the layout objects.
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator

# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
    # receive the LTPage object for the page.
    layout = device.get_result()
Exemplo n.º 46
0
def parse():
    """Parse the PDF at module-level `path` into rows and write them to Excel.

    Characters are grouped into strings with their bounding boxes, strings
    are bucketed into rows by y-coordinate, the last row and the first rows
    are re-aligned against neighbouring rows, and the result is written
    cell-by-cell via openpyxl.

    NOTE(review): depends on module-level `path` and on helpers defined
    elsewhere in this file (is_not_in, insert_into_page_rows,
    get_page_rows_loc, insert_into_page_container, align_row) — confirm they
    are in scope.
    """
    fp = open(path, 'rb')  # open in binary read mode
    # build a PDF parser from the file object
    praser = PDFParser(fp)
    # create a PDF document
    doc = PDFDocument(praser)
    # connect the parser with the document object
    praser.set_document(doc)
    # create a PDF resource manager for shared resources
    rsrcmgr = PDFResourceManager()
    # create a PDF device (layout aggregator) object
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # iterate over the pages, handling one page at a time

    wb = Workbook()  # new Excel workbook
    ws = wb.active

    # running count of rows already written (keeps pages contiguous)
    text_number = 0

    for page in PDFPage.create_pages(doc):  # get the list of pages
        interpreter.process_page(page)
        # receive the LTPage object for this page
        layout = device.get_result()
        # `layout` is an LTPage holding the objects parsed from this page —
        # typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, etc.;
        # text is obtained from an object's text attribute.
        # get the boxes
        page_container = []  # all string dicts of this page, grouped by row
        page_rows = []  # y-positions of the rows seen so far
        for text_box in layout:
            if (isinstance(text_box, LTTextBox)):
                # get the lines
                for text_line in text_box:
                    if (isinstance(text_line, LTTextLine)):
                        # walk the individual characters
                        temp = []  # characters collected so far
                        temp_loc = []  # bbox of the string being built
                        isfirst = True  # first character of the string?
                        for text_index in text_line:
                            # character datum: keep extending temp / temp_loc
                            if (isinstance(text_index, LTChar)):
                                temp.append(text_index.get_text())
                                if isfirst == True:
                                    temp_loc.append(
                                        round(text_index.bbox[0], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[1], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[2], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[3], 3))
                                    isfirst = False
                                temp_loc[2] = round(text_index.bbox[2], 3)
                                temp_loc[3] = round(text_index.bbox[3], 3)
                            # a non-char LTText ends the current string: store
                            # it into page_container, then reset temp /
                            # temp_loc / isfirst
                            elif (isinstance(text_index, LTText)):
                                # row not seen yet: insert a new row entry
                                # into page_container and page_rows
                                # if temp_loc[1] not in page_rows:
                                if is_not_in(page_rows, temp_loc[1]):
                                    insert_loc = insert_into_page_rows(
                                        page_rows, temp_loc[1])
                                    page_container.insert(
                                        insert_loc, [{
                                            'value': ''.join(temp),
                                            'location': temp_loc
                                        }])
                                    # page_rows.append(temp_loc[1])
                                    # page_container.append([{'value':''.join(temp),'location':temp_loc}])
                                # the row already exists: append into it
                                elif not is_not_in(page_rows, temp_loc[1]):
                                    # loc = page_rows.index(temp_loc[1])
                                    loc = get_page_rows_loc(
                                        page_rows, temp_loc[1])
                                    temp_list = insert_into_page_container(
                                        page_container[loc], {
                                            'value': ''.join(temp),
                                            'location': temp_loc
                                        })
                                    page_container[loc] = temp_list[:]
                                temp = []
                                temp_loc = []
                                isfirst = True
        rows_num = len(page_container)

        # re-align the last row against the second-to-last row
        if len(page_container[rows_num - 1]) != len(
                page_container[rows_num - 2]):
            loc_for_no2 = []
            loc_for_no1 = []
            adjust_for_no1 = []
            temp_array = page_container[rows_num - 1][:]
            for i in page_container[rows_num - 2]:
                loc_for_no2.append([i['location'][0], i['location'][2]])
            for i in page_container[rows_num - 1]:
                loc_for_no1.append([i['location'][0], i['location'][2]])
            # map each last-row cell to the overlapping column of the row above
            for i in range(len(loc_for_no1)):
                for j in range(len(loc_for_no2)):
                    if not (loc_for_no1[i][0] > loc_for_no2[j][1]
                            or loc_for_no1[i][1] < loc_for_no2[j][0]):
                        adjust_for_no1.append(j)
                        break

            page_container[rows_num - 1] = []
            for i in range(len(page_container[rows_num - 2])):
                if i in adjust_for_no1:
                    page_container[rows_num - 1].append(
                        temp_array[adjust_for_no1.index(i)])
                else:
                    page_container[rows_num - 1].append(None)

        # re-align the first five rows
        if len(page_container[0]) != len(page_container[1]) or len(
                page_container[1]) != len(page_container[2]) or len(
                    page_container[2]) != len(page_container[3]) or len(
                        page_container[3]) != len(page_container[4]):
            rows_length = []
            the_max_row = []
            new_max_row = []
            for i in range(6):
                rows_length.append(len(page_container[i]))
            max_length = max(rows_length)
            the_max_row = page_container[rows_length.index(max_length)][:]
            for i in range(len(rows_length)):
                if rows_length[i] < max_length:
                    page_container[i] = align_row(the_max_row,
                                                  page_container[i])
        # detect the table header

        # debug output for verification
        for i in range(len(page_container)):
            for j in range(len(page_container[i])):
                print(page_container[i][j])
        # print(page_container)
        # print(page_rows)

        # write this page's data to the worksheet
        for i in range(len(page_container)):
            for j in range(len(page_container[i])):
                cell_index = ws.cell(row=i + 1 + text_number, column=j + 1)
                if page_container[i][j] == None:
                    cell_index.value = ' '
                else:
                    cell_index.value = page_container[i][j]['value']

        # advance text_number so pages stay contiguous in the sheet
        text_number += rows_num

    wb.save(r'C:\Users\15644\Desktop\pdf_file\test_pdf_list\test_1.xlsx')
Exemplo n.º 47
0
def scientific_analysis(password, path, title, topn):
    """Convert a PDF paper to text, split it into sections, and upload it.

    :param password: API password appended to the upload URL
    :param path: path of the PDF file to analyse
    :param title: paper title forwarded to the service
    :param topn: forwarded to the service as-is
    :return: dict parsed from the service response, or {'error': status_code}
    """
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from io import StringIO

    print('Converting pdf to text ...')  # fixed typo ('Convering')
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password_pdf = ""
    maxpages = 0
    caching = True
    pagenos = set()

    # `with` guarantees the file is closed even if pdfminer raises.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password_pdf,
                                      caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        text = retstr.getvalue()
    device.close()
    retstr.close()

    # De-hyphenate line breaks and normalise curly quotes.
    # NOTE(review): the last .replace() is a no-op as written — it probably
    # targeted the 'fi' ligature originally; confirm against the source data.
    text = text.replace('-\n', '').replace('’', "'").replace('infl', 'infl')
    lines = text.split('\n')

    # A "section heading" is a line surrounded by blank lines, longer than
    # 3 chars and not a bare page number.
    lines_section_ids_dict = {}
    lines_section_ids = []
    # BUG FIX: the original enumerated lines[1:-2] starting at 0, so lines[i]
    # was NOT the current line and lines[i - 1] wrapped to lines[-1] at i == 0.
    # start=1 realigns i with the slice so that lines[i] == line.
    for i, line in enumerate(lines[1:-2], start=1):
        if len(lines[i - 1]) == 0 and len(lines[i + 1]) == 0 and len(
                lines[i]) > 3 and not str(lines[i]).isdigit():
            lines_section_ids_dict[i] = lines[i]
            lines_section_ids.append(i)

    # Map heading text -> line index (used only to locate 'References').
    data = {heading: idx for idx, heading in lines_section_ids_dict.items()}

    final_data = {}
    new_txt = ''
    try:
        ref_id = data['References']
    except KeyError:
        # No references section: treat the whole document as body text.
        ref_id = len(lines) - 1
    for i, id in enumerate(lines_section_ids):
        if i < len(lines_section_ids) - 1 and id < ref_id:
            start = lines_section_ids[i]
            end = lines_section_ids[i + 1]
            interval_lines = lines[start + 1:end]
            interval_lines_txt = ' '.join(interval_lines)
            # Skip boilerplate sections from the full text.
            if 'Abbreviations' not in lines_section_ids_dict[
                    start] and '18 of 36' not in lines_section_ids_dict[start]:
                new_txt += interval_lines_txt
            # Keep only substantial sections in the per-section payload.
            if interval_lines and len(interval_lines_txt) > 100:
                final_data[lines_section_ids_dict[start]] = ' '.join(
                    interval_lines)

    final_data['paper_title'] = title
    final_data['full_text'] = new_txt
    final_data['topn'] = topn
    print('Uploading text ...')
    response = requests.post(
        'http://tzagerlib1-env.eba-wjp8tqpj.eu-west-2.elasticbeanstalk.com/scientific_analysis/'
        + password,
        json=json.dumps(final_data))
    if response.status_code == 200:
        data = dict(response.json())
    else:
        data = {'error': response.status_code}
    return data
Exemplo n.º 48
0
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
import io
import os
data_dir = '/home/lsy2018/媛媛/data/人工智能教师/'
resource_manager = PDFResourceManager()
# One in-memory buffer shared by every file processed below.
fake_file_handle = io.StringIO()
converter = TextConverter(resource_manager, fake_file_handle)
page_interpreter = PDFPageInterpreter(resource_manager, converter)
file_list = os.listdir(data_dir)
for each_file in file_list:
    file_name = os.path.join(data_dir, each_file)
    print(file_name)
    with open(file_name, 'rb') as fh:
        for page in PDFPage.get_pages(fh, check_extractable=True):
            page_interpreter.process_page(page)
        text = fake_file_handle.getvalue()
        print(text)
        # NOTE(review): exit() stops after the FIRST PDF, and the shared
        # StringIO is never reset between files — looks like a debugging
        # aid; confirm before removing.
        exit()
Exemplo n.º 49
0
def parse_obj(lt_objs, texts):
    """Collect [x, y, text] triples from a layout object list into *texts*.

    :param lt_objs: iterable of pdfminer layout objects (e.g. LTPage._objs)
    :param texts: list accumulator, mutated in place
    """
    # loop over the object list
    for obj in lt_objs:

        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            texts.append([obj.bbox[0], obj.bbox[1], obj.get_text()])

        # if it's a container, recurse
        elif isinstance(obj, pdfminer.layout.LTFigure):
            # BUG FIX: the original recursed with `page_state`, a name not
            # defined in this scope (NameError on any figure); the
            # accumulator `texts` must be threaded through instead.
            parse_obj(obj._objs, texts)


i = 0
# loop over all pages in the document
for i, page in enumerate(PDFPage.create_pages(document)):
    p = PageState()  # NOTE(review): `p` is never used below — confirm intent

    # read the page into a layout object
    interpreter.process_page(page)
    layout = device.get_result()

    texts = []

    # extract texts from this object
    parse_obj(layout._objs, texts)

    # sort text on first x, then y coordinate.
    # Two stable sorts: the second sort (y, descending) becomes the primary
    # key with x ascending breaking ties — i.e. top-to-bottom, left-to-right.
    texts.sort(key=lambda a: a[0], reverse=False)
    texts.sort(key=lambda a: a[1], reverse=True)
Exemplo n.º 50
0
def pdf_to_csv(filename):
    """Classify a measurement PDF by the text it contains and file it away.

    Searches the extracted text for a station id (MSA/MSB), a "linje N"
    marker and an oil-type code, then renames the PDF accordingly and moves
    it into a "<station>_<oiltype>" folder.  Returns None early when any
    marker is missing.
    """
    regexMålestasjon = re.compile(r"(MSA|MSB)")
    regexLinje = re.compile("linje " + r"[0-9]{1}[0-1]?")
    regexOljetype = re.compile(r"[0-9]{2}:[0-9]{2}" + "(14|1)")

    text = ""  # fix: avoid NameError after the loop for a zero-page PDF
    with open(filename, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):

            resource_manager = PDFResourceManager()
            fake_file_handler = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handler)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            page_interpreter.process_page(page)

            # NOTE(review): a fresh buffer per page means `text` holds only
            # the LAST page after the loop — confirm inputs are single-page.
            text = fake_file_handler.getvalue()

            converter.close()
            fake_file_handler.close()
            # BUG FIX: the original called fh.close() here, inside the page
            # loop, closing the handle PDFPage.get_pages was still reading
            # (breaks multi-page PDFs); `with` already closes it on exit.

        Målestasjon = re.search(regexMålestasjon, text)

        if Målestasjon is None:
            return
        matchMålestasjon = Målestasjon.group(0)

        Linje = re.search(regexLinje, text)

        if Linje is None:
            return
        matchLinje = Linje.group(0)

        oljetype = re.search(regexOljetype, text)

        if oljetype is None:
            return
        matchOljetype_mau = oljetype.group(0)

        # The regex group ends in "14" or "1", so exactly one branch matches;
        # the else guards against matchOljetype being unbound below.
        if matchOljetype_mau.endswith("4"):
            matchOljetype = "14"
        elif matchOljetype_mau.endswith("1"):
            matchOljetype = "1"
        else:
            return

        newfilename = str(matchMålestasjon) + "_" + str(
            matchLinje) + "_" + matchOljetype + ".pdf"
        location = str(matchMålestasjon) + "_" + str(matchOljetype)

        try:

            os.rename(filename, newfilename)
            shutil.move(newfilename, location)

        except IOError:
            print(f"""failed:\nOld filename:
            (unknown)\n New Filename: {newfilename}""")
Exemplo n.º 51
0
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

# Extract all text from simple1.pdf into an in-memory buffer, then print it.
output_string = StringIO()
with open('simple1.pdf', 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    # TextConverter appends each processed page's text to output_string.
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

print(output_string.getvalue())
Exemplo n.º 52
0
def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    :param pdf_path: path to PDF file to be extracted (remote or local);
        a str path is opened from disk, an io.BytesIO is read directly
    :return: iterator of string of extracted text (one string per page)
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/

    def _pages(fh):
        # Extract text page-by-page from an open binary file handle.
        # A fresh converter/StringIO per page so each yielded string holds
        # only that page's text.  A PDFSyntaxError simply ends the stream,
        # mirroring the original behaviour.
        try:
            for page in PDFPage.get_pages(
                            fh,
                            caching=True,
                            check_extractable=True
            ):
                resource_manager = PDFResourceManager()
                fake_file_handle = io.StringIO()
                converter = TextConverter(
                    resource_manager,
                    fake_file_handle,
                    codec='utf-8',
                    laparams=LAParams()
                )
                page_interpreter = PDFPageInterpreter(
                    resource_manager,
                    converter
                )
                page_interpreter.process_page(page)

                text = fake_file_handle.getvalue()
                yield text

                # close open handles
                converter.close()
                fake_file_handle.close()
        except PDFSyntaxError:
            return

    # The two original branches were identical except for how the file
    # handle was obtained; deduplicated into the single helper above.
    if not isinstance(pdf_path, io.BytesIO):
        # extract text from local pdf file
        with open(pdf_path, 'rb') as fh:
            yield from _pages(fh)
    else:
        # extract text from remote pdf file (already a BytesIO)
        yield from _pages(pdf_path)
Exemplo n.º 53
0
def parse():
    """Parse the schedule PDF: extract dates, hours, table lines and text
    boxes, match course keywords, optionally emit a debug SVG / text log,
    and finally persist the matched course dates via write().

    NOTE(review): relies on module-level state defined elsewhere in this
    file — `cfg`, `data`, `keywords`, the geometry helpers (getTextCoords,
    getRectCoords, rectsToLines, mergeLines, createGrid, mergeTexts,
    splitSimultaneousCourses, calcHourBoundaries, calcDateBoundaries,
    getDate, randomColor, format_text, write) and the CREATE_SVG /
    SHOW_DATELINES / SHOW_TEXTBOXES / LOG_TEXTS flags — confirm before
    reusing in isolation.
    """
    with open("schedule/{}".format(cfg.get("schedule_file")), "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFDevice(rsrcmgr)
        laparams = LAParams()
        # re-bound: the aggregator below replaces the plain PDFDevice above
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Collect dates, hours, rects and raw text boxes from one page's
        # layout objects into the module-level `data` dict (recursing into
        # figures).
        def parse_obj(lt_objs):
            for obj in lt_objs:
                if isinstance(obj, LTTextBoxHorizontal):
                    coor = getTextCoords(obj.bbox[0:2])
                    text = obj.get_text().replace('\n', ' ')
                    # check if content contains a date
                    match = re.search(r"\d{2}/\d{2}/\d{4}", text)
                    if match:
                        data["dates"].append({
                            "date": match.group(),
                            "coords": coor
                        })
                    match = re.findall(r"\d{1,2}:\d{2}", text)
                    if match:
                        data["hours"].append({
                            "hours":
                            list(map(lambda x: "{0:0>5}".format(x), match)),
                            "coords":
                            coor
                        })
                    data["textboxes"].append([coor, text, ""])

                if isinstance(obj, LTRect):
                    data["rects"].append(getRectCoords(obj.bbox[0:4]))

                if isinstance(obj, LTFigure):
                    parse_obj(obj._objs)

        # truncate the text log; pages append to it inside the loop below
        if LOG_TEXTS:
            with open("outputs/" + cfg.get("folder") + "/pdf_texts.txt",
                      "w",
                      encoding="utf8") as log:
                log.write("")

        with open("outputs/" + cfg.get("folder") + "/pdf_svg.html",
                  "w",
                  encoding="utf8") as svg:
            ''' SVG HEAD '''
            if CREATE_SVG:
                svg.write(
                    "<style type=\"text/css\">svg{stroke:#000;stroke-width:1;fill:none}</style>\n"
                )
            i = 0

            # loop over all pages in the document
            for page in PDFPage.create_pages(document):
                # read the page into a layout object
                interpreter.process_page(page)
                layout = device.get_result()
                ''' CREATE SVG '''
                if CREATE_SVG:
                    svg.write(
                        "<svg id=\"s{}\" width=\"1200\" height=\"600\">\n".
                        format(i))

                # reset the per-page scratch state in the shared `data` dict
                data["rects"] = []
                data["textboxes"] = []
                data["dates"] = []
                data["datelines"] = []
                data["hours"] = []

                # extract info from this page
                parse_obj(layout._objs)

                lines = rectsToLines(data["rects"])

                lines = mergeLines(lines)
                lines.sort(key=lambda x: x[1][1])
                lines.sort(key=lambda x: x[0][1])

                grid = createGrid(lines)
                data["textboxes"] = mergeTexts(grid, data["textboxes"])
                data["textboxes"] = splitSimultaneousCourses(data["textboxes"])

                data["hours"].sort(key=lambda x: x["coords"][1])

                if data["hours"]:
                    calcHourBoundaries(grid)
                if data["dates"]:
                    calcDateBoundaries(grid)

                # keyword matching for each textbox
                for t in data["textboxes"]:
                    t[1] = " ".join(t[1].split())
                    res = keywords.match(format_text(t[1]))
                    # only unambiguous (single-index) matches are recorded
                    if len(res["indexes"]) == 1:
                        data["courses"][res["indexes"][0]] = {
                            "coords": t[0],
                            "date": getDate(t[0])
                        }
                        t[2] = " (match: {})".format(res["titles"][0])
                ''' DRAW LINES '''
                if CREATE_SVG:
                    minX, maxX = 1e10, 0
                    for l in lines:
                        svg.write(
                            "<line x1=\"{}\" y1=\"{}\" x2=\"{}\" y2=\"{}\" stroke=\"#{}\"></line>\n"
                            .format(l[0][0], l[0][1], l[1][0], l[1][1],
                                    randomColor()))
                        if l[0][0] < minX:
                            minX = l[0][0]
                        if l[1][0] > maxX:
                            maxX = l[1][0]
                    if SHOW_DATELINES:
                        for h in data["hours"]:
                            svg.write(
                                "<circle cx=\"{}\" cy=\"{}\" r=\"1\" stroke=\"red\"></circle>\n"
                                .format(h["coords"][0], h["coords"][1]))
                        for d in data["dates"]:
                            if d["boundaries"][0] != 0 and d["boundaries"][
                                    1] != 0:
                                svg.write(
                                    "<line x1=\"{}\" y1=\"{}\" x2=\"{}\" y2=\"{}\" stroke=\"#111111\"></line>\n"
                                    .format(minX, d["boundaries"][0], maxX,
                                            d["boundaries"][0]))
                                svg.write(
                                    "<line x1=\"{}\" y1=\"{}\" x2=\"{}\" y2=\"{}\" stroke=\"#111111\"></line>\n"
                                    .format(minX, d["boundaries"][1], maxX,
                                            d["boundaries"][1]))
                    if SHOW_TEXTBOXES:
                        for t in data["textboxes"]:
                            svg.write(
                                "<text x=\"{}\" y=\"{}\" font-size=\"4\" font-weight=\"lighter\">{}</text>\n"
                                .format(t[0][0], t[0][1], t[1][:5]))
                if LOG_TEXTS:
                    with open("outputs/" + cfg.get("folder") +
                              "/pdf_texts.txt",
                              "a",
                              encoding="utf8") as log:
                        for t in data["textboxes"]:
                            log.write("{}, {}, {}{}\n".format(
                                t[0][0], t[0][1], t[1], t[2]))
                ''' CLOSE SVG '''
                if CREATE_SVG:
                    svg.write('</svg>' + "\n")
                i += 1

        # collapse matched courses to {course_key: date} and persist
        coursedates = {}
        for key, c in data["courses"].items():
            coursedates[key] = c["date"]
        write(coursedates)
Exemplo n.º 54
0
def extract_text(my_file):
    """Pulling text boxes out of PDFs. First half of this defn copies off the internet.

    Builds a pandas DataFrame with one row per LTTextBox/LTTextLine found in
    the PDF: bbox coords, box height/width, top-left corner, the repr text,
    a count of '\\n' escapes in that repr, and an estimated per-line text
    height (height / newlines).  Returns None on any failure (best-effort,
    matching the original behaviour).
    """
    try:
        password = ""
        extracted_text_plus = []
        # `with` guarantees the handle is closed even if pdfminer raises
        # (the original leaked `fp` in that case).
        with open(my_file, "rb") as fp:
            # Create parser object to parse the pdf content
            parser = PDFParser(fp)
            # Store the parsed content in PDFDocument object
            document = PDFDocument(parser, password)
            # Resource manager stores shared resources such as fonts/images
            rsrcmgr = PDFResourceManager()
            # set parameters for analysis
            laparams = LAParams()
            # Page aggregator yields LT* layout objects for each page
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                # keep only the text-like layout objects
                for lt_obj in layout:
                    if isinstance(lt_obj, LTTextBox) or isinstance(
                            lt_obj, LTTextLine):
                        extracted_text_plus.append(lt_obj)

        # Headlines are assumed to be large text. By comparing the number of
        # lines of text in a textbox with the height of the textbox, the
        # average size of the text can be found.
        df = pd.DataFrame()
        df['cords'] = 0
        df['num'] = 0
        df['height'] = 0
        df['text'] = ''
        df['TL_X'] = -1
        df['TL_Y'] = -1
        df['width'] = -1
        for n in range(0, len(extracted_text_plus)):
            # repr looks like "<LTTextBox ... x0,y0,x1,y1 text>"; split out
            # the comma-separated bbox and the trailing text.
            cords = str(extracted_text_plus[n]).split(' ')[1].split(',')
            vals = [float(elm) for elm in cords]
            a, b, c, d = vals
            text = ' '.join(str(extracted_text_plus[n]).split(' ')[2:])
            h = d - b
            w = c - a
            df.loc[n, 'cords'] = ' '.join(cords)
            df.loc[n, 'num'] = n
            df.loc[n, 'height'] = h
            df.loc[n, 'width'] = w
            df.loc[n, 'TL_X'] = a
            df.loc[n, 'TL_Y'] = b
            df.loc[n, 'text'] = text
        df['newlines'] = 0
        for x in range(0, len(df)):
            # counts literal "\n" escape sequences in the repr text
            df.loc[x, 'newlines'] = df.loc[x, 'text'].count('\\n')
        # NOTE(review): rows with newlines == 0 divide by zero here (inf in
        # pandas) — confirm that is acceptable downstream.
        df['text height'] = df['height'] / df['newlines']
        return df
    except Exception:
        # BUG FIX: narrowed from a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate; still best-effort -> None on failure.
        return None
Exemplo n.º 55
0
def cas_pdf_to_text(filename: Union[str, io.IOBase],
                    password) -> PartialCASData:
    """
    Parse CAS pdf and returns line data.

    :param filename: CAS pdf file (CAMS or Kfintech)
    :param password: CAS pdf password
    :return: array of lines from the CAS.
    """
    file_type: Optional[FileType] = None

    # Accept a path, a real file object, or anything file-like — Django's
    # UploadedFile, for instance, only exposes read().
    if isinstance(filename, str):
        fp = open(filename, "rb")
    elif isinstance(filename, io.IOBase) or hasattr(filename, "read"):
        fp = filename
    else:
        raise CASParseError(
            "Invalid input. filename should be a string or a file like object")

    with fp:
        reader = PDFParser(fp)
        try:
            document = PDFDocument(reader, password=password)
        except PDFPasswordIncorrect:
            raise CASParseError("Incorrect PDF password!")
        except PDFSyntaxError:
            raise CASParseError("Unhandled error while opening file")

        # KFintech statements need a tighter line margin than CAMS ones.
        margin_by_source = {FileType.KFINTECH: 0.1, FileType.CAMS: 0.2}
        line_margin = margin_by_source.get(detect_pdf_source(document), 0.2)

        manager = PDFResourceManager()
        layout_params = LAParams(line_margin=line_margin, detect_vertical=True)
        aggregator = PDFPageAggregator(manager, laparams=layout_params)
        interpreter = PDFPageInterpreter(manager, aggregator)

        pages: List[Iterator[LTTextBoxHorizontal]] = []
        investor_info = None

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = aggregator.get_result()
            if file_type is None:
                # Vertical watermark text identifies the generating RTA.
                for box in layout:
                    if not isinstance(box, LTTextBoxVertical):
                        continue
                    if re.search("CAMSCASWS", box.get_text()):
                        file_type = FileType.CAMS
                    if re.search("KFINCASWS", box.get_text()):
                        file_type = FileType.KFINTECH
            if investor_info is None:
                investor_info = parse_investor_info(layout, *page.mediabox[2:])
            pages.append(
                el for el in layout if isinstance(el, LTTextBoxHorizontal))

        return PartialCASData(file_type=file_type,
                              investor_info=investor_info,
                              lines=group_similar_rows(pages))
Exemplo n.º 56
0
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice

from pdfminer.layout import LAParams
from pdfminer.converter import PDFResourceManager, PDFPageAggregator
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal

# Print the text of every horizontal text box in the sample PDF.
document = open('../sample_surat2.pdf', 'rb')
#Create resource manager
rsrcmgr = PDFResourceManager()
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(document):
    interpreter.process_page(page)
    # receive the LTPage object for the page.
    layout = device.get_result()
    for element in layout:
        # BUG FIX: the original used `instanceof`, which is not a Python
        # builtin (NameError at runtime); the correct builtin is isinstance.
        if isinstance(element, LTTextBoxHorizontal):
            print(element.get_text())
# fix: close the handle once all pages are processed
document.close()
Exemplo n.º 57
0
def pdf_to_csv(filename, separator, threshold):
    """Convert the text content of a PDF into a crude CSV-like text file.

    Characters on each visual row are clustered by x-coordinate; whenever
    the gap between two adjacent characters exceeds
    ``average_distance * threshold``, ``separator`` is inserted between
    them.  Output is written to ``txt\\<filename>.txt`` with
    START/END PAGE markers around each page.

    :param filename: path of the PDF file to read.
    :param separator: string inserted where a column break is detected.
    :param threshold: multiplier of the average inter-character distance
        above which a gap is treated as a column break.
    :return: 0 on completion (the text goes to the output file, not the
        caller).
    """
    #from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        # TextConverter subclass that emits one separator-delimited line
        # per visual row of characters.
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)
            self.separator = separator
            self.threshold = threshold

        def end_page(self, i):
            from collections import defaultdict
            # Bucket characters by negated y so that sorting the keys
            # ascending yields rows top-to-bottom.
            lines = defaultdict(lambda: {})
            for child in self.cur_item._objs:  # <-- changed
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    line = lines[int(-y)]
                    # NOTE(review): encoding to bytes here mixes bytes into
                    # a str join under Python 3; this snippet is
                    # Python-2-era code -- confirm the target interpreter.
                    line[x] = child._text.encode(self.codec)  # <-- changed
            for y in sorted(lines.keys()):
                # BUG FIX: line_creator() used to be called twice per row
                # (first result discarded); build the row exactly once.
                self.outfp.write(self.line_creator(lines[y]))
                self.outfp.write("\n")

        def line_creator(self, line):
            keys = sorted(line.keys())
            # Average distance between adjacent characters on this row.
            # NOTE(review): there are len(keys)-1 gaps, so dividing by
            # len(keys) slightly understates the true average; kept as-is
            # to preserve the original column-splitting behaviour (and to
            # avoid a ZeroDivisionError for single-character rows).
            average_distance = sum(
                [keys[i] - keys[i - 1]
                 for i in range(1, len(keys))]) / len(keys)
            # Start the row with its leftmost character.
            result = [line[keys[0]]]
            for i in range(1, len(keys)):
                # Insert the separator where the gap between this character
                # and the previous one is unusually wide.
                if (keys[i] - keys[i - 1]) > average_distance * self.threshold:
                    result.append(self.separator)
                # append the character
                result.append(line[keys[i]])
            printable_line = ''.join(result)
            return printable_line

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()

    ft = 'txt\\' + filename + '.txt'
    outfp = open(ft, 'w')

    #outfp = StringIO()
    # utf-8 is the default codec; stated explicitly because the test
    # documents are utf-8.
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    fp = open(filename, 'rb')

    interpreter = PDFPageInterpreter(rsrc, device)
    for i, page in enumerate(PDFPage.get_pages(fp)):
        outfp.write("START PAGE %d\n" % i)
        if page is not None:
            interpreter.process_page(page)
        else:
            # BUG FIX: `print 'none'` is Python-2-only syntax (a
            # SyntaxError on Python 3); use the print function.
            print('none')
        outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()
    outfp.close()
    #return outfp.getvalue()
    return 0
Exemplo n.º 58
0
def extract_text(
        files=[],
        outfile='-',
        _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
        no_laparams=False,
        all_texts=None,
        detect_vertical=None,  # LAParams
        word_margin=None,
        char_margin=None,
        line_margin=None,
        boxes_flow=None,  # LAParams
        output_type='text',
        codec='utf-8',
        strip_control=False,
        maxpages=0,
        page_numbers=None,
        password="",
        scale=1.0,
        rotation=0,
        layoutmode='normal',
        output_dir=None,
        debug=False,
        disable_caching=False,
        **other):
    """Extract text from each PDF in *files* and write it to *outfile*.

    An *outfile* of ``'-'`` means stdout; otherwise the extension of
    *outfile* may override *output_type* (htm/html/xml/tag).  The LAParams
    keyword group (*all_texts* .. *boxes_flow*) is applied only when
    *no_laparams* is false.  Raises ValueError when *files* is empty or
    extra positional arguments were passed.  Returns the output stream.

    NOTE(review): ``files=[]`` is a mutable default argument; it is never
    mutated here, but a ``None`` sentinel would be safer.
    """
    if _py2_no_more_posargs is not None:
        raise ValueError("Too many positional arguments passed.")
    if not files:
        raise ValueError("Must provide files to work upon!")

    # If any LAParams group arguments were passed, create an LAParams object and
    # populate with given args. Otherwise, set it to None.
    if not no_laparams:
        laparams = pdfminer.layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin",
                      "char_margin", "line_margin", "boxes_flow"):
            paramv = locals().get(param, None)
            if paramv is not None:
                setattr(laparams, param, paramv)
    else:
        laparams = None

    # Page images are only saved when an output directory was requested.
    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    # Infer a non-text output type from the outfile extension.
    if output_type == "text" and outfile != "-":
        for override, alttype in ((".htm", "html"), (".html", "html"),
                                  (".xml", "xml"), (".tag", "tag")):
            if outfile.endswith(override):
                output_type = alttype

    if outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            codec = 'utf-8'
    else:
        outfp = open(outfile, "wb")

    for fname in files:
        with open(fname, "rb") as infp:
            #           pdfminer.high_level.extract_text_to_fp(fp, **locals())

            rsrcmgr = PDFResourceManager_new(caching=not disable_caching)
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
            # NOTE(review): rebinding outfp AFTER `device` was constructed
            # means the converter keeps writing to the original stream; the
            # intent of this rebind is unclear -- confirm before relying on
            # it.
            if outfp == sys.stdout:
                outfp = sys.stdout.buffer

            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(infp,
                                          page_numbers,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=not disable_caching,
                                          check_extractable=True):
                # Apply the requested extra rotation before rendering.
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)

            # NOTE(review): this second pass re-reads the same file with a
            # PDFPageAggregator and prints every horizontal text box to
            # stdout; it ignores page_numbers/maxpages/password and looks
            # like pasted-in demo code -- confirm it is intentional.
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in PDFPage.get_pages(infp):
                interpreter.process_page(page)
                # receive the LTPage object for the page.
                layout = device.get_result()
                for element in layout:
                    if isinstance(element, LTTextBoxHorizontal):
                        print(element.get_text())

    return outfp
def collect_events(helper, ew):

    if helper.get_arg('endpoint') == 'worldwide':
        graph_url = 'https://graph.microsoft.com/v1.0'
    elif helper.get_arg('endpoint') == 'gcchigh':
        graph_url = 'https://graph.microsoft.us/v1.0'

    access_token = _get_access_token(helper)

    headers = {
        "Authorization": "Bearer " + access_token,
        "User-Agent": "MicrosoftGraphEmail-Splunk/" + _get_app_version(helper)
    }
    #"Prefer": "outlook.body-content-type=text"}

    #defining email account to retrieve messages from
    endpoint = "/users/" + helper.get_arg('audit_email_account')

    #defining inbox id to retrieve messages from
    endpoint += "/mailFolders/inbox/messages/"

    #expanding property id 0x0E08 to gather message size, and then expanding attachments to get fileattachment type contentBytes
    endpoint += "?$expand=SingleValueExtendedProperties($filter=Id eq 'LONG 0x0E08'),attachments"

    #selecting which fields to retrieve from emails
    endpoint += "&$select=receivedDateTime,subject,sender,from,hasAttachments,internetMessageId,toRecipients,ccRecipients,bccRecipients,replyTo,internetMessageHeaders,body,bodyPreview,isReadReceiptRequested,isDeliveryReceiptRequested"

    #defining how many messages to retrieve from each page
    endpoint += "&$top=980"

    #getting the oldest messages first
    endpoint += "&$orderby=receivedDateTime"

    #getting the total count of messages in each round
    endpoint += "&$count=true"

    messages_response = helper.send_http_request(graph_url + endpoint,
                                                 "GET",
                                                 headers=headers,
                                                 parameters=None,
                                                 timeout=(15.0, 15.0)).json()

    helper.log_info("Retrieving " + str(messages_response['@odata.count']) +
                    " messages")

    messages = []

    #Routine that iterates through the messages.  Uses the @odata.nextLink values to find the next endpoint to query.

    messages.append(messages_response['value'])

    #Calculate how many pages of 980 messages we'll attempt based on the interval value.  Helps to keep requests within API limits.

    interval_in_seconds = int(helper.get_arg('interval'))

    url_count_limit = (interval_in_seconds // 60) - 1

    if url_count_limit > 0:

        url_count = 0

        while ("@odata.nextLink" in messages_response) and (is_https(
                messages_response["@odata.nextLink"])):
            if url_count < url_count_limit:
                nextlinkurl = messages_response["@odata.nextLink"]
                messages_response = helper.send_http_request(
                    nextlinkurl,
                    "GET",
                    headers=headers,
                    parameters=None,
                    timeout=(15.0, 15.0)).json()
                messages.append(messages_response['value'])
                url_count += 1
            else:
                helper.log_debug("Protecting API limits, breaking out")
                break

    #Routine to find attachments in messages.  This caters for both standard, as well as inline attachments.  MS Graph doesn't list inline attachments in the "hasAttachments" value, this fixes that.
    message_data = []
    attach_data = []

    for message in messages:

        for item in message:

            message_items = {}

            message_items['_time'] = item['receivedDateTime']
            message_items['to'] = item['toRecipients']
            message_items['from'] = item['from']
            message_items['sender'] = item['sender']
            message_items['subject'] = item['subject']
            message_items['id'] = item['id']
            message_items['internetMessageId'] = item['internetMessageId']
            message_items['ccRecipients'] = item['ccRecipients']
            message_items['bccRecipients'] = item['bccRecipients']
            message_items['replyTo'] = item['replyTo']
            message_items['hasAttachments'] = item['hasAttachments']

            message_body = item['body']['content']
            body_preview = item['bodyPreview']
            attachments = item['attachments']
            single_value_properties = item['singleValueExtendedProperties']

            if 'internetMessageHeaders' in item:
                internet_message_headers = item['internetMessageHeaders']

                if helper.get_arg('get_internet_headers'):
                    message_items[
                        'Internet-Headers'] = internet_message_headers

                #message path calculations
                message_path = []
                path_item = {}

                for item in internet_message_headers:
                    if item['name'] == "Received":
                        path_item = item
                        message_path.append(path_item)

                src_line = str(message_path[-1])
                dest_line = str(message_path[0])

                re_by = re.compile(r'(?<=\bby\s)(\S+)')
                re_from = re.compile(r'(?<=\bfrom\s)(\S+)')

                dest = re_by.search(dest_line)

                if re_from.search(src_line):
                    src = re_from.search(src_line)
                elif re_by.search(src_line):
                    src = re_by.search(src_line)

                message_items['src'] = str(src[0])
                message_items['dest'] = str(dest[0])

                if helper.get_arg('get_message_path'):
                    message_items['message_path'] = message_path

                if helper.get_arg('get_x_headers'):

                    x_headers = []
                    x_header_item = {}

                    for item in internet_message_headers:
                        if "X-" in item['name']:
                            x_header_item = item
                            x_headers.append(x_header_item)

                    message_items['X-Headers'] = x_headers

                if helper.get_arg('get_auth_results'):

                    auth_results = []
                    auth_results_item = {}

                    for item in internet_message_headers:
                        if "Authentication-Results" in item['name']:
                            auth_results_item = item
                            auth_results.append(auth_results_item)

                    message_items['Authentication-Results'] = auth_results

                if helper.get_arg('get_spf_results'):

                    spf_results = []
                    spf_results_item = {}

                    for item in internet_message_headers:
                        if "Received-SPF" in item['name']:
                            spf_results_item = item
                            spf_results.append(spf_results_item)

                    message_items['Received-SPF'] = spf_results

                if helper.get_arg('get_dkim_signature'):

                    dkim_sig = []
                    dkim_sig_item = {}

                    for item in internet_message_headers:
                        if "DKIM-Signature" in item['name']:
                            dkim_sig_item = item
                            dkim_sig.append(dkim_sig_item)

                    message_items['DKIM-Signature'] = dkim_sig

            #tracking pixel detection
            if pixeltrack_re.search(message_body):
                pixel_data = pixeltrack_re.search(message_body)
                message_items['tracking_pixel'] = "true"
                message_items['tracking_pixel_data'] = pixel_data.group(0)
            else:
                message_items['tracking_pixel'] = "false"

            #size mapping
            for item in single_value_properties:
                if item['id'] == "Long 0xe08":
                    message_items['size'] = item['value']

            if helper.get_arg('get_body'):
                message_items['body'] = message_body

            if helper.get_arg('get_body_preview'):
                message_items['bodyPreview'] = body_preview

            if helper.get_arg('get_internet_headers'):
                message_items['Internet-Headers'] = internet_message_headers

            if helper.get_arg('get_attachment_info'):
                message_items['attachments'] = attachments

            if helper.get_arg('get_body'):
                if helper.get_arg('extract_iocs'):

                    iocs = extract_iocs(helper, message_items["body"])

                    email_iocs = []

                    for ioc in iocs:
                        if not ioc in email_iocs:
                            email_iocs.append(ioc)
                    if email_iocs:
                        message_items['iocs'] = email_iocs

            if helper.get_arg('get_attachment_info'):

                if message_items['attachments'] is not None:

                    for attachment in message_items["attachments"]:

                        #Looks for itemAttachment type, which is a contact, event, or message that's attached.
                        if attachment[
                                "@odata.type"] == "#microsoft.graph.itemAttachment":

                            my_added_data = {}

                            my_added_data['name'] = attachment['name']
                            my_added_data['odata_type'] = attachment[
                                '@odata.type']
                            my_added_data['id'] = attachment['id']
                            my_added_data['contentType'] = attachment[
                                'contentType']
                            my_added_data['size'] = attachment['size']

                            attach_data.append(my_added_data)

                        #Looks for referenceAttachment type, which is a link to a file on OneDrive or other supported storage location
                        if attachment[
                                "@odata.type"] == "#microsoft.graph.referenceAttachment":

                            my_added_data = {}

                            my_added_data['name'] = attachment['name']
                            my_added_data['odata_type'] = attachment[
                                '@odata.type']
                            my_added_data['id'] = attachment['id']
                            my_added_data['contentType'] = attachment[
                                'contentType']
                            my_added_data['size'] = attachment['size']

                            attach_data.append(my_added_data)

                        #Looks for fileAttachment type, which is a standard email attachment.
                        if attachment[
                                "@odata.type"] == "#microsoft.graph.fileAttachment":

                            my_added_data = {}

                            attach_b64decode = base64.b64decode(
                                attachment['contentBytes'])

                            #Selects which hashing algorithm (md5, sha1, sha256) to use on the attachment.
                            if helper.get_arg(
                                    'get_attachment_info') and helper.get_arg(
                                        'file_hash_algorithm') == 'md5':
                                hash_object = hashlib.md5(attach_b64decode)
                            if helper.get_arg(
                                    'get_attachment_info') and helper.get_arg(
                                        'file_hash_algorithm') == 'sha1':
                                hash_object = hashlib.sha1(attach_b64decode)
                            if helper.get_arg(
                                    'get_attachment_info') and helper.get_arg(
                                        'file_hash_algorithm') == 'sha256':
                                hash_object = hashlib.sha256(attach_b64decode)

                            att_hash = hash_object.hexdigest()

                            my_added_data['name'] = attachment['name']
                            my_added_data['odata_type'] = attachment[
                                '@odata.type']
                            my_added_data['id'] = attachment['id']
                            my_added_data['contentType'] = attachment[
                                'contentType']
                            my_added_data['size'] = attachment['size']
                            my_added_data['file_hash'] = att_hash

                            #Attempts to open up zip file to list file names and hashes if the option is selected in the input.
                            if helper.get_arg(
                                    'get_attachment_info'
                            ) and helper.get_arg(
                                    'read_zip_files'
                            ) and attachment[
                                    '@odata.mediaContentType'] == 'application/zip':

                                filedata_encoded = attachment[
                                    'contentBytes'].encode()
                                file_bytes = base64.b64decode(filedata_encoded)

                                zipbytes = io.BytesIO(file_bytes)

                                try:
                                    zipfile = ZipFile(zipbytes)

                                    zipmembers = zipfile.namelist()

                                    zip_files = []
                                    zip_hashes = []

                                    for file in zipmembers:

                                        zip_read = zipfile.read(file)

                                        if helper.get_arg('file_hash_algorithm'
                                                          ) == 'md5':
                                            hash_object = hashlib.md5(zip_read)
                                        if helper.get_arg('file_hash_algorithm'
                                                          ) == 'sha1':
                                            hash_object = hashlib.sha1(
                                                zip_read)
                                        if helper.get_arg('file_hash_algorithm'
                                                          ) == 'sha256':
                                            hash_object = hashlib.sha256(
                                                zip_read)

                                        zip_hash = hash_object.hexdigest()

                                        if not file in zip_files:

                                            zip_files.append(file)
                                            zip_hashes.append(zip_hash)

                                        if zip_files:
                                            my_added_data[
                                                'zip_files'] = zip_files
                                            my_added_data[
                                                'zip_hashes'] = zip_hashes

                                except:
                                    my_added_data[
                                        'attention'] = 'could not extract the zip file, may be encrypted'

                            #Routine to gather info on CSV file types.
                            if helper.get_arg(
                                    'get_attachment_info'
                            ) and attachment[
                                    '@odata.mediaContentType'] == 'text/csv':

                                filedata_encoded = attachment[
                                    'contentBytes'].encode()
                                file_bytes = base64.b64decode(filedata_encoded)

                                csvbytes = io.BytesIO(file_bytes)

                                try:
                                    csvstring = csvbytes.read().decode('utf-8')

                                    if helper.get_arg('extract_iocs'):

                                        iocs = extract_iocs(helper, csvstring)

                                        csv_iocs = []

                                        for ioc in iocs:
                                            if not ioc in csv_iocs:
                                                csv_iocs.append(ioc)
                                        if csv_iocs:
                                            my_added_data['iocs'] = csv_iocs

                                    #Will attempt to ingest the actual contents of the CSV file if this option is selected in the input.
                                    if 'csv' in helper.get_arg(
                                            'attachment_data_ingest'):
                                        my_added_data['csv_data'] = csvstring

                                except:
                                    my_added_data[
                                        'attention'] = 'could not parse the csv document, may be encrypted'

                            #Routine to gather info on HTML file types.
                            if helper.get_arg(
                                    'get_attachment_info'
                            ) and attachment[
                                    '@odata.mediaContentType'] == 'text/html':

                                filedata_encoded = attachment[
                                    'contentBytes'].encode()
                                file_bytes = base64.b64decode(filedata_encoded)

                                try:
                                    uncooked_soup = html.unescape(
                                        str(file_bytes))

                                    soup = BeautifulSoup(uncooked_soup)

                                    soup_data = str(soup)

                                    if helper.get_arg('extract_iocs'):

                                        iocs = extract_iocs(helper, soup_data)

                                        html_iocs = []

                                        for ioc in iocs:
                                            if not ioc in html_iocs:
                                                html_iocs.append(ioc)
                                        if html_iocs:
                                            my_added_data['iocs'] = html_iocs

                                    #Will attempt to ingest the actual contents of the HTML file if this option is selected in the input.
                                    if 'html' in helper.get_arg(
                                            'attachment_data_ingest'):
                                        my_added_data['html_data'] = soup_data

                                except:
                                    my_added_data[
                                        'attention'] = 'could not parse the html document, may be encrypted'

                            #Routine to gather info on PDF file types.
                            if helper.get_arg(
                                    'get_attachment_info'
                            ) and attachment[
                                    '@odata.mediaContentType'] == 'application/pdf':

                                filedata_encoded = attachment[
                                    'contentBytes'].encode()

                                file_bytes = base64.b64decode(filedata_encoded)

                                pdf_content = io.BytesIO(file_bytes)

                                output_string = StringIO()

                                try:
                                    parser = PDFParser(pdf_content)

                                    doc = PDFDocument(parser)

                                    rsrcmgr = PDFResourceManager()

                                    device = TextConverter(rsrcmgr,
                                                           output_string,
                                                           laparams=LAParams())

                                    interpreter = PDFPageInterpreter(
                                        rsrcmgr, device)

                                    for page in PDFPage.create_pages(doc):
                                        interpreter.process_page(page)

                                    pdf_text = output_string.getvalue()

                                    if helper.get_arg('extract_iocs'):

                                        iocs = extract_iocs(helper, pdf_text)

                                        pdf_iocs = []

                                        for ioc in iocs:
                                            if not ioc in pdf_iocs:
                                                pdf_iocs.append(ioc)
                                            if pdf_iocs:
                                                my_added_data[
                                                    'iocs'] = pdf_iocs

                                    #Will attempt to ingest the actual contents of the PDF file if this option is selected in the input.
                                    if 'pdf' in helper.get_arg(
                                            'attachment_data_ingest'):
                                        my_added_data['pdf_data'] = pdf_text

                                except:
                                    my_added_data[
                                        'attention'] = 'could not parse the pdf document, may be encrypted'

                            #Routine to gather info on XML file types.
                            if helper.get_arg(
                                    'get_attachment_info'
                            ) and attachment[
                                    '@odata.mediaContentType'] == 'text/xml':

                                filedata_encoded = attachment[
                                    'contentBytes'].encode()

                                file_bytes = base64.b64decode(filedata_encoded)

                                try:
                                    soup = BeautifulSoup(file_bytes, 'lxml')

                                    soup_data = str(soup)

                                    if helper.get_arg('extract_iocs'):

                                        iocs = extract_iocs(helper, soup_data)

                                        xml_iocs = []

                                        for ioc in iocs:
                                            if not ioc in xml_iocs:
                                                xml_iocs.append(ioc)
                                        if xml_iocs:
                                            my_added_data['iocs'] = xml_iocs

                                    #Will attempt to ingest the actual contents of the XML file if this option is selected in the input.
                                    if 'xml' in helper.get_arg(
                                            'attachment_data_ingest'):
                                        my_added_data['xml_data'] = soup_data

                                except:
                                    my_added_data[
                                        'attention'] = 'could not parse the xml document, may be encrypted'

                            #Routine to do macro analysis on files of supported content types listed below if selected in the input setup.  This uses OLEVBA tools to detect macros in the attachment, then analyses the macros.
                            if helper.get_arg(
                                    'get_attachment_info') and helper.get_arg(
                                        'macro_analysis'):

                                filename = attachment['name']

                                #Content types supported by OLEVBA.
                                supported_content = [
                                    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                                    'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
                                    'application/vnd.ms-excel.sheet.macroenabled.12',
                                    'application/vnd.ms-excel.template.macroenabled.12',
                                    'application/vnd.ms-excel.addin.macroenabled.12',
                                    'application/vnd.ms-excel.sheet.binary.macroenabled.12',
                                    'application/vnd.ms-excel',
                                    'application/xml',
                                    'application/vnd.ms-powerpoint',
                                    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                                    'application/vnd.openxmlformats-officedocument.presentationml.template',
                                    'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
                                    'application/vnd.ms-powerpoint.addin.macroenabled.12',
                                    'application/vnd.ms-powerpoint.presentation.macroenabled.12',
                                    'application/vnd.ms-powerpoint.template.macroenabled.12',
                                    'application/vnd.ms-powerpoint.slideshow.macroenabled.12',
                                    'application/msword',
                                    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                                    'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
                                    'application/vnd.ms-word.document.macroenabled.12',
                                    'application/vnd.ms-word.template.macroenabled.12'
                                ]

                                if attachment[
                                        '@odata.mediaContentType'] in supported_content:

                                    filedata_encoded = attachment[
                                        'contentBytes'].encode()
                                    file_bytes = base64.b64decode(
                                        filedata_encoded)

                                    try:
                                        vbaparser = VBA_Parser(filename,
                                                               data=file_bytes)

                                        if vbaparser.detect_vba_macros():
                                            my_added_data[
                                                'macros_exist'] = "true"

                                            macro_analysis = VBA_Parser.analyze_macros(
                                                vbaparser)
                                            helper.log_debug(
                                                "GET Response: " + json.dumps(
                                                    macro_analysis, indent=4))

                                            if macro_analysis == []:
                                                my_added_data[
                                                    'macro_analysis'] = "Macro doesn't look bad, but I never trust macros."
                                            else:
                                                my_added_data[
                                                    'macros_analysis'] = macro_analysis

                                        else:
                                            my_added_data[
                                                'macros_exist'] = "false"

                                    except:
                                        my_added_data[
                                            'attention'] = 'could not extract the office document, may be encrypted'

                            attach_data.append(my_added_data)

            message_items['attachments'] = attach_data
            message_data.append(message_items)

        _write_events(helper, ew, messages=message_data)
    _purge_messages(helper, messages)
Exemplo n.º 60
0
 def _extract_pages_from_file(self, source_pdf: str):
     """Switch the active document to *source_pdf* and return a PageGenerator over its pages."""
     self.switch_to_pdf_document(source_pdf)
     return PageGenerator(PDFPage.get_pages(self.active_fileobject))