示例#1
0
文件: Parse.py 项目: flahemade/TO52
    def run(path):
        """Parse the PDF at ``path`` into a ``Book`` holding one ``Page`` per PDF page.

        Every page is rendered through a ``TextConverter`` into one shared
        in-memory buffer; the text a page adds to that buffer becomes the
        page's ``text``. A start message and the elapsed time are printed.

        :param path: location of the PDF file to read.
        :return: the populated ``Book``.
        """
        print("Calling parser :%s" % path)

        t0 = time.clock()

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        book = Book()
        try:
            # The context manager releases the file handle even when a page
            # fails to parse; the original leaked it in that case.
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                              caching=True, check_extractable=True):
                    page_tmp = Page()
                    # Offset at which this page's output starts in the shared buffer.
                    begin_page = len(retstr.getvalue())
                    interpreter.process_page(page)
                    # The slice stops one character short of the buffer's end,
                    # exactly as before -- presumably to drop a trailing page
                    # separator written by the converter (TODO confirm).
                    page_tmp.text = retstr.getvalue()[begin_page:-1]
                    book.pages.append(page_tmp)
        finally:
            device.close()
            retstr.close()
        print("Parsing in: %s" % (time.clock() - t0))
        return book
示例#2
0
def pdfconvert(infullpath, file, outfullpath, pages=None):         #Handle PDF
    """Extract a PDF's text to a ``.txt`` file and render the PDF to a ``.jpg``.

    Both output paths are ``outfullpath`` with its extension replaced.

    :param infullpath: path of the PDF to read.
    :param file: unused by this function; kept so existing callers keep working.
    :param outfullpath: path whose extension is swapped for ``.txt`` / ``.jpg``.
    :param pages: optional iterable of page numbers handed to
        ``PDFPage.get_pages``; a falsy value passes an empty set.
    """
    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    try:
        # ``with`` closes the PDF even if a page cannot be processed.
        with open(infullpath, 'rb') as pdffile:
            for page in PDFPage.get_pages(pdffile, pagenums):
                interpreter.process_page(page)
    finally:
        converter.close()

    base = os.path.splitext(outfullpath)[0]
    jpgfile = base + '.jpg'
    # NOTE(review): the original called string.replace() on this path three
    # times (' ', '(' and ')' -> '_') but discarded the results, so the path
    # was never changed. The no-op calls are removed and the path is still
    # used as-is; confirm whether sanitising was actually wanted.
    txtfile = base + '.txt'
    text = output.getvalue()
    output.close()  # the original referenced ``output.close`` without calling it
    with open(txtfile, 'w') as temp:
        temp.write(text)

    # NOTE(review): the command is built by concatenation and run through the
    # shell, so paths containing a double quote or shell metacharacters are
    # not handled safely.
    imagemagick_string = 'convert ' + '"' + infullpath + '" "' + jpgfile + '"'
    os.system(imagemagick_string)
示例#3
0
    def parse(self, path):
        """Extract the text of the PDF at ``path`` and wrap it in a ``Sample``.

        :param path: path of a PDF file; directories are not supported.
        :return: ``Sample(path, None, text)``.
        :raises NotImplementedError: if ``path`` is a directory.
        """
        if os.path.isdir(path):
            # Directory input is not implemented.
            raise NotImplementedError()

        out = StringIO.StringIO()
        rsrc = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.line_margin = 2.0
        laparams.word_margin = 0.0
        device = TextConverter(rsrc, out, codec='utf-8', laparams=laparams)
        try:
            # Binary mode: a PDF is not text. The original opened the file in
            # text mode and never closed it.
            with open(path, 'rb') as fp:
                doc = PDFDocument()
                parser = PDFParser(fp)
                # Parser and document are linked in both directions before
                # the document is initialised.
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize()
                interpreter = PDFPageInterpreter(rsrc, device)
                for page in doc.get_pages():
                    interpreter.process_page(page)
        finally:
            device.close()
        sample = Sample(path, None, out.getvalue())
        out.close()
        return sample
示例#4
0
def convert_pdf_to_txt(path, output):
    """Extract the text of the PDF at ``path``, write it to ``output`` and return it.

    :param path: path of the PDF file to read.
    :param output: path of the file the extracted text is written to
        (opened in ``'wb'`` mode).
    :return: the extracted text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # ``with`` guarantees the PDF handle is released on parse errors too.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    with open(output, 'wb') as f:
        f.write(text)
    return text
示例#5
0
def get_pdf_text(path):
    """Read a PDF file and return the ``pages`` dict handed to the converter.

    The dict is passed to ``TextConverter`` through its ``pages`` keyword and
    returned after all pages were processed. It is meant to map page numbers
    to page text -- presumably filled in by a customised ``TextConverter``
    (TODO confirm against the converter in use).
    http://stackoverflow.com/a/20905381

    :param path: path of the PDF file to read.
    :return: the ``pages`` dict.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # Change to 'utf-8' if the text comes out garbled.
    codec = 'ascii'
    pages = {}
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams(),
                           showpageno=True, pages=pages)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # ``with`` closes the PDF even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
        retstr.close()
    return pages
示例#6
0
def edit_file(fname, pages=None):
    """Render an HTML form whose textarea is pre-filled with an uploaded file's text.

    Requests without ``'log_in'`` in the session are redirected to the
    ``index`` view. A file whose extension is ``pdf`` is converted to text
    with pdfminer; any other file is read verbatim.

    :param fname: file name inside ``app.config['UPLOAD_FOLDER']``.
    :param pages: optional iterable of page numbers handed to
        ``PDFPage.get_pages``; a falsy value passes an empty set.
    :return: the HTML page as a string, or the redirect response.
    """
    # Local import that works on Python 3 (html.escape) and Python 2 (cgi.escape).
    try:
        from html import escape
    except ImportError:
        from cgi import escape

    if 'log_in' not in session:
        return redirect(url_for('index'))
    filename = fname
    fname = os.path.join(app.config['UPLOAD_FOLDER'], fname)
    # splitext looks only at the final extension. The original split on every
    # '.', which picked the wrong piece for "a.b.pdf" or a dotted upload
    # folder and raised IndexError for names without a dot.
    exten = os.path.splitext(fname)[1][1:]
    print(exten)
    if exten != 'pdf':
        with open(fname, 'rb') as plain_file:
            text = plain_file.read()
    else:
        pagenums = set(pages) if pages else set()

        output = StringIO()
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)
        try:
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        text = output.getvalue()
        output.close()  # the original referenced ``output.close`` without calling it
    print(filename)
    # File content and file name come from the upload, so both are escaped
    # (and the attribute is quoted) before being embedded in the markup.
    # The browser decodes the entities again, so the form posts the original
    # text back.
    return ('<!doctype html><title>Edit File</title><h1>Upload new File</h1>'
            '<form action="/save" method=post><p>'
            '<textarea name="contents" rows=30 cols = 150 autofocus>'
            + escape(text, True) +
            '</textarea><br /><input type=hidden name=filename value="'
            + escape(str(filename), True) +
            '"> <input type=submit value=Upload></form></html>')
示例#7
0
def convert(fname, pages=None):
    """Return the text pdfminer extracts from the PDF at ``fname``.

    :param fname: path of the PDF file to read.
    :param pages: optional iterable of page numbers handed to
        ``PDFPage.get_pages``; a falsy value passes an empty set.
    :return: the extracted text.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        # ``with`` closes the PDF even when a page fails to parse.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
    finally:
        converter.close()
    text = output.getvalue()
    output.close()
    return text
示例#8
0
def get_text(path):
    """Return the text for ``path``, using ``path + '.txt'`` as a cache.

    If the cache file exists its content is returned. Otherwise the text is
    extracted from ``path + '.pdf'``, stored through ``write_text`` and
    returned.

    :param path: base path without extension.
    :return: the cached or freshly extracted text.
    """
    txt_path = path + '.txt'

    if os.path.isfile(txt_path):
        # The original returned ``open(...).read()`` and never closed the handle.
        with open(txt_path) as cached:
            return cached.read()

    pdf_path = path + '.pdf'
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        with open(pdf_path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
    # Named ``text`` rather than ``str`` so the builtin is not shadowed.
    text = retstr.getvalue()
    retstr.close()

    write_text(txt_path, text)

    return text
示例#9
0
def pdfconvert(infullpath, file, infolder, pages=None):         #Handle PDF
    """Extract a PDF's text into the corpus folder and render the PDF to a ``.jpg``.

    The text goes to ``corpuspath + corpusfolder + '/' + file + '.txt'`` and
    the image to ``infolder + file + '.jpg'``.

    :param infullpath: path of the PDF to read.
    :param file: base name used for both output files.
    :param infolder: prefix (folder) of the ``.jpg`` output path.
    :param pages: optional iterable of page numbers handed to
        ``PDFPage.get_pages``; a falsy value passes an empty set.
    :return: the path of the ``.jpg`` file.
    """
    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    try:
        # ``with`` closes the PDF even when a page fails to parse.
        with open(infullpath, 'rb') as pdffile:
            for page in PDFPage.get_pages(pdffile, pagenums):
                interpreter.process_page(page)
    finally:
        converter.close()
    txtfilename = file
    jpgfile = infolder + str(txtfilename) + '.jpg'
    txtfile = corpuspath + corpusfolder + '/' + txtfilename + '.txt'

    text = output.getvalue()
    output.close()  # the original referenced ``output.close`` without calling it
    with open(txtfile, 'w') as temp:
        temp.write(text)

    # NOTE(review): the command is built by concatenation and run through the
    # shell, so paths containing a double quote or shell metacharacters are
    # not handled safely.
    imagemagick_string = 'convert ' + '"' + infullpath + '" "' + jpgfile + '"'
    os.system(imagemagick_string)

    return jpgfile
示例#10
0
def convert_pdf_to_txt(path):
    """Return the text pdfminer extracts from the PDF at ``path``.

    Adapted from a Stack Overflow answer; see
    http://www.unixuser.org/~euske/python/pdfminer/programming.html for a
    tutorial and
    https://github.com/dpapathanasiou/pdfminer-layout-scanner/blob/master/layout_scanner.py

    :param path: path of the PDF file to read.
    :return: the extracted text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()

    # Read text from pages.
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # ``with`` closes the PDF even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        # Named ``text`` rather than ``str`` so the builtin is not shadowed.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    return text
示例#11
0
    def convert_pdf_to_txt(self, path):
        """
        A very simple conversion function
        which returns text for parsing from PDF.

        Any failure is logged through ``self.logger`` and an empty string is
        returned instead of raising.

        path = The path to the file
        """
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            device = TextConverter(
                rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            try:
                with open(path, 'rb') as fp:
                    for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                                  caching=True, check_extractable=True):
                        interpreter.process_page(page)
                return retstr.getvalue()
            finally:
                device.close()
                retstr.close()
        except Exception as e:
            # Log BEFORE returning: the original returned first, which made
            # the logging call unreachable and hid every failure.
            self.logger.error(
                "Failed to convert PDF to text: " + str(e))
            return ""
示例#12
0
def pdf_from_url_to_txt(url, maxpages=0):
    """Download the PDF at ``url`` and return the text pdfminer extracts from it.

    :param url: address of the PDF document.
    :param maxpages: forwarded to ``PDFPage.get_pages`` as ``maxpages``.
    :return: the extracted text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # Fetch the whole document; the response is closed right after
        # reading (the original never closed it).
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            payload = response.read()
        finally:
            response.close()
        # Wrap the downloaded content in a file-like object for pdfminer.
        fp = StringIO(payload)
        try:
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=maxpages,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            fp.close()
    finally:
        device.close()
    # Named ``text``: the original's local ``string`` shadowed the stdlib module name.
    text = retstr.getvalue()
    retstr.close()
    return text
示例#13
0
def pdf_to_text(pdf):
    """Extract the text of a PDF, print it and return it.

    :param pdf: a path (``str``) that is opened in binary mode, or an already
        open binary file object. The stream is closed in both cases, exactly
        as the original did.
    :return: the content of the ``BytesIO`` buffer the converter wrote to.
    """
    caching = True
    # Extra rotation in degrees applied to every page.
    rotation = 0

    rsrcmgr = PDFResourceManager(caching=caching)
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    fp = open(pdf, 'rb') if isinstance(pdf, str) else pdf
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        for page in PDFPage.get_pages(fp, set(),
                                      maxpages=0,
                                      caching=caching, check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    finally:
        # Release the stream and the device even when a page fails to parse.
        fp.close()
        device.close()
    result = retstr.getvalue()
    retstr.close()
    print(result)
    return result
示例#14
0
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` into a sibling ``.txt`` file.

    The output path is ``path`` with its extension replaced by ``.txt``; it
    is printed before the file is written. Nothing is returned.

    :param path: path of the PDF file to read.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # ``with`` closes the PDF even when a page fails to parse.
        with open(path, "rb") as fp:
            for page in PDFPage.get_pages(
                fp, set(), maxpages=0, password="", caching=True, check_extractable=True
            ):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    outputFile = os.path.splitext(path)[0] + ".txt"
    print(outputFile)

    with open(outputFile, "w") as ff:
        ff.write(text)
def extract_text_from_pdf(pdf_filename):
    """
    Function to extract the text from pdf documents using pdfminer

    Parameters:
    -----------
    pdf_filename -- string
        File name of the pdf document as string

    Returns:
    --------
    extracted_text -- string
        Text extracted from pdf as string
    """

    resource_manager = PDFResourceManager()
    return_string = StringIO()
    device = TextConverter(resource_manager, return_string, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, device)

    try:
        # ``with`` closes the PDF even when a page fails to parse.
        with open(pdf_filename, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set()):
                interpreter.process_page(page)
    finally:
        device.close()
    extracted_text = return_string.getvalue()
    return_string.close()

    return extracted_text
示例#16
0
def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path``.

    The document is set up by linking a ``PDFParser`` and a ``PDFDocument``
    to each other and initialising the document with an empty password.

    :param path: path of the PDF file to read.
    :return: the text collected by the ``TextConverter``.
    """
    manager = PDFResourceManager()
    sink = StringIO()
    converter = TextConverter(manager, sink, laparams=LAParams())

    with open(path, 'rb') as pdf_stream:
        pdf_parser = PDFParser(pdf_stream)
        document = PDFDocument(caching=True)

        # Parser and document reference each other before initialisation.
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        page_interpreter = PDFPageInterpreter(manager, converter)
        # Feed every page of the document through the converter.
        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)
        extracted = sink.getvalue()

    converter.close()
    sink.close()

    return extracted
示例#17
0
    def __convert(self, ifile, ofile=None):
        """Convert the PDF ``ifile`` to text.

        Conversion settings (``caching``, ``codec``, ``laparams``,
        ``imagewriter``, ``pagenos``, ``maxpages``, ``password``,
        ``rotation``) are read from ``self``.

        :param ifile: path of the PDF file to read.
        :param ofile: optional output path; when ``None`` the text is
            collected in memory instead.
        :return: the extracted text when ``ofile`` is ``None``, else ``None``.
            ``PDFException`` and ``MemoryError`` during extraction are
            reported on stdout and whatever was produced so far is kept.
        """
        # The input is opened first, as before, so a missing input never
        # truncates an existing output file.
        with open(ifile, 'rb') as fp:
            outfp = StringIO.StringIO() if ofile is None else open(ofile, 'wb')
            try:
                rsrcmgr = PDFResourceManager(caching=self.caching)
                device = TextConverter(rsrcmgr, outfp, codec=self.codec,
                                       laparams=self.laparams,
                                       imagewriter=self.imagewriter)

                interpreter = PDFPageInterpreter(rsrcmgr, device)
                try:
                    for page in PDFPage.get_pages(fp, self.pagenos,
                                                  maxpages=self.maxpages,
                                                  password=self.password,
                                                  caching=self.caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + self.rotation) % 360
                        interpreter.process_page(page)
                except (PDFException, MemoryError) as e:
                    print("Could not extract text {0}".format(e))
                device.close()
                return outfp.getvalue() if ofile is None else None
            finally:
                # Runs on unexpected errors too, so neither file stays open.
                outfp.close()
示例#18
0
def pdf_read(pdf):
    """
    Use PDFMiner to extract text from pdf file.
    <PDFMiner even though more low-level but pretty good tool to read pdfs>

    Args:
        *pdf* (str) -- path to pdf file

    Returns:
        *text* (str) -- a text extracted from pdf

    """
    # initializing objects
    res_manager = PDFResourceManager()
    strio = StringIO()
    device = TextConverter(res_manager, strio, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(res_manager, device)
    try:
        # 'rb' because a PDF is binary; ``with`` closes it even on parse errors
        with open(pdf, 'rb') as pdf_file:
            for page in PDFPage.get_pages(pdf_file, maxpages=0, password='',
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
    # finishing up
    text = strio.getvalue()
    strio.close()
    return text
示例#19
0
def pdf_to_text(pdfname):
    """Return the text pdfminer extracts from the PDF at ``pdfname``.

    All dependencies are imported locally inside the function.

    :param pdfname: path of the PDF file to read.
    :return: the extracted text.
    """
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    from cStringIO import StringIO

    # PDFMiner boilerplate
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    # 'utf-8' is the alternative codec
    codec = 'ascii'
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        # Extract text; ``with`` closes the PDF even when a page fails.
        with open(pdfname, 'rb') as fp:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)

        # Get text from StringIO
        text = sio.getvalue()
    finally:
        # Cleanup
        device.close()
        sio.close()

    return text
示例#20
0
    def pdf2txt(self, lowerBorder=-1, upperBorder=-1):
        """
        Returns the plain text of the document ``self.filename``.

        * Both borders ``-1``: the whole document.
        * ``lowerBorder > -1`` and ``upperBorder == -1``: only the page
          ``lowerBorder``.
        * ``lowerBorder > -1`` and ``upperBorder == "max"``: all pages are
          requested and those whose index is below ``lowerBorder`` are skipped.
        * Both ``> -1`` and ``lowerBorder <= upperBorder``: the pages of that
          range, both ends included.

        Any other combination raises ``ValueError``.
        """
        # The page selection is worked out first so that illegal arguments
        # are rejected before any file is opened (the original opened the
        # file first and leaked the handle when raising).
        if (lowerBorder == -1 and upperBorder == -1) or (lowerBorder > -1 and upperBorder == "max"):
            pagenos = set()
        elif lowerBorder > -1 and upperBorder == -1:
            # extract only a single page
            pagenos = set(range(lowerBorder, lowerBorder + 1))
        elif lowerBorder == -1 or upperBorder == -1 or lowerBorder > upperBorder:
            raise ValueError("illegal parameter passed")
        else:
            pagenos = set(range(lowerBorder, upperBorder + 1))

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            with open(self.filename, 'rb') as fp:
                pages = PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                          caching=True, check_extractable=True)
                for (pageno, page) in enumerate(pages):
                    if pageno < lowerBorder and upperBorder == "max":
                        continue
                    interpreter.process_page(page)
        finally:
            device.close()
        s = retstr.getvalue()
        retstr.close()
        return s.decode('utf-8')
def pdf2txt(path):
    '''
    Converts a given PDF to plain text in UTF8.

    Returns the extracted text, or None when anything goes wrong while
    reading or converting the file.
    '''
    try:
        rsrcMgr = PDFResourceManager()
        retStr = StringIO()
        device = TextConverter(rsrcMgr, retStr, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcMgr, device)
        try:
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                              caching=True, check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        text = retStr.getvalue()
        retStr.close()

        return text
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt and
        # SystemExit are no longer swallowed; ordinary failures still
        # produce None.
        return None
示例#22
0
def convert(url, pages=None):
    """Download the PDF at ``url`` and return the text pdfminer extracts from it.

    :param url: address of the PDF document; must be a string.
    :param pages: optional list of page numbers handed to
        ``PDFPage.get_pages``; ``None`` passes an empty set.
    :return: the extracted text.
    """
    assert isinstance(url, basestring)
    assert pages is None or isinstance(pages, list)

    rscmng = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rscmng, retstr, codec='utf-8', laparams=LAParams())
    try:
        web_page = urllib2.urlopen(urllib2.Request(url))
        try:
            fp = StringIO(web_page.read())
        finally:
            # The response is closed even when reading it fails.
            web_page.close()
        interpreter = PDFPageInterpreter(rscmng, device)
        try:
            pdf_pages = PDFPage.get_pages(
                fp,
                set(pages if pages is not None else []),
                maxpages=0,
                password='',
                caching=True,
                check_extractable=True
            )

            for page in pdf_pages:
                interpreter.process_page(page)

            result = retstr.getvalue()
        finally:
            fp.close()
    finally:
        device.close()
    retstr.close()

    return result
示例#23
0
def convert_pdf(path='provide path here', format='text', codec='utf-8'):
    """Extract the bullet points of a PDF into ``json_data.docx`` and open it.

    The PDF text is searched for runs following a ``•`` character; the
    matches are stored under the ``'bulletins'`` key, pretty-printed as JSON,
    written into a new document saved as ``json_data.docx`` and opened with
    ``os.startfile``.

    :param path: path of the PDF file to read.
    :param format: must be ``'text'``.
    :param codec: codec handed to ``TextConverter``.
    :return: always the empty string.
    :raises ValueError: if ``format`` is not ``'text'``.
    """
    # Checked first so that no resources exist yet when the call is rejected.
    if format != 'text':
        raise ValueError('Please provide the format to extract')

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Upper bound on the pages processed (a large number of pages slows the
    # extraction down).
    maxpages = 500
    try:
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=maxpages,
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue().decode()
    finally:
        device.close()
        retstr.close()

    bulletins_data = re.findall('•([^•]+)*', str(text))
    json_dict = {'bulletins': list(bulletins_data)}
    # One dumps call yields the same pretty JSON the original obtained by
    # dumping, re-loading and dumping again.
    final_data = json.dumps(json_dict, indent=4, sort_keys=True)
    document = Document()  # creates a new document
    document.add_heading('Bulletins data in the PDF')
    document.add_paragraph(str(final_data))
    document.save('json_data.docx')  # saves it to the filesystem
    os.startfile("json_data.docx")  # will open the file
    return ''
示例#24
0
def convert_pdf_to_txt(path):
    """
    Converts PDF to text using the pdfminer library

    :param path: path of the PDF file to read.
    :return: the extracted text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # ``with`` closes the PDF even when a page fails to parse.
        with open(path, "rb") as file_handle:
            for page in PDFPage.get_pages(
                file_handle, set(), maxpages=0, password="", caching=True,
                check_extractable=True
            ):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text
示例#25
0
def convert_pdf_to_txt(path):
    """Return the text of up to the first 120 pages of the PDF at ``path``.

    :param path: path of the PDF file to read.
    :return: the extracted text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    maxpages = 120

    try:
        # ``with`` closes the PDF even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=maxpages, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        # Named ``text`` rather than ``str`` so the builtin is not shadowed.
        text = retstr.getvalue()
    finally:
        try:
            device.close()
        except Exception:
            # Closing stays best-effort as in the original (which used a bare
            # ``except``): a failing close must not lose the extracted text.
            pass
        retstr.close()

    return text
def convert_pdf_to_txt(path):
    """
    This function converts a .pdf file to text
    @path: file path to .pdf document

    from: http://stackoverflow.com/questions/26494211/
    extracting-text-from-a-pdf-file-using-pdfminer-in-python/26495057#26495057

    Returns the extracted text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        # ``with`` closes the PDF even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text
示例#27
0
def pdf_to_txt(fichero_pdf, fichero_txt):
    """Write the text of the PDF ``fichero_pdf`` into the file ``fichero_txt``.

    :param fichero_pdf: path of the PDF file to read.
    :param fichero_txt: path of the text file to create (opened with ``'w'``).
    :return: always ``1``.
    """
    # Settings for reading the PDF.
    rotation = 0
    caching = True

    # Set up the resource manager.
    rsrcmgr = PDFResourceManager(caching=caching)

    # Create the output file and attach the device that converts into it.
    # Both files are handled by ``with`` so they are closed on errors too.
    with open(fichero_txt, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams(),
                               imagewriter=None)
        try:
            # Interpret every page of the PDF through the device.
            with open(fichero_pdf, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password='',
                                              caching=caching, check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            # Close the device before the output file it writes to.
            device.close()

    return 1
示例#28
0
def pdf_to_txt(path, lowerBorder=-1, upperBorder=-1):
    """Return the text of the PDF at ``path``, optionally limited to a page range.

    :param path: path of the PDF file to read.
    :param lowerBorder: first page number of the range, or ``-1``.
    :param upperBorder: last page number of the range (included), or ``-1``.
        With both borders ``-1`` the whole document is converted.
    :return: the extracted text, decoded from UTF-8.
    :raises ValueError: if only one border is given or
        ``lowerBorder > upperBorder``.
    """
    # The page selection is validated first so that no file handle is open
    # (and leaked) when the arguments are rejected.
    if lowerBorder == -1 and upperBorder == -1:
        pagenos = set()
    elif lowerBorder == -1 or upperBorder == -1 or lowerBorder > upperBorder:
        raise ValueError("illegal parameter passed")
    else:
        pagenos = set(range(lowerBorder, upperBorder + 1))

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
    s = retstr.getvalue()
    retstr.close()
    return s.decode('utf-8')
示例#29
0
	def getTexts(self):
		"""Extract the text of the PDF ``self.fname`` into ``self.text``.

		The text is written to the scratch file ``temppdf.txt`` in the current
		directory, read back and the scratch file is deleted again.

		:return: the string ``"ok"`` on success, otherwise the exception that
			was raised (it is returned, not re-raised).
		"""
		tmp_name = 'temppdf.txt'
		try:
			caching = True
			laparams = LAParams()
			rsrcmgr = PDFResourceManager(caching=caching)
			outfp = open(tmp_name, 'w')
			try:
				try:
					device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams)
					with open(self.fname, 'rb') as fp:
						process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='',
									caching=caching, check_extractable=True)
					device.close()
				finally:
					outfp.close()
				with open(tmp_name, 'rb') as infp:
					self.text = infp.read()
			finally:
				# The scratch file is removed on failure as well; the original
				# left it (and the open handles) behind.
				os.remove(tmp_name)
			return "ok"
		except Exception as e:
			return e
示例#30
0
文件: pdf2txt.py 项目: LoicH/aps
def pdf_to_txt(path):
    """converts pdf into a string
    @param path: path to the file
    @type path: string

    @return: pdf content with every form feed character removed
    @rtype: string"""

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # ``with`` closes the PDF even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
    s = retstr.getvalue()
    retstr.close()
    return s.replace('\x0c', '')
示例#31
0
def scrape(filepath: str) -> Dict:
    """Read SOP metadata from the first page of a PDF.

    The text of the first page (with newlines removed) is searched for the
    SOP number and the effective, expiry/review and replacement dates; every
    value that is found is stored on a fresh ``SOP`` object.

    :param filepath: path of the file; its extension must be ``.pdf``.
    :return: the ``__dict__`` of the ``SOP`` object.
    :raises FileTypeException: if the extension is not ``.pdf``.
    """
    sop = SOP()
    resource_manager = PDFResourceManager()
    sink = StringIO()
    converter = TextConverter(resource_manager, sink, codec="utf-8",
                              laparams=LAParams())
    extension = os.path.splitext(filepath)[1]
    if extension != ".pdf":
        raise FileTypeException

    # (attribute on the SOP object, pattern, capture group holding the value)
    fields = (
        ("number", r"SOP\s(\d{1}\-\d+)", 1),
        ("effective_date", r"Effective\:\s(\d{1,2}\/\d{2}\/\d{2})", 1),
        ("expires_date", r"(Review\sDue|Expires)\:\s(\d{1,2}\/\d{2}\/\d{2})", 2),
        ("replaces_date", r"Replaces\:\s(\d{1,2}\/\d{2}\/\d{2})", 1),
    )

    with open(filepath, "rb") as pdf_file:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        first_page = next(
            PDFPage.get_pages(pdf_file,
                              set(),
                              maxpages=1,
                              caching=False,
                              check_extractable=True))
        page_interpreter.process_page(first_page)
        text = sink.getvalue().replace("\n", "")
        for attribute, pattern, group_index in fields:
            found = re.search(pattern, text)
            try:
                setattr(sop, attribute, found.group(group_index))
            except AttributeError:
                # No match (``found`` is None): the attribute is left untouched.
                pass
        return sop.__dict__
示例#32
0
def _pdf2text_extract_text_layer(pdf_path):
    """Return the text pdfminer extracts from ``pdf_path`` (possibly empty)."""
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        with open(pdf_path, 'rb') as pdf_file:
            for page in PDFPage.get_pages(pdf_file):
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()


def _pdf2text_ocr(pdf_path, out_file):
    """Render ``pdf_path`` to page images, OCR them into ``out_file``, delete the images."""
    image_names = []
    try:
        for number, page in enumerate(convert_from_path(pdf_path, 500), start=1):
            filename = "page_" + str(number) + ".jpg"
            image_names.append(filename)
            page.save(filename, 'JPEG')
        for filename in image_names:
            with Image.open(filename) as image:
                text = str(pytesseract.image_to_string(image))
            # Re-join words that were hyphenated across a line break.
            out_file.write(text.replace('-\n', ''))
    finally:
        # Only the images created by this job are removed. The original
        # deleted every jpg matching ``output_path + '*'``.
        for filename in image_names:
            if os.path.exists(filename):
                os.remove(filename)


def pdf2text(path):
    """Convert the PDF at ``path`` to a ``.txt`` file inside ``output_path``.

    The text layer is extracted with pdfminer; when that fails or yields no
    text (assumed to mean a scanned, image-only PDF -- TODO confirm this
    matches the intended trigger) the pages are rendered to images and run
    through OCR instead. The result is appended to ``<name>.txt`` in
    ``output_path``. As before, the process working directory is changed to
    ``output_path``.

    :param path: path of the PDF; the file name must end in ``.pdf`` or ``.PDF``.
    :raises ValueError: if the file name has neither extension (the original
        failed here with an unbound-variable error).
    """
    new_f_name = os.path.split(path)[-1]
    if new_f_name.endswith('.pdf'):
        new_f_txt = new_f_name.replace('.pdf', '.txt')
    elif new_f_name.endswith('.PDF'):
        new_f_txt = new_f_name.replace('.PDF', '.txt')
    else:
        raise ValueError('Not a PDF file: %s' % path)
    print(new_f_txt)
    # Resolved before chdir so a relative input path keeps pointing at the PDF.
    pdf_path = os.path.abspath(path)
    os.chdir(output_path)
    with open(new_f_txt, "a") as f:
        try:
            # The original handed the *output* text file to pdfminer, so this
            # step always failed and OCR always ran.
            text = _pdf2text_extract_text_layer(pdf_path)
        except Exception:
            # Best effort, as before: any extraction failure falls back to OCR.
            text = ''
        if text.strip():
            f.write(text)
            print(text)
            return
        print('This pdf contains images. Starting OCR....')
        _pdf2text_ocr(pdf_path, f)
    print('OCRed file: %s is available' % (new_f_txt))
    print('Deleted all images to the above job')
示例#33
0
def request_pdf(url, case_id, court_name):
    """Download a PDF, store it, extract its text, store and return the text.

    The PDF is saved under ``Data_Files/PDF_Files`` and the text under
    ``Data_Files/Text_Files`` (both relative to ``module_directory``), named
    ``<court_name>_<slugified case_id>``.

    :param url: address of the PDF.
    :param case_id: case identifier used in log messages and file names.
    :param court_name: prefix of the stored file names.
    :return: the extracted text, or the string ``"NULL"`` when the request
        does not return status 200 or any error occurs (errors are logged).
    """
    try:
        response = requests.request("GET", url, proxies=proxy_dict)
        if response.status_code != 200:
            logging.error("Failed to get text file for: " + str(case_id))
            return "NULL"

        if response.text is None:
            logging.error("No data for: " + str(case_id))
            return "NULL"

        base_name = court_name + "_" + slugify(case_id)
        pdf_path = module_directory + "/../Data_Files/PDF_Files/" + base_name + ".pdf"
        # The write handle must be closed (flushed) before the file is read
        # back. The original kept it open, so pdfminer could see a truncated
        # file.
        with open(pdf_path, "wb") as pdf_out:
            pdf_out.write(response.content)

        pdf_manager = PDFResourceManager()
        string_io = StringIO()
        pdf_to_text = TextConverter(pdf_manager,
                                    string_io,
                                    codec='utf-8',
                                    laparams=LAParams())
        interpreter = PDFPageInterpreter(pdf_manager, pdf_to_text)
        try:
            with open(pdf_path, 'rb') as pdf_in:
                for page in PDFPage.get_pages(pdf_in):
                    interpreter.process_page(page)
            # Read once after the loop; the buffer accumulates all pages.
            text_data = string_io.getvalue()
        finally:
            pdf_to_text.close()
            string_io.close()

        txt_path = module_directory + "/../Data_Files/Text_Files/" + base_name + ".txt"
        with open(txt_path, "w") as txt_out:
            txt_out.write(str(text_data))

        return str(text_data)

    except Exception as e:
        # Lazy %-arguments keep a '%' inside case_id from breaking the format.
        logging.error(
            "Failed to get pdf file for: %s. Error: %s", case_id, e)
        return "NULL"
示例#34
0
def extract_text(data):
    """Extract the text of an in-memory PDF with pdfminer.

    Works with both pdfminer layouts: the one exposing
    ``pdfminer.pdfdocument`` / ``pdfminer.pdfpage`` and the older one where
    ``PDFDocument`` lives in ``pdfminer.pdfparser``.

    Args:
        data: The raw bytes of the PDF.

    Returns:
        The content of the output buffer (a ``BytesIO`` on Python 2, a
        ``StringIO`` otherwise), or None when the document raises
        ``PDFSyntaxError`` while being opened.

    Raises:
        ImportError: If pdfminer cannot be imported.
    """
    try:
        try:
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfpage import PDFPage
        except ImportError:
            # Older layout: the document class is exported by pdfparser.
            from pdfminer.pdfparser import PDFDocument
            has_new_api = False
        else:
            has_new_api = True
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
        from pdfminer.converter import TextConverter
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    except ImportError:
        raise ImportError('Please install python3-pdfminer to parse PDF')

    pdf_parser = PDFParser(BytesIO(data))
    try:
        if has_new_api:
            document = PDFDocument(pdf_parser)
        else:
            # The older API wires parser and document together by hand.
            document = PDFDocument()
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)
    except PDFSyntaxError:
        return

    manager = PDFResourceManager()
    sink = BytesIO() if sys.version_info.major == 2 else StringIO()
    page_interpreter = PDFPageInterpreter(manager, TextConverter(manager, sink))

    if has_new_api:
        page_iter = PDFPage.create_pages(document)
    else:
        document.initialize()
        page_iter = document.get_pages()

    for pdf_page in page_iter:
        page_interpreter.process_page(pdf_page)

    return sink.getvalue()
示例#35
0
def uploaded_file():
    """Store an uploaded PDF and return its text reduced to letters and digits.

    Reads the upload from ``request.files['file']``, saves it under
    ``app.config['UPLOAD_FOLDER']`` using the secured file name, extracts the
    text with pdfminer and replaces every character that is not an ASCII
    letter, a digit or a newline with a space.

    Returns:
        The cleaned text for an accepted upload; otherwise a redirect to the
        ``index`` endpoint (missing file part, empty file name, file name
        rejected by ``allowed_file``, or a request that is not a POST).
    """
    if request.method != 'POST':
        # Previously fell off the end and returned None for non-POST requests.
        return redirect(url_for('index'))

    # check if the post request has the file part
    if 'file' not in request.files:
        return redirect(url_for('index'))

    upload = request.files['file']
    if upload.filename == '':
        flash('No file selected for uploading')
        return redirect(url_for('index'))
    if not (upload and allowed_file(upload.filename)):
        return redirect(url_for('index'))

    filename = secure_filename(upload.filename)
    # Parse the file from the location it was saved to; the path used to be
    # rebuilt as "/tmp/" + filename, which only worked when UPLOAD_FOLDER
    # happened to be /tmp.
    pdf_file = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    upload.save(pdf_file)

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    with open(pdf_file, 'rb') as fp:
        for page in PDFPage.get_pages(fp,
                                      set(),
                                      maxpages=0,
                                      password="",
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)

    # Clean the text once, after the last page; this also yields an empty
    # string (instead of an unbound name) for a PDF without pages.
    return re.sub('[^a-zA-Z0-9\n]', ' ', retstr.getvalue())
示例#36
0
    def extractToText(self,fileName, oFileName=None):
        """
        Extract all the text of a PDF into a text file using pdfminer.

        Args:
            fileName: Path of the PDF to read. Names that do not contain
                ".pdf" are skipped.
            oFileName: Path of the text file to write. When omitted it is
                fileName with ".pdf" replaced by ".txt".

        Returns:
            None. Returns before opening any file when fileName does not
            contain ".pdf".
        """
        # Only allow processing of pdfs here
        if ".pdf" not in fileName:
            return None

        if oFileName is None:
            oFileName = fileName.replace(".pdf",".txt")

        retStr = StringIO()
        device = TextConverter(self.rsrc_mgr_, retStr, codec=self.codec_, laparams=LAParams(char_margin= 20))
        interpreter = PDFPageInterpreter(self.rsrc_mgr_, device)

        # The context managers close the files even when parsing or writing
        # raises; they used to stay open in that case.
        with open(fileName,"rb") as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="", caching=True, check_extractable=True):
                interpreter.process_page(page)

        with open(oFileName,"w") as oFile:
            oFile.write(retStr.getvalue())
示例#37
0
文件: PDFToText.py 项目: mvthanh/BAP
def pdfparser(data):
    """Write the text of a PDF, without its last two pages, to ``textTA.txt``.

    The page count comes from ``PdfFileReader``; pages are then fed to
    pdfminer in order and processing stops once only two pages remain. The
    collected text is encoded with ``str.encode()`` and written to
    ``textTA.txt`` in the current working directory.

    Args:
        data: Path of the PDF file to read.

    Returns:
        None.
    """
    with open(data, "rb") as counted:
        pages = PdfFileReader(counted, strict=False).getNumPages()

    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document except the last two.
    with open(data, 'rb') as fp:
        for i, page in enumerate(PDFPage.get_pages(fp), start=1):
            if i > pages - 2:
                break
            interpreter.process_page(page)

    # Take the text from the buffer: the path argument used to be reused as
    # the text variable, so a PDF with two pages or fewer wrote its own path
    # to the output file.
    text = retstr.getvalue()
    with open("textTA.txt", "wb") as out:
        out.write(text.encode())
def first_page_str(og_filename):
    """Return the text on the first page of a PDF, stripped of outer whitespace.

    Args:
        og_filename: Path of the PDF file to read.

    Returns:
        The extracted text up to the first form feed (``"\\x0c"``), stripped;
        an empty string when nothing was extracted.
    """
    output_string = StringIO()

    with open(og_filename, "rb") as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # Only the first page is wanted, so the rest of the document is
            # not interpreted at all.
            break

    # Form feeds separate pages in the extracted text; keep the first chunk.
    return output_string.getvalue().split("\x0c")[0].strip()
示例#39
0
def pdfparser(in_path,out_path):
    """Extract the text of a PDF, write it to a file and return it.

    Args:
        in_path: Path of the PDF file to read.
        out_path: Path of the UTF-8 text file to write.

    Returns:
        The extracted text (an empty string for a PDF without pages).
    """
    codec = 'utf-8'
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document.
    with open(in_path, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    # Write the output once, after the last page; the file used to be
    # reopened and rewritten with the cumulative text after every page.
    data = retstr.getvalue()
    with open(out_path, 'w', encoding=codec) as f:
        f.write(data)
    return data
示例#40
0
 def read(filepath):
     """
     Reads PDF saved at filepath and returns list of strings containing text from each page of
     the PDF.

     :param filepath: path of the PDF file to read
     :return: list with one string per page, in page order
     """
     pages = []
     with open(filepath, 'rb') as fp:
         resource_manager = PDFResourceManager()
         buffer = io.StringIO()
         device = TextConverter(resource_manager,
                                buffer,
                                laparams=LAParams())
         interpreter = PDFPageInterpreter(resource_manager, device)
         consumed = 0
         for page in PDFPage.get_pages(fp):
             interpreter.process_page(page)
             # The buffer accumulates across pages, so keep only what this
             # page added; each entry used to repeat all earlier pages.
             data = buffer.getvalue()
             pages.append(data[consumed:])
             consumed = len(data)
     return pages
示例#41
0
def pdf_to_text(path):
    """Extract a PDF's text, run ``nlp`` over it and wrap the result in a Response.

    Args:
        path: Path of the PDF file to read.

    Returns:
        ``Response`` built from a dict with the text without newlines under
        ``"pdf_to_text"`` and, under ``"text_label"``, one
        ``{'Text': ..., 'Label': ...}`` dict per entity in ``doc.ents``.
        Every entity is also printed.
    """
    with open(path, 'rb') as fp:
        manager = PDFResourceManager()
        sink = io.StringIO()
        converter = TextConverter(manager, sink, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(manager, converter)
        for pdf_page in PDFPage.get_pages(fp):
            page_interpreter.process_page(pdf_page)
        text = sink.getvalue()

    case_list = []
    for entity in nlp(text).ents:
        case_list.append({'Text': entity.text, 'Label':  entity.label_ })
        print(entity.text,entity.label_)

    return Response({"pdf_to_text":text.replace('\n',''),
                     "text_label":case_list
                     })
示例#42
0
def pdfparser(data):
    """Extract the text of a PDF, reporting the first page that fails.

    Args:
        data: Path of the PDF file to read.

    Returns:
        The extracted text. If a page cannot be processed, the exception and
        a message naming the file and page are printed and the exception
        instance is returned instead.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document.
    with open(data, 'rb') as fp:
        for i, page in enumerate(PDFPage.get_pages(fp)):
            try:
                interpreter.process_page(page)
            # NOTE(review): this used to catch a name ``error`` that is not
            # defined in the visible code - confirm nothing narrower was meant.
            except Exception as e:
                print(e)
                # Parenthesised: "%d" % i + 1 tried to add 1 to a string.
                print(data, "failed to read %dth file" % (i + 1))
                return e

    # Read from the buffer so a PDF without pages yields "" rather than the
    # path that was passed in.
    return retstr.getvalue()
示例#43
0
def parse(fname):  # pylint: disable=too-many-branches
    """Extract the text of the PDF at ``fname`` and print it line by line.

    NOTE(review): ``res`` and ``current_stage`` are initialised but never
    used in the visible code; this excerpt appears to stop before the part
    that consumes them - confirm against the full function.
    """
    fp = open(fname, 'rb')
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.

    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        # Re-read after every page; the value kept is the one read after the
        # last page. ``data`` stays unbound if the loop never runs.
        data = retstr.getvalue()

    res = defaultdict(set)
    current_stage = 'title'

    # Iterating a StringIO yields the extracted text one line at a time.
    for line in StringIO(data):
        # Python 2 print statement; the trailing comma suppresses the extra
        # newline because ``line`` already ends with one.
        print line,
示例#44
0
    def __init__(self, file_object, password=''):
        """Parse ``file_object`` as a PDF and collect its pages in ``self.pdf``.

        Args:
            file_object: Open file object handed to ``PDFParser``.
            password: Password passed to ``PDFDocument.initialize``.

        The converter, interpreter and ``self.pdf`` attributes are only set
        when the document reports itself as extractable.
        """
        self.pdf_document = document = PDFDocument()
        self.parser = parser = PDFParser(file_object)

        # Parser and document have to be linked in both directions before
        # the document is initialised.
        parser.set_document(document)
        document.set_parser(parser)
        document.initialize(password)

        if not document.is_extractable:
            return

        self.resource_manager = PDFResourceManager()
        self.text_converter = TextConverter(self.resource_manager,
                                            outfp=StringIO())
        self.interpreter = PDFPageInterpreter(self.resource_manager,
                                              self.text_converter)
        self.pdf = PDF(metadata=document.info)

        for pdf_page in document.get_pages():
            processed = self.interpreter.process_page(pdf_page)
            self.pdf.append_page(processed)
示例#45
0
    def _index_pdf(self, bin_data):
        '''Return the text of a PDF given as bytes, for indexing.

        Returns None when ``PDFResourceManager`` is None, and an empty
        string when ``bin_data`` does not start with ``%PDF-`` or when
        extraction raises.
        '''
        if PDFResourceManager is None:
            return

        extracted = u""
        if not bin_data.startswith(b'%PDF-'):
            return extracted

        stream = io.BytesIO(bin_data)
        try:
            manager = PDFResourceManager()
            with io.StringIO() as sink, TextConverter(manager, sink) as converter:
                # Only CRITICAL pdfminer log records are let through.
                logging.getLogger("pdfminer").setLevel(logging.CRITICAL)
                page_interpreter = PDFPageInterpreter(manager, converter)

                for pdf_page in PDFPage.get_pages(stream):
                    page_interpreter.process_page(pdf_page)

                extracted = sink.getvalue()
        except Exception:
            # Best effort: any failure leaves the text empty.
            pass
        return extracted
def pdf_parser(data):
    """
    Extract the text of a PDF file.

    :param data: Path of the PDF file to read
    :return: The converted text (an empty string for a PDF without pages)
    """
    rsrc_mgr = PDFResourceManager()
    ret_str = io.StringIO()
    la_params = LAParams()
    device = TextConverter(rsrc_mgr, ret_str, laparams=la_params)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrc_mgr, device)

    # Process each page contained in the document; the file is closed even
    # if a page fails to parse.
    with open(data, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    # Read the buffer once; a PDF without pages used to return the path.
    return ret_str.getvalue()
示例#47
0
def pdfparser(data):
    """Extract the text of a PDF into a ``.txt`` file next to it.

    The output path is the input path with its extension replaced by
    ``.txt``. Characters that cannot be encoded as cp850 are replaced
    before writing. Prints ``done`` when finished.

    Args:
        data: Path of the PDF file to read.

    Returns:
        None.
    """
    import os

    # Swap only the extension: replacing every "pdf" substring also rewrote
    # directory names and, for names without a lowercase "pdf", produced the
    # input path itself, which was then truncated by opening it for writing.
    out_path = os.path.splitext(data)[0] + '.txt'

    rsrcmgr = PDFResourceManager(
    )  #to store shared resources such as fonts or images
    retstr = io.StringIO()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF device object
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document.
    with open(data, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    text = retstr.getvalue()
    with open(out_path, 'w') as f:
        # Round-trip through cp850 so unencodable characters become "?".
        f.write(text.encode('cp850', 'replace').decode('cp850'))
    print("done")
示例#48
0
def parse(file):
    """
    Extract the page count and the text of a PDF.

    Args:
        file: Path of the PDF file to read.

    Returns:
        A string made of the ``Count`` entry of the document's ``Pages``
        catalog object, a single space, and the extracted text.
    """
    text_buffer = StringIO()
    with open(file, 'rb') as in_file:
        doc = PDFDocument(PDFParser(in_file))
        manager = PDFResourceManager()
        layout = LAParams(detect_vertical=True)
        converter = TextConverter(manager, text_buffer, laparams=layout)
        page_interpreter = PDFPageInterpreter(manager, converter)
        for pdf_page in PDFPage.create_pages(doc):
            page_interpreter.process_page(pdf_page)

    page_count = str(resolve1(doc.catalog["Pages"])["Count"])
    return "{} {}".format(page_count, text_buffer.getvalue())
示例#49
0
def pdf_to_text_pdfminer(path):
    """Return the text of the PDF at ``path``, extracted with pdfminer.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The extracted text (an empty string for a PDF without pages).
    """
    # Initialise / settings
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)

    # Create PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each PDF page; the file is closed even if a page fails.
    with open(path, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file):
            interpreter.process_page(page)

    # Read the buffer once; the result used to be unbound for a PDF
    # without pages.
    return retstr.getvalue()
示例#50
0
def pdfparser(s):
    """Print the text extracted from an in-memory PDF.

    Args:
        s: The content of the PDF, wrapped in a ``StringIO`` for the parser.

    Returns:
        None; the extracted text is printed.
    """
    # Python 2 module; this function targets Python 2.
    from StringIO import StringIO
    m_file = StringIO(s)

    parser = PDFParser(m_file)

    document = PDFDocument(parser)
    rsmgr = PDFResourceManager()
    rstr = StringIO()
    lpm = LAParams()
    cdc = 'utf-8'

    # Fixed: the manager was referenced as the misspelt ``rsmgs``.
    device = TextConverter(rsmgr, rstr, codec=cdc, laparams=lpm)

    # Fixed: the interpreter class is PDFPageInterpreter, the class every
    # other extractor in this file uses for process_page.
    interpreter = PDFPageInterpreter(rsmgr, device)

    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)

    # Print once after the last page; printing inside the loop repeated the
    # cumulative text for every page.
    print(rstr.getvalue())
示例#51
0
def read_pdf_text(path, retured_value):
    """Extract the text of a PDF into a ``StringIO`` while printing progress.

    Args:
        path: Path of the PDF file to read; a falsy value skips reading.
        retured_value: Ignored; the name is rebound to the output buffer.

    Returns:
        The ``StringIO`` buffer holding the extracted text (empty when
        ``path`` is falsy).
    """
    output_string = StringIO()
    if not path:
        return output_string

    with open(path, 'rb') as file:
        fileDoc = PDFDocument(PDFParser(file))
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        fileLen = resolve1(fileDoc.catalog['Pages'])['Count']
        for page_number, page in enumerate(PDFPage.create_pages(fileDoc), start=1):
            # Progress is reported before the page itself is processed.
            precent = int(round((page_number / fileLen) * 100))
            print(f"reading at {precent}%")
            interpreter.process_page(page)

    return output_string
示例#52
0
def convertWithPdfMiner(fname):
    """Return a list holding the text of each page of the PDF at ``fname``.

    Args:
        fname: Path of the PDF file to read.

    Returns:
        A list with one entry per page, in page order.
    """
    pages_text = []
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    codec = 'utf-8'  # ISO-8859-1 is good for foreign languages
    laparams = LAParams()
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF; it used to be left open.
    with open(fname, "rb") as pdf:
        for page in PDFPage.get_pages(pdf):
            # Get (and store) the "cursor" position of stream before reading from PDF
            # On the first page, this will be zero
            read_position = sio.tell()
            interpreter.process_page(page)
            # Rewind to where this page started and read only what it added.
            sio.seek(read_position, 0)
            pages_text.append(sio.read())
    return pages_text
def pdfparser(input_path, fname):
    """Extract the text of a PDF into a one-column DataFrame, one row per line.

    Args:
        input_path: Directory containing the PDF.
        fname: File name of the PDF inside ``input_path``.

    Returns:
        A DataFrame with a single column named ``value`` holding the
        extracted text split on newlines.
    """
    filename = os.path.join(input_path, fname)
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document; the file is closed even
    # if a page fails.
    with open(filename, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    # Read the buffer once; the text used to be unbound for a PDF without
    # pages.
    txt_string = retstr.getvalue()

    ori_df = DataFrame(re.split("\n", txt_string))
    ori_df.columns = ["value"]

    return ori_df
示例#54
0
def get_text_lines(location):
    """
    Return the list of text lines extracted from the pdf file at
    `location`, line endings included. May raise exceptions.
    """
    sink = BytesIO()
    lines = []
    layout_params = LAParams()
    with open(location, 'rb') as pdf_file, \
            contextlib.closing(PDFParser(pdf_file)) as parser:
        document = PDFDocument(parser)
        manager = PDFResourceManager()
        converter = TextConverter(manager, sink, laparams=layout_params)
        with contextlib.closing(converter) as extractor:
            interpreter = PDFPageInterpreter(manager, extractor)
            for pdf_page in PDFPage.create_pages(document):
                interpreter.process_page(pdf_page)
            # Read the buffer while the converter is still open.
            lines = sink.getvalue().splitlines(True)
    return lines
示例#55
0
    def skip_test_pdfminer(self):
        """Print the pdfminer text of ``TestHierarchy.straight_forward_doc``."""
        from io import StringIO

        from pdfminer.converter import TextConverter
        from pdfminer.layout import LAParams
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.pdfpage import PDFPage
        from pdfminer.pdfparser import PDFParser

        text_buffer = StringIO()
        with open(TestHierarchy.straight_forward_doc, 'rb') as in_file:
            document = PDFDocument(PDFParser(in_file))
            manager = PDFResourceManager()
            converter = TextConverter(manager, text_buffer, laparams=LAParams())
            page_interpreter = PDFPageInterpreter(manager, converter)
            for pdf_page in PDFPage.create_pages(document):
                page_interpreter.process_page(pdf_page)

        extracted = text_buffer.getvalue()
        print(extracted)
示例#56
0
def read_pdf(list_file):
    """Return the files from ``list_file`` whose text contains a marker string.

    A file is selected when its extracted text contains ``VENCIMENTO``,
    ``Fatura de Pagamento`` or ``NOTA FISCAL DE SERVIÇOS``.

    Args:
        list_file: Iterable of paths of PDF files to scan.

    Returns:
        A list with each matching path once, in input order.
    """
    markers = ('VENCIMENTO', 'Fatura de Pagamento', 'NOTA FISCAL DE SERVIÇOS')
    list_due_date = []
    for f in list_file:
        with open(f, 'rb') as fp:
            rsrcmgr = PDFResourceManager()
            retstr = io.StringIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = TextConverter(rsrcmgr,
                                   retstr,
                                   codec=codec,
                                   laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
                data = retstr.getvalue()
                if any(marker in data for marker in markers):
                    list_due_date.append(f)
                    # Stop at the first hit: the buffer is cumulative, so
                    # every later page matched again and the same file was
                    # appended once per remaining page.
                    break

    return list_due_date
示例#57
0
def pdf2string(path):
    """
    From a given pdf path, it creates a string of the pdf.
    :param path: Path to the pdf file
    :return: string of the pdf file
    """
    # Create a PDF interpreter object. (pdfminer)
    retstr = io.StringIO()
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document. The context manager
    # closes the file, which used to be left open.
    with open(path, 'rb') as file_in:
        for page in PDFPage.get_pages(file_in):
            interpreter.process_page(page)

    return retstr.getvalue()
示例#58
0
    def textPDF_to_text(self):
        """Convert each per-page PDF in the temp folder to a text file.

        For every index below ``self.number_of_pages`` the file
        ``<temp_folder_name>//<file_name>_<index>.pdf`` is read and its text
        is written next to it with a ``.txt`` extension.

        Returns:
            None.
        """
        for i in range(self.number_of_pages):
            page_stem = '{}//{}_{}'.format(self.temp_folder_name, self.file_name, i)

            rsrcmgr = PDFResourceManager()
            retstr = io.StringIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Process each page contained in the document; the input file is
            # closed afterwards instead of being left open.
            with open(page_stem + '.pdf', 'rb') as fp:
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)

            # Read this file's own buffer: the text variable used to be set
            # inside the page loop, so a PDF yielding no pages reused the
            # previous file's text (or was unbound on the first file).
            with open(page_stem + '.txt', 'w') as textfile:
                textfile.write(retstr.getvalue())
def convertPDFToText(path):
    """
    Convert a PDF file to text.

    path: path of the file to be converted

    Returns the extracted text (an empty string for a PDF without pages).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document; the file is closed even
    # if a page fails.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    # Read the buffer once; the result used to be unbound for a PDF
    # without pages.
    return retstr.getvalue()
示例#60
0
    def extract_additional_content(
            cls,
            content_arg: str) -> t.List[t.Tuple[t.Type[ContentType], str]]:
        """Extract the text of a ``.pdf`` file as text content.

        Args:
            content_arg: Path of the PDF file to read.

        Returns:
            A one-element list pairing ``TextContent`` with the extracted
            text.

        Raises:
            Exception: If ``content_arg`` is not an existing file or its
                suffix is not ``.pdf``.
        """
        pdf_path = Path(content_arg)
        if not pdf_path.is_file():
            raise Exception(f"Not a file: {content_arg}")
        if pdf_path.suffix != ".pdf":
            raise Exception(f"Not a .pdf: {content_arg}")

        buffer = StringIO()
        with pdf_path.open("rb") as in_file:
            document = PDFDocument(PDFParser(in_file))
            manager = PDFResourceManager()
            converter = TextConverter(manager, buffer, laparams=LAParams())
            page_interpreter = PDFPageInterpreter(manager, converter)
            for pdf_page in PDFPage.create_pages(document):
                page_interpreter.process_page(pdf_page)

        extracted = buffer.getvalue()
        return [(TextContent, extracted)]