示例#1
0
文件: main.py 项目: pvalente/pdfyql
  def get(self):
    """Download the PDF named by the ``url`` query parameter and reply with its text as XML.

    Query parameters read from the request:
      url -- percent-encoded address of the PDF; it is unquoted before fetching.
      pg  -- optional 1-based page number; when absent, every page is returned.

    The response is a ``<document>`` element carrying the requested URL and
    the PDF's title as attributes, and one ``<page>`` element per extracted
    page whose text is wrapped in a CDATA section. A PDF without a title
    yields an empty ``title`` attribute.
    """
    from contextlib import closing
    from xml.sax.saxutils import escape

    # The attributes in the output are single-quoted, so both kinds of quote
    # are escaped in addition to the &, < and > that escape() always handles.
    attr_entities = {"'": "&apos;", '"': "&quot;"}

    def page_xml(number, text):
        # A literal "]]>" inside the text would terminate the CDATA section
        # early; splitting it across two sections keeps the XML well-formed.
        safe_text = text.replace("]]>", "]]]]><![CDATA[>")
        return "<page number='%d'><![CDATA[%s]]></page>\n" % (number, safe_text)

    url = self.request.get('url')
    if not url:
        url = ""

    pg = self.request.get('pg')
    # None means "all pages"; otherwise a 0-based page index.
    # NOTE(review): pg is not validated -- a non-numeric value raises
    # ValueError, pg=0 maps to index -1 (the last page), and a value past the
    # end raises from getPage(). Confirm how errors should be reported.
    page_index = int(pg) - 1 if pg else None

    # NOTE(review): the URL comes straight from the request and urlopen also
    # accepts non-HTTP schemes (e.g. file://); consider restricting it.
    with closing(urllib2.urlopen(urllib2.unquote(url))) as remote:
        data = remote.read()

    output = StringIO.StringIO()
    try:
        output.write(data)
        p = PdfFileReader(output)

        info = p.getDocumentInfo()
        title = info.title if info is not None else None
        if title is None:
            title = ""

        if page_index is None:
            content = "".join(
                page_xml(i + 1, p.getPage(i).extractText())
                for i in range(p.getNumPages()))
        else:
            content = page_xml(page_index + 1, p.getPage(page_index).extractText())
    finally:
        output.close()

    result = "<?xml version='1.0' encoding='UTF-8'?>\n<document url='%s' title='%s'>%s</document>" % (
        escape(url, attr_entities), escape(title, attr_entities), content)

    self.response.headers['Content-type'] = 'application/xml'
    self.response.out.write(result)
示例#2
0
	def parse(self, file_full, statdata):
		"""Return the extracted text of every page of a PDF, in page order.

		file_full -- path of the PDF file to read.
		statdata  -- not used by this method.

		The page count is stored in ``self._extra['pages']``. Each page's text
		is preceded by a single space in the returned string.
		"""
		# The context manager closes the file even if extraction raises.
		with open(file_full, 'rb') as handle:
			pdf = PdfFileReader(handle)

			pages = pdf.getNumPages()
			self._extra['pages'] = pages

			# getPage() takes a 0-based index; range(pages) already yields
			# 0..pages-1, so no offset is applied (an offset of -1 would
			# start with the last page).
			return ''.join(
				' ' + pdf.getPage(pagenr).extractText()
				for pagenr in range(pages))
示例#3
0
    def mergePdf(self):
        """Stamp a title/URL header and a creation timestamp onto every page of the temporary PDF.

        A one-page overlay is drawn with ReportLab and merged onto each page
        of ``self.tempPdfFile``; the result is written to
        ``self.filePath + self.fileName`` and the temporary file is removed.

        If ``self.title`` is empty it is first derived from the last path
        segment of ``self.url``. If the temporary PDF does not exist, the
        result of ``self.printWebHtmlToPdf(...)`` is returned instead.
        """
        self.threadPdfWritingStatus.emit(
            '<b>Setting Title for</b> %s<b>. Please Wait...</b><br />' % self.url)

        packet = StringIO()
        # Build the overlay page in memory with ReportLab.
        pdfCanvas = canvas.Canvas(packet, pagesize=A4)
        pdfCanvas.setFont('Helvetica', 8)
        if len(self.title) == 0:
            # Fall back to the file-name part of the URL, minus its extension,
            # with underscores shown as spaces.
            self.title = str(self.url).split('/')[-1]
            self.title = self.regex.getSearchedData(r'(?i)([a-zA-Z0-9-_ ]*?)\.[a-zA-Z0-9_]*$', self.title)
            self.title = self.regex.replaceData(r'(?i)_', ' ', self.title)

        # Title and URL are each shortened based on their OWN length.
        if len(self.title) > 60:
            title = unicode(self.title[:57] + '...')
        else:
            title = unicode(self.title)
        if len(self.url) > 60:
            url = self.url[:57] + '...'
        else:
            url = self.url

        pdfCanvas.drawString(5, 830, title + '                      ' + str(url).lower())
        d = datetime.datetime.now()
        strDate = str(d.strftime("%Y-%m-%d %H-%M-%S %p"))
        pdfCanvas.drawString(420, 5, 'Created Date Time: ' + strDate)
        pdfCanvas.save()
        packet.seek(0)
        newPdf = PdfFileReader(packet)

        if not os.path.exists(self.tempPdfFile):
            return self.printWebHtmlToPdf(self.url, self.filePath, self.fileName)

        writer = PdfFileWriter()
        # The source file stays open until the writer has finished, and both
        # files are closed even if merging or writing raises.
        with open(self.tempPdfFile, 'rb') as tmpPdfFile:
            reader = PdfFileReader(tmpPdfFile)
            overlayPage = newPdf.getPage(0)
            for i in range(reader.getNumPages()):
                page = reader.getPage(i)
                page.mergePage(overlayPage)
                writer.addPage(page)
            print('Filename: ' + self.fileName)
            with open(self.filePath + self.fileName, "wb") as outputStream:
                writer.write(outputStream)
        os.remove(str(self.tempPdfFile))
示例#4
0
def extractTextFromPdfStream(stream):
    """Return the text of every page read from ``stream``, one page per line-separated chunk.

    ``stream`` is handed to ``PdfFileReader``; the extracted text of each
    page, in page order, is joined with newline characters.
    """
    pdf = PdfFileReader(stream)
    pageTexts = []
    for index in range(pdf.getNumPages()):
        pageTexts.append(pdf.getPage(index).extractText())
    return '\n'.join(pageTexts)
示例#5
0
def extract_text_from_pdf_stream(stream):
    """Extract the text of each page of the PDF in ``stream`` and join the pages with newlines.

    ``stream`` is passed to ``PdfFileReader``; pages are processed in order.
    """
    document = PdfFileReader(stream)
    page_count = document.getNumPages()
    texts = [
        document.getPage(number).extractText()
        for number in range(page_count)
    ]
    return '\n'.join(texts)