Python PDF示例，slate.PDF Python示例

示例#1

0

显示文件

def analisa_siope_pdf(ano, uf, cod_uf, municipio, cod_municipio):
    """Download one municipality's PDF report and pull two values out of it.

    The PDF is fetched from a URL assembled from the module-level ``pdf_url``
    and ``periodo`` together with ``cod_municipio`` and ``ano``, and saved
    locally as ``arquivos/<ano>/<cod_uf>/<cod_municipio>.pdf``.

    Returns a tab-separated, newline-terminated line. On success it contains
    ``ano``, ``uf [cod_uf]``, ``municipio [cod_municipio]`` and the two
    extracted values; when the download raises ``TypeError`` it contains only
    the identification columns.
    """
    local_path = os.path.join('arquivos', ano, cod_uf, cod_municipio + '.pdf')

    # Make sure the destination directory exists before downloading.
    target_dir = os.path.dirname(local_path)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    url = '_'.join((pdf_url, cod_municipio, periodo, ano)) + '.pdf'

    # Download the matching PDF file and store it locally.
    try:
        with request.urlopen(url) as response, open(local_path, 'wb') as sink:
            shutil.copyfileobj(response, sink)
    except TypeError:
        # When the file for this municipality and year does not exist, the
        # FTP server answers with an access-permission error
        # (one example is São Paulo - SP - 2012).
        return '%s\t%s [%s]\t%s [%s]\n' % (ano, uf, cod_uf, municipio, cod_municipio)

    # PDF parsing. A horror.
    with open(local_path, mode="rb") as pdf_file:
        pages = PDF(pdf_file)

        # The wanted values are always (I think) on page 4 of the PDF, between
        # the two terms listed in 'termos_de_busca' for the given year.
        page = pages[3]
        start_marker = termos_de_busca[ano][0]
        begin = page.index(start_marker) + len(start_marker)
        end = page.index(termos_de_busca[ano][1])
        remaining = page[begin:end]

        # Split the text into chunks, each ending two characters after a comma
        # (presumably a decimal comma followed by two digits - TODO confirm).
        chunks = []
        while "," in remaining:
            cut = remaining.index(",") + 3
            chunks.append(remaining[:cut])
            remaining = remaining[cut:]

        atualizada = chunks[1]
        realizada = chunks[-2]

        return '%s\t%s [%s]\t%s [%s]\t%s\t%s\n' % (
            ano, uf, cod_uf, municipio, cod_municipio, atualizada, realizada)

示例#2

0

显示文件

文件： models.py 项目： BakethemPie/tendenci

    def read(self):
        """Return the text content of this file.

        Only PDF files are handled for now. An empty unicode string is
        returned when the file is missing on disk, is not a PDF, or raises
        ``PDF.PDFSyntaxError`` while being parsed.
        """
        # Nothing to read if the underlying file is gone.
        if not os.path.exists(self.file.path):
            return unicode()

        # Non-PDF files are not supported yet.
        if self.type() != 'pdf':
            return unicode()

        try:
            parsed = PDF(self.file.file)
        except PDF.PDFSyntaxError:
            return unicode()

        return parsed.text()

示例#3

0

显示文件

文件： models.py 项目： DeepInTheCode/tendenci

    def read(self):
        """Return the text content of this file.

        Only PDF files are handled for now. An empty unicode string is
        returned when the file cannot be read: it is missing on disk (checked
        only when ``settings.USE_S3_STORAGE`` is off), it is not a PDF, or
        parsing it fails.
        """
        # The on-disk existence check is skipped when S3 storage is enabled.
        if not settings.USE_S3_STORAGE and not os.path.exists(self.file.path):
            return unicode()

        # Non-PDF files are not supported yet.
        if self.type() != 'pdf':
            return unicode()

        try:
            doc = PDF(self.file.file)
        except Exception:
            # Best effort: any parsing failure yields no text. Catching
            # Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            return unicode()

        return doc.text()

示例#4

0

显示文件

文件： models.py 项目： jkimma/tendenci

    def read(self):
        """Return the text content of this file.

        Only PDF files are handled for now, and only when
        ``settings.INDEX_FILE_CONTENT`` is enabled. An empty unicode string is
        returned when the file cannot be read: it is missing on disk (checked
        only when ``settings.USE_S3_STORAGE`` is off), content indexing is
        disabled, it is not a PDF, or parsing it fails.
        """
        # The on-disk existence check is skipped when S3 storage is enabled.
        if not settings.USE_S3_STORAGE and not os.path.exists(self.file.path):
            return unicode()

        # Short-circuit keeps self.type() from being called when content
        # indexing is turned off, as in the original nested form.
        if not settings.INDEX_FILE_CONTENT or self.type() != 'pdf':
            return unicode()

        try:
            doc = PDF(self.file.file)
        except Exception:
            # Best effort: any parsing failure yields no text. Catching
            # Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            return unicode()

        return doc.text()

示例#5

0

显示文件

文件： unittests.py 项目： jasco/slate

class TestSlate(unittest.TestCase):
    """Checks PDF text extraction against two fixture files.

    Uses unittest's assertion methods instead of bare ``assert`` statements so
    the checks still run under ``python -O`` and failures report both values.
    """

    def setUp(self):
        # Parse both fixtures up front; 'a' is passed as the second PDF
        # argument for the protected file (presumably its password - confirm
        # against the PDF constructor).
        with open('example.pdf', 'rb') as f:
            self.doc = PDF(f)
        with open('protected.pdf', 'rb') as f:
            self.passwd = PDF(f, 'a')

    def test_basic(self):
        # The first page's text ends with a form feed.
        self.assertEqual(self.doc[0], 'This is a test.\x0c')

    def test_metadata_extraction(self):
        # Metadata must be present and truthy.
        self.assertTrue(self.doc.metadata)

    def test_text_method(self):
        self.assertIn("This is a test", self.doc.text())

    def test_text_method_unclean(self):
        # With clean=0 the form feed is kept in the output.
        self.assertIn('\x0c', self.doc.text(clean=0))

    def test_password(self):
        self.assertEqual(self.passwd[0], "Chamber of secrets.\x0c")

示例#6

0

显示文件

文件： unittests.py 项目： isabella232/slate-1

class TestSlate(unittest.TestCase):
    """Checks PDF text extraction against two fixture files.

    Uses unittest's assertion methods instead of bare ``assert`` statements so
    the checks still run under ``python -O`` and failures report both values.
    """

    def setUp(self):
        # Parse both fixtures up front; 'a' is passed as the second PDF
        # argument for the protected file (presumably its password - confirm
        # against the PDF constructor).
        with open('example.pdf', 'rb') as f:
            self.doc = PDF(f)
        with open('protected.pdf', 'rb') as f:
            self.passwd = PDF(f, 'a')

    def test_basic(self):
        # The first page's text ends with two newlines and a form feed.
        self.assertEqual(self.doc[0], 'This is a test.\n\n\x0c')

    def test_no_text_carry_over(self):
        # The second page holds nothing but the form feed.
        self.assertEqual(self.doc[1], '\x0c')

    def test_metadata_extraction(self):
        # Metadata must be present and truthy.
        self.assertTrue(self.doc.metadata)

    def test_text_method(self):
        self.assertIn("This is a test", self.doc.text())

    def test_text_method_unclean(self):
        # With clean=0 the form feed is kept in the output.
        self.assertIn('\x0c', self.doc.text(clean=0))

    def test_password(self):
        self.assertEqual(self.passwd[0], "Chamber of secrets.\n\n\x0c")

示例#7

0

显示文件

文件： unittests.py 项目： jasco/slate

 def setUp(self):
     """Parse both PDF fixtures and keep them on the test instance."""
     with open('example.pdf', 'rb') as example_file:
         self.doc = PDF(example_file)
     # 'a' is the second PDF argument (presumably the password - confirm).
     with open('protected.pdf', 'rb') as protected_file:
         self.passwd = PDF(protected_file, 'a')

示例#8

0

显示文件

文件： unittests.py 项目： isabella232/slate-1

 def setUp(self):
     """Parse both PDF fixtures and keep them on the test instance."""
     with open('example.pdf', 'rb') as example_file:
         self.doc = PDF(example_file)
     # 'a' is the second PDF argument (presumably the password - confirm).
     with open('protected.pdf', 'rb') as protected_file:
         self.passwd = PDF(protected_file, 'a')

示例#9

0

显示文件

def pytest_funcarg__passwd(request):
    """Return a PDF parsed from 'protected.pdf', passing 'a' as second argument.

    ``request`` is accepted but not used. The second PDF argument is
    presumably the document password - confirm against the PDF constructor.
    """
    # PDFs are binary data: open in 'rb' so the bytes are not decoded as text
    # or newline-translated.
    with open('protected.pdf', 'rb') as f:
        return PDF(f, 'a')

示例#10

0

显示文件

def pytest_funcarg__doc(request):
    """Return a PDF parsed from 'example.pdf'.

    ``request`` is accepted but not used.
    """
    with open('example.pdf', 'rb') as example_file:
        parsed = PDF(example_file)
        return parsed

示例#11

0

显示文件

from slate import PDF
from tempfile import mktemp
...

# tempfile.mktemp() only returns a file name, so another process can create
# that path before it is opened. NamedTemporaryFile creates the file itself;
# delete=False keeps it on disk after the block, as the original did.
from tempfile import NamedTemporaryFile

with open(url, 'rb') as pdf_file, \
        NamedTemporaryFile('wt', suffix='.txt', delete=False) as output:
    # Keep the generated path available under the original variable name.
    output_name = output.name

    # Write each page's text followed by a newline.
    doc = PDF(pdf_file)
    for page in doc:
        output.write(page + '\n')