Exemplo n.º 1
0
def analisa_siope_pdf(ano, uf, cod_uf, municipio, cod_municipio):
    """Download one municipality's PDF report and pull two values out of it.

    The PDF is saved to ``arquivos/<ano>/<cod_uf>/<cod_municipio>.pdf``
    (the directory is created when missing) and then parsed.

    Returns a tab-separated, newline-terminated line. On success the line
    holds ``ano``, ``uf [cod_uf]``, ``municipio [cod_municipio]`` and the
    two extracted values. When the download raises ``TypeError`` the line
    holds only the first three fields.

    NOTE(review): relies on the module-level names ``pdf_url``, ``periodo``
    and ``termos_de_busca``, which are defined outside this block.
    """
    arquivo = os.path.join('arquivos', ano, cod_uf, cod_municipio + '.pdf')
    pasta = os.path.dirname(arquivo)

    if not os.path.exists(pasta):
        os.makedirs(pasta)

    url = '_'.join([pdf_url, cod_municipio, periodo, ano]) + '.pdf'

    # Download the corresponding PDF and store it locally.
    try:
        with request.urlopen(url) as origem, open(arquivo, 'wb') as saida:
            shutil.copyfileobj(origem, saida)
    except TypeError:
        # When there is no file for this municipality and year, the FTP
        # server answers with an access-permission error
        # (one example is São Paulo - SP - 2012).
        return '%s\t%s [%s]\t%s [%s]\n' % (ano, uf, cod_uf, municipio, cod_municipio)

    # PDF parsing. Not pretty.
    with open(arquivo, mode="rb") as pdf_file:
        paginas = PDF(pdf_file)

        # The wanted values are (as far as the author could tell) always on
        # page 4 of the PDF, between the two markers listed in
        # 'termos_de_busca' for the given year.
        pagina = paginas[3]
        marcador_inicial = termos_de_busca[ano][0]
        comeco = pagina.index(marcador_inicial) + len(marcador_inicial)
        marcador_final = termos_de_busca[ano][1]
        trecho = pagina[comeco:pagina.index(marcador_final)]

        # Cut the text into pieces that each end two characters after a
        # comma; anything after the last such piece is discarded.
        valores = []
        posicao = 0
        while True:
            virgula = trecho.find(",", posicao)
            if virgula == -1:
                break
            limite = virgula + 3
            valores.append(trecho[posicao:limite])
            posicao = limite

        atualizada, realizada = valores[1], valores[-2]

        return '%s\t%s [%s]\t%s [%s]\t%s\t%s\n' % (
            ano, uf, cod_uf, municipio, cod_municipio, atualizada, realizada)
Exemplo n.º 2
0
    def read(self):
        """Return the text content of the attached file.

        Only PDF files are handled for now. An empty ``unicode`` string is
        returned when the file does not exist on disk, when it is not a
        PDF, or when the PDF parser reports a syntax error.
        """
        missing_on_disk = not os.path.exists(self.file.path)
        if missing_on_disk or self.type() != 'pdf':
            return unicode()

        try:
            parsed = PDF(self.file.file)
        except PDF.PDFSyntaxError:
            # Malformed PDF: treat it as having no readable text.
            return unicode()

        return parsed.text()
Exemplo n.º 3
0
    def read(self):
        """Return the text content of the attached file.

        Only PDF files are handled for now. An empty ``unicode`` string is
        returned when the file cannot be read: it is missing on disk, it is
        not a PDF, or building the ``PDF`` object raises an exception.

        The on-disk existence check is skipped when
        ``settings.USE_S3_STORAGE`` is truthy (presumably because remote
        storage exposes no local path -- confirm against the storage backend).
        """
        if not settings.USE_S3_STORAGE and not os.path.exists(self.file.path):
            return unicode()

        if self.type() != 'pdf':
            return unicode()

        try:
            doc = PDF(self.file.file)
        except Exception:
            # Best effort: any parsing failure yields an empty string.
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            return unicode()

        return doc.text()
Exemplo n.º 4
0
    def read(self):
        """Return the text content of the attached file.

        Only PDF files are handled for now, and only when
        ``settings.INDEX_FILE_CONTENT`` is truthy. An empty ``unicode``
        string is returned in every other case: content indexing disabled,
        file missing on disk, not a PDF, or building the ``PDF`` object
        raises an exception.

        The on-disk existence check is skipped when
        ``settings.USE_S3_STORAGE`` is truthy (presumably because remote
        storage exposes no local path -- confirm against the storage backend).
        """
        if not settings.USE_S3_STORAGE and not os.path.exists(self.file.path):
            return unicode()

        if settings.INDEX_FILE_CONTENT and self.type() == 'pdf':
            try:
                doc = PDF(self.file.file)
            except Exception:
                # Best effort: any parsing failure yields an empty string.
                # ``Exception`` (rather than a bare ``except``) lets
                # KeyboardInterrupt and SystemExit propagate.
                return unicode()

            return doc.text()

        return unicode()
Exemplo n.º 5
0
class TestSlate(unittest.TestCase):
    """Checks for the ``PDF`` wrapper using two fixture files.

    ``example.pdf`` and ``protected.pdf`` are opened relative to the
    current working directory.
    """

    def setUp(self):
        """Parse both fixture PDFs before every test."""
        with open('example.pdf', 'rb') as plain_pdf:
            self.doc = PDF(plain_pdf)
        # 'a' is passed as the second argument for the protected file.
        with open('protected.pdf', 'rb') as locked_pdf:
            self.passwd = PDF(locked_pdf, 'a')

    def test_basic(self):
        """The first page holds the expected text plus a form feed."""
        first_page = self.doc[0]
        assert first_page == 'This is a test.\x0c'

    def test_metadata_extraction(self):
        """The parsed document exposes truthy metadata."""
        metadata = self.doc.metadata
        assert metadata

    def test_text_method(self):
        """``text()`` contains the page text."""
        full_text = self.doc.text()
        assert "This is a test" in full_text

    def test_text_method_unclean(self):
        """``text(clean=0)`` keeps the form-feed character."""
        raw_text = self.doc.text(clean=0)
        assert '\x0c' in raw_text

    def test_password(self):
        """The protected document's first page is readable."""
        first_page = self.passwd[0]
        assert first_page == "Chamber of secrets.\x0c"
Exemplo n.º 6
0
class TestSlate(unittest.TestCase):
    """Checks for the ``PDF`` wrapper using two fixture files.

    ``example.pdf`` and ``protected.pdf`` are opened relative to the
    current working directory.
    """

    def setUp(self):
        """Parse both fixture PDFs before every test."""
        with open('example.pdf', 'rb') as plain_pdf:
            self.doc = PDF(plain_pdf)
        # 'a' is passed as the second argument for the protected file.
        with open('protected.pdf', 'rb') as locked_pdf:
            self.passwd = PDF(locked_pdf, 'a')

    def test_basic(self):
        """The first page holds the text, two newlines and a form feed."""
        first_page = self.doc[0]
        assert first_page == 'This is a test.\n\n\x0c'

    def test_no_text_carry_over(self):
        """The second page is only a form feed: no text leaks from page one."""
        second_page = self.doc[1]
        assert second_page == '\x0c'

    def test_metadata_extraction(self):
        """The parsed document exposes truthy metadata."""
        metadata = self.doc.metadata
        assert metadata

    def test_text_method(self):
        """``text()`` contains the page text."""
        full_text = self.doc.text()
        assert "This is a test" in full_text

    def test_text_method_unclean(self):
        """``text(clean=0)`` keeps the form-feed character."""
        raw_text = self.doc.text(clean=0)
        assert '\x0c' in raw_text

    def test_password(self):
        """The protected document's first page is readable."""
        first_page = self.passwd[0]
        assert first_page == "Chamber of secrets.\n\n\x0c"
Exemplo n.º 7
0
 def setUp(self):
     """Parse the two fixture PDFs and keep them on the test instance."""
     with open('example.pdf', 'rb') as plain_pdf:
         self.doc = PDF(plain_pdf)
     # The second fixture gets 'a' as an extra argument (presumably its
     # password -- confirm against the PDF class).
     with open('protected.pdf', 'rb') as locked_pdf:
         self.passwd = PDF(locked_pdf, 'a')
Exemplo n.º 8
0
 def setUp(self):
     """Load ``example.pdf`` and ``protected.pdf`` into ``PDF`` objects."""
     with open('example.pdf', 'rb') as example_handle:
         self.doc = PDF(example_handle)
     # 'a' is handed to PDF as its second argument for the protected file
     # (presumably the password -- confirm against the PDF class).
     with open('protected.pdf', 'rb') as protected_handle:
         self.passwd = PDF(protected_handle, 'a')
Exemplo n.º 9
0
def pytest_funcarg__passwd(request):
    """Provide the parsed ``protected.pdf`` fixture as the ``passwd`` funcarg.

    ``request`` is accepted but not used here. The file is opened in binary
    mode: a PDF is binary data, and text mode would decode or
    newline-translate its bytes.
    """
    with open('protected.pdf', 'rb') as f:
        return PDF(f, 'a')
Exemplo n.º 10
0
def pytest_funcarg__doc(request):
    """Provide the parsed ``example.pdf`` fixture as the ``doc`` funcarg.

    ``request`` is accepted but not used here.
    """
    with open('example.pdf', 'rb') as pdf_file:
        parsed = PDF(pdf_file)
    return parsed
Exemplo n.º 11
0
from tempfile import NamedTemporaryFile
from tempfile import mktemp

from slate import PDF
...

# Write every page of the PDF at `url` to a fresh temporary .txt file.
# NamedTemporaryFile creates the file atomically, which avoids the race
# inherent in the deprecated mktemp(); delete=False keeps the file on disk
# after the block so `output_name` stays usable.
with NamedTemporaryFile(mode='wt', suffix=".txt", delete=False) as output, \
        open(url, 'rb') as pdf_file:
    output_name = output.name
    doc = PDF(pdf_file)
    for page in doc:
        output.write(page + '\n')