def analisa_siope_pdf(ano, uf, cod_uf, municipio, cod_municipio):
    """Download one report PDF and return a TSV line with two values from it.

    The PDF is stored locally as ``arquivos/<ano>/<cod_uf>/<cod_municipio>.pdf``.
    The text of its fourth page that lies between the two markers in
    ``termos_de_busca[ano]`` is cut into pieces, each ending two characters
    after a comma (presumably decimal-comma amounts with two decimals --
    confirm against a sample PDF). The second piece and the second-to-last
    piece are reported.

    Returns:
        A tab-separated, newline-terminated string. On success it contains
        year, state, municipality and the two extracted values. When the
        download raises ``TypeError`` it contains only year, state and
        municipality.
    """
    caminho_pdf = os.path.join('arquivos', ano, cod_uf, cod_municipio + '.pdf')
    pasta = os.path.dirname(caminho_pdf)
    if not os.path.exists(pasta):
        os.makedirs(pasta)

    url = '_'.join([pdf_url, cod_municipio, periodo, ano]) + '.pdf'

    # Download the matching PDF and store it locally.
    try:
        with request.urlopen(url) as resposta, open(caminho_pdf, 'wb') as saida:
            shutil.copyfileobj(resposta, saida)
    except TypeError:
        # When the file for this municipality and year does not exist, the FTP
        # server answers with an access-permission error (one example is
        # Sao Paulo - SP - 2012).
        return '%s\t%s [%s]\t%s [%s]\n' % (ano, uf, cod_uf, municipio, cod_municipio)

    # PDF parsing. A horror.
    with open(caminho_pdf, mode="rb") as pdf_file:
        doc = PDF(pdf_file)

    # The wanted values are always (I think) on page 4 of the PDF, between the
    # terms listed in 'termos_de_busca'.
    pagina = doc[3]
    marcador_inicio = termos_de_busca[ano][0]
    comeco = pagina.index(marcador_inicio) + len(marcador_inicio)
    marcador_fim = termos_de_busca[ano][1]
    trecho = pagina[comeco:pagina.index(marcador_fim)]

    # Walk the excerpt comma by comma; every piece ends two characters past
    # the comma that was found.
    valores = []
    posicao = 0
    while True:
        virgula = trecho.find(",", posicao)
        if virgula == -1:
            break
        corte = virgula + 3
        valores.append(trecho[posicao:corte])
        posicao = corte

    atualizada = valores[1]
    realizada = valores[-2]
    return '%s\t%s [%s]\t%s [%s]\t%s\t%s\n' % (
        ano, uf, cod_uf, municipio, cod_municipio, atualizada, realizada)
def read(self):
    """Return the text content of this file, or an empty string.

    Only PDF files are handled for now. An empty unicode string is returned
    when the file path does not exist, when the file is not a PDF, or when
    parsing it raises ``PDF.PDFSyntaxError``.
    """
    missing = not os.path.exists(self.file.path)
    if missing or self.type() != 'pdf':
        return unicode()

    try:
        parsed = PDF(self.file.file)
    except PDF.PDFSyntaxError:
        return unicode()
    else:
        return parsed.text()
def read(self):
    """Return the text content of this file, or an empty string.

    Only PDF files are handled for now. An empty unicode string is returned
    when the file is missing on local disk (checked only when
    ``settings.USE_S3_STORAGE`` is falsy), when the file is not a PDF, or
    when the PDF cannot be parsed.
    """
    # The on-disk existence check is skipped when S3 storage is enabled
    # (presumably because there is no local path to probe in that case).
    if not settings.USE_S3_STORAGE and not os.path.exists(self.file.path):
        return unicode()

    if self.type() != 'pdf':
        return unicode()

    try:
        doc = PDF(self.file.file)
    except Exception:
        # Best-effort by design: any parsing failure yields empty text.
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        return unicode()
    return doc.text()
def read(self):
    """Return the text content of this file, or an empty string.

    Only PDF files are handled for now. An empty unicode string is returned
    when the file is missing on local disk (checked only when
    ``settings.USE_S3_STORAGE`` is falsy), when
    ``settings.INDEX_FILE_CONTENT`` is falsy, when the file is not a PDF,
    or when the PDF cannot be parsed.
    """
    # The on-disk existence check is skipped when S3 storage is enabled
    # (presumably because there is no local path to probe in that case).
    if not settings.USE_S3_STORAGE and not os.path.exists(self.file.path):
        return unicode()

    # Content extraction is opt-in through settings and limited to PDFs.
    if not settings.INDEX_FILE_CONTENT or self.type() != 'pdf':
        return unicode()

    try:
        doc = PDF(self.file.file)
    except Exception:
        # Best-effort by design: any parsing failure yields empty text.
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        return unicode()
    return doc.text()
class TestSlate(unittest.TestCase):
    """Checks ``PDF`` text extraction against two fixture files.

    ``example.pdf`` and ``protected.pdf`` are opened relative to the current
    working directory, so the tests must be run from the directory that
    contains them.
    """

    def setUp(self):
        """Parse both fixture PDFs before every test."""
        with open('example.pdf', 'rb') as f:
            self.doc = PDF(f)
        # The second positional argument is presumably the document password.
        with open('protected.pdf', 'rb') as f:
            self.passwd = PDF(f, 'a')

    # unittest assertion methods are used instead of bare ``assert`` so the
    # checks still run under ``python -O`` and failures show both operands.

    def test_basic(self):
        """The first page yields the expected text followed by a form feed."""
        self.assertEqual(self.doc[0], 'This is a test.\x0c')

    def test_metadata_extraction(self):
        """The parsed document exposes non-empty metadata."""
        self.assertTrue(self.doc.metadata)

    def test_text_method(self):
        """``text()`` contains the page text."""
        self.assertIn("This is a test", self.doc.text())

    def test_text_method_unclean(self):
        """``text(clean=0)`` keeps the form-feed character."""
        self.assertIn('\x0c', self.doc.text(clean=0))

    def test_password(self):
        """The protected fixture can be read when 'a' is supplied."""
        self.assertEqual(self.passwd[0], "Chamber of secrets.\x0c")
class TestSlate(unittest.TestCase):
    """Checks ``PDF`` text extraction against two fixture files.

    ``example.pdf`` and ``protected.pdf`` are opened relative to the current
    working directory, so the tests must be run from the directory that
    contains them.
    """

    def setUp(self):
        """Parse both fixture PDFs before every test."""
        with open('example.pdf', 'rb') as f:
            self.doc = PDF(f)
        # The second positional argument is presumably the document password.
        with open('protected.pdf', 'rb') as f:
            self.passwd = PDF(f, 'a')

    # unittest assertion methods are used instead of bare ``assert`` so the
    # checks still run under ``python -O`` and failures show both operands.

    def test_basic(self):
        """The first page yields its text, two newlines and a form feed."""
        self.assertEqual(self.doc[0], 'This is a test.\n\n\x0c')

    def test_no_text_carry_over(self):
        """The second page holds only a form feed, no text from page one."""
        self.assertEqual(self.doc[1], '\x0c')

    def test_metadata_extraction(self):
        """The parsed document exposes non-empty metadata."""
        self.assertTrue(self.doc.metadata)

    def test_text_method(self):
        """``text()`` contains the page text."""
        self.assertIn("This is a test", self.doc.text())

    def test_text_method_unclean(self):
        """``text(clean=0)`` keeps the form-feed character."""
        self.assertIn('\x0c', self.doc.text(clean=0))

    def test_password(self):
        """The protected fixture can be read when 'a' is supplied."""
        self.assertEqual(self.passwd[0], "Chamber of secrets.\n\n\x0c")
def setUp(self):
    """Parse the two fixture PDFs and keep them on the test instance.

    ``self.doc`` comes from ``example.pdf``; ``self.passwd`` comes from
    ``protected.pdf`` with ``'a'`` passed as the extra ``PDF`` argument.
    """
    def load(path, *pdf_args):
        # Each file is open only while PDF() consumes it.
        with open(path, 'rb') as handle:
            return PDF(handle, *pdf_args)

    self.doc = load('example.pdf')
    self.passwd = load('protected.pdf', 'a')
def pytest_funcarg__passwd(request):
    """Return ``protected.pdf`` parsed by ``PDF`` with ``'a'`` as second argument.

    The name follows pytest's legacy ``pytest_funcarg__NAME`` convention;
    ``request`` is accepted but not used.
    """
    # PDFs are binary data: text mode can fail to decode or alter bytes, so
    # the file is opened with 'rb'.
    with open('protected.pdf', 'rb') as f:
        return PDF(f, 'a')
def pytest_funcarg__doc(request):
    """Return ``example.pdf`` parsed by ``PDF``.

    The name follows pytest's legacy ``pytest_funcarg__NAME`` convention;
    ``request`` is accepted but not used.
    """
    pdf_stream = open('example.pdf', 'rb')
    # The stream is closed as soon as PDF() has consumed it.
    with pdf_stream:
        parsed = PDF(pdf_stream)
    return parsed
from slate import PDF
from tempfile import NamedTemporaryFile, mktemp

...  # elided in the original snippet; `url` is expected to be bound here

# Dump every page of the PDF at `url` into a fresh ".txt" temporary file.
# NamedTemporaryFile creates and opens the file in one step; the previous
# mktemp() call only produced a name, leaving a window in which another
# process could create that path first (mktemp is deprecated for this reason).
# delete=False keeps the file on disk after the block, and `output_name`
# still holds its path for later use.
with open(url, 'rb') as pdf_file, \
        NamedTemporaryFile(mode='wt', suffix='.txt', delete=False) as output:
    output_name = output.name
    doc = PDF(pdf_file)
    for page in doc:
        output.write(page + '\n')