def test_real_data_3(self):
    """Both pages of the TCE-SP elections PDF must match their CSV fixtures.

    The original code duplicated the import/compare cycle verbatim for each
    page; the same start/end delimiters apply to both, so the two checks run
    in one loop parameterized by (page number, report number).
    """
    filename = "tests/data/eleicoes-tcesp-161-162.pdf"
    begin = re.compile("Documento gerado em.*")  # table starts after this line
    end = re.compile("Página: [0-9]+ de.*")  # footer line that ends the table
    for page_number, report in ((1, 161), (2, 162)):
        expected_filename = "tests/data/expected-eleicoes-tcesp-{}-{}.csv".format(
            report, self.backend
        )
        result = rows.import_from_pdf(
            filename,
            backend=self.backend,
            page_numbers=(page_number,),
            starts_after=begin,
            ends_before=end,
            algorithm="header-position",
        )
        expected = rows.import_from_csv(expected_filename)
        self.assertEqual(list(expected), list(result))
def test_real_data_3(self):
    """Both pages of the TCE-SP elections PDF must match their CSV fixtures.

    The original code duplicated the import/compare cycle verbatim for each
    page; the same start/end delimiters apply to both, so the two checks run
    in one loop parameterized by (page number, report number).
    """
    filename = "tests/data/eleicoes-tcesp-161-162.pdf"
    begin = re.compile("Documento gerado em.*")  # table starts after this line
    end = re.compile("Página: [0-9]+ de.*")  # footer line that ends the table
    for page_number, report in ((1, 161), (2, 162)):
        expected_filename = "tests/data/expected-eleicoes-tcesp-{}-{}.csv".format(
            report, self.backend
        )
        result = rows.import_from_pdf(
            filename,
            backend=self.backend,
            page_numbers=(page_number,),
            starts_after=begin,
            ends_before=end,
            algorithm="header-position",
        )
        expected = rows.import_from_csv(expected_filename)
        self.assertEqual(list(expected), list(result))
def parse_pdf(self, response):
    """Extract per-city confirmed-case and death counts from a bulletin PDF.

    Reads the report date from the "Fonte:" text object, registers the
    report, then imports two tables from the same PDF body (confirmed cases
    and deaths), merges them by city and emits state/city case records.
    """
    pdf = rows.plugins.pdf.PyMuPDFBackend(io.BytesIO(response.body))
    # Only look at text objects after the "EM INVESTIGAÇÃO" section, where
    # the "Fonte:" line with the report date lives.
    pages = pdf.text_objects(starts_after=re.compile("EM INVESTIGAÇÃO.*"))
    for page in pages:
        for obj in page:
            if obj.text.startswith("Fonte:"):
                # First DD/MM/YYYY occurrence in the "Fonte:" line.
                day, month, year = re.compile(
                    "([0-9]{2})/([0-9]{2})/([0-9]{4})"
                ).findall(obj.text)[0]
                date = datetime.date(int(year), int(month), int(day))
                break  # only breaks the inner loop; later pages may rebind date
    # NOTE(review): if no "Fonte:" line matches on any page, `date` is
    # unbound here and this raises NameError — confirm the PDFs always
    # contain it.
    self.add_report(date=date, url=response.url)
    # Table 1: confirmed cases, delimited by the section header and the
    # "Fonte:" footer.
    table = rows.import_from_pdf(
        io.BytesIO(response.body),
        starts_after=re.compile("DADOS DETALHADOS POR MUNICÍPIO DE RESIDÊNCIA.*"),
        ends_before=re.compile("Fonte:"),
    )
    confirmed_cases = {}
    for row in table:
        city = convert_city(row.municipio_de_residencia)
        if city is None:
            # Row is not a recognized municipality (e.g. noise); skip it.
            continue
        # Cell holds "cases\nincidence"; keep only the first line (cases).
        confirmed = row.casos_confirmados_incidencia_por_n_100_ooo_hab.splitlines()[0]
        if confirmed in ("-", ""):
            confirmed = None  # explicit "no data" markers
        else:
            confirmed = int(confirmed)
        confirmed_cases[city] = confirmed
    # Table 2: deaths, starting after the "EM INVESTIGAÇÃO" section.
    table = rows.import_from_pdf(
        io.BytesIO(response.body),
        starts_after=re.compile("EM INVESTIGAÇÃO.*"),
        ends_before=re.compile("Fonte:"),
    )
    deaths_cases = {}
    for row in table:
        city = convert_city(row.field_0)
        if city is None:
            continue
        deaths_cases[city] = int(row.confirmado)
    # Merge both tables over the union of cities seen in either one.
    cities = set(confirmed_cases.keys()) | set(deaths_cases.keys())
    for city in cities:
        confirmed = confirmed_cases.get(city, None)
        deaths = deaths_cases.get(city, None)
        if confirmed is None and deaths is None:
            continue  # city appeared but carries no usable numbers
        confirmed = confirmed or 0
        deaths = deaths or 0
        if confirmed == 0 and deaths == 0:
            continue  # drop all-zero rows
        if city == "TOTAL NO ESTADO":
            # Statewide aggregate row gets its own record type.
            self.add_state_case(confirmed=confirmed, deaths=deaths)
        else:
            self.add_city_case(city=city, confirmed=confirmed, deaths=deaths)
def extract(self):
    """Yield one payroll record per table row, from CSV or per-page PDF.

    CSV dumps named ``contracheque.csv`` are delegated to the
    ``extract_magistrados`` helper; PDF files are read page by page,
    skipping months already covered by the CSV data.
    """
    source = self.filename
    meta = self.metadata
    if "contracheque.csv" in source.name:
        # From 2017-11 to 2018-12, data comes from Brasil.IO in CSV form.
        yield from extract_magistrados(source, self.state)
        return
    suffix = source.name.split(".")[-1].lower()
    if suffix != "pdf":
        return
    already_converted = meta["ano"] == 2018 or (
        meta["ano"] == 2017 and meta["mes"] in (11, 12)
    )
    if already_converted:
        # Those months were already converted into contracheque.csv.
        return
    last_page = rows.plugins.pdf.number_of_pages(source)
    for page_number in range(1, last_page + 1):
        page_table = rows.import_from_pdf(
            source,
            page_numbers=(page_number,),
            fields=self.fields,
            skip_header=page_number == 1,  # header only on the first page
        )
        for record in page_table:
            yield {
                "cargo": record.cargo,
                "nome": record.nome.replace("\n", " ").strip(),
                "rendimento_bruto": record.total_de_rendimentos,
                "rendimento_liquido": record.rendimento_liquido,
            }
def extract_table(filename_or_fobj):
    """Parse every page of the PDF and return a list of converted row dicts.

    Returns ``None`` when a page has no parseable metadata (empty PDF).
    Mostly-blank table lines (more than three empty cells) are dropped,
    and each kept row is merged with its page's metadata.
    """
    page_count = rows.plugins.pdf.number_of_pages(
        filename_or_fobj, backend="pymupdf"
    )
    collected = []
    for current_page in range(1, page_count + 1):
        text = next(
            rows.plugins.pdf.pdf_to_text(
                filename_or_fobj, page_numbers=(current_page,), backend="pymupdf"
            )
        )
        metadata = extract_page_metadata(text)
        if metadata is None:
            # No page metadata at all: treat the whole document as empty.
            return None
        page_table = rows.import_from_pdf(
            filename_or_fobj,
            page_numbers=(current_page,),
            backend="pymupdf",
            algorithm=YGroupsXPositionAlgorithm,
            fields=FIELDS,
            skip_header=False,
            starts_after=starts_after,
            ends_before=ends_before,
        )
        for record in page_table:
            if list(record._asdict().values()).count("") > 3:
                continue  # mostly-empty line, skip it
            converted = convert_row(record)
            converted.update(metadata)
            collected.append(converted)
    return collected
def extrai_tabela(url):
    """Download an IMEA publication PDF and import its table.

    ``url`` is the file path fragment appended to the IMEA uploads base URL.
    Rows are read up to (not including) the footnote starting with
    "* Variação em".
    """
    full_url = f"http://www.imea.com.br/upload/publicacoes/arquivos/{url}"
    pdf_bytes = requests.get(full_url).content
    footnote = re.compile(r'\* ?Variação em .*')
    return rows.import_from_pdf(io.BytesIO(pdf_bytes), ends_before=footnote)
def parse_pdf(filename, meta):
    """Yield converted rows from the bulletin PDF, stamped with its date.

    The update date is read from the PDF's text when the update line is
    found; otherwise it is recovered from the bulletin filename embedded
    in ``meta["boletim_url"]``. The "TOTAL GERAL" aggregate row is skipped.
    """
    update_date = None
    # Scan the PDF text objects for the "last updated" line.
    backend = PyMuPDFBackend(filename)
    for page in backend.objects():
        for obj in page:
            if REGEXP_UPDATE.match(obj.text):
                update_date = PtBrDateField.deserialize(
                    REGEXP_UPDATE.findall(obj.text)[0]
                )
                break  # stop scanning this page once the line is found
    if update_date is None:
        # Date string absent from the PDF: fall back to the date encoded
        # inside the PDF's own filename, taken from the bulletin URL.
        date = (
            meta["boletim_url"]
            .split("/")[-1]
            .split(".pdf")[0]
            .replace("CORONA_", "")
            .split("_")[0]
        )
        update_date = PtBrDateField2.deserialize(date)
    # Import the table and inject the update date plus caller metadata.
    table = rows.import_from_pdf(filename, backend="min-x0")
    for row in table:
        if row.municipio == "TOTAL GERAL":
            continue
        record = row._asdict()
        record["data"] = update_date
        record.update(meta)
        yield convert_row(record)
def extract_2015(filename):
    """Yield converted rows from every page of a 2015-format PDF."""
    marker = re.compile(".*DE 13/11/2002")  # table begins after this line
    page_count = rows.plugins.pdf.number_of_pages(filename)
    for page_number in tqdm(range(1, page_count + 1), desc=filename):
        page_table = rows.import_from_pdf(
            filename, page_numbers=(page_number,), starts_after=marker
        )
        yield from (convert_row_2015(row) for row in page_table)
def test_real_data_2(self):
    """Corn-harvest 2016/17 PDF must round-trip to its CSV fixture.

    `ends_before` is a plain string here (not a compiled regexp) on
    purpose, exercising the string-matching code path.
    """
    base = "tests/data/milho-safra-2017"
    header = re.compile("MILHO SAFRA 16/17: ACOMPANHAMENTO DE .*")
    table = rows.import_from_pdf(
        base + ".pdf",
        backend=self.backend,
        starts_after=header,
        ends_before="*Variação em pontos percentuais.",
    )
    reference = rows.import_from_csv(base + ".csv")
    self.assertEqual(list(reference), list(table))
def extract_table(fobj):
    """Import the PDF table and return its rows as cleaned dictionaries.

    Each row has ``local_da_coleta`` and ``ponto_codigo`` normalized via
    ``clean``, plus a derived ``costa_ponto`` column.
    """
    def tidy(record):
        # One cleaned dict per table row.
        data = record._asdict()
        data["local_da_coleta"] = clean(data["local_da_coleta"])
        data["ponto_codigo"] = clean(data["ponto_codigo"])
        data["costa_ponto"] = extrai_costa(data["ponto_codigo"])
        return data

    return [tidy(record) for record in rows.import_from_pdf(fobj, backend="pymupdf")]
def test_rects_boundaries(self):
    """IBAMA/Amazonas PDF parsed with "rects-boundaries" must match its CSV."""
    base = "tests/data/ibama-autuacao-amazonas-2010-pag2"
    table = rows.import_from_pdf(
        base + ".pdf",
        backend=self.backend,
        starts_after=re.compile("DIRETORIA DE PROTE.*"),
        ends_before=re.compile("Pag [0-9]+/[0-9]+"),
        algorithm="rects-boundaries",
    )
    reference = rows.import_from_csv(base + ".csv")
    self.assertEqual(list(reference), list(table))
def read_pdf(self, response):
    """Save the bulletin PDF to disk and yield per-city case dicts from it.

    Rows before the SUSPECTED/DISCARDED/CONFIRMED header are ignored.
    Afterwards, rows come in two shapes: multi-city cells (several
    newline-separated cities packed into one PDF cell) and single-city
    rows. Parsing stops once a stop token is seen in the city text.
    """
    path = f"download/{Path(response.url).name}"
    self.logger.info("Saving PDF %s from %s", path, response.url)
    with open(path, "wb") as f:
        f.write(response.body)
    can_read = False  # flips True once the table header row is found
    data = {}
    city = None
    for row in rows.import_from_pdf(path):
        row = row._asdict()
        # Header row: the three expected column labels in order.
        if (SUSPECTED in clean(row["field_1"])
                and DISCARDED in clean(row["field_2"])
                and CONFIRMED in clean(row["field_3"])):
            can_read = True
            continue
        if can_read:
            city = list(row.values())[0]
            if len(city.split("\n")) > 3:
                # Multi-city cell: each column packs several values
                # separated by newlines; unzip them back into rows.
                cities, suspected, discarded, confirmed = row.values()
                cities = clean_cities(cities)
                discarded = discarded.split("\n")
                # Suspected/confirmed cells interleave value lines with
                # something else — keep every other line.
                # NOTE(review): presumably the skipped lines are
                # percentages or totals; confirm against a sample PDF.
                suspected = list(
                    it.islice(suspected.split("\n"), 0, None, 2))
                confirmed = list(
                    it.islice(confirmed.split("\n"), 0, None, 2))
                dt = list(zip(cities, suspected, discarded, confirmed))
                for (cit, susp, disc, conf) in dt:
                    yield {
                        "municipio": cit,
                        "suspeitos": change_format(susp),
                        "descartados": change_format(disc),
                        "confirmados": change_format(conf),
                    }
            else:
                # Single-city row: first line of each cell is the value.
                city = clean(city)
                data["municipio"] = city
                data["suspeitos"] = row["field_1"].split("\n")[0]
                data["descartados"] = row["field_2"]
                data["confirmados"] = row["field_3"].split("\n")[0]
                # NOTE(review): the same `data` dict object is mutated and
                # re-yielded every iteration; consumers that keep references
                # will see later mutations. Consider yielding a copy.
                yield data
        # Stop once the city text hits one of the terminator tokens
        # (module-level `tokens` — not visible here, assumed a list of
        # footer strings; verify).
        if city and any(city in text for text in tokens):
            break
def parse_file(filename):
    """Parse Amazonas' PDF file containing state employee information.

    Each page's table (starting after the "NOME" marker) is converted
    row by row and the accumulated records are returned as a rows table.
    """
    page_count = rows.plugins.pdf.number_of_pages(filename)
    converted = []
    for page_number in range(1, page_count + 1):
        page_table = rows.import_from_pdf(
            filename,
            page_numbers=(page_number,),
            starts_after="NOME",
            fields=PDF_FIELD_TYPES,
            skip_header=True,
        )
        converted.extend(convert_row(row) for row in page_table)
    return rows.import_from_dicts(converted)
def parse(self, response):
    """Persist the downloaded PDF and yield one enriched dict per table row.

    Crawler bookkeeping keys are stripped from the request meta before it
    is merged into each row.
    """
    meta = response.request.meta.copy()
    filename = meta['filename']
    meta['costa_menu'] = meta['costa']
    # Drop crawler bookkeeping so only payload metadata reaches the rows.
    unwanted = ('url', 'filename', 'depth', 'costa')
    for key in [k for k in meta if k.startswith('download_') or k in unwanted]:
        del meta[key]
    with open(filename, mode='wb') as output:
        output.write(response.body)
    for record in rows.import_from_pdf(io.BytesIO(response.body)):
        record = record._asdict()
        record['local_da_coleta'] = clean(record['local_da_coleta'])
        record['ponto_codigo'] = clean(record['ponto_codigo'])
        record['costa_ponto'] = extrai_costa(record['ponto_codigo'])
        record.update(meta)
        yield record
def test_real_data_1(self):
    """Balneabilidade bulletin 26/2010 must match its CSV fixture exactly."""
    base = "tests/data/balneabilidade-26-2010"
    table = rows.import_from_pdf(base + ".pdf", backend=self.backend)
    reference = rows.import_from_csv(base + ".csv")
    self.assertEqual(list(reference), list(table))
import rows

# Import the first detected table from the sample PDF and show which
# fields (columns) rows identified.
# Renamed `file` -> `table`: `file` shadows a builtin name.
table = rows.import_from_pdf('samples/xp-2.pdf')
print(table.fields)
import io  # was missing: io.BytesIO is used below, causing a NameError

import requests

import rows

url = "http://balneabilidade.inema.ba.gov.br/index.php/relatoriodebalneabilidade/geraBoletim?idcampanha=42041"
print("*** Downloading PDF...")
response = requests.get(url)

# The line below will automatically identify the table in all PDF pages - it
# works for this file but not for all cases. You can be more specific defining
# the page numbers, a start/end string (like the header/footer strings) and
# also change the table identification algorithm. Check `backend`, `algorithm`,
# `starts_after`, `ends_before` and `page_numbers` parameters.
# For this simple case you could also install rows' CLI (`pip install
# rows[cli]`) and run: `rows print <url>`
table = rows.import_from_pdf(io.BytesIO(response.content))
rows.export_to_csv(table, "beach-data.csv")
print("*** Table exported to beach-data.csv")
print("*** Extracted table:")
print(rows.export_to_txt(table))
# You could also iterate over the object, like:
# for row in table: print(row)

print("\n\n*** Extracted text:")
text_pages = rows.plugins.pdf.pdf_to_text(io.BytesIO(response.content))
print("\n\n".join(text_pages))
import io

import requests

import rows

url = "http://balneabilidade.inema.ba.gov.br/index.php/relatoriodebalneabilidade/geraBoletim?idcampanha=42041"

print("*** Downloading PDF...")
pdf_bytes = requests.get(url).content

# `import_from_pdf` auto-detects the table across all pages, which is enough
# for this report but not for every PDF. For trickier documents, narrow the
# search with the `backend`, `algorithm`, `starts_after`, `ends_before` and
# `page_numbers` parameters. With rows' CLI installed
# (`pip install rows[cli]`) the one-liner `rows print <url>` does the same.
table = rows.import_from_pdf(io.BytesIO(pdf_bytes))

rows.export_to_csv(table, "beach-data.csv")
print("*** Table exported to beach-data.csv")

print("*** Extracted table:")
print(rows.export_to_txt(table))
# Iterating also works: `for row in table: print(row)`

print("\n\n*** Extracted text:")
pages = rows.plugins.pdf.pdf_to_text(io.BytesIO(pdf_bytes))
print("\n\n".join(pages))