def CSV(fullPath, fileName, idFile):
    """Download a FEE (RS) municipal data series and keep only its CSV files.

    Fetches the ZIP archive for variable *idFile* (years 1991-2016) into
    ``fullPath + fileName + ".zip"``, extracts it into *fullPath*, then
    deletes every entry in *fullPath* that is not a CSV file (which also
    removes the downloaded ZIP itself).

    Interface (name, camelCase parameters, string paths) is kept as-is
    for existing callers.
    """
    # Build the year list instead of hard-coding a 130-character literal.
    years = ",".join(str(year) for year in range(1991, 2017))
    url = (
        "https://dados.fee.tche.br/php/download.php?csv/Municipio/"
        + idFile + "/" + years
    )
    zip_path = fullPath + fileName + ".zip"
    download_file(url, filename=zip_path, progress=True)
    # Context manager guarantees the archive is closed even if
    # extractall() raises (the original leaked the handle on error).
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(fullPath)
    # Keep only CSV files. Match on the suffix instead of the original
    # fragile substring test (`".csv" not in name`), which would have
    # kept e.g. "notes.csv.bak".
    for entry in os.listdir(fullPath):
        if not entry.lower().endswith(".csv"):
            os.remove(fullPath + entry)
def download(date, cache=True):
    """Download the obito_cartorio backup for *date*; return the local path.

    The file is stored under a ``data/`` directory next to this module.
    When *cache* is true and the file is already present, no network
    request is made.
    """
    base_dir = Path(__file__).parent / "data"
    if not base_dir.exists():
        base_dir.mkdir()
    local_file = base_dir / f"{date}-obito_cartorio.csv.gz"
    remote_url = f"https://data.brasil.io/dataset/covid19/backup/{date}/obito_cartorio.csv.gz"
    must_fetch = (not cache) or (not local_file.exists())
    if must_fetch:
        download_file(remote_url, local_file, progress=True)
    return local_file
def main():
    """Download and unpack FEE's variable list ("Lista de Variaveis") CSVs.

    Downloads the ZIP into a ``Lista de Variaveis`` directory, extracts
    it there, and removes everything that is not a CSV (including the
    ZIP itself).
    """
    data_path = pathlib.Path("Lista de Variaveis")
    if not data_path.exists():
        data_path.mkdir()
    # Build paths from data_path instead of the original hard-coded
    # "Lista de Variaveis\\" literals, which only worked on Windows.
    zip_path = data_path / "ListaVariaveis.zip"
    download_file(
        "https://dados.fee.tche.br/php/doc_down.php?csv/ListaVars",
        filename=str(zip_path),
        progress=True,
    )
    # Context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(data_path)
    # Keep only CSV files; suffix check replaces the fragile
    # substring test (`".csv" not in name`).
    for entry in os.listdir(data_path):
        if not entry.lower().endswith(".csv"):
            os.remove(data_path / entry)
def download(self, year, force=False):
    """Download this dataset's file for *year* unless it is already on disk.

    Returns a dict with ``downloaded`` (bool, whether a fetch happened)
    and ``filename`` (the local path). Pass ``force=True`` to re-fetch
    even when the file exists.
    """
    target = self.filename(year)
    already_present = target.exists() and not force
    if already_present:
        # Nothing to do: keep the cached copy.
        return {"downloaded": False, "filename": target}
    fetched = download_file(self.url(year), progress=True)
    move_file(fetched.uri, target)
    return {"downloaded": True, "filename": target}
def download(self, year, force=False):
    """Fetch the file for *year* into its download location.

    Creates the parent directory tree if needed. Skips the network
    request when the file already exists, unless *force* is true.
    Returns a dict with ``downloaded`` (bool) and ``filename`` (path).
    """
    target = self.download_filename(year)
    parent_dir = target.parent
    if not parent_dir.exists():
        parent_dir.mkdir(parents=True)
    already_present = target.exists() and not force
    if already_present:
        # File has already been downloaded; reuse it.
        return {"downloaded": False, "filename": target}
    fetched = download_file(
        self.url(year), progress=True, chunk_size=256 * 1024
    )
    rename_file(fetched.uri, target)
    return {"downloaded": True, "filename": target}
def download_photos(year):
    """Download TSE candidate-photo ZIPs for *year* and export the pictures.

    For each archive listed in the TSE directory index, downloads it under
    ``download_path / year`` (skipping files already present), then unpacks
    every picture into ``output_path / year / <state>/<sequence>.<ext>``.

    NOTE(review): ``download_path`` and ``output_path`` are not defined in
    this block — presumably module-level Path constants; verify elsewhere
    in the file.
    """
    year = str(year)
    url = f"http://agencia.tse.jus.br/estatistica/sead/eleicoes/eleicoes{year}/fotos/"
    # import_from_uri reads the Apache-style directory listing as a table;
    # each row.name is one downloadable archive.
    table = import_from_uri(url)
    for row in table:
        if row.name == "Parent Directory":
            # Skip the ".." entry of the directory listing.
            continue
        filename = download_path / year / row.name
        print(f"Downloading {filename.name}", end="")
        if filename.exists():
            print(" - downloaded already, skipping.")
        else:
            if not filename.parent.exists():
                filename.parent.mkdir()
            print()
            download_file(urljoin(url, row.name), progress=True, filename=filename)
            print(f" saved: (unknown)")
        # Export phase: unpack this archive's pictures into output_path.
        photo_path = output_path / year
        if not photo_path.exists():
            photo_path.mkdir()
        print(f" Exporting to: {photo_path}")
        zf = ZipFile(filename)
        for file_info in tqdm(zf.filelist, desc="Exporting pictures"):
            internal_name = file_info.filename
            internal_path = Path(internal_name)
            extension = internal_path.name.split(".")[-1].lower()
            # assumes archive member names look like "F<UF><sequence>_*.<ext>",
            # so chars [1:3] are the state and [3:] the sequence number —
            # TODO confirm against a real TSE archive.
            info = internal_path.name.split(".")[0].split("_")[0]
            state, sequence_number = info[1:3], info[3:]
            new_filename = photo_path / state / f"{sequence_number}.{extension}"
            if not new_filename.parent.exists():
                new_filename.parent.mkdir()
            zfobj = zf.open(internal_name)
            with open(new_filename, mode="wb") as fobj:
                fobj.write(zfobj.read())
def command_pdf_to_text(output_encoding, quiet, backend, pages, source, output):
    """Extract text from a PDF and write it to *output* or stdout.

    - *pages*: page-interval spec parsed by ``extract_intervals``; falsy
      means all pages.
    - *output*: filename to write to, opened with *output_encoding*; falsy
      means echo each page to stdout, which also forces quiet mode so the
      progress bar never interleaves with the extracted text.
    - *source*: local path or http(s) URL; a URL is downloaded first and
      the temporary copy is removed at the end.
    - *backend*: PDF backend name passed through to ``rows.plugins.pdf``.
    """
    # Define page range
    if pages:
        pages = extract_intervals(pages)

    # Define if output is file or stdout
    if output:
        output = open(output, mode="w", encoding=output_encoding)
        write = output.write
    else:
        write = click.echo
        # stdout output: suppress the progress bar unconditionally.
        quiet = True
    progress = not quiet

    # Download the file if source is an HTTP URL
    downloaded = False
    if source.lower().startswith("http:") or source.lower().startswith("https:"):
        result = download_file(source, progress=progress, detect=False)
        source = result.uri
        downloaded = True

    # pdf_to_text yields one string per page (lazily).
    reader = rows.plugins.pdf.pdf_to_text(source, page_numbers=pages, backend=backend)
    if progress:
        # Calculate total number of pages and create a progress bar
        if pages:
            total_pages = len(pages)
        else:
            total_pages = rows.plugins.pdf.number_of_pages(source, backend=backend)
        reader = tqdm(reader, desc="Extracting text", total=total_pages)
    for page in reader:
        write(page)
    if output:
        output.close()
    if downloaded:
        # Remove the temporary file created by download_file.
        os.unlink(source)
from rows.utils import download_file
import pathlib

# Brazilian state (UF) abbreviations covered by IBAMA's SICAFI dataset.
estados = [
    "AC", "AL", "AM", "AP", "BA", "CE", "DF", "ES", "GO", "MA", "MT", "MS",
    "MG", "PA", "PB", "PR", "PE", "PI", "RJ", "RN", "RO", "RS", "RR", "SC",
    "SE", "SP", "TO",
]

data_path = pathlib.Path("data")
if not data_path.exists():
    data_path.mkdir()

for UF in estados:
    # Build the target with data_path instead of the original hard-coded
    # "data\\" literal, which only produced a valid path on Windows.
    download_file(
        "https://dadosabertos.ibama.gov.br/dados/SICAFI/"
        + UF
        + "/Quantidade/multasDistribuidasBensTutelados.csv",
        filename=str(data_path / f"{UF}.csv"),
        progress=True,
    )
import pathlib

# Brazilian state (UF) abbreviations for IBAMA's CTF/APP per-state files.
estados = [
    "AC", "AL", "AM", "AP", "BA", "CE", "DF", "ES", "GO", "MA", "MT", "MS",
    "MG", "PA", "PB", "PR", "PE", "PI", "RJ", "RN", "RO", "RS", "RR", "SC",
    "SE", "SP", "TO",
]

# Build all paths with pathlib instead of the original hard-coded "data\\"
# literals, which only produced valid paths on Windows.
data_path = pathlib.Path("data")
if not data_path.exists():
    data_path.mkdir()
CTF_path = data_path / "CTF"
if not CTF_path.exists():
    CTF_path.mkdir()
AIDA_path = data_path / "AIDA"
if not AIDA_path.exists():
    AIDA_path.mkdir()

for UF in estados:
    download_file(
        "http://dadosabertos.ibama.gov.br/dados/CTF/APP/" + UF + "/pessoasJuridicas.csv",
        filename=str(CTF_path / f"CTF-{UF}.csv"),
        progress=True,
    )

# The AIDA file is national (its URL and target filename do not depend on
# UF), so fetch it once here — the original downloaded the same file to
# the same destination on every loop iteration.
download_file(
    "http://dadosabertos.ibama.gov.br/dados/CTF/AIDA/pessoasJuridicas.csv",
    filename=str(AIDA_path / "AIDA.csv"),
    progress=True,
)
) for row in table: result.append(convert_row(row)) return rows.import_from_dicts(result) if __name__ == "__main__": from argparse import ArgumentParser from pathlib import Path from rows.utils import download_file parser = ArgumentParser() parser.add_argument( "--url", default="http://www.transparencia.am.gov.br/arquivos/2014/158_201404.pdf", ) args = parser.parse_args() url = args.url pdf_filename = Path(url).name csv_filename = pdf_filename.replace(".pdf", ".csv") download_file(url, pdf_filename, progress=True) print("Parsing PDF...") table = parse_file(pdf_filename) print("Exporting to CSV...") rows.export_to_csv(table, csv_filename)