def main():
    dt = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%S")
    parser = argparse.ArgumentParser()
    parser.add_argument("--username", default="user-api-leitos")
    parser.add_argument("--password", default="aQbLL3ZStaTr38tj")
    parser.add_argument("--api-url", default="https://elastic-leitos.saude.gov.br/")
    parser.add_argument("--index", default="leito_ocupacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--output-filename", default=DOWNLOAD_PATH / f"ocupacao-{dt}.csv")
    args = parser.parse_args()

    es = ElasticSearch(args.api_url)
    iterator = es.paginate(
        index=args.index,
        sort_by="dataNotificacaoOcupacao",
        user=args.username,
        password=args.password,
        ttl=args.ttl,
    )
    writer = CsvLazyDictWriter(args.output_filename)
    progress = tqdm(unit_scale=True)
    for page_number, page in enumerate(iterator, start=1):
        progress.desc = f"Downloading page {page_number}"
        for row in page["hits"]["hits"]:
            writer.writerow(convert_row(row["_source"]))
            progress.update()
    writer.close()
def main():
    dt = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%S")
    parser = argparse.ArgumentParser()
    parser.add_argument("--username", default="user-api-leitos")
    parser.add_argument("--password", default="aQbLL3ZStaTr38tj")
    parser.add_argument("--api-url", default="https://elastic-leitos.saude.gov.br/")
    parser.add_argument("--index", default="leito_ocupacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--output-filename", default=DOWNLOAD_PATH / f"ocupacao-{dt}.csv")
    args = parser.parse_args()

    es = ElasticSearch(
        args.api_url,
        username=args.username,
        password=args.password,
    )
    iterator = es.search(
        index=args.index,
        sort_by="dataNotificacaoOcupacao",
        ttl=args.ttl,
    )
    writer = CsvLazyDictWriter(args.output_filename)
    for row in tqdm(iterator, unit_scale=True):
        writer.writerow(convert_row(row))
    writer.close()
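# `convert_row` is called above but defined elsewhere in this script. A minimal
# illustrative sketch, assuming the only cleanup needed is lowercasing the keys
# of the ElasticSearch `_source` document; the actual helper may also rename or
# drop fields.
def convert_row(row):
    return {key.lower(): value for key, value in row.items()}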
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--chunk-size", type=int, default=1_024 * 1_024)
    parser.add_argument("--refresh-count", type=int, default=10_000)
    parser.add_argument("--input-encoding", type=str, default="utf-8")
    parser.add_argument("--connections", type=int, default=8)
    parser.add_argument("--preserve-raw", action="store_true")
    parser.add_argument("--buffering", type=int, default=8 * 1024 * 1024)
    args = parser.parse_args()
    # TODO: add an option to choose which of the 3 possible CSVs the script
    # will generate.
    # TODO: send logger output to a file instead of stdout/stderr
    # TODO: add an option to keep (or not) the original (compressed) CSV

    url, date = get_latest_url_and_date()
    output_path = Path(__file__).parent / "data" / "output"
    filename_raw = output_path / f"microdados_vacinacao-raw-{date}.csv.xz"
    filename_censored = output_path / "microdados_vacinacao.csv.gz"
    filename_uncensored = output_path / "microdados_vacinacao-uncensored.csv.gz"
    if not output_path.exists():
        output_path.mkdir(parents=True)
    download_file_curl(url, filename_raw)

    with open_compressed(filename_raw) as fobj:
        fobj_censored = open_compressed(filename_censored, mode="w", buffering=args.buffering)
        writer_censored = CsvLazyDictWriter(fobj_censored)
        censored_writerow = writer_censored.writerow
        fobj_uncensored = open_compressed(filename_uncensored, mode="w", buffering=args.buffering)
        writer_uncensored = CsvLazyDictWriter(fobj_uncensored)
        uncensored_writerow = writer_uncensored.writerow
        refresh_count = args.refresh_count
        reader = csv.DictReader(fobj, delimiter=";")
        for counter, row in tqdm(enumerate(reader), unit_scale=True, unit="row"):
            row = convert_row_uncensored(row)
            uncensored_writerow(row)
            censor(row)
            censored_writerow(row)

    writer_censored.close()
    writer_uncensored.close()
    if not args.preserve_raw:
        filename_raw.unlink()
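# `censor`, `convert_row_uncensored`, `download_file_curl` and
# `get_latest_url_and_date` are defined elsewhere in this script. A minimal
# sketch of `censor` only, assuming censorship means blanking
# patient-identifying columns in place; the column names below are
# hypothetical, not the script's actual list.
CENSORED_FIELDS = ("paciente_id", "paciente_nome")  # illustrative names only

def censor(row):
    # Mutate the row in place so the censored writer sees blanked values
    for field in CENSORED_FIELDS:
        if field in row:
            row[field] = ""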
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw", action="store_true")
    parser.add_argument("--no-censorship", action="store_true")
    parser.add_argument("--username", default="imunizacao_public")
    parser.add_argument("--password", default="qlto5t&7r_@+#Tlstigi")
    parser.add_argument("--api-url", default="https://imunizacao-es.saude.gov.br/")
    parser.add_argument("--index", default="desc-imunizacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--input-filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()

    convert_row = convert_row_censored
    if args.raw:
        convert_row = lambda row: row
    elif args.no_censorship:
        convert_row = convert_row_uncensored

    writer = CsvLazyDictWriter(args.output_filename)
    if args.input_filename:
        # Use local CSV
        with open_compressed(args.input_filename) as in_fobj:
            reader = csv.DictReader(in_fobj)
            for row in tqdm(reader, unit_scale=True):
                writer.writerow(convert_row(row))
    else:
        # Get data from ElasticSearch API
        es = ElasticSearch(args.api_url)
        iterator = es.paginate(
            index=args.index,
            sort_by="@timestamp",
            user=args.username,
            password=args.password,
            ttl=args.ttl,
        )
        progress = tqdm(unit_scale=True)
        for page_number, page in enumerate(iterator, start=1):
            progress.desc = f"Downloading page {page_number}"
            for row in page["hits"]["hits"]:
                writer.writerow(convert_row(row["_source"]))
                progress.update()
    writer.close()
def merge_files(filenames, output_filename):
    groups = groupby(
        filenames,
        key=lambda row: row.name.split("T")[0].replace("ocupacao-", ""),
    )
    progress = tqdm()
    writer = CsvLazyDictWriter(output_filename)
    for index, (date, group) in enumerate(groups, start=1):
        progress.desc = f"Processing file {index}"
        group = sorted(group)
        filename = group[-1]  # Process only the last file per day
        dt = filename.name.split("ocupacao-")[1].split(".csv")[0]
        base_row = {"datahora": dt}
        with open(filename) as fobj:
            reader = csv.DictReader(fobj)
            for row in reader:
                new = base_row.copy()
                new.update({key.lower(): value for key, value in row.items()})
                writer.writerow(new)
                progress.update()
    progress.close()
    writer.close()
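# Hypothetical usage sketch for `merge_files`: `itertools.groupby` only groups
# consecutive items, so the per-day grouping above assumes `filenames` is
# already sorted. `DOWNLOAD_PATH` is the directory the download script writes
# to; the output filename here is illustrative.
def merge_all():
    filenames = sorted(DOWNLOAD_PATH.glob("ocupacao-*.csv"))
    merge_files(filenames, DOWNLOAD_PATH / "ocupacao.csv")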
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw", action="store_true")
    parser.add_argument("--no-censorship", action="store_true")
    parser.add_argument("--username", default="imunizacao_public")
    parser.add_argument("--password", default="qlto5t&7r_@+#Tlstigi")
    parser.add_argument("--api-url", default="https://imunizacao-es.saude.gov.br/")
    parser.add_argument("--index", default="desc-imunizacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--input-filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()

    convert_row = convert_row_censored
    if args.raw:
        convert_row = lambda row: row
    elif args.no_censorship:
        convert_row = convert_row_uncensored

    if args.input_filename:
        # Use local CSV
        writer = CsvLazyDictWriter(args.output_filename)
        with open_compressed(args.input_filename) as in_fobj:
            reader = csv.DictReader(in_fobj)
            for row in tqdm(reader, unit_scale=True):
                writer.writerow(convert_row(row))
        writer.close()
    else:
        # Get data from ElasticSearch API
        ElasticSearchConsumer(
            api_url=args.api_url,
            index_name=args.index,
            sort_by="@timestamp",
            username=args.username,
            password=args.password,
            convert_function=partial(convert_rows, convert_row),
            output_filename=args.output_filename,
        ).run()
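# `convert_rows` is bound with `partial(convert_rows, convert_row)` above, so
# it must take the per-row converter as its first argument. A minimal sketch,
# assuming `ElasticSearchConsumer` passes it the list of hits from each page;
# the real helper may receive the whole page dict instead.
def convert_rows(convert_row, hits):
    return [convert_row(hit["_source"]) for hit in hits]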
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("empresa_csv_filename")
    parser.add_argument("cnae_secundaria_csv_filename")
    parser.add_argument("output_csv_filename")
    args = parser.parse_args()

    writer = CsvLazyDictWriter(args.output_csv_filename)

    # Primary CNAE: one row per company, taken from the "empresa" file
    fobj = open_compressed(args.empresa_csv_filename)
    reader = csv.DictReader(fobj)
    for row in tqdm(reader):
        writer.writerow({"cnpj": row["cnpj"], "cnae": row["cnae_fiscal"], "primaria": "t"})
    fobj.close()

    # Secondary CNAEs: zero or more rows per company, from the "cnae_secundaria" file
    fobj = open_compressed(args.cnae_secundaria_csv_filename)
    reader = csv.DictReader(fobj)
    for row in tqdm(reader):
        writer.writerow({"cnpj": row["cnpj"], "cnae": row["cnae"], "primaria": "f"})
    fobj.close()

    writer.close()