Example #1
import argparse
import datetime

from tqdm import tqdm
from rows.utils import CsvLazyDictWriter

# ElasticSearch (the API client), convert_row and DOWNLOAD_PATH are assumed
# to be defined in the project's own modules, which this snippet omits.


def main():
    dt = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%S")

    parser = argparse.ArgumentParser()
    parser.add_argument("--username", default="user-api-leitos")
    parser.add_argument("--password", default="aQbLL3ZStaTr38tj")
    parser.add_argument("--api-url",
                        default="https://elastic-leitos.saude.gov.br/")
    parser.add_argument("--index", default="leito_ocupacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--output-filename",
                        default=DOWNLOAD_PATH / f"ocupacao-{dt}.csv")
    args = parser.parse_args()

    es = ElasticSearch(args.api_url)
    iterator = es.paginate(
        index=args.index,
        sort_by="dataNotificacaoOcupacao",
        user=args.username,
        password=args.password,
        ttl=args.ttl,
    )

    writer = CsvLazyDictWriter(args.output_filename)
    progress = tqdm(unit_scale=True)
    for page_number, page in enumerate(iterator, start=1):
        progress.desc = f"Downloading page {page_number}"
        for row in page["hits"]["hits"]:
            writer.writerow(convert_row(row["_source"]))
            progress.update()
    writer.close()
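
All examples on this page revolve around CsvLazyDictWriter from rows.utils. As a rough mental model (a minimal sketch, not the actual implementation, which also accepts already-open file objects and compressed filenames), it infers the CSV header lazily from the first row written:

import csv

class LazyDictWriterSketch:
    """Minimal stand-in: the header comes from the first row's keys."""

    def __init__(self, filename):
        self.filename = filename
        self._fobj, self._writer = None, None

    def writerow(self, row):
        if self._writer is None:  # First row: open the file, write the header
            self._fobj = open(self.filename, mode="w", newline="")
            self._writer = csv.DictWriter(self._fobj, fieldnames=list(row))
            self._writer.writeheader()
        self._writer.writerow(row)

    def close(self):
        if self._fobj is not None:
            self._fobj.close()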
Example #2
import argparse
import datetime

from tqdm import tqdm
from rows.utils import CsvLazyDictWriter

# As in Example #1, ElasticSearch, convert_row and DOWNLOAD_PATH are assumed
# to come from the project's own modules.


def main():
    dt = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%S")

    parser = argparse.ArgumentParser()
    parser.add_argument("--username", default="user-api-leitos")
    parser.add_argument("--password", default="aQbLL3ZStaTr38tj")
    parser.add_argument("--api-url",
                        default="https://elastic-leitos.saude.gov.br/")
    parser.add_argument("--index", default="leito_ocupacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--output-filename",
                        default=DOWNLOAD_PATH / f"ocupacao-{dt}.csv")
    args = parser.parse_args()

    es = ElasticSearch(
        args.api_url,
        username=args.username,
        password=args.password,
    )
    iterator = es.search(
        index=args.index,
        sort_by="dataNotificacaoOcupacao",
        ttl=args.ttl,
    )

    writer = CsvLazyDictWriter(args.output_filename)
    for row in tqdm(iterator, unit_scale=True):
        writer.writerow(convert_row(row))
    writer.close()
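
Compared to Example #1, this version passes the credentials to the client's constructor and uses search(), which yields rows directly instead of whole pages. The ElasticSearch helper itself is not shown on this page; judging by the calls above, it likely wraps the official Elasticsearch scroll API, roughly like this (a guessed sketch; every name and signature here is inferred from usage, not taken from the real module):

import requests

class ElasticSearchSketch:
    def __init__(self, api_url, username=None, password=None):
        self.api_url = api_url.rstrip("/")
        self.auth = (username, password) if username is not None else None

    def paginate(self, index, sort_by, user=None, password=None, ttl="10m"):
        """Yield raw result pages, keeping the scroll cursor alive for `ttl`."""
        auth = (user, password) if user is not None else self.auth
        response = requests.get(
            f"{self.api_url}/{index}/_search",
            auth=auth,
            params={"scroll": ttl},
            json={"sort": [sort_by], "size": 10_000},
        )
        page = response.json()
        while page["hits"]["hits"]:
            yield page
            response = requests.get(
                f"{self.api_url}/_search/scroll",
                auth=auth,
                json={"scroll": ttl, "scroll_id": page["_scroll_id"]},
            )
            page = response.json()

    def search(self, index, sort_by, ttl="10m"):
        """Flatten pages into plain row dicts (what Example #2 consumes)."""
        for page in self.paginate(index, sort_by=sort_by, ttl=ttl):
            for hit in page["hits"]["hits"]:
                yield hit["_source"]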
Example #3
import argparse
import csv
from pathlib import Path

from tqdm import tqdm
from rows.utils import CsvLazyDictWriter, open_compressed

# get_latest_url_and_date, download_file_curl, convert_row_uncensored and
# censor are assumed to be project-specific helpers not shown here.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--chunk-size", type=int, default=1_024 * 1_024)
    parser.add_argument("--refresh-count", type=int, default=10_000)
    parser.add_argument("--input-encoding", type=str, default="utf-8")
    parser.add_argument("--connections", type=int, default=8)
    parser.add_argument("--preserve-raw", action="store_true")
    parser.add_argument("--buffering", type=int, default=8 * 1024 * 1024)
    args = parser.parse_args()

    # TODO: add an option to choose which of the 3 possible CSVs the script
    # should generate.
    # TODO: send logger output to a file instead of stdout/stderr
    # TODO: add an option to keep (or not) the original (compressed) CSV

    url, date = get_latest_url_and_date()
    output_path = Path(__file__).parent / "data" / "output"
    filename_raw = output_path / f"microdados_vacinacao-raw-{date}.csv.xz"
    filename_censored = output_path / "microdados_vacinacao.csv.gz"
    filename_uncensored = output_path / "microdados_vacinacao-uncensored.csv.gz"
    if not output_path.exists():
        output_path.mkdir(parents=True)

    download_file_curl(url, filename_raw)

    with open_compressed(filename_raw) as fobj:
        fobj_censored = open_compressed(filename_censored, mode="w", buffering=args.buffering)
        writer_censored = CsvLazyDictWriter(fobj_censored)
        censored_writerow = writer_censored.writerow

        fobj_uncensored = open_compressed(filename_uncensored, mode="w", buffering=args.buffering)
        writer_uncensored = CsvLazyDictWriter(fobj_uncensored)
        uncensored_writerow = writer_uncensored.writerow

        reader = csv.DictReader(fobj, delimiter=";")
        for row in tqdm(reader, unit_scale=True, unit="row"):
            row = convert_row_uncensored(row)
            uncensored_writerow(row)
            censor(row)
            censored_writerow(row)
        writer_censored.close()
        writer_uncensored.close()

    if not args.preserve_raw:
        filename_raw.unlink()
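
open_compressed (from rows.utils) picks a codec based on the file extension, which is why the script can read the downloaded .xz file and write .gz files through the same call. A minimal stand-in covering the extensions used above (assumptions: text mode throughout, and the buffering argument only honored for plain files):

import gzip
import lzma
from pathlib import Path

def open_compressed_sketch(filename, mode="r", buffering=-1, encoding="utf-8"):
    suffix = Path(filename).suffix
    if suffix == ".gz":
        return gzip.open(filename, mode=mode + "t", encoding=encoding)
    elif suffix == ".xz":
        return lzma.open(filename, mode=mode + "t", encoding=encoding)
    return open(filename, mode=mode, buffering=buffering, encoding=encoding)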
Example #4
import argparse
import csv

from tqdm import tqdm
from rows.utils import CsvLazyDictWriter, open_compressed

# ElasticSearch, convert_row_censored and convert_row_uncensored are assumed
# to be project-specific helpers not shown here.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw", action="store_true")
    parser.add_argument("--no-censorship", action="store_true")
    parser.add_argument("--username", default="imunizacao_public")
    parser.add_argument("--password", default="qlto5t&7r_@+#Tlstigi")
    parser.add_argument("--api-url",
                        default="https://imunizacao-es.saude.gov.br/")
    parser.add_argument("--index", default="desc-imunizacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--input-filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()

    convert_row = convert_row_censored
    if args.raw:
        convert_row = lambda row: row
    elif args.no_censorship:
        convert_row = convert_row_uncensored

    writer = CsvLazyDictWriter(args.output_filename)

    if args.input_filename:  # Use local CSV
        with open_compressed(args.input_filename) as in_fobj:
            reader = csv.DictReader(in_fobj)
            for row in tqdm(reader, unit_scale=True):
                writer.writerow(convert_row(row))

    else:  # Get data from ElasticSearch API
        es = ElasticSearch(args.api_url)
        iterator = es.paginate(
            index=args.index,
            sort_by="@timestamp",
            user=args.username,
            password=args.password,
            ttl=args.ttl,
        )
        progress = tqdm(unit_scale=True)
        for page_number, page in enumerate(iterator, start=1):
            progress.desc = f"Downloading page {page_number}"
            for row in page["hits"]["hits"]:
                writer.writerow(convert_row(row["_source"]))
                progress.update()

    writer.close()
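
Three output modes are selected by the flags above: by default each row goes through convert_row_censored; --no-censorship converts the rows without the censorship step; --raw writes whatever the API (or the local CSV) returns, untouched. Passing --input-filename processes a previously downloaded dump instead of hitting the API, e.g. (script name hypothetical):

python convert.py --input-filename dump.csv.gz output.csv.gz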
Example #5
import argparse
import csv
from functools import partial
from itertools import groupby

from tqdm import tqdm
from rows.utils import CsvLazyDictWriter, open_compressed

# ElasticSearchConsumer, convert_rows, convert_row_censored and
# convert_row_uncensored are assumed to be project-specific helpers that this
# snippet does not show.


def merge_files(filenames, output_filename):
    # itertools.groupby only groups consecutive items, so `filenames` must
    # already be sorted (by name, hence by date) for this grouping to work.
    groups = groupby(
        filenames,
        key=lambda row: row.name.split("T")[0].replace("ocupacao-", ""))
    progress = tqdm()
    writer = CsvLazyDictWriter(output_filename)
    for index, (date, group) in enumerate(groups, start=1):
        progress.desc = f"Processing file {index}"
        group = sorted(group)
        filename = group[-1]  # Process only the last file per day
        dt = filename.name.split("ocupacao-")[1].split(".csv")[0]
        base_row = {"datahora": dt}
        with open(filename) as fobj:
            reader = csv.DictReader(fobj)
            for row in reader:
                new = base_row.copy()
                new.update({key.lower(): value for key, value in row.items()})
                writer.writerow(new)
                progress.update()
    progress.close()
    writer.close()


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw", action="store_true")
    parser.add_argument("--no-censorship", action="store_true")
    parser.add_argument("--username", default="imunizacao_public")
    parser.add_argument("--password", default="qlto5t&7r_@+#Tlstigi")
    parser.add_argument("--api-url",
                        default="https://imunizacao-es.saude.gov.br/")
    parser.add_argument("--index", default="desc-imunizacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--input-filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()

    convert_row = convert_row_censored
    if args.raw:
        convert_row = lambda row: row
    elif args.no_censorship:
        convert_row = convert_row_uncensored

    if args.input_filename:  # Use local CSV
        writer = CsvLazyDictWriter(args.output_filename)
        with open_compressed(args.input_filename) as in_fobj:
            reader = csv.DictReader(in_fobj)
            for row in tqdm(reader, unit_scale=True):
                writer.writerow(convert_row(row))
        writer.close()

    else:  # Get data from ElasticSearch API

        ElasticSearchConsumer(
            api_url=args.api_url,
            index_name=args.index,
            sort_by="@timestamp",
            username=args.username,
            password=args.password,
            convert_function=partial(convert_rows, convert_row),
            output_filename=args.output_filename,
        ).run()
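
ElasticSearchConsumer appears to package the download loop that Example #4 spells out by hand (paginate over the index, convert each page's rows, write them to the output CSV) behind a single run() call; convert_rows is presumably a page-level wrapper, so partial(convert_rows, convert_row) fixes which per-row conversion it applies.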
Example #7
import argparse
import csv

from tqdm import tqdm
from rows.utils import CsvLazyDictWriter, open_compressed


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("empresa_csv_filename")
    parser.add_argument("cnae_secundaria_csv_filename")
    parser.add_argument("output_csv_filename")
    args = parser.parse_args()

    writer = CsvLazyDictWriter(args.output_csv_filename)

    fobj = open_compressed(args.empresa_csv_filename)
    reader = csv.DictReader(fobj)
    for row in tqdm(reader):
        writer.writerow(
            {"cnpj": row["cnpj"], "cnae": row["cnae_fiscal"], "primaria": "t"})
    fobj.close()

    fobj = open_compressed(args.cnae_secundaria_csv_filename)
    reader = csv.DictReader(fobj)
    for row in tqdm(reader):
        writer.writerow(
            {"cnpj": row["cnpj"], "cnae": row["cnae"], "primaria": "f"})
    fobj.close()

    writer.close()
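
The output here is a long-format table: each CNPJ/CNAE pair becomes one row, with the "primaria" column flagging "t" for the primary activity (taken from the companies file) and "f" for secondary ones, so both input files feed a single schema.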