Example #1
def main():
    dt = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%S")

    parser = argparse.ArgumentParser()
    parser.add_argument("--username", default="user-api-leitos")
    parser.add_argument("--password", default="aQbLL3ZStaTr38tj")
    parser.add_argument("--api-url",
                        default="https://elastic-leitos.saude.gov.br/")
    parser.add_argument("--index", default="leito_ocupacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--output-filename",
                        default=DOWNLOAD_PATH / f"ocupacao-{dt}.csv")
    args = parser.parse_args()

    es = ElasticSearch(args.api_url)
    iterator = es.paginate(
        index=args.index,
        sort_by="dataNotificacaoOcupacao",
        user=args.username,
        password=args.password,
        ttl=args.ttl,
    )

    writer = CsvLazyDictWriter(args.output_filename)
    progress = tqdm(unit_scale=True)
    for page_number, page in enumerate(iterator, start=1):
        progress.desc = f"Downloading page {page_number}"
        for row in page["hits"]["hits"]:
            writer.writerow(convert_row(row["_source"]))
            progress.update()
    writer.close()
Example #2
def main():
    parser = ArgumentParser()
    parser.add_argument("socio_filename")
    parser.add_argument("empresa_filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()

    holdings_it = filter_csv(
        args.socio_filename,
        lambda row: row["identificador_de_socio"] == "1",
        convert_function=convert_socio,
        progress=True,
    )
    holdings = {row["cnpj"]: row for row in holdings_it}

    cnpjs = set(holdings.keys())
    company_names_it = filter_csv(
        args.empresa_filename,
        lambda row: row["cnpj"] in cnpjs,
        convert_function=convert_empresa,
        progress=True,
    )
    company_names = {
        row["cnpj"]: row["razao_social"]
        for row in company_names_it
    }

    fobj_writer = open_compressed(args.output_filename, mode="w")
    csv_writer = CsvLazyDictWriter(fobj_writer)
    for holding in tqdm(holdings.values(), desc="Writing output file"):
        holding["razao_social"] = company_names.get(holding["cnpj"], "")
        csv_writer.writerow(holding)
    fobj_writer.close()
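
This filter_csv variant returns an iterator of converted rows (a writer-based version of the same helper appears in Example #8). A minimal sketch of the shape a convert function such as convert_socio presumably has; the field handling below is hypothetical, not the project's actual implementation:

def convert_socio(row):
    # Hypothetical: normalize the raw CSV dict into the fields main() uses
    # (the filter reads identificador_de_socio; the join reads cnpj).
    return {
        "cnpj": row["cnpj"].strip(),
        "identificador_de_socio": row["identificador_de_socio"].strip(),
    }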
Example #3
def main():
    base_path = Path(__file__).parent
    output_path = base_path / "data" / "output"

    parser = ArgumentParser()
    parser.add_argument("output_path", default=str(output_path))
    parser.add_argument("input_filenames", nargs="+")
    parser.add_argument("--no_censorship", action="store_true")
    args = parser.parse_args()

    input_encoding = "latin1"
    input_filenames = args.input_filenames
    output_path = Path(args.output_path)
    if not output_path.exists():
        output_path.mkdir(parents=True)
    error_filename = output_path / "error.csv.gz"
    censorship = not args.no_censorship

    row_types = {
        "0": {
            "header_filename": "headers/header.csv",
            "output_filename": output_path / "header.csv.gz",
            "transform_function": lambda row: [row],
        },
        "1": {
            "header_filename": "headers/empresa.csv",
            "output_filename": output_path / "empresa.csv.gz",
            "transform_function": transform_empresa,
        },
        "2": {
            "header_filename": "headers/socio.csv",
            "output_filename": output_path / "socio.csv.gz",
            "transform_function": transform_socio,
        },
        "6": {
            "header_filename": "headers/cnae-secundaria.csv",
            "output_filename": output_path / "cnae-secundaria.csv.gz",
            "transform_function": transform_cnae_secundaria,
        },
        "9": {
            "header_filename": "headers/trailler.csv",
            "output_filename": output_path / "trailler.csv.gz",
            "transform_function": lambda row: [row],
        },
    }
    header_definitions, output_writers, transform_functions = {}, {}, {}
    for row_type, data in row_types.items():
        header_definitions[row_type] = read_header(data["header_filename"])
        output_writers[row_type] = CsvLazyDictWriter(data["output_filename"])
        transform_functions[row_type] = data["transform_function"]
    extract_files(
        filenames=input_filenames,
        header_definitions=header_definitions,
        transform_functions=transform_functions,
        output_writers=output_writers,
        error_filename=error_filename,
        input_encoding=input_encoding,
        censorship=censorship,
    )
Example #4
def main():
    dt = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%S")

    parser = argparse.ArgumentParser()
    parser.add_argument("--username", default="user-api-leitos")
    parser.add_argument("--password", default="aQbLL3ZStaTr38tj")
    parser.add_argument("--api-url",
                        default="https://elastic-leitos.saude.gov.br/")
    parser.add_argument("--index", default="leito_ocupacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--output-filename",
                        default=DOWNLOAD_PATH / f"ocupacao-{dt}.csv")
    args = parser.parse_args()

    es = ElasticSearch(
        args.api_url,
        username=args.username,
        password=args.password,
    )
    iterator = es.search(
        index=args.index,
        sort_by="dataNotificacaoOcupacao",
        ttl=args.ttl,
    )

    writer = CsvLazyDictWriter(args.output_filename)
    for row in tqdm(iterator, unit_scale=True):
        writer.writerow(convert_row(row))
    writer.close()
Example #5
    # Excerpt: the constructor of the consumer class instantiated as
    # ElasticSearchConsumer in Example #12; its base class and run()
    # method are not shown here.
    def __init__(
        self,
        api_url,
        index_name,
        sort_by,
        convert_function,
        output_filename,
        username=None,
        password=None,
        ttl="10m",
        progress=True,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

        self.convert_function = convert_function
        self.es = ElasticSearch(
            api_url,
            username=username,
            password=password,
        )
        self.iterator = self.es.search(
            index=index_name,
            sort_by=sort_by,
            ttl=ttl,
        )
        self.writer = CsvLazyDictWriter(output_filename)
        self.show_progress = progress
        if self.show_progress:
            self.progress = tqdm(unit_scale=True)
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--chunk-size", type=int, default=1_024 * 1_024)
    parser.add_argument("--refresh-count", type=int, default=10_000)
    parser.add_argument("--input-encoding", type=str, default="utf-8")
    parser.add_argument("--connections", type=int, default=8)
    parser.add_argument("--preserve-raw", action="store_true")
    parser.add_argument("--buffering", type=int, default=8 * 1024 * 1024)
    args = parser.parse_args()

    # TODO: add an option to select which of the 3 possible CSVs the script
    # will generate.
    # TODO: send logger output to a file instead of stdout/stderr
    # TODO: add an option to keep or discard the original (compressed) CSV

    url, date = get_latest_url_and_date()
    output_path = Path(__file__).parent / "data" / "output"
    filename_raw = output_path / f"microdados_vacinacao-raw-{date}.csv.xz"
    filename_censored = output_path / "microdados_vacinacao.csv.gz"
    filename_uncensored = output_path / "microdados_vacinacao-uncensored.csv.gz"
    if not output_path.exists():
        output_path.mkdir(parents=True)

    download_file_curl(url, filename_raw)

    with open_compressed(filename_raw) as fobj:
        fobj_censored = open_compressed(filename_censored, mode="w", buffering=args.buffering)
        writer_censored = CsvLazyDictWriter(fobj_censored)
        censored_writerow = writer_censored.writerow

        fobj_uncensored = open_compressed(filename_uncensored, mode="w", buffering=args.buffering)
        writer_uncensored = CsvLazyDictWriter(fobj_uncensored)
        uncensored_writerow = writer_uncensored.writerow

        refresh_count = args.refresh_count  # parsed but unused in this excerpt
        reader = csv.DictReader(fobj, delimiter=";")
        for row in tqdm(reader, unit_scale=True, unit="row"):
            row = convert_row_uncensored(row)
            uncensored_writerow(row)
            censor(row)
            censored_writerow(row)
        writer_censored.close()
        writer_uncensored.close()

    if not args.preserve_raw:
        filename_raw.unlink()
Example #7
def extract_files(
    filenames,
    header_definitions,
    transform_functions,
    output_writers,
    error_filename,
    input_encoding="latin1",
    censorship=True,
):
    """Extract files from a fixed-width file containing more than one row type

    `filenames` is expected to be a list of ZIP files having only one file
    inside each. The file is read and metadata inside `fobjs` is used to parse
    it and save the output files.
    """
    error_fobj = open_compressed(error_filename, mode="w", encoding="latin1")
    error_writer = CsvLazyDictWriter(error_fobj)

    for filename in filenames:
        # TODO: use another strategy to open this file (like using rows'
        # open_compressed when archive support is implemented)
        if os.path.isdir(filename):
            continue
        if not str(filename).endswith(".zip"):
            continue

        zf = ZipFile(filename)
        inner_filenames = zf.filelist
        assert (
            len(inner_filenames) == 1
        ), f"Only one file inside the zip is expected (got {len(inner_filenames)})"
        # XXX: The current approach of decoding here and then extracting
        # fixed-width data works only for encodings in which one character
        # is always one byte, such as latin1. If the encoding can represent
        # a character using more than one byte (like UTF-8), this approach
        # will produce incorrect results.
        fobj = TextIOWrapper(zf.open(inner_filenames[0]), encoding=input_encoding)
        for line in tqdm(fobj, desc=f"Extracting {filename}"):
            row_type = line[0]
            try:
                row = parse_row(header_definitions[row_type], line)
            except ParsingError as exception:
                error_writer.writerow(
                    {"error": exception.error, "line": exception.line}
                )
                continue
            data = transform_functions[row_type](row)
            for row in data:
                if censorship:  # Clear sensitive information
                    censor(row_type, row)
                output_writers[row_type].writerow(row)

        fobj.close()
        zf.close()

    error_fobj.close()
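
read_header and parse_row are project-specific helpers. As a rough sketch of how fixed-width parsing of this kind typically works, assuming (this is an assumption, not the project's format) that each header definition is a list of {"name": ..., "size": ...} dicts read from the headers/*.csv files:

def parse_row_sketch(header, line):
    # Hypothetical stand-in for parse_row: slice each fixed-width field
    # out of the line and strip the padding.
    row, offset = {}, 0
    for field in header:
        size = int(field["size"])
        row[field["name"]] = line[offset:offset + size].strip()
        offset += size
    return row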
Example #8
def filter_csv(input_filename,
               output_filename,
               filter_function,
               progress=True):
    fobj_reader = open_compressed(input_filename, mode="r")
    fobj_writer = open_compressed(output_filename, mode="w")
    csv_reader = DictReader(fobj_reader)
    csv_writer = CsvLazyDictWriter(fobj_writer)
    if progress:
        csv_reader = tqdm(csv_reader)
    for row in csv_reader:
        if filter_function(row):
            csv_writer.writerow(row)
    fobj_reader.close()
    fobj_writer.close()
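
A minimal usage sketch for the helper above; the file and column names are illustrative only, and open_compressed (from rows.utils) is assumed to pick the compression format from the file extension:

filter_csv(
    "empresas.csv.gz",
    "empresas-sp.csv.gz",
    lambda row: row["uf"] == "SP",  # hypothetical column name
    progress=True,
)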
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw", action="store_true")
    parser.add_argument("--no-censorship", action="store_true")
    parser.add_argument("--username", default="imunizacao_public")
    parser.add_argument("--password", default="qlto5t&7r_@+#Tlstigi")
    parser.add_argument("--api-url",
                        default="https://imunizacao-es.saude.gov.br/")
    parser.add_argument("--index", default="desc-imunizacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--input-filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()

    convert_row = convert_row_censored
    if args.raw:
        convert_row = lambda row: row
    elif args.no_censorship:
        convert_row = convert_row_uncensored

    writer = CsvLazyDictWriter(args.output_filename)

    if args.input_filename:  # Use local CSV
        with open_compressed(args.input_filename) as in_fobj:
            reader = csv.DictReader(in_fobj)
            for row in tqdm(reader, unit_scale=True):
                writer.writerow(convert_row(row))

    else:  # Get data from ElasticSearch API
        es = ElasticSearch(args.api_url)
        iterator = es.paginate(
            index=args.index,
            sort_by="@timestamp",
            user=args.username,
            password=args.password,
            ttl=args.ttl,
        )
        progress = tqdm(unit_scale=True)
        for page_number, page in enumerate(iterator, start=1):
            progress.desc = f"Downloading page {page_number}"
            for row in page["hits"]["hits"]:
                writer.writerow(convert_row(row["_source"]))
                progress.update()

    writer.close()
Example #10
def extract_data(ExtractorClass, year_range, output_filename, base_url,
                 force_redownload=False, download_only=False):
    extractor_name = ExtractorClass.__name__.replace("Extractor", "")
    extractor = ExtractorClass(base_url)
    writer = CsvLazyDictWriter(output_filename)
    for year in year_range:
        print(f"{extractor_name} {year}")

        print("  Downloading...", end="")
        result = extractor.download(year, force=force_redownload)
        if not result["downloaded"]:
            print(f" file has already been downloaded.")

        if not download_only:
            data = extractor.extract(year)
            for row in tqdm(data, desc="  Extracting..."):
                writer.writerow(row)

        print()
    writer.close()
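
extract_data implies a small extractor interface: ExtractorClass(base_url), download(year, force=...) returning a dict with a "downloaded" key, and extract(year) yielding row dicts. A hypothetical minimal extractor satisfying that interface, assuming the function above and its dependencies are in scope:

class SampleExtractor:
    # Entirely hypothetical; implements only what extract_data relies on.
    def __init__(self, base_url):
        self.base_url = base_url

    def download(self, year, force=False):
        # A real extractor would fetch something like
        # f"{self.base_url}/{year}" here and report what happened.
        return {"downloaded": False}

    def extract(self, year):
        yield {"year": year, "value": "example"}

extract_data(SampleExtractor, range(2019, 2021), "sample.csv",
             "https://example.com/")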
Example #11
def merge_files(filenames, output_filename):
    def day_of(filename):
        # Group key: the date part of names like "ocupacao-2021-03-01T...csv"
        return filename.name.split("T")[0].replace("ocupacao-", "")

    groups = groupby(filenames, key=day_of)
    progress = tqdm()
    writer = CsvLazyDictWriter(output_filename)
    for index, (date, group) in enumerate(groups, start=1):
        progress.desc = f"Processing file {index}"
        group = sorted(group)
        filename = group[-1]  # Process only the last file per day
        dt = filename.name.split("ocupacao-")[1].split(".csv")[0]
        base_row = {"datahora": dt}
        with open(filename) as fobj:
            reader = csv.DictReader(fobj)
            for row in reader:
                new = base_row.copy()
                new.update({key.lower(): value for key, value in row.items()})
                writer.writerow(new)
                progress.update()
    progress.close()
    writer.close()
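
A usage sketch, assuming snapshots named like the ocupacao-{datetime}.csv files downloaded in Examples #1 and #4 (the directory is hypothetical). Note that itertools.groupby only groups consecutive items, so the filenames must be sorted before being passed in:

from pathlib import Path

filenames = sorted(Path("data/download").glob("ocupacao-*.csv"))
merge_files(filenames, "ocupacao-merged.csv.gz")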
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw", action="store_true")
    parser.add_argument("--no-censorship", action="store_true")
    parser.add_argument("--username", default="imunizacao_public")
    parser.add_argument("--password", default="qlto5t&7r_@+#Tlstigi")
    parser.add_argument("--api-url",
                        default="https://imunizacao-es.saude.gov.br/")
    parser.add_argument("--index", default="desc-imunizacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--input-filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()

    convert_row = convert_row_censored
    if args.raw:
        convert_row = lambda row: row
    elif args.no_censorship:
        convert_row = convert_row_uncensored

    if args.input_filename:  # Use local CSV
        writer = CsvLazyDictWriter(args.output_filename)
        with open_compressed(args.input_filename) as in_fobj:
            reader = csv.DictReader(in_fobj)
            for row in tqdm(reader, unit_scale=True):
                writer.writerow(convert_row(row))
        writer.close()

    else:  # Get data from ElasticSearch API

        ElasticSearchConsumer(
            api_url=args.api_url,
            index_name=args.index,
            sort_by="@timestamp",
            username=args.username,
            password=args.password,
            convert_function=partial(convert_rows, convert_row),
            output_filename=args.output_filename,
        ).run()
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("empresa_csv_filename")
    parser.add_argument("cnae_secundaria_csv_filename")
    parser.add_argument("output_csv_filename")
    args = parser.parse_args()

    writer = CsvLazyDictWriter(args.output_csv_filename)

    fobj = open_compressed(args.empresa_csv_filename)
    reader = csv.DictReader(fobj)
    for row in tqdm(reader):
        writer.writerow(
            {"cnpj": row["cnpj"], "cnae": row["cnae_fiscal"], "primaria": "t"})
    fobj.close()

    fobj = open_compressed(args.cnae_secundaria_csv_filename)
    reader = csv.DictReader(fobj)
    for row in tqdm(reader):
        writer.writerow(
            {"cnpj": row["cnpj"], "cnae": row["cnae"], "primaria": "f"})
    fobj.close()

    writer.close()
Example #14
def write_csv(filename, iterator):
    writer = CsvLazyDictWriter(filename)
    for page in iterator:
        for row in page:
            writer.writerow(row)
    writer.close()
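
A minimal usage sketch with in-memory pages (hypothetical data). CsvLazyDictWriter takes its header from the first row written (hence "lazy"), so every row should share the same keys:

pages = [
    [{"id": "1", "name": "alice"}, {"id": "2", "name": "bob"}],
    [{"id": "3", "name": "carol"}],
]
write_csv("people.csv", pages)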