Example #1
def filter_csv(input_filename,
               output_filename,
               filter_function,
               progress=True):
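    # Read a (possibly compressed) CSV and write the rows accepted by
    # filter_function to a new compressed CSV, optionally with a progress bar.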
    fobj_reader = open_compressed(input_filename, mode="r")
    fobj_writer = open_compressed(output_filename, mode="w")
    csv_reader = DictReader(fobj_reader)
    csv_writer = CsvLazyDictWriter(fobj_writer)
    if progress:
        csv_reader = tqdm(csv_reader)
    for row in csv_reader:
        if filter_function(row):
            csv_writer.writerow(row)
    fobj_reader.close()
    fobj_writer.close()
Example #2
def extract_data(ExtractorClass,
                 year_range,
                 output_filename,
                 base_url,
                 force_redownload=False,
                 download_only=False):
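    # For each year, download the source file and write the extracted rows to
    # a single compressed CSV (extraction is skipped when download_only=True).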
    extractor_name = ExtractorClass.__name__.replace("Extractor", "")
    extractor = ExtractorClass(base_url)
    output_fobj = open_compressed(output_filename, mode="w", encoding="utf-8")
    writer = csv.DictWriter(
        output_fobj,
        fieldnames=list(extractor.schema.keys()),
    )
    writer.writeheader()
    for year in year_range:
        print(f"{extractor_name} {year}")

        print("  Downloading...", end="")
        result = extractor.download(year, force=force_redownload)
        if not result["downloaded"]:
            print(f" file has already been downloaded.")

        if not download_only:
            data = extractor.extract(year)
            for row in tqdm(data, desc="  Extracting..."):
                writer.writerow(row)

        print()
    output_fobj.close()
Example #3
def main():
    parser = ArgumentParser()
    parser.add_argument("socio_filename")
    parser.add_argument("empresa_filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()
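    # Index the filtered partner rows by CNPJ, then fetch the matching
    # razao_social from the companies file and attach it to each row.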

    holdings_it = filter_csv(
        args.socio_filename,
        lambda row: row["identificador_de_socio"] == "1",
        convert_function=convert_socio,
        progress=True,
    )
    holdings = {row["cnpj"]: row for row in holdings_it}

    cnpjs = set(holdings.keys())
    company_names_it = filter_csv(
        args.empresa_filename,
        lambda row: row["cnpj"] in cnpjs,
        convert_function=convert_empresa,
        progress=True,
    )
    company_names = {
        row["cnpj"]: row["razao_social"]
        for row in company_names_it
    }

    fobj_writer = open_compressed(args.output_filename, mode="w")
    csv_writer = CsvLazyDictWriter(fobj_writer)
    for holding in tqdm(holdings.values(), desc="Writing output file"):
        holding["razao_social"] = company_names.get(holding["cnpj"], "")
        csv_writer.writerow(holding)
    fobj_writer.close()
Example #4
def file_metadata(filename, chunk_size=8 * 1024 * 1024):
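    # Hash the file as stored on disk, then decompress it to count lines;
    # returns (sha1 hex digest, line count, size on disk, uncompressed size).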
    hasher = hashlib.sha1()

    with open(filename, mode="rb") as fobj, tqdm(unit_scale=True,
                                                 unit="B") as progress:
        finished = False
        while not finished:
            data = fobj.read(chunk_size)
            hasher.update(data)
            chunk_length = len(data)
            finished = chunk_length == 0
            progress.update(chunk_length)
        total_bytes = progress.n

    new_lines = 0
    with open_compressed(filename,
                         mode="rb") as fobj, tqdm(unit_scale=True,
                                                  unit="B") as progress:
        finished = False
        finish_with_new_line = False
        while not finished:
            data = fobj.read(chunk_size)
            new_lines += data.count(b"\n")
            chunk_length = len(data)
            finished = chunk_length == 0
            if not finished:
                finish_with_new_line = data.endswith(b"\n")
            progress.update(chunk_length)
        uncompressed_bytes = progress.n
        if not finish_with_new_line:
            new_lines += 1

    return hasher.hexdigest(), new_lines, total_bytes, uncompressed_bytes
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--chunk-size", type=int, default=1_024 * 1_024)
    parser.add_argument("--refresh-count", type=int, default=10_000)
    parser.add_argument("--input-encoding", type=str, default="utf-8")
    parser.add_argument("--connections", type=int, default=8)
    parser.add_argument("--preserve-raw", action="store_true")
    parser.add_argument("--buffering", type=int, default=8 * 1024 * 1024)
    args = parser.parse_args()

    # TODO: add an option to choose which of the 3 possible CSVs the script
    # will generate.
    # TODO: send logger output to a file instead of stdout/stderr
    # TODO: add an option to keep (or not) the original compressed CSV
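    # Download the latest raw file, then stream it into two compressed CSVs:
    # a censored one and an uncensored one.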

    url, date = get_latest_url_and_date()
    output_path = Path(__file__).parent / "data" / "output"
    filename_raw = output_path / f"microdados_vacinacao-raw-{date}.csv.xz"
    filename_censored = output_path / "microdados_vacinacao.csv.gz"
    filename_uncensored = output_path / "microdados_vacinacao-uncensored.csv.gz"
    if not output_path.exists():
        output_path.mkdir(parents=True)

    download_file_curl(url, filename_raw)

    with open_compressed(filename_raw) as fobj:
        fobj_censored = open_compressed(filename_censored, mode="w", buffering=args.buffering)
        writer_censored = CsvLazyDictWriter(fobj_censored)
        censored_writerow = writer_censored.writerow

        fobj_uncensored = open_compressed(filename_uncensored, mode="w", buffering=args.buffering)
        writer_uncensored = CsvLazyDictWriter(fobj_uncensored)
        uncensored_writerow = writer_uncensored.writerow

        refresh_count = args.refresh_count
        reader = csv.DictReader(fobj, delimiter=";")
        for counter, row in tqdm(enumerate(reader), unit_scale=True, unit="row"):
            row = convert_row_uncensored(row)
            uncensored_writerow(row)
            censor(row)
            censored_writerow(row)
        writer_censored.close()
        writer_uncensored.close()

    if not args.preserve_raw:
        filename_raw.unlink()
Example #6
    def load(self):
        fobj = open_compressed(self.filename, encoding="utf-8")
        reader = csv.DictReader(fobj)
        for row in reader:
            if Decimal(row["ratio"]) < 0.95:
                continue
            self.cache[row["first_name"]] = row["classification"]
        fobj.close()
Example #7
def extract_files(
    filenames,
    header_definitions,
    transform_functions,
    output_writers,
    error_filename,
    input_encoding="latin1",
    censorship=True,
):
    """Extract files from a fixed-width file containing more than one row type

    `filenames` is expected to be a list of ZIP files having only one file
    inside each. The file is read and metadata inside `fobjs` is used to parse
    it and save the output files.
    """
    error_fobj = open_compressed(error_filename, mode="w", encoding="latin1")
    error_writer = CsvLazyDictWriter(error_fobj)

    for filename in filenames:
        # TODO: use another strategy to open this file (like using rows'
        # open_compressed when archive support is implemented)
        if os.path.isdir(filename):
            continue
        if not str(filename).endswith('.zip'):
            continue

        zf = ZipFile(filename)
        inner_filenames = zf.filelist
        assert (
            len(inner_filenames) == 1
        ), f"Only one file inside the zip is expected (got {len(inner_filenames)})"
        # XXX: The current approach of decoding here and then extracting
        # fixed-width-file data will work only for encodings where 1 character is
        # represented by 1 byte, such as latin1. If the encoding can represent one
        # character using more than 1 byte (like UTF-8), this approach will
        # produce incorrect results.
        fobj = TextIOWrapper(zf.open(inner_filenames[0]), encoding=input_encoding)
        for line in tqdm(fobj, desc=f"Extracting {filename}"):
            row_type = line[0]
            try:
                row = parse_row(header_definitions[row_type], line)
            except ParsingError as exception:
                error_writer.writerow(
                    {"error": exception.error, "line": exception.line}
                )
                continue
            data = transform_functions[row_type](row)
            for row in data:
                if censorship:  # Clear sensitive information
                    censor(row_type, row)
                output_writers[row_type].writerow(row)

        fobj.close()
        zf.close()

    error_fobj.close()
Example #8
    def start_requests(self):
        # TODO: add option to change filename via parameter
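        # Read the previously collected links and request only the files that
        # were not downloaded yet.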
        links_filename = OUTPUT_PATH / "tce-link.csv"
        links = rows.import_from_csv(open_compressed(links_filename,
                                                     mode="rb"))
        links.order_by('-year')

        for item in links:
            filename = DOWNLOAD_PATH / item.url.split("/")[-1]
            if not filename.exists():  # TODO: add option to force redownload
                yield scrapy.Request(url=item.url, meta={"filename": filename})
Example #9
def read_data(filename):
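    # Build a nested dict {state: {date: row}}; the remaining columns are cast
    # to int, with empty values becoming None.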
    fobj = open_compressed(filename)
    data = defaultdict(dict)
    for row in csv.DictReader(fobj):
        state, date = row.pop("state"), row.pop("date")
        row = {
            key: int(value) if value else None
            for key, value in row.items()
        }
        data[state][date] = row
    fobj.close()
    return data
Example #10
def filter_csv(input_filename,
               filter_function,
               convert_function,
               progress=True):
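    # Lazily read the CSV and yield converted rows that pass filter_function,
    # optionally showing a progress bar with the file name.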
    fobj_reader = open_compressed(input_filename, mode="r")
    csv_reader = DictReader(fobj_reader)
    if progress:
        csv_reader = tqdm(csv_reader,
                          desc=f"Reading {Path(input_filename).name}")
    for row in csv_reader:
        if filter_function(row):
            yield convert_function(row)
    fobj_reader.close()
Example #11
    def start_requests(self):
        filename = glob.glob(str(OUTPUT_PATH / "ba-boletim.csv*"))[0]
        fobj = open_compressed(filename, encoding="utf8")
        for row in csv.DictReader(fobj):
            filename = DOWNLOAD_PATH / f'BA/{row["id_campanha"]}.pdf'
            if not filename.parent.exists():
                filename.parent.mkdir()
            row["filename"] = filename
            if filename.exists():
                url = "file://" + str(filename.absolute())
            else:
                url = row["url"]

            yield scrapy.Request(url=url, meta=row)
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("empresa_csv_filename")
    parser.add_argument("cnae_secundaria_csv_filename")
    parser.add_argument("output_csv_filename")
    args = parser.parse_args()
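    # Two passes: primary CNAEs from the companies file, then secondary CNAEs,
    # each output row flagged with primaria = "t" or "f".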

    writer = CsvLazyDictWriter(args.output_csv_filename)

    fobj = open_compressed(args.empresa_csv_filename)
    reader = csv.DictReader(fobj)
    for row in tqdm(reader):
        writer.writerow(
            {"cnpj": row["cnpj"], "cnae": row["cnae_fiscal"], "primaria": "t"})
    fobj.close()

    fobj = open_compressed(args.cnae_secundaria_csv_filename)
    reader = csv.DictReader(fobj)
    for row in tqdm(reader):
        writer.writerow(
            {"cnpj": row["cnpj"], "cnae": row["cnae"], "primaria": "f"})
    fobj.close()

    writer.close()
Example #13
    def start_requests(self):
        filename = glob.glob(str(OUTPUT_PATH / "sc-boletim.csv*"))[0]
        table = rows.import_from_csv(open_compressed(filename, mode="rb"))
        for row in table:
            row = row._asdict()
            pdf_url = row.pop("pdf_url")
            row["filename"] = (
                DOWNLOAD_PATH /
                f'SC/{row["ano"]}-{row["municipio_id"]}-{row["balneario_id"]}.pdf'
            )
            if not row["filename"].parent.exists():
                row["filename"].parent.mkdir()
            if not row["filename"].exists():
                url = pdf_url
            else:
                url = "file://" + str(row["filename"].absolute())
            yield scrapy.Request(url=url, meta=row, callback=self.parse_pdf)
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw", action="store_true")
    parser.add_argument("--no-censorship", action="store_true")
    parser.add_argument("--username", default="imunizacao_public")
    parser.add_argument("--password", default="qlto5t&7r_@+#Tlstigi")
    parser.add_argument("--api-url",
                        default="https://imunizacao-es.saude.gov.br/")
    parser.add_argument("--index", default="desc-imunizacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--input-filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()
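    # Decide how much censorship to apply to each row, then export either from
    # a local CSV or by paginating the ElasticSearch API.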

    convert_row = convert_row_censored
    if args.raw:
        convert_row = lambda row: row
    elif args.no_censorship:
        convert_row = convert_row_uncensored

    writer = CsvLazyDictWriter(args.output_filename)

    if args.input_filename:  # Use local CSV
        with open_compressed(args.input_filename) as in_fobj:
            reader = csv.DictReader(in_fobj)
            for row in tqdm(reader, unit_scale=True):
                writer.writerow(convert_row(row))

    else:  # Get data from ElasticSearch API
        es = ElasticSearch(args.api_url)
        iterator = es.paginate(
            index=args.index,
            sort_by="@timestamp",
            user=args.username,
            password=args.password,
            ttl=args.ttl,
        )
        progress = tqdm(unit_scale=True)
        for page_number, page in enumerate(iterator, start=1):
            progress.desc = f"Downloading page {page_number}"
            for row in page["hits"]["hits"]:
                writer.writerow(convert_row(row["_source"]))
                progress.update()

    writer.close()
Example #15
def extract_magistrados(filename, uf):
    for nome, sigla in TRIBUNAL_UF.items():
        if sigla == uf:
            tribunal = nome
            break
    else:
        raise ValueError(f"No tribunal found for UF {uf!r}")

    reader_fobj = open_compressed(filename, mode="r", encoding="utf-8")
    for row in csv.DictReader(reader_fobj):
        row_ano, row_tribunal = row["ano_de_referencia"], row["tribunal"]
        if row_ano in ("2017", "2018") and row_tribunal == tribunal:
            yield {
                "ano": row_ano,
                "cargo": row["cargo"],
                "instituicao": "TJE",
                "mes": row["mes_de_referencia"],
                "nome": row["nome"],
                "observacao": "",
                "rendimento_bruto": row["total_de_rendimentos"],
                "rendimento_liquido": row["rendimento_liquido"],
                "uf": TRIBUNAL_UF[row_tribunal],
            }
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw", action="store_true")
    parser.add_argument("--no-censorship", action="store_true")
    parser.add_argument("--username", default="imunizacao_public")
    parser.add_argument("--password", default="qlto5t&7r_@+#Tlstigi")
    parser.add_argument("--api-url",
                        default="https://imunizacao-es.saude.gov.br/")
    parser.add_argument("--index", default="desc-imunizacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--input-filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()

    convert_row = convert_row_censored
    if args.raw:
        convert_row = lambda row: row
    elif args.no_censorship:
        convert_row = convert_row_uncensored

    if args.input_filename:  # Use local CSV
        writer = CsvLazyDictWriter(args.output_filename)
        with open_compressed(args.input_filename) as in_fobj:
            reader = csv.DictReader(in_fobj)
            for row in tqdm(reader, unit_scale=True):
                writer.writerow(convert_row(row))
        writer.close()

    else:  # Get data from ElasticSearch API

        ElasticSearchConsumer(
            api_url=args.api_url,
            index_name=args.index,
            sort_by="@timestamp",
            username=args.username,
            password=args.password,
            convert_function=partial(convert_rows, convert_row),
            output_filename=args.output_filename,
        ).run()
Example #17
def read_csv(filename):
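    # Note: the file object is only closed after the generator is exhausted.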
    fobj = open_compressed(filename, mode="r", encoding="utf-8")
    yield from csv.DictReader(fobj)
    fobj.close()
Example #18
if __name__ == "__main__":
    import argparse
    import csv

    import rows
    from tqdm import tqdm

    import settings

    parser = argparse.ArgumentParser()
    parser.add_argument("--start_at")
    args = parser.parse_args()

    extractors = {"xls": XLSFileExtractor, "xlsx": XLSXFileExtractor}
    file_list = open_compressed(settings.OUTPUT_PATH / "planilha.csv.gz",
                                mode="rb")
    fobjs, writers = [], {}
    for sheet_name, info in SHEET_INFO.items():
        fobj = open_compressed(info["output_filename"],
                               mode="w",
                               encoding="utf-8")
        field_names = list(SHEET_INFO[sheet_name]["schema"].keys()) + [
            "tribunal",
            "mes_de_referencia",
            "mes_ano_de_referencia",
            "ano_de_referencia",
            "data_de_publicacao",
        ]
        writers[sheet_name] = csv.DictWriter(fobj, fieldnames=field_names)
        writers[sheet_name].writeheader()
        fobjs.append(fobj)
Example #19
def main():
    # The file pattern used to find the files may change in other operating
    # systems or running inside Docker (needs to share the volume). If that's
    # the case, just define the `--data_path` parameter.
    data_path = settings.ORE_DATA_PATH
    institutions = set(Extractor.institution
                       for Extractor in FileExtractor.registry())
    states = set(Extractor.state for Extractor in FileExtractor.registry())
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", default=data_path.absolute())
    parser.add_argument("--filename")
    parser.add_argument("--output")
    parser.add_argument("institution", choices=institutions)
    parser.add_argument("state", choices=states)
    args = parser.parse_args()
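    # Pick the extractor registered for this institution/state, resolve the
    # input filenames and write a consolidated CSV with gender classification.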
    institution, state = args.institution, args.state
    output_path = Path("data")
    data_path = Path(args.data_path)
    gender_filename = data_path / "nomes.csv.gz"

    Extractor = FileExtractor.get_child(state=args.state,
                                        institution=args.institution)
    if args.filename:
        filenames = [data_path / args.filename]
    else:
        patterns = Extractor.filename_pattern
        if isinstance(patterns, str):
            patterns = [patterns]
        filename_patterns = [data_path / pattern for pattern in patterns]
        filenames = sorted(
            Path(filename) for filename_pattern in filename_patterns
            for filename in glob.glob(str(filename_pattern)))
    output_filename = Path(
        args.output
        or output_path / f"ore-{institution.lower()}-{state.lower()}.csv.gz"
    )
    if not output_filename.parent.exists():
        output_filename.parent.mkdir(parents=True)
    desc = f"{institution} {state}"

    field_names = (
        "ano",
        "mes",
        "instituicao",
        "uf",
        "cargo",
        "nome",
        "genero",
        "rendimento_bruto",
        "rendimento_liquido",
        "observacao",
    )
    gender_classifier = NameClassifier(gender_filename)
    gender_classifier.load()
    fobj = open_compressed(output_filename, mode="w", encoding="utf-8")
    writer = csv.DictWriter(fobj, fieldnames=field_names)
    writer.writeheader()
    for filename in tqdm(filenames, desc=desc):
        extractor = Extractor(filename)
        for row in extractor.data:
            nome = row["nome"]
            row["genero"] = gender_classifier.classify(nome) if nome else ""
            writer.writerow(row)
        # TODO: should force types anywhere here or in FileExtractor?
    fobj.close()
Example #20
    "uf": ("uf", ),
    "observacao": ("observacao", ),
    "genero": ("genero", ),
    "nome": ("nome", ),
    "cargo": ("cargo", ),
    "rendimento_bruto":
    ("rendimento_bruto", "total_bruto", "total_rendimentos"),
    "rendimento_liquido": ("rendimento_liquido", ),
}

# TODO: use settings.*DATA*
base_path = Path(__file__).parent.parent.parent
output_filename = base_path / "data" / "consolidado.csv.gz"
filename_pattern = (base_path / "justa/converters/data" / "ore-*.csv*"
                    )  # TODO: may change to .csv.gz
fobj = open_compressed(output_filename, mode="w", encoding="utf-8")
writer = csv.DictWriter(fobj, fieldnames=list(field_translation.keys()))
writer.writeheader()
for filename in tqdm(glob.glob(str(filename_pattern))):
    for row in read_csv(filename):
        data = {}

        # TODO: move these conversions to extractors
        for key, possible_field_names in field_translation.items():
            for field_name in possible_field_names:
                if field_name in row:
                    data[key] = row[field_name]
                    break

        if not data["rendimento_liquido"] and "ore-mpe-sp.csv" in filename:
            if data["rendimento_liquido"] and row["desconto_total"]:
Example #21
def detect_schema(dataset_slug, tablename, version_name, filename, encoding,
                  samples):

    # TODO: max_length should not be filled if field type is `date`
    # TODO: should be able to force some fields (example: CPF as string)
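    # First pass counts the rows (unless a sample size is given); the second
    # pass feeds the type detector, whose result becomes the schema table.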

    if samples:
        total = samples
    else:
        desc = "Counting number of rows"
        reader = csv.reader(open_compressed(filename))
        _ = next(reader)  # Skip header
        total = sum(1 for line in tqdm(reader, desc=desc, unit=" rows"))

    desc = "Creating schema using {}".format(
        f"{samples} samples" if samples else "all rows")
    reader = csv.reader(open_compressed(filename))
    header = next(reader)
    iterator = tqdm(reader, desc=desc, total=total)
    if samples:
        iterator = islice(iterator, samples)

    detector = BrasilIOTypeDetector(header)
    detector.feed(iterator)

    result = Table(fields=OrderedDict([
        ("dataset_slug", fields.TextField),
        ("table_name", fields.TextField),
        ("version_name", fields.TextField),
        ("order", fields.IntegerField),
        ("obfuscate", fields.BoolField),
        ("name", fields.TextField),
        ("title", fields.TextField),
        ("description", fields.TextField),
        ("type", fields.TextField),
        ("null", fields.BoolField),
        ("has_choices", fields.BoolField),
        ("options", fields.JSONField),
        ("show", fields.BoolField),
        ("show_on_frontend", fields.BoolField),
        ("frontend_filter", fields.BoolField),
        ("link_template", fields.TextField),
    ]))

    for index, (field_name, field_type) in enumerate(detector.fields.items()):
        # TODO: replace "string" with "text" inside Brasil.IO's code
        field_type = (
            field_type.__name__.lower()
            .replace("field", "")
            .replace("text", "string")
            .replace("float", "decimal")
        )
        title = make_title(field_name)
        min_size, max_size = detector.min_sizes[index], detector.max_sizes[index]
        options = {"max_length": max_size}
        if field_type == "decimal":
            options["max_digits"] = options.pop("max_length")
            options["decimal_places"] = 2
        has_choices = detector.choices[index] is not None
        link_template = ""
        if "cnpj" in field_name or "cpf" in field_name:
            link_template = "/especiais/documento/{{ " + field_name + "|encrypt_if_needed }}"

        result.append({
            "dataset_slug": dataset_slug,
            "description": title,
            "frontend_filter": has_choices,
            "has_choices": has_choices,
            "link_template": link_template,
            "name": field_name,
            "null": min_size == 0,
            "obfuscate": bool(link_template),
            "options": options,
            "order": index + 1,
            "show": True,
            "show_on_frontend": True,
            "table_name": table_name,
            "title": title,
            "type": field_type,
            "version_name": version_name,
        })
    return result
Example #22
def get_data_from_csv(filename, page_size):
    with open_compressed(filename) as fobj:
        reader = csv.DictReader(fobj)
        iterator = rows.utils.ipartition(reader, page_size)
        for page in tqdm(iterator, unit_scale=True):
            yield page
Example #23
    def handle(self, *args, **kwargs):
        dataset_slug = kwargs["dataset_slug"]
        tablename = kwargs["tablename"]
        filename = kwargs["filename"]
        ask_confirmation = not kwargs["no_input"]
        import_data = not kwargs["no_import_data"]
        vacuum = not kwargs["no_vacuum"]
        clear_view_cache = not kwargs["no_clear_view_cache"]
        create_filter_indexes = not kwargs["no_create_filter_indexes"]
        fill_choices = not kwargs["no_fill_choices"]
        collect_date = self.clean_collect_date(kwargs["collect_date"])

        if ask_confirmation:
            print("This operation will DESTROY the existing data for this " "dataset table.")
            answer = input("Do you want to continue? (y/n) ")
            if answer.lower().strip() not in ("y", "yes"):
                exit()

        table = Table.objects.for_dataset(dataset_slug).named(tablename)
        Model = table.get_model()

        if import_data:
            # Create the table if not exists
            with transaction.atomic():
                try:
                    Model.delete_table()
                except ProgrammingError:  # Does not exist
                    pass
                finally:
                    Model.create_table(create_indexes=False)
                    Model.create_triggers()

            # Get file object, header and set command to run
            table_name = Model._meta.db_table
            database_uri = os.environ["DATABASE_URL"]
            encoding = "utf-8"  # TODO: receive as a parameter
            timeout = 0.1  # TODO: receive as a parameter
            start_time = time.time()
            progress = ProgressBar(prefix="Importing data", unit="bytes")

            # TODO: change the way we do it (CSV dialect may change, encoding
            # etc.)
            file_header = open_compressed(filename).readline().strip().split(",")
            table_schema = table.schema
            schema = OrderedDict([(field_name, table_schema[field_name]) for field_name in file_header])
            try:
                import_meta = pgimport(
                    filename=filename,
                    encoding=encoding,
                    dialect="excel",
                    database_uri=database_uri,
                    table_name=table_name,
                    create_table=False,
                    timeout=timeout,
                    callback=progress.update,
                    schema=schema,
                )
            except RuntimeError as exception:
                progress.close()
                print("ERROR: {}".format(exception.args[0]))
                exit(1)
            else:
                progress.close()
                table.import_date = timezone.now()
                table.save()
                if collect_date:
                    table.version.collected_at = collect_date
                    table.version.save()
                end_time = time.time()
                duration = end_time - start_time
                rows_imported = import_meta["rows_imported"]
                print(
                    "  done in {:7.3f}s ({} rows imported, {:.3f} rows/s).".format(
                        duration, rows_imported, rows_imported / duration
                    )
                )
            Model = table.get_model(cache=False)
            table.invalidate_cache()

        if vacuum:
            print("Running VACUUM ANALYSE...", end="", flush=True)
            start = time.time()
            Model.analyse_table()
            end = time.time()
            print("  done in {:.3f}s.".format(end - start))

        if create_filter_indexes:
            # TODO: warn if field has_choices but not in Table.filtering
            print("Creating filter indexes...", end="", flush=True)
            start = time.time()
            Model.create_indexes()
            end = time.time()
            print("  done in {:.3f}s.".format(end - start))

        if fill_choices:
            print("Filling choices...")
            start = time.time()
            choiceables = Field.objects.for_table(table).choiceables()
            for field in choiceables:
                print("  {}".format(field.name), end="", flush=True)
                start_field = time.time()
                field.update_choices()
                field.save()
                end_field = time.time()
                print(" - done in {:.3f}s.".format(end_field - start_field))
            end = time.time()
            print("  done in {:.3f}s.".format(end - start))

        if clear_view_cache:
            print("Clearing view cache...")
            cache.clear()
Example #24
output_field_names = [
    "ano",
    "mes",
    "instituicao",
    "uf",
    "observacao",
    "genero",
    "nome",
    "cargo",
    "rendimento_bruto",
    "rendimento_liquido",
]
input_filename = DATA_PATH / "contracheque.csv.gz"
output_filename = DATA_PATH / "ore-tje.csv.gz"
gender_filename = DATA_PATH / "nomes.csv.gz"

gender_classifier = NameClassifier(gender_filename)
gender_classifier.load()

reader_fobj = open_compressed(input_filename, mode="r", encoding="utf-8")
reader = csv.DictReader(reader_fobj)
output_fobj = open_compressed(output_filename, mode="w", encoding="utf-8")
writer = csv.DictWriter(output_fobj, fieldnames=output_field_names)
writer.writeheader()

for row in tqdm(reader):
    if row["tribunal"] in TRIBUNAIS:
        writer.writerow(convert_row(row))

output_fobj.close()