def filter_csv(input_filename, output_filename, filter_function, progress=True):
    fobj_reader = open_compressed(input_filename, mode="r")
    fobj_writer = open_compressed(output_filename, mode="w")
    csv_reader = DictReader(fobj_reader)
    csv_writer = CsvLazyDictWriter(fobj_writer)
    if progress:
        csv_reader = tqdm(csv_reader)
    for row in csv_reader:
        if filter_function(row):
            csv_writer.writerow(row)
    fobj_reader.close()
    fobj_writer.close()
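
# A minimal usage sketch (assumed, not from the original codebase): keep only
# rows matching a predicate. The filenames are hypothetical; compression is
# inferred by open_compressed from the extensions, and the column name follows
# the caller shown later in this section.
filter_csv(
    "socio.csv.gz",
    "socio-holdings.csv.gz",
    lambda row: row["identificador_de_socio"] == "1",
)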
def extract_data(ExtractorClass, year_range, output_filename, base_url,
                 force_redownload=False, download_only=False):
    extractor_name = ExtractorClass.__name__.replace("Extractor", "")
    extractor = ExtractorClass(base_url)
    output_fobj = open_compressed(output_filename, mode="w", encoding="utf-8")
    writer = csv.DictWriter(
        output_fobj,
        fieldnames=list(extractor.schema.keys()),
    )
    writer.writeheader()
    for year in year_range:
        print(f"{extractor_name} {year}")
        print(" Downloading...", end="")
        result = extractor.download(year, force=force_redownload)
        if not result["downloaded"]:
            print(" file has already been downloaded.")
        if not download_only:
            data = extractor.extract(year)
            for row in tqdm(data, desc=" Extracting..."):
                writer.writerow(row)
        print()
    output_fobj.close()
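
# A minimal usage sketch (assumed): the extractor class and base URL are
# hypothetical; year range and output filename follow the signature above.
extract_data(
    EmpresaExtractor,
    year_range=range(2017, 2021),
    output_filename="empresa.csv.gz",
    base_url="https://example.com/data/",
)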
def main():
    parser = ArgumentParser()
    parser.add_argument("socio_filename")
    parser.add_argument("empresa_filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()

    holdings_it = filter_csv(
        args.socio_filename,
        lambda row: row["identificador_de_socio"] == "1",
        convert_function=convert_socio,
        progress=True,
    )
    holdings = {row["cnpj"]: row for row in holdings_it}
    cnpjs = set(holdings.keys())

    company_names_it = filter_csv(
        args.empresa_filename,
        lambda row: row["cnpj"] in cnpjs,
        convert_function=convert_empresa,
        progress=True,
    )
    company_names = {row["cnpj"]: row["razao_social"] for row in company_names_it}

    fobj_writer = open_compressed(args.output_filename, mode="w")
    csv_writer = CsvLazyDictWriter(fobj_writer)
    for holding in tqdm(holdings.values(), desc="Writing output file"):
        holding["razao_social"] = company_names.get(holding["cnpj"], "")
        csv_writer.writerow(holding)
    fobj_writer.close()
def file_metadata(filename, chunk_size=8 * 1024 * 1024):
    hasher = hashlib.sha1()
    with open(filename, mode="rb") as fobj, tqdm(unit_scale=True, unit="B") as progress:
        finished = False
        while not finished:
            data = fobj.read(chunk_size)
            hasher.update(data)
            chunk_length = len(data)
            finished = chunk_length == 0
            progress.update(chunk_length)
        total_bytes = progress.n

    new_lines = 0
    with open_compressed(filename, mode="rb") as fobj, tqdm(unit_scale=True, unit="B") as progress:
        finished = False
        finish_with_new_line = False
        while not finished:
            data = fobj.read(chunk_size)
            new_lines += data.count(b"\n")
            chunk_length = len(data)
            finished = chunk_length == 0
            if not finished:
                # Indexing `bytes` yields an `int`, so compare the last byte
                # as a one-byte slice (`data[-1] == b"\n"` is always `False`).
                finish_with_new_line = data[-1:] == b"\n"
            progress.update(chunk_length)
        uncompressed_bytes = progress.n
    if not finish_with_new_line:
        new_lines += 1
    return hasher.hexdigest(), new_lines, total_bytes, uncompressed_bytes
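
# A minimal usage sketch (assumed filename): the function returns the SHA-1
# hex digest of the file on disk, the number of lines in the decompressed
# content, and the compressed/uncompressed sizes in bytes.
sha1, lines, compressed_size, uncompressed_size = file_metadata("data.csv.gz")
print(f"{sha1}  {lines} lines  {compressed_size}B -> {uncompressed_size}B")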
def main(): parser = argparse.ArgumentParser() parser.add_argument("--chunk-size", type=int, default=1_024 * 1_024) parser.add_argument("--refresh-count", type=int, default=10_000) parser.add_argument("--input-encoding", type=str, default="utf-8") parser.add_argument("--connections", type=int, default=8) parser.add_argument("--preserve-raw", action="store_true") parser.add_argument("--buffering", type=int, default=8 * 1024 * 1024) args = parser.parse_args() # TODO: adicionar opção para selecionar qual dos 3 possíveis CSVs o script # irá gerar. # TODO: configurar saída do logger para arquivo e não stdout/stderr # TODO: adicionar opção para salvar ou não CSV original (compactado) url, date = get_latest_url_and_date() output_path = Path(__file__).parent / "data" / "output" filename_raw = output_path / f"microdados_vacinacao-raw-{date}.csv.xz" filename_censored = output_path / "microdados_vacinacao.csv.gz" filename_uncensored = output_path / "microdados_vacinacao-uncensored.csv.gz" if not output_path.exists(): output_path.mkdir(parents=True) download_file_curl(url, filename_raw) with open_compressed(filename_raw) as fobj: fobj_censored = open_compressed(filename_censored, mode="w", buffering=args.buffering) writer_censored = CsvLazyDictWriter(fobj_censored) censored_writerow = writer_censored.writerow fobj_uncensored = open_compressed(filename_uncensored, mode="w", buffering=args.buffering) writer_uncensored = CsvLazyDictWriter(fobj_uncensored) uncensored_writerow = writer_uncensored.writerow refresh_count = args.refresh_count reader = csv.DictReader(fobj, delimiter=";") for counter, row in tqdm(enumerate(reader), unit_scale=True, unit="row"): row = convert_row_uncensored(row) uncensored_writerow(row) censor(row) censored_writerow(row) writer_censored.close() writer_uncensored.close() if not args.preserve_raw: filename_raw.unlink()
def load(self):
    fobj = open_compressed(self.filename, encoding="utf-8")
    reader = csv.DictReader(fobj)
    for row in reader:
        if Decimal(row["ratio"]) < 0.95:
            continue
        self.cache[row["first_name"]] = row["classification"]
    fobj.close()
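
# A minimal usage sketch (assumed): `load` fills `self.cache` only with names
# whose classification ratio is at least 0.95; `classify` (called elsewhere in
# this section) then resolves names through that cache. The filename follows
# the other scripts here; the example name is hypothetical.
classifier = NameClassifier("nomes.csv.gz")
classifier.load()
print(classifier.classify("MARIA"))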
def extract_files(
    filenames,
    header_definitions,
    transform_functions,
    output_writers,
    error_filename,
    input_encoding="latin1",
    censorship=True,
):
    """Extract files from a fixed-width file containing more than one row type

    `filenames` is expected to be a list of ZIP files, each having only one
    file inside. Each inner file is read and the metadata in
    `header_definitions` is used to parse it and save the output files.
    """
    error_fobj = open_compressed(error_filename, mode="w", encoding="latin1")
    error_writer = CsvLazyDictWriter(error_fobj)
    for filename in filenames:
        # TODO: use another strategy to open this file (like using rows'
        # open_compressed when archive support is implemented)
        if os.path.isdir(filename):
            continue
        if not str(filename).endswith(".zip"):
            continue
        zf = ZipFile(filename)
        inner_filenames = zf.filelist
        assert (
            len(inner_filenames) == 1
        ), f"Only one file inside the zip is expected (got {len(inner_filenames)})"
        # XXX: The current approach of decoding here and then extracting
        # fixed-width-file data will work only for encodings in which 1
        # character is represented by 1 byte, such as latin1. If the encoding
        # can represent one character using more than 1 byte (like UTF-8),
        # this approach will produce incorrect results.
        fobj = TextIOWrapper(zf.open(inner_filenames[0]), encoding=input_encoding)
        for line in tqdm(fobj, desc=f"Extracting {filename}"):
            row_type = line[0]
            try:
                row = parse_row(header_definitions[row_type], line)
            except ParsingError as exception:
                error_writer.writerow(
                    {"error": exception.error, "line": exception.line}
                )
                continue
            data = transform_functions[row_type](row)
            for row in data:
                if censorship:
                    # Clear sensitive information
                    censor(row_type, row)
                output_writers[row_type].writerow(row)
        fobj.close()
        zf.close()
    error_fobj.close()
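
# A hedged sketch of the expected call shape (all concrete names below are
# hypothetical): each fixed-width row type, keyed by the line's first
# character, maps to a header definition, a transform function and a writer.
extract_files(
    filenames=["empresa.zip"],
    header_definitions={"1": header_empresa},
    transform_functions={"1": transform_empresa},
    output_writers={"1": CsvLazyDictWriter("empresa.csv.gz")},
    error_filename="errors.csv.gz",
)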
def start_requests(self):
    # TODO: add option to change filename via parameter
    links_filename = OUTPUT_PATH / "tce-link.csv"
    links = rows.import_from_csv(open_compressed(links_filename, mode="rb"))
    links.order_by("-year")
    for item in links:
        filename = DOWNLOAD_PATH / item.url.split("/")[-1]
        if not filename.exists():  # TODO: add option to force redownload
            yield scrapy.Request(url=item.url, meta={"filename": filename})
def read_data(filename):
    fobj = open_compressed(filename)
    data = defaultdict(dict)
    for row in csv.DictReader(fobj):
        state, date = row.pop("state"), row.pop("date")
        row = {
            key: int(value) if value else None
            for key, value in row.items()
        }
        data[state][date] = row
    fobj.close()
    return data
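
# A minimal usage sketch (assumed filename and keys): the result maps
# state -> date -> counters, so a cell is two dict lookups away.
data = read_data("caso.csv.gz")
print(data["SP"]["2021-01-01"])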
def filter_csv(input_filename, filter_function, convert_function, progress=True):
    fobj_reader = open_compressed(input_filename, mode="r")
    csv_reader = DictReader(fobj_reader)
    if progress:
        csv_reader = tqdm(csv_reader, desc=f"Reading {Path(input_filename).name}")
    for row in csv_reader:
        if filter_function(row):
            yield convert_function(row)
    fobj_reader.close()
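
# A minimal usage sketch: this variant of filter_csv is a generator, so the
# input file is only closed once the iterator is exhausted. The filename is
# hypothetical; the predicate and convert_socio follow the caller shown
# earlier in this section.
holdings_it = filter_csv(
    "socio.csv.gz",
    lambda row: row["identificador_de_socio"] == "1",
    convert_function=convert_socio,
)
holdings = {row["cnpj"]: row for row in holdings_it}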
def start_requests(self): filename = glob.glob(str(OUTPUT_PATH / "ba-boletim.csv*"))[0] fobj = open_compressed(filename, encoding="utf8") for row in csv.DictReader(fobj): filename = DOWNLOAD_PATH / f'BA/{row["id_campanha"]}.pdf' if not filename.parent.exists(): filename.parent.mkdir() row["filename"] = filename if filename.exists(): url = "file://" + str(filename.absolute()) else: url = row["url"] yield scrapy.Request(url=url, meta=row)
def main(): parser = argparse.ArgumentParser() parser.add_argument("empresa_csv_filename") parser.add_argument("cnae_secundaria_csv_filename") parser.add_argument("output_csv_filename") args = parser.parse_args() writer = CsvLazyDictWriter(args.output_csv_filename) fobj = open_compressed(args.empresa_csv_filename) reader = csv.DictReader(fobj) for row in tqdm(reader): writer.writerow( {"cnpj": row["cnpj"], "cnae": row["cnae_fiscal"], "primaria": "t"}) fobj.close() fobj = open_compressed(args.cnae_secundaria_csv_filename) reader = csv.DictReader(fobj) for row in tqdm(reader): writer.writerow( {"cnpj": row["cnpj"], "cnae": row["cnae"], "primaria": "f"}) fobj.close() writer.close()
def start_requests(self): filename = glob.glob(str(OUTPUT_PATH / "sc-boletim.csv*"))[0] table = rows.import_from_csv(open_compressed(filename, mode="rb")) for row in table: row = row._asdict() pdf_url = row.pop("pdf_url") row["filename"] = ( DOWNLOAD_PATH / f'SC/{row["ano"]}-{row["municipio_id"]}-{row["balneario_id"]}.pdf' ) if not row["filename"].parent.exists(): row["filename"].parent.mkdir() if not row["filename"].exists(): url = pdf_url else: url = "file://" + str(row["filename"].absolute()) yield scrapy.Request(url=url, meta=row, callback=self.parse_pdf)
def main(): parser = argparse.ArgumentParser() parser.add_argument("--raw", action="store_true") parser.add_argument("--no-censorship", action="store_true") parser.add_argument("--username", default="imunizacao_public") parser.add_argument("--password", default="qlto5t&7r_@+#Tlstigi") parser.add_argument("--api-url", default="https://imunizacao-es.saude.gov.br/") parser.add_argument("--index", default="desc-imunizacao") parser.add_argument("--ttl", default="10m") parser.add_argument("--input-filename") parser.add_argument("output_filename") args = parser.parse_args() convert_row = convert_row_censored if args.raw: convert_row = lambda row: row elif args.no_censorship: convert_row = convert_row_uncensored writer = CsvLazyDictWriter(args.output_filename) if args.input_filename: # Use local CSV with open_compressed(args.input_filename) as in_fobj: reader = csv.DictReader(in_fobj) for row in tqdm(reader, unit_scale=True): writer.writerow(convert_row(row)) else: # Get data from ElasticSearch API es = ElasticSearch(args.api_url) iterator = es.paginate( index=args.index, sort_by="@timestamp", user=args.username, password=args.password, ttl=args.ttl, ) progress = tqdm(unit_scale=True) for page_number, page in enumerate(iterator, start=1): progress.desc = f"Downloading page {page_number}" for row in page["hits"]["hits"]: writer.writerow(convert_row(row["_source"])) progress.update() writer.close()
def extract_magistrados(filename, uf):
    for nome, sigla in TRIBUNAL_UF.items():
        if sigla == uf:
            tribunal = nome
            break
    else:
        # Without this, an unknown `uf` would raise NameError further down
        raise ValueError(f"Cannot find tribunal for UF {uf}")
    reader_fobj = open_compressed(filename, mode="r", encoding="utf-8")
    for row in csv.DictReader(reader_fobj):
        row_ano, row_tribunal = row["ano_de_referencia"], row["tribunal"]
        if row_ano in ("2017", "2018") and row_tribunal == tribunal:
            yield {
                "ano": row_ano,
                "cargo": row["cargo"],
                "instituicao": "TJE",
                "mes": row["mes_de_referencia"],
                "nome": row["nome"],
                "observacao": "",
                "rendimento_bruto": row["total_de_rendimentos"],
                "rendimento_liquido": row["rendimento_liquido"],
                "uf": TRIBUNAL_UF[row_tribunal],
            }
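
# A minimal usage sketch (assumed filename): iterate over the 2017-2018
# salary rows of the court matching a given state.
for row in extract_magistrados("contracheque.csv.gz", uf="SP"):
    print(row["nome"], row["rendimento_liquido"])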
def main(): parser = argparse.ArgumentParser() parser.add_argument("--raw", action="store_true") parser.add_argument("--no-censorship", action="store_true") parser.add_argument("--username", default="imunizacao_public") parser.add_argument("--password", default="qlto5t&7r_@+#Tlstigi") parser.add_argument("--api-url", default="https://imunizacao-es.saude.gov.br/") parser.add_argument("--index", default="desc-imunizacao") parser.add_argument("--ttl", default="10m") parser.add_argument("--input-filename") parser.add_argument("output_filename") args = parser.parse_args() convert_row = convert_row_censored if args.raw: convert_row = lambda row: row elif args.no_censorship: convert_row = convert_row_uncensored if args.input_filename: # Use local CSV writer = CsvLazyDictWriter(args.output_filename) with open_compressed(args.input_filename) as in_fobj: reader = csv.DictReader(in_fobj) for row in tqdm(reader, unit_scale=True): writer.writerow(convert_row(row)) writer.close() else: # Get data from ElasticSearch API ElasticSearchConsumer( api_url=args.api_url, index_name=args.index, sort_by="@timestamp", username=args.username, password=args.password, convert_function=partial(convert_rows, convert_row), output_filename=args.output_filename, ).run()
def read_csv(filename):
    fobj = open_compressed(filename, mode="r", encoding="utf-8")
    yield from csv.DictReader(fobj)
    fobj.close()
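
# A minimal usage sketch (assumed filename): like the other generators here,
# the file object is only closed after the caller consumes every row.
for row in read_csv("consolidado.csv.gz"):
    print(row)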
if __name__ == "__main__": import argparse import csv import rows from tqdm import tqdm import settings parser = argparse.ArgumentParser() parser.add_argument("--start_at") args = parser.parse_args() extractors = {"xls": XLSFileExtractor, "xlsx": XLSXFileExtractor} file_list = open_compressed(settings.OUTPUT_PATH / "planilha.csv.gz", mode="rb") fobjs, writers = [], {} for sheet_name, info in SHEET_INFO.items(): fobj = open_compressed(info["output_filename"], mode="w", encoding="utf-8") field_names = list(SHEET_INFO[sheet_name]["schema"].keys()) + [ "tribunal", "mes_de_referencia", "mes_ano_de_referencia", "ano_de_referencia", "data_de_publicacao", ] writers[sheet_name] = csv.DictWriter(fobj, fieldnames=field_names) writers[sheet_name].writeheader() fobjs.append(fobj)
def main():
    # The file pattern used to find the files may change in other operating
    # systems or when running inside Docker (which needs to share the volume).
    # If that's the case, just define the `--data_path` parameter.
    data_path = settings.ORE_DATA_PATH
    institutions = set(Extractor.institution for Extractor in FileExtractor.registry())
    states = set(Extractor.state for Extractor in FileExtractor.registry())

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", default=data_path.absolute())
    parser.add_argument("--filename")
    parser.add_argument("--output")
    parser.add_argument("institution", choices=institutions)
    parser.add_argument("state", choices=states)
    args = parser.parse_args()

    institution, state = args.institution, args.state
    output_path = Path("data")
    data_path = Path(args.data_path)
    gender_filename = data_path / "nomes.csv.gz"
    Extractor = FileExtractor.get_child(state=args.state, institution=args.institution)
    if args.filename:
        filenames = [data_path / args.filename]
    else:
        patterns = Extractor.filename_pattern
        if isinstance(patterns, str):
            patterns = [patterns]
        filename_patterns = [data_path / pattern for pattern in patterns]
        filenames = sorted(
            Path(filename)
            for filename_pattern in filename_patterns
            for filename in glob.glob(str(filename_pattern))
        )
    output_filename = (
        args.output
        or output_path / f"ore-{institution.lower()}-{state.lower()}.csv.gz"
    )
    if not output_filename.parent.exists():
        output_filename.parent.mkdir(parents=True)

    desc = f"{institution} {state}"
    field_names = (
        "ano",
        "mes",
        "instituicao",
        "uf",
        "cargo",
        "nome",
        "genero",
        "rendimento_bruto",
        "rendimento_liquido",
        "observacao",
    )
    gender_classifier = NameClassifier(gender_filename)
    gender_classifier.load()

    fobj = open_compressed(output_filename, mode="w", encoding="utf-8")
    writer = csv.DictWriter(fobj, fieldnames=field_names)
    writer.writeheader()
    for filename in tqdm(filenames, desc=desc):
        extractor = Extractor(filename)
        for row in extractor.data:
            nome = row["nome"]
            row["genero"] = gender_classifier.classify(nome) if nome else ""
            writer.writerow(row)
    # TODO: should force types somewhere here or in FileExtractor?
    fobj.close()
"uf": ("uf", ), "observacao": ("observacao", ), "genero": ("genero", ), "nome": ("nome", ), "cargo": ("cargo", ), "rendimento_bruto": ("rendimento_bruto", "total_bruto", "total_rendimentos"), "rendimento_liquido": ("rendimento_liquido", ), } # TODO: use settings.*DATA* base_path = Path(__file__).parent.parent.parent output_filename = base_path / "data" / "consolidado.csv.gz" filename_pattern = (base_path / "justa/converters/data" / "ore-*.csv*" ) # TODO: may change to .csv.gz fobj = open_compressed(output_filename, mode="w", encoding="utf-8") writer = csv.DictWriter(fobj, fieldnames=list(field_translation.keys())) writer.writeheader() for filename in tqdm(glob.glob(str(filename_pattern))): for row in read_csv(filename): data = {} # TODO: move these convertions to extractors for key, possible_field_names in field_translation.items(): for field_name in possible_field_names: if field_name in row: data[key] = row[field_name] break if not data["rendimento_liquido"] and "ore-mpe-sp.csv" in filename: if data["rendimento_liquido"] and row["desconto_total"]:
def detect_schema(dataset_slug, tablename, version_name, filename, encoding, samples):
    # TODO: max_length should not be filled if field type is `date`
    # TODO: should be able to force some fields (example: CPF as string)
    if samples:
        total = samples
    else:
        desc = "Counting number of rows"
        reader = csv.reader(open_compressed(filename))
        _ = next(reader)  # Skip header
        total = sum(1 for line in tqdm(reader, desc=desc, unit=" rows"))

    desc = "Creating schema using {}".format(
        f"{samples} samples" if samples else "all rows"
    )
    reader = csv.reader(open_compressed(filename))
    header = next(reader)
    iterator = tqdm(reader, desc=desc, total=total)
    if samples:
        iterator = islice(iterator, samples)
    detector = BrasilIOTypeDetector(header)
    detector.feed(iterator)

    result = Table(
        fields=OrderedDict(
            [
                ("dataset_slug", fields.TextField),
                ("table_name", fields.TextField),
                ("version_name", fields.TextField),
                ("order", fields.IntegerField),
                ("obfuscate", fields.BoolField),
                ("name", fields.TextField),
                ("title", fields.TextField),
                ("description", fields.TextField),
                ("type", fields.TextField),
                ("null", fields.BoolField),
                ("has_choices", fields.BoolField),
                ("options", fields.JSONField),
                ("show", fields.BoolField),
                ("show_on_frontend", fields.BoolField),
                ("frontend_filter", fields.BoolField),
                ("link_template", fields.TextField),
            ]
        )
    )
    for index, (field_name, field_type) in enumerate(detector.fields.items()):
        # TODO: replace "string" with "text" inside Brasil.IO's code
        field_type = (
            field_type.__name__.lower()
            .replace("field", "")
            .replace("text", "string")
            .replace("float", "decimal")
        )
        title = make_title(field_name)
        min_size, max_size = detector.min_sizes[index], detector.max_sizes[index]
        options = {"max_length": max_size}
        if field_type == "decimal":
            options["max_digits"] = options.pop("max_length")
            options["decimal_places"] = 2
        has_choices = detector.choices[index] is not None
        link_template = ""
        if "cnpj" in field_name or "cpf" in field_name:
            link_template = "/especiais/documento/{{ " + field_name + "|encrypt_if_needed }}"
        result.append(
            {
                "dataset_slug": dataset_slug,
                "description": title,
                "frontend_filter": has_choices,
                "has_choices": has_choices,
                "link_template": link_template,
                "name": field_name,
                "null": min_size == 0,
                "obfuscate": bool(link_template),
                "options": options,
                "order": index + 1,
                "show": True,
                "show_on_frontend": True,
                "table_name": tablename,  # the parameter is `tablename`
                "title": title,
                "type": field_type,
                "version_name": version_name,
            }
        )
    return result
def get_data_from_csv(filename, page_size):
    with open_compressed(filename) as fobj:
        reader = csv.DictReader(fobj)
        iterator = rows.utils.ipartition(reader, page_size)
        for page in tqdm(iterator, unit_scale=True):
            yield page
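
# A minimal usage sketch (assumed filename and page size): each yielded page
# is a batch of up to `page_size` row dicts, handy for bulk inserts.
for page in get_data_from_csv("data.csv.gz", page_size=10_000):
    print(len(page))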
def handle(self, *args, **kwargs):
    dataset_slug = kwargs["dataset_slug"]
    tablename = kwargs["tablename"]
    filename = kwargs["filename"]
    ask_confirmation = not kwargs["no_input"]
    import_data = not kwargs["no_import_data"]
    vacuum = not kwargs["no_vacuum"]
    clear_view_cache = not kwargs["no_clear_view_cache"]
    create_filter_indexes = not kwargs["no_create_filter_indexes"]
    fill_choices = not kwargs["no_fill_choices"]
    collect_date = self.clean_collect_date(kwargs["collect_date"])

    if ask_confirmation:
        print(
            "This operation will DESTROY the existing data for this "
            "dataset table."
        )
        answer = input("Do you want to continue? (y/n) ")
        if answer.lower().strip() not in ("y", "yes"):
            exit()

    table = Table.objects.for_dataset(dataset_slug).named(tablename)
    Model = table.get_model()

    if import_data:
        # Create the table if it does not exist
        with transaction.atomic():
            try:
                Model.delete_table()
            except ProgrammingError:  # Does not exist
                pass
            finally:
                Model.create_table(create_indexes=False)
                Model.create_triggers()

        # Get file object, header and set command to run
        table_name = Model._meta.db_table
        database_uri = os.environ["DATABASE_URL"]
        encoding = "utf-8"  # TODO: receive as a parameter
        timeout = 0.1  # TODO: receive as a parameter
        start_time = time.time()
        progress = ProgressBar(prefix="Importing data", unit="bytes")
        # TODO: change the way we do it (CSV dialect may change, encoding
        # etc.)
        file_header = open_compressed(filename).readline().strip().split(",")
        table_schema = table.schema
        schema = OrderedDict(
            [(field_name, table_schema[field_name]) for field_name in file_header]
        )
        try:
            import_meta = pgimport(
                filename=filename,
                encoding=encoding,
                dialect="excel",
                database_uri=database_uri,
                table_name=table_name,
                create_table=False,
                timeout=timeout,
                callback=progress.update,
                schema=schema,
            )
        except RuntimeError as exception:
            progress.close()
            print("ERROR: {}".format(exception.args[0]))
            exit(1)
        else:
            progress.close()
            table.import_date = timezone.now()
            table.save()
            if collect_date:
                table.version.collected_at = collect_date
                table.version.save()
            end_time = time.time()
            duration = end_time - start_time
            rows_imported = import_meta["rows_imported"]
            print(
                " done in {:7.3f}s ({} rows imported, {:.3f} rows/s).".format(
                    duration, rows_imported, rows_imported / duration
                )
            )
        Model = table.get_model(cache=False)
        table.invalidate_cache()

    if vacuum:
        print("Running VACUUM ANALYSE...", end="", flush=True)
        start = time.time()
        Model.analyse_table()
        end = time.time()
        print(" done in {:.3f}s.".format(end - start))

    if create_filter_indexes:
        # TODO: warn if field has_choices but not in Table.filtering
        print("Creating filter indexes...", end="", flush=True)
        start = time.time()
        Model.create_indexes()
        end = time.time()
        print(" done in {:.3f}s.".format(end - start))

    if fill_choices:
        print("Filling choices...")
        start = time.time()
        choiceables = Field.objects.for_table(table).choiceables()
        for field in choiceables:
            print(" {}".format(field.name), end="", flush=True)
            start_field = time.time()
            field.update_choices()
            field.save()
            end_field = time.time()
            print(" - done in {:.3f}s.".format(end_field - start_field))
        end = time.time()
        print(" done in {:.3f}s.".format(end - start))

    if clear_view_cache:
        print("Clearing view cache...")
        cache.clear()
output_field_names = [
    "ano",
    "mes",
    "instituicao",
    "uf",
    "observacao",
    "genero",
    "nome",
    "cargo",
    "rendimento_bruto",
    "rendimento_liquido",
]
input_filename = DATA_PATH / "contracheque.csv.gz"
output_filename = DATA_PATH / "ore-tje.csv.gz"
gender_filename = DATA_PATH / "nomes.csv.gz"

gender_classifier = NameClassifier(gender_filename)
gender_classifier.load()

reader_fobj = open_compressed(input_filename, mode="r", encoding="utf-8")
reader = csv.DictReader(reader_fobj)
output_fobj = open_compressed(output_filename, mode="w", encoding="utf-8")
writer = csv.DictWriter(output_fobj, fieldnames=output_field_names)
writer.writeheader()
for row in tqdm(reader):
    if row["tribunal"] in TRIBUNAIS:
        writer.writerow(convert_row(row))
reader_fobj.close()
output_fobj.close()