def main():
    parser = ArgumentParser()
    parser.add_argument("socio_filename")
    parser.add_argument("empresa_filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()

    holdings_it = filter_csv(
        args.socio_filename,
        lambda row: row["identificador_de_socio"] == "1",
        convert_function=convert_socio,
        progress=True,
    )
    holdings = {row["cnpj"]: row for row in holdings_it}
    cnpjs = set(holdings.keys())

    company_names_it = filter_csv(
        args.empresa_filename,
        lambda row: row["cnpj"] in cnpjs,
        convert_function=convert_empresa,
        progress=True,
    )
    company_names = {row["cnpj"]: row["razao_social"] for row in company_names_it}

    fobj_writer = open_compressed(args.output_filename, mode="w")
    csv_writer = CsvLazyDictWriter(fobj_writer)
    for holding in tqdm(holdings.values(), desc="Writing output file"):
        holding["razao_social"] = company_names.get(holding["cnpj"], "")
        csv_writer.writerow(holding)
    fobj_writer.close()
def main():
    dt = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%S")
    parser = argparse.ArgumentParser()
    parser.add_argument("--username", default="user-api-leitos")
    parser.add_argument("--password", default="aQbLL3ZStaTr38tj")
    parser.add_argument("--api-url", default="https://elastic-leitos.saude.gov.br/")
    parser.add_argument("--index", default="leito_ocupacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--output-filename", default=DOWNLOAD_PATH / f"ocupacao-{dt}.csv")
    args = parser.parse_args()

    es = ElasticSearch(
        args.api_url,
        username=args.username,
        password=args.password,
    )
    iterator = es.search(
        index=args.index,
        sort_by="dataNotificacaoOcupacao",
        ttl=args.ttl,
    )
    writer = CsvLazyDictWriter(args.output_filename)
    for row in tqdm(iterator, unit_scale=True):
        writer.writerow(convert_row(row))
    writer.close()
def main():
    dt = datetime.datetime.today().strftime("%Y-%m-%dT%H:%M:%S")
    parser = argparse.ArgumentParser()
    parser.add_argument("--username", default="user-api-leitos")
    parser.add_argument("--password", default="aQbLL3ZStaTr38tj")
    parser.add_argument("--api-url", default="https://elastic-leitos.saude.gov.br/")
    parser.add_argument("--index", default="leito_ocupacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--output-filename", default=DOWNLOAD_PATH / f"ocupacao-{dt}.csv")
    args = parser.parse_args()

    es = ElasticSearch(args.api_url)
    iterator = es.paginate(
        index=args.index,
        sort_by="dataNotificacaoOcupacao",
        user=args.username,
        password=args.password,
        ttl=args.ttl,
    )
    writer = CsvLazyDictWriter(args.output_filename)
    progress = tqdm(unit_scale=True)
    for page_number, page in enumerate(iterator, start=1):
        progress.desc = f"Downloading page {page_number}"
        for row in page["hits"]["hits"]:
            writer.writerow(convert_row(row["_source"]))
            progress.update()
    writer.close()
def extract_files(
    filenames,
    header_definitions,
    transform_functions,
    output_writers,
    error_filename,
    input_encoding="latin1",
    censorship=True,
):
    """Extract data from fixed-width files containing more than one row type

    `filenames` is expected to be a list of ZIP files, each having exactly one
    file inside. Each file is read and the metadata in `header_definitions`,
    `transform_functions` and `output_writers` is used to parse it and save
    the output files.
    """
    error_fobj = open_compressed(error_filename, mode="w", encoding="latin1")
    error_writer = CsvLazyDictWriter(error_fobj)
    for filename in filenames:
        # TODO: use another strategy to open this file (like using rows'
        # open_compressed when archive support is implemented)
        if os.path.isdir(filename):
            continue
        if not str(filename).endswith(".zip"):
            continue
        zf = ZipFile(filename)
        inner_filenames = zf.filelist
        assert (
            len(inner_filenames) == 1
        ), f"Only one file inside the zip is expected (got {len(inner_filenames)})"
        # XXX: The current approach of decoding here and then extracting
        # fixed-width data will work only for encodings where 1 character is
        # represented by 1 byte, such as latin1. If the encoding can represent
        # one character using more than 1 byte (like UTF-8), this approach
        # will produce incorrect results.
        fobj = TextIOWrapper(zf.open(inner_filenames[0]), encoding=input_encoding)
        for line in tqdm(fobj, desc=f"Extracting {filename}"):
            row_type = line[0]
            try:
                row = parse_row(header_definitions[row_type], line)
            except ParsingError as exception:
                error_writer.writerow(
                    {"error": exception.error, "line": exception.line}
                )
                continue
            data = transform_functions[row_type](row)
            for row in data:
                if censorship:
                    # Clear sensitive information
                    censor(row_type, row)
                output_writers[row_type].writerow(row)
        fobj.close()
        zf.close()
    error_fobj.close()
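# A minimal sketch of the metadata extract_files() expects, keyed by the row
# type found in the first character of each line. All names and field layouts
# below are illustrative assumptions, not the project's actual definitions:
#
# header_definitions = {
#     "1": [("tipo", 1), ("cnpj", 14), ("razao_social", 150)],  # hypothetical layout
# }
# transform_functions = {"1": lambda row: [row]}  # may emit several rows per input row
# output_writers = {"1": CsvLazyDictWriter("empresa.csv.gz")}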
class ElasticSearchConsumer(AsyncProcessExecutor):
    def __init__(
        self,
        api_url,
        index_name,
        sort_by,
        convert_function,
        output_filename,
        username=None,
        password=None,
        ttl="10m",
        progress=True,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.convert_function = convert_function
        self.es = ElasticSearch(
            api_url,
            username=username,
            password=password,
        )
        self.iterator = self.es.search(
            index=index_name,
            sort_by=sort_by,
            ttl=ttl,
        )
        self.writer = CsvLazyDictWriter(output_filename)
        self.show_progress = progress
        if self.show_progress:
            self.progress = tqdm(unit_scale=True)

    async def tasks(self):
        if self.show_progress:
            self.progress.desc = "Downloading page 001"
            self.progress.refresh()
        for page_number, page in enumerate(self.iterator, start=1):
            if self.show_progress:
                self.progress.desc = f"Downloaded page {page_number:03d}"
                self.progress.refresh()
            yield Task(function=self.convert_function, args=(page,))

    async def process(self, result):
        for row in result:
            self.writer.writerow(row)
            if self.show_progress:
                self.progress.update()

    async def finished(self):
        if self.show_progress:
            self.progress.close()
def filter_csv(input_filename, output_filename, filter_function, progress=True):
    fobj_reader = open_compressed(input_filename, mode="r")
    fobj_writer = open_compressed(output_filename, mode="w")
    csv_reader = DictReader(fobj_reader)
    csv_writer = CsvLazyDictWriter(fobj_writer)
    if progress:
        csv_reader = tqdm(csv_reader)
    for row in csv_reader:
        if filter_function(row):
            csv_writer.writerow(row)
    fobj_reader.close()
    fobj_writer.close()
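# Example call (a sketch: the filenames and the "uf" column are hypothetical,
# chosen only to show the expected argument shapes):
# filter_csv(
#     "empresa.csv.gz",
#     "empresa-sp.csv.gz",
#     lambda row: row["uf"] == "SP",
# )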
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw", action="store_true")
    parser.add_argument("--no-censorship", action="store_true")
    parser.add_argument("--username", default="imunizacao_public")
    parser.add_argument("--password", default="qlto5t&7r_@+#Tlstigi")
    parser.add_argument("--api-url", default="https://imunizacao-es.saude.gov.br/")
    parser.add_argument("--index", default="desc-imunizacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--input-filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()

    convert_row = convert_row_censored
    if args.raw:
        convert_row = lambda row: row
    elif args.no_censorship:
        convert_row = convert_row_uncensored

    writer = CsvLazyDictWriter(args.output_filename)
    if args.input_filename:  # Use local CSV
        with open_compressed(args.input_filename) as in_fobj:
            reader = csv.DictReader(in_fobj)
            for row in tqdm(reader, unit_scale=True):
                writer.writerow(convert_row(row))
    else:  # Get data from ElasticSearch API
        es = ElasticSearch(args.api_url)
        iterator = es.paginate(
            index=args.index,
            sort_by="@timestamp",
            user=args.username,
            password=args.password,
            ttl=args.ttl,
        )
        progress = tqdm(unit_scale=True)
        for page_number, page in enumerate(iterator, start=1):
            progress.desc = f"Downloading page {page_number}"
            for row in page["hits"]["hits"]:
                writer.writerow(convert_row(row["_source"]))
                progress.update()
    writer.close()
def extract_data(
    ExtractorClass,
    year_range,
    output_filename,
    base_url,
    force_redownload=False,
    download_only=False,
):
    extractor_name = ExtractorClass.__name__.replace("Extractor", "")
    extractor = ExtractorClass(base_url)
    writer = CsvLazyDictWriter(output_filename)
    for year in year_range:
        print(f"{extractor_name} {year}")
        print(" Downloading...", end="")
        result = extractor.download(year, force=force_redownload)
        if not result["downloaded"]:
            print(" file has already been downloaded.")
        if not download_only:
            data = extractor.extract(year)
            for row in tqdm(data, desc=" Extracting..."):
                writer.writerow(row)
        print()
    # Close the output file so the last rows are flushed
    writer.close()
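# Sketch of the interface extract_data() assumes for ExtractorClass (class
# name and bodies are illustrative, not the project's real extractors):
# class FooExtractor:
#     def __init__(self, base_url): ...
#     def download(self, year, force=False):
#         return {"downloaded": True}  # False when a cached file was reused
#     def extract(self, year):
#         yield {"column": "value"}  # dicts to be written to the output CSV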
def merge_files(filenames, output_filename):
    # `filenames` must already be sorted by name so groupby sees each day's
    # files contiguously
    groups = groupby(
        filenames,
        key=lambda filename: filename.name.split("T")[0].replace("ocupacao-", ""),
    )
    progress = tqdm()
    writer = CsvLazyDictWriter(output_filename)
    for index, (date, group) in enumerate(groups, start=1):
        progress.desc = f"Processing file {index}"
        group = sorted(group)
        filename = group[-1]  # Process only the last file per day
        dt = filename.name.split("ocupacao-")[1].split(".csv")[0]
        base_row = {"datahora": dt}
        with open(filename) as fobj:
            reader = csv.DictReader(fobj)
            for row in reader:
                new = base_row.copy()
                new.update({key.lower(): value for key, value in row.items()})
                writer.writerow(new)
                progress.update()
    progress.close()
    writer.close()
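# Possible way to build the `filenames` argument (the directory and glob
# pattern are assumptions, inferred from the "ocupacao-" prefix used above):
# from pathlib import Path
# filenames = sorted(Path("data/download").glob("ocupacao-*.csv"))
# merge_files(filenames, "ocupacao.csv.gz")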
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw", action="store_true")
    parser.add_argument("--no-censorship", action="store_true")
    parser.add_argument("--username", default="imunizacao_public")
    parser.add_argument("--password", default="qlto5t&7r_@+#Tlstigi")
    parser.add_argument("--api-url", default="https://imunizacao-es.saude.gov.br/")
    parser.add_argument("--index", default="desc-imunizacao")
    parser.add_argument("--ttl", default="10m")
    parser.add_argument("--input-filename")
    parser.add_argument("output_filename")
    args = parser.parse_args()

    convert_row = convert_row_censored
    if args.raw:
        convert_row = lambda row: row
    elif args.no_censorship:
        convert_row = convert_row_uncensored

    if args.input_filename:  # Use local CSV
        writer = CsvLazyDictWriter(args.output_filename)
        with open_compressed(args.input_filename) as in_fobj:
            reader = csv.DictReader(in_fobj)
            for row in tqdm(reader, unit_scale=True):
                writer.writerow(convert_row(row))
        writer.close()
    else:  # Get data from ElasticSearch API
        ElasticSearchConsumer(
            api_url=args.api_url,
            index_name=args.index,
            sort_by="@timestamp",
            username=args.username,
            password=args.password,
            convert_function=partial(convert_rows, convert_row),
            output_filename=args.output_filename,
        ).run()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("empresa_csv_filename")
    parser.add_argument("cnae_secundaria_csv_filename")
    parser.add_argument("output_csv_filename")
    args = parser.parse_args()

    writer = CsvLazyDictWriter(args.output_csv_filename)

    fobj = open_compressed(args.empresa_csv_filename)
    reader = csv.DictReader(fobj)
    for row in tqdm(reader):
        writer.writerow(
            {"cnpj": row["cnpj"], "cnae": row["cnae_fiscal"], "primaria": "t"}
        )
    fobj.close()

    fobj = open_compressed(args.cnae_secundaria_csv_filename)
    reader = csv.DictReader(fobj)
    for row in tqdm(reader):
        writer.writerow(
            {"cnpj": row["cnpj"], "cnae": row["cnae"], "primaria": "f"}
        )
    fobj.close()

    writer.close()
def write_csv(filename, iterator):
    writer = CsvLazyDictWriter(filename)
    for page in iterator:
        for row in page:
            writer.writerow(row)
    # Close the output file so the last rows are flushed
    writer.close()
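# Example usage (a sketch: es.paginate() parameters follow the calls above,
# while the API URL and index name are hypothetical):
# es = ElasticSearch("https://example-es.saude.gov.br/")
# pages = (
#     [row["_source"] for row in page["hits"]["hits"]]
#     for page in es.paginate(index="some-index", sort_by="@timestamp")
# )
# write_csv("output.csv.gz", pages)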