def test_sink(self):
    """Check that ``ETL.sink`` installs a ``ConsoleDataSink`` holding the given output format."""
    fmt = "key: {} | value: {} | ts: {}"

    pipeline = ETL()
    pipeline.sink(ConsoleDataSink, fmt)

    sink = pipeline.data_sink
    self.assertIsNotNone(sink)
    # The sink must satisfy both the generic interface and the concrete class.
    for expected_type in (DataSink, ConsoleDataSink):
        self.assertIsInstance(sink, expected_type)
    self.assertEqual(fmt, sink.output_format)
def test_source(self):
    """Check that ``ETL.source`` installs a ``FileDataSource`` holding the given path and chunk size."""
    path = "/path/to/file.json"
    size = 1024

    pipeline = ETL()
    pipeline.source(FileDataSource, path, size)

    data_source = pipeline.data_source
    self.assertIsNotNone(data_source)
    # The source must satisfy both the generic interface and the concrete class.
    for expected_type in (DataSource, FileDataSource):
        self.assertIsInstance(data_source, expected_type)
    self.assertEqual(path, data_source.source_filepath)
    self.assertEqual(size, data_source.chunk_size)
def transform(self, source: str = 'all'):
    """
    Run the transform step for one source, or for every source in ``self.SOURCES``.

    Delegates to ``ETL.transform`` once per selected source. That step is
    documented as unzipping the downloaded files and appending their data
    into a single file, with results placed in data/processed.

    :param source: (str, default 'all') When 'all', transform every entry of
        ``self.SOURCES``; otherwise only the named source. Documented values:
        cpgf, cpcc, despesas-execucao, licitacoes, compras, viagens.
    """
    selected = self.SOURCES if source == 'all' else [source]
    for name in selected:
        ETL.transform(name)
def test_method_chaining(self):
    """Check that ``source`` and ``sink`` return an ``ETL`` and that a chained pipeline runs."""
    input_path = os.path.join(INPUT_FILES_DIR, "single_message.json")
    fmt = "key: {} | value: {} | ts: {}"
    expected_line = "key: A123 | value: 15.6 | ts: 2020-10-07 13:28:43.399620+02:00"

    # Each configuration call has to hand back an ETL so calls can be chained.
    builder = ETL()
    self.assertIsInstance(builder.source(FileDataSource, input_path), ETL)
    self.assertIsInstance(builder.sink(ConsoleDataSink, fmt), ETL)
    del builder  # explicitly not required anymore

    # Build and run a second pipeline as one chained expression.
    with CaptureSTDOUT() as captured:
        ETL().source(FileDataSource, input_path).sink(ConsoleDataSink, fmt).run()

    self.assertEqual(1, len(captured))
    self.assertEqual(expected_line, captured[0])
def extract(self, source: str = 'all', replace: bool = False):
    """
    Run the extract step for one source, or for every source in ``self.SOURCES``.

    Calls ``ETL.extract`` for each combination of ``self.YEARS`` and month
    (documented as covering the 2015-2020 files).

    :param source: (str, default 'all') When 'all', extract every entry of
        ``self.SOURCES``; otherwise only the named source. Documented values:
        cpgf, cpcc, despesas-execucao, licitacoes, compras, viagens.
    :param replace: (bool, default False) Passed through to ``ETL.extract``;
        documented as whether to replace the existing files.
    """
    selected = self.SOURCES if source == 'all' else [source]
    for name in selected:
        # 'viagens' is requested once per year with an empty month string;
        # every other source is requested once per (year, month) pair.
        months = [''] if name == 'viagens' else self.MONTHS
        for year, month in itertools.product(self.YEARS, months):
            ETL.extract(name, year, month, replace)
def test_run(self):
    """Check that running a configured pipeline prints the single expected line."""
    input_path = os.path.join(INPUT_FILES_DIR, "single_message.json")
    fmt = "key: {} | value: {} | ts: {}"
    expected_line = "key: A123 | value: 15.6 | ts: 2020-10-07 13:28:43.399620+02:00"

    pipeline = ETL()
    pipeline.source(FileDataSource, input_path)
    pipeline.sink(ConsoleDataSink, fmt)

    with CaptureSTDOUT() as captured:
        pipeline.run()

    self.assertEqual(1, len(captured))
    self.assertEqual(expected_line, captured[0])
def load(self, host, database, username, password, source: str = 'all'):
    """
    Run the load step for one source, or for every source in ``self.SOURCES``.

    Delegates to ``ETL.load`` once per selected source; that step is
    documented as loading the processed files into the database.

    :param host: (str) DB address.
    :param database: (str) Database name.
    :param username: (str) User with create/drop/insert table/schema privileges.
    :param password: (str) User's password.
    :param source: (str, default 'all') When 'all', load every entry of
        ``self.SOURCES``; otherwise only the named source. Documented values:
        cpgf, cpcc, despesas-execucao, licitacoes, compras, viagens.
    """
    # Example values noted by the original author:
    #   host=localhost, database=transparencia, username=python, password=python
    selected = self.SOURCES if source == 'all' else [source]
    for name in selected:
        ETL.load(name, host, database, username, password)
def main() -> None:
    """Show the console menu in a loop, running an ETL process on request, until the user exits."""
    while True:
        clear()
        print("1. Start ETL")
        print("2. Exit")
        print(">> ", end='')
        choice = input()

        if choice == '2':
            print("Quiting...")
            break

        if choice != '1':
            print("Invalid input. Press 'Enter' to continue...")
            input()
            continue

        # Option 1: ask for both ends of the pipeline, then run it.
        source_cls, source_args = select_source()
        sink_cls, sink_args = select_sink()
        print("Launching ETL process...")
        ETL().source(source_cls, *source_args).sink(sink_cls, *sink_args).run()
        print("ETL process has finished. Press 'Enter' to continue...")
        input()
def test_object_construction(self):
    """Check that a freshly constructed ``ETL`` has no data source and no data sink."""
    pipeline = ETL()
    for attribute in ("data_source", "data_sink"):
        self.assertIsNone(getattr(pipeline, attribute))