示例#1
0
 def test_sink(self):
     """Check the state of ``data_sink`` after calling ``ETL.sink``.

     After ``sink(ConsoleDataSink, <format>)`` the ``data_sink`` attribute
     must be set, be an instance of both ``DataSink`` and
     ``ConsoleDataSink``, and expose the given format string as
     ``output_format``.
     """
     fmt = "key: {} | value: {} | ts: {}"
     pipeline = ETL()
     pipeline.sink(ConsoleDataSink, fmt)

     attached = pipeline.data_sink
     self.assertIsNotNone(attached)
     # Checked from the more general type to the concrete one.
     for expected_type in (DataSink, ConsoleDataSink):
         self.assertIsInstance(attached, expected_type)
     self.assertEqual(fmt, attached.output_format)
示例#2
0
 def test_source(self):
     """Check the state of ``data_source`` after calling ``ETL.source``.

     After ``source(FileDataSource, <path>, <chunk size>)`` the
     ``data_source`` attribute must be set, be an instance of both
     ``DataSource`` and ``FileDataSource``, and expose the given values as
     ``source_filepath`` and ``chunk_size``.
     """
     path = "/path/to/file.json"
     size = 1024
     pipeline = ETL()
     pipeline.source(FileDataSource, path, size)

     attached = pipeline.data_source
     self.assertIsNotNone(attached)
     # Checked from the more general type to the concrete one.
     for expected_type in (DataSource, FileDataSource):
         self.assertIsInstance(attached, expected_type)
     self.assertEqual(path, attached.source_filepath)
     self.assertEqual(size, attached.chunk_size)
示例#3
0
    def transform(self, source: str = 'all'):
        """
        Unzip the downloaded files and append their data into a single file.
        The resulting files are located in data/processed.

        The work itself is delegated to ``ETL.transform``, called once per
        selected source.

        :param source: (str, default 'all') Source to process. When 'all',
        every entry of ``self.SOURCES`` is processed in order.
        Possible values: cpgf, cpcc, despesas-execucao, licitacoes, compras, viagens
        """
        if source == 'all':
            selected = self.SOURCES
        else:
            selected = [source]

        for name in selected:
            ETL.transform(name)
示例#4
0
 def test_method_chaining(self):
     """Check that ``source`` and ``sink`` return an ``ETL`` and can be chained.

     First asserts the return type of each builder call on its own, then
     runs a fully chained ``ETL().source(...).sink(...).run()`` while
     capturing stdout and expects exactly one formatted output line.
     """
     input_path = os.path.join(INPUT_FILES_DIR, "single_message.json")
     fmt = "key: {} | value: {} | ts: {}"
     expected_line = "key: A123 | value: 15.6 | ts: 2020-10-07 13:28:43.399620+02:00"

     # Each builder call must hand back an ETL instance.
     builder = ETL()
     self.assertIsInstance(builder.source(FileDataSource, input_path), ETL)
     self.assertIsInstance(builder.sink(ConsoleDataSink, fmt), ETL)
     del builder  # explicitly not required anymore

     # The same calls, expressed as a single chained expression.
     with CaptureSTDOUT() as captured:
         (
             ETL()
             .source(FileDataSource, input_path)
             .sink(ConsoleDataSink, fmt)
             .run()
         )

     self.assertEqual(1, len(captured))
     self.assertEqual(expected_line, captured[0])
示例#5
0
    def extract(self, source: str = 'all', replace: bool = False):
        """
        Extract the files of one source, or of every source, for each period.

        ``ETL.extract`` is called once per (year, month) combination built
        from ``self.YEARS`` and ``self.MONTHS``. The 'viagens' source is the
        exception: it is requested once per year with an empty month.

        :param source: (str, default 'all') Source to extract. When 'all',
        every entry of ``self.SOURCES`` is extracted in order.
        Possible values: cpgf, cpcc, despesas-execucao, licitacoes, compras, viagens
        :param replace: (bool, default False) Whether to replace the existing files.
        """
        selected = self.SOURCES if source == 'all' else [source]

        for name in selected:
            # 'viagens' has no monthly split, so pair each year with ''.
            months = [''] if name == 'viagens' else self.MONTHS
            for year, month in itertools.product(self.YEARS, months):
                ETL.extract(name, year, month, replace)
示例#6
0
 def test_run(self):
     """Check that ``run`` writes the single input message to stdout.

     Configures a file source and a console sink in separate statements,
     runs the pipeline while capturing stdout, and expects exactly one
     line matching the sink's output format.
     """
     input_path = os.path.join(INPUT_FILES_DIR, "single_message.json")
     fmt = "key: {} | value: {} | ts: {}"
     expected_line = "key: A123 | value: 15.6 | ts: 2020-10-07 13:28:43.399620+02:00"

     pipeline = ETL()
     pipeline.source(FileDataSource, input_path)
     pipeline.sink(ConsoleDataSink, fmt)

     with CaptureSTDOUT() as captured:
         pipeline.run()

     self.assertEqual(1, len(captured))
     self.assertEqual(expected_line, captured[0])
示例#7
0
    def load(self, host, database, username, password, source: str = 'all'):
        """
        Load the processed files into the database.

        The work itself is delegated to ``ETL.load``, called once per
        selected source with the same connection settings.

        :param host: (str) DB address, e.g. localhost.
        :param database: (str) Database name, e.g. transparencia.
        :param username: (str) User with create/drop/insert table/schema privileges.
        :param password: (str) User's password.
        :param source: (str, default 'all') Source to load. When 'all',
        every entry of ``self.SOURCES`` is loaded in order.
        Possible values: cpgf, cpcc, despesas-execucao, licitacoes, compras, viagens
        """
        if source == 'all':
            targets = self.SOURCES
        else:
            targets = [source]

        for name in targets:
            ETL.load(name, host, database, username, password)
示例#8
0
def main() -> None:
    """Run the interactive console menu until the user chooses to exit.

    Each iteration clears the screen, prints the menu and reads one line:

    * ``'1'`` asks for a source and a sink via ``select_source`` and
      ``select_sink``, runs an ETL pipeline built from them, then waits
      for Enter.
    * ``'2'`` prints a farewell message and leaves the loop.
    * anything else reports invalid input and waits for Enter.
    """
    while True:
        clear()
        print("1. Start ETL")
        print("2. Exit")
        print(">> ", end='')
        choice = input()

        if choice == '2':
            print("Quiting...")
            break

        if choice != '1':
            print("Invalid input. Press 'Enter' to continue...")
            input()
            continue

        source_cls, source_args = select_source()
        sink_cls, sink_args = select_sink()
        print("Launching ETL process...")
        (
            ETL()
            .source(source_cls, *source_args)
            .sink(sink_cls, *sink_args)
            .run()
        )
        print("ETL process has finished. Press 'Enter' to continue...")
        input()
示例#9
0
 def test_object_construction(self):
     """Check that a new ``ETL`` has ``data_source`` and ``data_sink`` set to None."""
     fresh = ETL()
     for attribute in ("data_source", "data_sink"):
         self.assertIsNone(getattr(fresh, attribute))