def process(client, params):
    """
    The ETL pipeline.

    It contains the main nodes of the extract-transform-load pipeline from
    the COVID-19 process. First, the preparation step runs; per the original
    description it works out which files need to be downloaded from the
    url-source. Then, for each entry in ``params.files_to_download``, the
    pipeline goes through each node, namely: gathering, transforming,
    storing and visualizing the information.

    Parameters
    ----------
    client
        Passed through unchanged to every node call.
    params
        Shared parameter object. ``params.files_to_download`` is iterated
        and ``params.file`` is overwritten with the current entry on every
        iteration, so the object is mutated by this function.

    Notes
    -----
    A node's ``update`` is only called when its ``done`` check returns a
    falsy value.
    """
    # presumably this is what populates params.files_to_download -- TODO confirm
    data_preparation.run(client, params)

    # NOTE(review): the nesting below was reconstructed from a source whose
    # whitespace had been collapsed; confirm that all four nodes are meant
    # to run once per file rather than only the gathering node.
    for file in params.files_to_download:
        # The current file is handed to the nodes through the shared params
        # object rather than as an explicit argument.
        params.file = file

        if not data_gathering.done(client, params):
            data_gathering.update(client, params)

        if not data_transform.done(client, params):
            data_transform.update(client, params)

        if not data_storage.done(client, params):
            data_storage.update(client, params)

        if not data_viz.done(client, params):
            data_viz.update(client, params)
def process(client, params):
    """Run the ETL pipeline: check requirements, then refresh stale nodes.

    The extraction, transform and storage nodes are visited in that order.
    For each one, ``done(client, params)`` is consulted and
    ``update(client, params)`` is called only when the check is falsy.
    """
    # Per the original note, the check fails when something is missing.
    requirements.check(client, params)

    for node in (data_extraction, data_transform, data_storage):
        if node.done(client, params):
            continue  # nothing to redo for this node
        node.update(client, params)
def process(client, params):
    """Run the ETL pipeline.

    The data requirements are checked first. Afterwards the gathering,
    transform and storage nodes are visited in order, and a node's
    ``update`` is invoked only when its ``done`` check is falsy.
    """
    data_requirements.check(client, params)

    stages = (data_gathering, data_transform, data_storage)
    for stage in stages:
        already_done = stage.done(client, params)
        if not already_done:
            stage.update(client, params)
def process(client, params):
    """Run the ETL pipeline.

    Drives the main nodes of the extract-transform-load process: the
    preparation step always runs, then the gathering, transform and storage
    nodes are visited in order and updated only when their ``done`` check
    is falsy.
    """
    data_preparation.run(client, params)

    for node in (data_gathering, data_transform, data_storage):
        if node.done(client, params):
            continue  # node reports its work as complete
        node.update(client, params)
def process(client, params):
    """
    The ETL pipeline.

    It chains together the main nodes of the extract-transform-load
    process.

    Parameters
    ----------
    client: Client
    params: Params

    Notes
    -----
    Each task is treated as a conceptual **node**, and this function,
    `process`, is the **pipeline** that ties all of the tasks together.
    Each node is a .py file imported from the `nodes` directory.

    A node can be in one of the following states:

    - up-to-date: the task for the given input parameters is already
      completed, so no rework is needed.

    - out-of-date: the task is not completed and should be run.
    """
    data_preparation.run(client, params)

    # Visit the nodes in pipeline order; only out-of-date nodes are re-run.
    pipeline = (data_gathering, data_transform, data_storage, data_viz)
    for node in pipeline:
        up_to_date = node.done(client, params)
        if up_to_date:
            continue
        node.update(client, params)
def process(client, params): """ This is the core of the ETL-pipeline. """ # It fails when missing something. requirements.check(client, params) # --> params.csv_files for file_url in params.csv_files: params.file_url = file_url if not data_extraction.done(client, params): data_extraction.update(client, params) # --> if not data_transform.done(client, params): data_transform.update(client, params) if not data_storage.done(client, params): data_storage.update(client, params)
def process(client, params):
    """
    The ETL pipeline.

    It contains the main nodes of the extract-transform-load pipeline from
    the process: preparation, gathering, transform, storage and
    visualization.

    Notes
    -----
    Unlike the other nodes, ``data_gathering.done`` is used here as
    returning a sized collection (its length is tested), and that
    collection is forwarded to ``data_gathering.update``.

    The transform node returns three objects that are stored one by one
    under the names held in ``nome_pass``, ``nome_linha`` and
    ``nome_metro``.
    """
    data_preparation.run(client, params)

    # A non-empty result triggers an update; presumably the collection
    # lists what is still pending -- TODO confirm.
    gather_done = data_gathering.done(client, params)
    if len(gather_done) > 0:
        data_gathering.update(client, params, gather_done)

    # NOTE(review): hard-coded override that discards the value computed
    # above; looks like a leftover from debugging -- confirm. The nesting
    # of this function was reconstructed from a whitespace-collapsed
    # source, so also confirm this line is not meant to sit inside the
    # `if` above.
    gather_done = ['onibus']
    if not data_transform.done(client, params, gather_done):
        df_pass, df_linha, df_metro = data_transform.update(
            client, params, gather_done)

    # Names passed as the last argument of each storage call below.
    # NOTE(review): '******' looks like a masked/redacted value -- confirm.
    nome_pass = '******'
    nome_linha = 'linha'
    nome_metro = 'metro'
    # NOTE(review): with this layout df_pass/df_linha/df_metro are unbound
    # when the transform node is already done but storage is not, which
    # would raise NameError -- confirm the intended nesting.
    if not data_storage.done(client, params):
        data_storage.update(client, params, df_pass, nome_pass)
        data_storage.update(client, params, df_linha, nome_linha)
        data_storage.update(client, params, df_metro, nome_metro)

    if not data_viz.done(client, params):
        data_viz.update(client, params)