import logging

import pendulum

# AIRFLOW_RAW, AIRFLOW_IMPORT, Session, ETL, Fact, preprocess, and
# new_tables are assumed to be defined elsewhere in this module.


def _ingest(file_path):
    """Preprocess a single raw file and run the fact-table ETL for it."""
    raw_path = AIRFLOW_RAW / f"{file_path.name[:22]}.tsv"
    ti = preprocess(raw_path)
    # xcom_pull's first two positional parameters are task_ids and dag_id,
    # so key and task id must be passed by keyword (as ingest_callable does
    # below); the original positional call pulled the wrong XCom.
    file_config = ti.xcom_pull(key="config", task_ids="init")
    extract_table_name = file_config["extract_table"]
    load_table_name = file_config["load_table"]
    # File stems follow "<22-char prefix>_YYYY_MM_DD"; the date portion
    # starts at character 23 of the stem.
    date = pendulum.from_format(file_path.stem[23:], "YYYY_MM_DD").naive()
    table_name = f"fact.session_{date.format('YYYY_MM_DD')}"
    new_tables.update([table_name, extract_table_name, load_table_name])
    Fact.etl(date, file_path.name, extract_table_name, load_table_name)
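
# For context, a minimal sketch of the "init" task that both functions pull
# their configuration from. The payload keys ("file_stem", "extract_table",
# "load_table") are taken from the lookups in this module, but the body and
# values below are illustrative assumptions, not the actual implementation.
def init_callable(**kwargs):
    """Push the shared file/table configuration to XCom (hypothetical)."""
    config = {
        "file_stem": "example_stem",         # hypothetical value
        "extract_table": "staging.extract",  # hypothetical value
        "load_table": "staging.load",        # hypothetical value
    }
    kwargs["ti"].xcom_push(key="config", value=config)
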
def ingest_callable(**kwargs):
    """Ingest preprocessed wifi log files into the database."""
    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")
    file_stem = file_config["file_stem"]
    extract_table_name = file_config["extract_table"]
    load_table_name = file_config["load_table"]
    logging.info(f"Looping through '{file_stem}*.csv'")
    ingest_errors = []
    for file_path in AIRFLOW_IMPORT.glob(f"{file_stem}*.csv"):
        logging.info(f"Ingesting {file_path}.")
        date = pendulum.from_format(file_path.stem[23:], "YYYY_MM_DD").naive()
        session = Session()
        try:
            if ETL.can_process("session_file", file_path, date, session):
                try:
                    ETL.commit_new("session_file", file_path, date, session)
                    Fact.etl(date, file_path.name, extract_table_name,
                             load_table_name)
                    ETL.set_status("session_file", file_path, date,
                                   "completed", session)
                except Exception:
                    # A bare `except:` would also swallow KeyboardInterrupt
                    # and SystemExit; catch Exception and log the traceback.
                    logging.exception(f"Failed to ingest {file_path}.")
                    ingest_errors.append(file_path)
                    # Roll back the failed transaction before writing the
                    # quarantine status on the same session.
                    session.rollback()
                    ETL.set_status("session_file", file_path, date,
                                   "quarantine", session)
        finally:
            # Close the session whether or not the file could be processed;
            # the original only closed it inside the can_process branch.
            session.close()
    if ingest_errors:
        logging.info(
            f"The following files could not be ingested: {ingest_errors}.")
        raise Exception(
            f"A total of {len(ingest_errors)} files could not be ingested. "
            "Failing DAG run."
        )
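
# A minimal sketch of how these callables could be wired into a DAG, assuming
# an Airflow 1.x-style PythonOperator with provide_context (which is what
# passes `ti` into **kwargs above). The DAG id, schedule, start date, and
# task id "ingest" are illustrative assumptions; the task id "init" matches
# the task_ids used by the xcom_pull calls above.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG("wifi_session_ingest",         # hypothetical DAG id
         start_date=datetime(2020, 1, 1),
         schedule_interval="@daily") as dag:
    init = PythonOperator(
        task_id="init",                 # matches xcom_pull(task_ids="init")
        python_callable=init_callable,
        provide_context=True,
    )
    ingest = PythonOperator(
        task_id="ingest",
        python_callable=ingest_callable,
        provide_context=True,
    )
    init >> ingest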