示例#1
0
 def _ingest(file_path):
     """Run the fact ETL for one session file and register the tables involved.

     The first 22 characters of ``file_path.name`` select the raw ``.tsv``
     under ``AIRFLOW_RAW`` that is handed to ``preprocess``. The part of the
     stem from index 23 onward is parsed as a ``YYYY_MM_DD`` date. The
     per-day fact table name plus the extract and load table names are added
     to ``new_tables`` before ``Fact.etl`` is invoked.
     """
     ti = preprocess(AIRFLOW_RAW / f"{file_path.name[:22]}.tsv")

     # NOTE(review): the arguments are positional here; if ``ti`` is an
     # Airflow TaskInstance, confirm they bind to the intended parameters.
     config = ti.xcom_pull("config", "init")
     extract_table_name = config["extract_table"]
     load_table_name = config["load_table"]

     # The date is embedded in the file stem after a fixed-width prefix.
     session_date = pendulum.from_format(
         file_path.stem[23:], "YYYY_MM_DD"
     ).naive()
     day_suffix = session_date.format('YYYY_MM_DD')

     new_tables.update(
         [f"fact.session_{day_suffix}", extract_table_name, load_table_name]
     )
     Fact.etl(
         session_date,
         file_path.name,
         extract_table_name,
         load_table_name,
     )
示例#2
0
文件: dag_etl.py 项目: NUS-IDS/cofi
def ingest_callable(**kwargs):
    """Ingest preprocessed wifi log files into the database.

    Reads the ``config`` XCom pushed by the ``init`` task, then processes
    every ``<file_stem>*.csv`` found under ``AIRFLOW_IMPORT``. Each file that
    ``ETL.can_process`` accepts is registered, loaded through ``Fact.etl``
    and marked ``completed``; a file whose ingestion raises is marked
    ``quarantine`` and remembered.

    Args:
        **kwargs: Task context; must contain ``ti``, the object whose
            ``xcom_pull`` provides the file configuration.

    Raises:
        Exception: After all files have been attempted, if at least one of
            them could not be ingested.
    """
    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")

    file_stem = file_config["file_stem"]
    extract_table_name = file_config["extract_table"]
    load_table_name = file_config["load_table"]

    logging.info(f"Looping through '{file_stem}*.csv'")

    ingest_errors = []

    for file_path in AIRFLOW_IMPORT.glob(f"{file_stem}*.csv"):
        logging.info(f"Ingesting {file_path}.")
        # The date is embedded in the file stem after a fixed-width prefix.
        date = pendulum.from_format(file_path.stem[23:], "YYYY_MM_DD").naive()
        session = Session()
        try:
            if not ETL.can_process("session_file", file_path, date, session):
                continue
            try:
                ETL.commit_new("session_file", file_path, date, session)
                Fact.etl(date, file_path.name, extract_table_name,
                         load_table_name)
                ETL.set_status("session_file", file_path, date, "completed",
                               session)
            except Exception:
                # Keep going with the remaining files, but leave a traceback
                # in the log so the failure is diagnosable.
                logging.exception(f"Failed to ingest {file_path}.")
                ingest_errors.append(file_path)
                ETL.set_status("session_file", file_path, date, "quarantine",
                               session)
        finally:
            # Close the session on every path, including skipped files and
            # failures while recording the quarantine status.
            session.close()

    # Report once, after every file has been attempted, so the summary
    # reflects all failures rather than only the first one.
    if ingest_errors:
        logging.info(
            f"The following files could not be ingested: {ingest_errors}.")
        raise Exception(
            f"A total of {len(ingest_errors)} files could not be ingested. Failing DAG run"
        )