def generate_vessel_dag(vessel, dag_id, settings):
    """Build the DAG for one vessel: tile sub-DAGs feeding a single aggregator.

    Args:
        vessel: Vessel name; prefixes the aggregator task id and is passed on
            to ``generate_tile_dag``.
        dag_id: Identifier given to the DAG created here.
        settings: Mapping read for ``"processing_tiles"`` and forwarded
            unchanged to ``generate_tile_dag``.

    Returns:
        The DAG containing the aggregator task and one SubDagOperator per tile.
    """
    home = configuration.get("core", "airflow_home")
    dag = DAG(dag_id=dag_id, description="", default_args=default_args)

    with dag:
        aggregator = EmsaTdmOperator(
            task_id=f"{vessel}_tile_aggregator",
            handler=f"{home}/processing/tasks.py:aggregate_tiles",
        )

        tile_count = tiles.adjust_num_tiles(settings["processing_tiles"])
        for tile in range(tile_count):
            tile_subdag_id = f"tile_{tile:03d}"
            # The child DAG id is "<parent dag id>.<subdag task id>".
            child_dag = generate_tile_dag(
                tile,
                vessel,
                f"{dag.dag_id}.{tile_subdag_id}",
                settings,
            )
            tile_operator = subdag_operator.SubDagOperator(
                task_id=tile_subdag_id,
                subdag=child_dag,
            )
            # Every tile sub-DAG is upstream of the aggregator.
            tile_operator >> aggregator

    return dag
def __init__(self, parent_dag, component_name, unique_name, driver, executor,
             input_dict, output_dict, exec_properties):
    """Wrap a TFX worker in a SubDagOperator and register it on the parent DAG.

    Args:
        parent_dag: DAG the new sub-DAG operator is attached to; also supplies
            the project path, logger config, cache flag, additional pipeline
            args and metadata connection config.
        component_name: Base name of the component.
        unique_name: Optional suffix; when truthy the worker is named
            ``component_name + '.' + unique_name``.
        driver: Passed to the worker as ``driver_class``.
        executor: Passed to the worker as ``executor_class``.
        input_dict: Forwarded to the worker; its values are registered as the
            inputs this node consumes.
        output_dict: Mapping of key to a list of output objects. Each output's
            ``source`` is overwritten here; its values are registered as what
            this node produces.
        exec_properties: Forwarded to the worker unchanged.
    """
    # Prepare parameters to create TFX worker.
    worker_name = (
        component_name + '.' + unique_name if unique_name else component_name)
    task_id = parent_dag.dag_id + '.' + worker_name

    # Create output object of appropriate type
    output_dir = self._get_working_dir(
        parent_dag.project_path, component_name, unique_name or '')

    # Update the output dict before providing to downstream components
    for output_key, outputs in output_dict.items():
        for output in outputs:
            output.source = _OrchestrationSource(
                key=output_key, component_id=task_id)

    worker_logger_config = logging_utils.LoggerConfig(
        log_root=parent_dag.logger_config.log_root,
        log_level=parent_dag.logger_config.log_level,
        pipeline_name=parent_dag.logger_config.pipeline_name,
        worker_name=worker_name)

    options = base_driver.DriverOptions(
        worker_name=worker_name,
        base_output_dir=output_dir,
        enable_cache=parent_dag.enable_cache)

    tfx_worker = _TfxWorker(
        component_name=component_name,
        task_id=task_id,
        parent_dag=parent_dag,
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
        driver_options=options,
        driver_class=driver,
        executor_class=executor,
        additional_pipeline_args=parent_dag.additional_pipeline_args,
        metadata_connection_config=parent_dag.metadata_connection_config,
        logger_config=worker_logger_config)

    worker_operator = subdag_operator.SubDagOperator(
        subdag=tfx_worker, task_id=worker_name, dag=parent_dag)

    parent_dag.add_node_to_graph(
        node=worker_operator,
        consumes=input_dict.values(),
        produces=output_dict.values())
def create_train_model_dag() -> models.DAG:
  """Builds the parent training DAG from a load-data and a train-model subdag.

  Storage and project settings are read from Airflow variables and turned into
  the DAG arguments shared by the parent DAG and both subdags. The load-data
  subdag is chained ahead of the train-model subdag.

  Returns:
    Parent training DAG.
  """
  storage_config = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  project_config = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)

  start_date = airflow.utils.dates.days_ago(1)
  dataflow_options = {
      'project': project_config['gcp_project_id'],
      'region': project_config['gcp_region'],
      'zone': project_config['gcp_zone'],
      'tempLocation': storage_config['gcs_temp_path'],
  }
  dag_args = {
      'start_date': start_date,
      'dataflow_default_options': dataflow_options,
  }

  parent = airflow_utils.initialize_airflow_dag(
      dag_id=_DAG_ID,
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **dag_args)

  load_step = subdag_operator.SubDagOperator(
      task_id=_LOAD_DATA_TASK_NAME,
      subdag=load_data_dag.create_dag(dag_args, _DAG_ID),
      dag=parent)
  train_step = subdag_operator.SubDagOperator(
      task_id=_TRAIN_MODEL_TASK_NAME,
      subdag=train_model_dag.create_dag(dag_args, _DAG_ID),
      dag=parent)

  # Loading data comes before training.
  helpers.chain(load_step, train_step)
  return parent
def create_cleanup_task(main_dag: models.DAG, args: _AirflowDagArgs,
                        task_id: str) -> subdag_operator.SubDagOperator:
  """Creates the storage cleanup subdag operator.

  Args:
    main_dag: DAG to add this operator to.
    args: Dict type DAG arguments.
    task_id: Task id for the subdag.

  Returns:
    SubDagOperator attached to main_dag that wraps the cleanup storage DAG.
  """
  return subdag_operator.SubDagOperator(
      task_id=task_id,
      subdag=cleanup_storage_dag.create_dag(args, _DAG_ID),
      dag=main_dag)
def create_activate_task(main_dag: models.DAG, args: _AirflowDagArgs,
                         task_id: str) -> subdag_operator.SubDagOperator:
  """Creates the activate_ga subdag operator.

  Args:
    main_dag: DAG to add this operator to.
    args: Dict type DAG arguments.
    task_id: Task id for the subdag.

  Returns:
    SubDagOperator attached to main_dag that wraps the activate_ga DAG.
  """
  return subdag_operator.SubDagOperator(
      task_id=task_id,
      subdag=activate_ga_dag.create_dag(args, _DAG_ID),
      dag=main_dag)
def create_preprocess_task(main_dag: models.DAG, args: _AirflowDagArgs,
                           task_id: str) -> subdag_operator.SubDagOperator:
  """Creates the preprocess subdag operator, using the PREDICTION type.

  Args:
    main_dag: DAG to add this operator to.
    args: Dict type DAG arguments.
    task_id: Task id for the subdag.

  Returns:
    SubDagOperator attached to main_dag that wraps the preprocess DAG.
  """
  return subdag_operator.SubDagOperator(
      task_id=task_id,
      subdag=preprocess_dag.create_dag(
          args,
          blockbuster_constants.PreprocessingType.PREDICTION,
          _DAG_ID),
      dag=main_dag)
def create_analyze_subdag(
    main_dag: models.DAG,
    args: _AirflowDagArgs) -> subdag_operator.SubDagOperator:
  """Creates the analyze subdag operator, using the TRAINING type.

  Args:
    main_dag: DAG to add this operator to.
    args: Dict type DAG arguments.

  Returns:
    SubDagOperator attached to main_dag that wraps the analyze DAG, with
    _TASK_NAME as its task id.
  """
  training_type = blockbuster_constants.PreprocessingType.TRAINING
  wrapped_dag = analyze_dag.create_dag(args, training_type, _DAG_ID)
  return subdag_operator.SubDagOperator(
      task_id=_TASK_NAME,
      subdag=wrapped_dag,
      dag=main_dag)
def create_prepare_source_subdag(
    main_dag: models.DAG,
    args: Dict[str, Union[Dict[str, Any], datetime.datetime]]
) -> subdag_operator.SubDagOperator:
  """Creates the prepare-source subdag operator, using the TRAINING type.

  Args:
    main_dag: DAG to add this operator to.
    args: Dict type DAG arguments.

  Returns:
    SubDagOperator attached to main_dag that wraps the prepare_source DAG,
    with prepare_source.DAG_NAME as its task id.
  """
  training_type = blockbuster_constants.PreprocessingType.TRAINING
  wrapped_dag = prepare_source.create_dag(args, training_type, _DAG_ID)
  return subdag_operator.SubDagOperator(
      task_id=prepare_source.DAG_NAME,
      subdag=wrapped_dag,
      dag=main_dag)
def generate_production_dag(settings):
    """Build the "production_draft" DAG: vessel sub-DAGs feeding one TDM task.

    Args:
        settings: Mapping read for ``"vessels"`` (an iterable of per-vessel
            configs, each indexed by ``"name"``) and forwarded unchanged to
            ``generate_vessel_dag``.

    Returns:
        The DAG, scheduled with a one-day interval, containing the
        all-vessels task and one SubDagOperator per configured vessel.
    """
    home = configuration.get("core", "airflow_home")
    dag = DAG(
        "production_draft",
        description="A draft DAG for using airflow inside docker",
        default_args=default_args,
        schedule_interval=dt.timedelta(days=1),
    )
    dag.doc_md = """# This is some documentation for the DAG"""

    with dag:
        combined_task = EmsaTdmOperator(
            task_id="all_vessels_tdm",
            handler=f"{home}/processing/tasks.py:generate_all_vessels_tdm",
        )

        for vessel_config in settings["vessels"]:
            vessel_subdag_id = f"{vessel_config['name']}_tdm"
            # The child DAG id is "<parent dag id>.<subdag task id>".
            child_dag = generate_vessel_dag(
                vessel_config["name"],
                f"{dag.dag_id}.{vessel_subdag_id}",
                settings,
            )
            vessel_operator = subdag_operator.SubDagOperator(
                task_id=vessel_subdag_id,
                subdag=child_dag,
            )
            # Every vessel sub-DAG is upstream of the all-vessels task.
            vessel_operator >> combined_task

    return dag