Example #1
def generate_vessel_dag(vessel, dag_id, settings):
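    """Builds a per-vessel subdag: one subdag per processing tile, fanned into an aggregator."""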
    home = configuration.get("core", "airflow_home")
    dag = DAG(
        dag_id=dag_id,
        description="",
        default_args=default_args,
    )
    with dag:
        tile_aggregator_task = EmsaTdmOperator(
            task_id=f"{vessel}_tile_aggregator",
            handler=f"{home}/processing/tasks.py:aggregate_tiles",
        )
        num_tiles = tiles.adjust_num_tiles(settings["processing_tiles"])
        for tile in range(num_tiles):
            tile_subdag_id = f"tile_{tile:03d}"
            tile_subdag = subdag_operator.SubDagOperator(
                task_id=tile_subdag_id,
                subdag=generate_tile_dag(
                    tile,
                    vessel,
                    f"{dag.dag_id}.{tile_subdag_id}",
                    settings,
                ))
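            # Every tile subdag feeds the single aggregator task (fan-in).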
            tile_subdag >> tile_aggregator_task
    return dag
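The `generate_tile_dag` factory called above is not part of this snippet. A minimal hypothetical sketch of its shape, assuming it mirrors the surrounding code; note that the dag_id it receives is already `<parent_dag_id>.<task_id>`, which is the naming SubDagOperator requires of its subdag:

def generate_tile_dag(tile, vessel, dag_id, settings):
    # Hypothetical sketch; the real factory lives elsewhere in the source.
    # dag_id arrives as "<parent_dag_id>.<task_id>", as SubDagOperator expects.
    home = configuration.get("core", "airflow_home")
    dag = DAG(dag_id=dag_id, default_args=default_args)
    with dag:
        EmsaTdmOperator(
            task_id=f"{vessel}_tile_{tile:03d}_tdm",  # assumed task name
            handler=f"{home}/processing/tasks.py:process_tile",  # assumed handler
        )
    return dag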
Example #2
  def __init__(self,
               parent_dag,
               component_name,
               unique_name,
               driver,
               executor,
               input_dict,
               output_dict,
               exec_properties):
    # Prepare parameters to create TFX worker.
    if unique_name:
      worker_name = component_name + '.' + unique_name
    else:
      worker_name = component_name
    task_id = parent_dag.dag_id + '.' + worker_name

    # Compute the working directory for this component's outputs.
    output_dir = self._get_working_dir(parent_dag.project_path, component_name,
                                       unique_name or '')

    # Update the output dict before providing it to downstream components.
    for k, output_list in output_dict.items():
      for single_output in output_list:
        single_output.source = _OrchestrationSource(key=k, component_id=task_id)

    my_logger_config = logging_utils.LoggerConfig(
        log_root=parent_dag.logger_config.log_root,
        log_level=parent_dag.logger_config.log_level,
        pipeline_name=parent_dag.logger_config.pipeline_name,
        worker_name=worker_name)
    driver_options = base_driver.DriverOptions(
        worker_name=worker_name,
        base_output_dir=output_dir,
        enable_cache=parent_dag.enable_cache)

    worker = _TfxWorker(
        component_name=component_name,
        task_id=task_id,
        parent_dag=parent_dag,
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
        driver_options=driver_options,
        driver_class=driver,
        executor_class=executor,
        additional_pipeline_args=parent_dag.additional_pipeline_args,
        metadata_connection_config=parent_dag.metadata_connection_config,
        logger_config=my_logger_config)
    subdag = subdag_operator.SubDagOperator(
        subdag=worker, task_id=worker_name, dag=parent_dag)

    parent_dag.add_node_to_graph(
        node=subdag,
        consumes=input_dict.values(),
        produces=output_dict.values())
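In Airflow 1.x, SubDagOperator validates that the child DAG's dag_id equals `<parent dag_id>.<task_id>`; the `task_id` computed above is presumably passed through as `_TfxWorker`'s dag_id so that invariant holds. A simplified sketch of that check (paraphrased, not the operator's exact code):

# Paraphrase of the Airflow 1.x SubDagOperator naming check.
expected_dag_id = parent_dag.dag_id + '.' + worker_name
if worker.dag_id != expected_dag_id:
    raise AirflowException('The subdag dag_id must be %s' % expected_dag_id)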
Example #3
def create_train_model_dag() -> models.DAG:
    """Creates the main dag for train model main dag.

  Returns:
    Parent training DAG.
  """
    bb_storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    bb_project_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    args = {
        'start_date': airflow.utils.dates.days_ago(1),
        'dataflow_default_options': {
            'project': bb_project_vars['gcp_project_id'],
            'region': bb_project_vars['gcp_region'],
            'zone': bb_project_vars['gcp_zone'],
            'tempLocation': bb_storage_vars['gcs_temp_path']
        },
    }

    main_dag = airflow_utils.initialize_airflow_dag(
        dag_id=_DAG_ID,
        schedule=None,
        retries=blockbuster_constants.DEFAULT_DAG_RETRY,
        retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)
    load_data_subdag = subdag_operator.SubDagOperator(
        task_id=_LOAD_DATA_TASK_NAME,
        subdag=load_data_dag.create_dag(args, _DAG_ID),
        dag=main_dag)
    train_model_subdag = subdag_operator.SubDagOperator(
        task_id=_TRAIN_MODEL_TASK_NAME,
        subdag=train_model_dag.create_dag(args, _DAG_ID),
        dag=main_dag)

    helpers.chain(load_data_subdag, train_model_subdag)
    return main_dag
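For the scheduler's DagBag to discover the pipeline, the factory is presumably invoked at module level in the DAG file (an assumed convention, not shown in this snippet):

# Assumed module-level instantiation so Airflow's DagBag picks up the DAG.
dag = create_train_model_dag()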
def create_cleanup_task(main_dag: models.DAG, args: _AirflowDagArgs,
                        task_id: str) -> subdag_operator.SubDagOperator:
  """Creates the main dag for cleaning up of GCS bucket.

  Args:
    main_dag: DAG to add this operator to.
    args: Dict type DAG arguments.
    task_id: Task id for the subdag.

  Returns:
    Parent prediction DAG for cleanup.
  """
  cleanup_gcs_subdag = cleanup_storage_dag.create_dag(args, _DAG_ID)

  return subdag_operator.SubDagOperator(
      task_id=task_id, subdag=cleanup_gcs_subdag, dag=main_dag)
def create_activate_task(main_dag: models.DAG, args: _AirflowDagArgs,
                         task_id: str) -> subdag_operator.SubDagOperator:
  """Creates activate_ga subdag for the batch_predictions pipeline.

  Args:
    main_dag: DAG to add this operator to.
    args: Dict type DAG arguments.
    task_id: Task id for the subdag.

  Returns:
    SubDagOperator to use within a DAG to do activation.
  """
  activate_ga_subdag = activate_ga_dag.create_dag(args, _DAG_ID)

  return subdag_operator.SubDagOperator(
      task_id=task_id, subdag=activate_ga_subdag, dag=main_dag)
def create_preprocess_task(main_dag: models.DAG, args: _AirflowDagArgs,
                           task_id: str) -> subdag_operator.SubDagOperator:
  """Creates preprocess subdag for the batch_predictions pipeline.

  Args:
    main_dag: DAG to add this operator to.
    args: Dict type DAG arguments.
    task_id: Task id for the subdag.

  Returns:
    SubDagOperator to use within a DAG to preprocess.
  """
  preprocess_subdag = preprocess_dag.create_dag(
      args, blockbuster_constants.PreprocessingType.PREDICTION, _DAG_ID)

  return subdag_operator.SubDagOperator(
      task_id=task_id, subdag=preprocess_subdag, dag=main_dag)
def create_analyze_subdag(
        main_dag: models.DAG,
        args: _AirflowDagArgs) -> subdag_operator.SubDagOperator:
    """Creates analyze pipeline subdag.

  Args:
    main_dag: DAG to add this operator to.
    args : Dict type DAG arguments.

  Returns:
    SubdagOperator to use within a DAG to create source for training.
  """
    analyze_subdag = analyze_dag.create_dag(
        args, blockbuster_constants.PreprocessingType.TRAINING, _DAG_ID)

    return subdag_operator.SubDagOperator(task_id=_TASK_NAME,
                                          subdag=analyze_subdag,
                                          dag=main_dag)
def create_prepare_source_subdag(
    main_dag: models.DAG, args: Dict[str, Union[Dict[str, Any],
                                                datetime.datetime]]
) -> subdag_operator.SubDagOperator:
    """Creates source preparing pipeline subdag.

  Args:
    main_dag: DAG to add this operator to.
    args : Dict type DAG arguments.

  Returns:
    SubdagOperator to use within a DAG to create source for training.
  """

    prepare_source_subdag = prepare_source.create_dag(
        args, blockbuster_constants.PreprocessingType.TRAINING, _DAG_ID)

    return subdag_operator.SubDagOperator(task_id=prepare_source.DAG_NAME,
                                          subdag=prepare_source_subdag,
                                          dag=main_dag)
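The `_AirflowDagArgs` annotation used by the helpers above is not defined in the snippet; the inline signature of `create_prepare_source_subdag` suggests it is a type alias along these lines (an assumption):

import datetime
from typing import Any, Dict, Union

# Assumed alias; create_prepare_source_subdag spells out the same shape inline.
_AirflowDagArgs = Dict[str, Union[Dict[str, Any], datetime.datetime]]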
Example #9
def generate_production_dag(settings):
    home = configuration.get("core", "airflow_home")
    dag = DAG("production_draft",
              description="A draft DAG for using airflow inside docker",
              default_args=default_args,
              schedule_interval=dt.timedelta(days=1))
    dag.doc_md = """# This is some documentation for the DAG"""
    with dag:
        all_vessels_tdm_task = EmsaTdmOperator(
            task_id="all_vessels_tdm",
            handler=f"{home}/processing/tasks.py:generate_all_vessels_tdm",
        )
        for vessel_config in settings["vessels"]:
            vessel_subdag_id = f"{vessel_config['name']}_tdm"
            vessel_subdag = subdag_operator.SubDagOperator(
                task_id=vessel_subdag_id,
                subdag=generate_vessel_dag(vessel_config["name"],
                                           f"{dag.dag_id}.{vessel_subdag_id}",
                                           settings),
            )
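            # Each vessel's TDM subdag must complete before the fleet-wide task.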
            vessel_subdag >> all_vessels_tdm_task

    return dag
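Both this factory and `generate_vessel_dag` in Example #1 reference a module-level `default_args` that the snippets omit; a minimal hypothetical stand-in:

import datetime as dt

# Hypothetical values; the real default_args live elsewhere in the module.
default_args = {
    "owner": "airflow",
    "start_date": dt.datetime(2020, 1, 1),
    "retries": 1,
    "retry_delay": dt.timedelta(minutes=5),
}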