def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that pushes data from Google Cloud Storage to GA.

  Args:
    args: Arguments to provide to the Airflow DAG object as defaults.
    parent_dag_name: If provided, the DAG is built as a SubDAG of this parent.

  Returns:
    The DAG object.
  """
  dag = airflow_utils.initialize_airflow_dag(
      dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)

  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)

  bucket_name, bucket_path = dag_utils.extract_bucket_parts(
      storage_vars['gcs_output_path'])
  _cleanup_storage_task(dag, bucket_name, bucket_path)

  return dag
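
For context, a module defining this DAG usually instantiates it at import time so the Airflow scheduler can discover it. A minimal usage sketch (hedged: the keys accepted in args depend on airflow_utils.initialize_airflow_dag, so the empty mapping and the parent name below are placeholders, not taken from the source):

# Hypothetical module-level instantiation; the empty args mapping and the
# parent DAG name are illustrative placeholders only.
cleanup_dag = create_dag(args={})

# When embedded in a larger pipeline, the same factory can be called with a
# parent name so the result is built as a SubDAG:
cleanup_subdag = create_dag(args={}, parent_dag_name='blockbuster_main')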
Example #2
def _add_storage_to_ga_task(dag, bucket_uri, ga_tracking_id, bq_dataset,
                            bq_table):
    """Adds Google Cloud Storage(GCS) to Google Analytics data transfer task.

  Args:
    dag: The dag object which will include this task.
    bucket_uri: The uri of the GCS path containing the data.
    ga_tracking_id: The Google Analytics tracking id.
    bq_dataset: BQ data set.
    bq_table: BQ Table for monitoring purposes.

  Returns:
    The task to move data from GCS to GA.
  """
    bucket_name, bucket_prefix = dag_utils.extract_bucket_parts(bucket_uri)
    return (data_connector_operator.DataConnectorOperator(
        dag_name=_DAG_NAME,
        task_id='storage_to_ga',
        input_hook=hook_factory.InputHookType.GOOGLE_CLOUD_STORAGE,
        output_hook=hook_factory.OutputHookType.GOOGLE_ANALYTICS,
        enable_monitoring=False,
        monitoring_dataset=bq_dataset,
        monitoring_table=bq_table,
        monitoring_bq_conn_id='bigquery_default',
        gcs_bucket=bucket_name,
        gcs_prefix=bucket_prefix,
        gcs_content_type=_GCS_CONTENT_TYPE,
        ga_base_params=_GA_BASE_PARAMS,
        ga_tracking_id=ga_tracking_id,
        dag=dag))
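
A hedged sketch of how this helper might be wired into a DAG factory like the one in Example #1; the Airflow Variable keys and the monitoring table name below are assumptions, not taken from the source:

# Hypothetical wiring; ga_tracking_id, bq_working_dataset and the table name
# 'monitoring' are assumed keys/values shown for illustration only.
activation_vars = airflow_utils.retrieve_airflow_variable_as_dict(
    blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
storage_to_ga_task = _add_storage_to_ga_task(
    dag,
    storage_vars['gcs_output_path'],
    activation_vars['ga_tracking_id'],
    storage_vars['bq_working_dataset'],
    'monitoring')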
Example #3
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that analyzes data before preprocessing.

    Args:
      args: Arguments to provide to the Operators as defaults.
      output_type: Which set of Variables to load for preprocessing.
      parent_dag_name: If provided, the DAG is built as a SubDAG of the parent.

    Returns:
      The DAG object.
    """
    # Load params from Airflow Variables
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
    feature_vars = dag_utils.get_feature_config_val(
        blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)
    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name), None,
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        bucket_name, bucket_path = dag_utils.extract_bucket_parts(
            f'{storage_vars["gcs_temp_path"]}/training')
    else:
        bucket_name, bucket_path = dag_utils.extract_bucket_parts(
            f'{storage_vars["gcs_temp_path"]}/prediction')

    clean_temp_dir_task = gcs_delete_operator.GoogleCloudStorageDeleteOperator(
        task_id=_CLEAN_TEMP_DIR_TASK,
        bucket=bucket_name,
        directory=bucket_path,
        dag=dag)

    user_session_pipeline_task = add_user_session_task(
        dag, _USER_SESSION_TASK_ID, output_type, feature_vars, prediction_vars,
        preprocess_vars, storage_vars, training_vars)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        data_visualization_pipeline_task = add_data_visualization_task(
            dag, _DATA_VISUALISATION_TASK_ID, preprocess_vars, storage_vars)
        generate_categorical_stats_task = add_categorical_stats_task(
            dag, feature_vars, storage_vars)
        generate_numeric_stats_task = add_numeric_stats_task(
            dag, feature_vars, storage_vars)
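        # helpers.chain links the tasks in order: cleanup runs first, then the
        # user session pipeline, then data visualization, which fans out into
        # the categorical and numeric stats tasks (those two run in parallel).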
        helpers.chain(
            clean_temp_dir_task, user_session_pipeline_task,
            data_visualization_pipeline_task,
            [generate_categorical_stats_task, generate_numeric_stats_task])
    else:
        helpers.chain(clean_temp_dir_task, user_session_pipeline_task)
    return dag
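
As with Example #1, the module would normally build the DAG at import time so Airflow can discover it. A minimal sketch, assuming PreprocessingType also exposes a PREDICTION member (only TRAINING appears explicitly above) and that an empty args mapping is acceptable:

# Hypothetical module-level instantiation; the PREDICTION member and the empty
# args mapping are assumptions shown for illustration.
analyze_training_dag = create_dag(
    args={}, output_type=blockbuster_constants.PreprocessingType.TRAINING)
analyze_prediction_dag = create_dag(
    args={}, output_type=blockbuster_constants.PreprocessingType.PREDICTION)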