Example #1
def add_numeric_stats_task(
    dag: models.DAG, feature_vars: dag_utils.FeatureConfigListMapping,
    storage_vars: dag_utils.AirflowVarsConfig
) -> bigquery_operator.BigQueryExecuteQueryOperator:
    """Builds an Operator that generates numeric fact stats within a DAG.

    Args:
      dag: The dag that the task needs to be added to.
      feature_vars: The parsed config values from airflow feature object variable.
      storage_vars: The parsed config values from airflow storage variable.

    Returns:
      Operator used to build numeric stats within a DAG.
    """
    num_feats = dag_utils.get_features(feature_vars, 'type', 'Numeric')
    stats_dataset = (f'{storage_vars["bq_working_project"]}.'
                     f'{storage_vars["bq_working_dataset"]}')
    numeric_stats_sql = pipeline_utils.render_sql_from_template(
        'numeric_stats',
        fact_table=f'{stats_dataset}.facts',
        feature_columns=[
            f'\'{dag_utils.get_feature_name(x)}\'' for x in num_feats
        ])

    return bigquery_operator.BigQueryExecuteQueryOperator(
        task_id=_GENERATE_NUMERIC_STATS_TASK,
        sql=numeric_stats_sql,
        use_legacy_sql=False,
        destination_dataset_table=f'{stats_dataset}.num_facts_stats_table',
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_TRUNCATE',
        allow_large_results=True,
        dag=dag)
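
A minimal usage sketch, assuming the airflow_utils, dag_utils and blockbuster_constants modules used above are importable and the referenced Airflow Variables are configured; BLOCKBUSTER_FEATURE_CONFIG is an assumed constant name for the feature config variable, since only the storage constant appears on this page.

# Hypothetical wiring sketch: create a DAG and attach the numeric-stats task.
dag = airflow_utils.initialize_airflow_dag(
    'bb_numeric_stats_example', None, blockbuster_constants.DEFAULT_DAG_RETRY,
    blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
    blockbuster_constants.DEFAULT_START_DAYS_AGO)
feature_vars = airflow_utils.retrieve_airflow_variable_as_dict(
    blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)  # Assumed constant name.
storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
    blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
numeric_stats_task = add_numeric_stats_task(dag, feature_vars, storage_vars)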
Example #2
def _generate_batch_prediction_sql_template(template: str,
                                            bq_location: str) -> str:
    """Build batch_predict sql as per Measurement Protocol.

  Args:
    template: sql jinja template to load.
    bq_location: Location where the batch_predictions are stored by auto_ml

  Returns:
    SQL.
  """
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    activation_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    prediction_source_table = '{project}.{bigquery_location}.predictions'.format(
        project=storage_vars['bq_working_project'],
        bigquery_location=bq_location)
    sql_template = pipeline_utils.render_sql_from_template(
        template,
        source_table=prediction_source_table,
        num_segments=20,
        event_label=activation_vars['event_label'],
        event_action="'" + activation_vars['event_action'] + "'",
        event_category="'" + activation_vars['event_category'] + "'")

    return sql_template
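
Every example on this page renders its query through pipeline_utils.render_sql_from_template, whose implementation is not shown. The sketch below is only an assumption of what such a Jinja-based helper commonly looks like (templates stored as <name>.sql files in a local templates/ directory), not the actual pipeline_utils code.

import os

import jinja2


def render_sql_from_template(template_name: str, **kwargs) -> str:
    """Renders templates/<template_name>.sql with the given keyword args."""
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(
            os.path.join(os.path.dirname(__file__), 'templates')),
        autoescape=False,
        undefined=jinja2.StrictUndefined)  # Fail fast on missing parameters.
    return env.get_template(f'{template_name}.sql').render(**kwargs)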
Example #3
def add_user_session_task(
        dag: models.DAG, task_id: str,
        output_type: blockbuster_constants.PreprocessingType,
        feature_vars: dag_utils.FeatureConfigListMapping,
        prediction_vars: dag_utils.AirflowVarsConfig,
        preprocess_vars: dag_utils.AirflowVarsConfig,
        storage_vars: dag_utils.AirflowVarsConfig,
        training_vars: dag_utils.AirflowVarsConfig) -> models.BaseOperator:
    """Builds the UserSessionPipeline Operator.

    Args:
      dag: The dag that the task needs to be added to.
      task_id: Id string for this specific task within the DAG.
      output_type: Indicates whether this pipeline is to be used for training
        or prediction.
      feature_vars: The parsed config values from airflow feature object variable.
      prediction_vars: The parsed config values from airflow prediction variable.
      preprocess_vars: The parsed config values from airflow preprocess variable.
      storage_vars: The parsed config values from airflow storage variable.
      training_vars: The parsed config values from airflow training variable.

    Returns:
      Operator to use within a DAG to run the User Session Pipeline on Dataflow.
    """
    # Pick the output path based on whether this run is for training or prediction.
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        output_path = f'{storage_vars["gcs_temp_path"]}/training'

    elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
        output_path = f'{storage_vars["gcs_temp_path"]}/prediction'

    template_file_directory = storage_vars['gcs_dataflow_path']
    sql_vars = get_user_session_sql_params(
        output_type,
        feature_vars,
        prediction_vars,
        preprocess_vars,
        storage_vars,
    )
    sql = pipeline_utils.render_sql_from_template('usersession_source',
                                                  **sql_vars)

    return dataflow_operator.DataflowTemplatedJobStartOperator(
        task_id=task_id,
        template=f'{template_file_directory}/UserSessionPipeline',
        parameters={
            'inputBigQuerySQL': sql,
            'outputSessionsAvroPrefix': f'{output_path}/usersession-output/',
            'predictionFactName': training_vars['predictionFactName'],
            'predictionFactValues': training_vars['predictionFactValues']
        },
        dag=dag)
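
A hedged usage sketch for the training flow follows; BLOCKBUSTER_FEATURE_CONFIG and BLOCKBUSTER_TRAINING_CONFIG are assumed constant names (only the prediction, preprocess and storage constants appear on this page).

# Hypothetical usage: run the UserSession pipeline for training preprocessing.
dag = airflow_utils.initialize_airflow_dag(
    'bb_user_session_example', None, blockbuster_constants.DEFAULT_DAG_RETRY,
    blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
    blockbuster_constants.DEFAULT_START_DAYS_AGO)
feature_vars = airflow_utils.retrieve_airflow_variable_as_dict(
    blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)  # Assumed constant name.
prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
    blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
    blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
    blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
    blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)  # Assumed constant name.
user_session_task = add_user_session_task(
    dag, 'user_session_pipeline',
    blockbuster_constants.PreprocessingType.TRAINING, feature_vars,
    prediction_vars, preprocess_vars, storage_vars, training_vars)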
Example #4
def _add_prepare_automl_data_in_bq_task(
    dag: models.DAG, output_type: blockbuster_constants.PreprocessingType,
    prediction_vars: dag_utils.AirflowVarsConfig,
    storage_vars: dag_utils.AirflowVarsConfig
) -> bigquery_operator.BigQueryExecuteQueryOperator:
    """Adds the task to write the output to Big Query to dag.

  Args:
    dag: The dag that the task needs to be added to.
    output_type: Indicate whether this pipeline is to be used for training or
      prediction.
    prediction_vars: The parsed config values from airflow prediction variable.
    storage_vars: The parsed config values from airflow storage variable.

  Returns:
    The configured BigQueryOperator task to write input data for automl that was
    added to the dag.
  """
    exclude_from_output = ['userId', 'RECENT_BB_id', 'RECENT_most_recent_lead']
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        output_table = 'training'
        exclude_from_output.append('BB_id')
    elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
        output_table = 'prediction'
        exclude_from_output.append('MLDataSplit')
    features_table = dag_utils.construct_bq_table_path(
        storage_vars['bq_working_project'], storage_vars['bq_working_dataset'],
        f'ga_{output_table}_input')
    prepare_data_sql = pipeline_utils.render_sql_from_template(
        'prepare_data',
        features_table=features_table,
        exclude_from_output=exclude_from_output,
        inclusion_recency_days=prediction_vars['leads_submission_window'])

    output_dataset = dag_utils.construct_bq_table_path(
        storage_vars['bq_working_project'], storage_vars['bq_working_dataset'],
        f'automl_{output_table}_input')

    prepare_data_for_automl = bigquery_operator.BigQueryExecuteQueryOperator(
        task_id='prepare_data_for_automl',
        sql=prepare_data_sql,
        use_legacy_sql=False,
        destination_dataset_table=output_dataset,
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_TRUNCATE',
        allow_large_results=True,
        dag=dag,
    )
    return prepare_data_for_automl
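
# Usage sketch (hypothetical): for the prediction flow this helper is called
# once the ga_prediction_input table exists, and task ordering is then set
# with Airflow's bit-shift syntax, e.g.:
#
#   prepare_data_task = _add_prepare_automl_data_in_bq_task(
#       dag, blockbuster_constants.PreprocessingType.PREDICTION,
#       prediction_vars, storage_vars)
#   upstream_preprocessing_task >> prepare_data_task
#
# upstream_preprocessing_task is a stand-in for whichever task produces the
# input table in the real DAG.
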
def create_dag(
    args: Dict[str, Union[Dict[str, Any], dt.datetime]],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that create source table from GA tables.


  Args:
    args: Arguments to provide to the operators as defaults.
    output_type: Which set of variables to load for preprocessing and
      prediction.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    # Load params from Airflow Variables.
    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    dag_id = dag_utils.get_dag_id(DAG_NAME, parent_dag_name)
    dag = airflow_utils.initialize_airflow_dag(
        dag_id, None, blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        table_suffix = 'training'
        sql_vars = get_sql_params_for_training_stage(preprocess_vars)
    elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
        table_suffix = 'prediction'
        sql_vars = get_sql_params_for_prediction_stage(preprocess_vars,
                                                       prediction_vars)
    sql = pipeline_utils.render_sql_from_template('source_leads', **sql_vars)
    bq_working_project = storage_vars['bq_working_project']
    bq_working_dataset = storage_vars['bq_working_dataset']
    leads_table = get_leads_table(bq_working_project, bq_working_dataset,
                                  table_suffix)
    gcp_region = bb_vars['gcp_region']
    add_prepare_source_data_task_to_dag(dag, sql, leads_table, gcp_region)
    return dag
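
A module-level instantiation sketch so the Airflow scheduler can discover the DAG; the empty args dict is illustrative only, since the keyword arguments accepted by initialize_airflow_dag are not shown on this page.

# Hypothetical module-level instantiation for the training flow; real
# deployments would pass operator default args through the first parameter.
training_source_dag = create_dag(
    {}, blockbuster_constants.PreprocessingType.TRAINING)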