Example 1
def add_numeric_stats_task(
    dag: models.DAG, feature_vars: dag_utils.FeatureConfigListMapping,
    storage_vars: dag_utils.AirflowVarsConfig
) -> bigquery_operator.BigQueryExecuteQueryOperator:
    """Builds an Operator that generates numeric fact stats within a DAG.

  Args:
    dag: The dag that the task needs to be added to.
    feature_vars: The parsed config values from airflow feature object variable.
    storage_vars: The parsed config values from airflow storage variable.

  Returns:
    Operator used to build numeric stats within a DAG.
  """
    num_feats = dag_utils.get_features(feature_vars, 'type', 'Numeric')
    stats_dataset = (f'{storage_vars["bq_working_project"]}.'
                     f'{storage_vars["bq_working_dataset"]}')
    numeric_stats_sql = pipeline_utils.render_sql_from_template(
        'numeric_stats',
        fact_table=f'{stats_dataset}.facts',
        feature_columns=[
            f"'{dag_utils.get_feature_name(x)}'" for x in num_feats
        ])

    return bigquery_operator.BigQueryExecuteQueryOperator(
        task_id=_GENERATE_NUMERIC_STATS_TASK,
        sql=numeric_stats_sql,
        use_legacy_sql=False,
        destination_dataset_table=f'{stats_dataset}.num_facts_stats_table',
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_TRUNCATE',
        allow_large_results=True,
        dag=dag)
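
The quoting of feature names for the SQL template is easy to miss. Below is a minimal, self-contained sketch of just that step, using plain dicts as an illustrative stand-in for the project's FeatureConfigListMapping (the config shape and feature names here are assumptions, not from the original source):

# Illustrative feature configs; the real pipeline parses these from an
# Airflow variable into a FeatureConfigListMapping.
feature_configs = [
    {'fact': 'totals.pageviews', 'type': 'Numeric'},
    {'fact': 'device.browser', 'type': 'Categorical'},
]

# Keep only numeric features, mirroring what
# dag_utils.get_features(feature_vars, 'type', 'Numeric') returns.
num_feats = [f for f in feature_configs if f['type'] == 'Numeric']

# Wrap each feature name in single quotes so it can be spliced into the
# rendered SQL as a string literal.
feature_columns = [f"'{f['fact']}'" for f in num_feats]
print(feature_columns)  # ["'totals.pageviews'"]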
Example 2
def get_user_session_sql_params(
    output_type: blockbuster_constants.PreprocessingType,
    feature_vars: dag_utils.FeatureConfigListMapping,
    prediction_vars: dag_utils.AirflowVarsConfig,
    preprocess_vars: dag_utils.AirflowVarsConfig,
    storage_vars: dag_utils.AirflowVarsConfig
) -> Dict[str, Union[str, List[str], Iterator[str]]]:
    """Returns the sql params required for the user session pipeline.

  Args:
    output_type: Indicate whether this pipeline is to be used for training or
      prediction.
    feature_vars: The parsed config values from airflow feature object variable.
    prediction_vars: The parsed config values from airflow prediction variable.
    preprocess_vars: The parsed config values from airflow preprocess variable.
    storage_vars: The parsed config values from airflow storage variable.

  Returns:
    The user session pipeline sql param names and values as a Dict.
  """
    selected_fields = dag_utils.get_select_field_array(feature_vars)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        source_table_suffix = 'training'
        proc_st_dt = get_date_from_str(preprocess_vars['start_date'])
        proc_ed_dt = get_date_from_str(preprocess_vars['end_date'])
    else:
        source_table_suffix = 'prediction'
        lookback_days = int(preprocess_vars['lookback_days'])
        lookback_gap_in_days = int(preprocess_vars['lookbackGapInDays'])
        leads_window = int(prediction_vars['leads_submission_window'])
        window_st_days_gap = leads_window + lookback_gap_in_days + lookback_days
        window_ed_days_gap = lookback_gap_in_days
        proc_st_dt = (datetime.datetime.today() -
                      datetime.timedelta(days=window_st_days_gap)).date()
        proc_ed_dt = (datetime.datetime.today() -
                      datetime.timedelta(days=window_ed_days_gap)).date()

    source_table = (f'{storage_vars["bq_working_project"]}.'
                    f'{storage_vars["bq_working_dataset"]}.'
                    f'ga_sessions_leads_{source_table_suffix}')
    selected_fields.append(
        f"CONCAT('BB', {preprocess_vars['userIdColumn']}) AS BB_id")
    # Tables to use in the FROM clause, as (expression, alias) tuples.
    from_tables = [(f'`{source_table}`', 'sessions')]
    if dag_utils.get_features(feature_vars, 'fact', r'^hits\..*'):
        from_tables.append(('UNNEST(hits)', 'hits'))
    return {
        'feature_columns': selected_fields,
        'from_tables': list(map(' AS '.join, from_tables)),
        'start_date': get_date_str_from_date(proc_st_dt),
        'end_date': get_date_str_from_date(proc_ed_dt)
    }
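
Two details in the prediction branch are worth seeing in isolation: the window-date arithmetic and the ' AS '.join trick that turns (expression, alias) tuples into FROM-clause entries. Here is a runnable sketch with illustrative parameter and table values (assumptions for demonstration, not values from the original source):

import datetime

# Illustrative lookback parameters; the real values come from the Airflow
# preprocess and prediction variables.
lookback_days = 30
lookback_gap_in_days = 1
leads_window = 3

# The window starts (leads_window + gap + lookback) days ago and ends
# (gap) days ago, so prediction never reads data newer than the gap.
window_st_days_gap = leads_window + lookback_gap_in_days + lookback_days
window_ed_days_gap = lookback_gap_in_days
today = datetime.datetime.today()
proc_st_dt = (today - datetime.timedelta(days=window_st_days_gap)).date()
proc_ed_dt = (today - datetime.timedelta(days=window_ed_days_gap)).date()
print(proc_st_dt, proc_ed_dt)

# ' AS '.join over an (expression, alias) tuple yields 'expression AS alias'.
from_tables = [('`project.dataset.ga_sessions_leads_prediction`', 'sessions'),
               ('UNNEST(hits)', 'hits')]
print(list(map(' AS '.join, from_tables)))
# ['`project.dataset.ga_sessions_leads_prediction` AS sessions',
#  'UNNEST(hits) AS hits']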