Example #1
def _generate_batch_prediction_sql_template(template: str,
                                            bq_location: str) -> str:
    """Build batch_predict sql as per Measurement Protocol.

  Args:
    template: SQL Jinja template to load.
    bq_location: Location where the batch predictions are stored by AutoML.

  Returns:
    The rendered SQL string.
  """
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    activation_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    prediction_source_table = '{project}.{bigquery_location}.predictions'.format(
        project=storage_vars['bq_working_project'],
        bigquery_location=bq_location)
    sql_template = pipeline_utils.render_sql_from_template(
        template,
        source_table=prediction_source_table,
        num_segments=20,
        event_label=activation_vars['event_label'],
        event_action="'" + activation_vars['event_action'] + "'",
        event_category="'" + activation_vars['event_category'] + "'")

    return sql_template
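
For reference, render_sql_from_template above expects a Jinja SQL template with placeholders matching the keyword arguments it receives. A minimal, hypothetical sketch of such a template follows (the project's real batch_predict template will differ, and the column names are placeholders):

_EXAMPLE_BATCH_PREDICT_TEMPLATE = """
SELECT
  clientId,                                  -- placeholder column name
  {{ event_label }} AS event_label,
  {{ event_action }} AS event_action,
  {{ event_category }} AS event_category,
  NTILE({{ num_segments }}) OVER (ORDER BY predicted_score DESC) AS segment
FROM `{{ source_table }}`
"""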
Example #2
def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that trains a new model using the training dataset.

  Args:
    args: Arguments to provide to the Airflow DAG object as defaults.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    dag = airflow_utils.initialize_airflow_dag(
        dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
        schedule=None,
        retries=blockbuster_constants.DEFAULT_DAG_RETRY,
        retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)

    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)

    create_model_task = _add_create_model_task(dag, bb_vars, training_vars)
    update_airflow_variable_task = _add_update_airflow_variable_task(dag)

    helpers.chain(create_model_task, update_airflow_variable_task)
    return dag
Example #3
def create_training_preprocess_dag():
  """Creates the main dag for preprocess main dag.

  Returns:
    Parent training DAG for preprocessing.
  """
  bb_storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  bb_project_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  args = {
      'start_date': airflow.utils.dates.days_ago(1),
      'dataflow_default_options': {
          'project': bb_project_vars['gcp_project_id'],
          'region': bb_project_vars['gcp_region'],
          'zone': bb_project_vars['gcp_zone'],
          'tempLocation': bb_storage_vars['gcs_temp_path']
      },
  }

  main_dag = airflow_utils.initialize_airflow_dag(
      dag_id=_DAG_ID,
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)

  create_preprocess_subdag(main_dag, args)

  return main_dag
Example #4
def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that pushes data from Google Cloud Storage to GA.

  Args:
    args: Arguments to provide to the Airflow DAG object as defaults.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    dag = airflow_utils.initialize_airflow_dag(
        dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
        schedule=None,
        retries=blockbuster_constants.DEFAULT_DAG_RETRY,
        retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)

    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    activation_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_ACTIVATION_CONFIG)

    bucket_uri = storage_vars['gcs_output_path']
    bq_dataset = storage_vars['bq_working_dataset']
    bq_table = 'monitoring'
    ga_tracking_id = activation_vars['ga_tracking_id']

    _add_storage_to_ga_task(dag, bucket_uri, ga_tracking_id, bq_dataset,
                            bq_table)

    return dag
Example #5
def _store_final_results_to_bq(dag: models.DAG, task_id: str,
                               batch_predict_sql: str) -> models.BaseOperator:
    """Store MP complaint results in Bigquery before GA transfer.

  Args:
    dag: The DAG to add this operator to.
    task_id: ID for this specific task within the DAG.
    batch_predict_sql: Custom query to pick records and add additional columns
      as per the MP protocol.

  Returns:
    Operator to use within a DAG to store prediction results in BigQuery.
  """
    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)

    final_output_table = '{project}.{dataset}.final_output'.format(
        project=storage_vars['bq_working_project'],
        dataset=storage_vars['bq_working_dataset'])

    return bigquery_operator.BigQueryOperator(
        task_id=task_id,
        sql=batch_predict_sql,
        use_legacy_sql=False,
        destination_dataset_table=final_output_table,
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_TRUNCATE',
        allow_large_results=True,
        location=bb_vars['gcp_region'],
        dag=dag,
    )
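
Examples 1 and 8 show the adjacent steps of this flow: the SQL rendered by _generate_batch_prediction_sql_template is materialised into the final_output table by the operator above, and that table is later exported to GCS. A hypothetical wiring sketch (assuming the three helpers live in the same module, and that dag, the template name, and the prediction location are supplied by the caller) could be:

# Hypothetical glue code; the values passed below are placeholders.
batch_predict_sql = _generate_batch_prediction_sql_template(
    template='batch_predict',            # placeholder template name
    bq_location=prediction_bq_location)  # placeholder AutoML output location
store_task = _store_final_results_to_bq(dag, 'store_final_results',
                                        batch_predict_sql)
export_task = _transfer_bigquery_to_gcs(dag, 'bq_to_gcs')
helpers.chain(store_task, export_task)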
Example #6
def create_dag(
    args: Mapping[Text, Any],
    parent_dag_name: Optional[Text] = None,
) -> models.DAG:
    """Generates a DAG that loads data into an AutoML Dataset.

  Args:
    args: Arguments to provide to the AutoML operators as defaults.
    parent_dag_name: If this value is provided, the newly created dag object is
      made a subdag of the parent dag.

  Returns:
    The DAG object.
  """
    # Load params from Variables.
    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    # Create dag.
    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
        None,  # schedule
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO,
        local_macros={
            'get_column_spec': _get_column_spec,
            'target': 'predictionLabel',
            'extract_object_id':
            automl_hook.AutoMLTablesHook.extract_object_id,
        },
        **args)
    dataset_creation_task = _add_dataset_creation_task(dag, bb_vars)
    dataset_id = (
        "{{ task_instance.xcom_pull('create_dataset_task', key='dataset_id') }}"
    )
    import_data_task = _add_import_data_task(dag, dataset_id, bb_vars,
                                             storage_vars)
    list_table_specs_task = _add_list_table_specs_task(dag, dataset_id,
                                                       bb_vars)
    list_column_specs_task = _add_list_column_specs_task(
        dag, dataset_id, bb_vars)
    update_dataset_task = _add_update_dataset_task(dag, bb_vars)
    update_airflow_variable_task = _add_update_airflow_variable_task(dag)
    helpers.chain(dataset_creation_task, import_data_task,
                  list_table_specs_task, list_column_specs_task,
                  update_dataset_task, update_airflow_variable_task)
    return dag
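
The dataset_id string above is an Airflow template that, at run time, pulls the ID presumably pushed to XCom by create_dataset_task. The companion _add_update_airflow_variable_task is not shown in these examples; a plausible sketch of such a task (hypothetical implementation, with an assumed task ID and Variable name) might be:

from airflow.operators import python_operator


def _add_update_airflow_variable_task(dag: models.DAG) -> models.BaseOperator:
    """Hypothetical sketch: persists the created dataset ID as an Airflow Variable."""

    def _update_variable(**context):
        dataset_id = context['task_instance'].xcom_pull(
            'create_dataset_task', key='dataset_id')
        # 'blockbuster_recent_dataset_id' is an assumed Variable name.
        models.Variable.set('blockbuster_recent_dataset_id', dataset_id)

    return python_operator.PythonOperator(
        task_id='update_airflow_variable_task',
        python_callable=_update_variable,
        provide_context=True,  # required on Airflow 1.x to receive the context
        dag=dag)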
Example #7
def _get_batch_predictions(dag: models.DAG,
                           task_id: str) -> models.BaseOperator:
    """Batch Predict the GA leads using pre-trained AutoML model.

  Args:
    dag: The DAG to add this operator to.
    task_id: ID for this specific task within the DAG.

  Returns:
    Operator to use within a DAG to run the batch prediction pipeline on AutoML.
  """
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)

    model_id = airflow_utils.get_airflow_variable(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_RECENT_MODEL)

    bq_input_path = 'bq://{project}.{dataset}.automl_prediction_input'.format(
        project=storage_vars['bq_working_project'],
        dataset=storage_vars['bq_working_dataset'])

    output_path = f'bq://{storage_vars["bq_working_project"]}'

    output_key = 'bq_output_dataset'
    task_batch_predict = (automl_tables_batch_prediction_operator.
                          AutoMLTablesBatchPredictionOperator(
                              task_id=task_id,
                              model_id=model_id,
                              input_path=bq_input_path,
                              output_path=output_path,
                              output_key=output_key,
                              conn_id='google_cloud_default',
                              dag=dag))

    return task_batch_predict
Example #8
def _transfer_bigquery_to_gcs(dag, task_id) -> models.BaseOperator:
    """Pipeline to transfer finally transferable output to GCS.

  Args:
    dag: The DAG to add this operator to.
    task_id: ID for this specific task within the DAG.

  Returns:
    Operator to use within a DAG to move the final records to GCS.
  """
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)

    final_output_uri = '{path}/result-{timestamp}-*.json'.format(
        path=storage_vars['gcs_output_path'], timestamp=int(time.time()))

    final_output_table = '{project}.{dataset}.final_output'.format(
        project=storage_vars['bq_working_project'],
        dataset=storage_vars['bq_working_dataset'])

    return bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id=task_id,
        source_project_dataset_table=final_output_table,
        destination_cloud_storage_uris=[final_output_uri],
        export_format='NEWLINE_DELIMITED_JSON',
        dag=dag)
Example #9
def create_dag(
    args: Dict[str, Union[Dict[str, Any], dt.datetime]],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that create source table from GA tables.


  Args:
    args: Arguments to provide to the operators as defaults.
    output_type: Which set of variables to load for preprocessing and
      prediction.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    # Load params from Airflow Variables.
    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    dag_id = dag_utils.get_dag_id(DAG_NAME, parent_dag_name)
    dag = airflow_utils.initialize_airflow_dag(
        dag_id, None, blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        table_suffix = 'training'
        sql_vars = get_sql_params_for_training_stage(preprocess_vars)
    elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
        table_suffix = 'prediction'
        sql_vars = get_sql_params_for_prediction_stage(preprocess_vars,
                                                       prediction_vars)
    sql = pipeline_utils.render_sql_from_template('source_leads', **sql_vars)
    bq_working_project = storage_vars['bq_working_project']
    bq_working_dataset = storage_vars['bq_working_dataset']
    leads_table = get_leads_table(bq_working_project, bq_working_dataset,
                                  table_suffix)
    gcp_region = bb_vars['gcp_region']
    add_prepare_source_data_task_to_dag(dag, sql, leads_table, gcp_region)
    return dag
Example #10
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that preprocesses data.

  Args:
    args: Arguments to provide to the Operators as defaults.
    output_type: Which set of Variables to load for preprocessing.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
    feature_options = dag_utils.get_feature_config_val(
        blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)

    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name), None,
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)

    mlwp_sliding_window_pipeline_task = _add_mlwp_sliding_window_pipeline_task(
        dag, output_type, prediction_vars, preprocess_vars, storage_vars,
        training_vars)
    mlwp_generate_features_pipeline_task = _add_mlwp_generate_features_pipeline_task(
        dag, output_type, feature_options, storage_vars)
    prepare_automl_data_in_bq_task = _add_prepare_automl_data_in_bq_task(
        dag, output_type, prediction_vars, storage_vars)
    helpers.chain(mlwp_sliding_window_pipeline_task,
                  mlwp_generate_features_pipeline_task,
                  prepare_automl_data_in_bq_task)

    return dag
Example #11
def create_prediction_activate_dag():
    """Creates the main dag for analyze main dag.

  Returns:
    Parent training DAG for analyzing.
  """
    bb_storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    bb_project_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    args = {
        'start_date': airflow.utils.dates.days_ago(1),
        'dataflow_default_options': {
            'project': bb_project_vars['gcp_project_id'],
            'region': bb_project_vars['gcp_region'],
            'zone': bb_project_vars['gcp_zone'],
            'tempLocation': bb_storage_vars['gcs_temp_path']
        },
    }

    main_dag = airflow_utils.initialize_airflow_dag(
        dag_id=_DAG_ID,
        schedule=None,
        retries=blockbuster_constants.DEFAULT_DAG_RETRY,
        retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)

    prepare_source_task = create_prepare_source_task(
        main_dag, args, prepare_source_dag.DAG_NAME)
    analyze_task = create_analyze_task(main_dag, args, 'analyze')
    preprocess_task = create_preprocess_task(main_dag, args, 'preprocess')
    predict_task = create_predict_task(main_dag, args, 'batch_predict')
    activate_task = create_activate_task(main_dag, args, 'activate_ga')
    clean_up_task = create_cleanup_task(main_dag, args, 'cleanup_gcs')

    # Create task dependency pipeline.
    prepare_source_task.set_downstream(analyze_task)
    analyze_task.set_downstream(preprocess_task)
    preprocess_task.set_downstream(predict_task)
    predict_task.set_downstream(activate_task)
    activate_task.set_downstream(clean_up_task)
    return main_dag
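
The explicit set_downstream calls above build a strictly linear chain; the same dependencies could equivalently be declared with helpers.chain, mirroring the style used by the other DAG factories in these examples:

    helpers.chain(prepare_source_task, analyze_task, preprocess_task,
                  predict_task, activate_task, clean_up_task)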
Example #12
def create_train_model_dag() -> models.DAG:
    """Creates the main dag for train model main dag.

  Returns:
    Parent training DAG.
  """
    bb_storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    bb_project_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    args = {
        'start_date': airflow.utils.dates.days_ago(1),
        'dataflow_default_options': {
            'project': bb_project_vars['gcp_project_id'],
            'region': bb_project_vars['gcp_region'],
            'zone': bb_project_vars['gcp_zone'],
            'tempLocation': bb_storage_vars['gcs_temp_path']
        },
    }

    main_dag = airflow_utils.initialize_airflow_dag(
        dag_id=_DAG_ID,
        schedule=None,
        retries=blockbuster_constants.DEFAULT_DAG_RETRY,
        retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)
    load_data_subdag = subdag_operator.SubDagOperator(
        task_id=_LOAD_DATA_TASK_NAME,
        subdag=load_data_dag.create_dag(args, _DAG_ID),
        dag=main_dag)
    train_model_subdag = subdag_operator.SubDagOperator(
        task_id=_TRAIN_MODEL_TASK_NAME,
        subdag=train_model_dag.create_dag(args, _DAG_ID),
        dag=main_dag)

    helpers.chain(load_data_subdag, train_model_subdag)
    return main_dag
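
Note that Airflow's SubDagOperator requires each subdag's dag_id to be '<parent_dag_id>.<task_id>', which is presumably why the task IDs here are kept in sync with dag_utils.get_dag_id(_DAG_NAME, parent_dag_name) in the subdag factories. A minimal sketch of what get_dag_id likely does (hypothetical; the project's real helper may differ):

from typing import Optional


def get_dag_id(dag_name: str, parent_dag_name: Optional[str] = None) -> str:
    """Hypothetical sketch of dag_utils.get_dag_id."""
    if parent_dag_name:
        return f'{parent_dag_name}.{dag_name}'
    return dag_name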
Example #13
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that analyzes data before preprocessing.

  Args:
    args: Arguments to provide to the Operators as defaults.
    output_type: Which set of Variables to load for preprocessing.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    # Load params from Airflow Variables
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
    feature_vars = dag_utils.get_feature_config_val(
        blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)
    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name), None,
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        bucket_name, bucket_path = dag_utils.extract_bucket_parts(
            f'{storage_vars["gcs_temp_path"]}/training')
    else:
        bucket_name, bucket_path = dag_utils.extract_bucket_parts(
            f'{storage_vars["gcs_temp_path"]}/prediction')

    clean_temp_dir_task = gcs_delete_operator.GoogleCloudStorageDeleteOperator(
        task_id=_CLEAN_TEMP_DIR_TASK,
        bucket=bucket_name,
        directory=bucket_path,
        dag=dag)

    user_session_pipeline_task = add_user_session_task(
        dag, _USER_SESSION_TASK_ID, output_type, feature_vars, prediction_vars,
        preprocess_vars, storage_vars, training_vars)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        data_visualization_pipeline_task = add_data_visualization_task(
            dag, _DATA_VISUALISATION_TASK_ID, preprocess_vars, storage_vars)
        generate_categorical_stats_task = add_categorical_stats_task(
            dag, feature_vars, storage_vars)
        generate_numeric_stats_task = add_numeric_stats_task(
            dag, feature_vars, storage_vars)
        helpers.chain(
            clean_temp_dir_task, user_session_pipeline_task,
            data_visualization_pipeline_task,
            [generate_categorical_stats_task, generate_numeric_stats_task])
    else:
        helpers.chain(clean_temp_dir_task, user_session_pipeline_task)
    return dag
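
One detail worth noting: helpers.chain accepts a list element as a fan-out, so data_visualization_pipeline_task above becomes a direct upstream of both stats tasks. In isolation (with hypothetical task names), the fan-out is equivalent to:

helpers.chain(task_a, [task_b, task_c])
# is equivalent to:
task_a.set_downstream(task_b)
task_a.set_downstream(task_c)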