def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that pushes data from Google Cloud Storage to GA.

  Args:
    args: Arguments to provide to the Airflow DAG object as defaults.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
  dag = airflow_utils.initialize_airflow_dag(
      dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  bucket_name, bucket_path = dag_utils.extract_bucket_parts(
      storage_vars['gcs_output_path'])
  _cleanup_storage_task(dag, bucket_name, bucket_path)
  return dag
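# For context: dag_utils.extract_bucket_parts is expected to split a
# 'gs://bucket/some/prefix' URI into its bucket and object-prefix parts so the
# cleanup task can target the right location. A minimal sketch of such a
# helper follows; this is a hypothetical reconstruction for illustration, not
# the actual dag_utils implementation.
from typing import Tuple


def extract_bucket_parts_sketch(gcs_path: str) -> Tuple[str, str]:
  """Splits 'gs://bucket/some/prefix' into ('bucket', 'some/prefix')."""
  # Drop the scheme, then split on the first '/' only.
  bucket_name, _, bucket_path = gcs_path.replace('gs://', '', 1).partition('/')
  return bucket_name, bucket_path


# extract_bucket_parts_sketch('gs://bb-bucket/output/data')
# -> ('bb-bucket', 'output/data')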
def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that trains a new model using the training dataset.

  Args:
    args: Arguments to provide to the Airflow DAG object as defaults.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
  dag = airflow_utils.initialize_airflow_dag(
      dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
  create_model_task = _add_create_model_task(dag, bb_vars, training_vars)
  update_airflow_variable_task = _add_update_airflow_variable_task(dag)
  helpers.chain(create_model_task, update_airflow_variable_task)
  return dag
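# Because parent_dag_name controls the generated dag_id, this factory can be
# embedded in a parent DAG through Airflow's SubDagOperator, which requires
# the child dag_id to follow the '<parent_dag_id>.<task_id>' convention that
# dag_utils.get_dag_id presumably builds. A hedged wiring sketch; the module
# path, task_id, and default_args contents below are assumptions.
import datetime as dt

from airflow import models
from airflow.operators.subdag_operator import SubDagOperator

from blockbuster.dags import training_dag  # hypothetical module path

default_args = {'start_date': dt.datetime(2021, 1, 1)}
parent_dag = models.DAG(
    dag_id='blockbuster_training',
    schedule_interval=None,
    default_args=default_args)

# Assumes _DAG_NAME in the child module matches the task_id below, so the
# child dag_id resolves to 'blockbuster_training.train_model'.
train_model_task = SubDagOperator(
    task_id='train_model',
    subdag=training_dag.create_dag(
        default_args, parent_dag_name='blockbuster_training'),
    dag=parent_dag)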
def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that loads data into an AutoML Dataset.

  Args:
    args: Arguments to provide to the AutoML operators as defaults.
    parent_dag_name: If this value is provided, the newly created dag object
      is made a subdag of the parent dag.

  Returns:
    The DAG object.
  """
  # Load params from Variables.
  bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  # Create dag.
  dag = airflow_utils.initialize_airflow_dag(
      dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      None,  # schedule
      blockbuster_constants.DEFAULT_DAG_RETRY,
      blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      blockbuster_constants.DEFAULT_START_DAYS_AGO,
      local_macros={
          'get_column_spec': _get_column_spec,
          'target': 'predictionLabel',
          'extract_object_id': automl_hook.AutoMLTablesHook.extract_object_id,
      },
      **args)
  dataset_creation_task = _add_dataset_creation_task(dag, bb_vars)
  dataset_id = (
      "{{ task_instance.xcom_pull('create_dataset_task', key='dataset_id') }}"
  )
  import_data_task = _add_import_data_task(dag, dataset_id, bb_vars,
                                           storage_vars)
  list_table_specs_task = _add_list_table_specs_task(dag, dataset_id, bb_vars)
  list_column_specs_task = _add_list_column_specs_task(dag, dataset_id,
                                                       bb_vars)
  update_dataset_task = _add_update_dataset_task(dag, bb_vars)
  update_airflow_variable_task = _add_update_airflow_variable_task(dag)
  helpers.chain(dataset_creation_task, import_data_task, list_table_specs_task,
                list_column_specs_task, update_dataset_task,
                update_airflow_variable_task)
  return dag
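# The local_macros passed above presumably end up as the DAG's
# user_defined_macros, which is what lets templated operator fields call
# get_column_spec / extract_object_id and read the dataset_id XCom at render
# time. A self-contained demonstration of that stock Airflow mechanism
# (dag_id, task_id, and the echoed values are illustrative only):
import datetime as dt

from airflow import models
from airflow.operators.bash_operator import BashOperator

macro_demo_dag = models.DAG(
    dag_id='macro_demo',
    schedule_interval=None,
    start_date=dt.datetime(2021, 1, 1),
    # Names registered here become available inside Jinja templates.
    user_defined_macros={'target': 'predictionLabel'})

# Templated fields render per task run, so this xcom_pull reads whatever
# 'create_dataset_task' pushed earlier in the same DAG run.
echo_dataset = BashOperator(
    task_id='echo_dataset',
    bash_command=(
        'echo {{ target }} '
        "{{ task_instance.xcom_pull('create_dataset_task', "
        "key='dataset_id') }}"),
    dag=macro_demo_dag)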
def create_dag(
    args: Dict[str, Union[Dict[str, Any], dt.datetime]],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that creates the source table from GA tables.

  Args:
    args: Arguments to provide to the operators as defaults.
    output_type: Which set of variables to load for preprocessing and
      prediction.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
  # Load params from Airflow Variables.
  bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
  prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  dag_id = dag_utils.get_dag_id(DAG_NAME, parent_dag_name)
  dag = airflow_utils.initialize_airflow_dag(
      dag_id,
      None,  # schedule
      blockbuster_constants.DEFAULT_DAG_RETRY,
      blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  if output_type == blockbuster_constants.PreprocessingType.TRAINING:
    table_suffix = 'training'
    sql_vars = get_sql_params_for_training_stage(preprocess_vars)
  elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
    table_suffix = 'prediction'
    sql_vars = get_sql_params_for_prediction_stage(preprocess_vars,
                                                   prediction_vars)
  sql = pipeline_utils.render_sql_from_template('source_leads', **sql_vars)
  bq_working_project = storage_vars['bq_working_project']
  bq_working_dataset = storage_vars['bq_working_dataset']
  leads_table = get_leads_table(bq_working_project, bq_working_dataset,
                                table_suffix)
  gcp_region = bb_vars['gcp_region']
  add_prepare_source_data_task_to_dag(dag, sql, leads_table, gcp_region)
  return dag
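# pipeline_utils.render_sql_from_template suggests Jinja-templated .sql files
# keyed by name. A plausible sketch under that assumption; the template
# directory layout and function body are guesses, not the real pipeline_utils
# code.
import os

import jinja2


def render_sql_from_template_sketch(template_name: str, **sql_vars) -> str:
  """Renders 'templates/<template_name>.sql' with the given params."""
  env = jinja2.Environment(
      loader=jinja2.FileSystemLoader(
          os.path.join(os.path.dirname(__file__), 'templates')),
      # Fail fast if the template references a missing parameter.
      undefined=jinja2.StrictUndefined)
  return env.get_template(f'{template_name}.sql').render(**sql_vars)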
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that preprocesses data.

  Args:
    args: Arguments to provide to the Operators as defaults.
    output_type: Which set of Variables to load for preprocessing.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
  # Load params from Airflow Variables.
  preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
  prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
  feature_options = dag_utils.get_feature_config_val(
      blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)
  dag = airflow_utils.initialize_airflow_dag(
      dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      None,  # schedule
      blockbuster_constants.DEFAULT_DAG_RETRY,
      blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  mlwp_sliding_window_pipeline_task = _add_mlwp_sliding_window_pipeline_task(
      dag, output_type, prediction_vars, preprocess_vars, storage_vars,
      training_vars)
  mlwp_generate_features_pipeline_task = (
      _add_mlwp_generate_features_pipeline_task(dag, output_type,
                                                feature_options, storage_vars))
  prepare_automl_data_in_bq_task = _add_prepare_automl_data_in_bq_task(
      dag, output_type, prediction_vars, storage_vars)
  helpers.chain(mlwp_sliding_window_pipeline_task,
                mlwp_generate_features_pipeline_task,
                prepare_automl_data_in_bq_task)
  return dag
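# Each config blob above is read through
# airflow_utils.retrieve_airflow_variable_as_dict. Airflow can deserialize a
# JSON-valued Variable natively, so a minimal reconstruction likely wraps
# Variable.get; the validation below is an assumption about the helper's
# behavior, not its actual source.
from typing import Any, Dict

from airflow.models import Variable


def retrieve_airflow_variable_as_dict_sketch(key: str) -> Dict[str, Any]:
  """Reads an Airflow Variable expected to hold a JSON object."""
  value = Variable.get(key, deserialize_json=True)
  if not isinstance(value, dict):
    raise ValueError(f'Airflow Variable {key} must contain a JSON object.')
  return value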
def create_batch_predict_dag(
    args: Dict[str, Union[Dict[str, Any], dt.datetime]],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG/SubDAG that predicts using an existing model.

  Args:
    args: Arguments to provide to the operators as defaults.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.

  Raises:
    KeyError: If the recent_model_id Airflow Variable hasn't been set.
  """
  dag_id = dag_utils.get_dag_id(DAG_NAME, parent_dag_name)
  dag = airflow_utils.initialize_airflow_dag(
      dag_id=dag_id,
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      local_macros={'extract_dataset_id': _extract_dataset_id},
      **args)
  batch_predict_task = _get_batch_predictions(dag, 'batch_predict_task')
  automl_bq_location = (
      '{{ extract_dataset_id('  # macro to be expanded in task Jinja template
      'task_instance.xcom_pull('
      '"batch_predict_task", key="bq_output_dataset")) }}')
  batch_predict_sql = _generate_batch_prediction_sql_template(
      'batch_predict', automl_bq_location)
  get_output_data_task = _store_final_results_to_bq(dag, 'get_output_data',
                                                    batch_predict_sql)
  bq_to_gcs_task = _transfer_bigquery_to_gcs(dag, 'bq_to_gcs')
  # 'dag >> task' assigns the DAG to the first task (via BaseOperator's
  # reflected __rrshift__) before chaining the tasks left to right.
  dag >> batch_predict_task >> get_output_data_task >> bq_to_gcs_task
  return dag
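# Airflow only registers DAG objects that are reachable at module scope in a
# file under the dags folder, so a thin wrapper typically binds the factory
# result to a global name. A sketch of such a wrapper; the import path and
# args contents are assumptions.
import datetime as dt

from blockbuster.dags import batch_predict_dag  # hypothetical module path

args = {'start_date': dt.datetime(2021, 1, 1)}

# The scheduler picks this up because 'dag' is a module-level DAG object.
dag = batch_predict_dag.create_batch_predict_dag(args)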
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that analyzes data before preprocessing.

  Args:
    args: Arguments to provide to the Operators as defaults.
    output_type: Which set of Variables to load for preprocessing.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
  # Load params from Airflow Variables.
  preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
  prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
  feature_vars = dag_utils.get_feature_config_val(
      blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)
  dag = airflow_utils.initialize_airflow_dag(
      dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      None,  # schedule
      blockbuster_constants.DEFAULT_DAG_RETRY,
      blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  if output_type == blockbuster_constants.PreprocessingType.TRAINING:
    bucket_name, bucket_path = dag_utils.extract_bucket_parts(
        f'{storage_vars["gcs_temp_path"]}/training')
  else:
    bucket_name, bucket_path = dag_utils.extract_bucket_parts(
        f'{storage_vars["gcs_temp_path"]}/prediction')
  clean_temp_dir_task = gcs_delete_operator.GoogleCloudStorageDeleteOperator(
      task_id=_CLEAN_TEMP_DIR_TASK,
      bucket=bucket_name,
      directory=bucket_path,
      dag=dag)
  user_session_pipeline_task = add_user_session_task(
      dag, _USER_SESSION_TASK_ID, output_type, feature_vars, prediction_vars,
      preprocess_vars, storage_vars, training_vars)
  if output_type == blockbuster_constants.PreprocessingType.TRAINING:
    data_visualization_pipeline_task = add_data_visualization_task(
        dag, _DATA_VISUALISATION_TASK_ID, preprocess_vars, storage_vars)
    generate_categorical_stats_task = add_categorical_stats_task(
        dag, feature_vars, storage_vars)
    generate_numeric_stats_task = add_numeric_stats_task(
        dag, feature_vars, storage_vars)
    helpers.chain(
        clean_temp_dir_task, user_session_pipeline_task,
        data_visualization_pipeline_task,
        [generate_categorical_stats_task, generate_numeric_stats_task])
  else:
    helpers.chain(clean_temp_dir_task, user_session_pipeline_task)
  return dag
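# helpers.chain accepts lists to express fan-out: in the TRAINING branch
# above, both stats tasks become downstream of the visualization task and run
# in parallel. A minimal stand-alone demonstration with DummyOperators
# (dag_id and task_ids are illustrative):
import datetime as dt

from airflow import models
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils import helpers

chain_demo_dag = models.DAG(
    dag_id='chain_demo',
    schedule_interval=None,
    start_date=dt.datetime(2021, 1, 1))

clean = DummyOperator(task_id='clean', dag=chain_demo_dag)
sessions = DummyOperator(task_id='sessions', dag=chain_demo_dag)
cat_stats = DummyOperator(task_id='cat_stats', dag=chain_demo_dag)
num_stats = DummyOperator(task_id='num_stats', dag=chain_demo_dag)

# Equivalent to clean >> sessions >> [cat_stats, num_stats]: both stats
# tasks run after 'sessions' completes, independently of each other.
helpers.chain(clean, sessions, [cat_stats, num_stats])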