def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that cleans up the data in the GCS output path.

  Args:
    args: Arguments to provide to the Airflow DAG object as defaults.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
  dag = airflow_utils.initialize_airflow_dag(
      dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  bucket_name, bucket_path = dag_utils.extract_bucket_parts(
      storage_vars['gcs_output_path'])
  _cleanup_storage_task(dag, bucket_name, bucket_path)
  return dag
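
# A hypothetical usage sketch (not part of the original module): Airflow
# discovers DAGs instantiated at module level, so a root DAG file could build
# this cleanup DAG directly. 'root_dag_args' is an assumed shared default-args
# mapping, and the SubDAG parent name is likewise illustrative.
#
#   cleanup_dag = create_dag(args=root_dag_args,
#                            parent_dag_name='blockbuster_main')
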
def _add_storage_to_ga_task(dag, bucket_uri, ga_tracking_id, bq_dataset,
                            bq_table):
  """Adds a Google Cloud Storage (GCS) to Google Analytics data transfer task.

  Args:
    dag: The DAG object which will include this task.
    bucket_uri: The URI of the GCS path containing the data.
    ga_tracking_id: The Google Analytics tracking id.
    bq_dataset: The BigQuery dataset used for monitoring.
    bq_table: The BigQuery table used for monitoring.

  Returns:
    The task to move data from GCS to GA.
  """
  bucket_name, bucket_prefix = dag_utils.extract_bucket_parts(bucket_uri)
  return data_connector_operator.DataConnectorOperator(
      dag_name=_DAG_NAME,
      task_id='storage_to_ga',
      input_hook=hook_factory.InputHookType.GOOGLE_CLOUD_STORAGE,
      output_hook=hook_factory.OutputHookType.GOOGLE_ANALYTICS,
      enable_monitoring=False,
      monitoring_dataset=bq_dataset,
      monitoring_table=bq_table,
      monitoring_bq_conn_id='bigquery_default',
      gcs_bucket=bucket_name,
      gcs_prefix=bucket_prefix,
      gcs_content_type=_GCS_CONTENT_TYPE,
      ga_base_params=_GA_BASE_PARAMS,
      ga_tracking_id=ga_tracking_id,
      dag=dag)
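
# A hypothetical wiring sketch: how an activation-flow create_dag might call
# the helper above. 'activation_vars' and 'bq_vars' are assumed
# Airflow-variable dicts analogous to 'storage_vars'; the exact keys are
# illustrative, not confirmed by this excerpt.
#
#   storage_to_ga_task = _add_storage_to_ga_task(
#       dag,
#       f'{storage_vars["gcs_output_path"]}/activation',
#       activation_vars['ga_tracking_id'],
#       bq_vars['dataset'],
#       bq_vars['table'])
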
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that analyzes data before preprocessing.

  Args:
    args: Arguments to provide to the Operators as defaults.
    output_type: Which set of Variables to load for preprocessing.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
  # Load params from Airflow Variables.
  preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
  prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
  feature_vars = dag_utils.get_feature_config_val(
      blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)
  dag = airflow_utils.initialize_airflow_dag(
      dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      None,  # schedule
      blockbuster_constants.DEFAULT_DAG_RETRY,
      blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  if output_type == blockbuster_constants.PreprocessingType.TRAINING:
    bucket_name, bucket_path = dag_utils.extract_bucket_parts(
        f'{storage_vars["gcs_temp_path"]}/training')
  else:
    bucket_name, bucket_path = dag_utils.extract_bucket_parts(
        f'{storage_vars["gcs_temp_path"]}/prediction')
  clean_temp_dir_task = gcs_delete_operator.GoogleCloudStorageDeleteOperator(
      task_id=_CLEAN_TEMP_DIR_TASK,
      bucket=bucket_name,
      directory=bucket_path,
      dag=dag)
  user_session_pipeline_task = add_user_session_task(
      dag, _USER_SESSION_TASK_ID, output_type, feature_vars, prediction_vars,
      preprocess_vars, storage_vars, training_vars)
  if output_type == blockbuster_constants.PreprocessingType.TRAINING:
    data_visualization_pipeline_task = add_data_visualization_task(
        dag, _DATA_VISUALISATION_TASK_ID, preprocess_vars, storage_vars)
    generate_categorical_stats_task = add_categorical_stats_task(
        dag, feature_vars, storage_vars)
    generate_numeric_stats_task = add_numeric_stats_task(
        dag, feature_vars, storage_vars)
    helpers.chain(
        clean_temp_dir_task,
        user_session_pipeline_task,
        data_visualization_pipeline_task,
        [generate_categorical_stats_task, generate_numeric_stats_task])
  else:
    helpers.chain(clean_temp_dir_task, user_session_pipeline_task)
  return dag
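
# A hypothetical usage sketch: building both preprocessing variants from a
# root DAG definition file. 'root_dag_args' is an assumed default-args
# mapping, and PreprocessingType.PREDICTION is inferred from the else branch
# above rather than confirmed by this excerpt.
#
#   training_dag = create_dag(
#       root_dag_args, blockbuster_constants.PreprocessingType.TRAINING)
#   prediction_dag = create_dag(
#       root_dag_args, blockbuster_constants.PreprocessingType.PREDICTION)
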