def do_load_to_datastore(**kwargs):
    """Saves the prediction results into Datastore.

    Because there is no way to directly load a CSV to Datastore, we use Apache
    Beam on Dataflow with the template
    gs://dataflow-templates/latest/GCS_Text_to_Datastore.
    https://cloud.google.com/dataflow/docs/templates/provided-templates#gcstexttodatastore
    """
    gcs_prediction_output = 'gs://{}/predictions/output'.format(
        COMPOSER_BUCKET_NAME)
    template = 'gs://dataflow-templates/latest/GCS_Text_to_Datastore'
    df_template_params = {
        'textReadPattern':
            '{}/prediction.results*'.format(gcs_prediction_output),
        'javascriptTextTransformGcsPath':
            'gs://{}/gcs_datastore_transform.js'.format(COMPOSER_BUCKET_NAME),
        'javascriptTextTransformFunctionName':
            'from_prediction_output_to_datastore_object',
        'datastoreWriteProjectId': PROJECT,
        'errorWritePath':
            'gs://{}/errors/serving_load'.format(COMPOSER_BUCKET_NAME)
    }
    dataflow_operator.DataflowTemplateOperator(
        task_id='gcs_predictions_df_transform',
        project_id=PROJECT,
        template=template,
        parameters=df_template_params,
        dag=dag).execute(kwargs)

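# A minimal usage sketch, not part of the original snippet: because
# do_load_to_datastore builds the operator inside a callable and calls
# .execute(kwargs) itself, it would typically be attached to the DAG through a
# PythonOperator. The task id below is an assumption.
load_to_datastore = python_operator.PythonOperator(
    task_id='load_to_datastore',
    python_callable=do_load_to_datastore,
    provide_context=True,
    dag=dag)
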
def _add_mlwp_generate_features_pipeline_task(
        dag: models.DAG,
        output_type: blockbuster_constants.PreprocessingType,
        feature_options: dag_utils.FeatureConfigListMapping,
        storage_vars: dag_utils.AirflowVarsConfig,
) -> dataflow_operator.DataflowTemplateOperator:
    """Adds the Generate Features Task of ML Windowing Pipeline to dag.

    Args:
      dag: The dag that the task needs to be added to.
      output_type: Indicates whether this pipeline is to be used for training
        or prediction.
      feature_options: The parsed config values from airflow feature object
        variable.
      storage_vars: The parsed config values from airflow storage variable.

    Returns:
      The configured Generate Features task that was added to the input dag.
    """
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        output_table = 'training'
        training_mode = 'true'
        output_path = f'{storage_vars["gcs_temp_path"]}/training'
    elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
        output_table = 'prediction'
        training_mode = 'false'
        output_path = f'{storage_vars["gcs_temp_path"]}/prediction'
    template_file_directory = storage_vars['gcs_dataflow_path']
    step_4_output = (f'{storage_vars["bq_working_project"]}:'
                     f'{storage_vars["bq_working_dataset"]}.'
                     f'ga_{output_table}_input')
    # Always add the BB_id as a RECENT feature.
    mod_features = list(feature_options['features'])
    mod_features.append({
        'fact': 'BB_id',
        'type': 'Categorical',
        'accumulators': 'recent'
    })
    feature_options_copy = dict(feature_options)
    feature_options_copy['features'] = mod_features
    return dataflow_operator.DataflowTemplateOperator(
        task_id='mlwp_step4',
        template=f'{template_file_directory}/GenerateFeaturesPipeline',
        parameters={
            **dag_utils.generate_feature_pipeline_parameters(
                feature_options_copy),
            'windowedAvroLocation': f'{output_path}/windowing-output/*.avro',
            'featureDestinationTable': step_4_output,
            'trainMode': training_mode,
            'showEffectiveDateWeekOfYear': 'false',
            'showEffectiveDateMonthOfYear': 'false'
        },
        dag=dag)

def add_data_visualization_task(
        dag: models.DAG, task_id: str,
        preprocess_vars: dag_utils.AirflowVarsConfig,
        storage_vars: dag_utils.AirflowVarsConfig
) -> dataflow_operator.DataflowTemplateOperator:
    """Builds the DataVisualizationPipeline Operator.

    Args:
      dag: The dag that the task needs to be added to.
      task_id: ID for this specific task within the DAG.
      preprocess_vars: The parsed config values from airflow preprocess
        variable.
      storage_vars: The parsed config values from airflow storage variable.

    Returns:
      Operator used to run the Data Visualization Pipeline on Dataflow.
    """
    template_file_directory = storage_vars['gcs_dataflow_path']
    p2_output_dataset = (f'{storage_vars["bq_working_project"]}:'
                         f'{storage_vars["bq_working_dataset"]}')
    proc_st_dt = datetime.datetime.strptime(
        str(preprocess_vars['start_date']), '%Y%m%d')
    proc_ed_dt = datetime.datetime.strptime(
        str(preprocess_vars['end_date']), '%Y%m%d')
    output_path = f'{storage_vars["gcs_temp_path"]}/training'
    lookback_days = int(preprocess_vars['lookback_days'])
    prediction_days = int(preprocess_vars['prediction_days'])
    return dataflow_operator.DataflowTemplateOperator(
        task_id=task_id,
        template=f'{template_file_directory}/DataVisualizationPipeline',
        parameters={
            'snapshotStartDate':
                get_date_str_from_date(
                    proc_st_dt + datetime.timedelta(days=lookback_days),
                    date_format='%d/%m/%Y'),
            'snapshotEndDate':
                get_date_str_from_date(
                    proc_ed_dt - datetime.timedelta(days=prediction_days),
                    date_format='%d/%m/%Y'),
            'inputAvroSessionsLocation':
                f'{output_path}/usersession-output/*.avro',
            'stopOnFirstPositiveLabel':
                str(preprocess_vars['stopOnFirstPositiveLabel']),
            'slideTimeInSeconds':
                str(preprocess_vars['slideTimeInSeconds']),
            'minimumLookaheadTimeInSeconds':
                str(preprocess_vars['minimumLookaheadTimeInSeconds']),
            'maximumLookaheadTimeInSeconds':
                str(preprocess_vars['maximumLookaheadTimeInSeconds']),
            'outputBigQueryUserActivityTable': f'{p2_output_dataset}.instance',
            'outputBigQueryFactsTable': f'{p2_output_dataset}.facts',
        },
        dag=dag)

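# get_date_str_from_date is not defined in this excerpt. A minimal sketch,
# assuming it simply formats a datetime with the supplied format string (the
# default format here is an assumption):
def get_date_str_from_date(date: datetime.datetime,
                           date_format: str = '%Y-%m-%d') -> str:
    """Formats a datetime as a string (hypothetical helper sketch)."""
    return date.strftime(date_format)
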
def _add_mlwp_sliding_window_pipeline_task(
        dag: models.DAG,
        output_type: blockbuster_constants.PreprocessingType,
        prediction_vars: dag_utils.AirflowVarsConfig,
        preprocess_vars: dag_utils.AirflowVarsConfig,
        storage_vars: dag_utils.AirflowVarsConfig,
        training_vars: dag_utils.AirflowVarsConfig
) -> dataflow_operator.DataflowTemplateOperator:
    """Adds the Sliding Window Task of ML Windowing Pipeline to dag.

    Args:
      dag: The dag that the task needs to be added to.
      output_type: Indicates whether this pipeline is to be used for training
        or prediction.
      prediction_vars: The parsed config values from airflow prediction
        variable.
      preprocess_vars: The parsed config values from airflow preprocess
        variable.
      storage_vars: The parsed config values from airflow storage variable.
      training_vars: The parsed config values from airflow training variable.

    Returns:
      The configured Sliding Window task that was added to the input dag.
    """
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        snapshot_start_dt, snapshot_end_dt, output_path = (
            _get_sliding_window_pipeline_params_for_training(
                preprocess_vars, storage_vars, training_vars))
    elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
        snapshot_start_dt, snapshot_end_dt, output_path = (
            _get_sliding_window_pipeline_params_for_prediction(
                prediction_vars, preprocess_vars, storage_vars))
    template_file_directory = storage_vars['gcs_dataflow_path']
    return dataflow_operator.DataflowTemplateOperator(
        task_id='mlwp_step3',
        template=f'{template_file_directory}/SlidingWindowPipeline',
        parameters={
            'snapshotStartDate': snapshot_start_dt,
            'snapshotEndDate': snapshot_end_dt,
            'inputAvroSessionsLocation':
                f'{output_path}/usersession-output/*.avro',
            'stopOnFirstPositiveLabel':
                str(preprocess_vars['stopOnFirstPositiveLabel']),
            'slideTimeInSeconds':
                str(preprocess_vars['slideTimeInSeconds']),
            'minimumLookaheadTimeInSeconds':
                str(preprocess_vars['minimumLookaheadTimeInSeconds']),
            'maximumLookaheadTimeInSeconds':
                str(preprocess_vars['maximumLookaheadTimeInSeconds']),
            'lookbackGapInSeconds':
                str(int(preprocess_vars['lookbackGapInDays']) * 86400),
            'windowTimeInSeconds':
                str(preprocess_vars['windowTimeInSeconds']),
            'outputSlidingWindowAvroPrefix':
                f'{output_path}/windowing-output/',
        },
        dag=dag)

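# _get_sliding_window_pipeline_params_for_training is not shown in this
# excerpt. A hypothetical sketch of its shape, assuming it mirrors the date
# arithmetic used in add_data_visualization_task above and returns the
# training output path; the real helper may differ, and training_vars is
# accepted only for signature parity here.
def _get_sliding_window_pipeline_params_for_training(
        preprocess_vars, storage_vars, training_vars):
    del training_vars  # Unused in this sketch.
    start_dt = datetime.datetime.strptime(
        str(preprocess_vars['start_date']), '%Y%m%d')
    end_dt = datetime.datetime.strptime(
        str(preprocess_vars['end_date']), '%Y%m%d')
    snapshot_start_dt = get_date_str_from_date(
        start_dt + datetime.timedelta(
            days=int(preprocess_vars['lookback_days'])),
        date_format='%d/%m/%Y')
    snapshot_end_dt = get_date_str_from_date(
        end_dt - datetime.timedelta(
            days=int(preprocess_vars['prediction_days'])),
        date_format='%d/%m/%Y')
    output_path = f'{storage_vars["gcs_temp_path"]}/training'
    return snapshot_start_dt, snapshot_end_dt, output_path
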
def add_user_session_task(
        dag: models.DAG, task_id: str,
        output_type: blockbuster_constants.PreprocessingType,
        feature_vars: dag_utils.FeatureConfigListMapping,
        prediction_vars: dag_utils.AirflowVarsConfig,
        preprocess_vars: dag_utils.AirflowVarsConfig,
        storage_vars: dag_utils.AirflowVarsConfig,
        training_vars: dag_utils.AirflowVarsConfig) -> models.BaseOperator:
    """Builds the UserSessionPipeline Operator.

    Args:
      dag: The dag that the task needs to be added to.
      task_id: Id string for this specific task within the DAG.
      output_type: Indicates whether this pipeline is to be used for training
        or prediction.
      feature_vars: The parsed config values from airflow feature object
        variable.
      prediction_vars: The parsed config values from airflow prediction
        variable.
      preprocess_vars: The parsed config values from airflow preprocess
        variable.
      storage_vars: The parsed config values from airflow storage variable.
      training_vars: The parsed config values from airflow training variable.

    Returns:
      Operator to use within a DAG to run the User Session Pipeline on
      Dataflow.
    """
    # Select the output path for the appropriate pipeline type.
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        output_path = f'{storage_vars["gcs_temp_path"]}/training'
    elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
        output_path = f'{storage_vars["gcs_temp_path"]}/prediction'
    template_file_directory = storage_vars['gcs_dataflow_path']
    sql_vars = get_user_session_sql_params(
        output_type,
        feature_vars,
        prediction_vars,
        preprocess_vars,
        storage_vars,
    )
    sql = pipeline_utils.render_sql_from_template('usersession_source',
                                                  **sql_vars)
    return dataflow_operator.DataflowTemplateOperator(
        task_id=task_id,
        template=f'{template_file_directory}/UserSessionPipeline',
        parameters={
            'inputBigQuerySQL': sql,
            'outputSessionsAvroPrefix': f'{output_path}/usersession-output/',
            'predictionFactName': training_vars['predictionFactName'],
            'predictionFactValues': training_vars['predictionFactValues']
        },
        dag=dag)

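# A wiring sketch, assumed rather than taken from the original source: the
# builders above return operators, so a DAG module could chain the User
# Session, Sliding Window and Generate Features steps for training roughly as
# follows. The 'mlwp_step1' task id and the pre-loaded *_vars mappings are
# assumptions.
user_session = add_user_session_task(
    dag, 'mlwp_step1', blockbuster_constants.PreprocessingType.TRAINING,
    feature_vars, prediction_vars, preprocess_vars, storage_vars,
    training_vars)
sliding_window = _add_mlwp_sliding_window_pipeline_task(
    dag, blockbuster_constants.PreprocessingType.TRAINING, prediction_vars,
    preprocess_vars, storage_vars, training_vars)
generate_features = _add_mlwp_generate_features_pipeline_task(
    dag, blockbuster_constants.PreprocessingType.TRAINING, feature_vars,
    storage_vars)
user_session >> sliding_window >> generate_features
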
    username=sftp_username,
    password=sftp_password,
    port=sftp_port)

output_filename = 'output/report-%s-{{ run_id }}.csv' % advertiser_id

process_elements = dataflow_operator.DataflowTemplateOperator(
    task_id='process_elements-%s' % advertiser_id,
    dataflow_default_options={
        'project': gcp_project,
        'zone': gcp_zone,
        'tempLocation': dataflow_staging,
    },
    parameters={
        'inputKeywordsFile': GCS_PATH_FORMAT % (gcs_bucket, REPORT_FILENAME),
        'outputKeywordsFile': GCS_PATH_FORMAT % (gcs_bucket, output_filename),
        'keywordColumnNames': output_file_header,
        'inputCustomDataFile': input_custom_data_file,
        'customDataColumnNames': custom_data_column_names,
        'advertiserId': advertiser_id
    },
    template=dataflow_template,
    gcp_conn_id=sa360_conn_id,
    dag=dag)

download_file.set_downstream(process_elements)

upload_to_sftp = gcs_to_sftp_operator.GCSToSFTPOperator(
    task_id='upload_to_sftp-%s' % advertiser_id,
    gcs_hook=gcs_hook,
    ssh_hook=connection_hook,

        schedule_interval=None,
        default_args=DEFAULT_DAG_ARGS) as dag:

    # Args required for the Dataflow job.
    downloadminutes = python_operator.PythonOperator(
        task_id='downloadminutes',
        python_callable=download_minutes,
        op_args=[GCP_BUCKET, TARGET_EVENT],
        provide_context=True)

    # Use templating to pull the downloaded file path from the XCom pushed by
    # the downloadminutes task.
    job_args = {
        'input': "{{ task_instance.xcom_pull(task_ids='downloadminutes') }}",
        'output': OUTPUT_FILE_PATH
    }

    dataflow_task = dataflow_operator.DataflowTemplateOperator(
        template=DATAFLOW_MINUTES_TEMPLATE,
        task_id="processminutes",
        parameters=job_args)

    # Here we create two conditional tasks, one of which will be executed
    # based on whether the dataflow_task was a success or a failure.
    success_move_task = python_operator.PythonOperator(
        task_id='success-completion',
        python_callable=task_completion,
        op_args=[SUCCESS_TAG],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_SUCCESS)

    failure_move_task = python_operator.PythonOperator(
        task_id='failure-completion',
        python_callable=task_completion,
        op_args=[FAILURE_TAG],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_FAILED)

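    # Wiring sketch (assumed; the snippet above ends before dependencies are
    # set): run the Dataflow job after the download, then exactly one of the
    # completion tasks fires based on its outcome.
    downloadminutes >> dataflow_task >> [success_move_task, failure_move_task]
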
default_dag_args = {
    'start_date': yesterday,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # If a task fails, retry it once after waiting at least 5 minutes.
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'dataflow_default_options': {
        'project': 'my-test-project-218908',
        'zone': 'europe-west1-d',
        'tempLocation': 'gs://staging-bucket-tes/staging'
    }
}

# [START bigquery_extracton_test]
with models.DAG(
        'dataflow_test',
        # Continue to run DAG once per day.
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # [END bigquery_extracton_test]

    first_dataflow = dataflow_operator.DataflowTemplateOperator(
        task_id='dataflow_test',
        template='gs://staging-bucket-tes/templates/PublisherDemo',
        gcp_conn_id='google_cloud_default',
        dag=dag)

    first_dataflow

]

# Process two days of data (on a UTC basis) so it can be presented in Korea
# time.
for i in range(2):
    output_directory = '{}/data/log/rescuetime'.format(datalake_gs)

    # To store the Korea-time (+9:00) RescueTime data on a UTC basis, fetch
    # two days of data in a single call.
    load_rescuetime = bash_operator.BashOperator(
        task_id=('load_rescuetime-%s' % i),
        bash_command=
        'java -jar ${{AIRFLOW_HOME}}/dags/dd-importers-load-rescuetime.jar -user_id={} -api_key={} -input_begin_date={} -input_end_date={} -input_timezone=Asia/Seoul -output_date={} -output_timezone=UTC -output_directory={} -output_filenameprefix={} -shard_size=3'
        .format(user_id, api_key, input_begin_dates[i], input_end_dates[i],
                input_begin_dates[i], output_directory,
                output_filename_prefixes[i]),
        dag=dag)

    create_rescuetime_bd = dataflow_operator.DataflowTemplateOperator(
        task_id=('create_rescuetime_bd-%s' % i),
        template='{}/templates/dd-etls-create-rescuetime'.format(dataflow_gs),
        parameters={
            'runner': 'DataflowRunner',
            'inputFilePattern':
                '{}/data/log/rescuetime/{}Z-*'.format(datalake_gs,
                                                      bd_dates[i]),
            'outputTable':
                '{}:dw_datadriver.rescuetime_tbl_bd_data${}'.format(
                    project_id, bd_dates[i])
        },
        dag=dag,
        gcp_conn_id='gcp-airflow-service-account')

    create_rescuetime_bd.set_upstream(load_rescuetime)

        description='A DAG triggered by an external Cloud Function',
        schedule_interval=None,
        default_args=DEFAULT_DAG_ARGS) as dag:

    # Build arguments for the Dataflow task. The dag_run.conf dict is a way of
    # accessing input variables passed by the Cloud Function that triggers the
    # DAG.
    job_args = {
        'bigtableInstanceId': config['bt_instance'],
        'bigtableTableId': '{{ dag_run.conf["bigtable_id"] }}',
        'inputFile': '{{ dag_run.conf["input_file"] }}',
        'bigtableProjectId': config['gcp_project'],
    }

    # Main Dataflow task that will process and load the input csv file.
    dataflow_task = dataflow_operator.DataflowTemplateOperator(
        task_id='csv_to_bt',
        template=config['dataflow_template_location'],
        parameters=job_args)

    success_task = python_operator.PythonOperator(
        task_id='success-move-to-completion',
        python_callable=update_on_completion,
        op_args=[SUCCESS_TAG, FAILURE_TAG],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_SUCCESS)

    failure_task = python_operator.PythonOperator(
        task_id='failure-move-to-completion',
        python_callable=update_on_completion,
        op_args=[FAILURE_TAG, SUCCESS_TAG],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_FAILED)

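    # Wiring sketch (assumed; the snippet above ends before dependencies are
    # set): exactly one of the completion tasks runs depending on whether the
    # CSV-to-Bigtable load succeeded.
    dataflow_task >> [success_task, failure_task]
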
            filename_prefixes[i]),
        dag=dag)

    create_googlefitness_bd = dataflow_operator.DataflowTemplateOperator(
        task_id=('create_googlefitness_bd-%s' % i),
        template='{}/templates/dd-etls-create-googlefitness'.format(
            dataflow_gs),
        parameters={
            'runner': 'DataflowRunner',
            'beginTime': begin_times[i],
            'endTime': end_times[i],
            'inputAggregatedDatasetsFilePattern':
                '{}/data/log/googlefitness/{}Z-*-aggregated-datasets-*'.format(
                    datalake_gs, bd_dates[i]),
            'inputSessionsFilePattern':
                '{}/data/log/googlefitness/{}Z-*-sessions-*'.format(
                    datalake_gs, bd_dates[i]),
            'outputAggregatedDatasetsTable':
                '{}:dw_datadriver.googlefitness_tbl_bd_aggregated_datasets${}'
                .format(project_id, bd_dates[i]),
            'outputSessionsTable':
                '{}:dw_datadriver.googlefitness_tbl_bd_sessions${}'.format(
                    project_id, bd_dates[i])
        },
        dag=dag,
        gcp_conn_id='gcp-airflow-service-account')

    create_googlefitness_bd.set_upstream(load_googlefitness)