def create_subdag_operators(parent_dag, symbol_list):
    """
    :meta private:
    Private function to create subdag operators based on the given parent DAG
    and list of symbols.
    """
    granularity = "1m"
    bucket = "data.binance.vision"
    prefix = "data/spot/{frequency}/klines/{symbol}/{granularity}/"
    schedule = "@once"
    subdags = [
        create_subdag_operator(
            parent_dag,
            schedule,
            row.loc["symbol_name"],
            granularity,
            bucket,
            prefix,
            row.loc["symbol_id"],
            default_args,
        )
        for index, row in symbol_list.iterrows()
    ]
    # chain subdag-operators together
    chain(*subdags)
    return subdags
def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that trains a new model using the training dataset.

    Args:
      args: Arguments to provide to the Airflow DAG object as defaults.
      parent_dag_name: If this is provided, this is a SubDAG.

    Returns:
      The DAG object.
    """
    dag = airflow_utils.initialize_airflow_dag(
        dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
        schedule=None,
        retries=blockbuster_constants.DEFAULT_DAG_RETRY,
        retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)
    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
    create_model_task = _add_create_model_task(dag, bb_vars, training_vars)
    update_airflow_variable_task = _add_update_airflow_variable_task(dag)
    helpers.chain(create_model_task, update_airflow_variable_task)
    return dag
def delete_s3_key_files_subdag(parent_dag_name, child_dag_name, start_date,
                               s3_bucket, s3_key, aws_credentials):
    dag = DAG(
        f'{parent_dag_name}.{child_dag_name}',
        description='Delete all S3 files in the provided key.',
        start_date=start_date,
        schedule_interval=None,
        catchup=False,
    )

    list_s3_processed_s3_files = S3ListOperator(
        task_id='list_s3_processed_s3_files',
        dag=dag,
        bucket=s3_bucket,
        prefix=s3_key,
        aws_conn_id=aws_credentials,
    )

    delete_processed_s3_files = S3DeleteFromContextOperator(
        task_id='delete_processed_s3_files',
        dag=dag,
        bucket=s3_bucket,
        context_task_id='list_s3_processed_s3_files',
        aws_conn_id=aws_credentials,
    )

    chain(list_s3_processed_s3_files, delete_processed_s3_files)

    return dag
def test_chain_different_length_iterable(self):
    dag = DAG(dag_id='test_chain', start_date=datetime.now())
    [t1, t2, t3, t4, t5] = [
        DummyOperator(task_id='t{i}'.format(i=i), dag=dag) for i in range(1, 6)
    ]
    with self.assertRaises(AirflowException):
        helpers.chain([t1, t2], [t3, t4, t5])
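# Note on the test above: chain() pairs adjacent lists element-wise, so two
# adjacent lists of different lengths cannot be wired unambiguously and
# chain() raises AirflowException for them.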
def test_chain_not_support_type(self):
    dag = DAG(dag_id='test_chain', start_date=datetime.now())
    [t1, t2] = [
        DummyOperator(task_id='t{i}'.format(i=i), dag=dag) for i in range(1, 3)
    ]
    with self.assertRaises(TypeError):
        helpers.chain([t1, t2], 1)
def test_chain(self):
    dag = DAG(dag_id='test_chain', start_date=datetime.now())
    [t1, t2, t3, t4, t5, t6] = [
        DummyOperator(task_id='t{i}'.format(i=i), dag=dag) for i in range(1, 7)
    ]
    helpers.chain(t1, [t2, t3], [t4, t5], t6)
    self.assertCountEqual([t2, t3], t1.get_direct_relatives(upstream=False))
    self.assertEqual([t4], t2.get_direct_relatives(upstream=False))
    self.assertEqual([t5], t3.get_direct_relatives(upstream=False))
    self.assertCountEqual([t4, t5], t6.get_direct_relatives(upstream=True))
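# For reference, the dependency structure asserted in the test above can be
# written with explicit ">>" operators; this is a minimal equivalent sketch
# using the same task names:
#
#     t1 >> [t2, t3]      # t1 fans out to both t2 and t3
#     t2 >> t4            # adjacent equal-length lists are paired element-wise
#     t3 >> t5
#     [t4, t5] >> t6      # both branches converge on t6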
def build(self):
    installer = self._get_openshift_installer()
    install_cluster = installer.get_install_task()
    cleanup_cluster = installer.get_cleanup_task()
    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks:
        benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
        chain(*benchmark_tasks)
        benchmark_tasks[-1] >> cleanup_cluster
    install_cluster >> benchmarks
def data_quality_check_subdag(parent_dag_name, child_dag_name, start_date,
                              redshift_conn, staging_table):
    dag = DAG(
        f'{parent_dag_name}.{child_dag_name}',
        description='Check whether dimension tables meet data quality principles.',
        start_date=start_date,
        schedule_interval=None,
        catchup=False,
    )

    dimensions_data_quality_check = RedshiftDataQualityOperator(
        task_id='dimensions_data_quality_check',
        dag=dag,
        redshift_conn_id=redshift_conn,
        rules=[
            {
                'query': generic_queries['table_size'].format('dm_company'),
                'op': lambda x: x > 0
            },
            {
                'query': generic_queries['table_size'].format('dm_region'),
                'op': lambda x: x > 0
            },
            {
                'query': generic_queries['table_size'].format('dm_consumer_profile'),
                'op': lambda x: x > 0
            },
            {
                'query': generic_queries['table_size'].format('dm_date'),
                'op': lambda x: x > 0
            },
        ],
    )

    fact_data_quality_check = RedshiftCompareResultsOperator(
        task_id='fact_data_quality_check',
        dag=dag,
        redshift_conn_id=redshift_conn,
        query=generic_queries['table_size'].format('ft_complaints'),
        comparison_query=generic_queries['table_size'].format(
            f'staging.{staging_table}'),
        operator=lambda x, y: x >= y,
    )

    chain(
        dimensions_data_quality_check,
        fact_data_quality_check,
    )

    return dag
def test_chain(self):
    dag = DAG(dag_id='test_chain', start_date=datetime.now())
    [op1, op2, op3, op4, op5, op6] = [
        DummyOperator(task_id='t{i}'.format(i=i), dag=dag) for i in range(1, 7)
    ]
    helpers.chain(op1, [op2, op3], [op4, op5], op6)
    self.assertCountEqual([op2, op3], op1.get_direct_relatives(upstream=False))
    self.assertEqual([op4], op2.get_direct_relatives(upstream=False))
    self.assertEqual([op5], op3.get_direct_relatives(upstream=False))
    self.assertCountEqual([op4, op5], op6.get_direct_relatives(upstream=True))
def build(self):
    installer = self._get_openshift_installer()
    install_cluster = installer.get_install_task()
    cleanup_cluster = installer.get_cleanup_task()
    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks:
        benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
        chain(*benchmark_tasks)
    with TaskGroup("Index Results", prefix_group_id=False, dag=self.dag) as post_steps:
        index_status_task = self._get_status_indexer().get_index_task()
    install_cluster >> benchmarks >> [post_steps, cleanup_cluster]
def create_dag(
    args: Mapping[Text, Any],
    parent_dag_name: Optional[Text] = None,
) -> models.DAG:
    """Generates a DAG that loads data into an AutoML Dataset.

    Args:
      args: Arguments to provide to the AutoML operators as defaults.
      parent_dag_name: If this value is provided, the newly created dag object
        is made a subdag of the parent dag.

    Returns:
      The DAG object.
    """
    # Load params from Variables.
    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)

    # Create dag.
    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
        None,  # schedule
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO,
        local_macros={
            'get_column_spec': _get_column_spec,
            'target': 'predictionLabel',
            'extract_object_id': automl_hook.AutoMLTablesHook.extract_object_id,
        },
        **args)

    dataset_creation_task = _add_dataset_creation_task(dag, bb_vars)
    dataset_id = (
        "{{ task_instance.xcom_pull('create_dataset_task', key='dataset_id') }}"
    )
    import_data_task = _add_import_data_task(dag, dataset_id, bb_vars,
                                              storage_vars)
    list_table_specs_task = _add_list_table_specs_task(dag, dataset_id, bb_vars)
    list_column_specs_task = _add_list_column_specs_task(dag, dataset_id, bb_vars)
    update_dataset_task = _add_update_dataset_task(dag, bb_vars)
    update_airflow_variable_task = _add_update_airflow_variable_task(dag)
    helpers.chain(dataset_creation_task, import_data_task, list_table_specs_task,
                  list_column_specs_task, update_dataset_task,
                  update_airflow_variable_task)
    return dag
def build(self):
    installer = self._get_openshift_installer()
    install_cluster = installer.get_install_task()
    cleanup_cluster = installer.get_cleanup_task()
    with TaskGroup("utils", prefix_group_id=False, dag=self.dag) as utils:
        utils_tasks = self._get_scale_ci_diagnosis().get_utils()
        chain(*utils_tasks)
        utils_tasks[-1] >> cleanup_cluster
    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks:
        benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
        chain(*benchmark_tasks)
        benchmark_tasks[-1] >> utils
    install_cluster >> benchmarks
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that preprocesses data.

    Args:
      args: Arguments to provide to the Operators as defaults.
      output_type: Which set of Variables to load for preprocessing.
      parent_dag_name: If this is provided, this is a SubDAG.

    Returns:
      The DAG object.
    """
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
    feature_options = dag_utils.get_feature_config_val(
        blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)

    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
        None,
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)

    mlwp_sliding_window_pipeline_task = _add_mlwp_sliding_window_pipeline_task(
        dag, output_type, prediction_vars, preprocess_vars, storage_vars,
        training_vars)
    mlwp_generate_features_pipeline_task = _add_mlwp_generate_features_pipeline_task(
        dag, output_type, feature_options, storage_vars)
    prepare_automl_data_in_bq_task = _add_prepare_automl_data_in_bq_task(
        dag, output_type, prediction_vars, storage_vars)
    helpers.chain(mlwp_sliding_window_pipeline_task,
                  mlwp_generate_features_pipeline_task,
                  prepare_automl_data_in_bq_task)
    return dag
def create_ETLDag(etl_name, cron_time, etl_task_list):
    dag_id = '{action_type}_{etl_name}'.format(action_type="ETL", etl_name=etl_name)

    dag = DAG(dag_id=dag_id,
              default_args=default_args,
              schedule_interval=cron_time,
              max_active_runs=1,
              concurrency=8,
              catchup=AIRFLOW_BACKFILL)

    deploy_task_list = [create_python_task(task, dag) for task in etl_task_list]
    chain(*deploy_task_list)

    return (dag_id, dag)
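# A typical way to use a factory like this (a hedged sketch: the ETL
# definitions below are hypothetical; only the globals()[dag_id] = dag
# registration is the standard Airflow idiom for dynamically generated DAGs):
ETL_DEFINITIONS = [
    ("sales", "0 2 * * *", ["extract_sales", "transform_sales", "load_sales"]),
]

for etl_name, cron_time, etl_task_list in ETL_DEFINITIONS:
    dag_id, dag = create_ETLDag(etl_name, cron_time, etl_task_list)
    # Dynamically generated DAGs must be module-level globals so the
    # scheduler can discover them.
    globals()[dag_id] = dag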
def create_subdag_operators(parent_dag, symbol_df, **kwargs):
    """
    Breaks down the intervals for which data is missing into smaller time
    periods, which makes the data easier to fetch.

    Args:
        parent_dag (DAG): Parent DAG to which the subdags will be linked.
        symbol_df (DataFrame): Dataframe containing symbols and the time since
            their last refresh.

    Returns:
        list: The subdag operators that were created.
    """
    subdags = []
    if not symbol_df.empty:
        # for each symbol
        for index, row in symbol_df.iterrows():
            schedule = "@once"
            symbol_id = row["symbol_id"]
            symbol_name = row["symbol_name"]
            last_known_ts = row["last_close_time"]
            current_ts = datetime.utcnow()

            # make list of subdag operators
            subdags.append(
                create_subdag_operator(
                    parent_dag=parent_dag,
                    schedule=schedule,
                    symbol=symbol_name,
                    granularity=kwargs.get("granularity"),
                    symbol_id=symbol_id,
                    start_ts=last_known_ts,
                    end_ts=current_ts,
                    default_args=default_args,
                )
            )

        # chain subdag-operators together
        chain(*subdags)

    return subdags
def create_train_model_dag() -> models.DAG:
    """Creates the main DAG for model training.

    Returns:
      Parent training DAG.
    """
    bb_storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    bb_project_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    args = {
        'start_date': airflow.utils.dates.days_ago(1),
        'dataflow_default_options': {
            'project': bb_project_vars['gcp_project_id'],
            'region': bb_project_vars['gcp_region'],
            'zone': bb_project_vars['gcp_zone'],
            'tempLocation': bb_storage_vars['gcs_temp_path']
        },
    }
    main_dag = airflow_utils.initialize_airflow_dag(
        dag_id=_DAG_ID,
        schedule=None,
        retries=blockbuster_constants.DEFAULT_DAG_RETRY,
        retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)
    load_data_subdag = subdag_operator.SubDagOperator(
        task_id=_LOAD_DATA_TASK_NAME,
        subdag=load_data_dag.create_dag(args, _DAG_ID),
        dag=main_dag)
    train_model_subdag = subdag_operator.SubDagOperator(
        task_id=_TRAIN_MODEL_TASK_NAME,
        subdag=train_model_dag.create_dag(args, _DAG_ID),
        dag=main_dag)
    helpers.chain(load_data_subdag, train_model_subdag)
    return main_dag
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=[
        fact_queries['create_ft_complaints'],
        procon_queries['insert_ft_complaints']
    ])

subdag_task_id = 'data_quality_check'
data_quality_check = SubDagOperator(
    task_id=subdag_task_id,
    dag=dag,
    subdag=data_quality_check_subdag(DAG_NAME, subdag_task_id, start_date,
                                     REDSHIFT_CONN, STAGING_TABLE))

drop_procon_stage_table = PostgresOperator(
    task_id='drop_procon_stage_table',
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=procon_queries['drop_stage_table'])

end_operator = DummyOperator(task_id='finish_execution', dag=dag)

chain(start_operator, has_file_to_process, create_procon_stage_table,
      load_procon_stage_data, [
          load_dm_date_data, load_dm_region_data, load_dm_consumer_data,
          load_dm_company_data
      ], load_ft_complaints_data, data_quality_check, drop_procon_stage_table,
      end_operator)

delete_s3_key_files.set_upstream(load_procon_stage_data)
    dag=dag)

truncate_weekly_staging_tables = PostgresOperator(
    task_id='truncate_weekly_staging_tables',
    sql='trnctTbls_wkly.sql',
    dag=dag)

load_mo_unemployment_claims_staging = PythonOperator(
    task_id='load_mo_unemployment_claims_staging',
    python_callable=load_file,
    op_kwargs={
        'filename': 'mo_unemployment_claims.csv',
        'table_name': 'stg_mo_unemployment_clms',
        'sep': ',',
        'nullstr': ''
    },
    dag=dag)

wkly_unemployment_claims_staging_to_core = PostgresOperator(
    task_id='wkly_unemployment_claims_staging_to_core',
    sql='dtaMgrtn_unemplClms_wkly.sql',
    dag=dag)

update_weekly_timestamp = PostgresOperator(
    task_id='update_weekly_timestamp',
    sql='setLstSccssflRnDt_wklyAll.sql',
    dag=dag)

chain(scrape_mo_unemployment_claims, truncate_weekly_staging_tables,
      load_mo_unemployment_claims_staging,
      wkly_unemployment_claims_staging_to_core, update_weekly_timestamp)
    dag=dag)

hive = SSHExecuteOperator(
    task_id="comment_import",
    bash_command=(
        '(bash {path}/xianyu_itemcomment_import.sh {lastday} {last_update_date})'
        .format(path=path,
                lastday=get_lastday(),
                last_update_date=get_last_update_date())),
    ssh_hook=sshHook,
    dag=dag)

email_update = EmailOperator(
    task_id='xianyu_itemcomment_update_email',
    to=['*****@*****.**'],
    subject='xianyu itemcomment workflow',
    html_content='[ xianyu data updated!!! ]',
    dag=dag)

email_update_not = EmailOperator(
    task_id='xianyu_itemcomment_update_not_email',
    to=['*****@*****.**'],
    subject='xianyu itemcomment workflow',
    html_content='[ xianyu data updating!!! ]',
    dag=dag)

branching = BranchPythonOperator(
    task_id='check_attach',
    python_callable=lambda: check_attach(),
    dag=dag)

passover = DummyOperator(task_id='pass', dag=dag)
update = DummyOperator(task_id='update', dag=dag)

chain(branching, passover, email_update_not)
chain(branching, update, spark, hive, email_update)
def test_complex_dag(snapshot):
    dag = DAG(dag_id="complex_dag", default_args=default_args, schedule_interval=None)

    # Create
    create_entry_group = DummyOperator(task_id="create_entry_group", dag=dag)
    create_entry_group_result = DummyOperator(task_id="create_entry_group_result", dag=dag)
    create_entry_group_result2 = DummyOperator(task_id="create_entry_group_result2", dag=dag)
    create_entry_gcs = DummyOperator(task_id="create_entry_gcs", dag=dag)
    create_entry_gcs_result = DummyOperator(task_id="create_entry_gcs_result", dag=dag)
    create_entry_gcs_result2 = DummyOperator(task_id="create_entry_gcs_result2", dag=dag)
    create_tag = DummyOperator(task_id="create_tag", dag=dag)
    create_tag_result = DummyOperator(task_id="create_tag_result", dag=dag)
    create_tag_result2 = DummyOperator(task_id="create_tag_result2", dag=dag)
    create_tag_template = DummyOperator(task_id="create_tag_template", dag=dag)
    create_tag_template_result = DummyOperator(task_id="create_tag_template_result", dag=dag)
    create_tag_template_result2 = DummyOperator(task_id="create_tag_template_result2", dag=dag)
    create_tag_template_field = DummyOperator(task_id="create_tag_template_field", dag=dag)
    create_tag_template_field_result = DummyOperator(
        task_id="create_tag_template_field_result", dag=dag)
    create_tag_template_field_result2 = DummyOperator(
        task_id="create_tag_template_field_result2", dag=dag)

    # Delete
    delete_entry = DummyOperator(task_id="delete_entry", dag=dag)
    create_entry_gcs >> delete_entry
    delete_entry_group = DummyOperator(task_id="delete_entry_group", dag=dag)
    create_entry_group >> delete_entry_group
    delete_tag = DummyOperator(task_id="delete_tag", dag=dag)
    create_tag >> delete_tag
    delete_tag_template_field = DummyOperator(task_id="delete_tag_template_field", dag=dag)
    delete_tag_template = DummyOperator(task_id="delete_tag_template", dag=dag)

    # Get
    get_entry_group = DummyOperator(task_id="get_entry_group", dag=dag)
    get_entry_group_result = DummyOperator(task_id="get_entry_group_result", dag=dag)
    get_entry = DummyOperator(task_id="get_entry", dag=dag)
    get_entry_result = DummyOperator(task_id="get_entry_result", dag=dag)
    get_tag_template = DummyOperator(task_id="get_tag_template", dag=dag)
    get_tag_template_result = DummyOperator(task_id="get_tag_template_result", dag=dag)

    # List
    list_tags = DummyOperator(task_id="list_tags", dag=dag)
    list_tags_result = DummyOperator(task_id="list_tags_result", dag=dag)

    # Lookup
    lookup_entry = DummyOperator(task_id="lookup_entry", dag=dag)
    lookup_entry_result = DummyOperator(task_id="lookup_entry_result", dag=dag)

    # Rename
    rename_tag_template_field = DummyOperator(task_id="rename_tag_template_field", dag=dag)

    # Search
    search_catalog = DummyOperator(task_id="search_catalog", dag=dag)
    search_catalog_result = DummyOperator(task_id="search_catalog_result", dag=dag)

    # Update
    update_entry = DummyOperator(task_id="update_entry", dag=dag)
    update_tag = DummyOperator(task_id="update_tag", dag=dag)
    update_tag_template = DummyOperator(task_id="update_tag_template", dag=dag)
    update_tag_template_field = DummyOperator(task_id="update_tag_template_field", dag=dag)

    # Create
    create_tasks = [
        create_entry_group,
        create_entry_gcs,
        create_tag_template,
        create_tag_template_field,
        create_tag,
    ]
    chain(*create_tasks)

    create_entry_group >> delete_entry_group
    create_entry_group >> create_entry_group_result
    create_entry_group >> create_entry_group_result2

    create_entry_gcs >> delete_entry
    create_entry_gcs >> create_entry_gcs_result
    create_entry_gcs >> create_entry_gcs_result2

    create_tag_template >> delete_tag_template_field
    create_tag_template >> create_tag_template_result
    create_tag_template >> create_tag_template_result2

    create_tag_template_field >> delete_tag_template_field
    create_tag_template_field >> create_tag_template_field_result
    create_tag_template_field >> create_tag_template_field_result2

    create_tag >> delete_tag
    create_tag >> create_tag_result
    create_tag >> create_tag_result2

    # Delete
    delete_tasks = [
        delete_tag,
        delete_tag_template_field,
        delete_tag_template,
        delete_entry_group,
        delete_entry,
    ]
    chain(*delete_tasks)

    # Get
    create_tag_template >> get_tag_template >> delete_tag_template
    get_tag_template >> get_tag_template_result
    create_entry_gcs >> get_entry >> delete_entry
    get_entry >> get_entry_result
    create_entry_group >> get_entry_group >> delete_entry_group
    get_entry_group >> get_entry_group_result

    # List
    create_tag >> list_tags >> delete_tag
    list_tags >> list_tags_result

    # Lookup
    create_entry_gcs >> lookup_entry >> delete_entry
    lookup_entry >> lookup_entry_result

    # Rename
    create_tag_template_field >> rename_tag_template_field >> delete_tag_template_field

    # Search
    chain(create_tasks, search_catalog, delete_tasks)
    search_catalog >> search_catalog_result

    # Update
    create_entry_gcs >> update_entry >> delete_entry
    create_tag >> update_tag >> delete_tag
    create_tag_template >> update_tag_template >> delete_tag_template
    create_tag_template_field >> update_tag_template_field >> rename_tag_template_field

    snapshot.assert_match(
        serialize_pp(
            PipelineSnapshot.from_pipeline_def(
                make_dagster_pipeline_from_airflow_dag(dag)).dep_structure_snapshot))
update_legal_entity = PostgresOperator(
    task_id="update_legal_entity",
    postgres_conn_id="redb_postgres",
    sql="updates/legal_entity.sql",
    database=DATABASE_NAME)

update_neighborhood = PostgresOperator(
    task_id="update_neighborhood",
    postgres_conn_id="redb_postgres",
    sql="updates/neighborhood.sql",
    database=DATABASE_NAME)

update_parcel = PostgresOperator(
    task_id="update_parcel",
    postgres_conn_id="redb_postgres",
    sql="updates/parcel.sql",
    database=DATABASE_NAME)

update_unit = PostgresOperator(
    task_id="update_unit",
    postgres_conn_id="redb_postgres",
    sql="updates/unit.sql",
    database=DATABASE_NAME)

# staging_1 > staging_2 (weekly) (sql/functions)
staging_1_to_staging_2 = PostgresOperator(
    task_id="staging_1_to_staging_2",
    postgres_conn_id="redb_postgres",
    sql="functions/staging_1_to_staging_2.sql",
    database=DATABASE_NAME)

# "chain" sets the same dependencies the >> operator would, but keeps the
# linear task order legible.
chain(SourcesToS3, MDBtoREDB, insert_neighborhood, insert_address,
      insert_county_id_mapping_table, insert_legal_entity, insert_parcel,
      insert_building, insert_unit, update_neighborhood, update_address,
      update_county_id_mapping_table, update_legal_entity, update_parcel,
      update_building, update_unit, staging_1_to_staging_2)
        last_avg_size
    ]) + "<br/>" + " ".join(
        [update_day, rows, total_files, total_size, avg_size])
    return res


def get_lines(file):
    with open(file, "r") as f:
        lines = f.readlines()
    return lines[-8::]


def formatOutput(arr):
    return "<br/>--------------------------------------------------------------------------------------------------------------------------<br/>".join(arr)


data = map(process, get_lines(file))
# print(data)

email = EmailOperator(
    task_id='show',
    to=[
        '*****@*****.**', '*****@*****.**', '*****@*****.**',
        '*****@*****.**'
    ],
    subject='DATA UPDATED OVERVIEW!',
    html_content=formatOutput(data),
    dag=dag)

chain(stat, email)
    ssh_hook=sshHook,
    dag=dag)

hive_sale = SSHExecuteOperator(
    task_id="itemsold_sale_import",
    bash_command='(bash {path}/ec_itemsold_sale_import.sh {lastday})'.format(
        path=path, lastday=get_date()[0]),
    ssh_hook=sshHook,
    dag=dag)

spark_daysale = SSHExecuteOperator(
    task_id="itemsold_daysale_parse",
    bash_command='(bash {path}/ec_itemsold_daysale_parse.sh {last_2_days} {lastday})'.format(
        path=path, last_2_days=get_date()[1], lastday=get_date()[0]),
    ssh_hook=sshHook,
    dag=dag)

hive_daysale = SSHExecuteOperator(
    task_id="itemsold_daysale_import",
    bash_command='(bash {path}/ec_itemsold_daysale_import.sh {last_2_days})'.format(
        path=path, last_2_days=get_date()[1]),
    ssh_hook=sshHook,
    dag=dag)

email = EmailOperator(
    task_id='itemsold_sale_daysale_email',
    to=['*****@*****.**', '*****@*****.**'],
    subject='itemsold sale&daysale workflow',
    html_content='[ itemsold sale&daysale data updated!!! ]',
    dag=dag)

chain(spark_sale, hive_sale, spark_daysale, hive_daysale, email)
update_mask={"paths": ["display_name"]}, location=LOCATION, tag_template=TEMPLATE_ID, tag_template_field_id=FIELD_NAME_1, ) # [END howto_operator_gcp_datacatalog_update_tag_template_field] # Create create_tasks = [ create_entry_group, create_entry_gcs, create_tag_template, create_tag_template_field, create_tag, ] chain(*create_tasks) create_entry_group >> delete_entry_group create_entry_group >> create_entry_group_result create_entry_group >> create_entry_group_result2 create_entry_gcs >> delete_entry create_entry_gcs >> create_entry_gcs_result create_entry_gcs >> create_entry_gcs_result2 create_tag_template >> delete_tag_template_field create_tag_template >> create_tag_template_result create_tag_template >> create_tag_template_result2 create_tag_template_field >> delete_tag_template_field create_tag_template_field >> create_tag_template_field_result
)

START_TIME = PythonOperator(
    task_id="starttime",
    python_callable=print_time,
)

END_TIME = PythonOperator(
    task_id="endtime",
    python_callable=print_time,
)

for i in range(1, 50):
    CPU_TEST = KubernetesPodOperator(
        dag=DRONE_LOG_DAG,
        image=f"{environ['DOCKER_REGISTRY']}/pipeline/dronelogs:cputest",
        namespace="load-testing",
        image_pull_policy="Always",
        name="cpu",
        do_xcom_push=False,
        in_cluster=True,
        config_file=f"{environ['AIRFLOW_HOME']}/.kube/config",
        is_delete_operator_pod=True,
        hostnetwork=False,
        task_id=f"{PIPILE_NAME}-task-{i}",
        retries=4,
        retry_delay=datetime.timedelta(seconds=30),
    )

    chain(START_TIME, CPU_TEST, END_TIME)
)

for i in range(1, WORKLOAD + 1):
    ARGUMENTS = json.dumps(
        {
            "index_file": INDEX_FILE,
            "index_prefix": INDEX_PREFIX,
            "batch_number": i,
            "worklaod": WORKLOAD,
        }
    )

    DECRYPT_FILES2 = KubernetesPodOperator(
        dag=DRONE_LOG_DAG,
        image=f"{environ['DOCKER_REGISTRY']}/pipeline/{PIPILE_NAME}:decrypt",
        namespace="airflow",
        image_pull_policy="Always",
        name="decrypt",
        do_xcom_push=False,
        arguments=[ARGUMENTS],
        secrets=[SECRET_ENV],
        configmaps=["airflow-config"],
        in_cluster=True,
        config_file=f"{environ['AIRFLOW_HOME']}/.kube/config",
        is_delete_operator_pod=True,
        hostnetwork=False,
        task_id=f"{PIPILE_NAME}-task-1-{i}",
    )

    chain(INDEX2, DECRYPT_FILES2)
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that analyzes data before preprocessing.

    Args:
      args: Arguments to provide to the Operators as defaults.
      output_type: Which set of Variables to load for preprocessing.
      parent_dag_name: If this is provided, this is a SubDAG.

    Returns:
      The DAG object.
    """
    # Load params from Airflow Variables
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
    feature_vars = dag_utils.get_feature_config_val(
        blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)

    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
        None,
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)

    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        bucket_name, bucket_path = dag_utils.extract_bucket_parts(
            f'{storage_vars["gcs_temp_path"]}/training')
    else:
        bucket_name, bucket_path = dag_utils.extract_bucket_parts(
            f'{storage_vars["gcs_temp_path"]}/prediction')

    clean_temp_dir_task = gcs_delete_operator.GoogleCloudStorageDeleteOperator(
        task_id=_CLEAN_TEMP_DIR_TASK,
        bucket=bucket_name,
        directory=bucket_path,
        dag=dag)
    user_session_pipeline_task = add_user_session_task(
        dag, _USER_SESSION_TASK_ID, output_type, feature_vars, prediction_vars,
        preprocess_vars, storage_vars, training_vars)

    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        data_visualization_pipeline_task = add_data_visualization_task(
            dag, _DATA_VISUALISATION_TASK_ID, preprocess_vars, storage_vars)
        generate_categorical_stats_task = add_categorical_stats_task(
            dag, feature_vars, storage_vars)
        generate_numeric_stats_task = add_numeric_stats_task(
            dag, feature_vars, storage_vars)
        helpers.chain(
            clean_temp_dir_task, user_session_pipeline_task,
            data_visualization_pipeline_task,
            [generate_categorical_stats_task, generate_numeric_stats_task])
    else:
        helpers.chain(clean_temp_dir_task, user_session_pipeline_task)

    return dag
    postgres_conn_id='redshift_conn',
    sql=[
        brzipcode_queries['drop_staging_brzipcode'],
        brzipcode_queries['create_brzipcode_table'],
    ]
)

load_brzipcode_table_data = S3ToRedshiftCustomOperator(
    task_id='load_brzipcode_table_data',
    dag=dag,
    redshift_conn_id='redshift_conn',
    aws_conn_id='aws_credentials',
    s3_bucket=s3_bucket,
    s3_key=s3_key,
    schema='staging',
    table='brzipcode',
    copy_options=[
        "delimiter '|'",
        "EMPTYASNULL",
    ]
)

end_operator = DummyOperator(task_id='finish_execution', dag=dag)

chain(
    start_operator,
    create_brzipcode_table,
    load_brzipcode_table_data,
    end_operator,
)
update_queue = CloudTasksQueueUpdateOperator(
    task_queue=Queue(stackdriver_logging_config=dict(sampling_ratio=1)),
    location=LOCATION,
    queue_name=QUEUE_ID,
    update_mask={"paths": ["stackdriver_logging_config.sampling_ratio"]},
    task_id="update_queue",
)

list_queue = CloudTasksQueuesListOperator(location=LOCATION, task_id="list_queue")

chain(
    create_queue,
    update_queue,
    pause_queue,
    resume_queue,
    purge_queue,
    get_queue,
    list_queue,
    delete_queue,
)

# Tasks operations
create_task = CloudTasksTaskCreateOperator(
    location=LOCATION,
    queue_name=QUEUE_ID,
    task=TASK,
    task_name=TASK_NAME,
    retry=Retry(maximum=10.0),
    timeout=5,
    task_id="create_task_to_run",
)
def __init__(self, *, emr_cluster: EmrCluster):
    super().__init__(PARTNER)

    dag = PMIDAG.get_dag()
    env_config = dag.env_config

    campaign_id_mappings_bucket = env_config['itermediate_bucket_name']
    campaign_id_mappings_prefix = 'campaigns/'

    # erase existing campaign id mappings data
    erase_campaign_id_mappings_task = ClearDestinationBucketOperator(
        task_id=f'{self.partner}_erase_campaign_id_mappings',
        bucket_name=campaign_id_mappings_bucket,
        bucket_data_prefix=campaign_id_mappings_prefix,
        partner=self.partner
    )

    # perform campaign id mappings extraction
    campaign_id_mappings_extraction_task = emr_cluster.create_emr_job_task(
        task_id=f'{self.partner}_campaign_id_mappings_extraction',
        job_name=f'Campaign ID Mappings Extraction: {self.partner}',
        script='campaign_id_mappings.py',
        script_args=[
            '--partner', self.partner,
            '--data-date', '{{ ds }}',
            '--data-date-tz-offset', '{{ ds | data_date_tz_offset }}',
            '--input-database-name', env_config['raw_events_glue_db_name'],
            '--output-bucket-name', campaign_id_mappings_bucket,
            '--output-bucket-data-prefix', campaign_id_mappings_prefix
        ]
    )

    # save extracted mappings in firewall DB
    save_campaign_id_mappings_task = PythonOperator(
        task_id=f'{self.partner}_save_campaign_id_mappings',
        python_callable=_save_campaign_id_mappings,
        provide_context=True,
        op_kwargs={
            'aws_conn_id': dag.aws_conn_id,
            'mysql_conn_id': env_config['airflow_firewall_db_conn_id'],
            'bucket_name': campaign_id_mappings_bucket,
            'bucket_data_prefix': campaign_id_mappings_prefix,
            'partner': self.partner,
            'notifications_topic_arn': ':'.join([
                "arn:aws:sns",
                dag.aws_env['region'],
                dag.aws_env['accountId'],
                env_config['notifications_topic_name']
            ])
        }
    )

    # connect tasks
    chain(
        erase_campaign_id_mappings_task,
        campaign_id_mappings_extraction_task,
        save_campaign_id_mappings_task
    )

    # set pipeline entry and exit
    self._entry_task = erase_campaign_id_mappings_task
    self._exit_task = save_campaign_id_mappings_task