def __init__(self, pipeline, schedule_interval='@daily',
             extra_default_args={}, extra_config={}, base_config={}):
    self.pipeline = pipeline
    loaded_config = config_tools.load_config(pipeline)

    self.config = base_config.copy()
    self.config.update(loaded_config)
    self.config.update(extra_config)

    self.default_args = config_tools.default_args(self.config)
    self.default_args.update(extra_default_args)

    self.schedule_interval = schedule_interval
    self.flexible_operator = Variable.get('FLEXIBLE_OPERATOR')
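# Usage sketch for the constructor above (illustrative only, not part of the
# repo): `SomeDagFactory` is a hypothetical subclass name and the override
# values are made up. It shows the layering order: base_config is copied first,
# the config loaded for `pipeline` is applied on top, and extra_config wins
# last; extra_default_args is likewise merged over the generated default_args.
#
#   factory = SomeDagFactory(
#       pipeline='pipe_events.encounters',
#       schedule_interval='@monthly',
#       extra_config={'events_dataset': 'scratch_events'},
#       extra_default_args={'retries': 3},
#   )
#   # factory.config and factory.default_args now reflect the merged values.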
            dag,
            'arguments': [
                'publish_postgres',
                '{start_date}'.format(**config),
                '{end_date}'.format(**config),
                '{project_id}:{events_dataset}.{events_table}'.format(**config),
                '{temp_bucket}'.format(**config),
                '{project_id}'.format(**config),
                '{postgres_database_region}'.format(**config),
                '{postgres_db_instance_name}'.format(**config),
                '{postgres_database}'.format(**config),
                '{postgres_db_user}'.format(**config),
                '{postgres_db_password}'.format(**config),
                '{postgres_db_table}'.format(**config),
                'encounter'
            ]
        })

        publish_events_bigquery >> publish_events_postgres

        return dag


encounters_config = config_tools.load_config('pipe_events.encounters')

events_encounters_daily_dag = PipelineDagFactory(encounters_config).build(
    'pipe_events_daily.encounters')
events_encounters_monthly_dag = PipelineDagFactory(
    encounters_config, schedule_interval='@monthly').build('pipe_events_monthly.encounters')
events_encounters_yearly_dag = PipelineDagFactory(
    encounters_config, schedule_interval='@yearly').build('pipe_events_yearly.encounters')
def build(self, dag_id):
    """
    Override of build method.

    :@param dag_id: The id of the DAG.
    :@type dag_id: str.
    """
    config = self.config
    config['source_dataset'] = config['pipeline_dataset']
    config['source_tables'] = config['normalized_tables']

    default_args = self.default_args
    subdag_default_args = dict(
        start_date=default_args['start_date'],
        end_date=default_args['end_date']
    )
    subdag_config = dict(
        pipeline_dataset=config['pipeline_dataset'],
        source_dataset=config['pipeline_dataset'],
        events_dataset=config['events_dataset'],
        dataflow_runner='{dataflow_runner}'.format(**config),
        temp_shards_per_day="3",
    )
    config['source_paths'] = ','.join(self.source_table_paths())
    config['source_dates'] = ','.join(self.source_date_range())

    with DAG(dag_id,
             schedule_interval=self.schedule_interval,
             default_args=self.default_args) as dag:
        source_sensors = self.source_table_sensors(dag)

        segment = SubDagOperator(
            subdag=pipe_segment.PipeSegmentDagFactory(
                schedule_interval=dag.schedule_interval,
                extra_default_args=subdag_default_args,
                extra_config=dict(
                    pipeline_dataset=config['pipeline_dataset'],
                    source_dataset=config['pipeline_dataset'],
                    source_tables='{normalized_tables}'.format(**config),
                    dataflow_runner='{dataflow_runner}'.format(**config),
                    temp_shards_per_day="3",
                )
            ).build(dag_id=dag_id + '.segment'),
            trigger_rule=TriggerRule.ONE_SUCCESS,
            depends_on_past=True,
            task_id='segment'
        )

        measures = SubDagOperator(
            subdag=pipe_measures.PipeMeasuresDagFactory(
                schedule_interval=dag.schedule_interval,
                extra_default_args=subdag_default_args,
                extra_config=subdag_config
            ).build(dag_id=dag_id + '.measures'),
            task_id='measures'
        )

        port_events = SubDagOperator(
            subdag=pipe_anchorages.PipeAnchoragesPortEventsDagFactory(
                schedule_interval=dag.schedule_interval,
                extra_default_args=subdag_default_args,
                extra_config=subdag_config
            ).build(dag_id=dag_id + '.port_events'),
            task_id='port_events'
        )

        port_visits = SubDagOperator(
            subdag=pipe_anchorages.PipeAnchoragesPortVisitsDagFactory(
                schedule_interval=dag.schedule_interval,
                extra_default_args=subdag_default_args,
                extra_config=subdag_config
            ).build(dag_id=dag_id + '.port_visits'),
            task_id='port_visits'
        )

        encounters = SubDagOperator(
            subdag=pipe_encounters.PipeEncountersDagFactory(
                schedule_interval=dag.schedule_interval,
                extra_default_args=subdag_default_args,
                extra_config=subdag_config
            ).build(dag_id=dag_id + '.encounters'),
            task_id='encounters'
        )

        for sensor in source_sensors:
            dag >> sensor >> segment >> measures

        measures >> port_events >> port_visits
        measures >> encounters

        if config.get('enable_features_events', False):
            features = SubDagOperator(
                subdag=pipe_features.PipeFeaturesDagFactory(
                    schedule_interval=dag.schedule_interval,
                    extra_default_args=subdag_default_args,
                    extra_config=subdag_config
                ).build(dag_id=dag_id + '.features'),
                depends_on_past=True,
                task_id='features'
            )

            events_anchorages = SubDagOperator(
                subdag=pipe_events_anchorages.PipelineDagFactory(
                    config_tools.load_config('pipe_events.anchorages'),
                    schedule_interval=dag.schedule_interval,
                    extra_default_args=subdag_default_args,
                    extra_config=subdag_config
                ).build(dag_id=dag_id + '.pipe_events_anchorages'),
                depends_on_past=True,
                task_id='pipe_events_anchorages'
            )

            events_encounters = SubDagOperator(
                subdag=pipe_events_encounters.PipelineDagFactory(
                    config_tools.load_config('pipe_events.encounters'),
                    schedule_interval=dag.schedule_interval,
                    extra_default_args=subdag_default_args,
                    extra_config=subdag_config
                ).build(dag_id=dag_id + '.pipe_events_encounters'),
                depends_on_past=True,
                task_id='pipe_events_encounters'
            )

            events_fishing = SubDagOperator(
                subdag=pipe_events_fishing.PipelineDagFactory(
                    config_tools.load_config('pipe_events.fishing'),
                    schedule_interval=dag.schedule_interval,
                    extra_default_args=subdag_default_args,
                    extra_config=subdag_config
                ).build(dag_id=dag_id + '.pipe_events_fishing'),
                depends_on_past=True,
                task_id='pipe_events_fishing'
            )

            port_visits >> features
            encounters >> features

            # Points to each independent event
            features >> events_anchorages
            features >> events_encounters
            features >> events_fishing

        return dag
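# Summary of the task graph built above, per source sensor (the features/events
# branch only exists when `enable_features_events` is set in the config):
#
#   sensor >> segment >> measures >> port_events >> port_visits
#                        measures >> encounters
#   port_visits >> features,  encounters >> features
#   features >> events_anchorages / events_encounters / events_fishing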
                '{postgres_connection_string}'.format(**config),
                '{postgres_table_tracks}'.format(**config)
            ]
        })

        # Wire the tasks directly to the dag when the source-existence check is
        # explicitly disabled; otherwise gate them behind the source sensors.
        check_source_existance = config.get('check_source_existance', None)
        if check_source_existance is not None and not check_source_existance:
            dag >> aggregate_tracks
            dag >> publish_vessel_info
        else:
            source_sensors = self.source_table_sensors(dag)
            for sensor in source_sensors:
                dag >> sensor
                sensor >> aggregate_tracks
                sensor >> publish_vessel_info

        aggregate_tracks >> publish_postgres_tracks

        return dag


modes = ['daily', 'monthly']
vessels_configurations = config_tools.load_config(PIPELINE)['configurations']

for mode in modes:
    for vessels_configuration in vessels_configurations:
        dag_factory = VesselsPipelineDagFactory(
            vessels_configuration, schedule_interval='@{}'.format(mode))
        dag_id = dag_factory.get_dag_id(
            '{}_{}'.format(PIPELINE, mode), vessels_configuration['name'])
        globals()[dag_id] = dag_factory.build(dag_id=dag_id)
            port_visits >> features
            encounters >> features

            # Points to each independent event
            features >> events_anchorages
            features >> events_encounters
            features >> events_fishing

        return dag


def validateJson(data):
    """
    Validates the configuration with a JSON schema.

    :@param data: The data to be validated.
    :@type data: dict.
    :raise: Error in case the dict doesn't match the schema.
    """
    folder = os.path.abspath(os.path.dirname(__file__))
    with open('{}/{}'.format(folder, "schemas/vms_list_schema.json")) as vms_schema:
        validate(instance=data, schema=json.loads(vms_schema.read()))


variables = config_tools.load_config(PIPELINE)
validateJson(variables)

for vms in variables['vms_list']:
    for mode in ['daily', 'monthly', 'yearly']:
        print('>>>>>> VMS: {}'.format(vms))
        pipeline_start_date = datetime.strptime(vms['start_date'].strip(), "%Y-%m-%d")
        dag_id = '{}_{}_{}'.format(PIPELINE, vms['name'], mode)
        globals()[dag_id] = VMSGenericDagFactory(
            vms['name'],
            schedule_interval='@{}'.format(mode),
            extra_default_args={'start_date': pipeline_start_date},
            extra_config=vms).build(dag_id)
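# Illustrative shape of the configuration consumed by the loop above (values
# are made up; the authoritative schema is schemas/vms_list_schema.json and may
# require additional fields):
#
#   {
#       "vms_list": [
#           {"name": "some_country", "start_date": "2019-01-01"}
#       ]
#   }
#
# Each entry produces one DAG per mode, with ids like
# '<PIPELINE>_some_country_daily' / '_monthly' / '_yearly'.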
            '{docker_run}'.format(**config),
            'image': '{docker_image}'.format(**config),
            'name': 'fishing-publish-events-postgres',
            'dag': dag,
            'arguments': [
                'publish_postgres',
                '{date_range}'.format(**config),
                '{project_id}:{events_dataset}.{events_table}'.format(**config),
                '{temp_bucket}'.format(**config),
                '{postgres_instance}'.format(**config),
                '{postgres_connection_string}'.format(**config),
                '{postgres_table}'.format(**config),
                'fishing'
            ]
        })

        publish_events_bigquery >> publish_events_postgres

        return dag


fishing_config = config_tools.load_config('pipe_events.fishing')

events_fishing_daily_dag = PipelineDagFactory(fishing_config).build(
    'pipe_events_daily.fishing')
events_fishing_monthly_dag = PipelineDagFactory(
    fishing_config, schedule_interval='@monthly').build('pipe_events_monthly.fishing')
events_fishing_yearly_dag = PipelineDagFactory(
    fishing_config, schedule_interval='@yearly').build('pipe_events_yearly.fishing')
            'dag': dag,
            'arguments': [
                'publish_postgres',
                '{start_date}'.format(**config),
                '{end_date}'.format(**config),
                '{project_id}:{events_dataset}.{events_table}'.format(**config),
                '{temp_bucket}'.format(**config),
                '{project_id}'.format(**config),
                '{postgres_database_region}'.format(**config),
                '{postgres_db_instance_name}'.format(**config),
                '{postgres_database}'.format(**config),
                '{postgres_db_user}'.format(**config),
                '{postgres_db_password}'.format(**config),
                '{postgres_db_table}'.format(**config),
                'port'
            ]
        })

        publish_events_bigquery >> publish_events_postgres

        return dag


anchorages_config = config_tools.load_config('pipe_events.anchorages')

events_anchorages_daily_dag = PipelineDagFactory(anchorages_config).build(
    'pipe_events_daily.anchorages')
events_anchorages_monthly_dag = PipelineDagFactory(
    anchorages_config, schedule_interval='@monthly').build('pipe_events_monthly.anchorages')
events_anchorages_yearly_dag = PipelineDagFactory(
    anchorages_config, schedule_interval='@yearly').build('pipe_events_yearly.anchorages')
def __init__(self, interval):
    # Load the sub-pipeline config and pass the interval through to the base
    # factory as its schedule_interval.
    subpipeline_config_key = '{}.{}'.format(PIPELINE, SUBPIPELINE)
    super(DagFactory, self).__init__(
        pipeline=PIPELINE,
        extra_config=config_tools.load_config(subpipeline_config_key),
        schedule_interval=interval)
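# Registration sketch for the factory above (illustrative; it assumes the
# build(dag_id) method inherited from the base factory and the module-level
# PIPELINE / SUBPIPELINE constants used in these DAG files):
#
#   for mode in ['daily', 'monthly']:
#       dag_id = '{}_{}_{}'.format(PIPELINE, SUBPIPELINE, mode)
#       globals()[dag_id] = DagFactory('@{}'.format(mode)).build(dag_id=dag_id)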
import posixpath as pp
from datetime import datetime, timedelta, date
import logging

from airflow import DAG
from airflow.contrib.sensors.bigquery_sensor import BigQueryTableSensor
from airflow.operators.bash_operator import BashOperator
from airflow.models import Variable

from airflow_ext.gfw.operators.bigquery_operator import BigQueryCreateEmptyTableOperator
from airflow_ext.gfw.operators.dataflow_operator import DataFlowDirectRunnerOperator
from airflow_ext.gfw.config import load_config
from airflow_ext.gfw.config import default_args


CONFIG = load_config('pipe_anchorages')
DEFAULT_ARGS = default_args(CONFIG)


def table_sensor(dataset_id, table_id, date):
    return BigQueryTableSensor(
        task_id='source_exists',
        dataset_id=dataset_id,
        table_id='{}{}'.format(table_id, date),
        poke_interval=10,       # check every 10 seconds for a minute
        timeout=60,
        retries=24 * 7,         # retry once per hour for a week
        retry_delay=timedelta(minutes=60),
        retry_exponential_backoff=False
    )
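# Usage sketch for table_sensor (the dataset key and table prefix below are
# placeholder assumptions, not confirmed config values). The sensor waits for a
# date-sharded BigQuery table such as <dataset>.<table_prefix>YYYYMMDD before
# downstream tasks run.
#
#   source_exists = table_sensor(
#       dataset_id=CONFIG.get('source_dataset', 'my_dataset'),
#       table_id='position_messages_',
#       date='{{ ds_nodash }}',  # Airflow-templated execution date; table_id is a templated field
#   )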