class DataFlowTemplateOperatorTest(unittest.TestCase):
    def setUp(self):
        self.dataflow = DataflowTemplateOperator(
            task_id=TASK_ID,
            template=TEMPLATE,
            parameters=PARAMETERS,
            dataflow_default_options=DEFAULT_OPTIONS_TEMPLATE,
            poll_sleep=POLL_SLEEP)

    def test_init(self):
        """Test DataflowTemplateOperator instance is properly initialized."""
        self.assertEqual(self.dataflow.task_id, TASK_ID)
        self.assertEqual(self.dataflow.template, TEMPLATE)
        self.assertEqual(self.dataflow.parameters, PARAMETERS)
        self.assertEqual(self.dataflow.poll_sleep, POLL_SLEEP)
        self.assertEqual(self.dataflow.dataflow_default_options,
                         DEFAULT_OPTIONS_TEMPLATE)

    @mock.patch('airflow.contrib.operators.dataflow_operator.DataFlowHook')
    def test_exec(self, dataflow_mock):
        """Test DataFlowHook is created and the right args are passed to
        start_template_workflow.

        """
        start_template_hook = dataflow_mock.return_value.start_template_dataflow
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        expected_options = {
            'project': 'test',
            'stagingLocation': 'gs://test/staging',
            'tempLocation': 'gs://test/temp',
            'zone': 'us-central1-f'
        }
        start_template_hook.assert_called_once_with(TASK_ID, expected_options,
                                                    PARAMETERS, TEMPLATE)
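The constants used by this test are defined elsewhere in the module. A minimal sketch of what they could look like: the DEFAULT_OPTIONS_TEMPLATE values follow from the expected_options assertion in test_exec, while the remaining values are placeholders, not the originals.

TASK_ID = 'test-dataflow-template-operator'    # placeholder
JOB_NAME = 'test-dataflow-job'                 # placeholder, used by the job_name variant further down
TEMPLATE = 'gs://test-bucket/templates/test-template'      # placeholder
PARAMETERS = {'inputFile': 'gs://test-bucket/input.csv'}   # placeholder
POLL_SLEEP = 10                                # placeholder
DEFAULT_OPTIONS_TEMPLATE = {
    'project': 'test',
    'stagingLocation': 'gs://test/staging',
    'tempLocation': 'gs://test/temp',
    'zone': 'us-central1-f',
}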
Example #4
    def __init__(self, project, config, task_id_sufix, parameters, *args,
                 **kwargs):
        self.config = config

        template_location = 'gs://{}/templates/{}/v{}'.format(
            self.config['bucket_name'], self.config['template_name'],
            self.config['template_version'])

        parameters['project'] = project

        # pop() so the custom 'extra_funcs' kwarg is not forwarded to DataflowTemplateOperator.__init__
        self.extra_func_wrappers = kwargs.pop('extra_funcs', [])

        DataflowTemplateOperator.__init__(
            self,
            task_id='extract-{}'.format(task_id_sufix),
            template=template_location,
            parameters=parameters,
            poll_sleep=60,
            *args,
            **kwargs)
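A hypothetical instantiation of this wrapper; the class name and all values are illustrative, since only the constructor is shown. With this config the computed template path would be gs://my-templates-bucket/templates/extract_orders/v3.

extract_orders = ExtractTemplateOperator(    # assumed name for the subclass above
    project='my-gcp-project',
    config={
        'bucket_name': 'my-templates-bucket',
        'template_name': 'extract_orders',
        'template_version': 3,
    },
    task_id_sufix='orders',                  # yields task_id 'extract-orders'
    parameters={'outputTable': 'my-gcp-project:staging.orders'},
    dag=dag,
)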
class DataFlowTemplateOperatorTest(unittest.TestCase):

    def setUp(self):
        self.dataflow = DataflowTemplateOperator(
            task_id=TASK_ID,
            template=TEMPLATE,
            job_name=JOB_NAME,
            parameters=PARAMETERS,
            dataflow_default_options=DEFAULT_OPTIONS_TEMPLATE,
            poll_sleep=POLL_SLEEP)

    def test_init(self):
        """Test DataflowTemplateOperator instance is properly initialized."""
        self.assertEqual(self.dataflow.task_id, TASK_ID)
        self.assertEqual(self.dataflow.job_name, JOB_NAME)
        self.assertEqual(self.dataflow.template, TEMPLATE)
        self.assertEqual(self.dataflow.parameters, PARAMETERS)
        self.assertEqual(self.dataflow.poll_sleep, POLL_SLEEP)
        self.assertEqual(self.dataflow.dataflow_default_options,
                         DEFAULT_OPTIONS_TEMPLATE)

    @mock.patch('airflow.contrib.operators.dataflow_operator.DataFlowHook')
    def test_exec(self, dataflow_mock):
        """Test DataFlowHook is created and the right args are passed to
        start_template_workflow.

        """
        start_template_hook = dataflow_mock.return_value.start_template_dataflow
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        expected_options = {
            'project': 'test',
            'stagingLocation': 'gs://test/staging',
            'tempLocation': 'gs://test/temp',
            'zone': 'us-central1-f'
        }
        start_template_hook.assert_called_once_with(JOB_NAME, expected_options,
                                                    PARAMETERS, TEMPLATE)
Example #6
        'project': PROJECT_ID,
    }
}

with models.DAG(dag_id=DAG_NAME,
                schedule_interval="@once",
                default_args=default_dag_args) as dag:

    t1 = DataflowTemplateOperator(
        task_id='task1',
        template='gs://dataflow-templates/latest/GCS_Text_to_BigQuery',
        parameters={
            # Name of the function in udf.js to invoke.
            'javascriptTextTransformFunctionName': 'transform',
            # GCS path of the BigQuery schema definition file.
            'JSONPath': 'gs://{}/composer/schema/schema.json'.format(BUCKET_NAME),
            # GCS path of the udf.js file.
            'javascriptTextTransformGcsPath': 'gs://{}/composer/udf/udf.js'.format(BUCKET_NAME),
            # GCS path of the CSV file to load into BigQuery.
            'inputFilePattern': 'gs://{}/composer/csv/sample.csv'.format(BUCKET_NAME),
            # Destination BigQuery table as project_id:dataset.table.
            'outputTable': '{}:my_dataset.sample'.format(PROJECT_ID),
            # Temporary directory used while loading into BigQuery.
            'bigQueryLoadingTemporaryDirectory': 'gs://{}/composer/temp'.format(BUCKET_NAME),
        },
    )
def subdag(parent_dag_name, child_dag_name, args, json_gs):
    dag_subdag = DAG(
        dag_id=f'{parent_dag_name}.{child_dag_name}',
        default_args=args,
        start_date=datetime.datetime(2021, 8, 5, 20, 0),
        schedule_interval='0 13,14,15,16,17,18,19,20,21,22,23,0,1 * * *',
    )

    connection_airflow_yas_sa_sii_de = BaseHook.get_connection(
        'google_cloud_yas_sa_sii_de')
    service_account_yas_sa_sii_de = ast.literal_eval(
        connection_airflow_yas_sa_sii_de.
        extra_dejson["extra__google_cloud_platform__keyfile_dict"])

    with gcsfs.GCSFileSystem(
            project='yas-dev-sii-pid',
            token=service_account_yas_sa_sii_de).open(json_gs) as f:
        jd = json.load(f)

    # Execution settings read from the JSON file
    url_trn = jd['url_trn']

    # TRN data
    job_name_hom = jd['job_name_hom']
    url_hom = jd['url_hom']
    file_name_hom = jd['file_name_hom']
    template_location_hom = jd['template_location_hom']

    # General execution settings
    temp_location = jd['temp_location']
    project = jd['project']
    region = jd['region']
    subnetwork = jd['subnetwork']
    service_account_email = jd['service_account_email']
    machine_type = jd['machine_type']
    max_num_workers = jd['max_num_workers']
    num_workers = jd['num_workers']

    folders = gcsfs.GCSFileSystem(
        project='yas-dev-sii-pid',
        token=service_account_yas_sa_sii_de).ls(url_trn)

    if len(folders) > 0:
        for folder in folders:
            date_folder = folder.split('/')[3]

            if len(date_folder) >= 10:
                url_source = 'gs://' + folder
                url_dest = url_hom + date_folder + '/' + file_name_hom

                parent_dag_name_for_id = parent_dag_name.lower()

                print('url_source: ' + url_source)
                print('url_dest: ' + url_dest)

                DataflowTemplateOperator(
                    template=template_location_hom,
                    job_name=
                    f'{parent_dag_name_for_id}-{child_dag_name}-{date_folder}',
                    task_id=
                    f'{parent_dag_name_for_id}-{child_dag_name}-{date_folder}',
                    location=region,
                    parameters={
                        'url_trn': url_source,
                        'url_hom': url_dest,
                    },
                    default_args=args,
                    dataflow_default_options={
                        'project': project,
                        'zone': 'us-east1-c',
                        'tempLocation': temp_location,
                        'machineType': machine_type,
                        'serviceAccountEmail': service_account_email,
                        'subnetwork': subnetwork,
                    },
                    gcp_conn_id='google_cloud_yas_sa_sii_de',
                    dag=dag_subdag,
                )
    return dag_subdag
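A sketch of how this factory might be attached to a parent DAG with a SubDagOperator (Airflow 1.10-style import assumed; the parent_dag object and the JSON path are illustrative, not from the original):

from airflow.operators.subdag_operator import SubDagOperator

load_hom = SubDagOperator(
    task_id='load_hom',    # must equal child_dag_name so the subdag id matches parent.child
    subdag=subdag(parent_dag.dag_id, 'load_hom', default_args,
                  'gs://yas-dev-sii-bucket/config/hom.json'),   # hypothetical JSON path
    dag=parent_dag,
)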
DEFAULT_DAG_ARGS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': TODAY,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'schedule_interval': '@hourly',
    'dataflow_default_options': {
        'project': PROJECT,
        'zone': 'us-east1-b',
        'stagingLocation': TEMP_BUCKET
    }
}

dag = DAG('Demo-DAG-DataflowCronHourly',
          default_args=DEFAULT_DAG_ARGS,
          dagrun_timeout=timedelta(hours=1),
          schedule_interval='00 * * * *')

start = DummyOperator(task_id='inicio', dag=dag)
end = DummyOperator(task_id='fim', dag=dag)

t1 = DataflowTemplateOperator(task_id='dataflow_count_words_example',
                              template=TEMPLATE,
                              parameters=PARAMETERS,
                              dag=dag)

start >> t1 >> end
Example #9
    def execute(self, context):
        for extra_func_wrapper in self.extra_func_wrappers:
            extra_func_internal = extra_func_wrapper(self)
            extra_func_internal(context)

        DataflowTemplateOperator.execute(self, context)
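The contract for each entry in extra_funcs follows from this override: a callable that takes the operator and returns a callable that takes the Airflow context. A hypothetical example:

import logging

def log_parameters(operator):
    """Hypothetical wrapper: log the operator's parameters before the job starts."""
    def _run(context):
        logging.info('Launching %s with parameters %s',
                     operator.task_id, operator.parameters)
    return _run

# passed at construction time, e.g. extra_funcs=[log_parameters]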
Example #10
        'zone': ZONE,
        'stagingLocation': TEMP_BUCKET
    }
}

dag = DAG('CDC-DAG-v1',
          default_args=DEFAULT_DAG_ARGS,
          dagrun_timeout=timedelta(hours=3),
          schedule_interval='00 * * * *')

start = DummyOperator(task_id='Start', dag=dag)
end = DummyOperator(task_id='End', dag=dag)

dataflow_load_table1 = DataflowTemplateOperator(
    task_id='loadbq_table1_dataflow',
    template=TEMPLATE,
    parameters=PARAMETERS,
    environment=ENVIRONMENT,
    dag=dag)

# Perform most popular question query.
bq_merge_table1 = bigquery_operator.BigQueryOperator(task_id='bq_merge_table1',
                                                     bql="""
        MERGE
          `{table}` T
        USING
          (
          SELECT
            CASE
              WHEN a.id IS NULL AND b.id IS NOT NULL THEN "I"
              WHEN a.id IS NOT NULL
            AND b.id IS NULL THEN "D"
stage_data = DataflowTemplateOperator(
    task_id='stage_data',
    template=
    '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["template"] }}',
    dataflow_default_options={
        'project':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["project"] }}',
        'region':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["region"] }}',
        'zone':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["zone"] }}',
        'network':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["network"] }}',
        'subnetwork':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["subnetwork"] }}',
        'tempLocation':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["tempLocation"] }}',
    },
    parameters={
        'driverJars':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["driverJars"] }}',
        'driverClassName':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["driverClassName"] }}',
        'connectionURL':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["connectionURL"] }}',
        'query':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["query"] }}',
        'outputTable':
        '{{ (var.json|attr("config-{}".format(run_id)))["bigquery"]["staging_table"] }}',
        'bigQueryLoadingTemporaryDirectory':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["bigQueryLoadingTemporaryDirectory"] }}',
        'connectionProperties':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["connectionProperties"] }}',
        'username':
        '******',
        'password':
        '******',
    },
    dag=dag)
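Every Jinja expression above reads a single Airflow Variable named config-<run_id>, parsed as JSON. A sketch of the shape it assumes, with keys taken from the expressions and placeholder values:

# e.g. Variable.set('config-20210801', json.dumps(config)) with a dict like:
config = {
    'dataflow': {
        'template': 'gs://my-bucket/templates/jdbc_to_bigquery',   # placeholder
        'options': {
            'project': 'my-project',
            'region': 'us-central1',
            'zone': 'us-central1-a',
            'network': 'default',
            'subnetwork': 'regions/us-central1/subnetworks/default',
            'tempLocation': 'gs://my-bucket/tmp',
        },
        'parameters': {
            'driverJars': 'gs://my-bucket/jars/driver.jar',
            'driverClassName': 'org.postgresql.Driver',
            'connectionURL': 'jdbc:postgresql://10.0.0.2:5432/source_db',
            'query': 'SELECT * FROM source_table',
            'bigQueryLoadingTemporaryDirectory': 'gs://my-bucket/tmp',
            'connectionProperties': 'ssl=false',
        },
    },
    'bigquery': {
        'staging_table': 'my-project:staging.stage_table',
    },
}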
Example #12
        source_bucket='{}_hr_data_8980'.format(os.environ['AIRFLOW_VAR_ENV']),
        source_object='inbox/*.csv',
        destination_bucket='{}_hr_data_8980'.format(
            os.environ['AIRFLOW_VAR_ENV']),
        destination_object='processing/',
        move_object=True,
        google_cloud_storage_conn_id='etl_sa')

    dataflow_task = DataflowTemplateOperator(
        task_id="invoke_dataflow",
        template="gs://{}_df_templates_8987/pipeline1/pipeline1_template".
        format(os.environ['AIRFLOW_VAR_ENV']),
        job_name='sample_dataflow_example',
        poll_sleep=5,
        parameters={
            'input':
            'gs://{}_hr_data_8980/processing/*.csv'.format(
                os.environ['AIRFLOW_VAR_ENV']),
            'deIdentiyTemplateId':
            'generic_deidentify_template'
        },
        dag=dag,
    )
    move_to_archieve = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id='move_file_to_archieve',
        source_bucket='{}_hr_data_8980'.format(os.environ['AIRFLOW_VAR_ENV']),
        source_object='/processing/*.csv',
        destination_bucket='{}_hr_data_arc_7856'.format(
            os.environ['AIRFLOW_VAR_ENV']),
        destination_object='/',
        move_object=True,
Example #13
        'serviceAccountEmail':
        '*****@*****.**',
        'subnetwork':
        "https://www.googleapis.com/compute/v1/projects/sha-net-dev-id/regions/us-east1/subnetworks/subnet-analytics-region-a"
    }
}

dag = DAG('dag-sii-bch-ing-ab-raw',
          default_args=default_args,
          schedule_interval='59 4 * * *',
          tags=['RAW'])

raw_estatus_cuentas = DataflowTemplateOperator(
    template=
    'gs://yas-sii-int-des-dev/AB/templates/TPL_SII_BCH_ING_AB_RAW_ESTATUS_CUENTAS',
    job_name='sii-bch-ing-ab-raw-estatus-cuentas',
    task_id='sii-bch-ing-ab-raw-estatus-cuentas',
    location=gce_region,
    gcp_conn_id='google_cloud_yas_sa_sii_de',
    dag=dag)

raw_servicios_cuenta = DataflowTemplateOperator(
    template=
    'gs://yas-sii-int-des-dev/AB/templates/TPL_SII_BCH_ING_AB_RAW_SERVICIOS_CUENTA',
    job_name='sii-bch-ing-ab-raw-servicios-cuenta',
    task_id='sii-bch-ing-ab-raw-servicios-cuenta',
    location=gce_region,
    gcp_conn_id='google_cloud_yas_sa_sii_de',
    dag=dag)

raw_tipos_cuentas = DataflowTemplateOperator(
    template=
Example #14
    },
    'start_date': datetime.datetime(2019, 10, 15)
}


with models.DAG('start_dataflow', schedule_interval=None, default_args=default_args, catchup=False) as dag:

    def start_greeting():
        import logging
        logging.info('Hello! Welcome to AirFlow')

    def end_greeting():
        import logging
        logging.info('Thank you, Goodbye!')

    start = python_operator.PythonOperator(task_id='start', python_callable=start_greeting)
    end = python_operator.PythonOperator(task_id='end', python_callable=end_greeting)

    df_pipeline_mutation = DataflowTemplateOperator(
        task_id='df_pipeline_mutation',
        template='{{var.value.gcp_df_mutation_template}}',
        gcp_conn_id='google_cloud_default'
    )

    df_pipeline_import = DataflowTemplateOperator(
        task_id='df_pipeline_import',
        template='{{var.value.gcp_df_import_template}}',
        gcp_conn_id='google_cloud_default'
    )
    start >> df_pipeline_mutation >> df_pipeline_import >> end
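Both template paths are read from Airflow Variables. A minimal sketch of seeding them, assuming the GCS paths below as placeholders:

from airflow.models import Variable

Variable.set('gcp_df_mutation_template', 'gs://my-bucket/templates/mutation_pipeline')   # placeholder
Variable.set('gcp_df_import_template', 'gs://my-bucket/templates/import_pipeline')       # placeholder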
Example #15
dummy_start = DummyOperator(task_id='job_start', dag=dag)

dummy_end = DummyOperator(task_id='job_end',
                          dag=dag,
                          trigger_rule='all_success')

cntr = 0
while cntr < count_of_jobs:
    dftask = DataflowTemplateOperator(
        task_id='dataflow_pubsub_to_gcs-' + str(cntr),
        template='gs://$YOUR-PROJ-poc_dataflow/templates/PubsubJsonToGcs_v1.0',
        job_name='csv-pubsub-to-gcs-' + str(cntr) + '',
        dataflow_default_options=dataflow_default_options,
        parameters={
            'runner':
            'DataflowRunner',
            'tempLocation':
            'gs://$YOUR-PROJ-poc_dataflow/temp/',
            'inputPath':
            'projects/$YOUR-PROJ--poc-proj/subscriptions/sub_csv' + str(cntr),
            'outputPath':
            'gs://$YOUR-PROJ-poc_data/ingested/json/2020-10-21/run' +
            str(cntr) + '/',
            'jobName':
            'pubsub-to-gcs-airflow'
        },
        dag=dag)
    dummy_start >> dftask
    dftask >> dummy_end
    cntr += 1
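The loop fans the same template out into count_of_jobs parallel jobs, one per Pub/Sub subscription. A sketch of the definitions it relies on, which are not part of the snippet (values are placeholders):

count_of_jobs = 5   # hypothetical; one Dataflow job per sub_csv<N> subscription

dataflow_default_options = {
    'project': 'YOUR-PROJ--poc-proj',                        # placeholder, as in the snippet
    'tempLocation': 'gs://$YOUR-PROJ-poc_dataflow/temp/',    # placeholder
    'zone': 'us-central1-f',                                 # placeholder
}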
Example #16
        machineType_exe,
        'serviceAccountEmail':
        '*****@*****.**',
        'subnetwork':
        "https://www.googleapis.com/compute/v1/projects/sha-net-dev-id/regions/us-east1/subnetworks/subnet-analytics-region-a"
    }
}

dag = DAG('dag-sii-bch-ing-ab-raw-cue-mov',
          default_args=default_args,
          schedule_interval='0 13,14,15,16,17,18,19,20,21,22,23,0,1 * * *',
          tags=['RAW', 'Movimientos', 'Cuentas'])

raw_cuentas = DataflowTemplateOperator(
    template=
    'gs://yas-sii-int-des-dev/AB/templates/TPL_SII_BCH_ING_AB_RAW_CUENTAS',
    job_name='sii-bch-ing-ab-raw-cuenta',
    task_id='sii-bch-ing-ab-raw-cuenta',
    location=gce_region,
    gcp_conn_id='google_cloud_yas_sa_sii_de',
    dag=dag)

raw_movimientos = DataflowTemplateOperator(
    template=
    'gs://yas-sii-int-des-dev/AB/templates/TPL_SII_BCH_ING_AB_RAW_MOVIMIENTOS',
    job_name='sii-bch-ing-ab-raw-movimientos',
    task_id='sii-bch-ing-ab-raw-movimientos',
    location=gce_region,
    gcp_conn_id='google_cloud_yas_sa_sii_de',
    dag=dag)
Example #17
                        python_callable=get_nodash_date,
                        provide_context=True)

    dataflow_job = DataflowTemplateOperator(
        # The task id of your job
        task_id="dataflow_operator_transform_csv_to_bq",
        # The name of the template that you're using.
        # Below is a list of all the templates you can use.
        # For versions in non-production environments, use the subfolder 'latest'
        # https://cloud.google.com/dataflow/docs/guides/templates/provided-batch#gcstexttobigquery
        template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery",
        # Use the link above to specify the correct parameters for your template.
        parameters={
            "javascriptTextTransformFunctionName":
            "transformCSVtoJSON",
            "JSONPath":
            bucket_path + "/schema.json",
            "javascriptTextTransformGcsPath":
            bucket_path + "/transform.js",
            "inputFilePattern":
            "gs://week_2_bs/keyword_search/search_" +
            '{{ ti.xcom_pull("get_execution_date") }}' + ".csv",
            "outputTable":
            project_id + ":searched_keyword.searched_keyword",
            "bigQueryLoadingTemporaryDirectory":
            bucket_path + "/tmp/",
        },
    )
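The inputFilePattern pulls a date string from XCom via the get_execution_date task, presumably the PythonOperator built from get_nodash_date above. A hedged sketch of that callable; the exact format is an assumption based on the "nodash" name:

    def get_nodash_date(**context):
        # Hypothetical: return the run date as YYYYMMDD; the return value is what
        # ti.xcom_pull("get_execution_date") reads back.
        return context['execution_date'].strftime('%Y%m%d')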

    t2 = PythonOperator(task_id='get_dash_date',
                        python_callable=get_dash_date,
Example #18
        "region": gce_region,
        # Set to your zone
        "zone": gce_zone,
        # This is a subfolder for storing temporary files, like the staged pipeline job.
        "temp_location": bucket_path + "/tmp/",
    },
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
        # The id you will see in the DAG airflow page
        "dataflow_template_composer_dataflow_dag",
        default_args=default_args,
        # The interval with which to schedule the DAG
        schedule_interval=datetime.timedelta(
            days=1),  # Override to match your needs
) as dag:

    start_template_job = DataflowTemplateOperator(
        # The task id of your job
        task_id="dataflow_template_operator_run",
        # The name of the template that you're using.
        # Below is a list of all the templates you can use.
        # For versions in non-production environments, use the subfolder 'latest'
        # https://cloud.google.com/dataflow/docs/guides/templates/provided-batch#gcstexttobigquery
        template="gs://dataflow_cicd_test/templates/test_beam",
        # Use the link above to specify the correct parameters for your template.
        parameters={})
Example #19
with models.DAG(
        # The id you will see in the DAG airflow page
        "composer_dataflow_dag",
        default_args=default_args,
        # The interval with which to schedule the DAG
        schedule_interval=datetime.timedelta(
            days=1),  # Override to match your needs
) as dag:

    start_template_job = DataflowTemplateOperator(
        # The task id of your job
        task_id="dataflow_operator_transform_csv_to_bq",
        # The name of the template that you're using.
        # Below is a list of all the templates you can use.
        # For versions in non-production environments, use the subfolder 'latest'
        # https://cloud.google.com/dataflow/docs/guides/templates/provided-batch#gcstexttobigquery
        template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery",
        # Use the link above to specify the correct parameters for your template.
        parameters={
            "javascriptTextTransformFunctionName": "transformCSVtoJSON",
            "JSONPath": bucket_path + "/jsonSchema.json",
            "javascriptTextTransformGcsPath":
            bucket_path + "/transformCSVtoJSON.js",
            "inputFilePattern": bucket_path + "/inputFile.txt",
            "outputTable": project_id + ":average_weather.average_weather",
            "bigQueryLoadingTemporaryDirectory": bucket_path + "/tmp/",
        },
    )

# [END composer_dataflow_dag]
Example #20
# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.

template_path = 'gs://{}/template/GCS_TO_GCS_5'.format(project_id)
with models.DAG('composer_dataflowtemplate1016_3',
                schedule_interval='@once',
                default_args=default_dag_args,
                concurrency=1,
                max_active_runs=1) as dag:

    execute_dataflow_1 = DataflowTemplateOperator(
        task_id='datapflow_example1',
        template=template_path,
        parameters={
            'inputFile':
            "gs://{}/sample.csv".format(project_id),
            'outputFile':
            "gs://{}/composer_output/sample_1.csv".format(project_id),
        },
        dag=dag)
    execute_dataflow_2 = DataflowTemplateOperator(
        task_id='datapflow_example2',
        template=template_path,
        parameters={
            'inputFile':
            "gs://{}/composer_output/sample_1.csv".format(project_id),
            'outputFile':
            "gs://{}/composer_output/sample_2.csv".format(project_id),
        },
        dag=dag)
    execute_dataflow_3 = DataflowTemplateOperator(