def env():
    return {
        'env': os.environ['ENV'],
        'project': gcloud.project(os.environ['ENV']),
        'user': gcloud.config()['username'],
        'client': 'bluesun'
    }
def process(self, unused_el):
    # Resolve any runtime-provided options before they are used.
    if isinstance(self._files_startwith, ValueProvider):
        self._files_startwith = self._files_startwith.get()
    if isinstance(self._files_ext, ValueProvider):
        self._files_ext = self._files_ext.get()
    if isinstance(self._sort_key, ValueProvider):
        self._sort_key = self._sort_key.get()
    if isinstance(self._env, ValueProvider):
        self._env = self._env.get()
    if isinstance(self._bucket, ValueProvider):
        self._bucket = self._bucket.get()
    project_id = GCLOUD.project(self._env)
    blobs = CloudStorage.factory(project_id).list_blobs(
        self._bucket, self._files_startwith)
    # Keep only files at the bucket root that match the expected extension.
    paths = [
        f'gs://{b.bucket.name}/{b.name}' for b in blobs
        if '/' not in b.name and self._files_ext in b.name
    ]
    # The sort key arrives as a hex-encoded, dill-serialized callable.
    if isinstance(self._sort_key, str):
        self._sort_key = dill.loads(bytes.fromhex(self._sort_key))
    if len(paths) > 1:
        paths.sort(key=self._sort_key)
    for file in paths:
        yield file
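# The hex/dill round trip above means callers supply the sort key as an opaque
# string. A minimal, self-contained sketch of producing such a key; the
# numeric-suffix convention below is only an illustration, mirrored from the
# test fixtures later in this section.
import dill


def by_numeric_suffix(path):
    # Hypothetical key: order paths by the digits after the last '-',
    # falling back to the raw path when no numeric suffix exists.
    ts = path[path.rfind('-') + 1:]
    return int(ts) if ts.isdigit() else path


sort_key_hex = bytes.hex(dill.dumps(by_numeric_suffix))
# FileListIteratorTransform.process() reverses the encoding the same way:
assert dill.loads(bytes.fromhex(sort_key_hex))('gs://b/file-123') == 123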
def _run(cls, p, options):
    with p:
        project_id = GCLOUD.project(options.env)
        big_query_data = (
            p
            | 'Read BigQuery Data' >> bq.ReadAsJson(
                project=project_id,
                query=options.query,
                page_size=options.page_size)
            | 'Transform Nested Datetimes' >> beam.ParDo(StringifyDatetimes()))
        staging_table = 'staging.orders'
        wv_p = (big_query_data
                | 'Apply World Ventures Transform' >>
                WorldVenturesStagingOrdersTransform())
        bs_p = (big_query_data
                | 'Apply Bluesun Transform' >> beam.ParDo(Bluesun()))
        ((wv_p, bs_p)
         | 'Merge Client Collections for Writing to BigQuery' >> beam.Flatten()
         | 'Prewrite Cleanup' >> beam.ParDo(PrewriteCleanup())
         | 'Write to staging' >> beam.io.WriteToBigQuery(
             '{}:{}'.format(project_id, staging_table),
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
             create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
             custom_gcs_temp_location=f'gs://{project_id}-dataflow/tmp'))
def _run(cls, p, options):
    crypto = Crypto(env=options.env, key='vibe-cdc')
    keys = list(
        crypto.list_secret_keys(
            client=secretmanager.SecretManagerServiceClient()))
    with p:
        project_id = gcloud.project(options.env)
        bucket = f'{project_id}-cdc-imports'
        (p
         | 'Iterate File Paths' >> FileListIteratorTransform(
             env=options.env,
             bucket=bucket,
             files_startwith=options.files_startwith,
             sort_key=options.sort_key)
         | 'Read from a File' >> beam.io.ReadAllFromText()
         | 'Apply Decryption Transform' >> beam.ParDo(
             RecordDecryption(env=options.env, keys=keys, crypto=crypto))
         | 'Insert Ingestion Timestamp' >> beam.ParDo(
             InsertIngestionTimestamp())
         | 'Ingest table schema' >> beam.ParDo(
             bq.IngectTableSchema(options.dest))
         | 'Transform String to Standard SQL Datetime' >> beam.ParDo(
             StringToDatetime())
         | 'Transform Nested Datetimes' >> beam.ParDo(
             StringifyDatetimes('%Y-%m-%d %H:%M:%S'))
         | 'Write to Bigquery' >> beam.io.WriteToBigQuery(
             options.dest,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
             create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER))
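# For a local smoke test, the pipeline above can be driven through Beam's
# TestPipeline, much like the accompanying test module does. This is only a
# sketch: it assumes RuntimeOptions registers --env, --files_startwith,
# --sort_key and --dest as flags (names mirrored from the options attributes
# used in _run) and that Runner._run is the classmethod shown above.
import dill
from apache_beam.testing.test_pipeline import TestPipeline
from load_cdc_from_gcs_to_lake import Runner, RuntimeOptions

options = RuntimeOptions([
    '--env=local',
    '--files_startwith=vibe-tree-users',
    # Identity sort key, hex-encoded with dill as FileListIteratorTransform expects.
    '--sort_key=' + bytes.hex(dill.dumps(lambda path: path)),
    '--dest=some-project:lake.tree_users',  # illustrative destination table
])

p = TestPipeline(options=options)
# _run manages the `with p:` block itself, so the pipeline executes on this call.
Runner._run(p, options)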
def _run(cls, p, options):
    with p:
        project_id = GCLOUD.project(options.env)
        bucket = f'{project_id}-wrench-imports'
        dest_tbl = f'{project_id}:lake.wrench_metrics'
        (p
         | 'Iterate File Paths' >> FileListIteratorTransform(
             env=options.env,
             bucket=bucket,
             files_ext=options.files_ext,
             sort_key=options.sort_key)
         | 'Read from a File' >> beam.io.ReadAllFromText(skip_header_lines=1)
         | 'Apply Wrench Transform' >> beam.ParDo(WrenchCSVPayloadMapper())
         | 'Insert Ingestion Timestamp' >> beam.ParDo(
             InsertIngestionTimestamp())
         | 'Ingest table schema' >> beam.ParDo(
             bq.IngectTableSchema(table=dest_tbl))
         | 'Transform String to Standard SQL Datetime' >> beam.ParDo(
             StringToDatetime())
         | 'Transform Nested Datetimes' >> beam.ParDo(
             StringifyDatetimes('%Y-%m-%d %H:%M:%S'))
         | 'Write to lake' >> beam.io.WriteToBigQuery(
             dest_tbl,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
             create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER))
def start_bundle(self):
    self._ml_config = Config.get_config(gcloud.env(self._env.get()))
    self._client_name = self._client.get()
    self._client_config = self._ml_config['cloudsql'][self._client_name]
    # Only start the Cloud SQL proxy when not pointed at the local mysql host.
    if self._client_config['host'] != 'mysql':
        proxy_connect(self._client_config['host'])
def _run(cls, p, options):
    with p:
        project_id = GCLOUD.project(options.env)
        big_query_data = (
            p
            | 'Read BigQuery Data' >> bq.ReadAsJson(
                project=project_id,
                query=options.query,
                page_size=options.page_size)
            | 'Insert Ingestion Timestamp' >> beam.ParDo(
                InsertIngestionTimestamp())
            | 'Transform Nested Datetimes' >> beam.ParDo(StringifyDatetimes()))
        staging_table = 'staging.users'
        wv_p = (big_query_data
                | 'Apply World Ventures Transform' >>
                WorldVenturesStagingUsersTransform())
        bs_p = (big_query_data
                | 'Apply Bluesun Transform' >> beam.ParDo(Bluesun()))
        results = (
            (wv_p, bs_p)
            | 'Merge Client Collections for Writing to BigQuery' >> beam.Flatten()
            | 'Write to staging' >> beam.io.WriteToBigQuery(
                '{}:{}'.format(project_id, staging_table),
                insert_retry_strategy='RETRY_NEVER',
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER))
        print(results)
def _run(cls, p, options):
    project_id = gcloud.project(options.env)
    with p:
        (p
         | 'Read BigQuery Data' >> bq.ReadAsJson(
             project=project_id,
             query=options.query,
             page_size=options.page_size)
         | 'Transform Nested Datetimes' >> beam.ParDo(StringifyDatetimes())
         | 'Convert None values to empty strings' >> beam.ParDo(
             ConvertNoneToEmptyString())
         | 'Convert Bool values to ints' >> beam.ParDo(ConvertBoolsToInts())
         | 'Convert dict elements to json' >> beam.Map(json.dumps)
         | 'Write file to Cloud Storage' >> beam.io.WriteToText(
             options.destination,
             file_name_suffix='.ndjson.gz',
             compression_type=beam.io.filesystem.CompressionTypes.GZIP))
import os
import logging
from libs import GCLOUD as gcloud, CloudStorage
from airflow import DAG
from datetime import datetime, timedelta
from airflow.models import Variable
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.sensors.custom_sensors import DataflowJobStateSensor
from airflow.operators.custom_operators import ScheduleDataflowJobOperator
from libs import report_failure

log = logging.getLogger()

airflow_vars = Variable.get('airflow_vars', deserialize_json=True)
env = os.environ['ENV']
project_id = gcloud.project(env)
bucket = f'{project_id}-cdc-imports'
processed_bucket = f'{project_id}-cdc-imports-processed'

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2020, 3, 8),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'on_failure_callback': report_failure,
    'email_on_failure': False,
    'email_on_retry': False
}


def list_blobs(bucket, files_startswith):
import pytest
import apache_beam as beam
from libs import GCLOUD as gcloud, Crypto
import datetime
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from load_cdc_from_gcs_to_lake import RecordDecryption, Runner, RuntimeOptions

project_id = gcloud.project('local')
bucket = f'{project_id}-cdc-imports'

# TODO: Move test data to Factory wbrito 05/05/2020
file_seed = 'eee7cb05d80b8a58bccff2f4290b49ff:b42e122e33a81939c5810d44ce92830e431ea45689\
f0aa7ba6c8cd04f2049bfa2fa92283c6e7fb12458a8e910873a2322e8f2d18f6ae7f728f715a4c633b73328d12b0260\
f6b49b84e46c127c58acf71e81b5847318f5eea3293c304ea38090de987c24debf2f8c282920ac82bfc5bd473a9002c9\
162bfa08f1eb309e46dc7e7ca1964334620c8581f028228e8b3edc4233c0dca5edf7aea2a00a96ef272c5061f9b47f2e3\
edf5a52a26fa6f5976e3a12c27c4a0cd7ef8ec401cd8755ac4566f674aa3a33a22111e55bd7d76aa32e8911612971e1aa0\
7ad09e21323be4d47972d94666ecb48271f8fe44be7b6b2b2508fcffdf34c0e7a9cec72aee07252469c4dacc060a4dadedae\
4fd28679e047f2b6394fb668e021e79fe0f414e2a15a0a652f3f628431df1463da2d61d0f1211a121c5ea715d3ac3fe22cd11b\
97933c3f337b8224261caf321f667a6c894b09eba6a13b9a155be69b903d3c51fc8507a7fc43e2b9145bb8e53826c78ef9645b8\
3e811913e365eb9daee0e732fd6fe3bc0fc9f6eebe1ca0297f7fc98f8b5075a28283cd3f17ff0482116aaa0b2e3d7d8bed976f4b\
82e50efb2627728cb76601819b4aa69b59eee86348b4d51540a8659da87f630468344c0212f393011345668fd0f4afbef1f36cf7\
b94815a688aa8754fa9a4d9942bc14de16e2fdf712b72a9b9e0e57bbe27df145f75a554806e24ee3c34bf76d1da247d653cca9507\
b3d13e43afbb94e5e8915559541d6a2d1822a9a5ba8d1157477fceb13687852e69d1a3cc40253727a7e0480fd303005059450cd381\
f8ea427887ac591665ff4266d3f5ae93e6c11c6553cb7220a695c04ebceac7369dc0fd8e1a68ab82b23099db224511ac9320e60a684\
b656f23cab924ec795662b9c3e94c65d2b076d856483a89c1f3f87a743c07acf6eb0824d1834abf14c50717343138d67d7e8171f4cac7\
71f71bc22a330dad96a62ac8a9354992d622c644def96b7c2e3ea550a2648383d3900faff5e74cad261998749faf42f595d24c3f342dd3c\
6b38078d35c9d5c339048b234a11f022c21d46194a0935a74cf366b9862389d2fe5203b9243ccee8b096ee3196411d49515f0f3384a906d91325a7f80'
filename = 'vibe-tree-users-final-8b8e336ea2366596964f1ed2c67c3039bc5cfe57e823db8e647835f1fee26040-1587028509211'

# TODO: Move to Factory wbrito 05/05/2020
""" $dag_filename$: vibe_to_lake---client--.py """ import logging import os from airflow import DAG from airflow.models import Variable from datetime import datetime from airflow.operators.dummy_operator import DummyOperator from libs import GCLOUD as gcloud from airflow.sensors.custom_sensors import DataflowJobStateSensor from airflow.operators.custom_operators import ScheduleDataflowJobOperator from libs import report_failure env = os.environ['ENV'] project = gcloud.project(env) airflow_vars = Variable.get("airflow_vars", deserialize_json=True) default_args = { 'owner': 'airflow', 'start_date': datetime(2020, 3, 8), 'on_failure_callback': report_failure, 'email_on_failure': False, 'email_on_retry': False } DAG_ID = 'vibe_to_lake---client--' def _marker(st): logging.info('********************************{}*****************************************'.format(st))
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.options.value_provider import RuntimeValueProvider
from libs import GCLOUD
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from transforms.io.filelist_iterator import FileListIteratorTransform
import dill

project_id = GCLOUD.project('local')


def test_a_serialized_file_list_is_deserialized_and_processed_by_insertion_order(
        cloudstorage):
    with TestPipeline() as p:
        bucket = f'{project_id}-wrench-imports'
        file_seed = """
entity_id,tree_user_id,prediction,owner,experiment_name,processing_datetime
1fab9649-5113-4daa-a9ed-ae67bc3358b8,1038665,low,e08dc822-97b1-46f4-9154-25821286231f,lcv_level,2020-01-31T14:52:40.000+05:30
c104b7ea-d32c-4c6e-92a5-505b3651d424,873612,zero,e08dc822-97b1-46f4-9154-25821286231f,lcv_level,2020-01-31T14:52:40.000+05:30
"""
        filename = 'wrench_test.csv'

        # Update sort_key based on the filename format
        def _sort_key(f):
            delimiter = '*'
            ts = f[f.rfind(delimiter) + 1:]
            return int(ts) if ts.isdigit() else f

        _sort_key = bytes.hex(dill.dumps(_sort_key))
        [b.delete() for b in cloudstorage.client.list_blobs(bucket)]
import os
import pytest
import cdc_from_gcs_to_lake
from airflow.models.taskinstance import TaskInstance
from datetime import datetime
from airflow.operators.python_operator import BranchPythonOperator, PythonOperator
from libs import GCLOUD as gcloud, CloudStorage
from airflow.models import Variable
from unittest.mock import patch
from unittest import mock
from google.cloud.storage.blob import Blob
from google.cloud.storage.bucket import Bucket
from google.cloud.storage.client import Client

airflow_vars = Variable.get('airflow_vars', deserialize_json=True)
project_id = gcloud.project(os.environ['ENV'])
bucket = f'{project_id}-cdc-imports'
processed_bucket = f'{project_id}-cdc-imports-processed'
FILE_NAME = 'vibe-commission-bonuses-final-8b8e336ea2366596964f1ed2c67c3039bc5cfe57e823db8e647835f1fee26040-1587028509211'
FILE_SEED = """
14d1bbba50e40234839420171eb87431:0c81720eff1215c298621670f689ac76a3300ce0320c3a3c1c381d5f356f9fa405d14a9deabd0757207776d12a76bc076e2d0baaa6a79a0cb66b0ec2ee78005f05722934b501e1cb083bfedcc319e41dc0a207e899fcb9558f6c8826e3cee6beb67a0d1a878e4a5e86bb7f0579c28bcde88539add19e7aea69c495a413d2dc37892162d68b75e6003db81846bb96bfb946ef3d387a2b116b92a5b609b4c4e3c8570139f804daa04b105feeac06845efda0dce5360809de73d4c7831c9e84c4974313ebe7ea807093e2f214379f4c5e8c805fa4004cfc2f1c8cbf23ad68145a3a
"""


def test_has_expected_task_count(load_dag):
    dag_bag = load_dag('cdc_from_gcs_to_lake')
    dag = dag_bag.get_dag('cdc_from_gcs_to_lake')
    assert len(dag.tasks) == 126


def test_continue_if_file_task(load_dag):
def create_dag():
    dag = DAG(
        DAG_ID,
        default_args=default_args,
        # Be sure to stagger the dags so they don't run all at once,
        # possibly causing max memory usage and pod failure. - Stu M.
        schedule_interval='30 * * * *',
        catchup=False)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')
        for table, sources in table_map.items():
            pusher_task_id = f'schedule_dataflow_{table}'
            parsed_table = gcloud.parse_table_name(table)
            get_checkpoint_task = GetCheckpointOperator(
                task_id=f'get_checkpoint_{table}',
                env=env,
                target=table,
                sources=sources)
            continue_if_data_task = BranchPythonOperator(
                task_id=f'continue_if_data_{table}',
                python_callable=should_continue,
                op_args=[table],
                provide_context=True)
            parse_query_task = PythonOperator(
                task_id=f'parse_query_{table}',
                python_callable=parse_query,
                op_args=[table],
                provide_context=True)
            dataflow_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=gcloud.project(env),
                template_name=f'load_lake_to_staging_{parsed_table}',
                job_name=f'lake-to-staging-{table}',
                job_parameters={'env': env},
                pull_parameters=[{
                    'param_name': 'query',
                    'task_id': f'parse_query_{table}'
                }],
                provide_context=True)
            monitor_dataflow_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                poke_interval=airflow_vars['dags']['lake_to_staging']
                ['poke_interval'],
                timeout=airflow_vars['dags']['lake_to_staging']
                ['poke_timeout'],
                dag=dag,
                pusher_task_id=pusher_task_id)
            set_checkpoint_task = SetCheckpointOperator(
                task_id=f'set_checkpoint_{table}',
                env=env,
                table=table)
            start_task.set_downstream(get_checkpoint_task)
            get_checkpoint_task.set_downstream(continue_if_data_task)
            continue_if_data_task.set_downstream(parse_query_task)
            parse_query_task.set_downstream(dataflow_task)
            dataflow_task.set_downstream(monitor_dataflow_task)
            monitor_dataflow_task.set_downstream(set_checkpoint_task)
            set_checkpoint_task.set_downstream(finish_task)
        start_task >> finish_task
    return dag
def test_query_from_table(env):
    project = gcloud.project(env['env'])
    table = 'staging.contacts'
    query = bq_to_wrench.build_query(table=table, project=project)
    assert query._builder['from'] == f'`{project}.{table}`'
def create_dag():
    dag = DAG(
        DAG_ID,
        catchup=False,
        default_args=default_args,
        schedule_interval='@hourly')
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')
        for table in get_airflow_vars()['dags'][DAG_ID]['tables']:
            table = table['name']
            parsed_table = gcloud.parse_table_name(table)
            get_checkpoint_task = GetCheckpointOperator(
                task_id='get_checkpoint_{}'.format(table),
                env=env,
                target=table,
                sources=[table])
            continue_if_data_task = BranchPythonOperator(
                task_id='continue_if_data_{}'.format(table),
                python_callable=continue_if_data,
                op_args=[table],
                trigger_rule='all_done',
                provide_context=True)
            clear_gcs_bucket_by_table_task = PythonOperator(
                task_id='clear_gcs_bucket_{}'.format(table),
                python_callable=clear_gcs_bucket_by_table,
                op_args=[env, table])
            parse_query_task = PythonOperator(
                task_id=f'parse_query_{table}',
                python_callable=parse_query,
                op_args=[table],
                provide_context=True)
            dataflow_task = ScheduleDataflowJobOperator(
                task_id=f'schedule_dataflow_{table}',
                project=gcloud.project(env),
                template_name='offload_bq_to_cs',
                job_name=f'bq-to-wrench-{parsed_table}',
                job_parameters={
                    'destination': 'gs://{}/{}/{}'.format(
                        gcs_bucket, table, f'bq-to-wrench-{parsed_table}')
                },
                pull_parameters=[{
                    'param_name': 'query',
                    'task_id': f'parse_query_{table}'
                }],
                provide_context=True)
            monitor_dataflow_task = DataflowJobStateSensor(
                task_id=f'monitor_dataflow_{table}',
                pusher_task_id=f'schedule_dataflow_{table}',
                poke_interval=get_airflow_vars()['dags'][DAG_ID]
                ['poke_interval'],
                timeout=get_airflow_vars()['dags'][DAG_ID]['poke_timeout'],
                dag=dag)
            gcs_to_wrench_s3_task = PythonOperator(
                task_id='gcs_to_wrench_s3_{}'.format(table),
                python_callable=gcs_to_wrench_s3,
                op_args=[env, table])
            commit_checkpoint_task = SetCheckpointOperator(
                task_id='commit_checkpoint_{}'.format(table),
                env=env,
                table=table)
            (start_task >> get_checkpoint_task >> continue_if_data_task >>
             clear_gcs_bucket_by_table_task >> parse_query_task >>
             dataflow_task >> monitor_dataflow_task >> gcs_to_wrench_s3_task >>
             commit_checkpoint_task >> finish_task)
    return dag
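# Airflow's scheduler only registers DAG objects bound at module scope, so a
# factory like create_dag() is typically invoked at the bottom of the file.
# How this repository does so is not shown in the excerpt above; a minimal
# sketch of the common pattern would be:
#
#     dag = create_dag()
#
# or, when the DAG id is templated (as in the vibe_to_lake---client-- module):
#
#     globals()[DAG_ID] = create_dag()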