Example #1
def env():
    return {
        'env': os.environ['ENV'],
        'project': gcloud.project(os.environ['ENV']),
        'user': gcloud.config()['username'],
        'client': 'bluesun'
    }
Example #2
    def process(self, unused_el):
        if isinstance(self._files_startwith, ValueProvider):
            self._files_startwith = self._files_startwith.get()
        if isinstance(self._files_ext, ValueProvider):
            self._files_ext = self._files_ext.get()
        if isinstance(self._sort_key, ValueProvider):
            self._sort_key = self._sort_key.get()
        if isinstance(self._env, ValueProvider):
            self._env = self._env.get()
        if isinstance(self._bucket, ValueProvider):
            self._bucket = self._bucket.get()

        project_id = GCLOUD.project(self._env)
        blobs = CloudStorage.factory(project_id).list_blobs(
            self._bucket, self._files_startwith)
        # Keep only files at the root of the bucket
        paths = [
            f'gs://{b.bucket.name}/{b.name}' for b in blobs
            if '/' not in b.name and self._files_ext in b.name
        ]
        if isinstance(self._sort_key, str):
            # The sort key arrives as a dill-serialized hex string; restore the callable.
            self._sort_key = dill.loads(bytes.fromhex(self._sort_key))
        if len(paths) > 1:
            paths.sort(key=self._sort_key)
        yield from paths

    def _run(cls, p, options):
        with p:
            project_id = GCLOUD.project(options.env)

            big_query_data = (p | 'Read BigQuery Data' >> bq.ReadAsJson(
                project=project_id,
                query=options.query,
                page_size=options.page_size)
                              | 'Transform Nested Datetimes' >> beam.ParDo(
                                  StringifyDatetimes()))

            staging_table = 'staging.orders'

            wv_p = (big_query_data | 'Apply World Ventures Transform' >>
                    WorldVenturesStagingOrdersTransform())

            bs_p = (big_query_data
                    | 'Apply Bluesun Transform' >> beam.ParDo(Bluesun()))

            ((wv_p, bs_p) | 'Merge Client Collections for Writing to BigQuery'
             >> beam.Flatten()
             | 'Prewrite Cleanup' >> beam.ParDo(PrewriteCleanup())
             | 'Write to staging' >> beam.io.WriteToBigQuery(
                 '{}:{}'.format(project_id, staging_table),
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 custom_gcs_temp_location=f'gs://{project_id}-dataflow/tmp'))
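
The sort-key handling in process() above relies on a dill round trip: the caller serializes a callable and hex-encodes it so it can travel as a plain string pipeline option, and process() restores it with dill.loads(bytes.fromhex(...)) before sorting (the test in Example #12 below shows the encoding side). A minimal standalone sketch of that round trip, assuming only the dill package; the sample key that orders paths by a trailing numeric suffix is hypothetical:

import dill


def sample_sort_key(path):
    # Hypothetical key: order paths by a trailing "-<number>" suffix.
    suffix = path[path.rfind('-') + 1:]
    return int(suffix) if suffix.isdigit() else 0


# Encode the callable as a hex string, the form a string-typed option can carry...
encoded = bytes.hex(dill.dumps(sample_sort_key))
# ...and restore it before sorting, as process() does.
restored = dill.loads(bytes.fromhex(encoded))

paths = ['gs://bucket/export-20', 'gs://bucket/export-3']
paths.sort(key=restored)
assert paths == ['gs://bucket/export-3', 'gs://bucket/export-20']
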
Example #4
    def _run(cls, p, options):
        crypto = Crypto(env=options.env, key='vibe-cdc')
        keys = list(
            crypto.list_secret_keys(
                client=secretmanager.SecretManagerServiceClient()))
        with p:
            project_id = gcloud.project(options.env)
            bucket = f'{project_id}-cdc-imports'
            (p
             | 'Iterate File Paths' >> FileListIteratorTransform(
                 env=options.env,
                 bucket=bucket,
                 files_startwith=options.files_startwith,
                 sort_key=options.sort_key)
             | 'Read from a File' >> beam.io.ReadAllFromText()
             | 'Apply Decryption Transform' >> beam.ParDo(
                 RecordDecryption(env=options.env, keys=keys, crypto=crypto))
             | 'Insert Ingestion Timestamp' >> beam.ParDo(
                 InsertIngestionTimestamp())
             | 'Ingest table schema' >> beam.ParDo(
                 bq.IngectTableSchema(options.dest))
             | 'Transform String to Standard SQL Datetime' >> beam.ParDo(
                 StringToDatetime())
             | 'Transform Nested Datetimes' >> beam.ParDo(
                 StringifyDatetimes('%Y-%m-%d %H:%M:%S'))
             | 'Write to Bigquery' >> beam.io.WriteToBigQuery(
                 options.dest,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER))
Example #5
    def _run(cls, p, options):
        with p:
            project_id = GCLOUD.project(options.env)
            bucket = f'{project_id}-wrench-imports'
            dest_tbl = f'{project_id}:lake.wrench_metrics'
            (p
             | 'Iterate File Paths' >> FileListIteratorTransform(
                 env=options.env,
                 bucket=bucket,
                 files_ext=options.files_ext,
                 sort_key=options.sort_key)
             | 'Read from a File' >> beam.io.ReadAllFromText(skip_header_lines=1)
             | 'Apply Wrench Transform' >> beam.ParDo(WrenchCSVPayloadMapper())
             | 'Insert Ingestion Timestamp' >> beam.ParDo(
                 InsertIngestionTimestamp())
             | 'Ingest table schema' >> beam.ParDo(
                 bq.IngectTableSchema(table=dest_tbl))
             | 'Transform String to Standard SQL Datetime' >> beam.ParDo(
                 StringToDatetime())
             | 'Transform Nested Datetimes' >> beam.ParDo(
                 StringifyDatetimes('%Y-%m-%d %H:%M:%S'))
             | 'Write to lake' >> beam.io.WriteToBigQuery(
                 dest_tbl,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER))
Example #6
    def start_bundle(self):
        self._ml_config = Config.get_config(gcloud.env(self._env.get()))
        self._client_name = self._client.get()
        self._client_config = self._ml_config['cloudsql'][self._client_name]
        if self._client_config['host'] != 'mysql':
            proxy_connect(
                self._ml_config['cloudsql'][self._client.get()]['host'])

    def _run(cls, p, options):
        with p:
            project_id = GCLOUD.project(options.env)

            big_query_data = (p | 'Read BigQuery Data' >> bq.ReadAsJson(
                project=project_id,
                query=options.query,
                page_size=options.page_size)
                              | 'Insert Ingestion Timestamp' >> beam.ParDo(
                                  InsertIngestionTimestamp())
                              | 'Transform Nested Datetimes' >> beam.ParDo(
                                  StringifyDatetimes()))

            staging_table = 'staging.users'

            wv_p = (big_query_data | 'Apply World Ventures Transform' >>
                    WorldVenturesStagingUsersTransform())

            bs_p = (big_query_data
                    | 'Apply Bluesun Transform' >> beam.ParDo(Bluesun()))

            results = (
                (wv_p, bs_p)
                | 'Merge Client Collections for Writing to BigQuery' >>
                beam.Flatten()
                | 'Write to staging' >> beam.io.WriteToBigQuery(
                    '{}:{}'.format(project_id, staging_table),
                    insert_retry_strategy='RETRY_NEVER',
                    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                    create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER
                ))
            print(results)
Example #8
    def _run(cls, p, options):
        project_id = gcloud.project(options.env)
        with p:
            (p
             | 'Read BigQuery Data' >> bq.ReadAsJson(
                 project=project_id,
                 query=options.query,
                 page_size=options.page_size)
             | 'Transform Nested Datetimes' >> beam.ParDo(StringifyDatetimes())
             | 'Convert None values to empty strings' >> beam.ParDo(
                 ConvertNoneToEmptyString())
             | 'Convert Bool values to ints' >> beam.ParDo(ConvertBoolsToInts())
             | 'Convert dict elements to json' >> beam.Map(json.dumps)
             | 'Write file to Cloud Storage' >> beam.io.WriteToText(
                 options.destination,
                 file_name_suffix='.ndjson.gz',
                 compression_type=beam.io.filesystem.CompressionTypes.GZIP))
Example #9
import os
import logging
from libs import GCLOUD as gcloud, CloudStorage
from airflow import DAG
from datetime import datetime, timedelta
from airflow.models import Variable
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.sensors.custom_sensors import DataflowJobStateSensor
from airflow.operators.custom_operators import ScheduleDataflowJobOperator
from libs import report_failure

log = logging.getLogger()
airflow_vars = Variable.get('airflow_vars', deserialize_json=True)
env = os.environ['ENV']
project_id = gcloud.project(env)
bucket = f'{project_id}-cdc-imports'
processed_bucket = f'{project_id}-cdc-imports-processed'

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2020, 3, 8),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'on_failure_callback': report_failure,
    'email_on_failure': False,
    'email_on_retry': False
}


def list_blobs(bucket, files_startswith):
import pytest
import apache_beam as beam
from libs import GCLOUD as gcloud, Crypto
import datetime
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from load_cdc_from_gcs_to_lake import RecordDecryption, Runner, RuntimeOptions

project_id = gcloud.project('local')
bucket = f'{project_id}-cdc-imports'
# TODO: Move test data to Factory wbrito 05/05/2020
file_seed = 'eee7cb05d80b8a58bccff2f4290b49ff:b42e122e33a81939c5810d44ce92830e431ea45689\
f0aa7ba6c8cd04f2049bfa2fa92283c6e7fb12458a8e910873a2322e8f2d18f6ae7f728f715a4c633b73328d12b0260\
f6b49b84e46c127c58acf71e81b5847318f5eea3293c304ea38090de987c24debf2f8c282920ac82bfc5bd473a9002c9\
162bfa08f1eb309e46dc7e7ca1964334620c8581f028228e8b3edc4233c0dca5edf7aea2a00a96ef272c5061f9b47f2e3\
edf5a52a26fa6f5976e3a12c27c4a0cd7ef8ec401cd8755ac4566f674aa3a33a22111e55bd7d76aa32e8911612971e1aa0\
7ad09e21323be4d47972d94666ecb48271f8fe44be7b6b2b2508fcffdf34c0e7a9cec72aee07252469c4dacc060a4dadedae\
4fd28679e047f2b6394fb668e021e79fe0f414e2a15a0a652f3f628431df1463da2d61d0f1211a121c5ea715d3ac3fe22cd11b\
97933c3f337b8224261caf321f667a6c894b09eba6a13b9a155be69b903d3c51fc8507a7fc43e2b9145bb8e53826c78ef9645b8\
3e811913e365eb9daee0e732fd6fe3bc0fc9f6eebe1ca0297f7fc98f8b5075a28283cd3f17ff0482116aaa0b2e3d7d8bed976f4b\
82e50efb2627728cb76601819b4aa69b59eee86348b4d51540a8659da87f630468344c0212f393011345668fd0f4afbef1f36cf7\
b94815a688aa8754fa9a4d9942bc14de16e2fdf712b72a9b9e0e57bbe27df145f75a554806e24ee3c34bf76d1da247d653cca9507\
b3d13e43afbb94e5e8915559541d6a2d1822a9a5ba8d1157477fceb13687852e69d1a3cc40253727a7e0480fd303005059450cd381\
f8ea427887ac591665ff4266d3f5ae93e6c11c6553cb7220a695c04ebceac7369dc0fd8e1a68ab82b23099db224511ac9320e60a684\
b656f23cab924ec795662b9c3e94c65d2b076d856483a89c1f3f87a743c07acf6eb0824d1834abf14c50717343138d67d7e8171f4cac7\
71f71bc22a330dad96a62ac8a9354992d622c644def96b7c2e3ea550a2648383d3900faff5e74cad261998749faf42f595d24c3f342dd3c\
6b38078d35c9d5c339048b234a11f022c21d46194a0935a74cf366b9862389d2fe5203b9243ccee8b096ee3196411d49515f0f3384a906d91325a7f80'

filename = 'vibe-tree-users-final-8b8e336ea2366596964f1ed2c67c3039bc5cfe57e823db8e647835f1fee26040-1587028509211'
# TODO: Move to Factory wbrito 05/05/2020
Example #11
"""
$dag_filename$: vibe_to_lake---client--.py
"""
import logging
import os
from airflow import DAG
from airflow.models import Variable
from datetime import datetime
from airflow.operators.dummy_operator import DummyOperator
from libs import GCLOUD as gcloud
from airflow.sensors.custom_sensors import DataflowJobStateSensor
from airflow.operators.custom_operators import ScheduleDataflowJobOperator
from libs import report_failure

env = os.environ['ENV']
project = gcloud.project(env)
airflow_vars = Variable.get("airflow_vars", deserialize_json=True)

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2020, 3, 8),
    'on_failure_callback': report_failure,
    'email_on_failure': False,
    'email_on_retry': False
}

DAG_ID = 'vibe_to_lake---client--'


def _marker(st):
    logging.info('********************************{}*****************************************'.format(st))
Example #12
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.options.value_provider import RuntimeValueProvider
from libs import GCLOUD
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from transforms.io.filelist_iterator import FileListIteratorTransform
import dill

project_id = GCLOUD.project('local')


def test_a_serialized_file_list_is_deserialized_and_processed_by_insertion_order(
        cloudstorage):
    with TestPipeline() as p:
        bucket = f'{project_id}-wrench-imports'
        file_seed = """
        entity_id,tree_user_id,prediction,owner,experiment_name,processing_datetime
        1fab9649-5113-4daa-a9ed-ae67bc3358b8,1038665,low,e08dc822-97b1-46f4-9154-25821286231f,lcv_level,2020-01-31T14:52:40.000+05:30
        c104b7ea-d32c-4c6e-92a5-505b3651d424,873612,zero,e08dc822-97b1-46f4-9154-25821286231f,lcv_level,2020-01-31T14:52:40.000+05:30
        """
        filename = 'wrench_test.csv'

        # Update sort_key based on the filename format
        def _sort_key(f):
            delimiter = '*'
            ts = f[f.rfind(delimiter) + 1:]
            return int(ts) if ts.isdigit() else f

        _sort_key = bytes.hex(dill.dumps(_sort_key))

        # Clear any existing blobs from the test bucket.
        for b in cloudstorage.client.list_blobs(bucket):
            b.delete()
import os
import pytest
import cdc_from_gcs_to_lake
from airflow.models.taskinstance import TaskInstance
from datetime import datetime
from airflow.operators.python_operator import BranchPythonOperator, PythonOperator
from libs import GCLOUD as gcloud, CloudStorage
from airflow.models import Variable
from unittest.mock import patch
from unittest import mock
from google.cloud.storage.blob import Blob
from google.cloud.storage.bucket import Bucket
from google.cloud.storage.client import Client

airflow_vars = Variable.get('airflow_vars', deserialize_json=True)
project_id = gcloud.project(os.environ['ENV'])
bucket = f'{project_id}-cdc-imports'
processed_bucket = f'{project_id}-cdc-imports-processed'
FILE_NAME = 'vibe-commission-bonuses-final-8b8e336ea2366596964f1ed2c67c3039bc5cfe57e823db8e647835f1fee26040-1587028509211'
FILE_SEED = """
14d1bbba50e40234839420171eb87431:0c81720eff1215c298621670f689ac76a3300ce0320c3a3c1c381d5f356f9fa405d14a9deabd0757207776d12a76bc076e2d0baaa6a79a0cb66b0ec2ee78005f05722934b501e1cb083bfedcc319e41dc0a207e899fcb9558f6c8826e3cee6beb67a0d1a878e4a5e86bb7f0579c28bcde88539add19e7aea69c495a413d2dc37892162d68b75e6003db81846bb96bfb946ef3d387a2b116b92a5b609b4c4e3c8570139f804daa04b105feeac06845efda0dce5360809de73d4c7831c9e84c4974313ebe7ea807093e2f214379f4c5e8c805fa4004cfc2f1c8cbf23ad68145a3a
"""


def test_has_expected_task_count(load_dag):
    dag_bag = load_dag('cdc_from_gcs_to_lake')
    dag = dag_bag.get_dag('cdc_from_gcs_to_lake')
    assert len(dag.tasks) == 126


def test_continue_if_file_task(load_dag):
Example #14
def create_dag():
    dag = DAG(
        DAG_ID,
        default_args=default_args,
        # Be sure to stagger the dags so they don't run all at once,
        # possibly causing max memory usage and pod failure. - Stu M.
        schedule_interval='30 * * * *',
        catchup=False)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')

        for table, sources in table_map.items():
            pusher_task_id = f'schedule_dataflow_{table}'
            parsed_table = gcloud.parse_table_name(table)

            get_checkpoint_task = GetCheckpointOperator(
                task_id=f'get_checkpoint_{table}',
                env=env,
                target=table,
                sources=sources)

            continue_if_data_task = BranchPythonOperator(
                task_id=f'continue_if_data_{table}',
                python_callable=should_continue,
                op_args=[table],
                provide_context=True)

            parse_query_task = PythonOperator(task_id=f'parse_query_{table}',
                                              python_callable=parse_query,
                                              op_args=[table],
                                              provide_context=True)

            dataflow_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=gcloud.project(env),
                template_name=f'load_lake_to_staging_{parsed_table}',
                job_name=f'lake-to-staging-{table}',
                job_parameters={'env': env},
                pull_parameters=[{
                    'param_name': 'query',
                    'task_id': f'parse_query_{table}'
                }],
                provide_context=True)

            monitor_dataflow_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                poke_interval=airflow_vars['dags']['lake_to_staging']
                ['poke_interval'],
                timeout=airflow_vars['dags']['lake_to_staging']
                ['poke_timeout'],
                dag=dag,
                pusher_task_id=pusher_task_id)

            set_checkpoint_task = SetCheckpointOperator(
                task_id=f'set_checkpoint_{table}', env=env, table=table)

            start_task.set_downstream(get_checkpoint_task)
            get_checkpoint_task.set_downstream(continue_if_data_task)
            continue_if_data_task.set_downstream(parse_query_task)
            parse_query_task.set_downstream(dataflow_task)
            dataflow_task.set_downstream(monitor_dataflow_task)
            monitor_dataflow_task.set_downstream(set_checkpoint_task)
            set_checkpoint_task.set_downstream(finish_task)

        start_task >> finish_task
    return dag
Example #15
def test_query_from_table(env):
    project = gcloud.project(env['env'])
    table = 'staging.contacts'
    query = bq_to_wrench.build_query(table=table, project=project)
    assert query._builder['from'] == f'`{project}.{table}`'
Example #16
def create_dag():
    dag = DAG(DAG_ID,
              catchup=False,
              default_args=default_args,
              schedule_interval='@hourly')
    with dag:
        start_task = DummyOperator(task_id='start')

        finish_task = DummyOperator(task_id='finish')

        for table in get_airflow_vars()['dags'][DAG_ID]['tables']:
            table = table['name']
            parsed_table = gcloud.parse_table_name(table)

            get_checkpoint_task = GetCheckpointOperator(
                task_id='get_checkpoint_{}'.format(table),
                env=env,
                target=table,
                sources=[table])

            continue_if_data_task = BranchPythonOperator(
                task_id='continue_if_data_{}'.format(table),
                python_callable=continue_if_data,
                op_args=[table],
                trigger_rule='all_done',
                provide_context=True)

            clear_gcs_bucket_by_table_task = PythonOperator(
                task_id='clear_gcs_bucket_{}'.format(table),
                python_callable=clear_gcs_bucket_by_table,
                op_args=[env, table])

            parse_query_task = PythonOperator(task_id=f'parse_query_{table}',
                                              python_callable=parse_query,
                                              op_args=[table],
                                              provide_context=True)

            dataflow_task = ScheduleDataflowJobOperator(
                task_id=f'schedule_dataflow_{table}',
                project=gcloud.project(env),
                template_name='offload_bq_to_cs',
                job_name=f'bq-to-wrench-{parsed_table}',
                job_parameters={
                    'destination':
                    'gs://{}/{}/{}'.format(gcs_bucket, table,
                                           f'bq-to-wrench-{parsed_table}')
                },
                pull_parameters=[{
                    'param_name': 'query',
                    'task_id': f'parse_query_{table}'
                }],
                provide_context=True)

            monitor_dataflow_task = DataflowJobStateSensor(
                task_id=f'monitor_dataflow_{table}',
                pusher_task_id=f'schedule_dataflow_{table}',
                poke_interval=get_airflow_vars()['dags'][DAG_ID]
                ['poke_interval'],
                timeout=get_airflow_vars()['dags'][DAG_ID]['poke_timeout'],
                dag=dag)

            gcs_to_wrench_s3_task = PythonOperator(
                task_id='gcs_to_wrench_s3_{}'.format(table),
                python_callable=gcs_to_wrench_s3,
                op_args=[env, table])

            commit_checkpoint_task = SetCheckpointOperator(
                task_id='commit_checkpoint_{}'.format(table),
                env=env,
                table=table)

            (start_task >> get_checkpoint_task >> continue_if_data_task >>
             clear_gcs_bucket_by_table_task >> parse_query_task >>
             dataflow_task >> monitor_dataflow_task >> gcs_to_wrench_s3_task >>
             commit_checkpoint_task >> finish_task)
    return dag