Example No. 1
        'source': {
            'image_uri': GCP_VISION_ANNOTATE_IMAGE_URL
        }
    },
    'features': [{
        'type': enums.Feature.Type.LOGO_DETECTION
    }],
}
# [END howto_operator_vision_annotate_image_request]

# [START howto_operator_vision_detect_image_param]
DETECT_IMAGE = {"source": {"image_uri": GCP_VISION_ANNOTATE_IMAGE_URL}}
# [END howto_operator_vision_detect_image_param]

with models.DAG('example_gcp_vision_autogenerated_id',
                default_args=default_args,
                schedule_interval=None) as dag_autogenerated_id:
    # ################################## #
    # ### Autogenerated IDs examples ### #
    # ################################## #

    # [START howto_operator_vision_product_set_create]
    product_set_create = CloudVisionCreateProductSetOperator(
        location=GCP_VISION_LOCATION,
        product_set=product_set,
        retry=Retry(maximum=10.0),
        timeout=5,
        task_id='product_set_create',
    )
    # [END howto_operator_vision_product_set_create]
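
    # Hedged follow-up sketch (not part of the original snippet): the create
    # operator above pushes the autogenerated product-set ID to XCom, so a
    # downstream task can pull it. CloudVisionGetProductSetOperator is assumed
    # to be importable from airflow.providers.google.cloud.operators.vision
    # alongside the create operator used above.
    product_set_create_output = "{{ task_instance.xcom_pull('product_set_create') }}"

    product_set_get = CloudVisionGetProductSetOperator(
        location=GCP_VISION_LOCATION,
        product_set_id=product_set_create_output,
        task_id='product_set_get',
    )

    product_set_create >> product_set_get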
Example No. 2
# Example dataset
DATASET = {
    "display_name": "test_video_dataset",
    "video_classification_dataset_metadata": {},
}

IMPORT_INPUT_CONFIG = {"gcs_source": {"input_uris": [GCP_AUTOML_VIDEO_BUCKET]}}

default_args = {"start_date": days_ago(1)}
extract_object_id = CloudAutoMLHook.extract_object_id

# Example DAG for AutoML Video Intelligence Classification
with models.DAG(
        "example_automl_video",
        default_args=default_args,
        schedule_interval=None,  # Override to match your needs
        user_defined_macros={"extract_object_id": extract_object_id},
        tags=['example'],
) as example_dag:
    create_dataset_task = AutoMLCreateDatasetOperator(
        task_id="create_dataset_task",
        dataset=DATASET,
        location=GCP_AUTOML_LOCATION)

    dataset_id = (
        '{{ task_instance.xcom_pull("create_dataset_task", key="dataset_id") }}'
    )
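    # The Jinja expression above is rendered at run time, so dataset_id
    # resolves to the value that create_dataset_task pushed to XCom under the
    # "dataset_id" key.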

    import_dataset_task = AutoMLImportDataOperator(
        task_id="import_dataset_task",
        dataset_id=dataset_id,
Example No. 3
# under the License.

import os

from airflow import models
from airflow.providers.google.cloud.transfers.sheets_to_gcs import GoogleSheetsToGCSOperator
from airflow.providers.google.suite.transfers.gcs_to_sheets import GCSToGoogleSheetsOperator
from airflow.utils.dates import days_ago

BUCKET = os.environ.get("GCP_GCS_BUCKET", "example-test-bucket3")
SPREADSHEET_ID = os.environ.get("SPREADSHEET_ID", "example-spreadsheetID")
NEW_SPREADSHEET_ID = os.environ.get("NEW_SPREADSHEET_ID", "1234567890qwerty")

with models.DAG(
    "example_gcs_to_sheets",
    start_date=days_ago(1),
    schedule_interval='@once',  # Override to match your needs
    tags=["example"],
) as dag:

    upload_sheet_to_gcs = GoogleSheetsToGCSOperator(
        task_id="upload_sheet_to_gcs",
        destination_bucket=BUCKET,
        spreadsheet_id=SPREADSHEET_ID,
    )

    # [START upload_gcs_to_sheets]
    upload_gcs_to_sheet = GCSToGoogleSheetsOperator(
        task_id="upload_gcs_to_sheet",
        bucket_name=BUCKET,
        object_name="{{ task_instance.xcom_pull('upload_sheet_to_gcs')[0] }}",
        spreadsheet_id=NEW_SPREADSHEET_ID,
Example No. 4
    'owner': '*****@*****.**',
    'start_date': datetime.datetime(2019, 5, 12),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'depends_on_past': False,
    # If a task fails, retry it once after waiting at least 10 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=10),
}

dag_name = 'kpi_dashboard'

with models.DAG(
        dag_name,
        # KPI dashboard refreshes at 16:00 UTC, so run this 15 minutes beforehand.
        schedule_interval='45 15 * * *',
        default_args=default_args) as dag:

    kpi_dashboard = bigquery_etl_query(
        destination_table='firefox_kpi_dashboard_v1',
        dataset_id='telemetry',
        date_partition_parameter=None,
        email=[
            '*****@*****.**', '*****@*****.**',
            '*****@*****.**', '*****@*****.**',
            '*****@*****.**'
        ])

    smoot_usage_new_profiles_v2 = bigquery_etl_query(
        task_id='smoot_usage_new_profiles_v2',
Example No. 5
}

PARAMETERS = {"dataRange": "LAST_14_DAYS", "timezoneCode": "America/New_York"}

CREATE_SDF_DOWNLOAD_TASK_BODY_REQUEST: Dict = {
    "version": SDF_VERSION,
    "advertiserId": ADVERTISER_ID,
    "inventorySourceFilter": {"inventorySourceIds": []},
}

DOWNLOAD_LINE_ITEMS_REQUEST: Dict = {"filterType": ADVERTISER_ID, "format": "CSV", "fileSpec": "EWF"}
# [END howto_display_video_env_variables]

with models.DAG(
    "example_display_video",
    schedule_interval='@once',  # Override to match your needs
    start_date=dates.days_ago(1),
) as dag1:
    # [START howto_google_display_video_createquery_report_operator]
    create_report = GoogleDisplayVideo360CreateReportOperator(body=REPORT, task_id="create_report")
    report_id = create_report.output["report_id"]
    # [END howto_google_display_video_createquery_report_operator]
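    # Note: create_report.output is an XComArg wrapping the operator's XCom
    # return value; indexing it with "report_id" references the report ID the
    # task pushes, so using report_id below also adds an implicit dependency
    # on create_report.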

    # [START howto_google_display_video_runquery_report_operator]
    run_report = GoogleDisplayVideo360RunReportOperator(
        report_id=report_id, parameters=PARAMETERS, task_id="run_report"
    )
    # [END howto_google_display_video_runquery_report_operator]

    # [START howto_google_display_video_wait_report_operator]
    wait_for_report = GoogleDisplayVideo360ReportSensor(task_id="wait_for_report", report_id=report_id)
Example No. 6
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import os
from datetime import datetime

from airflow import models
from airflow.providers.amazon.aws.transfers.sftp_to_s3 import SFTPToS3Operator

S3_BUCKET = os.environ.get("S3_BUCKET", "test-bucket")
S3_KEY = os.environ.get("S3_KEY", "key")

with models.DAG(
        "example_sftp_to_s3",
        schedule_interval=None,
        start_date=datetime(2021, 1, 1),
        catchup=False,
) as dag:
    # [START howto_transfer_sftp_to_s3]
    create_sftp_to_s3_job = SFTPToS3Operator(
        task_id="create_sftp_to_s3_job",
        sftp_path="/tmp/sftp_path",
        s3_bucket=S3_BUCKET,
        s3_key=S3_KEY,
    )
    # [END howto_transfer_sftp_to_s3]
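
    # Illustrative variant (not in the original example), assuming the
    # standard sftp_conn_id / s3_conn_id parameters of SFTPToS3Operator; the
    # connection IDs below are placeholders for connections defined in your
    # environment.
    create_sftp_to_s3_job_explicit_conns = SFTPToS3Operator(
        task_id="create_sftp_to_s3_job_explicit_conns",
        sftp_conn_id="sftp_default",
        s3_conn_id="aws_default",
        sftp_path="/tmp/sftp_path",
        s3_bucket=S3_BUCKET,
        s3_key=S3_KEY,
    )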
Example No. 7
    "location={location}&" \
    "instance={instance}&" \
    "use_proxy=False&" \
    "use_ssl=True&" \
    "sslcert={client_cert_file}&" \
    "sslkey={client_key_file}&" \
    "sslrootcert={server_ca_file}".format(**mysql_kwargs)

# [END howto_operator_cloudsql_query_connections]

# [START howto_operator_cloudsql_query_operators]

connection_names = [
    "proxy_postgres_tcp", "proxy_postgres_socket", "public_postgres_tcp",
    "public_postgres_tcp_ssl", "proxy_mysql_tcp", "proxy_mysql_socket",
    "public_mysql_tcp", "public_mysql_tcp_ssl"
]

tasks = []

with models.DAG(dag_id='example_gcp_sql_query',
                default_args=default_args,
                schedule_interval=None) as dag:
    for connection_name in connection_names:
        tasks.append(
            CloudSqlQueryOperator(gcp_cloudsql_conn_id=connection_name,
                                  task_id="example_gcp_sql_task_" +
                                  connection_name,
                                  sql=SQL))
# [END howto_operator_cloudsql_query_operators]
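
# Hedged follow-up (not in the original snippet): the loop above creates the
# query tasks without dependencies, so they run in parallel; to run them
# serially instead, consecutive tasks can be chained like this.
for first_task, second_task in zip(tasks, tasks[1:]):
    first_task >> second_task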
Example No. 8
from utils.amplitude import export_to_amplitude

default_args = {
    'owner': '*****@*****.**',
    'start_date': datetime.datetime(2019, 6, 27),
    'email': ['*****@*****.**', '*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': datetime.timedelta(minutes=10),
}

dag_name = 'bq_events_to_amplitude'

with models.DAG(
        dag_name,
        default_args=default_args,
        schedule_interval='0 1 * * *') as dag:

    fenix_task_id = 'fenix_amplitude_export'
    SubDagOperator(
        subdag=export_to_amplitude(
            dag_name=fenix_task_id,
            parent_dag_name=dag_name,
            default_args=default_args,
            project='moz-fx-data-shared-prod',
            dataset='telemetry',
            table_or_view='fenix_events_v1',
            s3_prefix='fenix',
        ),
        task_id=fenix_task_id
    )
Example No. 9
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Example Airflow DAG that shows a complex DAG structure.
"""
from datetime import datetime

from airflow import models
from airflow.models.baseoperator import chain
from airflow.operators.bash import BashOperator

with models.DAG(
        dag_id="example_complex",
        schedule_interval=None,
        start_date=datetime(2021, 1, 1),
        catchup=False,
        tags=['example', 'example2', 'example3'],
) as dag:
    # Create
    create_entry_group = BashOperator(task_id="create_entry_group",
                                      bash_command="echo create_entry_group")

    create_entry_group_result = BashOperator(
        task_id="create_entry_group_result",
        bash_command="echo create_entry_group_result")

    create_entry_group_result2 = BashOperator(
        task_id="create_entry_group_result2",
        bash_command="echo create_entry_group_result2")
Example No. 10
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import os

from airflow import models
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
from airflow.utils import dates

# [START howto_gcs_environment_variables]
BUCKET_NAME = os.environ.get('GCP_GCS_BUCKET', 'example-bucket-name')
PATH_TO_UPLOAD_FILE = os.environ.get('GCP_GCS_PATH_TO_UPLOAD_FILE',
                                     'example-text.txt')
DESTINATION_FILE_LOCATION = os.environ.get('GCP_GCS_DESTINATION_FILE_LOCATION',
                                           'example-text.txt')
# [END howto_gcs_environment_variables]

with models.DAG('example_local_to_gcs',
                default_args=dict(start_date=dates.days_ago(1)),
                schedule_interval=None,
                tags=['example']) as dag:
    # [START howto_operator_local_filesystem_to_gcs]
    upload_file = LocalFilesystemToGCSOperator(
        task_id="upload_file",
        src=PATH_TO_UPLOAD_FILE,
        dst=DESTINATION_FILE_LOCATION,
        bucket=BUCKET_NAME,
    )
    # [END howto_operator_local_filesystem_to_gcs]
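
    # Hypothetical variant (not in the original example): upload the file
    # compressed. The gzip and mime_type parameters are assumed to be
    # supported by this provider version of LocalFilesystemToGCSOperator.
    upload_file_gzipped = LocalFilesystemToGCSOperator(
        task_id="upload_file_gzipped",
        src=PATH_TO_UPLOAD_FILE,
        dst=DESTINATION_FILE_LOCATION,
        bucket=BUCKET_NAME,
        gzip=True,
        mime_type="text/plain",
    )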
Example No. 11
    },
}

IMPORT_INPUT_CONFIG = {
    "gcs_source": {
        "input_uris": [GCP_AUTOML_TEXT_CLS_BUCKET]
    }
}

default_args = {"start_date": days_ago(1)}
extract_object_id = CloudAutoMLHook.extract_object_id

# Example DAG for AutoML Natural Language Text Classification
with models.DAG(
        "example_automl_text_cls",
        default_args=default_args,
        schedule_interval=None,  # Override to match your needs
        tags=['example'],
) as example_dag:
    create_dataset_task = AutoMLCreateDatasetOperator(
        task_id="create_dataset_task",
        dataset=DATASET,
        location=GCP_AUTOML_LOCATION)

    dataset_id = (
        '{{ task_instance.xcom_pull("create_dataset_task", key="dataset_id") }}'
    )

    import_dataset_task = AutoMLImportDataOperator(
        task_id="import_dataset_task",
        dataset_id=dataset_id,
        location=GCP_AUTOML_LOCATION,
Example No. 12
    "proxy_postgres_tcp",
    "proxy_postgres_socket",
    "public_postgres_tcp",
    "public_postgres_tcp_ssl",
    "proxy_mysql_tcp",
    "proxy_mysql_socket",
    "public_mysql_tcp",
    "public_mysql_tcp_ssl",
    "public_mysql_tcp_ssl_no_project_id",
]

tasks = []

with models.DAG(
        dag_id='example_gcp_sql_query',
        schedule_interval=None,
        start_date=days_ago(1),
        tags=['example'],
) as dag:
    prev_task = None

    for connection_name in connection_names:
        task = CloudSQLExecuteQueryOperator(
            gcp_cloudsql_conn_id=connection_name,
            task_id="example_gcp_sql_task_" + connection_name,
            sql=SQL)
        tasks.append(task)
        if prev_task:
            prev_task >> task
        prev_task = task

# [END howto_operator_cloudsql_query_operators]
Example No. 13
    # Setting start date as yesterday starts the DAG immediately when it is
    # detected in the Cloud Storage bucket.
    'start_date': yesterday,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # If a task fails, retry it once after waiting at least 5 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG(
        'composer_sample_quickstart',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
Example No. 14
    """
    Returns the spec of the column whose display name matches column_name.
    """
    for column in columns_specs:
        if column["displayName"] == column_name:
            return extract_object_id(column)
    return ""


# Example DAG to create a dataset, train a model, and deploy it.
with models.DAG(
        "example_create_and_deploy",
        default_args=default_args,
        schedule_interval=None,  # Override to match your needs
        user_defined_macros={
            "get_target_column_spec": get_target_column_spec,
            "target": TARGET,
            "extract_object_id": extract_object_id,
        },
        tags=['example'],
) as create_deploy_dag:
    # [START howto_operator_automl_create_dataset]
    create_dataset_task = AutoMLCreateDatasetOperator(
        task_id="create_dataset_task",
        dataset=DATASET,
        location=GCP_AUTOML_LOCATION,
        project_id=GCP_PROJECT_ID,
    )

    dataset_id = (
        "{{ task_instance.xcom_pull('create_dataset_task', key='dataset_id') }}"
Example No. 15
default_args = {
    "owner": "*****@*****.**",
    "start_date": datetime.datetime(2019, 7, 25),
    "email": ["*****@*****.**", "*****@*****.**"],
    "email_on_failure": True,
    "email_on_retry": True,
    "depends_on_past": False,
    # If a task fails, retry it once after waiting at least 5 minutes
    "retries": 1,
    "retry_delay": datetime.timedelta(minutes=5),
}

dag_name = "copy_deduplicate"

with models.DAG(dag_name,
                schedule_interval="0 1 * * *",
                default_args=default_args) as dag:

    # This single task is responsible for sequentially running copy queries
    # over all the tables in _live datasets into _stable datasets except those
    # that are specifically used in another DAG.
    copy_deduplicate_all = bigquery_etl_copy_deduplicate(
        task_id="copy_deduplicate_all",
        target_project_id="moz-fx-data-shared-prod",
        # Any table listed here under except_tables _must_ have a corresponding
        # copy_deduplicate job in another DAG.
        except_tables=["telemetry_live.main_v4"])

    # Events.

    event_events = bigquery_etl_query(
Example No. 16
DF_TEMP = models.Variable.get('df_temp_location')
COMPOSER_BUCKET_NAME = models.Variable.get('COMPOSER_BUCKET_NAME')

#[START dag_predict_serve]
default_dag_args = {
    'start_date': datetime.datetime(2050, 1, 1),
    'schedule_interval': None,
    'provide_context': True,
    'dataflow_default_options': {
        'project': PROJECT,
        'zone': DF_ZONE,
        'tempLocation': DF_TEMP
    }
}

dag = models.DAG('predict_serve', default_args=default_dag_args)
#[END dag_predict_serve]

#
# Runs prediction.
#

job_id = 'clv-{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M'))


def do_predict_clv(**kwargs):
    """ Runs a batch prediction on new data and saving the results as CSV into
    output_path.
    """
    gcs_prediction_intput = 'gs://{}/predictions/to_predict.csv'.format(
        COMPOSER_BUCKET_NAME)
Example No. 17
DATASET_NAME = os.environ.get("GCP_BIGQUERY_DATASET_NAME",
                              "test_dataset_operations")
LOCATION_DATASET_NAME = f"{DATASET_NAME}_location"
DATA_SAMPLE_GCS_URL = os.environ.get(
    "GCP_BIGQUERY_DATA_GCS_URL",
    "gs://cloud-samples-data/bigquery/us-states/us-states.csv",
)

DATA_SAMPLE_GCS_URL_PARTS = urlparse(DATA_SAMPLE_GCS_URL)
DATA_SAMPLE_GCS_BUCKET_NAME = DATA_SAMPLE_GCS_URL_PARTS.netloc
DATA_SAMPLE_GCS_OBJECT_NAME = DATA_SAMPLE_GCS_URL_PARTS.path[1:]

with models.DAG(
        "example_bigquery_operations",
        schedule_interval=None,  # Override to match your needs
        start_date=days_ago(1),
        tags=["example"],
) as dag:
    # [START howto_operator_bigquery_create_table]
    create_table = BigQueryCreateEmptyTableOperator(
        task_id="create_table",
        dataset_id=DATASET_NAME,
        table_id="test_table",
        schema_fields=[
            {
                "name": "emp_name",
                "type": "STRING",
                "mode": "REQUIRED"
            },
            {
                "name": "salary",
Example No. 18
    'owner': '*****@*****.**',
    'start_date': datetime.datetime(2019, 3, 1),
    'email': ['*****@*****.**', '*****@*****.**', '*****@*****.**', '*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'depends_on_past': False,
    # If a task fails, retry it once after waiting at least 10 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=10),
}

dag_name = 'fxa_events'

with models.DAG(
        dag_name,
        # Continue to run DAG once per day
        schedule_interval='0 10 * * *',
        default_args=default_args) as dag:

    fxa_auth_events = bigquery_etl_query(
        task_id='fxa_auth_events',
        destination_table='fxa_auth_events_v1',
        dataset_id='telemetry',
        arguments=('--schema_update_option=ALLOW_FIELD_ADDITION',),
    )

    fxa_auth_bounce_events = bigquery_etl_query(
        task_id='fxa_auth_bounce_events',
        destination_table='fxa_auth_bounce_events_v1',
        dataset_id='telemetry',
        arguments=('--schema_update_option=ALLOW_FIELD_ADDITION',),
Example No. 19
def build_load_dag(dag_id,
                   output_bucket,
                   destination_dataset_project_id,
                   copy_dataset_project_id=None,
                   copy_dataset_name=None,
                   chain='ethereum',
                   notification_emails=None,
                   load_start_date=datetime(2018, 7, 1),
                   schedule_interval='0 0 * * *'):
    # The following datasets must be created in BigQuery:
    # - crypto_{chain}_raw
    # - crypto_{chain}_temp
    # - crypto_{chain}
    # Environment variable OUTPUT_BUCKET must be set and point to the GCS bucket
    # where files exported by export_dag.py are located

    dataset_name = f'crypto_{chain}'
    dataset_name_raw = f'crypto_{chain}_raw'
    dataset_name_temp = f'crypto_{chain}_temp'

    if not destination_dataset_project_id:
        raise ValueError('destination_dataset_project_id is required')

    environment = {
        'DATASET_NAME': dataset_name,
        'DATASET_NAME_RAW': dataset_name_raw,
        'DATASET_NAME_TEMP': dataset_name_temp,
        'DESTINATION_DATASET_PROJECT_ID': destination_dataset_project_id
    }

    def read_bigquery_schema_from_file(filepath):
        result = []
        file_content = read_file(filepath)
        json_content = json.loads(file_content)
        for field in json_content:
            result.append(
                bigquery.SchemaField(name=field.get('name'),
                                     field_type=field.get('type', 'STRING'),
                                     mode=field.get('mode', 'NULLABLE'),
                                     description=field.get('description')))
        return result

    def read_file(filepath):
        with open(filepath) as file_handle:
            content = file_handle.read()
            for key, value in environment.items():
                # each bracket should be doubled to be escaped
                # we need two escaped and one unescaped
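                # e.g. '{{{{{key}}}}}'.format(key='DATASET_NAME') produces the
                # literal placeholder '{{DATASET_NAME}}' used in the resource
                # files, which is then replaced with the configured value.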
                content = content.replace('{{{{{key}}}}}'.format(key=key),
                                          value)
            return content

    def submit_bigquery_job(job, configuration):
        try:
            logging.info('Creating a job: ' +
                         json.dumps(configuration.to_api_repr()))
            result = job.result()
            logging.info(result)
            assert job.errors is None or len(job.errors) == 0
            return result
        except Exception:
            logging.info(job.errors)
            raise

    default_dag_args = {
        'depends_on_past': False,
        'start_date': load_start_date,
        'email_on_failure': True,
        'email_on_retry': True,
        'retries': 5,
        'retry_delay': timedelta(minutes=5)
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args['email'] = [
            email.strip() for email in notification_emails.split(',')
        ]

    # Define a DAG (directed acyclic graph) of tasks.
    dag = models.DAG(dag_id,
                     catchup=False,
                     schedule_interval=schedule_interval,
                     default_args=default_dag_args)

    dags_folder = os.environ.get('DAGS_FOLDER', '/home/airflow/gcs/dags')

    def add_load_tasks(task, file_format, allow_quoted_newlines=False):
        wait_sensor = GoogleCloudStorageObjectSensor(
            task_id='wait_latest_{task}'.format(task=task),
            timeout=60 * 60,
            poke_interval=60,
            bucket=output_bucket,
            object='export/{task}/block_date={datestamp}/{task}.{file_format}'.
            format(task=task, datestamp='{{ds}}', file_format=file_format),
            dag=dag)

        def load_task():
            client = bigquery.Client()
            job_config = bigquery.LoadJobConfig()
            schema_path = os.path.join(
                dags_folder,
                'resources/stages/raw/schemas/{task}.json'.format(task=task))
            job_config.schema = read_bigquery_schema_from_file(schema_path)
            job_config.source_format = bigquery.SourceFormat.CSV if file_format == 'csv' else bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
            if file_format == 'csv':
                job_config.skip_leading_rows = 1
            job_config.write_disposition = 'WRITE_TRUNCATE'
            job_config.allow_quoted_newlines = allow_quoted_newlines

            export_location_uri = 'gs://{bucket}/export'.format(
                bucket=output_bucket)
            uri = '{export_location_uri}/{task}/*.{file_format}'.format(
                export_location_uri=export_location_uri,
                task=task,
                file_format=file_format)
            table_ref = client.dataset(dataset_name_raw).table(task)
            load_job = client.load_table_from_uri(uri,
                                                  table_ref,
                                                  job_config=job_config)
            submit_bigquery_job(load_job, job_config)
            assert load_job.state == 'DONE'

        load_operator = PythonOperator(task_id='load_{task}'.format(task=task),
                                       python_callable=load_task,
                                       execution_timeout=timedelta(minutes=30),
                                       dag=dag)

        wait_sensor >> load_operator
        return load_operator

    def add_enrich_tasks(task,
                         time_partitioning_field='block_timestamp',
                         dependencies=None):
        def enrich_task():
            client = bigquery.Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table = bigquery.Table(temp_table_ref, schema=schema)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(
                    field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))
            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = bigquery.QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref
            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql = read_file(sql_path)
            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            # Copy temporary table to destination
            copy_job_config = bigquery.CopyJobConfig()
            copy_job_config.write_disposition = 'WRITE_TRUNCATE'

            all_destination_projects = [(destination_dataset_project_id,
                                         dataset_name)]
            if copy_dataset_project_id is not None and len(copy_dataset_project_id) > 0 \
                    and copy_dataset_name is not None and len(copy_dataset_name) > 0:
                all_destination_projects.append(
                    (copy_dataset_project_id, copy_dataset_name))

            for dest_project, dest_dataset_name in all_destination_projects:
                dest_table_name = '{task}'.format(task=task)
                dest_table_ref = client.dataset(
                    dest_dataset_name,
                    project=dest_project).table(dest_table_name)
                copy_job = client.copy_table(temp_table_ref,
                                             dest_table_ref,
                                             location='US',
                                             job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)

        enrich_operator = PythonOperator(
            task_id='enrich_{task}'.format(task=task),
            python_callable=enrich_task,
            execution_timeout=timedelta(minutes=60),
            dag=dag)

        if dependencies is not None and len(dependencies) > 0:
            for dependency in dependencies:
                dependency >> enrich_operator
        return enrich_operator

    def add_verify_tasks(task, dependencies=None):
        # The queries in verify/sqls will fail when the condition is not met
        # Have to use this trick since the Python 2 version of BigQueryCheckOperator doesn't support standard SQL
        # and legacy SQL can't be used to query partitioned tables.
        sql_path = os.path.join(
            dags_folder,
            'resources/stages/verify/sqls/{task}.sql'.format(task=task))
        sql = read_file(sql_path)
        verify_task = BigQueryOperator(
            task_id='verify_{task}'.format(task=task),
            bql=sql,
            use_legacy_sql=False,
            dag=dag)
        if dependencies is not None and len(dependencies) > 0:
            for dependency in dependencies:
                dependency >> verify_task
        return verify_task

    load_blocks_task = add_load_tasks('blocks', 'csv')
    load_transactions_task = add_load_tasks('transactions', 'csv')
    load_receipts_task = add_load_tasks('receipts', 'csv')
    load_logs_task = add_load_tasks('logs', 'json')
    load_contracts_task = add_load_tasks('contracts', 'json')
    load_tokens_task = add_load_tasks('tokens',
                                      'csv',
                                      allow_quoted_newlines=True)
    load_token_transfers_task = add_load_tasks('token_transfers', 'csv')
    load_traces_task = add_load_tasks('traces', 'csv')

    enrich_blocks_task = add_enrich_tasks('blocks',
                                          time_partitioning_field='timestamp',
                                          dependencies=[load_blocks_task])
    enrich_transactions_task = add_enrich_tasks('transactions',
                                                dependencies=[
                                                    load_blocks_task,
                                                    load_transactions_task,
                                                    load_receipts_task
                                                ])
    enrich_logs_task = add_enrich_tasks(
        'logs', dependencies=[load_blocks_task, load_logs_task])
    enrich_tokens_task = add_enrich_tasks('tokens',
                                          time_partitioning_field=None,
                                          dependencies=[load_tokens_task])
    enrich_token_transfers_task = add_enrich_tasks(
        'token_transfers',
        dependencies=[load_blocks_task, load_token_transfers_task])
    enrich_traces_task = add_enrich_tasks(
        'traces', dependencies=[load_blocks_task, load_traces_task])
    enrich_contracts_task = add_enrich_tasks(
        'contracts', dependencies=[load_contracts_task, enrich_traces_task])

    verify_blocks_count_task = add_verify_tasks('blocks_count',
                                                [enrich_blocks_task])
    verify_blocks_have_latest_task = add_verify_tasks('blocks_have_latest',
                                                      [enrich_blocks_task])
    verify_transactions_count_task = add_verify_tasks(
        'transactions_count', [enrich_blocks_task, enrich_transactions_task])
    verify_transactions_have_latest_task = add_verify_tasks(
        'transactions_have_latest', [enrich_transactions_task])
    verify_logs_have_latest_task = add_verify_tasks('logs_have_latest',
                                                    [enrich_logs_task])
    verify_token_transfers_have_latest_task = add_verify_tasks(
        'token_transfers_have_latest', [enrich_token_transfers_task])
    verify_traces_blocks_count_task = add_verify_tasks(
        'traces_blocks_count', [enrich_blocks_task, enrich_traces_task])
    verify_traces_transactions_count_task = add_verify_tasks(
        'traces_transactions_count',
        [enrich_transactions_task, enrich_traces_task])
    verify_traces_contracts_count_task = add_verify_tasks(
        'traces_contracts_count',
        [enrich_transactions_task, enrich_traces_task, enrich_contracts_task])

    if notification_emails and len(notification_emails) > 0:
        send_email_task = EmailOperator(
            task_id='send_email',
            to=[email.strip() for email in notification_emails.split(',')],
            subject='Ethereum ETL Airflow Load DAG Succeeded',
            html_content='Ethereum ETL Airflow Load DAG Succeeded - {}'.format(
                chain),
            dag=dag)
        verify_blocks_count_task >> send_email_task
        verify_blocks_have_latest_task >> send_email_task
        verify_transactions_count_task >> send_email_task
        verify_transactions_have_latest_task >> send_email_task
        verify_logs_have_latest_task >> send_email_task
        verify_token_transfers_have_latest_task >> send_email_task
        verify_traces_blocks_count_task >> send_email_task
        verify_traces_transactions_count_task >> send_email_task
        verify_traces_contracts_count_task >> send_email_task
        enrich_tokens_task >> send_email_task

    return dag
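

# Hedged usage sketch (not part of the original module): because
# build_load_dag returns a DAG rather than defining one at import time, a
# DAG-definition file typically calls it and stores the result in globals()
# so the scheduler can discover it. All argument values below are
# placeholders, not values from the original code.
globals()['ethereum_load_dag'] = build_load_dag(
    dag_id='ethereum_load_dag',
    output_bucket='your-export-bucket',
    destination_dataset_project_id='your-gcp-project',
    chain='ethereum',
    notification_emails=None,
)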
Example No. 20
# specific language governing permissions and limitations
# under the License.
"""
Example Airflow DAG that translates text using the Google Cloud Translate
service.
"""

from airflow import models
from airflow.operators.bash import BashOperator
from airflow.providers.google.cloud.operators.translate import CloudTranslateTextOperator
from airflow.utils.dates import days_ago

with models.DAG(
        'example_gcp_translate',
        schedule_interval='@once',  # Override to match your needs
        start_date=days_ago(1),
        tags=['example'],
) as dag:
    # [START howto_operator_translate_text]
    translate_text = CloudTranslateTextOperator(
        task_id='translate',
        values=['zażółć gęślą jaźń'],
        target_language='en',
        format_='text',
        source_language=None,
        model='base',
    )
    # [END howto_operator_translate_text]
    # [START howto_operator_translate_access]
    translation_access = BashOperator(
        task_id='access',
Example No. 21
# [START howto_operator_cloudsql_db_create_body]
db_create_body = {
    "instance": INSTANCE_NAME,
    "name": DB_NAME,
    "project": GCP_PROJECT_ID
}
# [END howto_operator_cloudsql_db_create_body]
# [START howto_operator_cloudsql_db_patch_body]
db_patch_body = {"charset": "utf16", "collation": "utf16_general_ci"}
# [END howto_operator_cloudsql_db_patch_body]

default_args = {'start_date': airflow.utils.dates.days_ago(1)}

with models.DAG(
        'example_gcp_sql',
        default_args=default_args,
        schedule_interval=None  # Override to match your needs
) as dag:

    def next_dep(task, prev):
        prev >> task
        return task

    # ############################################## #
    # ### INSTANCES SET UP ######################### #
    # ############################################## #

    # [START howto_operator_cloudsql_create]
    sql_instance_create_task = CloudSqlInstanceCreateOperator(
        project_id=GCP_PROJECT_ID,
        body=body,
Example No. 22
GCS_PYTHON = os.environ.get('GCP_DATAFLOW_PYTHON', 'gs://INVALID BUCKET NAME/wordcount_debugging.py')

GCS_JAR_PARTS = urlparse(GCS_JAR)
GCS_JAR_BUCKET_NAME = GCS_JAR_PARTS.netloc
GCS_JAR_OBJECT_NAME = GCS_JAR_PARTS.path[1:]

default_args = {
    'dataflow_default_options': {
        'tempLocation': GCS_TMP,
        'stagingLocation': GCS_STAGING,
    }
}

with models.DAG(
    "example_gcp_dataflow_native_java",
    schedule_interval='@once',  # Override to match your needs
    start_date=days_ago(1),
    tags=['example'],
) as dag_native_java:

    # [START howto_operator_start_java_job_jar_on_gcs]
    start_java_job = BeamRunJavaPipelineOperator(
        task_id="start-java-job",
        jar=GCS_JAR,
        pipeline_options={
            'output': GCS_OUTPUT,
        },
        job_class='org.apache.beam.examples.WordCount',
        dataflow_config={
            "check_if_running": CheckJobRunning.IgnoreJob,
            "location": 'europe-west3',
            "poll_sleep": 10,
Example No. 23
                ],
            },
        ],
        "resources": {
            "regions": [f"{LOCATION}"],
            "virtualMachine": {
                "machineType": "n1-standard-1",
            },
        },
    }
}
# [END howto_configure_multiple_action_pipeline]

with models.DAG(
    "example_gcp_life_sciences",
    default_args=dict(start_date=dates.days_ago(1)),
    schedule_interval=None,
    tags=['example'],
) as dag:

    # [START howto_run_pipeline]
    simple_life_science_action_pipeline = LifeSciencesRunPipelineOperator(
        task_id='simple-action-pipeline',
        body=SIMPLE_ACTION_PIEPELINE,
        project_id=PROJECT_ID,
        location=LOCATION,
    )
    # [END howto_run_pipeline]

    multiple_life_science_action_pipeline = LifeSciencesRunPipelineOperator(
        task_id='multi-action-pipeline', body=MULTI_ACTION_PIPELINE, project_id=PROJECT_ID, location=LOCATION
    )
Example No. 24
    # The name of the environment variable, since deploy_type is `env` rather
    # than `volume`.
    deploy_target='SQL_CONN',
    # Name of the Kubernetes Secret
    secret='airflow-secrets',
    # Key of a secret stored in this Secret object
    key='sql_alchemy_conn')
# [END composer_kubernetespodoperator_secretobject]
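# Hedged note (the relevant code is truncated out of this snippet): a Secret
# like the one above is typically attached to a task by passing it in the
# operator's secrets list, e.g.
# kubernetes_pod_operator.KubernetesPodOperator(..., secrets=[secret_env]);
# `secret_env` is a hypothetical name for the variable whose assignment is
# cut off above.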

YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1)

# If a Pod fails to launch, or has an error occur in the container, Airflow
# will show the task as failed, as well as contain all of the task logs
# required to debug.
with models.DAG(dag_id='composer_sample_kubernetes_pod',
                schedule_interval=datetime.timedelta(days=1),
                start_date=YESTERDAY) as dag:
    # Only name, namespace, image, and task_id are required to create a
    # KubernetesPodOperator. In Cloud Composer, currently the operator defaults
    # to using the config file found at `/home/airflow/composer_kube_config` if
    # no `config_file` parameter is specified. By default it will contain the
    # credentials for Cloud Composer's Google Kubernetes Engine cluster that is
    # created upon environment creation.

    # [START composer_kubernetespodoperator_minconfig]
    kubernetes_min_pod = kubernetes_pod_operator.KubernetesPodOperator(
        # The ID specified for the task.
        task_id='pod-ex-minimum',
        # Name of task you want to run, used to generate Pod ID.
        name='pod-ex-minimum',
        # Entrypoint of the container, if not specified the Docker container's
Example No. 25
        "metrics": ["METRIC_IMPRESSIONS", "METRIC_CLICKS"],
        "includeInviteData": True,
    },
    "schedule": {
        "frequency": "ONE_TIME"
    },
}

PARAMS = {"dataRange": "LAST_14_DAYS", "timezoneCode": "America/New_York"}
# [END howto_display_video_env_variables]

default_args = {"start_date": dates.days_ago(1)}

with models.DAG(
        "example_display_video",
        default_args=default_args,
        schedule_interval=None,  # Override to match your needs
) as dag:
    # [START howto_google_display_video_createquery_report_operator]
    create_report = GoogleDisplayVideo360CreateReportOperator(
        body=REPORT, task_id="create_report")
    report_id = "{{ task_instance.xcom_pull('create_report', key='report_id') }}"
    # [END howto_google_display_video_createquery_report_operator]

    # [START howto_google_display_video_runquery_report_operator]
    run_report = GoogleDisplayVideo360RunReportOperator(report_id=report_id,
                                                        params=PARAMS,
                                                        task_id="run_report")
    # [END howto_google_display_video_runquery_report_operator]

    # [START howto_google_display_video_wait_report_operator]
Example No. 26
    "/global/instanceTemplates/" + GCE_NEW_TEMPLATE_NAME)

UPDATE_POLICY = {
    "type": "OPPORTUNISTIC",
    "minimalAction": "RESTART",
    "maxSurge": {
        "fixed": 1
    },
    "minReadySec": 1800
}

# [END howto_operator_compute_igm_update_template_args]

with models.DAG(
        'example_gcp_compute_igm',
        default_args=default_args,
        schedule_interval=None,  # Override to match your needs
        tags=['example'],
) as dag:
    # [START howto_operator_gce_igm_copy_template]
    gce_instance_template_copy = ComputeEngineCopyInstanceTemplateOperator(
        project_id=GCP_PROJECT_ID,
        resource_id=GCE_TEMPLATE_NAME,
        body_patch=GCE_INSTANCE_TEMPLATE_BODY_UPDATE,
        task_id='gcp_compute_igm_copy_template_task')
    # [END howto_operator_gce_igm_copy_template]
    # Added to check for idempotence
    # [START howto_operator_gce_igm_copy_template_no_project_id]
    gce_instance_template_copy2 = ComputeEngineCopyInstanceTemplateOperator(
        resource_id=GCE_TEMPLATE_NAME,
        body_patch=GCE_INSTANCE_TEMPLATE_BODY_UPDATE,
        task_id='gcp_compute_igm_copy_template_task_2')
Example No. 27
default_args = {"email": ["*****@*****.**"]}

YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1)

secret_file = secret.Secret(
    deploy_type="volume",
    deploy_target="/tmp/secrets/google",
    secret="gc-storage-rw-key",
    key="key.json",
)
dag_id = "data_tagger_pipeline"
dag_number = dag_id

dag = models.DAG(
    dag_id,
    schedule_interval=datetime.timedelta(days=1),
    default_args=default_args,
    start_date=YESTERDAY,
)
with dag:
    kubernetes_list_bucket_pod = kubernetes_pod_operator.KubernetesPodOperator(
        task_id="data-tagger",
        name="data-tagger",
        cmds=[
            "python",
            "-m",
            "src.scripts.data_tagger",
            "cluster",
            bucket_name,
            "data/audiotospeech/config/datatagger/config.yaml",
        ],
        # namespace='composer-1-10-4-airflow-1-10-6-3b791e93',
Example No. 28
# under the License.
"""
Example DAG using GoogleCloudStorageToBigQueryOperator.
"""
from airflow import models
from airflow.operators.bash import BashOperator
from airflow.operators.gcs_to_bq import GCSToBigQueryOperator
from airflow.utils.dates import days_ago

args = {
    'owner': 'airflow',
    'start_date': days_ago(2)
}

dag = models.DAG(
    dag_id='example_gcs_to_bq_operator', default_args=args,
    schedule_interval=None, tags=['example'])

create_test_dataset = BashOperator(
    task_id='create_airflow_test_dataset',
    bash_command='bq mk airflow_test',
    dag=dag)

# [START howto_operator_gcs_to_bq]
load_csv = GCSToBigQueryOperator(
    task_id='gcs_to_bq_example',
    bucket='cloud-samples-data',
    source_objects=['bigquery/us-states/us-states.csv'],
    destination_project_dataset_table='airflow_test.gcs_to_bq_table',
    schema_fields=[
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
Example No. 29
    'start_date': datetime(2019, 2, 1),
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

notification_emails = os.environ.get('NOTIFICATION_EMAILS')
if notification_emails and len(notification_emails) > 0:
    default_dag_args['email'] = [
        email.strip() for email in notification_emails.split(',')
    ]

dag = models.DAG(dag_id='redshift_refresh_aggregates',
                 schedule_interval=None,
                 concurrency=1,
                 max_active_runs=1,
                 default_args=default_dag_args)

sql_folder = os.environ.get('REDSHIFT_SQL_FOLDER',
                            "/usr/local/airflow/dags/redshift/sql")
if sql_folder is None:
    raise ValueError("You must set REDSHIFT_SQL_FOLDER environment variable")


def run_sql(**kwargs):
    conn_id = kwargs.get('conn_id')
    sql_file_path = kwargs.get('sql_file_path')
    pg_hook = PostgresHook(conn_id)

    with open(sql_file_path, 'r') as sql_file:
Example No. 30
)
from airflow.utils import dates

ACCOUNT_ID = os.environ.get("GA_ACCOUNT_ID", "123456789")

BUCKET = os.environ.get("GMP_ANALYTICS_BUCKET",
                        "test-airflow-analytics-bucket")
BUCKET_FILENAME = "data.csv"
WEB_PROPERTY_ID = os.environ.get("GA_WEB_PROPERTY", "UA-12345678-1")
WEB_PROPERTY_AD_WORDS_LINK_ID = os.environ.get(
    "GA_WEB_PROPERTY_AD_WORDS_LINK_ID", "rQafFTPOQdmkx4U-fxUfhj")
DATA_ID = "kjdDu3_tQa6n8Q1kXFtSmg"

with models.DAG(
        "example_google_analytics",
        schedule_interval='@once',  # Override to match your needs
        start_date=dates.days_ago(1),
) as dag:
    # [START howto_marketing_platform_list_accounts_operator]
    list_account = GoogleAnalyticsListAccountsOperator(task_id="list_account")
    # [END howto_marketing_platform_list_accounts_operator]

    # [START howto_marketing_platform_get_ads_link_operator]
    get_ad_words_link = GoogleAnalyticsGetAdsLinkOperator(
        web_property_ad_words_link_id=WEB_PROPERTY_AD_WORDS_LINK_ID,
        web_property_id=WEB_PROPERTY_ID,
        account_id=ACCOUNT_ID,
        task_id="get_ad_words_link",
    )
    # [END howto_marketing_platform_get_ads_link_operator]