Example No. 1
def add_numeric_stats_task(
    dag: models.DAG, feature_vars: dag_utils.FeatureConfigListMapping,
    storage_vars: dag_utils.AirflowVarsConfig
) -> bigquery_operator.BigQueryOperator:
    """Builds an Operator that generates numeric fact stats within a DAG.

  Args:
    dag: The dag that the task needs to be added to.
    feature_vars: The parsed config values from airflow feature object variable.
    storage_vars: The parsed config values from airflow storage variable.

  Returns:
    Operator used to build numeric stats within a DAG.
  """
    num_feats = dag_utils.get_features(feature_vars, 'type', 'Numeric')
    stats_dataset = (f'{storage_vars["bq_working_project"]}.'
                     f'{storage_vars["bq_working_dataset"]}')
    numeric_stats_sql = pipeline_utils.render_sql_from_template(
        'numeric_stats',
        fact_table=f'{stats_dataset}.facts',
        feature_columns=[
            f'\'{dag_utils.get_feature_name(x)}\'' for x in num_feats
        ])

    return bigquery_operator.BigQueryOperator(
        task_id=_GENERATE_NUMERIC_STATS_TASK,
        sql=numeric_stats_sql,
        use_legacy_sql=False,
        destination_dataset_table=f'{stats_dataset}.num_facts_stats_table',
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_TRUNCATE',
        allow_large_results=True,
        dag=dag)
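A minimal wiring sketch for the function above, assuming FeatureConfigListMapping and AirflowVarsConfig behave like a plain list and dict (the attribute access inside the function suggests as much); the DAG id and config values below are placeholders, and dag_utils / pipeline_utils from the surrounding project must still be importable:

import datetime

from airflow import models

with models.DAG(dag_id='numeric_stats_example',
                schedule_interval=None,
                start_date=datetime.datetime(2021, 1, 1)) as dag:
    # Placeholder configs; in the real pipeline these are parsed from Airflow Variables.
    feature_vars = [{'fact': 'totals_pageviews', 'type': 'Numeric'}]
    storage_vars = {'bq_working_project': 'my-project',
                    'bq_working_dataset': 'working'}
    numeric_stats = add_numeric_stats_task(dag, feature_vars, storage_vars)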
Example No. 2
def _store_final_results_to_bq(dag: models.DAG, task_id: str,
                               batch_predict_sql: str) -> models.BaseOperator:
    """Store MP complaint results in Bigquery before GA transfer.

  Args:
    dag: the DAG to add this operator to
    task_id: ID for this specific task within the DAG.
    batch_predict_sql: Custom Query to pick records and add some additional
      colums as MP protocol.

  Returns:
    Operator to use within a DAG to store Prediction results to Bigquery.
  """
    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)

    final_output_table = '{project}.{dataset}.final_output'.format(
        project=storage_vars['bq_working_project'],
        dataset=storage_vars['bq_working_dataset'])

    return bigquery_operator.BigQueryOperator(
        task_id=task_id,
        sql=batch_predict_sql,
        use_legacy_sql=False,
        destination_dataset_table=final_output_table,
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_TRUNCATE',
        allow_large_results=True,
        location=bb_vars['gcp_region'],
        dag=dag,
    )
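A hedged usage sketch; the Blockbuster global and storage Airflow Variables read inside the function must already exist, and the DAG id and SQL below are placeholders only:

import datetime

from airflow import models

with models.DAG(dag_id='store_results_example',
                schedule_interval=None,
                start_date=datetime.datetime(2021, 1, 1)) as dag:
    store_results = _store_final_results_to_bq(
        dag=dag,
        task_id='store_final_results',
        # Placeholder standing in for the real batch-prediction SQL.
        batch_predict_sql='SELECT * FROM `my-project.working.predictions`')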
Example No. 3
def add_prepare_source_data_task_to_dag(
        dag: models.DAG, sql: str, leads_table: str,
        gcp_region: str) -> bigquery_operator.BigQueryOperator:
    """Adds the BQ task to dag to create ML source table from GA source table.

  Args:
    dag: The dag that the task needs to be added to.
    sql: The SQL query that creates the ML source table from the GA source table.
    leads_table: The destination table the output data will be written to.
    gcp_region: GCP region the job will run in.

  Returns:
    The configured BigQueryOperator task that was added to the input dag.
  """
    prepare_source_data = bigquery_operator.BigQueryOperator(
        task_id=TASK_NAME,
        sql=sql,
        use_legacy_sql=False,
        destination_dataset_table=leads_table,
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_TRUNCATE',
        allow_large_results=True,
        location=gcp_region,
        dag=dag,
    )
    return prepare_source_data
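A short usage sketch; the DAG id, query, destination table, and region are illustrative placeholders:

import datetime

from airflow import models

with models.DAG(dag_id='prepare_source_example',
                schedule_interval=None,
                start_date=datetime.datetime(2021, 1, 1)) as dag:
    prepare_source = add_prepare_source_data_task_to_dag(
        dag=dag,
        sql='SELECT * FROM `my-project.analytics.ga_sessions_20210101`',  # placeholder
        leads_table='my-project.working.leads',                           # placeholder
        gcp_region='us-central1')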
Example No. 4
def _add_prepare_automl_data_in_bq_task(
    dag: models.DAG, output_type: blockbuster_constants.PreprocessingType,
    prediction_vars: dag_utils.AirflowVarsConfig,
    storage_vars: dag_utils.AirflowVarsConfig
) -> bigquery_operator.BigQueryOperator:
    """Adds the task to write the output to Big Query to dag.

  Args:
    dag: The dag that the task needs to be added to.
    output_type: Indicate whether this pipeline is to be used for training or
      prediction.
    prediction_vars: The parsed config values from airflow prediction variable.
    storage_vars: The parsed config values from airflow storage variable.

  Returns:
    The configured BigQueryOperator task, added to the dag, that writes the
    AutoML input data to BigQuery.
  """
    exclude_from_output = ['userId', 'RECENT_BB_id', 'RECENT_most_recent_lead']
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        output_table = 'training'
        exclude_from_output.append('BB_id')
    elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
        output_table = 'prediction'
        exclude_from_output.append('MLDataSplit')
    features_table = dag_utils.construct_bq_table_path(
        storage_vars['bq_working_project'], storage_vars['bq_working_dataset'],
        f'ga_{output_table}_input')
    prepare_data_sql = pipeline_utils.render_sql_from_template(
        'prepare_data',
        features_table=features_table,
        exclude_from_output=exclude_from_output,
        inclusion_recency_days=prediction_vars['leads_submission_window'])

    output_dataset = dag_utils.construct_bq_table_path(
        storage_vars['bq_working_project'], storage_vars['bq_working_dataset'],
        f'automl_{output_table}_input')

    prepare_data_for_automl = bigquery_operator.BigQueryOperator(
        task_id='prepare_data_for_automl',
        sql=prepare_data_sql,
        use_legacy_sql=False,
        destination_dataset_table=output_dataset,
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_TRUNCATE',
        allow_large_results=True,
        dag=dag,
    )
    return prepare_data_for_automl
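A hedged sketch showing the training branch of the function above; blockbuster_constants, dag_utils, and pipeline_utils from the surrounding project are assumed importable, and the DAG id and config values are placeholders:

import datetime

from airflow import models

with models.DAG(dag_id='prepare_automl_example',
                schedule_interval=None,
                start_date=datetime.datetime(2021, 1, 1)) as dag:
    prediction_vars = {'leads_submission_window': 14}      # placeholder
    storage_vars = {'bq_working_project': 'my-project',
                    'bq_working_dataset': 'working'}       # placeholder
    prepare_training_data = _add_prepare_automl_data_in_bq_task(
        dag, blockbuster_constants.PreprocessingType.TRAINING,
        prediction_vars, storage_vars)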
Example No. 5
def create_bq_operator_dict_from(sql_statements_dictionary):
    """Builds one BigQueryOperator per SQL statement, keyed by the original file name."""
    result = {}
    for filename, sql_bytes in sql_statements_dictionary.items():
        sql = sql_bytes.decode()
        table_id = filename.replace('.sql', '').replace('tmp/', '')
        result[filename] = bigquery_operator.BigQueryOperator(
            task_id='make_tmp_table_{}'.format(table_id),
            sql=sql,
            use_legacy_sql=False,
            location='asia-southeast1',
            destination_dataset_table=gcp_project + '.tmp.' + table_id,
            write_disposition='WRITE_TRUNCATE')
    return result
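A usage sketch illustrating the expected input shape: a mapping of SQL file names to file contents as bytes (hence the decode() call). The file names and queries are placeholders, gcp_project is assumed to be defined at module level as in the snippet, and the call is expected to happen inside a with models.DAG(...) block so the operators attach to a DAG:

sql_statements = {
    'tmp/daily_users.sql': b'SELECT user_id FROM `my-project.raw.events`',
    'tmp/daily_sales.sql': b'SELECT order_id FROM `my-project.raw.orders`',
}
tmp_table_operators = create_bq_operator_dict_from(sql_statements)
# Wire the resulting operators into the DAG, e.g. after a start marker task:
# start_making_tmp_tables >> list(tmp_table_operators.values())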
Example No. 6
        source_format='NEWLINE_DELIMITED_JSON')

    # Listing 6-5. Defining the task that writes to the gcpbook_ch5.dau table.
    # Join the BigQuery working table with the users table, count paid and
    # free-to-play users, and write the result to the gcpbook_ch5.dau table.
    insert_dau = bigquery_operator.BigQueryOperator(task_id='insert_dau',
                                                    use_legacy_sql=False,
                                                    sql="""
            insert gcpbook_ch5.dau
            select
                date('{{ ds }}') as dt
            ,   countif(u.is_paid_user) as paid_users
            ,   countif(not u.is_paid_user) as free_to_play_users
            from
                (
                    select distinct
                        user_pseudo_id
                    from
                        gcpbook_ch5.work_events
                ) e
                    inner join
                        gcpbook_ch5.users u
                    on
                        u.user_pseudo_id = e.user_pseudo_id
        """)

    # Listing 6-6. Defining the task that deletes the working table.
    # Define a task that deletes the BigQuery working table.
    delete_work_table = \
        bigquery_table_delete_operator.BigQueryTableDeleteOperator(
            task_id='delete_work_table',
Example No. 7
        task_id='generate_account_index',
        python_callable=generate_accounts_views_index)

    project_id = models.Variable.get('GCP_PROJECT', 'dta-ga-bigquery')
    temp_table = 'ga_domains'
    query = """
                CREATE TABLE `{{params.project_id}}.tmp.{{params.temp_table}}_{{ ts_nodash }}`
                OPTIONS(
                  expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 1 HOUR)
                ) AS
                            select distinct domain_name, hostname, ga_code from web_crawl.url_resource, unnest(google_analytics) as ga_code
                           """
    query_webcrawl_ga_index = bigquery_operator.BigQueryOperator(
        task_id='query_webcrawl_ga_index',
        bql=query,
        use_legacy_sql=False,
        params={
            'project_id': project_id,
            'temp_table': temp_table,
        })
    export_webcrawl_ga_index_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_webcrawl_ga_index_to_gcs',
        source_project_dataset_table=
        "{{params.project_id}}.tmp.{{params.temp_table}}_{{ ts_nodash }}",
        params={
            'project_id': project_id,
            'temp_table': temp_table
        },
        destination_cloud_storage_uris=[
            "gs://%s/data/%s.csv" % (models.Variable.get(
                'AIRFLOW_BUCKET',
                'us-east1-dta-airflow-b3415db4-bucket'), temp_table)
Example No. 8
    'retry_delay': datetime.timedelta(minutes=5)
}

with models.DAG(
        'bigquery_data_export_rds',
        # schedule_interval=datetime.timedelta(days=1),
        schedule_interval=None,
        catchup=False,
        default_args=default_dag_args) as dag:
    project_id = models.Variable.get('GCP_PROJECT', 'dta-ga-bigquery')

    # BigQuery SQL script to pull the data from BigQuery
    bigquery_data_script = bigquery_operator.BigQueryOperator(
        task_id='bigquery_data_script',
        bql=pathlib.Path(
            dobs_constants.DAGS_DIR +
            "/bq_observatory_service/bq_sql_ga_accounts_data_query.sql").
        read_text(),
        use_legacy_sql=False)

    # BigQuery data fetch
    bigquery_data_fetch = bigquery_get_data.BigQueryGetDataOperator(
        task_id='bigquery_data_fetch',
        dataset_id=dobs_constants.DATASET_ID,
        table_id=dobs_constants.TABLE_ID)

    def prepare_data(data_, id_):
        output = []
        for datum in data_:
            agency_name, webproperty_id, domain_name, hostname, service_name = datum
            output.append({
Example No. 9
from airflow.utils.dates import days_ago
from datetime import timedelta

# Query the superstore test table.

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": days_ago(1),
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}

# dag = DAG("tableau_dags", default_args=default_args,schedule_interval='@daily', catchup=False)
dag = DAG("testbq", default_args=default_args,
          schedule_interval='0 10 * * * ', catchup=False)


bq_recent_questions_query = bigquery_operator.BigQueryOperator(
    task_id='bq_recent_questions_query',
    bigquery_conn_id='bigquery_default',
    sql="""
    SELECT * FROM `fr-stg-self-service-bi.sekitest.superstore_seki` LIMIT 500
    """,
    use_legacy_sql=False,
    destination_dataset_table='fr-stg-self-service-bi.sekitest.superstore_seki2',
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

bq_recent_questions_query
Example No. 10
              `dta-ga-bigquery.{{params.view_id}}.ga_sessions_*`,
              UNNEST(hits)
        
            WHERE
              _TABLE_SUFFIX BETWEEN '{{params.start}}' AND '{{params.end}}'
            AND       REGEXP_CONTAINS(page.pagePath, r"{{ params.search_param }}=(.*?)(?:&|$)")
            group by query
            order by count(*) desc
                       """

        query_internalsearch = bigquery_operator.BigQueryOperator(
            task_id='query_internalsearch_' + galileo.domain_slug(d['domain']),
            bql=query, use_legacy_sql=False, params={
                'project_id': project_id,
                'view_id': d['view_id'],
                'start': start,
                'end': end,
                'temp_table': temp_table,
                'domain': d['domain'],
                'search_param': d['search_param']
            })
        export_internalsearch_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
            task_id='export_internalsearch_to_gcs_' + galileo.domain_slug(d['domain']),
            source_project_dataset_table="{{params.project_id}}.tmp.{{params.temp_table}}_{{ ts_nodash }}",
            params={
                'project_id': project_id,
                'temp_table': temp_table
            },
            destination_cloud_storage_uris=[
                "gs://%s/data/searchqueries/%s.csv" % (
                models.Variable.get('AIRFLOW_BUCKET', 'us-east1-dta-airflow-b3415db4-bucket'), temp_table)],
Example No. 11
from airflow.contrib.operators import bigquery_to_gcs
from airflow.utils import trigger_rule

default_dag_args = {
    'start_date': airflow.utils.dates.days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
}

output_file = 'gs://southamerica-east1-cassio-a-77e1beeb-bucket/data/address.csv'
#Replace <Your bucket> with your path details
with DAG(dag_id='demo_bq_dag',
         schedule_interval=datetime.timedelta(days=1),
         default_args=default_dag_args) as dag:

    bq_airflow_commits_query = bigquery_operator.BigQueryOperator(
        task_id='bq_airflow_commits_query',
        bql="""    SELECT Address
         FROM [airflow-studies:Address.Add]
          """)

    export_commits_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_airflow_commits_to_gcs',
        source_project_dataset_table='airflow-studies:Address.Add',
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')

    bq_airflow_commits_query >> export_commits_to_gcs
Example No. 12
bq_dataset_name = 'test.users_data_last'

default_dag_args = {
    'start_date': yesterday,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # If a task fails, retry it once after waiting at least 5 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5)
}

# [START bigquery_extracton_test]
with models.DAG(
        'users_data_bigquery_load_test',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # [END bigquery_extracton_test]

    bq_extraction_test = bigquery_operator.BigQueryOperator(
        task_id='bq_extraction_test_query',
        bql="""
            SELECT *
            FROM `test.users_data` 
            """,
        use_legacy_sql=False,
        destination_dataset_table=bq_dataset_name)

bq_extraction_test
Example No. 13
}

with models.DAG(
        'pageviews_snapshot_doi',
        # schedule_interval=datetime.timedelta(days=1),
        schedule_interval='0 12 * * *',
        catchup=False,
        default_args=default_dag_args) as dag:
    project_id = models.Variable.get('GCP_PROJECT', 'dta-ga-bigquery')

    # BigQuery Scripts
    # pageviews snapshot
    query_pageviews_snapshot = bigquery_operator.BigQueryOperator(
        task_id='query_pageviews_snapshot',
        priority='BATCH',
        bql=pathlib.Path(galileo.DAGS_DIR +
                         "/bq_scripts_doi/dta_sql_pgvw_daily_snapshot_full_doi"
                         ).read_text(),
        use_legacy_sql=False)

    query_pageviews_snapshot_delta = bigquery_operator.BigQueryOperator(
        task_id='query_pageviews_snapshot_delta',
        priority='BATCH',
        bql=pathlib.Path(
            galileo.DAGS_DIR +
            "/bq_scripts_doi/dta_sql_pgvw_daily_snapshot_delta_doi").read_text(
            ),
        use_legacy_sql=False)

    # # total visitors snapshot
    # query_total_visitors_snapshot = bigquery_operator.BigQueryOperator(
Example No. 14
}

with models.DAG(
        'bigquery_output_prototype',
        # schedule_interval=datetime.timedelta(days=1),
        schedule_interval=None,
        catchup=False,
        on_failure_callback=None,
        default_args=default_dag_args) as dag:
    project_id = models.Variable.get('GCP_PROJECT', 'dta-ga-bigquery')

    # BigQuery SQL script to pull the data from BigQuery
    bigquery_data_type1 = bigquery_operator.BigQueryOperator(
        task_id='bigquery_data_type1',
        priority='BATCH',
        bql=pathlib.Path(
            dobs_constants.DAGS_DIR +
            "/bq_observatory_service/bq_sql_exec_basics_weekly.sql").read_text(
            ),
        use_legacy_sql=False)

    bigquery_data_type2 = bigquery_operator.BigQueryOperator(
        task_id='bigquery_data_type2',
        priority='BATCH',
        bql=pathlib.Path(dobs_constants.DAGS_DIR +
                         "/bq_observatory_service/bq_sql_exec_basics_daily.sql"
                         ).read_text(),
        use_legacy_sql=False)

    bigquery_data_type3 = bigquery_operator.BigQueryOperator(
        task_id='bigquery_data_type3',
        priority='BATCH',
Example No. 15
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=2),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG('current_products_on_sale_notification',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    query_current_sales_products = bigquery_operator.BigQueryOperator(
        task_id='query_products_on_sale',
        bql="""
        SELECT product_id, product_name
        FROM `{bq_table_id}` 
        WHERE sale = True
        """.format(bq_table_id=models.Variable.get('bq_table_id')),
        use_legacy_sql=False,
        destination_dataset_table=bq_products_on_sale_table_id,
        write_disposition='WRITE_TRUNCATE')

    export_data_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_sale_data_to_gcs',
        source_project_dataset_table=bq_products_on_sale_table_id,
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')

    email_updation_notification = email_operator.EmailOperator(
        task_id='email_notification',
        to=email_id,
        subject='Sale product data updated',
Example No. 16
    'start_date': airflow.utils.dates.days_ago(2)
}

with models.DAG(
    dag_id='load_data_from_files_v2', default_args=default_dag_args,
    schedule_interval=None
) as dag:

    # [START howto_operator_gcs_to_bq]
    load_table_a = GoogleCloudStorageToBigQueryOperator(
        task_id='load_table_a',
        bucket='fs-storage-cb-sh',
        source_objects=['LPMXfile_table_a_YYMMDD.txt'],
        field_delimiter='|',
        destination_project_dataset_table='gcp_schema_bigquery.gcp_table_a',
        write_disposition='WRITE_TRUNCATE',
        dag=dag)
    # [END howto_operator_gcs_to_bq]

    copy_data_to_b = bigquery_operator.BigQueryOperator(
        task_id='copy_data_to_b',
        sql="""
            insert into {0}.TABLE_B select * from {1}.gcp_table_a
            """.format(dm_ex, dm_ex),
        bigquery_conn_id='bigquery_default',
        write_disposition='WRITE_TRUNCATE',
        allow_large_results=True,
        use_legacy_sql=False,
        dag=dag)

    load_table_a >> copy_data_to_b
Example No. 17
}

output_file = 'gs://us-east1-example-environmen-3bb7fd7a-bucket/data/product.csv'

dag = DAG(
    'bq_export_dag',
    catchup=False,
    default_args=default_args,
    schedule_interval="@once",
)

start_task = dummy_operator.DummyOperator(
    task_id='start',
    default_args=default_args,
    dag=dag,
)

bq_query_task = bigquery_operator.BigQueryOperator(
    task_id='bq_query',
    bql='''SELECT * FROM [hd-personalization-dev.temp.Prod] ''',
    dag=dag)

export_to_gcs_task = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_to_gcs',
    source_project_dataset_table='hd-personalization-dev.temp.Prod',
    destination_cloud_storage_uris=[output_file],
    export_format='CSV',
    dag=dag)

start_task >> bq_query_task >> export_to_gcs_task
Example No. 18
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

# [START composer_notify_failure]
default_dag_args = {
    'start_date': yesterday,
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': 'thermal-loop-237313'
}

with models.DAG('batman_bi_daily_report',
                schedule_interval=datetime.timedelta(hours=1),
                default_args=default_dag_args) as dag:

    bq_recent_questions_query = bigquery_operator.BigQueryOperator(
        task_id='bq_count_crimes_by_locations',
        bql="""
            SELECT 
                DATE(`date`, "America/Chicago") as report_day, 
                primary_type, 
                location_description, 
                count(*) as crime_count
            FROM `batman_bi.crime_logs`
            GROUP BY 1, 2, 3
            """,
        use_legacy_sql=False,
        destination_dataset_table='batman_bi.crime_count_by_location',
        write_disposition='WRITE_TRUNCATE')
Example No. 19
    task_id='delete_sl_partition',
    dag=dag,
    bash_command=
    '''bq rm -f -t 'dataops_demo_sl_dev.t_twitter_google${{ macros.ds_format(ds, "%Y-%m-%d", "%Y%m%d") }}' ''',
)

# We could of course do something far more complex; here we take advantage of the json field, so there is no need to reload everything.
from_raw_to_sl = bigquery_operator.BigQueryOperator(
    task_id='from_raw_to_sl',
    dag=dag,
    sql=
    '''SELECT PARSE_TIMESTAMP('%a %b %d %H:%M:%S +0000 %E4Y', created_at) AS c_timestamp,
        CAST(PARSE_TIMESTAMP('%a %b %d %H:%M:%S +0000 %E4Y', created_at) AS date) as c_created, id, user_id, user_name,
        lang, user_screen_name, text, JSON_EXTRACT(json, '$.source') AS source
        FROM dataops_demo_raw_dev.t_twitter_google_{{ macros.ds_format(ds, "%Y-%m-%d", "%Y_%m_%d") }}''',
    destination_dataset_table='dataops_demo_sl_dev.t_twitter_google',
    write_disposition='WRITE_APPEND',
    create_disposition='CREATE_IF_NEEDED',
    time_partitioning={
        'type': 'DAY',
        'field': 'c_created'
    },
    allow_large_results=True,
    use_legacy_sql=False)

# twitter_spark_etl = dagrun_operator.TriggerDagRunOperator(
#    task_id="execute_other_dag",
#    trigger_dag_id="twitter_spark_etl",  # Ensure this equals the dag_id of the DAG to trigger
#    conf={"job_date": "{{ ds }}"}, # No need for this parameter, please check execution_date
#    execution_date="{{ ds }}",
#    dag=dag,
Example No. 20
bq_merge_table1 = bigquery_operator.BigQueryOperator(task_id='bq_merge_table1',
                                                     bql="""
        MERGE
          `{table}` T
        USING
          (
          SELECT
            CASE
              WHEN a.id IS NULL AND b.id IS NOT NULL THEN "I"
              WHEN a.id IS NOT NULL
            AND b.id IS NULL THEN "D"
            ELSE
            "U"
          END
            AS op,
          IF
            (b.id IS NULL,
              a,
              b).*
          FROM
            `Google_CarlosAugusto.table1` a
          FULL OUTER JOIN
            `Google_CarlosAugusto.table1_staging` b
          ON
            a.id = b.id
          WHERE
            a.id IS NULL
            OR b.id IS NULL
            OR FARM_FINGERPRINT(FORMAT("%T", a)) <> FARM_FINGERPRINT(FORMAT("%T", b)) ) S
        ON
          T.id = S.id
          WHEN MATCHED AND S.op = 'U' THEN UPDATE SET col1 = S.col1, col2 = S.col2, col3 = S.col3, col4 = S.col4
          WHEN MATCHED
          AND S.op = 'D' THEN
        DELETE
          WHEN NOT MATCHED THEN INSERT (id, col1, col2, col3, col4) VALUES (S.id, S.col1, S.col2, S.col3, S.col4)
        """.format(table="Google_CarlosAugusto.table1"),
                                                     use_legacy_sql=False,
                                                     dag=dag)
Example No. 21
    # 'start_date': airflow.utils.dates.days_ago(1)
    'email': models.Variable.get('email'),
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}



output_file = 'gs://prp-target/address.csv'

with airflow.DAG(
        dag_id='demo_bq_dag',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    bq_airflow_commits_query = bigquery_operator.BigQueryOperator(
        task_id='bq_airflow_commits_query',
        bql=''' SELECT * FROM [able-hull-251304.my_dataset.Address] ''')

    export_commits_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_airflow_commits_to_gcs',
        source_project_dataset_table='able-hull-251304.my_dataset.Address',
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')

    bq_airflow_commits_query >> export_commits_to_gcs
Example No. 22
default_dag_args = {
    # Setting start date as yesterday starts the DAG immediately when it is
    # detected in the Cloud Storage bucket.
    'start_date': yesterday,
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG('composer_sample_connections',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:
    # [START composer_connections_default_airflow_1]
    task_default = bigquery_operator.BigQueryOperator(
        task_id='task_default_connection',
        sql='SELECT 1',
        use_legacy_sql=False)
    # [END composer_connections_default_airflow_1]
    # [START composer_connections_explicit_airflow_1]
    task_explicit = bigquery_operator.BigQueryOperator(
        task_id='task_explicit_connection',
        sql='SELECT 1',
        use_legacy_sql=False,
        # Composer creates a 'google_cloud_default' connection by default.
        bigquery_conn_id='google_cloud_default')
    # [END composer_connections_explicit_airflow_1]
    # [START composer_connections_custom_airflow_1]
    task_custom = bigquery_operator.BigQueryOperator(
        task_id='task_custom_connection',
        sql='SELECT 1',
        use_legacy_sql=False,
Example No. 23
    make_bq_dataset = bash_operator.BashOperator(
        task_id='make_bq_dataset',
        # Executing 'bq' command requires Google Cloud SDK which comes
        # preinstalled in Cloud Composer.
        bash_command='bq ls {} || bq mk {}'.format(
            bq_dataset_name, bq_dataset_name))
    # [END composer_bash_bq]

    # [START composer_bigquery]
    # Query recent StackOverflow questions.
    bq_recent_questions_query = bigquery_operator.BigQueryOperator(
        task_id='bq_recent_questions_query',
        sql="""
        SELECT owner_display_name, title, view_count
        FROM `bigquery-public-data.stackoverflow.posts_questions`
        WHERE creation_date < CAST('{max_date}' AS TIMESTAMP)
            AND creation_date >= CAST('{min_date}' AS TIMESTAMP)
        ORDER BY view_count DESC
        LIMIT 100
        """.format(max_date=max_query_date, min_date=min_query_date),
        use_legacy_sql=False,
        destination_dataset_table=bq_recent_questions_table_id)
    # [END composer_bigquery]

    # Export query result to Cloud Storage.
    export_questions_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_recent_questions_to_gcs',
        source_project_dataset_table=bq_recent_questions_table_id,
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')

    # Perform most popular question query.
Example No. 24
        task_id='Run_Fact_Actuary_Staging',
        bash_command=
        f"python3 /home/airflow/gcs/dags/dp_exec/DIM_EXEC2.py --pipeline  Fact_Actuary_Staging --dataset_meta {dataset_meta}",
        dag=Areaman_as_dag)

    t1 = BashOperator(
        task_id='Run_Fact_Actuary',
        bash_command=
        f"python3 /home/airflow/gcs/dags/dp_exec/DIM_EXEC2.py --pipeline  Fact_Actuary --dataset_meta {dataset_meta}",
        dag=Areaman_as_dag)

    #   To produce DB_Run_Off_Fields table
    t3 = BashOperator(
        task_id='Run_DB_Run_Off',
        bash_command=
        f"python3 /home/airflow/gcs/dags/dp_exec/DIM_EXEC2.py --pipeline  DB_Run_Off --dataset_meta {dataset_meta}",
        dag=Areaman_as_dag)

    t100 = bigquery_operator.BigQueryOperator(
        task_id='Fact_Actuary_Prestaging',
        sql=f"""
        CREATE TABLE `geb-dwh-test.uat_geb_dwh_eu_act.fact_actuary_w_o_null` AS SELECT * FROM `geb-dwh-test.uat_geb_dwh_eu_act.source_fact_actuary_staging`
            WHERE LOCALINSURER IS NOT NULL
            ORDER BY LOCALINSURER,CARRIERREFERENCE, RE_BALANCEYEARPTF, RE_QUARTERPTF
        """,
        use_legacy_sql=False,
        dag=Areaman_as_dag)

    t00 >> t0 >> t1 >> t100 >> t9 >> t99
    # t00 >> t0 >> t01 >> t02 >> t1 >>
Example No. 25
    project_id='umg-spinnup',
    dataset_id='spotify',
    table_id='spotify_trends${{ ds_nodash }}',
    row_count=100000,
    poke_interval=600,
    bigquery_conn_id='bigquery_default',
    pool='sensors',
    dag=dag
)

# Generate dataset
generate_dataset = bigquery_operator.BigQueryOperator(
    task_id='generate_dataset',
    sql='sql/spinnup/fraud_detection_v02_data.sql',
    destination_dataset_table='umg-data-science.detect_fraud_spinnup.v02_data${{ ds_nodash }}',
    write_disposition='WRITE_TRUNCATE',
    allow_large_results=True,
    bigquery_conn_id='bigquery_default',
    use_legacy_sql=False,
    dag=dag
)

# Make a temp directory
mkdir = BashOperator(
    task_id='mkdir',
    bash_command='mkdir -p {}'.format(DIR),
    dag=dag
)

# Download python script from GCS
download_script = gcs_download_operator.GoogleCloudStorageDownloadOperator(
    task_id='download_python_script',
Example No. 26
    skip_leading_rows=1,
    destination_project_dataset_table="{}.{}.{}".format(PROJECT,
                                                        DATASET,
                                                        'data_source'),
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    dag=dag
)

# Clean the data from BigQuery to BigQuery
t2 = bigquery_operator.BigQueryOperator(
    task_id='bq_from_source_to_clean',
    bql='{}/common/clean.sql'.format(GCS_SQL),
    use_legacy_sql=False,
    allow_large_results=True,
    destination_dataset_table="{}.{}.{}".format(PROJECT,
                                                DATASET,
                                                'data_cleaned'),
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    dag=dag
)

# Creates split between features and targets and also aggregates both sides.
# The threshold date is passed as an arg when calling the Airflow job and
# dynamically understood within the .sql file.
# We should pass query_params but we run into various problems:
# - if using BigQueryOperator, we can not pass dag_run.conf['threshold_date']
# - if using hooks, run_query does not accept a .sql file and needs the string
# So the way is to add directly {{ dag_run.conf['threshold_date'] }} into the
# .sql file which Airflow can ping up when running the operator.
t3 = bigquery_operator.BigQueryOperator(
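A sketch of the workaround the comment above describes: the threshold date is written as a Jinja expression directly inside the .sql file that the operator loads, and the value is supplied at trigger time through dag_run.conf. The file contents, DAG id, and column names are illustrative only:

# Hypothetical contents of a templated .sql file picked up by the operator:
#   SELECT *
#   FROM `my-project.my_dataset.data_cleaned`
#   WHERE event_date < '{{ dag_run.conf["threshold_date"] }}'
#
# Triggering the DAG with the value (Airflow 1.x CLI):
#   airflow trigger_dag your_dag_id --conf '{"threshold_date": "2018-02-01"}'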
Example No. 27
    raw_sql_files = read_sql_from_gcs(bq_raw_dataset_name, gcs_bucket)

    bq_start_making_raw_tables = dummy_operator.DummyOperator(
        task_id='start_making_raw_tables')

    bq_end_making_raw_tables = dummy_operator.DummyOperator(
        task_id='end_making_raw_tables')

    for filename in raw_sql_files:
        sql_statement = raw_sql_files[filename].decode()
        table_name = filename.replace('.sql', '')
        table_name = table_name.replace('raw/', '')
        bq_make_raw_tables = bigquery_operator.BigQueryOperator(
            task_id='make_raw_table_{}'.format(table_name),
            sql=sql_statement,
            use_legacy_sql=False,
            location='asia-southeast1')
        bq_start_making_raw_tables >> bq_make_raw_tables
        bq_make_raw_tables >> bq_end_making_raw_tables

    bq_make_raw_dataset >> bq_start_making_raw_tables

    bq_make_tmp_dataset = bash_operator.BashOperator(
        task_id='make_bq_tmp_dataset',
        bash_command=
        'bq --location=asia-southeast1 ls {} || bq --location=asia-southeast1 mk {}'
        .format(bq_tmp_dataset_name, bq_tmp_dataset_name))

    bq_start_making_tmp_tables = dummy_operator.DummyOperator(
        task_id='start_making_tmp_tables')
Example No. 28
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG(
        'composer_sample_bq_notify',
        schedule_interval=datetime.timedelta(weeks=4),
        default_args=default_dag_args) as dag: 

    # Query today's most recent data.
    bq_fetch_todays_data = bigquery_operator.BigQueryOperator(
        task_id='bq_fetch_todays_data',
        bql="""
          SELECT * FROM `camel-154800.chicago_historical_congestion_data.Aug2_Larger_dataset_view` WHERE FULL_DATE=CURRENT_DATE("America/Chicago") ORDER BY HH_hour DESC
        """,
        use_legacy_sql=False,
        destination_dataset_table=bq_temp_composer_dataset)
    # [END composer_bigquery]

    # [START composer_bigquery]
  

    # Export query result to Cloud Storage.
    export_output_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_output_to_gcs',
        source_project_dataset_table=bq_temp_composer_dataset,
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')
Example No. 29
default_args = {
    'owner': 'Nitin Ware',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(1),
}

dag = DAG(
    'bq_conn_dag',
    catchup=False,
    default_args=default_args,
    schedule_interval="@once",
)

task_default = bigquery_operator.BigQueryOperator(
    task_id='task_default_connection',
    bql='SELECT 1',
    use_legacy_sql=False,
    dag=dag)

task_explicit = bigquery_operator.BigQueryOperator(
    task_id='task_explicit_connection',
    bql='SELECT 1',
    use_legacy_sql=False,
    bigquery_conn_id='google_cloud_default',
    dag=dag)

task_custom = bigquery_operator.BigQueryOperator(
    task_id='task_custom_connection',
    bql='SELECT 1',
    use_legacy_sql=False,
    bigquery_conn_id='my_gcp_connection',
Example No. 30
_wait_for_completion = wait_for_completion_operator.WaitForCompletionOperator(
    task_id=_WAIT_FOR_COMPLETION_TASK,
    dag=dag,
    project_id=_PROJECT_ID,
    queue_location=_QUEUE_LOCATION,
    queue_name=_QUEUE_NAME,
    service_account_path=_SERVICE_ACCOUNT,
    try_count_limit=_TRY_COUNT_LIMIT)

_update_expiration_tracking = bigquery_operator.BigQueryOperator(
    task_id=_UPDATE_EXPIRATION_TRACKING,
    dag=dag,
    sql=_UPDATE_ITEMS_EXPIRATION_TRACKING_QUERY,
    use_legacy_sql=False,
    params={
        'timezone': _TIMEZONE_UTC_OFFSET,
        'feed_dataset_id': _FEED_DATASET_ID,
        'expiration_tracking_table_id': _EXPIRATION_TRACKING_TABLE_ID,
        'monitor_dataset_id': _MONITOR_DATASET_ID,
        'item_results_table_id': _ITEM_RESULTS_TABLE_ID
    })

_bq_to_pubsub = bq_to_pubsub_operator.GetRunResultsAndTriggerReportingOperator(
    task_id=_TRIGGER_REPORTING_TASK,
    dag=dag,
    project_id=_PROJECT_ID,
    dataset_id=_MONITOR_DATASET_ID,
    table_id=_MONITOR_TABLE_ID,
    query_file_path=_RESULT_QUERY_FILE,
    topic_name=_TOPIC_NAME)