Example #1
    def test_hook_correct_region():
        with patch(HOOK) as mock_hook:
            dataproc_task = DataProcHiveOperator(task_id=TASK_ID,
                                                 region=GCP_REGION)

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(
                mock.ANY, mock.ANY, GCP_REGION, mock.ANY)
Example #2
    def test_hook_correct_region(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook'
                   ) as mock_hook:
            dataproc_task = DataProcHiveOperator(task_id=TASK_ID,
                                                 region=REGION)

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(
                mock.ANY, mock.ANY, REGION)
Example #3
    def test_hook_correct_region(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
            dataproc_task = DataProcHiveOperator(
                task_id=TASK_ID,
                region=REGION
            )

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(mock.ANY, mock.ANY, REGION)
Example #4
    def test_hook_correct_region():
        with patch(HOOK) as mock_hook:
            dataproc_task = DataProcHiveOperator(
                task_id=TASK_ID,
                region=GCP_REGION
            )

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(mock.ANY, mock.ANY,
                                                                  GCP_REGION, mock.ANY)
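The test fragments above patch the hook behind DataProcHiveOperator and rely on module-level fixtures that the excerpts omit. A minimal sketch of that scaffolding, assuming placeholder values (only the constant names come from the fragments; the values and import style are assumptions):

# Hypothetical scaffolding the test fragments above appear to assume; values are placeholders.
from unittest import mock
from unittest.mock import patch

from airflow.contrib.operators.dataproc_operator import DataProcHiveOperator

HOOK = 'airflow.contrib.operators.dataproc_operator.DataProcHook'  # target handed to patch(HOOK)
TASK_ID = 'test_dataproc_hive'                                     # placeholder task id
REGION = GCP_REGION = 'europe-west1'                               # placeholder region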
Example #5
    def test_dataproc_job_id_is_set():
        with patch(HOOK) as mock_hook:
            dataproc_task = DataProcHiveOperator(
                task_id=TASK_ID
            )

            _assert_dataproc_job_id(mock_hook, dataproc_task)
Example #6
    def test_correct_job_definition(self, mock_hook, mock_uuid):
        # Expected job
        job_definition = deepcopy(DATAPROC_JOB_TO_SUBMIT)
        job_definition['job']['hiveJob'] = {'queryFileUri': None}
        job_definition['job']['reference']['projectId'] = None
        job_definition['job']['reference']['jobId'] = DATAPROC_JOB_ID + "_test"

        # Prepare job using operator
        task = DataProcHiveOperator(task_id=TASK_ID,
                                    region=GCP_REGION,
                                    cluster_name=CLUSTER_NAME,
                                    job_name=DATAPROC_JOB_ID,
                                    labels=LABELS)

        task.execute(context=None)
        self.assertDictEqual(job_definition, task.job_template.job)
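The method above receives mock_hook and mock_uuid, which implies stacked patch decorators that the excerpt leaves out. A rough sketch of what they could look like (the patch targets, class name, and return_value are assumptions, chosen so the generated jobId ends in "_test"):

# Hypothetical decorators for the test above; the bottom-most patch supplies the first mock argument.
import unittest
from unittest import mock


class DataProcHiveOperatorTest(unittest.TestCase):
    @mock.patch('airflow.contrib.operators.dataproc_operator.uuid.uuid4', return_value='test')
    @mock.patch('airflow.contrib.operators.dataproc_operator.DataProcHook')
    def test_correct_job_definition(self, mock_hook, mock_uuid):
        ...  # body as in the excerpt above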
Example #7
def hive_task(owner, dag, task_id, query):
    return DataProcHiveOperator(dag=dag,
                                task_id=task_id,
                                cluster_name='cluster-dataproc',
                                region='europe-west3',
                                job_name='_'.join((owner, task_id,
                                                   '{{ execution_date.year }}',
                                                   '{{ params.job_suffix }}')),
                                query=query,
                                params={'job_suffix': randint(0, 100000)})
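Since the factory above only parametrizes a DataProcHiveOperator, a call site might look like the following sketch (the dag object, owner, and query text are placeholders, not taken from the source):

# Hypothetical usage of the hive_task() factory above.
billing_report = hive_task(
    owner='analytics',
    dag=dag,                   # an existing airflow.DAG instance
    task_id='billing_report',
    query='SHOW DATABASES;',   # placeholder HiveQL
)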
Example #8
USERNAME = '******'

default_args = {'owner': USERNAME, 'start_date': datetime(2012, 1, 1, 0, 0, 0)}

dag = DAG(
    USERNAME + '_data_lake_etl_issue',
    default_args=default_args,
    description='Data Lake ETL tasks',
    schedule_interval="0 0 1 1 *",
)

ods_issue = DataProcHiveOperator(
    task_id='ods_issue',
    dag=dag,
    query="""
        INSERT OVERWRITE TABLE asamoilov.ods_issue PARTITION (year = {{ execution_date.year }})
        SELECT CAST(user_id AS BIGINT) as user_id,
            CAST(start_time AS TIMESTAMP) AS start_time,
            CAST(end_time AS TIMESTAMP) AS end_time,
            title,
            description,
            service
        FROM asamoilov.stg_issue WHERE year(start_time) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
    job_name=USERNAME +
    '_ods_issue_{{ execution_date.year }}_{{ params.job_suffix }}',
    params={"job_suffix": randint(0, 100000)},
    region='europe-west3',
)
Example #9
start_date = datetime(2019, 11, 18)

default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': start_date,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG(dag_id='hive-query-submit',
         default_args=default_args,
         start_date=start_date,
         schedule_interval=None) as dag:

    submit_hive_task = DataProcHiveOperator(
        task_id='HiveSubmit',
        project_id='gcp-cicd',
        cluster_name='cluster-1',
        query_uri=HQL_BUCKET + HQL_SCRIPT_NAME,
        # dataproc_hive_jars=[UDF_BUCKET + UDF_JAR_MANE],
        # variables={'PROJECT_ID': PROJECT_ID},
        region='europe-west1')

    dummy_task = DummyOperator(task_id='DummyTask')

    dummy_task >> submit_hive_task
Example #10
    datetime.datetime.min.time())

default_dag_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('PROJECT_ID','dataproc-usecase-276411')
}

with models.DAG(
        'EndToEndDAG14',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
		
    run_dataproc_hive_create_db = DataProcHiveOperator(
        task_id='create_db',
        gcp_conn_id='google_cloud_default', 
        query="CREATE DATABASE IF NOT EXISTS default3 LOCATION 'gs://dphivedb/HQL/CSV/test/';",
        cluster_name='dataproc',
        region='us-west1',
        dag=dag)

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='dataproc',
        region='us-west1',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    run_dataproc_hive_create_db >> delete_dataproc_cluster 
Example #11
                      from bektova.stg_traffic where year(from_unixtime(cast(`timestamp`/1000 as int))) = {{ execution_date.year }};   
            """

    elif task == 'payment':
        query = """
               insert overwrite table bektova.ods_payment partition (year='{{ execution_date.year }}') 
               select user_id, pay_doc_type, pay_doc_num, account, phone, billing_period, cast(pay_date as DATE), cast(sum as DECIMAL(10,2))
                 from bektova.stg_payment where year(pay_date) = {{ execution_date.year }};   
            """

    ods.append(
        DataProcHiveOperator(
            task_id='ods_' + task,
            dag=dag,
            query=query,
            cluster_name='cluster-dataproc',
            job_name=username + '_{{ execution_date.year }}_ods_' + task +
            '_{{ params.job_suffix }}',
            params={"job_suffix": randint(0, 100000)},
            region='europe-west3',
        ))

dm = DataProcHiveOperator(
    task_id='dm_traffic',
    dag=dag,
    query="""
        insert overwrite table bektova.dm_traffic partition (year='{{ execution_date.year }}')  
        select user_id, max(bytes_received), min(bytes_received), round(avg(bytes_received)) as avg_bytes_received
        from bektova.ods_traffic where year = {{ execution_date.year }} group by user_id order by avg_bytes_received;    
        """,
    cluster_name='cluster-dataproc',
    job_name=username +
Example #12
                shiftleft(cast(split(device_ip_addr, '[\.]')[1] as int), 16) +
                shiftleft(cast(split(device_ip_addr, '[\.]')[2] as int), 8) +
                cast(split(device_ip_addr, '[\.]')[3] as int),
            cast(bytes_sent as bigint),
            cast(bytes_received as bigint)
        -- Note (verified): only the double cast below gives a correct year value - otherwise the milliseconds have to be trimmed first
        from izykov.stg_traffic where year(cast(cast(`timestamp` as bigint) as timestamp)) = {{ execution_date.year }};
    """
}

for table, hiveql in ods_tables.items():
    dpho = DataProcHiveOperator(
        task_id=user_name + '_ods_' + table,
        dag=dag,
        query=hiveql,
        cluster_name='cluster-dataproc',
        job_name=user_name + '_ods_' + table +
        '_{{ execution_date.year }}_{{ params.job_suffix }}',
        params={"job_suffix": randint(0, 100000)},
        region='europe-west3',
    )
    if table == 'traffic':
        dphoc = DataProcHiveOperator(
            task_id=user_name + '_dm_' + table,
            dag=dag,
            query="""
                /*
                    Notes on the data mart:
                    Despite a recent Hive version (maybe this particular build is to blame?), the variant using
                    create materialized view ... partitioned on / by would not work at all.
                    Errors like the following came up:
                    FAILED: ParseException line 1:52 mismatched input 'partitioned' expecting AS near 'rewrite' in create materialized view statement
Example #13
    'dm_traffic': [
        'user_id, MAX(bytes_received), MIN(bytes_received), AVG(bytes_received)',
        'ods_traffic', 'year'
    ]
}

for i in tables:
    params = randint(0, 100000)
    if i != 'dm_traffic':
        data_proc = DataProcHiveOperator(
            task_id=i,
            dag=dag,
            query="INSERT OVERWRITE TABLE alevanov." + i +
            " PARTITION (year={{ execution_date.year }})\n" + "SELECT " +
            tables[i][0] + " FROM alevanov." + tables[i][1] + " WHERE year(" +
            tables[i][2] + ") = '{{ execution_date.year }}';",
            cluster_name='cluster-dataproc',
            job_name=USERNAME + '_' + i +
            '_{{ execution_date.year }}_{{ params.job_suffix }}',
            params={"job_suffix": randint(0, 100000)},
            region='europe-west3',
        )
        if i == 'ods_traffic':
            date_proc_dm = DataProcHiveOperator(
                task_id='dm_traffic',
                dag=dag,
                query=
                "INSERT OVERWRITE TABLE alevanov.dm_traffic PARTITION (year={{ execution_date.year }})\n"
                + "SELECT " + tables['dm_traffic'][0] + " FROM alevanov." +
                tables['dm_traffic'][1] + " WHERE " + tables['dm_traffic'][2] +
                " = '{{ execution_date.year }}' GROUP BY user_id;",
Example #14
from datetime import timedelta, datetime
from random import randint

from airflow import DAG
from airflow.contrib.operators.dataproc_operator import DataProcHiveOperator

USERNAME = '******'

default_args = {'owner': USERNAME, 'start_date': datetime(2012, 1, 1, 0, 0, 0)}

dag = DAG(
    USERNAME + '_data_lake_etl',
    default_args=default_args,
    description='Data Lake ETL tasks',
    schedule_interval="0 0 1 1 *",
)

ods_billing = DataProcHiveOperator(
    task_id='ods_billing',
    dag=dag,
    query="""
        insert overwrite table emateshuk.ods_billing partition (year='{{ execution_date.year }}') 
        select * from emateshuk.stg_billing where year(created_at) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
    job_name=USERNAME +
    '_ods_billing_{{ execution_date.year }}_{{ params.job_suffix }}',
    params={"job_suffix": randint(0, 100000)},
    region='europe-west3',
)
Example #15
    # If a task fails, retry it once after waiting at least 5 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG(
        'composer_sample_quickstart',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    HiveInsertingTable = DataProcHiveOperator(
        task_id='HiveInsertingTable',
        gcp_conn_id='google_cloud_default',
        query=
        "CREATE EXTERNAL TABLE trades_sample6(trading_date_time TIMESTAMP,network CHAR(1),message_category CHAR(1),message_type CHAR(1),message_sequence BIGINT,market_exchange CHAR(1),symbol VARCHAR(10),trade_price DOUBLE,trade_size BIGINT,trade_conditions VARCHAR(6),trade_conditions2 VARCHAR(6) )ROW FORMAT DELIMITED FIELDS TERMINATED BY ','LOCATION 'gs://market-data11-bucket/data/';",
        cluster_name='YourClusterName',
        region='us-central1',
        dag=dag)

    QuerytoGS = DataProcHiveOperator(
        task_id='QuerytoGS',
        gcp_conn_id='google_cloud_default',
        query=
        "INSERT OVERWRITE DIRECTORY 'gs://market-data11-bucket/output/' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' SELECT * FROM trades_sample6;",
        cluster_name='YourClusterName',
        region='us-central1',
        dag=dag)

    QuerytoGS.set_upstream(HiveInsertingTable)
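The set_upstream call above declares the same dependency that the other examples express with the bitshift syntax; an equivalent one-liner inside the same with-block would be:

    HiveInsertingTable >> QuerytoGS  # same as QuerytoGS.set_upstream(HiveInsertingTable)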
Example #16
dag = DAG(
    USERNAME + '_data_lake_complete',
    default_args=default_args,
    description='Data Lake Complete ETL by sperfilyev',
    schedule_interval="@yearly",
)

ods_tasks = {}
for ods_table in metadata_ods.keys():
    ods_tasks[ods_table] = DataProcHiveOperator(
        query=generate_ods_fill(
            ods_table,
            metadata_ods[ods_table]['field_of_partition'],
            metadata_ods[ods_table]['fields_to_import'],
        ),
        cluster_name='cluster-dataproc',
        job_name=generate_ods_job(ods_table),
        params={"job_suffix": randint(0, 100000)},
        region='europe-west3',
        task_id='ods_' + ods_table,
        dag=dag,
    )

dm_tasks = {}
for dm_table in metadata_dm.keys():
    dm_tasks[dm_table] = DataProcHiveOperator(
        query=generate_dm_fill(
            dm_table,
            metadata_dm[dm_table]['field_aggregate'],
            metadata_dm[dm_table]['field_group_by'],
        ),
Example #17
        main_class="org.apache.spark.examples.SparkPi",
        dataproc_jars="file:///usr/lib/spark/examples/jars/spark-examples.jar",
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    pyspark_task = DataProcPySparkOperator(
        task_id="pyspark_task",
        main=PYSPARK_URI,
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    hive_task = DataProcHiveOperator(
        task_id="hive_task",
        query="SHOW DATABASES;",
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    hadoop_task = DataProcHadoopOperator(
        task_id="hadoop_task",
        main_jar="file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar",
        arguments=["wordcount", "gs://pub/shakespeare/rose.txt", OUTPUT_PATH],
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    delete_cluster = DataprocClusterDeleteOperator(
        task_id="delete_cluster",
        project_id=PROJECT_ID,
        cluster_name=CLUSTER_NAME,
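The excerpt is cut off before any dependencies are declared. One plausible wiring for the tasks above, assuming create_cluster and spark_task names for the tasks the excerpt truncates, would be:

    # Hypothetical ordering: create the cluster, run all jobs, then delete the cluster.
    create_cluster >> [spark_task, pyspark_task, hive_task, hadoop_task] >> delete_cluster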
Example #18
        query = """
               insert overwrite table ods.issue partition (year='{{ execution_date.year }}') 
               select cast(user_id as INT), cast(start_time as TIMESTAMP), cast(end_time as TIMESTAMP), title, description, service 
                      from stg.issue where year(start_time) = {{ execution_date.year }};   
            """
    elif task == 'payment':
        query = """
               insert overwrite table ods.payment partition (year='{{ execution_date.year }}') 
               select user_id, pay_doc_type, pay_doc_num, account, phone, billing_period, cast(pay_date as DATE), cast(sum as DECIMAL(10,2))
                 from stg.payment where year(pay_date) = {{ execution_date.year }};   
            """
    ods.append(
        DataProcHiveOperator(
            task_id='ods_' + task,
            dag=dag,
            query=query,
            cluster_name='cluster-dataproc',
            region='us-central1',
        ))

dm = DataProcHiveOperator(
    task_id='dm_traffic',
    dag=dag,
    query="""
               insert overwrite table dm.traffic partition (year='{{ execution_date.year }}')  
               select user_id, max(bytes_received), min(bytes_received), round(avg(bytes_received)) as avg_traf
                 from ods.traffic where year = {{ execution_date.year }} group by user_id order by avg_traf;   
          """,
    cluster_name='cluster-dataproc',
    region='us-central1',
)
Example #19
    USERNAME + '_data_lake_etl_traffic',
    default_args=default_args,
    description='Data Lake ETL tasks',
    schedule_interval="0 0 1 1 *",
)

ods_traffic = DataProcHiveOperator(
    task_id='ods_traffic',
    dag=dag,
    query="""
        INSERT OVERWRITE TABLE asamoilov.ods_traffic PARTITION (year = {{ execution_date.year }})
        SELECT user_id,
            from_unixtime(CAST(`timestamp`/1000 as BIGINT)) AS traffic_time,
            device_id,
            device_ip_addr,
            bytes_sent,
            bytes_received
        FROM asamoilov.stg_traffic 
        WHERE year(from_unixtime(CAST(`timestamp`/1000 as BIGINT))) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
    job_name=USERNAME +
    '_ods_traffic_{{ execution_date.year }}_{{ params.job_suffix }}',
    params={"job_suffix": randint(0, 100000)},
    region='europe-west3',
)

dm_traffic = DataProcHiveOperator(
    task_id='dm_traffic',
    dag=dag,
    query="""
        INSERT OVERWRITE TABLE asamoilov.dm_traffic PARTITION (year = {{ execution_date.year }})
Example #20
         schedule_interval=None) as dag:

    # Create a Cloud Dataproc cluster with one node
    create_dataproc_cluster = DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        project_id=PROJECT_ID,
        cluster_name=CLUSTER_NAME,
        num_workers=0,
        region=REGION_ID,
        zone=ZONE,
        # service_account='*****@*****.**',
        # service_account='*****@*****.**',
        master_machine_type='n1-standard-1')

    submit_hive_task = DataProcHiveOperator(
        task_id='hive_submit',
        project_id=PROJECT_ID,
        cluster_name=CLUSTER_NAME,
        query_uri=HQL_BUCKET + HQL_SCRIPT_NAME,
        # dataproc_hive_jars=[UDF_BUCKET + UDF_JAR_MANE],
        # variables={'PROJECT_ID': PROJECT_ID},
        region=REGION_ID)

    delete_dataproc_cluster = DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        project_id=PROJECT_ID,
        cluster_name=CLUSTER_NAME,
        trigger_rule=TriggerRule.ALL_DONE)

    create_dataproc_cluster >> submit_hive_task >> delete_dataproc_cluster
Example #21
      region=my_region,
      num_workers=2,
      storage_bucket=my_bucket,
      master_machine_type=my_instance,
      master_disk_size=my_disk_size,
      worker_machine_type=my_instance,
      worker_disk_size=my_disk_size,
      num_preemptible_workers=0,  # use scale out/in operator
      zone=my_zone,
      idle_delete_ttl=my_idle_delete_ttl,
      dag=dag)

  drop_if_exists_src_table = DataProcHiveOperator(
      task_id='drop_if_exists_src_table',
      job_name='drop_if_exists_src_table_job_name',
      cluster_name=my_cluster_name,
      region=my_region,
      query=drop_if_exists_src_table
  )

  drop_if_exists_dst_table = DataProcHiveOperator(
      task_id='drop_if_exists_dst_table',
      job_name='drop_if_exists_dst_table_job_name',
      cluster_name=my_cluster_name,
      region=my_region,
      query=drop_if_exists_dst_table
  )

  create_external_src_table = DataProcHiveOperator(
      task_id='create_external_src_table',
      job_name='create_external_src_table_job_name',
Example #22
default_args = {'owner': USERNAME, 'start_date': datetime(2012, 1, 1, 0, 0, 0)}

dag = DAG(
    USERNAME + '_data_lake_etl_payment',
    default_args=default_args,
    description='Data Lake ETL tasks',
    schedule_interval="0 0 1 1 *",
)

ods_payment = DataProcHiveOperator(
    task_id='ods_payment',
    dag=dag,
    query="""
        INSERT OVERWRITE TABLE asamoilov.ods_payment PARTITION (year = {{ execution_date.year }})
        SELECT user_id,
            pay_doc_type,
            pay_doc_num,
            account,
            phone,
            CAST(from_unixtime(unix_timestamp(billing_period, 'yyyy-MM')) AS DATE) AS billing_period,
            CAST(pay_date AS TIMESTAMP) AS pay_date,
            CAST(sum AS DECIMAL(10,2)) AS sum
        FROM asamoilov.stg_payment WHERE year(pay_date) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
    job_name=USERNAME +
    '_ods_payment_{{ execution_date.year }}_{{ params.job_suffix }}',
    params={"job_suffix": randint(0, 100000)},
    region='europe-west3',
)
Example #23
USERNAME = '******'

default_args = {'owner': USERNAME, 'start_date': datetime(2012, 1, 1, 0, 0, 0)}

dag = DAG(
    USERNAME + '_data_lake_etl_billing',
    default_args=default_args,
    description='Data Lake ETL tasks',
    schedule_interval="0 0 1 1 *",
)

ods_billing = DataProcHiveOperator(
    task_id='ods_billing',
    dag=dag,
    query="""        
        INSERT OVERWRITE TABLE asamoilov.ods_billing PARTITION (year = {{ execution_date.year }})
        SELECT user_id,
            CAST(from_unixtime(unix_timestamp(billing_period, 'yyyy-MM')) AS DATE) AS billing_period,
            service, 
            tariff, 
            CAST(sum AS DECIMAL(10,2)) AS sum,
            CAST(created_at AS TIMESTAMP) AS created_at
        FROM asamoilov.stg_billing WHERE year(created_at) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
    job_name=USERNAME +
    '_ods_billing_{{ execution_date.year }}_{{ params.job_suffix }}',
    params={"job_suffix": randint(0, 100000)},
    region='europe-west3',
)
Example #24
default_args = {'owner': USERNAME, 'start_date': datetime(2012, 1, 1, 0, 0, 0)}

dag = DAG(
    USERNAME + '_data_lake_etl',
    default_args=default_args,
    description='Data Lake ETL tasks',
    schedule_interval="0 0 1 1 *",
)

ods_billing = DataProcHiveOperator(
    task_id='ods_billing',
    dag=dag,
    query="""
        insert overwrite table dlybin.ods_billing partition (year={{ execution_date.year }}) 
        select user_id,cast(replace(billing_period,"-","") as int),service,tariff,sum,created_at from dlybin.stg_billing where year(created_at) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
    job_name=USERNAME +
    '_ods_billing_{{ execution_date.year }}_{{ params.job_suffix }}',
    params={"job_suffix": randint(0, 100000)},
    region='europe-west3',
)

ods_issue = DataProcHiveOperator(
    task_id='ods_issue',
    dag=dag,
    query="""
        insert overwrite table dlybin.ods_issue partition (year={{ execution_date.year }}) 
        select user_id,cast(start_time as timestamp),cast(end_time as timestamp),title,description,service from dlybin.stg_issue where year(end_time) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
Example #25
dag = airflow.DAG('Import-MySQL-to-GS-and-DataProc',
                  catchup=False,
                  default_args=default_args,
                  schedule_interval=datetime.timedelta(days=1))

t1 = DataProcPySparkOperator(
    task_id='import-mysql-data',
    main='gs://mysqlnosql/spark_jdbc_to_gs.py',
    cluster_name='mydataproc2',
    region='us-central1',
    dataproc_pyspark_jars=['gs://mysqlnosql/spark-avro.jar'],
    dag=dag)

t2 = DataProcHiveOperator(query=query_part,
                          cluster_name='mydataproc2',
                          region='us-central1',
                          task_id='create_table_in_hive_2_cols',
                          dag=dag)

t3 = DataProcHiveOperator(query=query_all,
                          cluster_name='mydataproc2',
                          region='us-central1',
                          task_id='create_table_in_hive_all_cols',
                          dag=dag)

t4 = DataFlowPythonOperator(
    py_file='gs://mysqlnosql/beam_gcs_bt.py',
    task_id='loadfrom-gcs-to-bt',
    dataflow_default_options={'project': '<yourprojectid>'},
    options={
        'avro_input': 'gs://mysqldataflow/avro/customer/',
Example #26
        PROJECT_ID='dataproc-usecase-276215',
        region='us-west1',
        num_masters=1,
        num_workers=2,
        zone='us-west1-b',
        master_machine_type='n1-standard-1',
        master_disk_size=100,
        worker_disk_size=100,
        num_preemptible_workers=0,
        worker_machine_type='n1-standard-1',
        idle_delete_ttl=1800,
        subnetwork_uri='ctl',
        optional_components=['PRESTO', 'SOLR', 'RANGER'],
        # service_account_scopes=['cloud-platform', 'default', 'sql-admin'],
        # init_actions_uris=['gs://us-west4-test1-d1c785e8-bucket/data/cloud-sql-proxy/cloud-sql-proxy.sh'])
        # metadata={'kms-key-uri': 'projects/dataproc-usecase-276215/locations/global/keyRings/my-key-ring/cryptoKeys/migration-key', 'db-hive-password-uri': 'gs://secrectkeybucket2/hive-password.encrypted'})
        metadata={'kms-key-uri': 'projects/dataproc-usecase-276215/locations/global/keyRings/my-key-ring/cryptoKeys/migration-key', 'db-hive-password-uri': 'gs://secrectkeybucket2/hive-password.encrypted', 'use-cloud-sql-private-ip': 'true', 'hive-metastore-instance': 'dataproc-usecase-276215:us-west1:hive-metadata'})
		
		
    # Run the Hive job on the Cloud Dataproc cluster
    run_dataproc_hive_create_db = DataProcHiveOperator(
        task_id='create_db',
        gcp_conn_id='google_cloud_default', 
        query_uri="gs://data-bucket-hive2/HQL/create_db.hql",
        cluster_name='dataproc',
        region='us-west1',
        dag=dag)
	
    # Define DAG dependencies.
    #create_dataproc_cluster >> run_dataproc_hive >> delete_dataproc_cluster
    create_dataproc_cluster >> run_dataproc_hive_create_db 
Example #27
}

with models.DAG('gcp_poc_automation1',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    create_dataproc_cluster = BashOperator(
        task_id='create_dataproc_cluster',
        bash_command=
        'gcloud dataproc clusters create test --region us-central1 --subnet default --zone us-central1-a --master-machine-type n1-standard-1 --master-boot-disk-size 100 --num-workers 2 --worker-machine-type n1-standard-1 --worker-boot-disk-size 100 --image-version 1.3-debian10 --project dataproc-usecase-286310',
        dag=dag)

    dataproc_hive_create_db = DataProcHiveOperator(
        task_id='create_db',
        gcp_conn_id='google_cloud_default',
        query=
        "CREATE DATABASE IF NOT EXISTS default_autotestbqdb9 LOCATION 'gs://dataproc-staging-us-central1-788915459809-fk8wm6rc/HQL/';",
        cluster_name='dataproc',
        region='us-west1',
        dag=dag)

    dataproc_hive_create_table_par = DataProcHiveOperator(
        task_id='dataproc_hive_create_table_par',
        gcp_conn_id='google_cloud_default',
        query=
        "CREATE EXTERNAL TABLE IF NOT EXISTS default.chicago_taxi_trips_parquet_autotestbq9(unique_key   STRING,taxi_id  STRING,trip_start_timestamp  STRING, trip_end_timestamp  STRING, trip_seconds  STRING, trip_miles   STRING, pickup_census_tract  STRING, dropoff_census_tract  STRING, pickup_community_area  STRING, dropoff_community_area  STRING, fare  STRING, tips  STRING, tolls  STRING, extras  STRING, trip_total  STRING, payment_type  STRING, company  STRING, pickup_latitude  STRING, pickup_longitude  STRING, pickup_location  STRING,dropoff_latitude  STRING, dropoff_longitude  STRING, dropoff_location  STRING) STORED AS PARQUET location 'gs://dataproc-staging-us-central1-788915459809-fk8wm6rc/HQL/PARQUET/';",
        cluster_name='dataproc',
        region='us-west1',
        dag=dag)

    dataproc_hive_create_table_csv = DataProcHiveOperator(
        task_id='dataproc_hive_create_table_csv',
Example #28
              , `account`
              , `phone`
              , cast(concat(`billing_period`,'-01') as DATE)
              , cast(`pay_date` as                     DATE)
              , `sum`
            from
                pryzhov.stg_payment
            where
                year(cast(`pay_date` as DATE)) = {{ execution_date.year }}
        '''

    ods = DataProcHiveOperator(
        task_id='ods_%s' % table,
        dag=dag,
        query=query,
        job_name='%s_{{ execution_date.year }}_ods_%s_{{ params.job_suffix }}'
        % (USERNAME, table),
        params={"job_suffix": randint(0, 100000)},
        cluster_name='cluster-dataproc',
        region='europe-west3',
    )
    load_op >> ods

    if table == 'traffic':
        dm = DataProcHiveOperator(
            task_id='dm_%s' % table,
            dag=dag,
            query='''
                insert overwrite table pryzhov.dm_traffic partition (year={{ execution_date.year }})
                select
                    `user_id`
                    , max(`bytes_received`)