Example No. 1
 def test_hiveconf_jinja_translate(self):
     hql = "SELECT ${num_col} FROM ${hiveconf:table};"
     t = HiveOperator(
         hiveconf_jinja_translate=True,
         task_id='dry_run_basic_hql', hql=hql, dag=self.dag)
     t.prepare_template()
     self.assertEqual(t.hql, "SELECT {{ num_col }} FROM {{ table }};")
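# A minimal standalone sketch of the same behaviour outside the test harness;
# the DAG id, table name, and HQL here are illustrative assumptions. With
# hiveconf_jinja_translate=True the operator rewrites ${var} and
# ${hiveconf:var} references into Jinja expressions in prepare_template(), so
# Airflow macros such as ds can be substituted when the template is rendered.
from airflow import DAG
from airflow.operators.hive_operator import HiveOperator
from airflow.utils import timezone

sketch_dag = DAG('hiveconf_translate_sketch',
                 start_date=timezone.datetime(2020, 1, 1),
                 schedule_interval='@daily')

translated = HiveOperator(
    task_id='translated_hql',
    hql="SELECT * FROM ${hiveconf:table} WHERE dt = '${ds}';",
    hiveconf_jinja_translate=True,
    dag=sketch_dag)

translated.prepare_template()
# translated.hql is now: SELECT * FROM {{ table }} WHERE dt = '{{ ds }}';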
Example No. 2
 def setUp(self):
     configuration.load_test_config()
     args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
     self.dag = DAG('test_dag_id', default_args=args)
     self.next_day = (DEFAULT_DATE +
                      datetime.timedelta(days=1)).isoformat()[:10]
     self.database = 'airflow'
     self.partition_by = 'ds'
     self.table = 'static_babynames_partitioned'
     self.hql = """
     CREATE DATABASE IF NOT EXISTS {{ params.database }};
     USE {{ params.database }};
     DROP TABLE IF EXISTS {{ params.table }};
     CREATE TABLE IF NOT EXISTS {{ params.table }} (
         state string,
         year string,
         name string,
         gender string,
         num int)
     PARTITIONED BY ({{ params.partition_by }} string);
     ALTER TABLE {{ params.table }}
     ADD PARTITION({{ params.partition_by }}='{{ ds }}');
     """
     self.hook = HiveMetastoreHook()
     t = HiveOperator(
         task_id='HiveHook_' + str(random.randint(1, 10000)),
         params={
             'database': self.database,
             'table': self.table,
             'partition_by': self.partition_by
         },
         hive_cli_conn_id='beeline_default',
         hql=self.hql, dag=self.dag)
     t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
           ignore_ti_state=True)
Example No. 3
 def setUp(self):
     configuration.load_test_config()
     self._upload_dataframe()
     args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
     self.dag = DAG('test_dag_id', default_args=args)
     self.database = 'airflow'
     self.table = 'hive_server_hook'
     self.hql = """
     CREATE DATABASE IF NOT EXISTS {{ params.database }};
     USE {{ params.database }};
     DROP TABLE IF EXISTS {{ params.table }};
     CREATE TABLE IF NOT EXISTS {{ params.table }} (
         a int,
         b int)
     ROW FORMAT DELIMITED
     FIELDS TERMINATED BY ',';
     LOAD DATA LOCAL INPATH '{{ params.csv_path }}'
     OVERWRITE INTO TABLE {{ params.table }};
     """
     self.columns = ['{}.a'.format(self.table),
                     '{}.b'.format(self.table)]
     self.hook = HiveMetastoreHook()
     t = HiveOperator(
         task_id='HiveHook_' + str(random.randint(1, 10000)),
         params={
             'database': self.database,
             'table': self.table,
             'csv_path': self.local_path
         },
         hive_cli_conn_id='beeline_default',
         hql=self.hql, dag=self.dag)
     t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
           ignore_ti_state=True)
Example No. 4
 def test_hive_queues(self):
     t = HiveOperator(
         task_id='test_hive_queues', hql=self.hql,
         mapred_queue='default', mapred_queue_priority='HIGH',
         mapred_job_name='airflow.test_hive_queues',
         dag=self.dag)
     t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
           ignore_ti_state=True)
Example No. 5
 def test_hiveconf(self):
     hql = "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});"
     t = HiveOperator(
         hiveconfs={'table': 'static_babynames', 'day': '{{ ds }}'},
         task_id='dry_run_basic_hql', hql=hql, dag=self.dag)
     t.prepare_template()
     self.assertEqual(
         t.hql,
         "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});")
Example No. 6
    def test_hive_airflow_default_config_queue_override(self):
        specific_mapred_queue = 'default'
        t = HiveOperator(
            task_id='test_default_config_queue',
            hql=self.hql,
            mapred_queue=specific_mapred_queue,
            mapred_queue_priority='HIGH',
            mapred_job_name='airflow.test_default_config_queue',
            dag=self.dag)

        self.assertEqual(t.get_hook().mapred_queue, specific_mapred_queue)
Example No. 7
    def test_hive_airflow_default_config_queue(self):
        t = HiveOperator(
            task_id='test_default_config_queue',
            hql=self.hql,
            mapred_queue_priority='HIGH',
            mapred_job_name='airflow.test_default_config_queue',
            dag=self.dag)

        # just check that the correct default value in test_default.cfg is used
        test_config_hive_mapred_queue = configuration.conf.get(
            'hive',
            'default_hive_mapred_queue'
        )
        self.assertEqual(t.get_hook().mapred_queue, test_config_hive_mapred_queue)
Example No. 8
    def test_mapred_job_name(self, mock_get_hook):
        mock_hook = mock.MagicMock()
        mock_get_hook.return_value = mock_hook
        t = HiveOperator(
            task_id='test_mapred_job_name',
            hql=self.hql,
            dag=self.dag)

        fake_execution_date = timezone.datetime(2018, 6, 19)
        fake_ti = TaskInstance(task=t, execution_date=fake_execution_date)
        fake_ti.hostname = 'fake_hostname'
        fake_context = {'ti': fake_ti}

        t.execute(fake_context)
        self.assertEqual(
            "Airflow HiveOperator task for {}.{}.{}.{}"
            .format(fake_ti.hostname,
                    self.dag.dag_id, t.task_id,
                    fake_execution_date.isoformat()), mock_hook.mapred_job_name)
Example No. 9
    bash_command='hadoop fs -rm -r -f ' + wf_data_path + '/dt={{ yesterday_ds }}/_SUCCESS',
    dag=dag
)

delete_hdfs_file_2 = BashOperator(
    task_id='delete_hdfs_file_2',
    bash_command='hadoop fs -rm -r -f ' + wf_data_path + '/dt={{ ds }}/_SUCCESS',
    dag=dag
)

af_adt_advertiser_revenue_report_ag_hql = HiveOperator(
    task_id='af_adt_advertiser_revenue_report_ag_hql',
    hive_cli_conn_id='hiveserver2_default',
    depends_on_past=True,
    hql=open(af_hql_path + "/hql_adt_advertiser_revenue_report_ag.hql", 'r').read().replace("hivevar", "hiveconf"),
    hiveconfs={'data_date': '{{ ds }}', 'data_date_before': '{{ yesterday_ds }}',
               'data_date_last_hour': '{{ (execution_date - macros.timedelta(hours=3)).strftime("%Y-%m-%d") }}',
               'data_date_next_hour': '{{ (execution_date + macros.timedelta(hours=3)).strftime("%Y-%m-%d") }}'},
    params={'wf_data_path': wf_data_path, 'date_enable': True},
    dag=dag
)

"""
    Create the SUCCESS file
"""
create_hdfs_file = BashOperator(
    task_id='create_hdfs_file',
    bash_command='hadoop fs -touchz ' + wf_data_path + '/dt={{ yesterday_ds }}/_SUCCESS',
    dag=dag
)
Example No. 10
 def test_beeline(self):
     t = HiveOperator(
         task_id='beeline_hql', hive_cli_conn_id='beeline_default',
         hql=self.hql, dag=self.dag)
     t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
           ignore_ti_state=True)
Example No. 11
            'db': obus_table.get('db'),
            'table': obus_table.get('table'),
            'conn': obus_table.get('conn')
        },
        dag=dag
    )

    '''
    Add Hive table partitions
    '''
    add_partitions = HiveOperator(
        task_id='add_partitions_{}'.format(hive_table.format(bs=obus_table.get('table'))),
        priority_weight=obus_table.get('priority_weight'),
        hql='''
            ALTER TABLE {hive_db}.{table} ADD IF NOT EXISTS PARTITION (country_code='nal', dt='{{{{ ds }}}}')
        '''.format(
            hive_db=hive_db,
            table=hive_table.format(bs=obus_table.get('table'))
        ),
        schema=hive_db,
        dag=dag
    )

    '''
    Check whether the data import is correct
    '''
    validate_all_data = PythonOperator(
        task_id='validate_data_{}'.format(hive_table.format(bs=obus_table.get('table'))),
        priority_weight=obus_table.get('priority_weight'),
        python_callable=validata_data,
        provide_context=True,
        op_kwargs={
Example No. 12
dag = DAG('dw_commerce_contract_d',
          default_args=default_args,
          schedule_interval='4 1 * * *')

start = SqlSensor(
    task_id='start',
    conn_id='etl_db',
    sql=
    "SELECT * FROM etl.signal WHERE name='dw_sso_basic_d' AND value='{{ macros.ds(ti) }}';",
    dag=dag)

# contract

del_partiton_stg_contract = HiveOperator(
    task_id='del_partiton_stg_contract',
    hive_cli_conn_id='spark_thrift',
    hql=
    "alter table stg.newuuabc_contract drop if exists PARTITION (etl_date='{{ macros.ds(ti) }}');\n ",
    dag=dag)

src_stg_contract = BashOperator(
    task_id='src_stg_contract',
    bash_command=
    'dataship extract uuold_newuuabc.contract {{ macros.ds(ti) }} {{ macros.tomorrow_ds(ti) }}',
    pool="embulk_pool",
    dag=dag)

add_partiton_stg_contract = HiveOperator(
    task_id='add_partiton_stg_contract',
    hive_cli_conn_id='spark_thrift',
    hql=
    "alter table stg.newuuabc_contract add PARTITION (etl_date='{{ macros.ds(ti) }}');\n ",

Example No. 13
dag = airflow.DAG('init_datavault2_example',
                  schedule_interval="@once",
                  default_args=args,
                  template_searchpath='/usr/local/airflow/sql',
                  max_active_runs=1)

t1 = PythonOperator(task_id='init_datavault2_example',
                    python_callable=init_datavault2_example,
                    provide_context=False,
                    dag=dag)

t2 = HiveOperator(
    task_id='create_stg_database',
    hive_cli_conn_id='hive_default',
    schema='default',
    hql='CREATE DATABASE IF NOT EXISTS {0}'.format(ADVWORKS_STAGING),
    dag=dag)

t3 = HiveOperator(task_id='create_dv_database',
                  hive_cli_conn_id='hive_default',
                  schema='default',
                  hql='CREATE DATABASE IF NOT EXISTS {0}'.format(DATAVAULT),
                  dag=dag)

t4 = HiveOperator(task_id='create_dv_temp',
                  hive_cli_conn_id='hive_default',
                  schema='default',
                  hql='CREATE DATABASE IF NOT EXISTS {0}'.format(DV_TEMP),
                  dag=dag)
Example No. 14
dag = airflow.DAG('okash_hourly',
                  schedule_interval="10 * * * *",
                  default_args=args)

check_client_file = OssSensor(
    task_id='check_client_file',
    bucket_key=
    'okash/okash/{table}/dt={{{{ ds }}}}/hour={{{{ execution_date.strftime("%H") }}}}/'
    .format(table='client'),
    bucket_name='okash',
    timeout=3600,
    dag=dag)

add_client_partitions = HiveOperator(task_id='add_client_partitions',
                                     hql="""
            ALTER TABLE ods_log_client_hi ADD IF NOT EXISTS PARTITION (dt = '{{ ds }}', hour = '{{ execution_date.strftime("%H") }}');
    """,
                                     schema='okash_dw',
                                     dag=dag)

export_to_mysql = HiveToMySqlTransfer(task_id='export_to_mysql',
                                      sql="""
            --SET mapreduce.job.queuename=root.airflow;
            SELECT
                *,
                from_unixtime(unix_timestamp())
            FROM
                okash_dw.ods_log_client_hi
            WHERE
                dt='{{ ds }}'
                AND hour='{{ execution_date.strftime("%H") }}'
        """,
Example No. 15
# - parents table

dag = DAG('dw_student_basic_d',
          default_args=default_args,
          schedule_interval='0 1 * * *')

start = SqlSensor(
    task_id='start',
    conn_id='src_main_db',
    sql=
    "SELECT * FROM restore_tracking.restore_log WHERE date(restored_time) = current_date;",
    dag=dag)

del_partiton_stg_parents = HiveOperator(
    task_id='del_partiton_stg_parents',
    hive_cli_conn_id='spark_thrift',
    hql=
    "alter table stg.newuuabc_parents drop if exists PARTITION (etl_date='{{ macros.ds(ti) }}');\n ",
    dag=dag)

src_stg_parents = BashOperator(
    task_id='src_stg_parents',
    bash_command=
    'dataship extract uuold_newuuabc.parents {{ macros.ds(ti) }} {{ macros.tomorrow_ds(ti) }}',
    pool="embulk_pool",
    dag=dag)

add_partiton_stg_parents = HiveOperator(
    task_id='add_partiton_stg_parents',
    hive_cli_conn_id='spark_thrift',
    hql=
    "alter table stg.newuuabc_parents add PARTITION (etl_date='{{ macros.ds(ti) }}');\n ",
Example No. 16
create_obus_client_event = HiveOperator(task_id='create_obus_client_event',
                                        hql="""
        CREATE TABLE IF NOT EXISTS `ods_log_client_event`(
            `ip` string,
            `server_ip` string,
            `timestamp` bigint,
            `common` struct<
                `user_id`:string,
                `user_number`:string,
                `client_timestamp`:string,
                `platform`:string,
                `os_version`:string,
                `app_name`:string,
                `app_version`:string,
                `locale`:string,
                `device_id`:string,
                `device_screen`:string,
                `device_model`:string,
                `device_manufacturer`:string,
                `is_root`:string,
                `channel`:string,
                `subchannel`:string,
                `gaid`:string,
                `appsflyer_id`:string
            >,
            `events` Array<
                struct<
                    `event_time`:string,
                    `event_name`:string,
                    `page`:string,
                    `source`:string,
                    `event_value`:string
                >
            >
        )
        PARTITIONED BY (
            `dt` string,
            `hour` string)
        ROW FORMAT SERDE
            'org.openx.data.jsonserde.JsonSerDe'
        with SERDEPROPERTIES("ignore.malformed.json"="true")
        LOCATION 's3a://opay-bi/obus_buried/obdm.client_event'
    """,
                                        schema='obus_dw_ods',
                                        dag=dag)
Example No. 17
dag = airflow.DAG('oride_daily',
                  schedule_interval="00 03 * * *",
                  default_args=args)

create_oride_driver_overview = HiveOperator(
    task_id='create_oride_driver_overview',
    hql="""
        CREATE TABLE IF NOT EXISTS oride_driver_overview(
          driver_id bigint,
          order_status int,
          payment_mode int,
          payment_status int,
          order_num_total bigint,
          price_total decimal(10,2),
          distance_total bigint,
          duration_total bigint
        )
        PARTITIONED BY (
            dt STRING
        )
        STORED AS PARQUET

    """,
    schema='dashboard',
    dag=dag)

insert_oride_driver_overview = HiveOperator(
    task_id='insert_oride_driver_overview',
    hql="""
        INSERT OVERWRITE TABLE oride_driver_overview PARTITION (dt='{{ ds }}')
Example No. 18
 def test_hive_dryrun(self):
     t = HiveOperator(task_id='dry_run_basic_hql',
                      hql=self.hql,
                      dag=self.dag)
     t.dry_run()
Example No. 19
hql = """
    CREATE EXTERNAL TABLE IF NOT EXISTS dwdii2.noaa_temps 
    (
        station string,
        year int,
        jan string,
        feb string,
        mar string,
        apr string,
        may string,
        jun string,
        jul string,
        aug string,
        sep string,
        oct string,
        nov string,
        dec string
    ) PARTITIONED BY (exdt string)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
    STORED AS TEXTFILE
    LOCATION '{{ params.hdfs_destination }}';
"""

createTable = HiveOperator(task_id='hive_create_ext_table',
                           hql=hql,
                           params=s3fetch_params,
                           dag=dag)

t1.set_upstream(cleanHdfs)
createTable.set_upstream(t1)
Example No. 20
from settings import default_args


dag = DAG('dw_sso_basic_d', default_args=default_args,
          schedule_interval='1 1 * * *')

start = SqlSensor(
    task_id='start',
    conn_id='src_main_db',
    sql="SELECT * FROM restore_tracking.restore_log WHERE date(restored_time) = current_date;",
    dag=dag)

# bk_user_info
delpart_stg_bk_user_info = HiveOperator(
    task_id='delpart_stg_bk_user_info',
    hive_cli_conn_id='spark_thrift',
    hql="alter table stg.sishu_bk_user_info drop if exists PARTITION (etl_date='{{ macros.ds(ti) }}');\n ",
    dag=dag)

stg_bk_user_info = BashOperator(
    task_id='stg_bk_user_info',
    bash_command='dataship extract uuold_sishu.bk_user_info {{ macros.ds(ti) }} {{ macros.tomorrow_ds(ti) }}',
    pool="embulk_pool",
    dag=dag)

addpart_stg_bk_user_info = HiveOperator(
    task_id='addpart_stg_bk_user_info',
    hive_cli_conn_id='spark_thrift',
    hql="alter table stg.sishu_bk_user_info add PARTITION (etl_date='{{ macros.ds(ti) }}');\n ",
    dag=dag)
Example No. 21
def hook(dag,
         conn_id,
         tables,
         staging_dir='/tmp/airflow',
         staging_db=None,
         **options):

    staging_db = staging_db or 'staging_%s' % conn_id

    create_staging_db = HiveOperator(task_id='create_staging_db',
                                     hql='create database if not exists %s;' %
                                     staging_db,
                                     dag=dag)

    create_staging_dir = BashOperator(task_id='create_staging_dir',
                                      bash_command='hdfs dfs -mkdir -p %s' %
                                      staging_dir,
                                      dag=dag)

    for tbl in tables:
        table = {
            'hive-database': None,
            'hive-table': None,
            'mappers': 1,
            'direct': False,
            'format': 'parquet',
            'format-options': None,
            'partition_fields': [],
            'bucket_fields': []
        }
        table.update(tbl)
        assert table['hive-database'] is not None
        if table['hive-table'] is None:
            table['hive-table'] = table['name']

        staging_tbl_dir = os.path.join(staging_dir, conn_id, table['name'])

        clean_sqoop_staging = BashOperator(
            task_id=('clean_sqoop_staging_dir.%s' % (table['name'])).lower(),
            bash_command='hdfs dfs -rm -R -f %s' % staging_tbl_dir,
            dag=dag)

        clean_staging_tbl = HiveOperator(
            task_id=('clean_staging_table.%s' % (table['name'])).lower(),
            hql='''drop table if exists %(staging_db)s.%(staging_tbl)s''' % {
                'staging_db': staging_db,
                'staging_tbl': table['name']
            },
            dag=dag)

        sqoop = SqoopOperator(task_id=('sqoop.%s' % (table['name'])).lower(),
                              conn_id=conn_id,
                              table=table['name'],
                              split_by=table['split_by'],
                              num_mappers=table['mappers'],
                              direct=table['direct'],
                              target_dir=staging_tbl_dir,
                              extra_import_options={
                                  'hive-import': '',
                                  'hive-database': staging_db,
                                  'hive-table': table['name'],
                                  'hive-delims-replacement': ' ',
                                  'temporary-rootdir': staging_dir,
                              },
                              dag=dag)

        create_statement = ('create table %s.%s_tmp\n') % (
            table['hive-database'], table['hive-table'])

        create_statement += 'stored as %s\n' % table['format']

        format_opts = table.get('format-options', None)
        if format_opts:
            create_statement += '%s\n' % format_opts

        convert_to_parquet = HiveOperator(
            task_id=('hive_convert_format.%s' % (table['name'])).lower(),
            hql=
            ('create database if not exists %(dst_db)s;\n'
             'drop table if exists %(dst_db)s.%(dst_tbl)s_tmp;\n'
             '%(create_statement)s'
             'as select * from %(staging_db)s.%(staging_tbl)s;\n'
             'drop table if exists %(dst_db)s.%(dst_tbl)s;\n'
             'alter table %(dst_db)s.%(dst_tbl)s_tmp rename to  %(dst_db)s.%(dst_tbl)s;\n'
             ) % {
                 'dst_db': table['hive-database'],
                 'dst_tbl': table['hive-table'],
                 'staging_db': staging_db,
                 'staging_tbl': table['name'],
                 'create_statement': create_statement
             },
            dag=dag)

        clean_staging_tbl.set_upstream(create_staging_db)
        clean_sqoop_staging.set_upstream(create_staging_dir)
        sqoop.set_upstream(clean_sqoop_staging)
        sqoop.set_upstream(clean_staging_tbl)
        convert_to_parquet.set_upstream(sqoop)
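# A hedged usage sketch for the hook() helper above; the connection id, table
# name, and split_by column are illustrative assumptions, and the imports used
# inside hook() (os, BashOperator, HiveOperator, SqoopOperator) are assumed to
# be present at the top of the original module.
from airflow import DAG
from airflow.utils import timezone

ingest_dag = DAG('sqoop_ingest_sketch',
                 start_date=timezone.datetime(2020, 1, 1),
                 schedule_interval='@daily')

hook(ingest_dag,
     conn_id='my_mysql_conn',
     tables=[{
         'name': 'orders',              # source table to import with Sqoop
         'hive-database': 'warehouse',  # destination Hive database (required)
         'split_by': 'order_id',        # column Sqoop splits mappers on
         'mappers': 4,
     }])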
Example No. 22
CREATE EXTERNAL TABLE IF NOT EXISTS airflow_hive(
`Global_new_confirmed` int,
`Global_new_deaths` int,
`Global_new_recovered` int,
`Global_total_confirmed` int,
`Global_total_deaths` int,
`Global_total_recovered` int,
`Country_code` string,
`Country_name` string,
`Country_new_deaths` int,
`Country_new_recovered` int,
`Country_newconfirmed` int,
`Country_slug` string,
`Country_total_confirmed` int,
`Country_total_deaths` int,
`Country_total_recovered` int,
`Extracted_timestamp` timestamp);
insert into table airflow_hive select * from corona;
"""

hive_task = HiveOperator(hql = hql_query,
  task_id = "hive_script_task",
  hive_cli_conn_id = "hive_local",
  dag = dag_hive
)

hive_task

if __name__ == '__main__':
  dag_hive.cli()
Example No. 23
    task_id='upload_to_hdfs',
    bash_command=
    f'hdfs dfs -put -f {DATA_FOLDER}/products-with-good-columns.csv /products-with-good-columns.csv',
    dag=dag,
)

# Create Hive table
create_product_lookup_table = HiveOperator(
    task_id='create_product_lookup_table',
    hive_cli_conn_id='my_hive_conn',
    hql='''
        CREATE TABLE IF NOT EXISTS dim_product_lookup (
            UPC          STRING, 
            DESCRIPTION  STRING, 
            MANUFACTURER STRING, 
            CATEGORY     STRING, 
            SUB_CATEGORY STRING, 
            PRODUCT_SIZE STRING
        )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ','
        STORED AS TEXTFILE
    ''',
    dag=dag,
)

# Load data in Hive table
load_data_to_hive_table = HiveOperator(
    task_id='load_data_to_hive_table',
    hive_cli_conn_id='my_hive_conn',
    hql='''
        LOAD DATA INPATH '/products-with-good-columns.csv' OVERWRITE INTO TABLE dim_product_lookup;
Example No. 24
            hql += str(line)
            if not line:
                break
    print("___________")
    print(hql)
    print("_____________________________________________________")
    return hql
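# The top of read_hql_file() above was cut off; a hedged reconstruction of the
# usual shape of such a helper (the exact signature and log separators are
# assumptions):
def read_hql_file(hqlfilepath):
    """Read an HQL script from disk line by line and return it as one string."""
    hql = ""
    with open(hqlfilepath, 'r') as f:
        while True:
            line = f.readline()
            hql += str(line)
            if not line:
                break
    print(hql)
    return hql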


args['pool'] = 'pool_dw'  # set the pool parameter separately for the sub DAG
with DAG(
        # DAG_NAME,
        dag_id='test_dag',
        default_args=args,
        schedule_interval=None,
        description='dw test') as dag:
    dummy_dw_mysqlapp = DummyOperator(task_id="DW_Tasks_Start")
    dag >> dummy_dw_mysqlapp
    last_task = dummy_dw_mysqlapp
    for hqlfilepath in dw_list:
        task_id = hqlfilepath.split("/")[-1].split(".")[0]
        print('task_id ', task_id)
        current_task = HiveOperator(hive_cli_conn_id='hive_cli_emr',
                                    task_id=task_id,
                                    hiveconf_jinja_translate=True,
                                    hql=read_hql_file(hqlfilepath),
                                    trigger_rule='all_done',
                                    dag=dag)
        current_task.set_upstream(last_task)
        last_task = current_task
Example No. 25
import reddit_scrape, clean_data

from airflow import DAG
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator

default_args = {"start_date": datetime(2020, 1, 1), "owner": "jmce619"}

with DAG(dag_id="subreddit_dag",
         schedule_interval="@daily",
         default_args=default_args,
         catchup=False) as dag:

    subreddit_scrape = PythonOperator(task_id="subreddit_scrape",
                                      python_callable=reddit_scrape.main)

    data_clean = PythonOperator(task_id="data_clean",
                                python_callable=clean_data.main)

    storing_posts = BashOperator(
        task_id="storing_posts",
        bash_command='hadoop fs -put -f /tmp/subreddit_clean.csv /tmp/')

    loading_posts = HiveOperator(
        task_id="loading_posts",
        hql='LOAD DATA INPATH "/tmp/subreddit_clean.csv" INTO TABLE subreddit')

    subreddit_scrape >> data_clean >> storing_posts >> loading_posts
Example No. 26
    partition_names=[
        'fact_transactions/execution_date={{ macros.ds_add(ds, -1) }}'
    ],
    metastore_conn_id='my_hive_metastore_conn',
    poke_interval=30,
    dag=dag,
)

create_product_transactions_table = HiveOperator(
    task_id='create_product_transactions_table',
    hive_cli_conn_id='my_hive_conn',
    hql='''
        CREATE TABLE IF NOT EXISTS zkan_product_transactions (
            product_description STRING,
            price               DECIMAL(10, 2),
            units               INT,
            visits              INT
        )
        PARTITIONED BY (execution_date DATE)
        ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
        STORED AS TEXTFILE;
    ''',
    dag=dag,
)

add_new_product_transactions = HiveOperator(
    task_id='add_new_product_transactions',
    hive_cli_conn_id='my_hive_conn',
    hiveconfs={'hive.exec.dynamic.partition.mode': 'nonstrict'},
    hql='''
        INSERT INTO TABLE zkan_product_transactions
        SELECT dim_product_lookup.description,
Example No. 27
args = {
    'owner': 'Airflow',
    'start_date': airflow.utils.dates.days_ago(1),
}

dag_prjt_main = DAG(
    dag_id=DAG_NAME,
    default_args=args,
    schedule_interval='* * * * *' #"@once"
)

SQOOP_Task1 = BashOperator(task_id="Sqoop",
                      bash_command='~/sqoop-1.4.7.bin__hadoop-2.6.0/bin/sqoop job --exec Sqoop_weblogdetails_test37', dag=dag_prjt_main)

hive_cmd= """use test1;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000;
insert into weblog_partiton_table partition(host) select id, datevalue, ipaddress, url, responsecode, host from weblog_external as a where not exists(select b.id from weblog_partiton_table as b where a.id = b.id);"""

hive_part = HiveOperator(hive_cli_conn_id='hive_cli_default', hql=hive_cmd, task_id = 'Hive', dag=dag_prjt_main)

finish_task = DummyOperator(task_id="finaltask", dag=dag_prjt_main)

SQOOP_Task1 >> hive_part >> finish_task


if __name__ == '__main__':
    dag_prjt_main.cli()

Example No. 28
create_hdfs_spotify_raw_categories_dir = HdfsMkdirFileOperator(
    task_id='create_hdfs_spotify_raw_categories_dir',
    directory='/user/hadoop/spotify/rawCategories',
    hdfs_conn_id='hdfs',
    dag=dag,
)
create_hdfs_spotify_final_dir = HdfsMkdirFileOperator(
    task_id='create_hdfs_spotify_final_dir',
    directory='/user/hadoop/spotify/final',
    hdfs_conn_id='hdfs',
    dag=dag,
)

########################################################### S Q L ################################################################
createTableRawPlaylist = HiveOperator(task_id='create_table_raw_playlist',
                                      hql=hiveSql_createTableRawPlaylist,
                                      hive_cli_conn_id='beeline',
                                      dag=dag)

createTableRawTrackData = HiveOperator(task_id='createTableRawTrackData',
                                       hql=hiveSql_createTableRawTrackData,
                                       hive_cli_conn_id='beeline',
                                       dag=dag)

createTableRawCategorie = HiveOperator(task_id='createTableRawCategorie',
                                       hql=hiveSQL_createTableRawCategories,
                                       hive_cli_conn_id='beeline',
                                       dag=dag)

createTableFinal = HiveOperator(task_id='createTableFinal',
                                hql=hiveSQL_createTableFinal,
                                hive_cli_conn_id='beeline',
Example No. 29
    # check whether the formatted file exists
    waiting_file_formated = FileSensor(
        task_id="waiting_file_formated",
        fs_conn_id="fs_default",
        filepath=LOCAL_PATH + FILE_FORMATED,
        poke_interval=5)
    
    # Test => docker-compose -f docker-compose.yml run --rm webserver airflow test trade_etanol_anidro move_file_formated 2020-04-02
    # move the formatted file to the formatted folder
    move_file_formated = PythonOperator(
        task_id="move_file_formated",
        python_callable=move_file_formated,
        provide_context=True,
        op_kwargs={'conn_id': 'ftp_servidor'})
    
    # Test => docker-compose -f docker-compose.yml run --rm webserver airflow test trade_etanol_anidro load_file_formated 2020-04-02
    # print the formatted file in the log
    load_file_formated = PythonOperator(
        task_id="load_file_formated",
        python_callable=print_file_formated,
        provide_context=True)
    
    # Test => docker-compose -f docker-compose.yml run --rm webserver airflow test trade_etanol_anidro transfer_into_hive 2020-04-02
    # 
    transfer_into_hive = HiveOperator(
        task_id="transfer_into_hive",
        hql="LOAD DATA IN PATH '/tmp/trade_etanol_anidro_formated.csv' INTO TABLE etanol")
        
    end_dag = DummyOperator(task_id='end_dag')
    
start_dag >> waiting_file_original >> execute_file_original >> waiting_file_formated >> [move_file_formated, load_file_formated] >> transfer_into_hive >> end_dag
Example No. 30
    path = 'oss://opay-datalake/ofood/client'

create_ods_log_client_event_hi = HiveOperator(
    task_id='create_ods_log_client_event_hi',
    hql='''
        CREATE EXTERNAL TABLE if not exists ods_log_client_event_hi (
            user_id bigint comment 'user ID',
            user_number string comment 'user number',
            client_timestamp int comment 'client timestamp',
            platform string comment 'platform (ios/android)',
            os_version string comment 'OS version',
            app_name string comment 'app name',
            app_version string comment 'app version',
            locale string comment 'locale',
            device_id string comment 'device ID',
            device_screen string comment 'device screen resolution',
            device_model string comment 'device model',
            device_manufacturer string comment 'device manufacturer',
            is_root string comment 'whether the device is rooted',
            event_name string comment 'event name',
            page string comment 'page',
            event_values string comment 'event content'
        )
        PARTITIONED BY (`dt` string, `hour` string) 
        ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' with SERDEPROPERTIES("ignore.malformed.json"="true")
        LOCATION '{location_path}'
    '''.format(location_path=path),
    schema='ofood_dw_ods',
    dag=dag)

ods_log_client_event_hi_partition = HiveOperator(
Example No. 31
 def test_hive(self):
     t = HiveOperator(
         task_id='basic_hql', hql=self.hql, dag=self.dag)
     t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
           ignore_ti_state=True)
Example No. 32
        task_id='check_table_{}'.format(hive_table_name),
        priority_weight=priority_weight_nm,
        python_callable=run_check_table,
        provide_context=True,
        op_kwargs={
            'db_name': db_name,
            'table_name': table_name,
            'conn_id': conn_id,
            'hive_table_name': hive_table_name
        },
        dag=dag)
    # add partitions
    add_partitions = HiveOperator(
        task_id='add_partitions_{}'.format(hive_table_name),
        priority_weight=priority_weight_nm,
        hql='''
                ALTER TABLE {table} ADD IF NOT EXISTS PARTITION (dt = '{{{{ ds }}}}')
            '''.format(table=hive_table_name),
        schema=HIVE_DB,
        dag=dag)

    validate_all_data = PythonOperator(
        task_id='validate_data_{}'.format(hive_table_name),
        priority_weight=priority_weight_nm,
        python_callable=validata_data,
        provide_context=True,
        op_kwargs={
            'db': HIVE_DB,
            'table_name': hive_table_name,
            'table_format': HIVE_TABLE,
            'table_core_list': table_core_list,
            'table_not_core_list': table_not_core_list
Example No. 33
 def test_hive_dryrun(self):
     t = HiveOperator(
         task_id='dry_run_basic_hql', hql=self.hql, dag=self.dag)
     t.dry_run()
Example No. 34
dag = DAG('dw_tchops_basic_d',
          default_args=default_args,
          schedule_interval='7 1 * * *')

start = SqlSensor(
    task_id='start',
    conn_id='src_main_db',
    sql=
    "SELECT * FROM restore_tracking.restore_log WHERE date(restored_time) = current_date;",
    dag=dag)

# teacher_signed_log
delpart_stg_teacher_signed_log = HiveOperator(
    task_id='delpart_stg_teacher_signed_log',
    hive_cli_conn_id='spark_thrift',
    hql=
    "alter table stg.teacher_contract_teacher_signed_log drop if exists PARTITION (etl_date='{{ macros.ds(ti) }}');\n ",
    dag=dag)

stg_teacher_signed_log = BashOperator(
    task_id='stg_teacher_signed_log',
    bash_command=
    'dataship extract uuold_teacher_contract.teacher_signed_log {{ macros.ds(ti) }} {{ macros.tomorrow_ds(ti) }}',
    pool="embulk_pool",
    dag=dag)

addpart_stg_teacher_signed_log = HiveOperator(
    task_id='addpart_stg_teacher_signed_log',
    hive_cli_conn_id='spark_thrift',
    hql=
    "alter table stg.teacher_contract_teacher_signed_log add PARTITION (etl_date='{{ macros.ds(ti) }}');\n ",
Example No. 35
 def test_hive(self):
     t = HiveOperator(task_id='basic_hql', hql=self.hql, dag=self.dag)
     t.run(start_date=DEFAULT_DATE,
           end_date=DEFAULT_DATE,
           ignore_ti_state=True)
Example No. 36
    bash_command=
    f'hdfs dfs -put -f {DATA_FOLDER}/stores-with-good-columns.csv {CLEANED_ZONE}/stores-with-good-columns.csv',
    dag=dag,
)

# Create Hive table
create_store_lookup_table = HiveOperator(
    task_id='create_store_lookup_table',
    hive_cli_conn_id='my_hive_conn',
    hql='''
        CREATE TABLE IF NOT EXISTS dim_store_lookup (
            store_id                 INT,
            store_name               VARCHAR(100),
            address_city_name        VARCHAR(300),
            address_state_prov_code  VARCHAR(2),
            msa_code                 VARCHAR(100),
            seg_value_name           VARCHAR(100),
            parking_space_qty        DECIMAL(38, 2),
            sales_area_size_num      INT,
            avg_weekly_baskets       DECIMAL(38, 2)
        )
        ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
        STORED AS TEXTFILE;
    ''',
    dag=dag,
)

# Load data in Hive table
load_data_to_hive_table = HiveOperator(
    task_id='load_data_to_hive_table',
    hive_cli_conn_id='my_hive_conn',
    hql=f'''
Example No. 37
from airflow import DAG
from airflow.contrib.sensors.file_sensor import FileSensor
from datetime import datetime
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator

import fetching_tweet, cleaning_tweet

default_args = {"start_date": datetime(2020, 1, 1), "owner": "airflow"}

with DAG(dag_id="twitter_dag",
         schedule_interval="@daily",
         default_args=default_args) as dag:
    waiting_for_tweets = FileSensor(task_id="waiting_for_tweets",
                                    fs_conn_id="fs_tweet",
                                    filepath="data.csv",
                                    poke_interval=5)
    fetching_tweets = PythonOperator(task_id="fetching_tweets",
                                     python_callable=fetching_tweet.main)
    cleaning_tweets = PythonOperator(task_id="cleaning_tweets",
                                     python_callable=cleaning_tweet.main)
    storing_tweets = BashOperator(
        task_id="storing_tweets",
        bash_command="hadoop fs -put -f /tmp/data_cleaned.csv /tmp/")
    loading_tweets = HiveOperator(
        task_id="loading_tweets",
        hql="LOAD DATA INPATH '/tmp/data_cleaned.csv' INTO TABLE tweets")
Example No. 38
    for channel in to_channels:

        file_name = "to_" + channel + "_" + yesterday.strftime(
            "%Y-%m-%d") + ".csv"

        load_to_hdfs = BashOperator(
            task_id="put_" + channel + "_to_hdfs",
            bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
            local_dir + file_name + " " + hdfs_dir + channel + "/")

        load_to_hdfs << analyze_tweets

        load_to_hive = HiveOperator(task_id="load_" + channel + "_to_hive",
                                    hql="LOAD DATA INPATH '" + hdfs_dir +
                                    channel + "/" + file_name + "' "
                                    "INTO TABLE " + channel + " "
                                    "PARTITION(dt='" + dt + "')")
        load_to_hive << load_to_hdfs
        load_to_hive >> hive_to_mysql

    for channel in from_channels:
        file_name = "from_" + channel + "_" + yesterday.strftime(
            "%Y-%m-%d") + ".csv"
        load_to_hdfs = BashOperator(
            task_id="put_" + channel + "_to_hdfs",
            bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
            local_dir + file_name + " " + hdfs_dir + channel + "/")

        load_to_hdfs << analyze_tweets
    file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"

    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name + " " +
                     hdfs_dir + channel + "/",
        dag=dag)

    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
        hql="LOAD DATA INPATH '" +
            hdfs_dir + channel + "/" + file_name + "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')",
        dag=dag)
    load_to_hive.set_upstream(load_to_hdfs)
    load_to_hive.set_downstream(hive_to_mysql)

for channel in from_channels:
    file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"
    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name + " " +
                     hdfs_dir + channel + "/",
        dag=dag)
Example No. 40
        bash_command="""
            hdfs dfs -mkdir -p /forex && \
            hdfs dfs -put -f $AIRFLOW_HOME/dags/files/forex_rates.json /forex
            """
    )

    creating_forex_rates_table = HiveOperator(
        task_id="creating_forex_rates_table",
        hive_cli_conn_id="hive_conn",
        hql="""
            CREATE EXTERNAL TABLE IF NOT EXISTS forex_rates(
                base STRING,
                last_update DATE,
                eur DOUBLE,
                usd DOUBLE,
                nzd DOUBLE,
                gbp DOUBLE,
                jpy DOUBLE,
                cad DOUBLE
                )
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            STORED AS TEXTFILE
        """
    )

    forex_processing = SparkSubmitOperator(
        task_id="forex_processing",
        conn_id="spark_conn",
        application="/usr/local/airflow/dags/scripts/forex_processing.py",
        verbose=False
Example No. 41
    file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"

    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name + " " +
                     hdfs_dir + channel + "/",
        dag=dag)

    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
        hql="LOAD DATA INPATH '" +
            hdfs_dir + channel + "/" + file_name + "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')",
        dag=dag)
    load_to_hive.set_upstream(load_to_hdfs)
    load_to_hive.set_downstream(hive_to_mysql)

for channel in from_channels:
    file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"
    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name + " " +
                     hdfs_dir + channel + "/",
        dag=dag)
Example No. 42
}

with DAG('twitter_dag_v2',
         start_date=datetime(2018, 10, 1),
         schedule_interval="@daily",
         default_args=DAG_DEFAULT_ARGS,
         catchup=False) as dag:
    waiting_file_task = FileSensor(
        task_id="waiting_file_task",
        fs_conn_id="fs_default",
        filepath="/home/airflow/airflow_files/data.csv",
        poke_interval=5)

    fetching_tweet_task = PythonOperator(task_id="fetching_tweet_task",
                                         python_callable=fetching_tweet.main)

    cleaning_tweet_task = PythonOperator(task_id="cleaning_tweet_task",
                                         python_callable=cleaning_tweet.main)

    load_into_hdfs_task = BashOperator(
        task_id="load_into_hdfs_task",
        bash_command="hadoop fs -put -f /tmp/data_cleaned.csv /tmp/")

    transfer_into_hive_task = HiveOperator(
        task_id="transfer_into_hive_task",
        hql=
        "LOAD DATA INPATH '/tmp/data_cleaned.csv' INTO TABLE tweets PARTITION(dt='2018-10-01')"
    )

    waiting_file_task >> fetching_tweet_task >> cleaning_tweet_task >> load_into_hdfs_task >> transfer_into_hive_task