def test_invalid_cmd_type(self):
    """
    Tests that an AirflowException is raised when cmd_type is neither 'import' nor 'export'.
    """
    operator = SqoopOperator(task_id='sqoop_job', dag=self.dag,
                             cmd_type='invalid')
    with self.assertRaises(AirflowException):
        operator.execute({})
def test_invalid_import_options(self):
    """
    Tests that an AirflowException is raised when both a query and a table are passed for an import.
    """
    import_query_and_table_configs = self._config.copy()
    import_query_and_table_configs['cmd_type'] = 'import'
    operator = SqoopOperator(task_id='sqoop_job',
                             dag=self.dag,
                             **import_query_and_table_configs)
    with self.assertRaises(AirflowException):
        operator.execute({})
    def test_execute(self, conn_id='sqoop_default'):
        """
        Tests that the operator attributes match the values passed in from the config.
        """
        operator = SqoopOperator(task_id='sqoop_job',
                                 dag=self.dag,
                                 **self._config)

        self.assertEqual(conn_id, operator.conn_id)

        self.assertEqual(self._config['cmd_type'], operator.cmd_type)
        self.assertEqual(self._config['table'], operator.table)
        self.assertEqual(self._config['target_dir'], operator.target_dir)
        self.assertEqual(self._config['append'], operator.append)
        self.assertEqual(self._config['file_type'], operator.file_type)
        self.assertEqual(self._config['num_mappers'], operator.num_mappers)
        self.assertEqual(self._config['split_by'], operator.split_by)
        self.assertEqual(self._config['input_null_string'],
                         operator.input_null_string)
        self.assertEqual(self._config['input_null_non_string'],
                         operator.input_null_non_string)
        self.assertEqual(self._config['staging_table'], operator.staging_table)
        self.assertEqual(self._config['clear_staging_table'],
                         operator.clear_staging_table)
        self.assertEqual(self._config['batch'], operator.batch)
        self.assertEqual(self._config['relaxed_isolation'],
                         operator.relaxed_isolation)
        self.assertEqual(self._config['direct'], operator.direct)
        self.assertEqual(self._config['driver'], operator.driver)
        self.assertEqual(self._config['properties'], operator.properties)
    def test_execute(self):
        """
        Tests that the SqoopOperator attribute values match those passed in from the config.
        """
        operator = SqoopOperator(task_id='sqoop_job',
                                 dag=self.dag,
                                 **self._config)

        self.assertEqual(self._config['conn_id'], operator.conn_id)
        self.assertEqual(self._config['query'], operator.query)
        self.assertEqual(self._config['cmd_type'], operator.cmd_type)
        self.assertEqual(self._config['table'], operator.table)
        self.assertEqual(self._config['target_dir'], operator.target_dir)
        self.assertEqual(self._config['append'], operator.append)
        self.assertEqual(self._config['file_type'], operator.file_type)
        self.assertEqual(self._config['num_mappers'], operator.num_mappers)
        self.assertEqual(self._config['split_by'], operator.split_by)
        self.assertEqual(self._config['input_null_string'],
                         operator.input_null_string)
        self.assertEqual(self._config['input_null_non_string'],
                         operator.input_null_non_string)
        self.assertEqual(self._config['staging_table'], operator.staging_table)
        self.assertEqual(self._config['clear_staging_table'],
                         operator.clear_staging_table)
        self.assertEqual(self._config['batch'], operator.batch)
        self.assertEqual(self._config['relaxed_isolation'],
                         operator.relaxed_isolation)
        self.assertEqual(self._config['direct'], operator.direct)
        self.assertEqual(self._config['driver'], operator.driver)
        self.assertEqual(self._config['properties'], operator.properties)
        self.assertEqual(self._config['hcatalog_database'],
                         operator.hcatalog_database)
        self.assertEqual(self._config['hcatalog_table'],
                         operator.hcatalog_table)
        self.assertEqual(self._config['create_hcatalog_table'],
                         operator.create_hcatalog_table)
        self.assertEqual(self._config['extra_import_options'],
                         operator.extra_import_options)
        self.assertEqual(self._config['extra_export_options'],
                         operator.extra_export_options)

        # the following operators are intended more as usage examples of SqoopOperator
        sqoop_import_op = SqoopOperator(task_id='sqoop_import_using_table',
                                        cmd_type='import',
                                        conn_id='sqoop_default',
                                        table='company',
                                        verbose=True,
                                        num_mappers=8,
                                        hcatalog_database='default',
                                        hcatalog_table='import_table_1',
                                        create_hcatalog_table=True,
                                        extra_import_options={
                                            'hcatalog-storage-stanza':
                                            "\"stored as orcfile\""
                                        },
                                        dag=self.dag)

        sqoop_import_op_qry = SqoopOperator(
            task_id='sqoop_import_using_query',
            cmd_type='import',
            conn_id='sqoop_default',
            query='select name, age from company where $CONDITIONS',
            # the mappers will pass values into $CONDITIONS based on the field you select to split by
            split_by='age',
            verbose=True,
            num_mappers=None,
            hcatalog_database='default',
            hcatalog_table='import_table_2',
            create_hcatalog_table=True,
            extra_import_options={
                'hcatalog-storage-stanza': "\"stored as orcfile\""
            },
            dag=self.dag)

        sqoop_import_op_with_partition = SqoopOperator(
            task_id='sqoop_import_with_partition',
            cmd_type='import',
            conn_id='sqoop_default',
            table='company',
            verbose=True,
            num_mappers=None,
            hcatalog_database='default',
            hcatalog_table='import_table_3',
            create_hcatalog_table=True,
            extra_import_options={
                'hcatalog-storage-stanza': "\"stored as orcfile\"",
                'hive-partition-key': 'day',
                'hive-partition-value': '2017-10-18'
            },
            dag=self.dag)

        sqoop_export_op_name = SqoopOperator(
            task_id='sqoop_export_tablename',
            cmd_type='export',
            conn_id='sqoop_default',
            table='rbdms_export_table_1',
            verbose=True,
            num_mappers=None,
            hcatalog_database='default',
            hcatalog_table='hive_export_table_1',
            extra_export_options=None,
            dag=self.dag)

        sqoop_export_op_path = SqoopOperator(
            task_id='sqoop_export_tablepath',
            cmd_type='export',
            conn_id='sqoop_default',
            table='rbdms_export_table_2',
            export_dir='/user/hive/warehouse/export_table_2',
            direct=True,  # use the database's direct export path to speed up the transfer
            verbose=True,
            num_mappers=None,
            extra_export_options=None,
            dag=self.dag)
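
The test methods above reference self.dag and self._config without showing how they are built. The sketch below is a hedged guess at a setUp() that would satisfy those assertions, inferred only from the keys they read; every concrete value is an illustrative assumption, not taken from the original test suite.

import unittest
from datetime import datetime

from airflow import DAG


class TestSqoopOperator(unittest.TestCase):  # hypothetical test-case wrapper
    def setUp(self):
        self.dag = DAG('test_dag_id',
                       default_args={'owner': 'airflow',
                                     'start_date': datetime(2017, 1, 1)})
        # every key below is read by the assertions above; the values are made up
        self._config = {
            'conn_id': 'sqoop_default',
            'cmd_type': 'export',
            'table': 'target_table',
            'query': 'SELECT * FROM schema.table',
            'target_dir': '/path/on/hdfs/to/dir',
            'append': True,
            'file_type': 'avro',
            'num_mappers': None,
            'split_by': None,
            'input_null_string': '\\n',
            'input_null_non_string': '\\t',
            'staging_table': 'target_table_staging',
            'clear_staging_table': True,
            'batch': True,
            'relaxed_isolation': True,
            'direct': True,
            'driver': 'com.example.jdbc.Driver',
            'properties': {'mapred.map.max.attempts': '1'},
            'hcatalog_database': 'hive_database',
            'hcatalog_table': 'hive_table',
            'create_hcatalog_table': True,
            'extra_import_options': {'hcatalog-storage-stanza': 'stored as orcfile'},
            'extra_export_options': {'skip-dist-cache': ''},
        }
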
import os

# imports assume the Airflow 1.x module layout used elsewhere in these examples
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator
from airflow.contrib.operators.sqoop_operator import SqoopOperator


def hook(dag,
         conn_id,
         tables,
         staging_dir='/tmp/airflow',
         staging_db=None,
         **options):
    """
    Attach Sqoop import tasks to the given DAG: each table in `tables` is imported
    into a Hive staging database and then converted into its target Hive table.
    """

    staging_db = staging_db or 'staging_%s' % conn_id

    create_staging_db = HiveOperator(task_id='create_staging_db',
                                     hql='create database if not exists %s;' %
                                     staging_db,
                                     dag=dag)

    create_staging_dir = BashOperator(task_id='create_staging_dir',
                                      bash_command='hdfs dfs -mkdir -p %s' %
                                      staging_dir,
                                      dag=dag)

    for tbl in tables:
        table = {
            'hive-database': None,
            'hive-table': None,
            'mappers': 1,
            'direct': False,
            'format': 'parquet',
            'format-options': None,
            'partition_fields': [],
            'bucket_fields': []
        }
        table.update(tbl)
        assert table['hive-database'] is not None
        if table['hive-table'] is None:
            table['hive-table'] = table['name']

        staging_tbl_dir = os.path.join(staging_dir, conn_id, table['name'])

        clean_sqoop_staging = BashOperator(
            task_id=('clean_sqoop_staging_dir.%s' % (table['name'])).lower(),
            bash_command='hdfs dfs -rm -R -f %s' % staging_tbl_dir,
            dag=dag)

        clean_staging_tbl = HiveOperator(
            task_id=('clean_staging_table.%s' % (table['name'])).lower(),
            hql='''drop table if exists %(staging_db)s.%(staging_tbl)s''' % {
                'staging_db': staging_db,
                'staging_tbl': table['name']
            },
            dag=dag)

        sqoop = SqoopOperator(task_id=('sqoop.%s' % (table['name'])).lower(),
                              conn_id=conn_id,
                              table=table['name'],
                              split_by=table['split_by'],
                              num_mappers=table['mappers'],
                              direct=table['direct'],
                              target_dir=staging_tbl_dir,
                              extra_import_options={
                                  'hive-import': '',
                                  'hive-database': staging_db,
                                  'hive-table': table['name'],
                                  'hive-delims-replacement': ' ',
                                  'temporary-rootdir': staging_dir,
                              },
                              dag=dag)

        create_statement = ('create table %s.%s_tmp\n') % (
            table['hive-database'], table['hive-table'])

        create_statement += 'stored as %s\n' % table['format']

        format_opts = table.get('format-options', None)
        if format_opts:
            create_statement += '%s\n' % format_opts

        convert_to_parquet = HiveOperator(
            task_id=('hive_convert_format.%s' % (table['name'])).lower(),
            hql=
            ('create database if not exists %(dst_db)s;\n'
             'drop table if exists %(dst_db)s.%(dst_tbl)s_tmp;\n'
             '%(create_statement)s'
             'as select * from %(staging_db)s.%(staging_tbl)s;\n'
             'drop table if exists %(dst_db)s.%(dst_tbl)s;\n'
             'alter table %(dst_db)s.%(dst_tbl)s_tmp rename to  %(dst_db)s.%(dst_tbl)s;\n'
             ) % {
                 'dst_db': table['hive-database'],
                 'dst_tbl': table['hive-table'],
                 'staging_db': staging_db,
                 'staging_tbl': table['name'],
                 'create_statement': create_statement
             },
            dag=dag)

        clean_staging_tbl.set_upstream(create_staging_db)
        clean_sqoop_staging.set_upstream(create_staging_dir)
        sqoop.set_upstream(clean_sqoop_staging)
        sqoop.set_upstream(clean_staging_tbl)
        convert_to_parquet.set_upstream(sqoop)
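
A minimal sketch of how the hook() helper above might be wired into a DAG; the connection id, database names and table settings below are illustrative assumptions rather than values taken from the original example.

from datetime import datetime

from airflow import DAG

dag = DAG('sqoop_staging_example',  # hypothetical DAG purely for illustration
          default_args={'owner': 'airflow', 'start_date': datetime(2017, 10, 1)},
          schedule_interval='@daily')

hook(dag,
     conn_id='mysql_default',      # assumed Airflow connection to the source database
     tables=[{
         'name': 'company',        # source table to import
         'split_by': 'id',         # required by hook(): column Sqoop splits mapper work on
         'hive-database': 'dwh',   # required by hook(): destination Hive database
     }])
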
Example #10
from airflow import DAG
from airflow.operators.hive_operator import HiveOperator
from airflow.contrib.operators.sqoop_operator import SqoopOperator
from datetime import datetime, timedelta

default_args = {'owner': 'Talha', 'start_date': datetime(2019, 3, 11)}

dag = DAG('my_pipeline_v2',
          default_args=default_args,
          schedule_interval='@once')

sqoop = SqoopOperator(
    conn_id='docker_mysql',
    task_id='sqoop_task',
    query='"SELECT * FROM training.customer WHERE $CONDITIONS"',
    target_dir='hdfs:///training/customer/data',
    cmd_type='import',
    dag=dag,
    num_mappers=1)

hive_load = HiveOperator(
    hql=
    'LOAD DATA INPATH \'hdfs:///training/customer/data\' OVERWRITE INTO TABLE training.customer',
    schema='training',
    task_id='hive_load',
    dag=dag)

hive_transformation = HiveOperator(
    hql=
    'SELECT cust_city, AVG(PAYMENT_AMT) FROM training.customer GROUP BY cust_city',
    schema='training',