def test_invalid_cmd_type(self):
    """
    Tests that an AirflowException is raised when cmd_type is
    neither 'import' nor 'export'.
    """
    operator = SqoopOperator(task_id='sqoop_job',
                             dag=self.dag,
                             cmd_type='invalid')

    with self.assertRaises(AirflowException):
        operator.execute({})
def test_invalid_import_options(self):
    """
    Tests that an AirflowException is raised when both a query and a
    table are passed to an import command.
    """
    import_query_and_table_configs = self._config.copy()
    import_query_and_table_configs['cmd_type'] = 'import'

    operator = SqoopOperator(task_id='sqoop_job',
                             dag=self.dag,
                             **import_query_and_table_configs)

    with self.assertRaises(AirflowException):
        operator.execute({})
def test_execute(self, conn_id='sqoop_default'):
    operator = SqoopOperator(task_id='sqoop_job',
                             dag=self.dag,
                             **self._config)

    self.assertEqual(conn_id, operator.conn_id)
    self.assertEqual(self._config['cmd_type'], operator.cmd_type)
    self.assertEqual(self._config['table'], operator.table)
    self.assertEqual(self._config['target_dir'], operator.target_dir)
    self.assertEqual(self._config['append'], operator.append)
    self.assertEqual(self._config['file_type'], operator.file_type)
    self.assertEqual(self._config['num_mappers'], operator.num_mappers)
    self.assertEqual(self._config['split_by'], operator.split_by)
    self.assertEqual(self._config['input_null_string'],
                     operator.input_null_string)
    self.assertEqual(self._config['input_null_non_string'],
                     operator.input_null_non_string)
    self.assertEqual(self._config['staging_table'], operator.staging_table)
    self.assertEqual(self._config['clear_staging_table'],
                     operator.clear_staging_table)
    self.assertEqual(self._config['batch'], operator.batch)
    self.assertEqual(self._config['relaxed_isolation'],
                     operator.relaxed_isolation)
    self.assertEqual(self._config['direct'], operator.direct)
    self.assertEqual(self._config['driver'], operator.driver)
    self.assertEqual(self._config['properties'], operator.properties)
def test_execute(self):
    """
    Tests that the SqoopOperator's attributes match the values passed
    in from the config.
    """
    operator = SqoopOperator(task_id='sqoop_job',
                             dag=self.dag,
                             **self._config)

    self.assertEqual(self._config['conn_id'], operator.conn_id)
    self.assertEqual(self._config['query'], operator.query)
    self.assertEqual(self._config['cmd_type'], operator.cmd_type)
    self.assertEqual(self._config['table'], operator.table)
    self.assertEqual(self._config['target_dir'], operator.target_dir)
    self.assertEqual(self._config['append'], operator.append)
    self.assertEqual(self._config['file_type'], operator.file_type)
    self.assertEqual(self._config['num_mappers'], operator.num_mappers)
    self.assertEqual(self._config['split_by'], operator.split_by)
    self.assertEqual(self._config['input_null_string'],
                     operator.input_null_string)
    self.assertEqual(self._config['input_null_non_string'],
                     operator.input_null_non_string)
    self.assertEqual(self._config['staging_table'], operator.staging_table)
    self.assertEqual(self._config['clear_staging_table'],
                     operator.clear_staging_table)
    self.assertEqual(self._config['batch'], operator.batch)
    self.assertEqual(self._config['relaxed_isolation'],
                     operator.relaxed_isolation)
    self.assertEqual(self._config['direct'], operator.direct)
    self.assertEqual(self._config['driver'], operator.driver)
    self.assertEqual(self._config['properties'], operator.properties)
    self.assertEqual(self._config['hcatalog_database'],
                     operator.hcatalog_database)
    self.assertEqual(self._config['hcatalog_table'], operator.hcatalog_table)
    self.assertEqual(self._config['create_hcatalog_table'],
                     operator.create_hcatalog_table)
    self.assertEqual(self._config['extra_import_options'],
                     operator.extra_import_options)
    self.assertEqual(self._config['extra_export_options'],
                     operator.extra_export_options)

    # the following are meant to be more of examples
    sqoop_import_op = SqoopOperator(
        task_id='sqoop_import_using_table',
        cmd_type='import',
        conn_id='sqoop_default',
        table='company',
        verbose=True,
        num_mappers=8,
        hcatalog_database='default',
        hcatalog_table='import_table_1',
        create_hcatalog_table=True,
        extra_import_options={
            'hcatalog-storage-stanza': "\"stored as orcfile\""
        },
        dag=self.dag)

    sqoop_import_op_qry = SqoopOperator(
        task_id='sqoop_import_using_query',
        cmd_type='import',
        conn_id='sqoop_default',
        query='select name, age from company where $CONDITIONS',
        # the mappers fill in $CONDITIONS based on the field you split by
        split_by='age',
        verbose=True,
        num_mappers=None,
        hcatalog_database='default',
        hcatalog_table='import_table_2',
        create_hcatalog_table=True,
        extra_import_options={
            'hcatalog-storage-stanza': "\"stored as orcfile\""
        },
        dag=self.dag)

    sqoop_import_op_with_partition = SqoopOperator(
        task_id='sqoop_import_with_partition',
        cmd_type='import',
        conn_id='sqoop_default',
        table='company',
        verbose=True,
        num_mappers=None,
        hcatalog_database='default',
        hcatalog_table='import_table_3',
        create_hcatalog_table=True,
        extra_import_options={
            'hcatalog-storage-stanza': "\"stored as orcfile\"",
            'hive-partition-key': 'day',
            'hive-partition-value': '2017-10-18'
        },
        dag=self.dag)

    sqoop_export_op_name = SqoopOperator(
        task_id='sqoop_export_tablename',
        cmd_type='export',
        conn_id='sqoop_default',
        table='rbdms_export_table_1',
        verbose=True,
        num_mappers=None,
        hcatalog_database='default',
        hcatalog_table='hive_export_table_1',
        extra_export_options=None,
        dag=self.dag)

    sqoop_export_op_path = SqoopOperator(
        task_id='sqoop_export_tablepath',
        cmd_type='export',
        conn_id='sqoop_default',
        table='rbdms_export_table_2',
        export_dir='/user/hive/warehouse/export_table_2',
        direct=True,  # speeds up data transfer
        verbose=True,
        num_mappers=None,
        extra_export_options=None,
        dag=self.dag)
# imports added so the snippet is self-contained; these are the Airflow 1.x-era
# module paths used by the other examples in this collection
import os

from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator
from airflow.contrib.operators.sqoop_operator import SqoopOperator


def hook(dag, conn_id, tables, staging_dir='/tmp/airflow', staging_db=None,
         **options):
    """Build a Sqoop import pipeline that lands each table in a Hive staging
    database and then rewrites it into its target storage format."""
    staging_db = staging_db or 'staging_%s' % conn_id

    create_staging_db = HiveOperator(
        task_id='create_staging_db',
        hql='create database if not exists %s;' % staging_db,
        dag=dag)

    create_staging_dir = BashOperator(
        task_id='create_staging_dir',
        bash_command='hdfs dfs -mkdir -p %s' % staging_dir,
        dag=dag)

    for tbl in tables:
        # defaults for each table config; 'name' and 'split_by' must be
        # provided by the caller, and 'hive-database' must not be None
        table = {
            'hive-database': None,
            'hive-table': None,
            'mappers': 1,
            'direct': False,
            'format': 'parquet',
            'format-options': None,
            'partition_fields': [],
            'bucket_fields': []
        }
        table.update(tbl)

        assert table['hive-database'] is not None
        if table['hive-table'] is None:
            table['hive-table'] = table['name']

        staging_tbl_dir = os.path.join(staging_dir, conn_id, table['name'])

        clean_sqoop_staging = BashOperator(
            task_id=('clean_sqoop_staging_dir.%s' % table['name']).lower(),
            bash_command='hdfs dfs -rm -R -f %s' % staging_tbl_dir,
            dag=dag)

        clean_staging_tbl = HiveOperator(
            task_id=('clean_staging_table.%s' % table['name']).lower(),
            hql='drop table if exists %(staging_db)s.%(staging_tbl)s' % {
                'staging_db': staging_db,
                'staging_tbl': table['name']
            },
            dag=dag)

        sqoop = SqoopOperator(
            task_id=('sqoop.%s' % table['name']).lower(),
            conn_id=conn_id,
            table=table['name'],
            split_by=table['split_by'],
            num_mappers=table['mappers'],
            direct=table['direct'],
            target_dir=staging_tbl_dir,
            extra_import_options={
                'hive-import': '',
                'hive-database': staging_db,
                'hive-table': table['name'],
                'hive-delims-replacement': ' ',
                'temporary-rootdir': staging_dir,
            },
            dag=dag)

        # build the CREATE TABLE ... STORED AS <format> statement for the
        # final table
        create_statement = 'create table %s.%s_tmp\n' % (
            table['hive-database'], table['hive-table'])
        create_statement += 'stored as %s\n' % table['format']
        format_opts = table.get('format-options', None)
        if format_opts:
            create_statement += '%s\n' % format_opts

        convert_to_parquet = HiveOperator(
            task_id=('hive_convert_format.%s' % table['name']).lower(),
            hql=('create database if not exists %(dst_db)s;\n'
                 'drop table if exists %(dst_db)s.%(dst_tbl)s_tmp;\n'
                 '%(create_statement)s'
                 'as select * from %(staging_db)s.%(staging_tbl)s;\n'
                 'drop table if exists %(dst_db)s.%(dst_tbl)s;\n'
                 'alter table %(dst_db)s.%(dst_tbl)s_tmp '
                 'rename to %(dst_db)s.%(dst_tbl)s;\n') % {
                     'dst_db': table['hive-database'],
                     'dst_tbl': table['hive-table'],
                     'staging_db': staging_db,
                     'staging_tbl': table['name'],
                     'create_statement': create_statement
                 },
            dag=dag)

        clean_staging_tbl.set_upstream(create_staging_db)
        clean_sqoop_staging.set_upstream(create_staging_dir)
        sqoop.set_upstream(clean_sqoop_staging)
        sqoop.set_upstream(clean_staging_tbl)
        convert_to_parquet.set_upstream(sqoop)
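A minimal usage sketch for the hook() helper above. The DAG id, connection id, and table settings are illustrative assumptions, not taken from the original snippet; 'hive-database' and 'split_by' are included because hook() requires them for each table.

from datetime import datetime

from airflow import DAG

# hypothetical invocation of hook(); all ids and values below are assumptions
example_dag = DAG('mysql_to_hive_staging',
                  default_args={'owner': 'airflow',
                                'start_date': datetime(2019, 3, 11)},
                  schedule_interval='@daily')

hook(example_dag,
     conn_id='mysql_default',
     tables=[{
         'name': 'customer',            # source table, also the staging table name
         'hive-database': 'analytics',  # required: destination Hive database
         'split_by': 'id',              # required: column the Sqoop mappers split on
         'mappers': 4,
     }])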
from airflow import DAG
from airflow.operators.hive_operator import HiveOperator
from airflow.contrib.operators.sqoop_operator import SqoopOperator
from datetime import datetime, timedelta

default_args = {'owner': 'Talha', 'start_date': datetime(2019, 3, 11)}

dag = DAG('my_pipeline_v2',
          default_args=default_args,
          schedule_interval='@once')

sqoop = SqoopOperator(
    conn_id='docker_mysql',
    task_id='sqoop_task',
    query='"SELECT * FROM training.customer WHERE $CONDITIONS"',
    target_dir='hdfs:///training/customer/data',
    cmd_type='import',  # SqoopOperator takes cmd_type, not import_type
    num_mappers=1,
    dag=dag)

hive_load = HiveOperator(
    hql="LOAD DATA INPATH 'hdfs:///training/customer/data' "
        "OVERWRITE INTO TABLE training.customer",
    schema='training',
    task_id='hive_load',
    dag=dag)

hive_transformation = HiveOperator(
    hql='SELECT cust_city, AVG(PAYMENT_AMT) FROM training.customer '
        'GROUP BY cust_city',
    schema='training',
    # the original snippet is truncated here; task_id and dag are assumed
    task_id='hive_transformation',
    dag=dag)
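The DAG above defines its three tasks but no ordering between them; a likely wiring, assumed here rather than taken from the original, chains the Sqoop import, the Hive load, and the transformation in sequence.

# assumed task ordering: import first, then load into Hive, then transform
sqoop >> hive_load >> hive_transformation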