def test_hyphen_args_note_id(self):
    dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

    with dag:
        task = QuboleOperator(task_id=TASK_ID, command_type='sparkcmd',
                              note_id="123", dag=dag)

    self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[0],
                     "--note-id=123")
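# The test methods in these snippets assume a module-level harness roughly like the
# sketch below. The imports follow the Airflow 1.x contrib layout; the constant values
# (DAG_ID, TASK_ID, DEFAULT_CONN, TEMPLATE_CONN, DEFAULT_DATE) are illustrative
# assumptions, not values taken from the snippets themselves.
from datetime import datetime

from airflow.models import DAG
from airflow.contrib.hooks.qubole_hook import QuboleHook
from airflow.contrib.operators.qubole_operator import QuboleOperator

DAG_ID = 'qubole_test_dag'            # assumed
TASK_ID = 'test_qubole_task'          # assumed
DEFAULT_CONN = 'qubole_default'       # assumed default connection id
TEMPLATE_CONN = 'my_conn_id'          # assumed templated connection id
DEFAULT_DATE = datetime(2017, 1, 1)   # assumed start date for the test DAG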
def test_get_hook(self):
    dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

    with dag:
        task = QuboleOperator(task_id=TASK_ID, command_type='hivecmd', dag=dag)

    hook = task.get_hook()
    self.assertEqual(hook.__class__, QuboleHook)
def test_position_args_parameters(self):
    dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

    with dag:
        task = QuboleOperator(task_id=TASK_ID, command_type='pigcmd',
                              parameters="key1=value1 key2=value2", dag=dag)

    self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[1],
                     "key1=value1")
    self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[2],
                     "key2=value2")

    # Use a distinct task_id for the second task; re-using TASK_ID in the same
    # DAG would trigger a duplicate-task-id error.
    task = QuboleOperator(task_id=TASK_ID + "_1", command_type='hadoopcmd',
                          sub_command="s3distcp --src s3n://airflow/source_hadoopcmd " +
                                      "--dest s3n://airflow/destination_hadoopcmd",
                          dag=dag)

    self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[1],
                     "s3distcp")
    self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[2],
                     "--src")
    self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[3],
                     "s3n://airflow/source_hadoopcmd")
    self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[4],
                     "--dest")
    self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[5],
                     "s3n://airflow/destination_hadoopcmd")
def test_init_with_template_connection(self):
    dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

    with dag:
        task = QuboleOperator(task_id=TASK_ID, dag=dag,
                              qubole_conn_id="{{ dag_run.conf['qubole_conn_id'] }}")

    result = task.render_template('qubole_conn_id', "{{ qubole_conn_id }}",
                                  {'qubole_conn_id': TEMPLATE_CONN})

    self.assertEqual(task.task_id, TASK_ID)
    self.assertEqual(result, TEMPLATE_CONN)
def test_position_args_parameters(self):
    dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

    with dag:
        task = QuboleOperator(task_id=TASK_ID, command_type='pigcmd',
                              parameters="key1=value1 key2=value2", dag=dag)

    self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[1],
                     "key1=value1")
    self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[2],
                     "key2=value2")
def compare_result(ds, **kwargs):
    ti = kwargs['ti']
    r1 = t1.get_results(ti)
    r2 = t2.get_results(ti)
    return filecmp.cmp(r1, r2)


t1 = QuboleOperator(
    task_id='hive_show_table',
    command_type='hivecmd',
    query='show tables',
    cluster_label='{{ params.cluster_label }}',
    fetch_logs=True,
    # If fetch_logs=True, the Qubole command logs are fetched and concatenated
    # into the corresponding Airflow task logs
    tags='airflow_example_run',
    # Tags attached to the Qubole command; dag_id, task_id and run_id are auto-attached
    qubole_conn_id='qubole_default',
    # Connection id used to submit commands inside QDS; "qubole_default" is used if not set
    dag=dag,
    params={
        'cluster_label': 'default',
    }
)

t2 = QuboleOperator(
    task_id='hive_s3_location',
    command_type="hivecmd",
    script_location="s3n://public-qubole/qbol-library/scripts/show_table.hql",
    notify=True,
    tags=['tag1', 'tag2'],
default_args = {
    'owner': 'REPLACE',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),
    'email': ['REPLACE'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('Workshop-ML-Model-REPLACE', default_args=default_args)

start = DummyOperator(task_id='start', dag=dag)

ingestData = QuboleOperator(task_id='ingestData',
                            command_type="sparkcmd",
                            note_id="1271",
                            qubole_conn_id='qubole_default',
                            dag=dag)

analyze_data = QuboleOperator(task_id='analyze_data',
                              command_type="sparkcmd",
                              note_id="1274",
                              qubole_conn_id='qubole_default',
                              dag=dag)

# Spark Command - Run a Notebook
build_dashboards = QuboleOperator(task_id='build_dashboards',
                                  command_type="sparkcmd",
                                  note_id="1273",
                                  qubole_conn_id='qubole_default',
                                  dag=dag)
DAG_DEFAULTS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2018, 1, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

# Pass the defaults via default_args (the second positional argument of DAG is the
# description), and avoid spaces in the dag_id
dag = DAG('import_and_analyze_movies', default_args=DAG_DEFAULTS)

start = DummyOperator(task_id='start', dag=dag)

import_from_web = QuboleOperator(task_id='import_web_data',
                                 command_type="shellcmd",
                                 script=web_import_script,
                                 dag=dag)

make_hive_table = QuboleOperator(
    task_id='create_movies_table',
    command_type='hivecmd',
    query=create_hive_table,
    cluster_label='default',
    tags='create_movies_table',
    # Tags attached to the Qubole command; dag_id, task_id and run_id are auto-attached
    qubole_conn_id='qubole_default',
    # Connection ID to submit commands inside QDS; "qubole_default" is used if not set
    dag=dag)

query_hive_table = QuboleOperator(
    task_id='count_movies_table',
def test_position_args_parameters(self):
    dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

    with dag:
        task = QuboleOperator(task_id=TASK_ID, command_type='pigcmd',
                              parameters="key1=value1 key2=value2", dag=dag)

    self.assertEqual(
        task.get_hook().create_cmd_args({'run_id': 'dummy'})[1], "key1=value1")
    self.assertEqual(
        task.get_hook().create_cmd_args({'run_id': 'dummy'})[2], "key2=value2")

    cmd = "s3distcp --src s3n://airflow/source_hadoopcmd --dest s3n://airflow/destination_hadoopcmd"
    task = QuboleOperator(task_id=TASK_ID + "_1", command_type='hadoopcmd',
                          dag=dag, sub_command=cmd)

    self.assertEqual(
        task.get_hook().create_cmd_args({'run_id': 'dummy'})[1], "s3distcp")
    self.assertEqual(
        task.get_hook().create_cmd_args({'run_id': 'dummy'})[2], "--src")
    self.assertEqual(
        task.get_hook().create_cmd_args({'run_id': 'dummy'})[3],
        "s3n://airflow/source_hadoopcmd")
    self.assertEqual(
        task.get_hook().create_cmd_args({'run_id': 'dummy'})[4], "--dest")
    self.assertEqual(
        task.get_hook().create_cmd_args({'run_id': 'dummy'})[5],
        "s3n://airflow/destination_hadoopcmd")
# Scala program submitted to QDS as a sparkcmd; the QuboleOperator below passes it
# via program=prog
prog = '''
import scala.math.random

import org.apache.spark._

object SparkPi {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Spark Pi")
    val spark = new SparkContext(conf)
    val slices = if (args.length > 0) args(0).toInt else 2
    val n = math.min(100000L * slices, Int.MaxValue).toInt  // avoid overflow
    val count = spark.parallelize(1 until n, slices).map { i =>
      val x = random * 2 - 1
      val y = random * 2 - 1
      if (x * x + y * y < 1) 1 else 0
    }.reduce(_ + _)
    println("Pi is roughly " + 4.0 * count / n)
    spark.stop()
  }
}
'''

dag = DAG('SampleDag', default_args=default_args,
          description='A simple tutorial DAG jmsample',
          schedule_interval=timedelta(days=1))

# Spark Command - Scala Program
QuboleOperator(task_id='spark_cmd',
               command_type="sparkcmd",
               program=prog,
               language='scala',
               arguments='--class SparkPi',
               cluster_label='spark-baseline',
               qubole_conn_id='qubole_default',
               dag=dag)
def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'


python_task = PythonOperator(task_id='python_task',
                             provide_context=True,
                             python_callable=print_context,
                             dag=dag)

qubole_task = QuboleOperator(
    task_id='qubole_shell_command',
    command_type='shellcmd',
    script="sleep 60",
    cluster_label='default',
    fetch_logs=True,
    # If True, Qubole command logs are fetched and concatenated into the
    # corresponding Airflow task logs
    tags='airflow_example_run',
    # Tags attached to the Qubole command; dag_id, task_id and run_id are auto-attached
    qubole_conn_id='qubole_default',
    # Connection id to submit commands inside QDS; "qubole_default" is used if not set
    dag=dag)

bash_task = BashOperator(
    task_id='bash_task',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    pool='test',
    dag=dag)

http_sensor_task = HttpSensor(task_id='http_sensor_task',
                              http_conn_id='http_default',
                              endpoint='',
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': seven_days_ago,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False
}

dag = DAG('process_static_data', default_args=default_args,
          schedule_interval='@once')

t1 = QuboleOperator(
    task_id='create_page_table',
    command_type='hivecmd',
    query="DROP TABLE if exists page; \
        CREATE EXTERNAL TABLE page (page_id BIGINT, page_latest BIGINT, page_title STRING) \
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' \
        LOCATION 's3n://paid-qubole/default-datasets/demotrends/page/';",
    dag=dag)

t2 = QuboleOperator(
    task_id='create_redirect_table',
    command_type='hivecmd',
    query="DROP TABLE if exists redirect; \
        CREATE EXTERNAL TABLE redirect(rd_from BIGINT, page_title STRING) \
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' \
        LOCATION 's3n://paid-qubole/default-datasets/demotrends/redirect/';",
    dag=dag)

join = DummyOperator(
    task_id='join',
"parallelism": 1 }, { "name": "products", "parallelism": 1 }, { "name": "salesperson", "parallelism": 1 }] } import_table_array = [] start = DummyOperator(task_id='start', dag=dag) for table in conf['tables']: task_id = "db_import_%s" % (table['name']) import_table_array.append( QuboleOperator(task_id=task_id, command_type='dbimportcmd', mode=1, db_name="", hive_table=table['name'], db_table=table['name'], parallelism=table['parallelism'], dbtap_id=Variable.get(conf['db_tap_variable']), dag=dag)) end = DummyOperator(task_id='end', dag=dag) start >> import_table_array >> end
dag = DAG('ecommerce-airflow-demo', default_args=default_args,
          schedule_interval='@daily')
dag.doc_md = __doc__

# Task = start
start = DummyOperator(task_id='start', dag=dag)

# Task = cleanup (cleanup schema)
# start ---> cleanup (cleanup schema)
cleanup = QuboleOperator(
    task_id='hive_schema_cleanup',
    command_type='hivecmd',
    script_location="s3n://uwddefbucket/scripts/ecommerce_schema_cleanup.hql",
    cluster_label='hadoop2',
    tags='airflow_example_run',
    # Tags attached to the Qubole command; dag_id, task_id and run_id are auto-attached
    qubole_conn_id='qubole_default',
    # Connection id to submit commands inside QDS; "qubole_default" is used if not set
    dag=dag)

start.set_downstream(cleanup)

# Task = t1 (create schema)
# cleanup ---> t1 (create schemas)
t1 = QuboleOperator(
    task_id='hive_create_schema',
    command_type='hivecmd',
    script_location="s3n://uwddefbucket/scripts/ecommerce_create_schema.hql",
    cluster_label='hadoop2',
    tags='airflow_example_run',
    # Tags attached to the Qubole command; dag_id, task_id and run_id are auto-attached
def test_init_with_default_connection(self):
    op = QuboleOperator(task_id=TASK_ID)

    self.assertEqual(op.task_id, TASK_ID)
    self.assertEqual(op.qubole_conn_id, DEFAULT_CONN)
branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)

join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag
)

t4 = QuboleOperator(
    task_id='hadoop_jar_cmd',
    command_type='hadoopcmd',
    sub_command='jar s3://paid-qubole/HadoopAPIExamples/jars/hadoop-0.20.1-dev-streaming.jar -mapper wc -numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/data/3.tsv -output s3://paid-qubole/HadoopAPITests/data/3_wc',
    cluster_label='default',
    fetch_logs=True,
    dag=dag)

t5 = QuboleOperator(
    task_id='pig_cmd',
    command_type="pigcmd",
    script_location="s3://public-qubole/qbol-library/scripts/script1-hadoop-s3-small.pig",
    parameters="key1=value1 key2=value2",
    trigger_rule="all_done",
    dag=dag)

t4.set_upstream(branching)
t5.set_upstream(t4)
t5.set_downstream(join)
default_args = {
    'start_date': datetime(2016, 8, 1),
    'end_date': datetime(2016, 8, 5),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False
}

dag = DAG('process_daily_data', default_args=default_args,
          schedule_interval='@daily')

def_loc = 's3://BUCKET/DEF_LOC'

t1 = QuboleOperator(
    task_id='fetch_pagecount_data',
    command_type="shellcmd",
    script="wget -r https://dumps.wikimedia.org/other/pagecounts-raw/{{ ds.split('-')[0] }}/{{ '{0}-{1}'.format(ds.split('-')[0], ds.split('-')[1]) }} \
        -P pagecounts/{{ ds }} -A pagecounts-{{ macros.ds_format(ds, '%%Y-%%m-%%d', '%%Y%%m%%d') }}-*.gz; \
        s3cmd -c /usr/lib/hustler/s3cfg sync pagecounts/{{ ds }} %s/wikitrends/pagecounts/" % (def_loc),
    dag=dag)

t2 = QuboleOperator(
    task_id='create_pagecount_table',
    command_type="hivecmd",
    query="CREATE EXTERNAL TABLE IF NOT EXISTS pagecounts (`group` STRING, page_title STRING, `views` BIGINT, bytes_sent BIGINT) \
        PARTITIONED BY (`date` STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' LOCATION '{0}/demotrends/pagecounts/'; \
        ALTER TABLE pagecounts recover partitions;".format(def_loc),
    dag=dag)

t3 = QuboleOperator(
    task_id='create_filtered_pagecounts_table',