def test_legacy_file_exist(self):
    """
    Test the legacy behaviour
    :return:
    """
    # Given
    logging.info("Test for existing file with the legacy behaviour")

    # When
    task = HdfsSensor(task_id='Should_be_file_legacy',
                      filepath='/datadirectory/datafile',
                      timeout=1,
                      retry_delay=timedelta(seconds=1),
                      poke_interval=1,
                      hook=self.hook)
    task.execute(None)

    # Then
    # Nothing is raised: the file exists, so execute() returns normally.
def test_legacy_file_does_not_exists(self):
    """
    Test the legacy behaviour
    :return:
    """
    # Given
    logging.info("Test for non existing file with the legacy behaviour")
    task = HdfsSensor(task_id='Should_not_be_file_legacy',
                      filepath='/datadirectory/not_existing_file_or_directory',
                      timeout=1,
                      retry_delay=timedelta(seconds=1),
                      poke_interval=1,
                      hook=self.hook)

    # When
    # Then
    with self.assertRaises(AirflowSensorTimeout):
        task.execute(None)
def test_legacy_file_exist_but_filesize(self):
    """
    Test the legacy behaviour with the filesize
    :return:
    """
    # Given
    logging.info("Test for existing file with the legacy behaviour")

    # When
    task = HdfsSensor(task_id='Should_be_file_legacy',
                      filepath='/datadirectory/datafile',
                      timeout=1,
                      file_size=20,
                      retry_delay=timedelta(seconds=1),
                      poke_interval=1,
                      hook=self.hook)

    # Then
    with self.assertRaises(AirflowSensorTimeout):
        task.execute(None)
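# The three legacy-behaviour tests above depend on imports and a `self.hook`
# fixture defined elsewhere in the test class. A minimal sketch of that harness
# follows; the FakeHDFSHook stub and the import paths are assumptions (they vary
# between Airflow versions), not the original test fixture.
import logging
import unittest
from datetime import timedelta

from airflow.exceptions import AirflowSensorTimeout
from airflow.sensors.hdfs_sensor import HdfsSensor  # airflow.operators.sensors in older releases


class FakeHDFSHook:
    """Hypothetical stand-in for HDFSHook that never touches a real cluster."""

    def __init__(self, hdfs_conn_id=None):
        self.hdfs_conn_id = hdfs_conn_id

    def get_conn(self):
        # Return an object exposing ls() like the snakebite client would;
        # the sensor's poke() calls self.hook(self.hdfs_conn_id).get_conn().
        raise NotImplementedError("supply a fake HDFS client for the paths under test")


class HdfsSensorTests(unittest.TestCase):
    def setUp(self):
        # The sensor instantiates the hook itself, so the tests pass the class,
        # not an instance, through the `hook` argument.
        self.hook = FakeHDFSHook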
    python_callable=get_data_from_hdfs,
    dag=dag,
)


# How does a sensor pull data from XCom? For now the value is taken from a global variable.
def get_daily_file_path(filename):
    return base_remote_daily_path.format(filename, today)


# Generate one sensor per daily directory in a loop
for daily_dir in daily_dir_list:
    daily_file_sensor = HdfsSensor(
        task_id='daily_{}_file_sensor'.format(daily_dir),
        poke_interval=poke_interval,  # (seconds)
        timeout=60 * 60 * 24,  # time out after 24 hours
        filepath=get_daily_file_path(daily_dir),
        hdfs_conn_id='aml_hdfs',
        dag=dag
    )
    daily_file_sensor >> get_data_operator


def process_data(**kwargs):
    date_list = kwargs['ti'].xcom_pull(key='date_list', task_ids='get_data_operator')
    X, y, str_with_trx_with_retail_with_corporate_with_account = process_data_from_local(date_list, base_local_path)
    kwargs['ti'].xcom_push(key='X', value=X)
    kwargs['ti'].xcom_push(key='y', value=y)


def process_data_from_local(date_list, base_local_path):
    # Read the tables that are updated daily
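# The comment above asks how a sensor could take its input from XCom rather than
# a module-level global. One hedged option, assuming a recent Airflow release where
# HdfsSensor lists `filepath` among its templated fields: render the path with Jinja
# from an XCom pushed by an upstream task. The task id 'compute_daily_path' and the
# XCom key below are illustrative, not part of the DAG above.
templated_file_sensor = HdfsSensor(
    task_id='daily_templated_file_sensor',
    poke_interval=poke_interval,  # (seconds)
    timeout=60 * 60 * 24,  # time out after 24 hours
    # Rendered once when the task instance starts, before the first poke.
    filepath="{{ ti.xcom_pull(task_ids='compute_daily_path', key='daily_file_path') }}",
    hdfs_conn_id='aml_hdfs',
    dag=dag,
)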
pyFile = '/opt/scripts/mig/csdr/csdr_xml_validation/app/{}'.format(pyScript)
xmlFile = 'hdfs://migration/data/raw/csdr/settlement_internalisation/to_validate/{}'.format(xml_File)
jsonFile = '/opt/scripts/mig/csdr/csdr_xml_validation/app/resources/{}'.format(json_File)
logFile = 'hdfs://migration/data/raw/csdr/stage_status/{}'.format(log_file)

command = "sudo python {} -f {} -j {} -l {}".format(pyFile, xmlFile, jsonFile, logFile)
hdfs_dir = xmlFile

args = {'owner': 'Airflow', 'start_date': days_ago(1)}

with DAG(dag_id='mig_csdr_setins_xml_validation',
         description='CSDR XML validation',
         default_args=args,
         schedule_interval='* * */1 * *') as dag:

    start = DummyOperator(task_id='start')

    fileCheckSensor = HdfsSensor(task_id='source_data_sensor',
                                 hdfs_conn_id='webhdfs_default',
                                 filepath=hdfs_dir)

    xmlFileValidation = SSHOperator(ssh_conn_id='zaloni',
                                    task_id='xmlFile_Validation',
                                    command=command)

    start >> fileCheckSensor >> xmlFileValidation
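# Note on scheduling: schedule_interval='* * */1 * *' is a cron expression that
# fires every minute of every day, not once per day. If a daily run is intended
# here (and in the DQ DAG below, which uses the same expression), '@daily' or
# '0 0 * * *' would be the usual choice.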
    start_date=(datetime(2020, 3, 16, 0, 0, 0, 0)),
    catchup=False)

input_path = f'{os.environ["INPUT_PATH"]}/sales'
raw_path = f'{os.environ["RAW_PATH"]}/sales/' + '{{ ts_nodash }}'
destination_path = f'{os.environ["DESTINATION_PATH"]}/sales'
error_path = f'{os.environ["ERROR_PATH"]}/sales'

# file_sensor = FileSensor(task_id='file_exists',
#                          dag=dag,
#                          filepath=input_path,
#                          timeout=300,
#                          poke_interval=10)

file_sensor = HdfsSensor(
    task_id='file_exists',
    filepath=input_path,
    hdfs_conn_id='hdfs_default',
    dag=dag)

extract_data = BashOperator(
    task_id='extract_data',
    dag=dag,
    bash_command='mkdir -p ' + raw_path + ' && ' + ' cp ' + input_path + '/* ' + raw_path
)

transform_data = LivyOperator(
    task_id='transform_data',
    dag=dag,
    livy_conn_id=HTTP_CONN_ID,
    file='file:///tmp/jars/spark-etl-assembly-0.1.0-SNAPSHOT.jar',
    num_executors=1,
DQcommand = 'sudo sh {} -j {} -l {}csdr_status.log'.format(csdrDQ_sscript, csdrDQ_json, csdrDQG_log)

args = {
    'owner': 'Airflow',
    'start_date': days_ago(1)
}

with DAG(dag_id='mig_csdr_setins_xmlSpark_and_DQ_process',
         description='CSDR xml Data Quality checking process',
         default_args=args,
         schedule_interval='* * */1 * *') as dag:

    start = DummyOperator(
        task_id='start'
    )

    xmlFileCheckSensor = HdfsSensor(
        task_id='xmlFile_sensor',
        hdfs_conn_id='webhdfs_default',
        filepath=xmlFile
    )

    xmlSpark_DQ_process = SSHOperator(
        ssh_conn_id='zaloni',
        task_id='xmlSpark_DQ_process',
        command=xmlCommand
    )

    refreshMV_hiveQuery = SSHOperator(
        ssh_conn_id='zaloni',
        task_id='refreshMV_hiveQuery',
        command=hqlCommand
    )