Example #1
    def test_legacy_file_exist(self):
        """
        Test the legacy behaviour
        :return:
        """
        # Given
        logging.info("Test for existing file with the legacy behaviour")
        # When
        task = HdfsSensor(task_id='Should_be_file_legacy',
                          filepath='/datadirectory/datafile',
                          timeout=1,
                          retry_delay=timedelta(seconds=1),
                          poke_interval=1,
                          hook=self.hook)
        task.execute(None)
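These test methods reference self.hook, so they run inside a unittest test case that injects a stub HDFS hook through the sensor's hook argument. A minimal sketch of that scaffolding, assuming Airflow 1.10-style imports and that HdfsSensor.poke() lists the path via hook(hdfs_conn_id).get_conn().ls(...); the mock-based stub and its canned listing are illustrative assumptions, not part of the original tests:

import logging
import unittest
from datetime import timedelta
from unittest import mock

from airflow.exceptions import AirflowSensorTimeout
from airflow.sensors.hdfs_sensor import HdfsSensor


class TestHdfsSensorLegacy(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

        def fake_ls(paths, include_toplevel=False):
            # Pretend only /datadirectory/datafile exists and is 1 byte long,
            # so the later file_size=20 example times out as well.
            if paths == ['/datadirectory/datafile']:
                return [{'path': '/datadirectory/datafile',
                         'file_type': 'f', 'length': 1}]
            return []

        client = mock.MagicMock()
        client.ls.side_effect = fake_ls
        hook_instance = mock.MagicMock()
        hook_instance.get_conn.return_value = client
        # HdfsSensor(hook=...) takes a hook *class*; calling the mock returns
        # the instance whose get_conn() yields the fake client above.
        self.hook = mock.MagicMock(return_value=hook_instance)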
Example #2
    def test_legacy_file_does_not_exists(self):
        """
        Test the legacy behaviour
        :return:
        """
        # Given
        logging.info("Test for non existing file with the legacy behaviour")
        task = HdfsSensor(task_id='Should_not_be_file_legacy',
                          filepath='/datadirectory/not_existing_file_or_directory',
                          timeout=1,
                          retry_delay=timedelta(seconds=1),
                          poke_interval=1,
                          hook=self.hook)

        # When
        # Then
        with self.assertRaises(AirflowSensorTimeout):
            task.execute(None)
Example #5
    def test_legacy_file_exist_but_filesize(self):
        """
        Test the legacy behaviour with the filesize
        :return:
        """
        # Given
        logging.info("Test for existing file with the legacy behaviour")
        # When
        task = HdfsSensor(task_id='Should_be_file_legacy',
                          filepath='/datadirectory/datafile',
                          timeout=1,
                          file_size=20,
                          retry_delay=timedelta(seconds=1),
                          poke_interval=1,
                          hook=self.hook)

        # When
        # Then
        with self.assertRaises(AirflowSensorTimeout):
            task.execute(None)
Example #7
    python_callable=get_data_from_hdfs,
    dag=dag,
)


# How is a sensor supposed to pull data from XCom? For now it reads from a global variable
def get_daily_file_path(filename):
    return base_remote_daily_path.format(filename, today)


# Create multiple sensors in a loop
for daily_dir in daily_dir_list:
    daily_file_sensor = HdfsSensor(
        task_id='daily_{}_file_sensor'.format(daily_dir),
        poke_interval=poke_interval,  # (seconds)
        timeout=60 * 60 * 24,  # timeout in 24 hours
        filepath=get_daily_file_path(daily_dir),
        hdfs_conn_id='aml_hdfs',
        dag=dag
    )
    daily_file_sensor >> get_data_operator


def process_data(**kwargs):
    date_list = kwargs['ti'].xcom_pull(key='date_list', task_ids='get_data_operator')
    X, y, str_with_trx_with_retail_with_corporate_with_account = process_data_from_local(date_list, base_local_path)
    kwargs['ti'].xcom_push(key='X', value=X)
    kwargs['ti'].xcom_push(key='y', value=y)
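
# The snippet never shows how process_data is attached to the DAG. Since it
# relies on kwargs['ti'] for XCom, a hypothetical wiring (the task_id and the
# dependency on get_data_operator are assumptions) could look like this, with
# provide_context=True as Airflow 1.10.x requires for **kwargs to be populated:
process_data_operator = PythonOperator(
    task_id='process_data_operator',
    python_callable=process_data,
    provide_context=True,
    dag=dag,
)
get_data_operator >> process_data_operator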


def process_data_from_local(date_list, base_local_path):
    # Read the daily-updated tables
Example #8
pyFile = '/opt/scripts/mig/csdr/csdr_xml_validation/app/{}'.format(pyScript)
xmlFile = 'hdfs://migration/data/raw/csdr/settlement_internalisation/to_validate/{}'.format(
    xml_File)
jsonFile = '/opt/scripts/mig/csdr/csdr_xml_validation/app/resources/{}'.format(
    json_File)
logFile = 'hdfs://migration/data/raw/csdr/stage_status/{}'.format(log_file)

command = "sudo python {} -f {} -j {} -l {}".format(pyFile, xmlFile, jsonFile,
                                                    logFile)
hdfs_dir = xmlFile

args = {'owner': 'Airflow', 'start_date': days_ago(1)}

with DAG(dag_id='mig_csdr_setins_xml_validation',
         description='CSDR XML validation',
         default_args=args,
         schedule_interval='* * */1 * *') as dag:

    start = DummyOperator(task_id='start')

    fileCheckSensor = HdfsSensor(task_id='source_data_sensor',
                                 hdfs_conn_id='webhdfs_default',
                                 filepath=hdfs_dir)

    xmlFileValidation = SSHOperator(ssh_conn_id='zaloni',
                                    task_id='xmlFile_Validation',
                                    command=command)

    start >> fileCheckSensor >> xmlFileValidation
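The snippet is cut off above its import block; on Airflow 1.10.x the classes used here would typically be imported as follows (this is a guess at the missing header, and the paths move into provider packages on Airflow 2.x):

from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.sensors.hdfs_sensor import HdfsSensor
from airflow.utils.dates import days_ago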
Example #9
          start_date=(datetime(2020, 3, 16, 0, 0, 0, 0)), catchup=False)

input_path = f'{os.environ["INPUT_PATH"]}/sales'
raw_path = f'{os.environ["RAW_PATH"]}/sales/' + '{{ ts_nodash }}'
destination_path = f'{os.environ["DESTINATION_PATH"]}/sales'
error_path = f'{os.environ["ERROR_PATH"]}/sales'

# file_sensor = FileSensor(task_id='file_exists',
#                          dag=dag,
#                          filepath=input_path,
#                          timeout=300,
#                          poke_interval=10)

file_sensor = HdfsSensor(
    task_id='file_exists',
    filepath=input_path,
    hdfs_conn_id='hdfs_default',
    dag=dag)

extract_data = BashOperator(
    task_id='extract_data',
    dag=dag,
    bash_command='mkdir -p ' + raw_path + ' && ' + ' cp ' + input_path + '/* ' + raw_path
)

transform_data = LivyOperator(
    task_id='transform_data',
    dag=dag,
    livy_conn_id=HTTP_CONN_ID,
    file='file:///tmp/jars/spark-etl-assembly-0.1.0-SNAPSHOT.jar',
    num_executors=1,
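This snippet is also cut off at both ends; given the provider-style operators it uses, plausible imports would be the following (Airflow 2.x provider packages assumed; HTTP_CONN_ID and the remaining LivyOperator arguments are truncated in the original and left as-is):

import os
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.providers.apache.hdfs.sensors.hdfs import HdfsSensor
from airflow.providers.apache.livy.operators.livy import LivyOperator
from airflow.sensors.filesystem import FileSensor  # for the commented-out variant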
Example #10
DQcommand = 'sudo sh {} -j {} -l {}csdr_status.log'.format(csdrDQ_sscript, csdrDQ_json, csdrDQG_log)

args = {
    'owner':'Airflow',
    'start_date': days_ago(1)
}

with DAG(dag_id='mig_csdr_setins_xmlSpark_and_DQ_process', description='CSDR xml Data Quality checking process', default_args=args, schedule_interval='* * */1 * *') as dag:

    start = DummyOperator(
        task_id='start'
    )

    xmlFileCheckSensor = HdfsSensor(
        task_id='xmlFile_sensor',
        hdfs_conn_id='webhdfs_default',
        filepath=xmlFile
    )

    xmlSpark_DQ_process = SSHOperator(
        ssh_conn_id='zaloni',
        task_id='xmlSpark_DQ_process',
        command=xmlCommand
    )

    refreshMV_hiveQuery = SSHOperator(
        ssh_conn_id='zaloni',
        task_id='refreshMV_hiveQuery',
        command=hqlCommand
    )
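The snippet ends before the tasks are wired together. Mirroring the companion validation DAG in Example #8, a plausible (not source-confirmed) dependency chain would be:

    start >> xmlFileCheckSensor >> xmlSpark_DQ_process >> refreshMV_hiveQuery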