def test_ftp_origin_xml(sdc_builder, sdc_executor, ftp):
    """Test FTP origin, message is in format XML.

    A single XML file is uploaded to the FTP server, read by the FTP origin
    stage, and asserted via snapshot. The pipeline looks like:
        sftp_ftp_client >> trash
    """
    remote_file = get_random_string(string.ascii_letters, 10)
    xml_payload = '<developers><developer>Alex</developer><developer>Xavi</developer></developers>'
    expected_developers = [{'value': 'Alex'}, {'value': 'Xavi'}]
    ftp.put_string(remote_file, xml_payload)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    origin = pipeline_builder.add_stage(name=FTP_ORIGIN_CLIENT_NAME)
    origin.set_attributes(file_name_pattern=remote_file, data_format='XML')
    trash = pipeline_builder.add_stage('Trash')
    origin >> trash

    pipeline = pipeline_builder.build('FTP Origin Pipeline XML').configure_for_environment(ftp)
    sdc_executor.add_pipeline(pipeline)

    snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
    sdc_executor.stop_pipeline(pipeline)
    try:
        records = snapshot[origin].output
        assert len(records) == 1
        assert records[0].field['developer'] == expected_developers
    finally:
        # Delete the test FTP origin file we created.
        client = ftp.client
        client.delete(remote_file)
        client.quit()
def test_ftp_origin(sdc_builder, sdc_executor, ftp):
    """Smoke test FTP origin.

    We first create a file on FTP server and have the FTP origin stage read it.
    We then assert its snapshot. The pipeline looks like:
        sftp_ftp_client >> trash
    """
    ftp_file_name = get_random_string(string.ascii_letters, 10)
    raw_text_data = 'Hello World!'
    ftp.put_string(ftp_file_name, raw_text_data)

    builder = sdc_builder.get_pipeline_builder()
    sftp_ftp_client = builder.add_stage('SFTP/FTP Client', type='origin')
    sftp_ftp_client.file_name_pattern = ftp_file_name
    sftp_ftp_client.data_format = 'TEXT'
    trash = builder.add_stage('Trash')
    sftp_ftp_client >> trash

    sftp_ftp_client_pipeline = builder.build('FTP Origin Pipeline').configure_for_environment(ftp)
    sdc_executor.add_pipeline(sftp_ftp_client_pipeline)

    snapshot = sdc_executor.capture_snapshot(sftp_ftp_client_pipeline, start_pipeline=True).snapshot
    sdc_executor.stop_pipeline(sftp_ftp_client_pipeline)
    try:
        # Fix: assertions now run inside try/finally so the uploaded file is
        # removed from the server even when an assertion fails (the original
        # ran the asserts first and leaked the file on failure; this matches
        # the cleanup pattern used by the other tests in this file).
        assert len(snapshot[sftp_ftp_client].output) == 1
        assert snapshot[sftp_ftp_client].output[0].field['text'] == raw_text_data
    finally:
        # Delete the test FTP origin file we created.
        client = ftp.client
        try:
            client.delete(ftp_file_name)
        finally:
            client.quit()
def test_ftp_origin_text(sdc_builder, sdc_executor, ftp, use_subdirectory):
    """Test FTP origin, message is in format Text.

    We first create a file on FTP server and have the FTP origin stage read it.
    We include the directory in the path URL instead of in the pattern.
    We then assert the data from the wiretap. The pipeline looks like:
        sftp_ftp_client >> wiretap
        sftp_ftp_client >= wiretap_events
    """
    directory = os.path.join(get_random_string(), get_random_string()) if use_subdirectory else get_random_string()
    ftp_file_name = get_random_string(string.ascii_letters, 10)
    ftp_file_path = os.path.join(directory, ftp_file_name)
    raw_text_data = '[{\'value\': \'Alex\'}, {\'value\': \'Xavi\'}]'
    expected = [{'value': 'Alex'}, {'value': 'Xavi'}]

    # Create the (possibly nested) directory structure on the server.
    client = ftp.client
    client.cwd('/')
    for path in directory.split('/'):
        client.mkd(path)
        client.cwd(path)
    client.cwd('/')
    ftp.put_string(ftp_file_path, raw_text_data)

    builder = sdc_builder.get_pipeline_builder()
    sftp_ftp_client = builder.add_stage(name=FTP_ORIGIN_CLIENT_NAME)
    sftp_ftp_client.set_attributes(file_name_pattern=ftp_file_name,
                                   process_subdirectories=True,
                                   data_format='TEXT')
    wiretap = builder.add_wiretap()
    wiretap_events = builder.add_wiretap()

    sftp_ftp_client >> wiretap.destination
    sftp_ftp_client >= wiretap_events.destination

    # Fix: the pipeline title said 'XML' although this test reads TEXT data.
    sftp_ftp_client_pipeline = builder.build('FTP Origin Pipeline Text').configure_for_environment(ftp)
    # Point the stage at the subdirectory via the URL rather than the pattern.
    sftp_ftp_client.resource_url = f'{sftp_ftp_client.resource_url}/{directory}'
    sdc_executor.add_pipeline(sftp_ftp_client_pipeline)

    sdc_executor.start_pipeline(sftp_ftp_client_pipeline).wait_for_pipeline_output_records_count(1)
    sdc_executor.stop_pipeline(sftp_ftp_client_pipeline)
    try:
        assert len(wiretap.output_records) == 1
        assert ftp_file_name == wiretap.output_records[0].header.values['filename']
        assert len(wiretap_events.output_records) == 3
        assert wiretap_events.output_records[0].field['filepath'] == f'/{ftp_file_path}'
        assert wiretap_events.output_records[1].field['filepath'] == f'/{ftp_file_path}'
        assert wiretap.output_records[0].field['text'] == str(expected)
    finally:
        # Delete the test FTP origin file we created.
        client = ftp.client
        client.delete(f'/{directory}/{ftp_file_name}')
        # Fix: remove every created directory level, deepest first. The
        # original removed only the leaf directory and leaked the parent
        # whenever use_subdirectory was True.
        subpath = directory
        while subpath:
            client.rmd(f'/{subpath}')
            subpath = os.path.dirname(subpath)
        client.quit()
def test_ftp_origin_delimited_with_finisher(sdc_builder, sdc_executor, ftp):
    """Test FTP origin, message is in format Delimited.

    Two delimited files are uploaded to the FTP server; the origin reads them,
    and a pipeline finisher stops the pipeline when there is no more data.
    The data is asserted via snapshot. The pipeline looks like:
        sftp_ftp_client >> trash
        sftp_ftp_client >= pipeline finisher
    """
    base_name = get_random_string(string.ascii_letters, 10)
    file_1 = f'{base_name}_1'
    file_2 = f'{base_name}_2'
    csv_line_1 = 'Alex,Xavi'
    csv_line_2 = 'Tucu,Martin'
    expected_record_1 = {str(index): value for index, value in enumerate(csv_line_1.split(','))}
    expected_record_2 = {str(index): value for index, value in enumerate(csv_line_2.split(','))}
    ftp.put_string(file_1, csv_line_1)
    ftp.put_string(file_2, csv_line_2)

    builder = sdc_builder.get_pipeline_builder()
    origin = builder.add_stage(name=FTP_ORIGIN_CLIENT_NAME)
    origin.set_attributes(file_name_pattern=f'{base_name}*', data_format='DELIMITED')
    trash = builder.add_stage('Trash')
    finisher = builder.add_stage('Pipeline Finisher Executor')
    finisher.set_attributes(stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

    origin >> trash
    origin >= finisher

    pipeline = builder.build('FTP Origin Pipeline CSV-Finisher').configure_for_environment(ftp)
    sdc_executor.add_pipeline(pipeline)

    snapshot = sdc_executor.capture_snapshot(pipeline, batches=3, batch_size=10, start_pipeline=True).snapshot
    try:
        first_batch = snapshot.snapshot_batches[0][origin.instance_name].output
        second_batch = snapshot.snapshot_batches[1][origin.instance_name].output
        assert len(first_batch) == 1
        assert len(second_batch) == 1
        assert first_batch[0].field == expected_record_1
        assert second_batch[0].field == expected_record_2
    finally:
        # Delete the test FTP origin files we created.
        client = ftp.client
        client.delete(file_1)
        client.delete(file_2)
        client.quit()
def test_ftp_origin_wholefile_with_finisher(sdc_builder, sdc_executor, ftp):
    """Test FTP origin, message is in format Whole File.

    We first create two files on FTP server and have the FTP origin stage read
    them. We add a pipeline finisher to check when there is no more data.
    We then assert the data using a wiretap. The pipeline looks like:
        origin >> wiretap
        origin >= pipeline finisher
    """
    ftp_file_name = get_random_string(string.ascii_letters, 10)
    ftp_file_name_1 = f'{ftp_file_name}_1'
    ftp_file_name_2 = f'{ftp_file_name}_2'
    message_1 = 'Useless Message 1'
    message_2 = 'Useless Message 2'
    client = ftp.client
    client.cwd('/')
    ftp.put_string(ftp_file_name_1, message_1)
    ftp.put_string(ftp_file_name_2, message_2)
    expected_1 = f'/{ftp_file_name_1}'
    expected_2 = f'/{ftp_file_name_2}'

    builder = sdc_builder.get_pipeline_builder()
    origin = builder.add_stage(name=FTP_ORIGIN_CLIENT_NAME)
    origin.set_attributes(file_name_pattern=f'{ftp_file_name}*', data_format='WHOLE_FILE')
    wiretap = builder.add_wiretap()
    pipeline_finished_executor = builder.add_stage('Pipeline Finisher Executor')
    pipeline_finished_executor.stage_record_preconditions = ["${record:eventType() == 'no-more-data'}"]

    origin >> wiretap.destination
    origin >= pipeline_finished_executor

    pipeline = builder.build().configure_for_environment(ftp)
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()
    try:
        assert len(wiretap.output_records) == 2
        assert wiretap.output_records[0].field['fileInfo']['file'] == expected_1
        assert wiretap.output_records[1].field['fileInfo']['file'] == expected_2
    finally:
        # Fix: the origin has no file post-processing configured, so the
        # uploaded files stayed on the server forever (the original finally
        # only quit the client). Delete them so the test leaves no residue.
        try:
            for file_name in (ftp_file_name_1, ftp_file_name_2):
                client.delete(file_name)
        finally:
            client.quit()
def test_ftp_origin_xml(sdc_builder, sdc_executor, ftp):
    """Test FTP origin, message is in format XML.

    A single XML file is uploaded into a fresh directory on the FTP server,
    read by the FTP origin and asserted through a wiretap:
        sftp_ftp_client >> wiretap
    """
    file_name = get_random_string(string.ascii_letters, 10)
    dir_name = get_random_string(string.ascii_letters, 10)
    xml_payload = '<developers><developer>Alex</developer><developer>Xavi</developer></developers>'
    expected = [{'value': 'Alex'}, {'value': 'Xavi'}]

    client = ftp.client
    client.cwd('/')
    client.mkd(dir_name)
    ftp.put_string(f'{dir_name}/{file_name}', xml_payload)

    builder = sdc_builder.get_pipeline_builder()
    origin = builder.add_stage(name=FTP_ORIGIN_CLIENT_NAME)
    origin.set_attributes(file_name_pattern=file_name,
                          process_subdirectories=True,
                          data_format='XML')
    wiretap = builder.add_wiretap()
    origin >> wiretap.destination

    pipeline = builder.build('FTP Origin Pipeline XML').configure_for_environment(ftp)
    sdc_executor.add_pipeline(pipeline)

    sdc_executor.start_pipeline(pipeline).wait_for_pipeline_output_records_count(1)
    sdc_executor.stop_pipeline(pipeline)
    try:
        assert len(wiretap.output_records) == 1
        record = wiretap.output_records[0]
        assert f'/{dir_name}/{file_name}' == record.header.values['file']
        assert file_name == record.header.values['filename']
        developers_element = get_xml_output_field(origin, record.field, 'developers')
        assert developers_element['developer'] == expected
    finally:
        # Delete the test file and directory we created.
        client = ftp.client
        client.delete(f'/{dir_name}/{file_name}')
        client.rmd(f'/{dir_name}')
        client.quit()
def test_ftp_origin_syslog(sdc_builder, sdc_executor, ftp):
    """Test FTP origin using syslog format.

    A LOG4J-formatted line is uploaded to the FTP server, the origin parses it
    and the snapshot is asserted. The pipeline looks like:
        sftp_ftp_client >> trash
    """
    message = ('+20150320 [15:53:31,161] DEBUG PipelineConfigurationValidator - Pipeline \'test:preview\' validation. '
               'valid=true, canPreview=true, issuesCount=0 - ')
    log_file = get_random_string(string.ascii_letters, 10)
    ftp.put_string(log_file, message)

    builder = sdc_builder.get_pipeline_builder()
    origin = builder.add_stage(name=FTP_ORIGIN_CLIENT_NAME)
    origin.set_attributes(file_name_pattern=log_file,
                          data_format='LOG',
                          log_format='LOG4J',
                          retain_original_line=True,
                          on_parse_error='INCLUDE_AS_STACK_TRACE')
    trash = builder.add_stage('Trash')
    origin >> trash

    pipeline = builder.build('FTP Origin Pipeline SysLog').configure_for_environment(ftp)
    sdc_executor.add_pipeline(pipeline)

    snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
    sdc_executor.stop_pipeline(pipeline)
    try:
        output = snapshot[origin].output
        assert len(output) == 1
        # retain_original_line=True keeps the raw line in 'originalLine'.
        assert output[0].field['originalLine'] == message
    finally:
        # Delete the test FTP origin file we created.
        client = ftp.client
        client.delete(log_file)
        client.quit()
def test_ftp_origin_text_delete_subdirectory(sdc_builder, sdc_executor, ftp):
    """FTP origin test with DELETE post-processing, asserted via snapshot.

    Two text files are created on the FTP server — one in the root directory
    and one inside a subdirectory. The origin reads both and deletes them
    after processing. The pipeline looks like:
        sftp_ftp_client >> trash
    """
    file_base = f'a{get_random_string(string.ascii_letters, 10)}'
    sub_dir = f'b{get_random_string(string.ascii_letters, 10)}'
    file_1 = f'{file_base}_1'
    file_2 = f'{file_base}_2'
    text_1 = 'Hello World 1!'
    text_2 = 'Hello World 2!'

    client = ftp.client
    client.cwd('/')
    ftp.put_string(file_1, text_1)
    client.mkd(sub_dir)
    ftp.put_string(f'{sub_dir}/{file_2}', text_2)

    builder = sdc_builder.get_pipeline_builder()
    origin = builder.add_stage(name=FTP_ORIGIN_CLIENT_NAME)
    origin.set_attributes(file_name_pattern=f'{file_base}*',
                          data_format='TEXT',
                          process_subdirectories=True,
                          file_post_processing="DELETE")
    trash = builder.add_stage('Trash')
    origin >> trash

    pipeline = builder.build('FTP Origin Pipeline Text').configure_for_environment(ftp)
    sdc_executor.add_pipeline(pipeline)

    snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True, batches=2, batch_size=10).snapshot
    sdc_executor.stop_pipeline(pipeline)
    try:
        batch_0 = snapshot.snapshot_batches[0][origin.instance_name].output
        batch_1 = snapshot.snapshot_batches[1][origin.instance_name].output
        assert len(batch_0) == 1
        assert len(batch_1) == 1
        assert batch_0[0].field['text'] == text_1
        assert batch_1[0].field['text'] == text_2

        # The root-level file must have been deleted by the pipeline.
        client.cwd('/')
        assert file_1 not in client.nlst()

        # The file in the subdirectory must have been deleted as well.
        client.cwd(sub_dir)
        assert file_2 not in client.nlst()
    finally:
        # Remove the subdirectory we created.
        client.cwd('/')
        client.rmd(sub_dir)
        client.quit()
def test_ftp_origin_wholefile_with_finisher(sdc_builder, sdc_executor, ftp):
    """Test FTP origin, message is in format Whole File, with DELETE post-processing.

    Two files are uploaded to the FTP server and read by the origin; a pipeline
    finisher reacts to the no-more-data event and the origin deletes the files
    after processing. Assertions are done on the snapshot:
        sftp_ftp_client >> trash
        sftp_ftp_client >= pipeline finisher
    """
    base_name = get_random_string(string.ascii_letters, 10)
    file_1 = f'{base_name}_1'
    file_2 = f'{base_name}_2'
    client = ftp.client
    client.cwd('/')
    ftp.put_string(file_1, 'Useless Message 1')
    ftp.put_string(file_2, 'Useless Message 2')
    expected_1 = f'/{file_1}'
    expected_2 = f'/{file_2}'

    builder = sdc_builder.get_pipeline_builder()
    origin = builder.add_stage(name=FTP_ORIGIN_CLIENT_NAME)
    origin.set_attributes(file_name_pattern=f'{base_name}*',
                          data_format='WHOLE_FILE',
                          file_post_processing="DELETE")
    trash = builder.add_stage('Trash')
    finisher = builder.add_stage('Pipeline Finisher Executor')
    finisher.set_attributes(stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

    origin >> trash
    origin >= finisher

    pipeline = builder.build('FTP Origin Pipeline WholeFile-Finisher').configure_for_environment(ftp)
    sdc_executor.add_pipeline(pipeline)

    snapshot = sdc_executor.capture_snapshot(pipeline, batches=3, batch_size=10, start_pipeline=True).snapshot
    try:
        batch_0 = snapshot.snapshot_batches[0][origin.instance_name].output
        batch_1 = snapshot.snapshot_batches[1][origin.instance_name].output
        assert len(batch_0) == 1
        assert len(batch_1) == 1
        assert batch_0[0].field['fileInfo']['file'] == expected_1
        assert batch_1[0].field['fileInfo']['file'] == expected_2

        # Both files live in the root directory; a single listing verifies
        # the pipeline deleted them.
        client.cwd('/')
        remaining = client.nlst()
        assert file_1 not in remaining
        assert file_2 not in remaining
    finally:
        client.quit()
def test_ftp_origin_whole_file_with_no_read_permission(sdc_builder, sdc_executor, ftp):
    """Test for SDC-14867.

    A file with no read permission is created alongside a readable one. When
    the pipeline runs, ingestion starts from the readable file while the
    unreadable one is skipped and a stage error is reported. A third file is
    dropped while the pipeline is running to verify it is also picked up.
    """
    prefix = get_random_string(string.ascii_letters, 5)
    unreadable_file = f'{prefix}{get_random_string(string.ascii_letters, 10)}.txt'
    readable_file = f'{prefix}{get_random_string(string.ascii_letters, 10)}.txt'
    late_file = f'{prefix}{get_random_string(string.ascii_letters, 10)}.txt'
    payload = get_random_string(string.printable, 30000000)

    ftp.put_string(unreadable_file, payload)
    ftp.chmod(unreadable_file, 000)  # strip all permissions
    ftp.put_string(readable_file, payload)

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()
    origin = builder.add_stage('SFTP/FTP/FTPS Client', type='origin')
    origin.set_attributes(file_name_pattern=f'{prefix}*',
                          data_format='WHOLE_FILE',
                          batch_wait_time_in_ms=10000,
                          max_batch_size_in_records=1)
    trash = builder.add_stage('Trash')
    wiretap = builder.add_wiretap()
    origin >> [wiretap.destination, trash]

    pipeline = builder.build().configure_for_environment(ftp)
    sdc_executor.add_pipeline(pipeline)
    try:
        # Start the pipeline and wait until the readable file is ingested.
        start_command = sdc_executor.start_pipeline(pipeline)
        start_command.wait_for_pipeline_output_records_count(3)
        # Drop another file while the pipeline is running.
        ftp.put_string(late_file, payload)
        start_command.wait_for_pipeline_output_records_count(6)

        # The unreadable file must have produced a stage error.
        error_msgs = sdc_executor.get_stage_errors(pipeline, origin)
        assert 'REMOTE_DOWNLOAD_10' in [e.error_code for e in error_msgs]

        actual_records = [record.field['fileInfo']['filename'] for record in wiretap.output_records]
        sdc_executor.stop_pipeline(pipeline)
        wiretap.reset()
        assert [readable_file, late_file] == actual_records
    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        # Restore permissions so the files can be deleted.
        # NOTE(review): 700 is a decimal literal here — presumably the ftp
        # fixture forwards it as an octal mode string; confirm against the
        # fixture implementation.
        ftp.chmod(unreadable_file, 700)
        # Delete the test SFTP origin files we created.
        for ftp_file_name in [unreadable_file, readable_file, late_file]:
            logger.debug('Removing file at %s/%s on FTP server ...', ftp.path, ftp_file_name)
            ftp.rm(ftp_file_name)
def test_ftp_origin_text_delete_subdirectory(sdc_builder, sdc_executor, ftp):
    """FTP origin test with DELETE post-processing, asserted via wiretap.

    Two text files are created on the FTP server — one in the root directory
    and one inside a subdirectory. The origin reads both and deletes them
    after processing. The pipeline looks like:
        sftp_ftp_client >> wiretap
    """
    base_name = f'a{get_random_string(string.ascii_letters, 10)}'
    sub_dir = f'b{get_random_string(string.ascii_letters, 10)}'
    file_1 = f'{base_name}_1'
    file_2 = f'{base_name}_2'
    text_1 = 'Hello World 1!'
    text_2 = 'Hello World 2!'

    client = ftp.client
    client.cwd('/')
    ftp.put_string(file_1, text_1)
    client.mkd(sub_dir)
    ftp.put_string(f'{sub_dir}/{file_2}', text_2)

    builder = sdc_builder.get_pipeline_builder()
    origin = builder.add_stage(name=FTP_ORIGIN_CLIENT_NAME)
    origin.set_attributes(file_name_pattern=f'{base_name}*',
                          data_format='TEXT',
                          process_subdirectories=True,
                          file_post_processing="DELETE")
    wiretap = builder.add_wiretap()
    origin >> wiretap.destination

    pipeline = builder.build('FTP Origin Pipeline Text').configure_for_environment(ftp)
    sdc_executor.add_pipeline(pipeline)

    sdc_executor.start_pipeline(pipeline).wait_for_pipeline_output_records_count(2)
    sdc_executor.stop_pipeline(pipeline)
    try:
        assert len(wiretap.output_records) == 2
        assert wiretap.output_records[0].field['text'] == text_1
        assert wiretap.output_records[1].field['text'] == text_2

        # The root-level file must have been deleted by the pipeline.
        client.cwd('/')
        assert file_1 not in client.nlst()

        # The file in the subdirectory must have been deleted as well.
        client.cwd(sub_dir)
        assert file_2 not in client.nlst()
    finally:
        # Remove the subdirectory we created.
        client.cwd('/')
        client.rmd(sub_dir)
        client.quit()